Linux 6.9-rc4
diff --git a/kernel/resource.c b/kernel/resource.c
index 158f04ec1d4fad91702c7c38a972f3f1775fb727..fcbca39dbc4508ca9b349ef7213332029b87f9ea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
+#include <linux/pseudo_fs.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/device.h>
 #include <linux/pfn.h>
 #include <linux/mm.h>
+#include <linux/mount.h>
 #include <linux/resource_ext.h>
+#include <uapi/linux/magic.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 
 
@@ -53,33 +58,17 @@ struct resource_constraint {
 
 static DEFINE_RWLOCK(resource_lock);
 
-/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
- */
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
-static struct resource *next_resource(struct resource *p, bool sibling_only)
+static struct resource *next_resource(struct resource *p, bool skip_children)
 {
-       /* Caller wants to traverse through siblings only */
-       if (sibling_only)
-               return p->sibling;
-
-       if (p->child)
+       if (!skip_children && p->child)
                return p->child;
        while (!p->sibling && p->parent)
                p = p->parent;
        return p->sibling;
 }
 
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
-       struct resource *p = v;
-       (*pos)++;
-       return (void *)next_resource(p, false);
-}
+#define for_each_resource(_root, _p, _skip_children) \
+       for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children))
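The for_each_resource() macro above replaces the old sibling_only flag with
skip_children and performs a depth-first, pre-order walk of a resource tree.
As a minimal sketch of its semantics (assuming it sits in kernel/resource.c
next to the macro and resource_lock; dump_iomem_tree() is a hypothetical
helper, not part of this change):

        static void dump_iomem_tree(void)
        {
                struct resource *p;

                read_lock(&resource_lock);
                /* parents are visited before their children, then siblings */
                for_each_resource(&iomem_resource, p, false)
                        pr_info("%pR %s\n", p, p->name ?: "<unnamed>");
                read_unlock(&resource_lock);
        }

Passing true as the third argument restricts the walk to the direct children
of the root, which is what callers of the old sibling_only mode relied on.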
 
 #ifdef CONFIG_PROC_FS
 
@@ -88,14 +77,28 @@ enum { MAX_IORES_LEVEL = 5 };
 static void *r_start(struct seq_file *m, loff_t *pos)
        __acquires(resource_lock)
 {
-       struct resource *p = PDE_DATA(file_inode(m->file));
-       loff_t l = 0;
+       struct resource *root = pde_data(file_inode(m->file));
+       struct resource *p;
+       loff_t l = *pos;
+
        read_lock(&resource_lock);
-       for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
-               ;
+       for_each_resource(root, p, false) {
+               if (l-- == 0)
+                       break;
+       }
+
        return p;
 }
 
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct resource *p = v;
+
+       (*pos)++;
+
+       return (void *)next_resource(p, false);
+}
+
 static void r_stop(struct seq_file *m, void *v)
        __releases(resource_lock)
 {
@@ -104,7 +107,7 @@ static void r_stop(struct seq_file *m, void *v)
 
 static int r_show(struct seq_file *m, void *v)
 {
-       struct resource *root = PDE_DATA(file_inode(m->file));
+       struct resource *root = pde_data(file_inode(m->file));
        struct resource *r = v, *p;
        unsigned long long start, end;
        int width = root->end < 0x10000 ? 4 : 8;
@@ -149,36 +152,19 @@ __initcall(ioresources_init);
 
 static void free_resource(struct resource *res)
 {
-       if (!res)
-               return;
-
-       if (!PageSlab(virt_to_head_page(res))) {
-               spin_lock(&bootmem_resource_lock);
-               res->sibling = bootmem_resource_free;
-               bootmem_resource_free = res;
-               spin_unlock(&bootmem_resource_lock);
-       } else {
+       /*
+        * If the resource was allocated using memblock early during boot
+        * we'll leak it here: we can only return full pages back to the
+        * buddy allocator and trying to be smart and reuse them eventually
+        * in alloc_resource() overcomplicates resource handling.
+        */
+       if (res && PageSlab(virt_to_head_page(res)))
                kfree(res);
-       }
 }
 
 static struct resource *alloc_resource(gfp_t flags)
 {
-       struct resource *res = NULL;
-
-       spin_lock(&bootmem_resource_lock);
-       if (bootmem_resource_free) {
-               res = bootmem_resource_free;
-               bootmem_resource_free = res->sibling;
-       }
-       spin_unlock(&bootmem_resource_lock);
-
-       if (res)
-               memset(res, 0, sizeof(struct resource));
-       else
-               res = kzalloc(sizeof(struct resource), flags);
-
-       return res;
+       return kzalloc(sizeof(struct resource), flags);
 }
 
 /* Return the conflict entry if you can't request it */
@@ -320,27 +306,25 @@ int release_resource(struct resource *old)
 EXPORT_SYMBOL(release_resource);
 
 /**
- * Finds the lowest iomem resource that covers part of [@start..@end].  The
- * caller must specify @start, @end, @flags, and @desc (which may be
- * IORES_DESC_NONE).
+ * find_next_iomem_res - Finds the lowest iomem resource that covers part of
+ *                      [@start..@end].
  *
  * If a resource is found, returns 0 and @*res is overwritten with the part
  * of the resource that's within [@start..@end]; if none is found, returns
- * -1 or -EINVAL for other invalid parameters.
- *
- * This function walks the whole tree and not just first level children
- * unless @first_lvl is true.
+ * -ENODEV.  Returns -EINVAL for invalid parameters.
  *
  * @start:     start address of the resource searched for
  * @end:       end address of same resource
  * @flags:     flags which the resource must have
  * @desc:      descriptor the resource must have
- * @first_lvl: walk only the first level children, if set
  * @res:       return ptr, if resource found
+ *
+ * The caller must specify @start, @end, @flags, and @desc
+ * (which may be IORES_DESC_NONE).
  */
 static int find_next_iomem_res(resource_size_t start, resource_size_t end,
                               unsigned long flags, unsigned long desc,
-                              bool first_lvl, struct resource *res)
+                              struct resource *res)
 {
        struct resource *p;
 
@@ -352,41 +336,51 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 
        read_lock(&resource_lock);
 
-       for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
-               if ((p->flags & flags) != flags)
-                       continue;
-               if ((desc != IORES_DESC_NONE) && (desc != p->desc))
-                       continue;
+       for_each_resource(&iomem_resource, p, false) {
+               /* If we passed the resource we are looking for, stop */
                if (p->start > end) {
                        p = NULL;
                        break;
                }
-               if ((p->end >= start) && (p->start <= end))
-                       break;
+
+               /* Skip until we find a range that matches what we are looking for */
+               if (p->end < start)
+                       continue;
+
+               if ((p->flags & flags) != flags)
+                       continue;
+               if ((desc != IORES_DESC_NONE) && (desc != p->desc))
+                       continue;
+
+               /* Found a match, break */
+               break;
+       }
+
+       if (p) {
+               /* copy data */
+               *res = (struct resource) {
+                       .start = max(start, p->start),
+                       .end = min(end, p->end),
+                       .flags = p->flags,
+                       .desc = p->desc,
+                       .parent = p->parent,
+               };
        }
 
        read_unlock(&resource_lock);
-       if (!p)
-               return -1;
-
-       /* copy data */
-       res->start = max(start, p->start);
-       res->end = min(end, p->end);
-       res->flags = p->flags;
-       res->desc = p->desc;
-       return 0;
+       return p ? 0 : -ENODEV;
 }
 
 static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
                                 unsigned long flags, unsigned long desc,
-                                bool first_lvl, void *arg,
+                                void *arg,
                                 int (*func)(struct resource *, void *))
 {
        struct resource res;
        int ret = -EINVAL;
 
        while (start < end &&
-              !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
+              !find_next_iomem_res(start, end, flags, desc, &res)) {
                ret = (*func)(&res, arg);
                if (ret)
                        break;
@@ -398,11 +392,9 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
 }
 
 /**
- * Walks through iomem resources and calls func() with matching resource
- * ranges. This walks through whole tree and not just first level children.
- * All the memory ranges which overlap start,end and also match flags and
- * desc are valid candidates.
- *
+ * walk_iomem_res_desc - Walks through iomem resources and calls func()
+ *                      with matching resource ranges.
+ *
  * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
  * @flags: I/O resource flags
  * @start: start addr
@@ -410,13 +402,16 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
  * @arg: function argument for the callback @func
  * @func: callback function that is called for each qualifying resource area
  *
+ * All the memory ranges which overlap start,end and also match flags and
+ * desc are valid candidates.
+ *
  * NOTE: For a new descriptor search, define a new IORES_DESC in
  * <linux/ioport.h> and set it in 'desc' of a target resource entry.
  */
 int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
                u64 end, void *arg, int (*func)(struct resource *, void *))
 {
-       return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
+       return __walk_iomem_res_desc(start, end, flags, desc, arg, func);
 }
 EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
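walk_iomem_res_desc() visits every matching range in the whole tree and hands
the callback a copy of the resource clipped to the queried window.  A hedged
sketch of a caller (count_ram_cb() and count_busy_ram_bytes() are illustrative
names, not existing kernel functions):

        static int count_ram_cb(struct resource *res, void *arg)
        {
                u64 *total = arg;

                /* res is already clipped to the queried [start..end] window */
                *total += resource_size(res);
                return 0;       /* a non-zero return stops the walk */
        }

        static u64 count_busy_ram_bytes(u64 start, u64 end)
        {
                u64 total = 0;

                walk_iomem_res_desc(IORES_DESC_NONE,
                                    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
                                    start, end, &total, count_ram_cb);
                return total;
        }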
 
@@ -432,8 +427,63 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 {
        unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
-                                    arg, func);
+       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+                                    func);
+}
+
+/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY in reverse order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+                               int (*func)(struct resource *, void *))
+{
+       struct resource res, *rams;
+       int rams_size = 16, i;
+       unsigned long flags;
+       int ret = -1;
+
+       /* create a list */
+       rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
+       if (!rams)
+               return ret;
+
+       flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+       i = 0;
+       while ((start < end) &&
+               (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
+               if (i >= rams_size) {
+                       /* re-alloc */
+                       struct resource *rams_new;
+
+                       rams_new = kvrealloc(rams, rams_size * sizeof(struct resource),
+                                            (rams_size + 16) * sizeof(struct resource),
+                                            GFP_KERNEL);
+                       if (!rams_new)
+                               goto out;
+
+                       rams = rams_new;
+                       rams_size += 16;
+               }
+
+               rams[i].start = res.start;
+               rams[i++].end = res.end;
+
+               start = res.end + 1;
+       }
+
+       /* go reverse */
+       for (i--; i >= 0; i--) {
+               ret = (*func)(&rams[i], arg);
+               if (ret)
+                       break;
+       }
+
+out:
+       kvfree(rams);
+       return ret;
 }
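walk_system_ram_res_rev() first snapshots the matching ranges into a
kvcalloc()'d array and then replays them from the highest to the lowest
address, which suits callers that want to place data top-down (kexec-style
searches).  A hedged sketch of such a callback (struct high_fit and
find_highest_fit() are illustrative, not taken from this patch):

        struct high_fit {
                resource_size_t size;   /* wanted allocation size */
                resource_size_t base;   /* result, 0 if nothing found */
        };

        static int find_highest_fit(struct resource *res, void *arg)
        {
                struct high_fit *fit = arg;

                if (resource_size(res) < fit->size)
                        return 0;       /* too small, keep walking downwards */

                /* ranges arrive highest-first, so the first fit is the answer */
                fit->base = res->end - fit->size + 1;
                return 1;               /* non-zero stops the walk */
        }

The walk itself is then walk_system_ram_res_rev(start, end, &fit,
find_highest_fit) over whatever window the caller cares about.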
 
 /*
@@ -445,17 +495,14 @@ int walk_mem_res(u64 start, u64 end, void *arg,
 {
        unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 
-       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
-                                    arg, func);
+       return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+                                    func);
 }
 
 /*
  * This function calls the @func callback against all memory ranges of type
  * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOURCE_BUSY.
  * It is to be used only for System RAM.
- *
- * This will find System RAM ranges that are children of top-level resources
- * in addition to top-level System RAM resources.
  */
 int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
                          void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -470,10 +517,9 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
        end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
        flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
        while (start < end &&
-              !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
-                                   false, &res)) {
-               pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
-               end_pfn = (res.end + 1) >> PAGE_SHIFT;
+              !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) {
+               pfn = PFN_UP(res.start);
+               end_pfn = PFN_DOWN(res.end + 1);
                if (end_pfn > pfn)
                        ret = (*func)(pfn, end_pfn - pfn, arg);
                if (ret)
@@ -498,6 +544,35 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
+static int __region_intersects(struct resource *parent, resource_size_t start,
+                              size_t size, unsigned long flags,
+                              unsigned long desc)
+{
+       struct resource res;
+       int type = 0; int other = 0;
+       struct resource *p;
+
+       res.start = start;
+       res.end = start + size - 1;
+
+       for (p = parent->child; p ; p = p->sibling) {
+               bool is_type = (((p->flags & flags) == flags) &&
+                               ((desc == IORES_DESC_NONE) ||
+                                (desc == p->desc)));
+
+               if (resource_overlaps(p, &res))
+                       is_type ? type++ : other++;
+       }
+
+       if (type == 0)
+               return REGION_DISJOINT;
+
+       if (other == 0)
+               return REGION_INTERSECTS;
+
+       return REGION_MIXED;
+}
+
 /**
  * region_intersects() - determine intersection of region with known resources
  * @start: region start address
@@ -521,31 +596,13 @@ EXPORT_SYMBOL_GPL(page_is_ram);
 int region_intersects(resource_size_t start, size_t size, unsigned long flags,
                      unsigned long desc)
 {
-       struct resource res;
-       int type = 0; int other = 0;
-       struct resource *p;
-
-       res.start = start;
-       res.end = start + size - 1;
+       int ret;
 
        read_lock(&resource_lock);
-       for (p = iomem_resource.child; p ; p = p->sibling) {
-               bool is_type = (((p->flags & flags) == flags) &&
-                               ((desc == IORES_DESC_NONE) ||
-                                (desc == p->desc)));
-
-               if (resource_overlaps(p, &res))
-                       is_type ? type++ : other++;
-       }
+       ret = __region_intersects(&iomem_resource, start, size, flags, desc);
        read_unlock(&resource_lock);
 
-       if (other == 0)
-               return type ? REGION_INTERSECTS : REGION_DISJOINT;
-
-       if (type)
-               return REGION_MIXED;
-
-       return REGION_DISJOINT;
+       return ret;
 }
 EXPORT_SYMBOL_GPL(region_intersects);
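region_intersects() classifies the candidate range only against the
first-level children of iomem_resource and returns one of three verdicts.  A
hedged example of how a caller might act on them (can_claim_as_ram() is an
illustrative name):

        static int can_claim_as_ram(resource_size_t start, size_t size)
        {
                switch (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
                                          IORES_DESC_NONE)) {
                case REGION_DISJOINT:
                        return 0;       /* no System RAM here, safe to claim */
                case REGION_INTERSECTS:
                        return -EBUSY;  /* entirely inside existing System RAM */
                case REGION_MIXED:
                default:
                        return -EINVAL; /* straddles a System RAM boundary */
                }
        }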
 
@@ -886,10 +943,17 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
                if (conflict->end > new->end)
                        new->end = conflict->end;
 
-               printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+               pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
        }
        write_unlock(&resource_lock);
 }
+/*
+ * Not for general consumption, only early boot memory map parsing, PCI
+ * resource discovery, and late discovery of CXL resources are expected
+ * to use this interface. The former are built-in and only the latter,
+ * CXL, is a module.
+ */
+EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL);
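Because insert_resource_expand_to_fit() is exported into the CXL symbol
namespace rather than the default one, a module that wants to link against it
has to import that namespace explicitly, e.g.:

        /* in the module source, alongside the other MODULE_* declarations */
        MODULE_IMPORT_NS(CXL);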
 
 /**
  * remove_resource - Remove a resource in the resource tree
@@ -1097,30 +1161,65 @@ resource_size_t resource_alignment(struct resource *res)
 
 static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
 
-/**
- * __request_region - create a new busy resource region
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- * @name: reserving caller's ID string
- * @flags: IO resource flags
- */
-struct resource * __request_region(struct resource *parent,
+static struct inode *iomem_inode;
+
+#ifdef CONFIG_IO_STRICT_DEVMEM
+static void revoke_iomem(struct resource *res)
+{
+       /* pairs with smp_store_release() in iomem_init_inode() */
+       struct inode *inode = smp_load_acquire(&iomem_inode);
+
+       /*
+        * Check that the initialization has completed. Losing the race
+        * is ok because it means drivers are claiming resources before
+        * the fs_initcall level of init and prevent iomem_get_mapping users
+        * from establishing mappings.
+        */
+       if (!inode)
+               return;
+
+       /*
+        * The expectation is that the driver has successfully marked
+        * the resource busy by this point, so devmem_is_allowed()
+        * should start returning false, however for performance this
+        * does not iterate the entire resource range.
+        */
+       if (devmem_is_allowed(PHYS_PFN(res->start)) &&
+           devmem_is_allowed(PHYS_PFN(res->end))) {
+               /*
+                * *cringe* iomem=relaxed says "go ahead, what's the
+                * worst that can happen?"
+                */
+               return;
+       }
+
+       unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1);
+}
+#else
+static void revoke_iomem(struct resource *res) {}
+#endif
+
+struct address_space *iomem_get_mapping(void)
+{
+       /*
+        * This function is only called from file open paths, hence guaranteed
+        * that fs_initcalls have completed and no need to check for NULL. But
+        * since revoke_iomem can be called before the initcall we still need
+        * the barrier to appease checkers.
+        */
+       return smp_load_acquire(&iomem_inode)->i_mapping;
+}
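iomem_get_mapping() exists so that every /dev/mem-style file shares a single
address_space, giving revoke_iomem() one mapping to unmap when a driver later
claims the range.  A hedged sketch of the open-time hookup, in the style of
the /dev/mem driver's open path (the function name here is illustrative):

        static int myport_open(struct inode *inode, struct file *filp)
        {
                /* all mappings of this file become revocable via revoke_iomem() */
                filp->f_mapping = iomem_get_mapping();
                return 0;
        }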
+
+static int __request_region_locked(struct resource *res, struct resource *parent,
                                   resource_size_t start, resource_size_t n,
                                   const char *name, int flags)
 {
        DECLARE_WAITQUEUE(wait, current);
-       struct resource *res = alloc_resource(GFP_KERNEL);
-
-       if (!res)
-               return NULL;
 
        res->name = name;
        res->start = start;
        res->end = start + n - 1;
 
-       write_lock(&resource_lock);
-
        for (;;) {
                struct resource *conflict;
 
@@ -1156,11 +1255,42 @@ struct resource * __request_region(struct resource *parent,
                        continue;
                }
                /* Uhhuh, that didn't work out.. */
-               free_resource(res);
-               res = NULL;
-               break;
+               return -EBUSY;
        }
+
+       return 0;
+}
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ */
+struct resource *__request_region(struct resource *parent,
+                                 resource_size_t start, resource_size_t n,
+                                 const char *name, int flags)
+{
+       struct resource *res = alloc_resource(GFP_KERNEL);
+       int ret;
+
+       if (!res)
+               return NULL;
+
+       write_lock(&resource_lock);
+       ret = __request_region_locked(res, parent, start, n, name, flags);
        write_unlock(&resource_lock);
+
+       if (ret) {
+               free_resource(res);
+               return NULL;
+       }
+
+       if (parent == &iomem_resource)
+               revoke_iomem(res);
+
        return res;
 }
 EXPORT_SYMBOL(__request_region);
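Drivers do not usually call __request_region() directly; they go through the
request_region()/request_mem_region() wrappers from <linux/ioport.h>, which
pass &ioport_resource or &iomem_resource as the parent.  A hedged sketch of
the common pattern (mydev_map() and the "mydev" name are illustrative):

        static void __iomem *mydev_map(resource_size_t start, resource_size_t len)
        {
                void __iomem *regs;

                if (!request_mem_region(start, len, "mydev"))
                        return NULL;    /* range already claimed by someone else */

                regs = ioremap(start, len);
                if (!regs)
                        release_mem_region(start, len);
                return regs;
        }

With the change above, a successful request against &iomem_resource also
revokes any existing /dev/mem mappings of the range when
CONFIG_IO_STRICT_DEVMEM is enabled.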
@@ -1208,16 +1338,13 @@ void __release_region(struct resource *parent, resource_size_t start,
 
        write_unlock(&resource_lock);
 
-       printk(KERN_WARNING "Trying to free nonexistent resource "
-               "<%016llx-%016llx>\n", (unsigned long long)start,
-               (unsigned long long)end);
+       pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end);
 }
 EXPORT_SYMBOL(__release_region);
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /**
  * release_mem_region_adjustable - release a previously reserved memory region
- * @parent: parent resource descriptor
  * @start: resource start address
  * @size: resource region size
  *
@@ -1235,21 +1362,28 @@ EXPORT_SYMBOL(__release_region);
  *   assumes that all children remain in the lower address entry for
  *   simplicity.  Enhance this logic when necessary.
  */
-int release_mem_region_adjustable(struct resource *parent,
-                                 resource_size_t start, resource_size_t size)
+void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
 {
+       struct resource *parent = &iomem_resource;
+       struct resource *new_res = NULL;
+       bool alloc_nofail = false;
        struct resource **p;
        struct resource *res;
-       struct resource *new_res;
        resource_size_t end;
-       int ret = -EINVAL;
 
        end = start + size - 1;
-       if ((start < parent->start) || (end > parent->end))
-               return ret;
+       if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
+               return;
 
-       /* The alloc_resource() result gets checked later */
-       new_res = alloc_resource(GFP_KERNEL);
+       /*
+        * We free up quite a lot of memory on memory hotunplug (esp. the
+        * memmap), just before releasing the region. This is highly unlikely
+        * to fail - let's play safe and make it never fail as the caller
+        * cannot perform any error handling (e.g., trying to re-add memory
+        * will fail similarly).
+        */
+retry:
+       new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0));
 
        p = &parent->child;
        write_lock(&resource_lock);
@@ -1264,21 +1398,6 @@ int release_mem_region_adjustable(struct resource *parent,
                        continue;
                }
 
-               /*
-                * All memory regions added from memory-hotplug path have the
-                * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
-                * this flag, we know that we are dealing with a resource coming
-                * from HMM/devm. HMM/devm use another mechanism to add/release
-                * a resource. This goes via devm_request_mem_region and
-                * devm_release_mem_region.
-                * HMM/devm take care to release their resources when they want,
-                * so if we are dealing with them, let us just back off here.
-                */
-               if (!(res->flags & IORESOURCE_SYSRAM)) {
-                       ret = 0;
-                       break;
-               }
-
                if (!(res->flags & IORESOURCE_MEM))
                        break;
 
@@ -1292,20 +1411,23 @@ int release_mem_region_adjustable(struct resource *parent,
                        /* free the whole entry */
                        *p = res->sibling;
                        free_resource(res);
-                       ret = 0;
                } else if (res->start == start && res->end != end) {
                        /* adjust the start */
-                       ret = __adjust_resource(res, end + 1,
-                                               res->end - end);
+                       WARN_ON_ONCE(__adjust_resource(res, end + 1,
+                                                      res->end - end));
                } else if (res->start != start && res->end == end) {
                        /* adjust the end */
-                       ret = __adjust_resource(res, res->start,
-                                               start - res->start);
+                       WARN_ON_ONCE(__adjust_resource(res, res->start,
+                                                      start - res->start));
                } else {
-                       /* split into two entries */
+                       /* split into two entries - we need a new resource */
                        if (!new_res) {
-                               ret = -ENOMEM;
-                               break;
+                               new_res = alloc_resource(GFP_ATOMIC);
+                               if (!new_res) {
+                                       alloc_nofail = true;
+                                       write_unlock(&resource_lock);
+                                       goto retry;
+                               }
                        }
                        new_res->name = res->name;
                        new_res->start = end + 1;
@@ -1316,9 +1438,8 @@ int release_mem_region_adjustable(struct resource *parent,
                        new_res->sibling = res->sibling;
                        new_res->child = NULL;
 
-                       ret = __adjust_resource(res, res->start,
-                                               start - res->start);
-                       if (ret)
+                       if (WARN_ON_ONCE(__adjust_resource(res, res->start,
+                                                          start - res->start)))
                                break;
                        res->sibling = new_res;
                        new_res = NULL;
@@ -1329,10 +1450,69 @@ int release_mem_region_adjustable(struct resource *parent,
 
        write_unlock(&resource_lock);
        free_resource(new_res);
-       return ret;
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
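release_mem_region_adjustable() now returns void: it WARNs internally and
falls back to a __GFP_NOFAIL allocation instead of propagating errors the
hot-remove path could not handle anyway.  The caller side therefore becomes a
plain unconditional call; a hedged sketch (start/size come from the hotunplug
request):

        /* in the memory hot-remove path, after the memory has been offlined */
        release_mem_region_adjustable(start, size);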
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool system_ram_resources_mergeable(struct resource *r1,
+                                          struct resource *r2)
+{
+       /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */
+       return r1->flags == r2->flags && r1->end + 1 == r2->start &&
+              r1->name == r2->name && r1->desc == r2->desc &&
+              !r1->child && !r2->child;
+}
+
+/**
+ * merge_system_ram_resource - mark the System RAM resource mergeable and try to
+ *     merge it with adjacent, mergeable resources
+ * @res: resource descriptor
+ *
+ * This interface is intended for memory hotplug, whereby lots of contiguous
+ * system ram resources are added (e.g., via add_memory*()) by a driver, and
+ * the actual resource boundaries are not of interest (e.g., it might be
+ * relevant for DIMMs). Only resources that are marked mergeable, that have the
+ * same parent, and that don't have any children are considered. All mergeable
+ * resources must be immutable during the request.
+ *
+ * Note:
+ * - The caller has to make sure that no pointers to resources that are
+ *   marked mergeable are used anymore after this call - the resource might
+ *   be freed and the pointer might be stale!
+ * - release_mem_region_adjustable() will split on demand on memory hotunplug
+ */
+void merge_system_ram_resource(struct resource *res)
+{
+       const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+       struct resource *cur;
+
+       if (WARN_ON_ONCE((res->flags & flags) != flags))
+               return;
+
+       write_lock(&resource_lock);
+       res->flags |= IORESOURCE_SYSRAM_MERGEABLE;
+
+       /* Try to merge with next item in the list. */
+       cur = res->sibling;
+       if (cur && system_ram_resources_mergeable(res, cur)) {
+               res->end = cur->end;
+               res->sibling = cur->sibling;
+               free_resource(cur);
+       }
+
+       /* Try to merge with previous item in the list. */
+       cur = res->parent->child;
+       while (cur && cur->sibling != res)
+               cur = cur->sibling;
+       if (cur && system_ram_resources_mergeable(cur, res)) {
+               cur->end = res->end;
+               cur->sibling = res->sibling;
+               free_resource(res);
+       }
+       write_unlock(&resource_lock);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
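merge_system_ram_resource() is meant to be invoked by the memory hotplug core
rather than by drivers directly; a driver that hot-adds many contiguous blocks
typically requests the merge by passing MHP_MERGE_RESOURCE when adding the
memory.  A hedged sketch (nid, start and size are assumed to come from the
driver, and the resource name is illustrative):

        rc = add_memory_driver_managed(nid, start, size,
                                       "System RAM (mydriver)",
                                       MHP_MERGE_RESOURCE);
        if (rc)
                return rc;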
+
 /*
  * Managed region resource
  */
@@ -1516,22 +1696,22 @@ __setup("reserve=", reserve_setup);
  */
 int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
 {
-       struct resource *p = &iomem_resource;
+       resource_size_t end = addr + size - 1;
+       struct resource *p;
        int err = 0;
-       loff_t l;
 
        read_lock(&resource_lock);
-       for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+       for_each_resource(&iomem_resource, p, false) {
                /*
                 * We can probably skip the resources without
                 * IORESOURCE_IO attribute?
                 */
-               if (p->start >= addr + size)
+               if (p->start > end)
                        continue;
                if (p->end < addr)
                        continue;
                if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
-                   PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+                   PFN_DOWN(p->end) >= PFN_DOWN(end))
                        continue;
                /*
                 * if a resource is "BUSY", it's not a hardware resource
@@ -1542,10 +1722,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
                if (p->flags & IORESOURCE_BUSY)
                        continue;
 
-               printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
-                      (unsigned long long)addr,
-                      (unsigned long long)(addr + size - 1),
-                      p->name, p);
+               pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n",
+                       &addr, &end, p->name, p);
                err = -1;
                break;
        }
@@ -1561,37 +1739,46 @@ static int strict_iomem_checks;
 #endif
 
 /*
- * check if an address is reserved in the iomem resource tree
- * returns true if reserved, false if not reserved.
+ * Check if an address is exclusive to the kernel and must not be mapped to
+ * user space, for example, via /dev/mem.
+ *
+ * Returns true if exclusive to the kernel, otherwise returns false.
  */
-bool iomem_is_exclusive(u64 addr)
+bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size)
 {
-       struct resource *p = &iomem_resource;
-       bool err = false;
-       loff_t l;
-       int size = PAGE_SIZE;
-
-       if (!strict_iomem_checks)
-               return false;
-
-       addr = addr & PAGE_MASK;
+       const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
+                                                 IORESOURCE_EXCLUSIVE;
+       bool skip_children = false, err = false;
+       struct resource *p;
 
        read_lock(&resource_lock);
-       for (p = p->child; p ; p = r_next(NULL, p, &l)) {
-               /*
-                * We can probably skip the resources without
-                * IORESOURCE_IO attribute?
-                */
+       for_each_resource(root, p, skip_children) {
                if (p->start >= addr + size)
                        break;
-               if (p->end < addr)
+               if (p->end < addr) {
+                       skip_children = true;
                        continue;
+               }
+               skip_children = false;
+
+               /*
+                * IORESOURCE_SYSTEM_RAM resources are exclusive if
+                * IORESOURCE_EXCLUSIVE is set, even if they
+                * are not busy and even if "iomem=relaxed" is set. The
+                * responsible driver dynamically adds/removes system RAM within
+                * such an area and uncontrolled access is dangerous.
+                */
+               if ((p->flags & exclusive_system_ram) == exclusive_system_ram) {
+                       err = true;
+                       break;
+               }
+
                /*
                 * A resource is exclusive if IORESOURCE_EXCLUSIVE is set
                 * or CONFIG_IO_STRICT_DEVMEM is enabled and the
                 * resource is busy.
                 */
-               if ((p->flags & IORESOURCE_BUSY) == 0)
+               if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY))
                        continue;
                if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
                                || p->flags & IORESOURCE_EXCLUSIVE) {
@@ -1604,6 +1791,12 @@ bool iomem_is_exclusive(u64 addr)
        return err;
 }
 
+bool iomem_is_exclusive(u64 addr)
+{
+       return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK,
+                                    PAGE_SIZE);
+}
+
 struct resource_entry *resource_list_create_entry(struct resource *res,
                                                  size_t extra_size)
 {
@@ -1628,6 +1821,200 @@ void resource_list_free(struct list_head *head)
 }
 EXPORT_SYMBOL(resource_list_free);
 
+#ifdef CONFIG_GET_FREE_REGION
+#define GFR_DESCENDING         (1UL << 0)
+#define GFR_REQUEST_REGION     (1UL << 1)
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+
+static resource_size_t gfr_start(struct resource *base, resource_size_t size,
+                                resource_size_t align, unsigned long flags)
+{
+       if (flags & GFR_DESCENDING) {
+               resource_size_t end;
+
+               end = min_t(resource_size_t, base->end,
+                           (1ULL << MAX_PHYSMEM_BITS) - 1);
+               return end - size + 1;
+       }
+
+       return ALIGN(base->start, align);
+}
+
+static bool gfr_continue(struct resource *base, resource_size_t addr,
+                        resource_size_t size, unsigned long flags)
+{
+       if (flags & GFR_DESCENDING)
+               return addr > size && addr >= base->start;
+       /*
+        * In the ascend case be careful that the last increment by
+        * @size did not wrap 0.
+        */
+       return addr > addr - size &&
+              addr <= min_t(resource_size_t, base->end,
+                            (1ULL << MAX_PHYSMEM_BITS) - 1);
+}
+
+static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
+                               unsigned long flags)
+{
+       if (flags & GFR_DESCENDING)
+               return addr - size;
+       return addr + size;
+}
+
+static void remove_free_mem_region(void *_res)
+{
+       struct resource *res = _res;
+
+       if (res->parent)
+               remove_resource(res);
+       free_resource(res);
+}
+
+static struct resource *
+get_free_mem_region(struct device *dev, struct resource *base,
+                   resource_size_t size, const unsigned long align,
+                   const char *name, const unsigned long desc,
+                   const unsigned long flags)
+{
+       resource_size_t addr;
+       struct resource *res;
+       struct region_devres *dr = NULL;
+
+       size = ALIGN(size, align);
+
+       res = alloc_resource(GFP_KERNEL);
+       if (!res)
+               return ERR_PTR(-ENOMEM);
+
+       if (dev && (flags & GFR_REQUEST_REGION)) {
+               dr = devres_alloc(devm_region_release,
+                               sizeof(struct region_devres), GFP_KERNEL);
+               if (!dr) {
+                       free_resource(res);
+                       return ERR_PTR(-ENOMEM);
+               }
+       } else if (dev) {
+               if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
+                       return ERR_PTR(-ENOMEM);
+       }
+
+       write_lock(&resource_lock);
+       for (addr = gfr_start(base, size, align, flags);
+            gfr_continue(base, addr, align, flags);
+            addr = gfr_next(addr, align, flags)) {
+               if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
+                   REGION_DISJOINT)
+                       continue;
+
+               if (flags & GFR_REQUEST_REGION) {
+                       if (__request_region_locked(res, &iomem_resource, addr,
+                                                   size, name, 0))
+                               break;
+
+                       if (dev) {
+                               dr->parent = &iomem_resource;
+                               dr->start = addr;
+                               dr->n = size;
+                               devres_add(dev, dr);
+                       }
+
+                       res->desc = desc;
+                       write_unlock(&resource_lock);
+
+                       /*
+                        * A driver is claiming this region so revoke any
+                        * mappings.
+                        */
+                       revoke_iomem(res);
+               } else {
+                       res->start = addr;
+                       res->end = addr + size - 1;
+                       res->name = name;
+                       res->desc = desc;
+                       res->flags = IORESOURCE_MEM;
+
+                       /*
+                        * Only succeed if the resource hosts an exclusive
+                        * range after the insert
+                        */
+                       if (__insert_resource(base, res) || res->child)
+                               break;
+
+                       write_unlock(&resource_lock);
+               }
+
+               return res;
+       }
+       write_unlock(&resource_lock);
+
+       if (flags & GFR_REQUEST_REGION) {
+               free_resource(res);
+               devres_free(dr);
+       } else if (dev)
+               devm_release_action(dev, remove_free_mem_region, res);
+
+       return ERR_PTR(-ERANGE);
+}
+
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+               struct resource *base, unsigned long size)
+{
+       unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+       return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+                                  dev_name(dev),
+                                  IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
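devm_request_free_mem_region() is what ZONE_DEVICE users call to find a chunk
of physical address space that no other resource claims, before handing it to
the pgmap machinery.  A hedged sketch of the usual pattern (the surrounding
dev_pagemap setup is abbreviated and the variable names are illustrative):

        res = devm_request_free_mem_region(dev, &iomem_resource, size);
        if (IS_ERR(res))
                return PTR_ERR(res);

        pgmap->type = MEMORY_DEVICE_PRIVATE;
        pgmap->range.start = res->start;
        pgmap->range.end = res->end;
        pgmap->nr_range = 1;
        /* followed by devm_memremap_pages(dev, pgmap) */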
+
+struct resource *request_free_mem_region(struct resource *base,
+               unsigned long size, const char *name)
+{
+       unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+       return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+                                  IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
+}
+EXPORT_SYMBOL_GPL(request_free_mem_region);
+
+/**
+ * alloc_free_mem_region - find a free region relative to @base
+ * @base: resource that will parent the new resource
+ * @size: size in bytes of memory to allocate from @base
+ * @align: alignment requirements for the allocation
+ * @name: resource name
+ *
+ * Buses like CXL, which can dynamically instantiate new memory regions,
+ * need a method to allocate physical address space for those regions.
+ * Allocate and insert a new resource to cover a free range in the span of
+ * @base that is not already claimed by any descendant of @base.
+ */
+struct resource *alloc_free_mem_region(struct resource *base,
+                                      unsigned long size, unsigned long align,
+                                      const char *name)
+{
+       /* Default of ascending direction and insert resource */
+       unsigned long flags = 0;
+
+       return get_free_mem_region(NULL, base, size, align, name,
+                                  IORES_DESC_NONE, flags);
+}
+EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL);
+#endif /* CONFIG_GET_FREE_REGION */
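Unlike the *request_free_mem_region() variants, alloc_free_mem_region()
ascends from base->start, honours the caller-supplied alignment, and inserts a
plain IORESOURCE_MEM resource under @base instead of a busy region under
iomem_resource.  A hedged sketch of a caller carving space out of a parent
window (parent_window and the region name are illustrative):

        res = alloc_free_mem_region(parent_window, size, SZ_256M, "region0");
        if (IS_ERR(res))
                return PTR_ERR(res);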
+
 static int __init strict_iomem(char *str)
 {
        if (strstr(str, "relaxed"))
@@ -1637,4 +2024,48 @@ static int __init strict_iomem(char *str)
        return 1;
 }
 
+static int iomem_fs_init_fs_context(struct fs_context *fc)
+{
+       return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type iomem_fs_type = {
+       .name           = "iomem",
+       .owner          = THIS_MODULE,
+       .init_fs_context = iomem_fs_init_fs_context,
+       .kill_sb        = kill_anon_super,
+};
+
+static int __init iomem_init_inode(void)
+{
+       static struct vfsmount *iomem_vfs_mount;
+       static int iomem_fs_cnt;
+       struct inode *inode;
+       int rc;
+
+       rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt);
+       if (rc < 0) {
+               pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc);
+               return rc;
+       }
+
+       inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+       if (IS_ERR(inode)) {
+               rc = PTR_ERR(inode);
+               pr_err("Cannot allocate inode for iomem: %d\n", rc);
+               simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt);
+               return rc;
+       }
+
+       /*
+        * Publish iomem revocation inode initialized.
+        * Pairs with smp_load_acquire() in revoke_iomem().
+        */
+       smp_store_release(&iomem_inode, inode);
+
+       return 0;
+}
+
+fs_initcall(iomem_init_inode);
+
 __setup("iomem=", strict_iomem);