#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
+#include <linux/pseudo_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/resource_ext.h>
+#include <uapi/linux/magic.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <asm/io.h>
static DEFINE_RWLOCK(resource_lock);
-/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
- */
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
-static struct resource *next_resource(struct resource *p, bool sibling_only)
+static struct resource *next_resource(struct resource *p, bool skip_children)
{
- /* Caller wants to traverse through siblings only */
- if (sibling_only)
- return p->sibling;
-
- if (p->child)
+ if (!skip_children && p->child)
return p->child;
while (!p->sibling && p->parent)
p = p->parent;
return p->sibling;
}
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct resource *p = v;
- (*pos)++;
- return (void *)next_resource(p, false);
-}
+#define for_each_resource(_root, _p, _skip_children) \
+ for ((_p) = (_root)->child; (_p); (_p) = next_resource(_p, _skip_children))
#ifdef CONFIG_PROC_FS
static void *r_start(struct seq_file *m, loff_t *pos)
__acquires(resource_lock)
{
- struct resource *p = PDE_DATA(file_inode(m->file));
- loff_t l = 0;
+ struct resource *root = pde_data(file_inode(m->file));
+ struct resource *p;
+ loff_t l = *pos;
+
read_lock(&resource_lock);
- for (p = p->child; p && l < *pos; p = r_next(m, p, &l))
- ;
+ for_each_resource(root, p, false) {
+ if (l-- == 0)
+ break;
+ }
+
return p;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+
+ (*pos)++;
+
+ return (void *)next_resource(p, false);
+}
+
static void r_stop(struct seq_file *m, void *v)
__releases(resource_lock)
{
static int r_show(struct seq_file *m, void *v)
{
- struct resource *root = PDE_DATA(file_inode(m->file));
+ struct resource *root = pde_data(file_inode(m->file));
struct resource *r = v, *p;
unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
static void free_resource(struct resource *res)
{
- if (!res)
- return;
-
- if (!PageSlab(virt_to_head_page(res))) {
- spin_lock(&bootmem_resource_lock);
- res->sibling = bootmem_resource_free;
- bootmem_resource_free = res;
- spin_unlock(&bootmem_resource_lock);
- } else {
+ /**
+ * If the resource was allocated using memblock early during boot
+ * we'll leak it here: we can only return full pages back to the
+ * buddy and trying to be smart and reusing them eventually in
+ * alloc_resource() overcomplicates resource handling.
+ */
+ if (res && PageSlab(virt_to_head_page(res)))
kfree(res);
- }
}
static struct resource *alloc_resource(gfp_t flags)
{
- struct resource *res = NULL;
-
- spin_lock(&bootmem_resource_lock);
- if (bootmem_resource_free) {
- res = bootmem_resource_free;
- bootmem_resource_free = res->sibling;
- }
- spin_unlock(&bootmem_resource_lock);
-
- if (res)
- memset(res, 0, sizeof(struct resource));
- else
- res = kzalloc(sizeof(struct resource), flags);
-
- return res;
+ return kzalloc(sizeof(struct resource), flags);
}
/* Return the conflict entry if you can't request it */
EXPORT_SYMBOL(release_resource);
/**
- * Finds the lowest iomem resource that covers part of [@start..@end]. The
- * caller must specify @start, @end, @flags, and @desc (which may be
- * IORES_DESC_NONE).
+ * find_next_iomem_res - Finds the lowest iomem resource that covers part of
+ * [@start..@end].
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
- * -1 or -EINVAL for other invalid parameters.
- *
- * This function walks the whole tree and not just first level children
- * unless @first_lvl is true.
+ * -ENODEV. Returns -EINVAL for invalid parameters.
*
* @start: start address of the resource searched for
* @end: end address of same resource
* @flags: flags which the resource must have
* @desc: descriptor the resource must have
- * @first_lvl: walk only the first level children, if set
* @res: return ptr, if resource found
+ *
+ * The caller must specify @start, @end, @flags, and @desc
+ * (which may be IORES_DESC_NONE).
*/
static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
- bool first_lvl, struct resource *res)
+ struct resource *res)
{
struct resource *p;
read_lock(&resource_lock);
- for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
- if ((p->flags & flags) != flags)
- continue;
- if ((desc != IORES_DESC_NONE) && (desc != p->desc))
- continue;
+ for_each_resource(&iomem_resource, p, false) {
+ /* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
break;
}
- if ((p->end >= start) && (p->start <= end))
- break;
+
+ /* Skip until we find a range that matches what we look for */
+ if (p->end < start)
+ continue;
+
+ if ((p->flags & flags) != flags)
+ continue;
+ if ((desc != IORES_DESC_NONE) && (desc != p->desc))
+ continue;
+
+ /* Found a match, break */
+ break;
+ }
+
+ if (p) {
+ /* copy data */
+ *res = (struct resource) {
+ .start = max(start, p->start),
+ .end = min(end, p->end),
+ .flags = p->flags,
+ .desc = p->desc,
+ .parent = p->parent,
+ };
}
read_unlock(&resource_lock);
- if (!p)
- return -1;
-
- /* copy data */
- res->start = max(start, p->start);
- res->end = min(end, p->end);
- res->flags = p->flags;
- res->desc = p->desc;
- return 0;
+ return p ? 0 : -ENODEV;
}
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
- bool first_lvl, void *arg,
+ void *arg,
int (*func)(struct resource *, void *))
{
struct resource res;
int ret = -EINVAL;
while (start < end &&
- !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
+ !find_next_iomem_res(start, end, flags, desc, &res)) {
ret = (*func)(&res, arg);
if (ret)
break;
}
/**
- * Walks through iomem resources and calls func() with matching resource
- * ranges. This walks through whole tree and not just first level children.
- * All the memory ranges which overlap start,end and also match flags and
- * desc are valid candidates.
- *
+ * walk_iomem_res_desc - Walks through iomem resources and calls func()
+ * with matching resource ranges.
+ * *
* @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
* @flags: I/O resource flags
* @start: start addr
* @arg: function argument for the callback @func
* @func: callback function that is called for each qualifying resource area
*
+ * All the memory ranges which overlap start,end and also match flags and
+ * desc are valid candidates.
+ *
* NOTE: For a new descriptor search, define a new IORES_DESC in
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
u64 end, void *arg, int (*func)(struct resource *, void *))
{
- return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
+ return __walk_iomem_res_desc(start, end, flags, desc, arg, func);
}
EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
{
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
- arg, func);
+ return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+ func);
+}
+
+/*
+ * This function, being a variant of walk_system_ram_res(), calls the @func
+ * callback against all memory ranges of type System RAM which are marked as
+ * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from
+ * higher to lower.
+ */
+int walk_system_ram_res_rev(u64 start, u64 end, void *arg,
+ int (*func)(struct resource *, void *))
+{
+ struct resource res, *rams;
+ int rams_size = 16, i;
+ unsigned long flags;
+ int ret = -1;
+
+ /* create a list */
+ rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL);
+ if (!rams)
+ return ret;
+
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ i = 0;
+ while ((start < end) &&
+ (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) {
+ if (i >= rams_size) {
+ /* re-alloc */
+ struct resource *rams_new;
+
+ rams_new = kvrealloc(rams, rams_size * sizeof(struct resource),
+ (rams_size + 16) * sizeof(struct resource),
+ GFP_KERNEL);
+ if (!rams_new)
+ goto out;
+
+ rams = rams_new;
+ rams_size += 16;
+ }
+
+ rams[i].start = res.start;
+ rams[i++].end = res.end;
+
+ start = res.end + 1;
+ }
+
+ /* go reverse */
+ for (i--; i >= 0; i--) {
+ ret = (*func)(&rams[i], arg);
+ if (ret)
+ break;
+ }
+
+out:
+ kvfree(rams);
+ return ret;
}
/*
{
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
- arg, func);
+ return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg,
+ func);
}
/*
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
* It is to be used only for System RAM.
- *
- * This will find System RAM ranges that are children of top-level resources
- * in addition to top-level System RAM resources.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
while (start < end &&
- !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
- false, &res)) {
- pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
- end_pfn = (res.end + 1) >> PAGE_SHIFT;
+ !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) {
+ pfn = PFN_UP(res.start);
+ end_pfn = PFN_DOWN(res.end + 1);
if (end_pfn > pfn)
ret = (*func)(pfn, end_pfn - pfn, arg);
if (ret)
}
EXPORT_SYMBOL_GPL(page_is_ram);
+static int __region_intersects(struct resource *parent, resource_size_t start,
+ size_t size, unsigned long flags,
+ unsigned long desc)
+{
+ struct resource res;
+ int type = 0; int other = 0;
+ struct resource *p;
+
+ res.start = start;
+ res.end = start + size - 1;
+
+ for (p = parent->child; p ; p = p->sibling) {
+ bool is_type = (((p->flags & flags) == flags) &&
+ ((desc == IORES_DESC_NONE) ||
+ (desc == p->desc)));
+
+ if (resource_overlaps(p, &res))
+ is_type ? type++ : other++;
+ }
+
+ if (type == 0)
+ return REGION_DISJOINT;
+
+ if (other == 0)
+ return REGION_INTERSECTS;
+
+ return REGION_MIXED;
+}
+
/**
* region_intersects() - determine intersection of region with known resources
* @start: region start address
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
unsigned long desc)
{
- struct resource res;
- int type = 0; int other = 0;
- struct resource *p;
-
- res.start = start;
- res.end = start + size - 1;
+ int ret;
read_lock(&resource_lock);
- for (p = iomem_resource.child; p ; p = p->sibling) {
- bool is_type = (((p->flags & flags) == flags) &&
- ((desc == IORES_DESC_NONE) ||
- (desc == p->desc)));
-
- if (resource_overlaps(p, &res))
- is_type ? type++ : other++;
- }
+ ret = __region_intersects(&iomem_resource, start, size, flags, desc);
read_unlock(&resource_lock);
- if (other == 0)
- return type ? REGION_INTERSECTS : REGION_DISJOINT;
-
- if (type)
- return REGION_MIXED;
-
- return REGION_DISJOINT;
+ return ret;
}
EXPORT_SYMBOL_GPL(region_intersects);
if (conflict->end > new->end)
new->end = conflict->end;
- printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
}
write_unlock(&resource_lock);
}
+/*
+ * Not for general consumption, only early boot memory map parsing, PCI
+ * resource discovery, and late discovery of CXL resources are expected
+ * to use this interface. The former are built-in and only the latter,
+ * CXL, is a module.
+ */
+EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL);
/**
* remove_resource - Remove a resource in the resource tree
static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
-/**
- * __request_region - create a new busy resource region
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- * @name: reserving caller's ID string
- * @flags: IO resource flags
- */
-struct resource * __request_region(struct resource *parent,
+static struct inode *iomem_inode;
+
+#ifdef CONFIG_IO_STRICT_DEVMEM
+static void revoke_iomem(struct resource *res)
+{
+ /* pairs with smp_store_release() in iomem_init_inode() */
+ struct inode *inode = smp_load_acquire(&iomem_inode);
+
+ /*
+ * Check that the initialization has completed. Losing the race
+ * is ok because it means drivers are claiming resources before
+ * the fs_initcall level of init and prevent iomem_get_mapping users
+ * from establishing mappings.
+ */
+ if (!inode)
+ return;
+
+ /*
+ * The expectation is that the driver has successfully marked
+ * the resource busy by this point, so devmem_is_allowed()
+ * should start returning false, however for performance this
+ * does not iterate the entire resource range.
+ */
+ if (devmem_is_allowed(PHYS_PFN(res->start)) &&
+ devmem_is_allowed(PHYS_PFN(res->end))) {
+ /*
+ * *cringe* iomem=relaxed says "go ahead, what's the
+ * worst that can happen?"
+ */
+ return;
+ }
+
+ unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1);
+}
+#else
+static void revoke_iomem(struct resource *res) {}
+#endif
+
+struct address_space *iomem_get_mapping(void)
+{
+ /*
+ * This function is only called from file open paths, hence guaranteed
+ * that fs_initcalls have completed and no need to check for NULL. But
+ * since revoke_iomem can be called before the initcall we still need
+ * the barrier to appease checkers.
+ */
+ return smp_load_acquire(&iomem_inode)->i_mapping;
+}
+
+static int __request_region_locked(struct resource *res, struct resource *parent,
resource_size_t start, resource_size_t n,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
- struct resource *res = alloc_resource(GFP_KERNEL);
-
- if (!res)
- return NULL;
res->name = name;
res->start = start;
res->end = start + n - 1;
- write_lock(&resource_lock);
-
for (;;) {
struct resource *conflict;
continue;
}
/* Uhhuh, that didn't work out.. */
- free_resource(res);
- res = NULL;
- break;
+ return -EBUSY;
}
+
+ return 0;
+}
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ */
+struct resource *__request_region(struct resource *parent,
+ resource_size_t start, resource_size_t n,
+ const char *name, int flags)
+{
+ struct resource *res = alloc_resource(GFP_KERNEL);
+ int ret;
+
+ if (!res)
+ return NULL;
+
+ write_lock(&resource_lock);
+ ret = __request_region_locked(res, parent, start, n, name, flags);
write_unlock(&resource_lock);
+
+ if (ret) {
+ free_resource(res);
+ return NULL;
+ }
+
+ if (parent == &iomem_resource)
+ revoke_iomem(res);
+
return res;
}
EXPORT_SYMBOL(__request_region);
write_unlock(&resource_lock);
- printk(KERN_WARNING "Trying to free nonexistent resource "
- "<%016llx-%016llx>\n", (unsigned long long)start,
- (unsigned long long)end);
+ pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end);
}
EXPORT_SYMBOL(__release_region);
#ifdef CONFIG_MEMORY_HOTREMOVE
/**
* release_mem_region_adjustable - release a previously reserved memory region
- * @parent: parent resource descriptor
* @start: resource start address
* @size: resource region size
*
* assumes that all children remain in the lower address entry for
* simplicity. Enhance this logic when necessary.
*/
-int release_mem_region_adjustable(struct resource *parent,
- resource_size_t start, resource_size_t size)
+void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
{
+ struct resource *parent = &iomem_resource;
+ struct resource *new_res = NULL;
+ bool alloc_nofail = false;
struct resource **p;
struct resource *res;
- struct resource *new_res;
resource_size_t end;
- int ret = -EINVAL;
end = start + size - 1;
- if ((start < parent->start) || (end > parent->end))
- return ret;
+ if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
+ return;
- /* The alloc_resource() result gets checked later */
- new_res = alloc_resource(GFP_KERNEL);
+ /*
+ * We free up quite a lot of memory on memory hotunplug (esp., memap),
+ * just before releasing the region. This is highly unlikely to
+ * fail - let's play save and make it never fail as the caller cannot
+ * perform any error handling (e.g., trying to re-add memory will fail
+ * similarly).
+ */
+retry:
+ new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0));
p = &parent->child;
write_lock(&resource_lock);
continue;
}
- /*
- * All memory regions added from memory-hotplug path have the
- * flag IORESOURCE_SYSTEM_RAM. If the resource does not have
- * this flag, we know that we are dealing with a resource coming
- * from HMM/devm. HMM/devm use another mechanism to add/release
- * a resource. This goes via devm_request_mem_region and
- * devm_release_mem_region.
- * HMM/devm take care to release their resources when they want,
- * so if we are dealing with them, let us just back off here.
- */
- if (!(res->flags & IORESOURCE_SYSRAM)) {
- ret = 0;
- break;
- }
-
if (!(res->flags & IORESOURCE_MEM))
break;
/* free the whole entry */
*p = res->sibling;
free_resource(res);
- ret = 0;
} else if (res->start == start && res->end != end) {
/* adjust the start */
- ret = __adjust_resource(res, end + 1,
- res->end - end);
+ WARN_ON_ONCE(__adjust_resource(res, end + 1,
+ res->end - end));
} else if (res->start != start && res->end == end) {
/* adjust the end */
- ret = __adjust_resource(res, res->start,
- start - res->start);
+ WARN_ON_ONCE(__adjust_resource(res, res->start,
+ start - res->start));
} else {
- /* split into two entries */
+ /* split into two entries - we need a new resource */
if (!new_res) {
- ret = -ENOMEM;
- break;
+ new_res = alloc_resource(GFP_ATOMIC);
+ if (!new_res) {
+ alloc_nofail = true;
+ write_unlock(&resource_lock);
+ goto retry;
+ }
}
new_res->name = res->name;
new_res->start = end + 1;
new_res->sibling = res->sibling;
new_res->child = NULL;
- ret = __adjust_resource(res, res->start,
- start - res->start);
- if (ret)
+ if (WARN_ON_ONCE(__adjust_resource(res, res->start,
+ start - res->start)))
break;
res->sibling = new_res;
new_res = NULL;
write_unlock(&resource_lock);
free_resource(new_res);
- return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool system_ram_resources_mergeable(struct resource *r1,
+ struct resource *r2)
+{
+ /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */
+ return r1->flags == r2->flags && r1->end + 1 == r2->start &&
+ r1->name == r2->name && r1->desc == r2->desc &&
+ !r1->child && !r2->child;
+}
+
+/**
+ * merge_system_ram_resource - mark the System RAM resource mergeable and try to
+ * merge it with adjacent, mergeable resources
+ * @res: resource descriptor
+ *
+ * This interface is intended for memory hotplug, whereby lots of contiguous
+ * system ram resources are added (e.g., via add_memory*()) by a driver, and
+ * the actual resource boundaries are not of interest (e.g., it might be
+ * relevant for DIMMs). Only resources that are marked mergeable, that have the
+ * same parent, and that don't have any children are considered. All mergeable
+ * resources must be immutable during the request.
+ *
+ * Note:
+ * - The caller has to make sure that no pointers to resources that are
+ * marked mergeable are used anymore after this call - the resource might
+ * be freed and the pointer might be stale!
+ * - release_mem_region_adjustable() will split on demand on memory hotunplug
+ */
+void merge_system_ram_resource(struct resource *res)
+{
+ const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ struct resource *cur;
+
+ if (WARN_ON_ONCE((res->flags & flags) != flags))
+ return;
+
+ write_lock(&resource_lock);
+ res->flags |= IORESOURCE_SYSRAM_MERGEABLE;
+
+ /* Try to merge with next item in the list. */
+ cur = res->sibling;
+ if (cur && system_ram_resources_mergeable(res, cur)) {
+ res->end = cur->end;
+ res->sibling = cur->sibling;
+ free_resource(cur);
+ }
+
+ /* Try to merge with previous item in the list. */
+ cur = res->parent->child;
+ while (cur && cur->sibling != res)
+ cur = cur->sibling;
+ if (cur && system_ram_resources_mergeable(cur, res)) {
+ cur->end = res->end;
+ cur->sibling = res->sibling;
+ free_resource(res);
+ }
+ write_unlock(&resource_lock);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
/*
* Managed region resource
*/
*/
int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
{
- struct resource *p = &iomem_resource;
+ resource_size_t end = addr + size - 1;
+ struct resource *p;
int err = 0;
- loff_t l;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+ for_each_resource(&iomem_resource, p, false) {
/*
* We can probably skip the resources without
* IORESOURCE_IO attribute?
*/
- if (p->start >= addr + size)
+ if (p->start > end)
continue;
if (p->end < addr)
continue;
if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
- PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+ PFN_DOWN(p->end) >= PFN_DOWN(end))
continue;
/*
* if a resource is "BUSY", it's not a hardware resource
if (p->flags & IORESOURCE_BUSY)
continue;
- printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
- (unsigned long long)addr,
- (unsigned long long)(addr + size - 1),
- p->name, p);
+ pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n",
+ &addr, &end, p->name, p);
err = -1;
break;
}
#endif
/*
- * check if an address is reserved in the iomem resource tree
- * returns true if reserved, false if not reserved.
+ * Check if an address is exclusive to the kernel and must not be mapped to
+ * user space, for example, via /dev/mem.
+ *
+ * Returns true if exclusive to the kernel, otherwise returns false.
*/
-bool iomem_is_exclusive(u64 addr)
+bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size)
{
- struct resource *p = &iomem_resource;
- bool err = false;
- loff_t l;
- int size = PAGE_SIZE;
-
- if (!strict_iomem_checks)
- return false;
-
- addr = addr & PAGE_MASK;
+ const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
+ IORESOURCE_EXCLUSIVE;
+ bool skip_children = false, err = false;
+ struct resource *p;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
- /*
- * We can probably skip the resources without
- * IORESOURCE_IO attribute?
- */
+ for_each_resource(root, p, skip_children) {
if (p->start >= addr + size)
break;
- if (p->end < addr)
+ if (p->end < addr) {
+ skip_children = true;
continue;
+ }
+ skip_children = false;
+
+ /*
+ * IORESOURCE_SYSTEM_RAM resources are exclusive if
+ * IORESOURCE_EXCLUSIVE is set, even if they
+ * are not busy and even if "iomem=relaxed" is set. The
+ * responsible driver dynamically adds/removes system RAM within
+ * such an area and uncontrolled access is dangerous.
+ */
+ if ((p->flags & exclusive_system_ram) == exclusive_system_ram) {
+ err = true;
+ break;
+ }
+
/*
* A resource is exclusive if IORESOURCE_EXCLUSIVE is set
* or CONFIG_IO_STRICT_DEVMEM is enabled and the
* resource is busy.
*/
- if ((p->flags & IORESOURCE_BUSY) == 0)
+ if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY))
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
return err;
}
+bool iomem_is_exclusive(u64 addr)
+{
+ return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK,
+ PAGE_SIZE);
+}
+
struct resource_entry *resource_list_create_entry(struct resource *res,
size_t extra_size)
{
}
EXPORT_SYMBOL(resource_list_free);
+#ifdef CONFIG_GET_FREE_REGION
+#define GFR_DESCENDING (1UL << 0)
+#define GFR_REQUEST_REGION (1UL << 1)
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+
+static resource_size_t gfr_start(struct resource *base, resource_size_t size,
+ resource_size_t align, unsigned long flags)
+{
+ if (flags & GFR_DESCENDING) {
+ resource_size_t end;
+
+ end = min_t(resource_size_t, base->end,
+ (1ULL << MAX_PHYSMEM_BITS) - 1);
+ return end - size + 1;
+ }
+
+ return ALIGN(base->start, align);
+}
+
+static bool gfr_continue(struct resource *base, resource_size_t addr,
+ resource_size_t size, unsigned long flags)
+{
+ if (flags & GFR_DESCENDING)
+ return addr > size && addr >= base->start;
+ /*
+ * In the ascend case be careful that the last increment by
+ * @size did not wrap 0.
+ */
+ return addr > addr - size &&
+ addr <= min_t(resource_size_t, base->end,
+ (1ULL << MAX_PHYSMEM_BITS) - 1);
+}
+
+static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
+ unsigned long flags)
+{
+ if (flags & GFR_DESCENDING)
+ return addr - size;
+ return addr + size;
+}
+
+static void remove_free_mem_region(void *_res)
+{
+ struct resource *res = _res;
+
+ if (res->parent)
+ remove_resource(res);
+ free_resource(res);
+}
+
+static struct resource *
+get_free_mem_region(struct device *dev, struct resource *base,
+ resource_size_t size, const unsigned long align,
+ const char *name, const unsigned long desc,
+ const unsigned long flags)
+{
+ resource_size_t addr;
+ struct resource *res;
+ struct region_devres *dr = NULL;
+
+ size = ALIGN(size, align);
+
+ res = alloc_resource(GFP_KERNEL);
+ if (!res)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev && (flags & GFR_REQUEST_REGION)) {
+ dr = devres_alloc(devm_region_release,
+ sizeof(struct region_devres), GFP_KERNEL);
+ if (!dr) {
+ free_resource(res);
+ return ERR_PTR(-ENOMEM);
+ }
+ } else if (dev) {
+ if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
+ return ERR_PTR(-ENOMEM);
+ }
+
+ write_lock(&resource_lock);
+ for (addr = gfr_start(base, size, align, flags);
+ gfr_continue(base, addr, align, flags);
+ addr = gfr_next(addr, align, flags)) {
+ if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
+ REGION_DISJOINT)
+ continue;
+
+ if (flags & GFR_REQUEST_REGION) {
+ if (__request_region_locked(res, &iomem_resource, addr,
+ size, name, 0))
+ break;
+
+ if (dev) {
+ dr->parent = &iomem_resource;
+ dr->start = addr;
+ dr->n = size;
+ devres_add(dev, dr);
+ }
+
+ res->desc = desc;
+ write_unlock(&resource_lock);
+
+
+ /*
+ * A driver is claiming this region so revoke any
+ * mappings.
+ */
+ revoke_iomem(res);
+ } else {
+ res->start = addr;
+ res->end = addr + size - 1;
+ res->name = name;
+ res->desc = desc;
+ res->flags = IORESOURCE_MEM;
+
+ /*
+ * Only succeed if the resource hosts an exclusive
+ * range after the insert
+ */
+ if (__insert_resource(base, res) || res->child)
+ break;
+
+ write_unlock(&resource_lock);
+ }
+
+ return res;
+ }
+ write_unlock(&resource_lock);
+
+ if (flags & GFR_REQUEST_REGION) {
+ free_resource(res);
+ devres_free(dr);
+ } else if (dev)
+ devm_release_action(dev, remove_free_mem_region, res);
+
+ return ERR_PTR(-ERANGE);
+}
+
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size)
+{
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+ dev_name(dev),
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+
+struct resource *request_free_mem_region(struct resource *base,
+ unsigned long size, const char *name)
+{
+ unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+ return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+ IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
+}
+EXPORT_SYMBOL_GPL(request_free_mem_region);
+
+/**
+ * alloc_free_mem_region - find a free region relative to @base
+ * @base: resource that will parent the new resource
+ * @size: size in bytes of memory to allocate from @base
+ * @align: alignment requirements for the allocation
+ * @name: resource name
+ *
+ * Buses like CXL, that can dynamically instantiate new memory regions,
+ * need a method to allocate physical address space for those regions.
+ * Allocate and insert a new resource to cover a free, unclaimed by a
+ * descendant of @base, range in the span of @base.
+ */
+struct resource *alloc_free_mem_region(struct resource *base,
+ unsigned long size, unsigned long align,
+ const char *name)
+{
+ /* Default of ascending direction and insert resource */
+ unsigned long flags = 0;
+
+ return get_free_mem_region(NULL, base, size, align, name,
+ IORES_DESC_NONE, flags);
+}
+EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL);
+#endif /* CONFIG_GET_FREE_REGION */
+
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
return 1;
}
+static int iomem_fs_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type iomem_fs_type = {
+ .name = "iomem",
+ .owner = THIS_MODULE,
+ .init_fs_context = iomem_fs_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int __init iomem_init_inode(void)
+{
+ static struct vfsmount *iomem_vfs_mount;
+ static int iomem_fs_cnt;
+ struct inode *inode;
+ int rc;
+
+ rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt);
+ if (rc < 0) {
+ pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc);
+ return rc;
+ }
+
+ inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb);
+ if (IS_ERR(inode)) {
+ rc = PTR_ERR(inode);
+ pr_err("Cannot allocate inode for iomem: %d\n", rc);
+ simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt);
+ return rc;
+ }
+
+ /*
+ * Publish iomem revocation inode initialized.
+ * Pairs with smp_load_acquire() in revoke_iomem().
+ */
+ smp_store_release(&iomem_inode, inode);
+
+ return 0;
+}
+
+fs_initcall(iomem_init_inode);
+
__setup("iomem=", strict_iomem);