Amount of cached filesystem data that was modified and
is currently being written back to disk
+ anon_thp
+ Amount of memory used in anonymous mappings backed by
+ transparent hugepages
+
inactive_anon, active_anon, inactive_file, active_file, unevictable
Amount of memory, swap-backed and filesystem-backed,
on the internal memory management lists used by the
Amount of reclaimed lazyfree pages
+ thp_fault_alloc
+
+ Number of transparent hugepages which were allocated to satisfy
+ a page fault, including COW faults. This counter is not present
+ when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
+ thp_collapse_alloc
+
+ Number of transparent hugepages which were allocated to allow
+ collapsing an existing range of pages. This counter is not
+ present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
memory.swap.current
A read-only single value file which exists on non-root
cgroups.
20. NOPAGE
21. KSM
22. THP
- 23. BALLOON
+ 23. OFFLINE
24. ZERO_PAGE
25. IDLE
+ 26. PGTABLE
* ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
memory cgroup each page is charged to, indexed by PFN. Only available when
identical memory pages dynamically shared between one or more processes
22 - THP
contiguous pages which construct transparent hugepages
-23 - BALLOON
- balloon compaction page
+23 - OFFLINE
+ page is logically offline
24 - ZERO_PAGE
zero page for pfn_zero or huge_zero page
25 - IDLE
Note that this flag may be stale in case the page was accessed via
a PTE. To make sure the flag is up-to-date one has to read
``/sys/kernel/mm/page_idle/bitmap`` first.
+26 - PGTABLE
+ page is in use as a page table
IO related page flags
---------------------
8. LRU
Each memcg has its own private LRU. Now, its handling is under global
- VM's control (means that it's handled under global zone_lru_lock).
+ VM's control (means that it's handled under global pgdat->lru_lock).
Almost all routines around memcg's LRU is called by global LRU's
- list management functions under zone_lru_lock().
+ list management functions under pgdat->lru_lock.
A special function is mem_cgroup_isolate_pages(). This scans
memcg's private LRU and call __isolate_lru_page() to extract a page
Other lock order is following:
PG_locked.
mm->page_table_lock
- zone_lru_lock
+ pgdat->lru_lock
lock_page_cgroup.
In many cases, just lock_page_cgroup() is called.
per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
- zone_lru_lock, it has no lock of its own.
+ pgdat->lru_lock, it has no lock of its own.
2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
F: include/uapi/linux/membarrier.h
F: arch/powerpc/include/asm/membarrier.h
+MEMBLOCK
+M: Mike Rapoport <rppt@linux.ibm.com>
+L: linux-mm@kvack.org
+S: Maintained
+F: include/linux/memblock.h
+F: mm/memblock.c
+F: Documentation/core-api/boot-time-mm.rst
+
MEMORY MANAGEMENT
L: linux-mm@kvack.org
W: http://www.linux-mm.org
#include <linux/smp.h>
#include <linux/threads.h>
+#include <linux/numa.h>
#include <asm/machvec.h>
#ifdef CONFIG_NUMA
{
int cpu;
- if (node == -1)
+ if (node == NUMA_NO_NODE)
return cpu_all_mask;
cpumask_clear(&node_to_cpumask_map[node]);
def_bool y
depends on COMPAT && SYSVIPC
+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+ def_bool y
+ depends on HUGETLB_PAGE && MIGRATION
+
menu "Power management options"
source "kernel/power/Kconfig"
#include <asm/page.h>
+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+#define arch_hugetlb_migration_supported arch_hugetlb_migration_supported
+extern bool arch_hugetlb_migration_supported(struct hstate *h);
+#endif
+
#define __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
*/
#ifdef CONFIG_KASAN
#define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT))
-#ifdef CONFIG_KASAN_EXTRA
-#define KASAN_THREAD_SHIFT 2
-#else
#define KASAN_THREAD_SHIFT 1
-#endif /* CONFIG_KASAN_EXTRA */
#else
#define KASAN_SHADOW_SIZE (0)
#define KASAN_THREAD_SHIFT 0
* but does not hold any data of loaded kernel image.
*
* Note that all the pages in crash dump kernel memory have been initially
- * marked as Reserved in kexec_reserve_crashkres_pages().
+ * marked as Reserved as memory was allocated via memblock_reserve().
*
* In hibernation, the pages which are Reserved and yet "nosave" are excluded
* from the hibernation iamge. crash_is_nosave() does thich check for crash
for (addr = begin; addr < end; addr += PAGE_SIZE) {
page = phys_to_page(addr);
- ClearPageReserved(page);
free_reserved_page(page);
}
}
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+bool arch_hugetlb_migration_supported(struct hstate *h)
+{
+ size_t pagesize = huge_page_size(h);
+
+ switch (pagesize) {
+#ifdef CONFIG_ARM64_4K_PAGES
+ case PUD_SIZE:
+#endif
+ case PMD_SIZE:
+ case CONT_PMD_SIZE:
+ case CONT_PTE_SIZE:
+ return true;
+ }
+ pr_warn("%s: unrecognized huge page size 0x%lx\n",
+ __func__, pagesize);
+ return false;
+}
+#endif
+
int pmd_huge(pmd_t pmd)
{
return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
}
-
-static void __init kexec_reserve_crashkres_pages(void)
-{
-#ifdef CONFIG_HIBERNATION
- phys_addr_t addr;
- struct page *page;
-
- if (!crashk_res.end)
- return;
-
- /*
- * To reduce the size of hibernation image, all the pages are
- * marked as Reserved initially.
- */
- for (addr = crashk_res.start; addr < (crashk_res.end + 1);
- addr += PAGE_SIZE) {
- page = phys_to_page(addr);
- SetPageReserved(page);
- }
-#endif
-}
#else
static void __init reserve_crashkernel(void)
{
}
-
-static void __init kexec_reserve_crashkres_pages(void)
-{
-}
#endif /* CONFIG_KEXEC_CORE */
#ifdef CONFIG_CRASH_DUMP
/* this will put all unused low memory onto the freelists */
memblock_free_all();
- kexec_reserve_crashkres_pages();
-
mem_init_print_info(NULL);
/*
}
/* cpumask_of_node() will now work */
- pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
/*
cpumask_clear(&node_to_cpu_mask[node]);
for_each_possible_early_cpu(cpu) {
- node = -1;
+ node = NUMA_NO_NODE;
for (i = 0; i < NR_CPUS; ++i)
if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
node = node_cpuid[i].nid;
if (task != current) put_task_struct(task);
}
-static inline void
-pfm_reserve_page(unsigned long a)
-{
- SetPageReserved(vmalloc_to_page((void *)a));
-}
-static inline void
-pfm_unreserve_page(unsigned long a)
-{
- ClearPageReserved(vmalloc_to_page((void*)a));
-}
-
static inline unsigned long
pfm_protect_ctx_ctxsw(pfm_context_t *x)
{
DPRINT(("ctx=%p msgq reset\n", ctx));
}
-static void *
-pfm_rvmalloc(unsigned long size)
-{
- void *mem;
- unsigned long addr;
-
- size = PAGE_ALIGN(size);
- mem = vzalloc(size);
- if (mem) {
- //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
- addr = (unsigned long)mem;
- while (size > 0) {
- pfm_reserve_page(addr);
- addr+=PAGE_SIZE;
- size-=PAGE_SIZE;
- }
- }
- return mem;
-}
-
-static void
-pfm_rvfree(void *mem, unsigned long size)
-{
- unsigned long addr;
-
- if (mem) {
- DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));
- addr = (unsigned long) mem;
- while ((long) size > 0) {
- pfm_unreserve_page(addr);
- addr+=PAGE_SIZE;
- size-=PAGE_SIZE;
- }
- vfree(mem);
- }
- return;
-}
-
static pfm_context_t *
pfm_context_alloc(int ctx_flags)
{
/*
* free the buffer
*/
- pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size);
+ vfree(ctx->ctx_smpl_hdr);
ctx->ctx_smpl_hdr = NULL;
ctx->ctx_smpl_size = 0UL;
* All memory free operations (especially for vmalloc'ed memory)
* MUST be done with interrupts ENABLED.
*/
- if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size);
+ vfree(smpl_buf_addr);
/*
* return the memory used by the context
/*
* We do the easy to undo allocations first.
- *
- * pfm_rvmalloc(), clears the buffer, so there is no leak
*/
- smpl_buf = pfm_rvmalloc(size);
+ smpl_buf = vzalloc(size);
if (smpl_buf == NULL) {
DPRINT(("Can't allocate sampling buffer\n"));
return -ENOMEM;
error:
vm_area_free(vma);
error_kmem:
- pfm_rvfree(smpl_buf, size);
+ vfree(smpl_buf);
return -ENOMEM;
}
* CPUs are put into groups according to node. Walk cpu_map
* and create new groups at node boundaries.
*/
- prev_node = -1;
+ prev_node = NUMA_NO_NODE;
ai->nr_groups = 0;
for (unit = 0; unit < nr_units; unit++) {
cpu = cpu_map[unit];
{
void *ptr = NULL;
u8 best = 0xff;
- int bestnode = -1, node, anynode = 0;
+ int bestnode = NUMA_NO_NODE, node, anynode = 0;
for_each_online_node(node) {
if (node_isset(node, memory_less_mask))
anynode = node;
}
- if (bestnode == -1)
+ if (bestnode == NUMA_NO_NODE)
bestnode = anynode;
ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp));
/* unreserve the page so it's possible to free that page */
- PD_PAGE(dp)->flags &= ~(1 << PG_reserved);
+ __ClearPageReserved(PD_PAGE(dp));
init_page_count(PD_PAGE(dp));
return;
unsigned long len, unsigned long pgoff,
unsigned long flags);
+extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte);
+
static inline int hstate_get_psize(struct hstate *hstate)
{
unsigned long shift;
/* hugepd entry valid bit */
#define HUGEPD_VAL_BITS (0x8000000000000000UL)
+#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
+extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+
+#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
+extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t new_pte);
#endif
BUILD_BUG();
return 0;
}
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
+void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
+ pte_t *, pte_t, pte_t);
+
+/*
+ * Returns true for a R -> RW upgrade of pte
+ */
+static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_val)
+{
+ if (!(old_val & _PAGE_READ))
+ return false;
+
+ if ((!(old_val & _PAGE_WRITE)) && (new_val & _PAGE_WRITE))
+ return true;
+
+ return false;
+}
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
pte_t entry, unsigned long address,
int psize);
+extern void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte);
+
static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
unsigned long set)
{
#include <linux/pci.h>
#include <linux/list.h>
#include <linux/ioport.h>
+#include <linux/numa.h>
struct device_node;
#ifdef CONFIG_NUMA
#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE))
#else
-#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = -1)
+#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = NUMA_NO_NODE)
#endif
#endif /* CONFIG_PPC64 */
#include <linux/export.h>
#include <linux/memblock.h>
#include <linux/sched/task.h>
+#include <linux/numa.h>
#include <asm/lppaca.h>
#include <asm/paca.h>
* which will put its paca in the right place.
*/
if (cpu == boot_cpuid) {
- nid = -1;
+ nid = NUMA_NO_NODE;
memblock_set_bottom_up(true);
} else {
nid = early_cpu_to_node(cpu);
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/vgaarb.h>
+#include <linux/numa.h>
#include <asm/processor.h>
#include <asm/io.h>
int nid = of_node_to_nid(dev);
if (nid < 0 || !node_online(nid))
- nid = -1;
+ nid = NUMA_NO_NODE;
PHB_SET_NODE(phb, nid);
}
BUG_ON(vdso32_pagelist == NULL);
for (i = 0; i < vdso32_pages; i++) {
struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
- ClearPageReserved(pg);
get_page(pg);
vdso32_pagelist[i] = pg;
}
BUG_ON(vdso64_pagelist == NULL);
for (i = 0; i < vdso64_pages; i++) {
struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
- ClearPageReserved(pg);
get_page(pg);
vdso64_pagelist[i] = pg;
}
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
+
+pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ unsigned long pte_val;
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep the pte_present true so that we don't take
+ * wrong fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep,
+ _PAGE_PRESENT, _PAGE_INVALID, 1);
+
+ return __pte(pte_val);
+}
+
+void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+
+ if (radix_enabled())
+ return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
+ old_pte, pte);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
return vm_unmapped_area(&info);
}
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid NMMU hang while relaxing access we need to flush the tlb before
+ * we set the new value.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_hugetlb_page(vma, addr);
+
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
#include <linux/sizes.h>
#include <asm/mmu_context.h>
#include <asm/pte-walk.h>
+#include <linux/mm_inline.h>
static DEFINE_MUTEX(mem_list_mutex);
atomic64_t mapped;
unsigned int pageshift;
u64 ua; /* userspace address */
- u64 entries; /* number of entries in hpas[] */
- u64 *hpas; /* vmalloc'ed */
+ u64 entries; /* number of entries in hpas/hpages[] */
+ /*
+ * in mm_iommu_get we temporarily use this to store
+ * struct page address.
+ *
+ * We need to convert ua to hpa in real mode. Make it
+ * simpler by storing physical address.
+ */
+ union {
+ struct page **hpages; /* vmalloc'ed */
+ phys_addr_t *hpas;
+ };
#define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
u64 dev_hpa; /* Device memory base address */
};
}
EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
-/*
- * Taken from alloc_migrate_target with changes to remove CMA allocations
- */
-struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
-{
- gfp_t gfp_mask = GFP_USER;
- struct page *new_page;
-
- if (PageCompound(page))
- return NULL;
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
- /*
- * We don't want the allocation to force an OOM if possibe
- */
- new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
- return new_page;
-}
-
-static int mm_iommu_move_page_from_cma(struct page *page)
-{
- int ret = 0;
- LIST_HEAD(cma_migrate_pages);
-
- /* Ignore huge pages for now */
- if (PageCompound(page))
- return -EBUSY;
-
- lru_add_drain();
- ret = isolate_lru_page(page);
- if (ret)
- return ret;
-
- list_add(&page->lru, &cma_migrate_pages);
- put_page(page); /* Drop the gup reference */
-
- ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
- NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
- if (ret) {
- if (!list_empty(&cma_migrate_pages))
- putback_movable_pages(&cma_migrate_pages);
- }
-
- return 0;
-}
-
static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
- unsigned long entries, unsigned long dev_hpa,
- struct mm_iommu_table_group_mem_t **pmem)
+ unsigned long entries, unsigned long dev_hpa,
+ struct mm_iommu_table_group_mem_t **pmem)
{
struct mm_iommu_table_group_mem_t *mem;
- long i, j, ret = 0, locked_entries = 0;
+ long i, ret, locked_entries = 0;
unsigned int pageshift;
- unsigned long flags;
- unsigned long cur_ua;
- struct page *page = NULL;
mutex_lock(&mem_list_mutex);
goto unlock_exit;
}
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
+ up_read(&mm->mmap_sem);
+ if (ret != entries) {
+ /* free the reference taken */
+ for (i = 0; i < ret; i++)
+ put_page(mem->hpages[i]);
+
+ vfree(mem->hpas);
+ kfree(mem);
+ ret = -EFAULT;
+ goto unlock_exit;
+ }
+
+ pageshift = PAGE_SHIFT;
for (i = 0; i < entries; ++i) {
- cur_ua = ua + (i << PAGE_SHIFT);
- if (1 != get_user_pages_fast(cur_ua,
- 1/* pages */, 1/* iswrite */, &page)) {
- ret = -EFAULT;
- for (j = 0; j < i; ++j)
- put_page(pfn_to_page(mem->hpas[j] >>
- PAGE_SHIFT));
- vfree(mem->hpas);
- kfree(mem);
- goto unlock_exit;
- }
+ struct page *page = mem->hpages[i];
+
/*
- * If we get a page from the CMA zone, since we are going to
- * be pinning these entries, we might as well move them out
- * of the CMA zone if possible. NOTE: faulting in + migration
- * can be expensive. Batching can be considered later
+ * Allow to use larger than 64k IOMMU pages. Only do that
+ * if we are backed by hugetlb.
*/
- if (is_migrate_cma_page(page)) {
- if (mm_iommu_move_page_from_cma(page))
- goto populate;
- if (1 != get_user_pages_fast(cur_ua,
- 1/* pages */, 1/* iswrite */,
- &page)) {
- ret = -EFAULT;
- for (j = 0; j < i; ++j)
- put_page(pfn_to_page(mem->hpas[j] >>
- PAGE_SHIFT));
- vfree(mem->hpas);
- kfree(mem);
- goto unlock_exit;
- }
- }
-populate:
- pageshift = PAGE_SHIFT;
- if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
- pte_t *pte;
+ if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
struct page *head = compound_head(page);
- unsigned int compshift = compound_order(head);
- unsigned int pteshift;
-
- local_irq_save(flags); /* disables as well */
- pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
-
- /* Double check it is still the same pinned page */
- if (pte && pte_page(*pte) == head &&
- pteshift == compshift + PAGE_SHIFT)
- pageshift = max_t(unsigned int, pteshift,
- PAGE_SHIFT);
- local_irq_restore(flags);
+
+ pageshift = compound_order(head) + PAGE_SHIFT;
}
mem->pageshift = min(mem->pageshift, pageshift);
+ /*
+ * We don't need struct page reference any more, switch
+ * to physical address.
+ */
mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
}
good_exit:
+ ret = 0;
atomic64_set(&mem->mapped, 1);
mem->used = 1;
mem->ua = ua;
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+ dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init fake_numa_create_new_node(unsigned long end_pfn,
*/
static int associativity_to_nid(const __be32 *associativity)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
if (min_common_depth == -1)
goto out;
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= MAX_NUMNODES)
- nid = -1;
+ nid = NUMA_NO_NODE;
if (nid > 0 &&
of_read_number(associativity, 1) >= distance_ref_points_depth) {
*/
static int of_node_to_nid_single(struct device_node *device)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
const __be32 *tmp;
tmp = of_get_associativity(device);
/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
of_node_get(device);
while (device) {
*/
static int numa_setup_cpu(unsigned long lcpu)
{
- int nid = -1;
+ int nid = NUMA_NO_NODE;
struct device_node *cpu;
/*
{
struct drmem_lmb *lmb;
unsigned long lmb_size;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
lmb_size = drmem_lmb_size();
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
struct device_node *memory;
- int nid = -1;
+ int nid = NUMA_NO_NODE;
for_each_node_by_type(memory, "memory") {
unsigned long start, size;
}
#endif /* CONFIG_PROC_FS */
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep)
+{
+ unsigned long pte_val;
+
+ /*
+ * Clear the _PAGE_PRESENT so that no hardware parallel update is
+ * possible. Also keep the pte_present true so that we don't take
+ * wrong fault.
+ */
+ pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
+
+ return __pte(pte_val);
+
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+ if (radix_enabled())
+ return radix__ptep_modify_prot_commit(vma, addr,
+ ptep, old_pte, pte);
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
/*
* For hash translation mode, we use the deposited table to store hash slot
* information and they are stored at PTRS_PER_PMD offset from related pmd
}
/* See ptesync comment in radix__set_pte_at */
}
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ /*
+ * To avoid NMMU hang while relaxing access we need to flush the tlb before
+ * we set the new value. We need to do this only for radix, because hash
+ * translation does flush when updating the linux pte.
+ */
+ if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+ (atomic_read(&mm->context.copros) > 0))
+ radix__flush_tlb_page(vma, addr);
+
+ set_pte_at(mm, addr, ptep, pte);
+}
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
+#include <linux/numa.h>
#include <asm/machdep.h>
#include <asm/debugfs.h>
ent = &memtrace_array[i];
/* We have onlined this chunk previously */
- if (ent->nid == -1)
+ if (ent->nid == NUMA_NO_NODE)
continue;
/* Remove from io mappings */
*/
debugfs_remove_recursive(ent->dir);
pr_info("Added trace memory back to node %d\n", ent->nid);
- ent->size = ent->start = ent->nid = -1;
+ ent->size = ent->start = ent->nid = NUMA_NO_NODE;
}
if (ret)
return ret;
struct page *pg;
pg = virt_to_page(vdso_start + (i << PAGE_SHIFT));
- ClearPageReserved(pg);
vdso_pagelist[i] = pg;
}
vdso_pagelist[i] = virt_to_page(vdso_data);
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *);
-void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t);
+pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
+void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
+ pte_t *, pte_t, pte_t);
#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
BUG_ON(vdso32_pagelist == NULL);
for (i = 0; i < vdso32_pages - 1; i++) {
struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
- ClearPageReserved(pg);
get_page(pg);
vdso32_pagelist[i] = pg;
}
BUG_ON(vdso64_pagelist == NULL);
for (i = 0; i < vdso64_pages - 1; i++) {
struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
- ClearPageReserved(pg);
get_page(pg);
vdso64_pagelist[i] = pg;
}
}
EXPORT_SYMBOL(ptep_xchg_lazy);
-pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep)
{
pgste_t pgste;
pte_t old;
int nodat;
+ struct mm_struct *mm = vma->vm_mm;
preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
return old;
}
-void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
pgste_t pgste;
+ struct mm_struct *mm = vma->vm_mm;
if (!MACHINE_HAS_NX)
pte_val(pte) &= ~_PAGE_NOEXEC;
t_entry="$3"
while [ $t_nxt -lt $t_nr ]; do
- printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}"
+ printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
t_nxt=$((t_nxt+1))
done
- printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}"
+ printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
}
grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
#include <linux/sys.h>
#include <linux/linkage.h>
-#define __SYSCALL(nr, entry, nargs) .long entry
+#define __SYSCALL(nr, entry) .long entry
.data
ENTRY(sys_call_table)
#include <asm/syscall_table.h>
#include <linux/export.h>
#include <linux/irq.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/prom.h>
#include <asm/irq.h>
struct device_node *dp = op->dev.of_node;
int err;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->pci_ops = &sun4u_pci_ops;
pbm->config_space_reg_bits = 12;
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/iommu.h>
#include <asm/irq.h>
pbm->next = pci_pbm_root;
pci_pbm_root = pbm;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->pci_ops = &sun4u_pci_ops;
pbm->config_space_reg_bits = 8;
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/numa.h>
#include <asm/upa.h>
struct device_node *dp = op->dev.of_node;
pbm->name = dp->full_name;
- pbm->numa_node = -1;
+ pbm->numa_node = NUMA_NO_NODE;
pbm->chip_type = chip_type;
pbm->chip_version = of_getintprop_default(dp, "version#", 0);
pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0);
#include <linux/interrupt.h>
#include <linux/of.h>
#include <linux/of_device.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/io.h>
op->dev.archdata.iommu = iommu;
op->dev.archdata.stc = strbuf;
- op->dev.archdata.numa_node = -1;
+ op->dev.archdata.numa_node = NUMA_NO_NODE;
reg_base = regs + SYSIO_IOMMUREG_BASE;
iommu->iommu_control = reg_base + IOMMU_CONTROL;
{
int prev_nid, new_nid;
- prev_nid = -1;
+ prev_nid = NUMA_NO_NODE;
for ( ; start < end; start += PAGE_SIZE) {
for (new_nid = 0; new_nid < num_node_masks; new_nid++) {
struct node_mem_mask *p = &node_masks[new_nid];
if ((start & p->mask) == p->match) {
- if (prev_nid == -1)
+ if (prev_nid == NUMA_NO_NODE)
prev_nid = new_nid;
break;
}
md = mdesc_grab();
count = 0;
- nid = -1;
+ nid = NUMA_NO_NODE;
mdesc_for_each_node_by_name(md, grp, "group") {
if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
nid = count;
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep)
{
pteval_t ret;
- ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep);
+ ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep);
return (pte_t) { .pte = ret };
}
-static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
+
if (sizeof(pteval_t) > sizeof(long))
/* 5 arg words */
- pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte);
+ pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte);
else
PVOP_VCALL4(mmu.ptep_modify_prot_commit,
- mm, addr, ptep, pte.pte);
+ vma, addr, ptep, pte.pte);
}
static inline void set_pte(pte_t *ptep, pte_t pte)
struct cpumask;
struct flush_tlb_info;
struct mmu_gather;
+struct vm_area_struct;
/*
* Wrapper type for pointers to code which uses the non-standard
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
- pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
+ pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep);
- void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
+ void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte);
struct paravirt_callee_save pte_val;
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/pat.h>
#include <asm/x86_init.h>
int node;
node = __pcibus_to_node(bus);
- return (node == -1) ? cpu_online_mask :
+ return (node == NUMA_NO_NODE) ? cpu_online_mask :
cpumask_of_node(node);
}
#endif
#endif
/**
- * access_ok: - Checks if a user space pointer is valid
+ * access_ok - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
* @size: Size of block to check
*
*
* Checks if a pointer to a block of memory in user space is valid.
*
- * Returns true (nonzero) if the memory block may be valid, false (zero)
- * if it is definitely invalid.
- *
* Note that, depending on architecture, this function probably just
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
+ *
+ * Return: true (nonzero) if the memory block may be valid, false (zero)
+ * if it is definitely invalid.
*/
#define access_ok(addr, size) \
({ \
__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
/**
- * get_user: - Get a simple variable from user space.
+ * get_user - Get a simple variable from user space.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
* @ptr must have pointer-to-simple-variable type, and the result of
* dereferencing @ptr must be assignable to @x without a cast.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
/*
extern void __put_user_8(void);
/**
- * put_user: - Write a simple value into user space.
+ * put_user - Write a simple value into user space.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
* @ptr must have pointer-to-simple-variable type, and @x must be assignable
* to the result of dereferencing @ptr.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
*/
#define put_user(x, ptr) \
({ \
} while (0)
/**
- * __get_user: - Get a simple variable from user space, with less checking.
+ * __get_user - Get a simple variable from user space, with less checking.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
* Caller must check the pointer with access_ok() before calling this
* function.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
/**
- * __put_user: - Write a simple value into user space, with less checking.
+ * __put_user - Write a simple value into user space, with less checking.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
* Caller must check the pointer with access_ok() before calling this
* function.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
*/
#define __put_user(x, ptr) \
#include <linux/crash_dump.h>
#include <linux/reboot.h>
#include <linux/memory.h>
+#include <linux/numa.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
}
/* Set socket -> node values: */
- lnid = -1;
+ lnid = NUMA_NO_NODE;
for_each_present_cpu(cpu) {
int nid = cpu_to_node(cpu);
int apicid, sockid;
new_hub->pnode = 0xffff;
new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
- new_hub->memory_nid = -1;
+ new_hub->memory_nid = NUMA_NO_NODE;
new_hub->nr_possible_cpus = 0;
new_hub->nr_online_cpus = 0;
}
uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
- if (uv_cpu_hub_info(cpu)->memory_nid == -1)
+ if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
/* Init memoryless node: */
unsigned long delta;
int rc;
- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n",
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
#include <linux/stackprotector.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/numa.h>
#include <asm/acpi.h>
#include <asm/desc.h>
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
- static int current_node = -1;
+ static int current_node = NUMA_NO_NODE;
int node = early_cpu_to_node(cpu);
static int width, node_width;
} while (0)
/**
- * clear_user: - Zero a block of memory in user space.
+ * clear_user - Zero a block of memory in user space.
* @to: Destination address, in user space.
* @n: Number of bytes to zero.
*
* Zero a block of memory in user space.
*
- * Returns number of bytes that could not be cleared.
+ * Return: number of bytes that could not be cleared.
* On success, this will be zero.
*/
unsigned long
EXPORT_SYMBOL(clear_user);
/**
- * __clear_user: - Zero a block of memory in user space, with less checking.
+ * __clear_user - Zero a block of memory in user space, with less checking.
* @to: Destination address, in user space.
* @n: Number of bytes to zero.
*
* Zero a block of memory in user space. Caller must check
* the specified block with access_ok() before calling this function.
*
- * Returns number of bytes that could not be cleared.
+ * Return: number of bytes that could not be cleared.
* On success, this will be zero.
*/
unsigned long
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
{
if (node >= nr_node_ids) {
printk(KERN_WARNING
- "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+ "cpumask_of_node(%d): node > nr_node_ids(%u)\n",
node, nr_node_ids);
dump_stack();
return cpu_none_mask;
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep);
+void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte);
unsigned long xen_read_cr2_direct(void);
__xen_set_pte(ptep, pteval);
}
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
- trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
+ trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep);
return *ptep;
}
-void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte)
{
struct mmu_update u;
- trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
+ trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte);
xen_mc_batch();
u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
#include <linux/export.h>
#include <linux/debugfs.h>
#include <linux/prefetch.h>
+#include <linux/numa.h>
#include "mtip32xx.h"
#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
/* Helper for selecting a node in round robin mode */
static inline int mtip_get_next_rr_node(void)
{
- static int next_node = -1;
+ static int next_node = NUMA_NO_NODE;
- if (next_node == -1) {
+ if (next_node == NUMA_NO_NODE) {
next_node = first_online_node;
return next_node;
}
unsigned long page = efficeon_private.l1_table[index];
if (page) {
efficeon_private.l1_table[index] = 0;
- ClearPageReserved(virt_to_page((char *)page));
free_page(page);
freed++;
}
efficeon_free_gatt_table(agp_bridge);
return -ENOMEM;
}
- SetPageReserved(virt_to_page((char *)page));
for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk)
clflush((char *)page+offset);
#include <linux/acpi_dma.h>
#include <linux/of_dma.h>
#include <linux/mempool.h>
+#include <linux/numa.h>
static DEFINE_MUTEX(dma_list_mutex);
static DEFINE_IDA(dma_ida);
static bool dma_chan_is_local(struct dma_chan *chan, int cpu)
{
int node = dev_to_node(chan->device->dev);
- return node == -1 || cpumask_test_cpu(cpu, cpumask_of_node(node));
+ return node == NUMA_NO_NODE ||
+ cpumask_test_cpu(cpu, cpumask_of_node(node));
}
/**
#include <linux/list.h>
-static inline int list_is_first(const struct list_head *list,
- const struct list_head *head)
-{
- return head->next == list;
-}
-
static inline void __list_del_many(struct list_head *head,
struct list_head *first)
{
/* Check if the particular page is backed and can be onlined and online it. */
static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
{
- if (!has_pfn_is_backed(has, page_to_pfn(pg)))
+ if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
+ if (!PageOffline(pg))
+ __SetPageOffline(pg);
return;
+ }
+ if (PageOffline(pg))
+ __ClearPageOffline(pg);
/* This frame is currently backed; online the page. */
__online_page_set_limits(pg);
}
}
-static void hv_online_page(struct page *pg)
+static void hv_online_page(struct page *pg, unsigned int order)
{
struct hv_hotadd_state *has;
unsigned long flags;
spin_lock_irqsave(&dm_device.ha_lock, flags);
list_for_each_entry(has, &dm_device.ha_region_list, list) {
/* The page belongs to a different HAS. */
- if ((pfn < has->start_pfn) || (pfn >= has->end_pfn))
+ if ((pfn < has->start_pfn) ||
+ (pfn + (1UL << order) > has->end_pfn))
continue;
- hv_page_online_one(has, pg);
+ hv_bring_pgs_online(has, pfn, 1UL << order);
break;
}
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
for (i = 0; i < num_pages; i++) {
pg = pfn_to_page(i + start_frame);
+ __ClearPageOffline(pg);
__free_page(pg);
dm->num_pages_ballooned--;
}
struct dm_balloon_response *bl_resp,
int alloc_unit)
{
- unsigned int i = 0;
+ unsigned int i, j;
struct page *pg;
if (num_pages < alloc_unit)
if (alloc_unit != 1)
split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
+ /* mark all pages offline */
+ for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++)
+ __SetPageOffline(pg + j);
+
bl_resp->range_count++;
bl_resp->range_array[i].finfo.start_page =
page_to_pfn(pg);
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/numa.h>
#include "hfi.h"
#include "affinity.h"
_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
mutex_unlock(&node_affinity.lock);
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
}
/*
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <linux/bitmap.h>
+#include <linux/numa.h>
#include <rdma/rdma_vt.h>
#include "hfi.h"
dd->unit = ret;
list_add(&dd->list, &hfi1_dev_list);
}
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
spin_unlock_irqrestore(&hfi1_devs_lock, flags);
idr_preload_end();
#include <linux/dmi.h>
#include <linux/slab.h>
#include <linux/iommu.h>
+#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/iommu_table.h>
int node = acpi_map_pxm_to_node(rhsa->proximity_domain);
if (!node_online(node))
- node = -1;
+ node = NUMA_NO_NODE;
drhd->iommu->node = node;
return 0;
}
iommu->msagaw = msagaw;
iommu->segment = drhd->segment;
- iommu->node = -1;
+ iommu->node = NUMA_NO_NODE;
ver = readl(iommu->reg + DMAR_VER_REG);
pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n",
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
+#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>
return NULL;
memset(domain, 0, sizeof(*domain));
- domain->nid = -1;
+ domain->nid = NUMA_NO_NODE;
domain->flags = flags;
domain->has_iotlb_device = false;
INIT_LIST_HEAD(&domain->devices);
#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/numa.h>
#include <asm/uv/uv_hub.h>
#if defined CONFIG_X86_64
#include <asm/uv/bios.h>
XPC_NOTIFY_MSG_SIZE_UV)
#define XPC_NOTIFY_IRQ_NAME "xpc_notify"
-static int xpc_mq_node = -1;
+static int xpc_mq_node = NUMA_NO_NODE;
static struct xpc_gru_mq_uv *xpc_activate_mq_uv;
static struct xpc_gru_mq_uv *xpc_notify_mq_uv;
return 1 << vmballoon_page_order(page_size);
}
+/**
+ * vmballoon_mark_page_offline() - mark a page as offline
+ * @page: pointer for the page.
+ * @page_size: the size of the page.
+ */
+static void
+vmballoon_mark_page_offline(struct page *page,
+ enum vmballoon_page_size_type page_size)
+{
+ int i;
+
+ for (i = 0; i < vmballoon_page_in_frames(page_size); i++)
+ __SetPageOffline(page + i);
+}
+
+/**
+ * vmballoon_mark_page_online() - mark a page as online
+ * @page: pointer for the page.
+ * @page_size: the size of the page.
+ */
+static void
+vmballoon_mark_page_online(struct page *page,
+ enum vmballoon_page_size_type page_size)
+{
+ int i;
+
+ for (i = 0; i < vmballoon_page_in_frames(page_size); i++)
+ __ClearPageOffline(page + i);
+}
+
/**
* vmballoon_send_get_target() - Retrieve desired balloon size from the host.
*
ctl->page_size);
if (page) {
+ vmballoon_mark_page_offline(page, ctl->page_size);
/* Success. Add the page to the list and continue. */
list_add(&page->lru, &ctl->pages);
continue;
list_for_each_entry_safe(page, tmp, page_list, lru) {
list_del(&page->lru);
+ vmballoon_mark_page_online(page, page_size);
__free_pages(page, vmballoon_page_order(page_size));
}
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/atomic.h>
+#include <linux/numa.h>
#include <scsi/fc/fc_fcoe.h>
#include <net/udp_tunnel.h>
#include <net/pkt_cls.h>
{
struct device *dev = tx_ring->dev;
int orig_node = dev_to_node(dev);
- int ring_node = -1;
+ int ring_node = NUMA_NO_NODE;
int size;
size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count;
{
struct device *dev = rx_ring->dev;
int orig_node = dev_to_node(dev);
- int ring_node = -1;
+ int ring_node = NUMA_NO_NODE;
int size;
size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
return BP_ECANCELED;
}
-static void xen_online_page(struct page *page)
+static void xen_online_page(struct page *page, unsigned int order)
{
- __online_page_set_limits(page);
+ unsigned long i, size = (1 << order);
+ unsigned long start_pfn = page_to_pfn(page);
+ struct page *p;
+ pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
mutex_lock(&balloon_mutex);
-
- __balloon_append(page);
-
+ for (i = 0; i < size; i++) {
+ p = pfn_to_page(start_pfn + i);
+ __online_page_set_limits(p);
+ __SetPageOffline(p);
+ __balloon_append(p);
+ }
mutex_unlock(&balloon_mutex);
}
xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]);
/* Relinquish the page back to the allocator. */
+ __ClearPageOffline(page);
free_reserved_page(page);
}
state = BP_EAGAIN;
break;
}
+ __SetPageOffline(page);
adjust_managed_page_count(page, -1);
xenmem_reservation_scrub_page(page);
list_add(&page->lru, &pages);
.full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+ .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
inode_lock(inode);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode);
return -EPERM;
}
void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask)
{
- unsigned int old_flags, new_flags;
-
WARN_ON_ONCE(flags & ~mask);
- do {
- old_flags = READ_ONCE(inode->i_flags);
- new_flags = (old_flags & ~mask) | flags;
- } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
- new_flags) != old_flags));
+ set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);
* to see if it supports poll (Neither 'poll' nor 'select' return
* an appropriate error code). When in doubt, set a suitable timeout value.
*/
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
+{
+ struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
+ struct kernfs_open_node *on = kn->attr.open;
+
+ poll_wait(of->file, &on->poll, wait);
+
+ if (of->event != atomic_read(&on->event))
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+
+ return DEFAULT_POLLMASK;
+}
+
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
struct kernfs_open_file *of = kernfs_of(filp);
struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
- struct kernfs_open_node *on = kn->attr.open;
+ __poll_t ret;
if (!kernfs_get_active(kn))
- goto trigger;
+ return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
- poll_wait(filp, &on->poll, wait);
+ if (kn->attr.ops->poll)
+ ret = kn->attr.ops->poll(of, wait);
+ else
+ ret = kernfs_generic_poll(of, wait);
kernfs_put_active(kn);
-
- if (of->event != atomic_read(&on->event))
- goto trigger;
-
- return DEFAULT_POLLMASK;
-
- trigger:
- return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
+ return ret;
}
static void kernfs_notify_workfn(struct work_struct *work)
return count;
}
-int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+static
+int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
- u64 start, len, trimmed, first_group, last_group, group;
+ u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
int ret, cnt;
u32 first_bit, last_bit, minlen;
struct buffer_head *main_bm_bh = NULL;
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
- struct ocfs2_trim_fs_info info, *pinfo = NULL;
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
+ trace_ocfs2_trim_mainbm(start, len, minlen);
+
+next_group:
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
}
main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
- if (start >= le32_to_cpu(main_bm->i_clusters)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- len = range->len >> osb->s_clustersize_bits;
- if (start + len > le32_to_cpu(main_bm->i_clusters))
- len = le32_to_cpu(main_bm->i_clusters) - start;
-
- trace_ocfs2_trim_fs(start, len, minlen);
-
- ocfs2_trim_fs_lock_res_init(osb);
- ret = ocfs2_trim_fs_lock(osb, NULL, 1);
- if (ret < 0) {
- if (ret != -EAGAIN) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
+ /*
+ * Do some check before trim the first group.
+ */
+ if (!group) {
+ if (start >= le32_to_cpu(main_bm->i_clusters)) {
+ ret = -EINVAL;
goto out_unlock;
}
- mlog(ML_NOTICE, "Wait for trim on device (%s) to "
- "finish, which is running from another node.\n",
- osb->dev_str);
- ret = ocfs2_trim_fs_lock(osb, &info, 0);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_trim_fs_lock_res_uninit(osb);
- goto out_unlock;
- }
+ if (start + len > le32_to_cpu(main_bm->i_clusters))
+ len = le32_to_cpu(main_bm->i_clusters) - start;
- if (info.tf_valid && info.tf_success &&
- info.tf_start == start && info.tf_len == len &&
- info.tf_minlen == minlen) {
- /* Avoid sending duplicated trim to a shared device */
- mlog(ML_NOTICE, "The same trim on device (%s) was "
- "just done from node (%u), return.\n",
- osb->dev_str, info.tf_nodenum);
- range->len = info.tf_trimlen;
- goto out_trimunlock;
- }
+ /*
+ * Determine first and last group to examine based on
+ * start and len
+ */
+ first_group = ocfs2_which_cluster_group(main_bm_inode, start);
+ if (first_group == osb->first_cluster_group_blkno)
+ first_bit = start;
+ else
+ first_bit = start - ocfs2_blocks_to_clusters(sb,
+ first_group);
+ last_group = ocfs2_which_cluster_group(main_bm_inode,
+ start + len - 1);
+ group = first_group;
}
- info.tf_nodenum = osb->node_num;
- info.tf_start = start;
- info.tf_len = len;
- info.tf_minlen = minlen;
-
- /* Determine first and last group to examine based on start and len */
- first_group = ocfs2_which_cluster_group(main_bm_inode, start);
- if (first_group == osb->first_cluster_group_blkno)
- first_bit = start;
- else
- first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
- last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
- last_bit = osb->bitmap_cpg;
-
- trimmed = 0;
- for (group = first_group; group <= last_group;) {
+ do {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
else
group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
else
group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
- }
- range->len = trimmed * sb->s_blocksize;
+ } while (0);
- info.tf_trimlen = range->len;
- info.tf_success = (ret ? 0 : 1);
- pinfo = &info;
-out_trimunlock:
- ocfs2_trim_fs_unlock(osb, pinfo);
- ocfs2_trim_fs_lock_res_uninit(osb);
out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
+ main_bm_bh = NULL;
out_mutex:
inode_unlock(main_bm_inode);
iput(main_bm_inode);
+
+ /*
+ * If all the groups trim are not done or failed, but we should release
+ * main_bm related locks for avoiding the current IO starve, then go to
+ * trim the next group
+ */
+ if (ret >= 0 && group <= last_group)
+ goto next_group;
out:
+ range->len = trimmed * sb->s_blocksize;
+ return ret;
+}
+
+int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ struct ocfs2_trim_fs_info info, *pinfo = NULL;
+
+ ocfs2_trim_fs_lock_res_init(osb);
+
+ trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
+
+ ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+ "finish, which is running from another node.\n",
+ osb->dev_str);
+ ret = ocfs2_trim_fs_lock(osb, &info, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_trim_fs_lock_res_uninit(osb);
+ return ret;
+ }
+
+ if (info.tf_valid && info.tf_success &&
+ info.tf_start == range->start &&
+ info.tf_len == range->len &&
+ info.tf_minlen == range->minlen) {
+ /* Avoid sending duplicated trim to a shared device */
+ mlog(ML_NOTICE, "The same trim on device (%s) was "
+ "just done from node (%u), return.\n",
+ osb->dev_str, info.tf_nodenum);
+ range->len = info.tf_trimlen;
+ goto out;
+ }
+ }
+
+ info.tf_nodenum = osb->node_num;
+ info.tf_start = range->start;
+ info.tf_len = range->len;
+ info.tf_minlen = range->minlen;
+
+ ret = ocfs2_trim_mainbm(sb, range);
+
+ info.tf_trimlen = range->len;
+ info.tf_success = (ret < 0 ? 0 : 1);
+ pinfo = &info;
+out:
+ ocfs2_trim_fs_unlock(osb, pinfo);
+ ocfs2_trim_fs_lock_res_uninit(osb);
return ret;
}
struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
- o2net_disconnect_node(node);
+ if (cluster->cl_nodes[node->nd_num] == node) {
+ o2net_disconnect_node(node);
- if (cluster->cl_has_local &&
- (cluster->cl_local_node == node->nd_num)) {
- cluster->cl_has_local = 0;
- cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
- o2net_stop_listening(node);
+ if (cluster->cl_has_local &&
+ (cluster->cl_local_node == node->nd_num)) {
+ cluster->cl_has_local = 0;
+ cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+ o2net_stop_listening(node);
+ }
}
/* XXX call into net to stop this node from trading messages */
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+ /* Only one trimfs thread are allowed to work at the same time. */
+ mutex_lock(&osb->obs_trim_fs_mutex);
+
ocfs2_lock_res_init_once(lockres);
ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
ocfs2_simple_drop_lockres(osb, lockres);
ocfs2_lock_res_free(lockres);
+
+ mutex_unlock(&osb->obs_trim_fs_mutex);
}
static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_lock_res osb_trim_fs_lockres;
+ struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
+
DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
/* End of trace events for fs/ocfs2/alloc.c. */
unsigned int si_blocks;
struct buffer_head **si_bh;
unsigned int si_num_slots;
- struct ocfs2_slot *si_slots;
+ struct ocfs2_slot si_slots[];
};
struct inode *inode = NULL;
struct ocfs2_slot_info *si;
- si = kzalloc(sizeof(struct ocfs2_slot_info) +
- (sizeof(struct ocfs2_slot) * osb->max_slots),
- GFP_KERNEL);
+ si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL);
if (!si) {
status = -ENOMEM;
mlog_errno(status);
si->si_extended = ocfs2_uses_extended_slot_map(osb);
si->si_num_slots = osb->max_slots;
- si->si_slots = (struct ocfs2_slot *)((char *)si +
- sizeof(struct ocfs2_slot_info));
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT);
if (ocfs2_is_hard_readonly(osb))
goto leave;
+ mutex_init(&osb->obs_trim_fs_mutex);
+
status = ocfs2_dlm_init(osb);
if (status < 0) {
mlog_errno(status);
struct page *page = buf->page;
if (page_count(page) == 1) {
- if (memcg_kmem_enabled())
- memcg_kmem_uncharge(page, 0);
+ memcg_kmem_uncharge(page, 0);
__SetPageLocked(page);
return 0;
}
#ifdef CONFIG_SECCOMP
seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
#endif
- seq_printf(m, "\nSpeculation_Store_Bypass:\t");
+ seq_puts(m, "\nSpeculation_Store_Bypass:\t");
switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) {
case -EINVAL:
- seq_printf(m, "unknown");
+ seq_puts(m, "unknown");
break;
case PR_SPEC_NOT_AFFECTED:
- seq_printf(m, "not vulnerable");
+ seq_puts(m, "not vulnerable");
break;
case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE:
- seq_printf(m, "thread force mitigated");
+ seq_puts(m, "thread force mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_DISABLE:
- seq_printf(m, "thread mitigated");
+ seq_puts(m, "thread mitigated");
break;
case PR_SPEC_PRCTL | PR_SPEC_ENABLE:
- seq_printf(m, "thread vulnerable");
+ seq_puts(m, "thread vulnerable");
break;
case PR_SPEC_DISABLE:
- seq_printf(m, "globally mitigated");
+ seq_puts(m, "globally mitigated");
break;
default:
- seq_printf(m, "vulnerable");
+ seq_puts(m, "vulnerable");
break;
}
seq_putc(m, '\n');
struct pid *pid, struct task_struct *task)
{
if (unlikely(!sched_info_on()))
- seq_printf(m, "0 0 0\n");
+ seq_puts(m, "0 0 0\n");
else
seq_printf(m, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
return d_splice_alias(inode, dentry);
}
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
+struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
{
struct task_struct *task;
unsigned tgid;
extern void pid_update_inode(struct task_struct *, struct inode *);
extern int pid_delete_dentry(const struct dentry *);
extern int proc_pid_readdir(struct file *, struct dir_context *);
-extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
+struct dentry *proc_pid_lookup(struct dentry *, unsigned int);
extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
else if (page_count(page) == 0 && is_free_buddy_page(page))
u |= 1 << KPF_BUDDY;
- if (PageBalloon(page))
- u |= 1 << KPF_BALLOON;
+ if (PageOffline(page))
+ u |= 1 << KPF_OFFLINE;
if (PageTable(page))
u |= 1 << KPF_PGTABLE;
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
- if (!proc_pid_lookup(dir, dentry, flags))
+ if (!proc_pid_lookup(dentry, flags))
return NULL;
return proc_lookup(dir, dentry, flags);
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_self_inode_operations;
d_add(self, inode);
+ ret = 0;
} else {
dput(self);
- self = ERR_PTR(-ENOMEM);
}
- } else {
- self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/self\n");
- return PTR_ERR(self);
- }
- ns->proc_self = self;
- return 0;
+ else
+ ns->proc_self = self;
+
+ return ret;
}
void __init proc_self_init(void)
#ifdef arch_idle_time
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle;
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
idle += arch_idle_time(cpu);
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait;
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
if (cpu_online(cpu) && nr_iowait_cpu(cpu))
iowait += arch_idle_time(cpu);
return iowait;
#else
-static u64 get_idle_time(int cpu)
+static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu)
{
u64 idle, idle_usecs = -1ULL;
if (idle_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
- idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+ idle = kcs->cpustat[CPUTIME_IDLE];
else
idle = idle_usecs * NSEC_PER_USEC;
return idle;
}
-static u64 get_iowait_time(int cpu)
+static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu)
{
u64 iowait, iowait_usecs = -1ULL;
if (iowait_usecs == -1ULL)
/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
- iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+ iowait = kcs->cpustat[CPUTIME_IOWAIT];
else
iowait = iowait_usecs * NSEC_PER_USEC;
getboottime64(&boottime);
for_each_possible_cpu(i) {
- user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle += get_idle_time(i);
- iowait += get_iowait_time(i);
- irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
+ user += kcs->cpustat[CPUTIME_USER];
+ nice += kcs->cpustat[CPUTIME_NICE];
+ system += kcs->cpustat[CPUTIME_SYSTEM];
+ idle += get_idle_time(kcs, i);
+ iowait += get_iowait_time(kcs, i);
+ irq += kcs->cpustat[CPUTIME_IRQ];
+ softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal += kcs->cpustat[CPUTIME_STEAL];
+ guest += kcs->cpustat[CPUTIME_GUEST];
+ guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
seq_putc(p, '\n');
for_each_online_cpu(i) {
+ struct kernel_cpustat *kcs = &kcpustat_cpu(i);
+
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
- user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
- nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
- system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
- idle = get_idle_time(i);
- iowait = get_iowait_time(i);
- irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
- softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
- steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
- guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
- guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ user = kcs->cpustat[CPUTIME_USER];
+ nice = kcs->cpustat[CPUTIME_NICE];
+ system = kcs->cpustat[CPUTIME_SYSTEM];
+ idle = get_idle_time(kcs, i);
+ iowait = get_iowait_time(kcs, i);
+ irq = kcs->cpustat[CPUTIME_IRQ];
+ softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
+ steal = kcs->cpustat[CPUTIME_STEAL];
+ guest = kcs->cpustat[CPUTIME_GUEST];
+ guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
pte_t ptent = *pte;
if (pte_present(ptent)) {
- ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
- ptent = pte_wrprotect(ptent);
+ pte_t old_pte;
+
+ old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
- ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
seq_file_path(m, file, "");
} else if (mm && is_stack(vma)) {
seq_pad(m, ' ');
- seq_printf(m, "[stack]");
+ seq_puts(m, "[stack]");
}
seq_putc(m, '\n');
struct inode *root_inode = d_inode(s->s_root);
struct pid_namespace *ns = proc_pid_ns(root_inode);
struct dentry *thread_self;
+ int ret = -ENOMEM;
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_op = &proc_thread_self_inode_operations;
d_add(thread_self, inode);
+ ret = 0;
} else {
dput(thread_self);
- thread_self = ERR_PTR(-ENOMEM);
}
- } else {
- thread_self = ERR_PTR(-ENOMEM);
}
inode_unlock(root_inode);
- if (IS_ERR(thread_self)) {
+
+ if (ret)
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
- return PTR_ERR(thread_self);
- }
- ns->proc_thread_self = thread_self;
- return 0;
+ else
+ ns->proc_thread_self = thread_self;
+
+ return ret;
}
void __init proc_thread_self_init(void)
return 0;
}
-static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm,
+static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
{
* non-present, preventing the hardware from asynchronously
* updating it.
*/
- return ptep_get_and_clear(mm, addr, ptep);
+ return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}
-static inline void __ptep_modify_prot_commit(struct mm_struct *mm,
+static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep, pte_t pte)
{
* The pte is non-present, so there's no hardware state to
* preserve.
*/
- set_pte_at(mm, addr, ptep, pte);
+ set_pte_at(vma->vm_mm, addr, ptep, pte);
}
#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
* queue the update to be done at some later time. The update must be
* actually committed before the pte lock is released, however.
*/
-static inline pte_t ptep_modify_prot_start(struct mm_struct *mm,
+static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
{
- return __ptep_modify_prot_start(mm, addr, ptep);
+ return __ptep_modify_prot_start(vma, addr, ptep);
}
/*
* Commit an update to a pte, leaving any hardware-controlled bits in
* the PTE unmodified.
*/
-static inline void ptep_modify_prot_commit(struct mm_struct *mm,
+static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr,
- pte_t *ptep, pte_t pte)
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
- __ptep_modify_prot_commit(mm, addr, ptep, pte);
+ __ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */
rcu_read_lock();
/*
- * Paired with store_release in inode_switch_wb_work_fn() and
+ * Paired with store_release in inode_switch_wbs_work_fn() and
* ensures that we see the new wb if we see cleared I_WB_SWITCH.
*/
cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
*
* Common interface definitions for making balloon pages movable by compaction.
*
- * Despite being perfectly possible to perform ballooned pages migration, they
- * make a special corner case to compaction scans because balloon pages are not
- * enlisted at any LRU list like the other pages we do compact / migrate.
+ * Balloon page migration makes use of the general non-lru movable page
+ * feature.
+ *
+ * page->private is used to reference the responsible balloon device.
+ * page->mapping is used in context of non-lru page migration to reference
+ * the address space operations for page isolation/migration/compaction.
*
* As the page isolation scanning step a compaction thread does is a lockless
* procedure (from a page standpoint), it might bring some racy situations while
* performing balloon page compaction. In order to sort out these racy scenarios
* and safely perform balloon's page compaction and migration we must, always,
- * ensure following these three simple rules:
+ * ensure following these simple rules:
*
* i. when updating a balloon's page ->mapping element, strictly do it under
* the following lock order, independently of the far superior
* +--spin_lock_irq(&b_dev_info->pages_lock);
* ... page->mapping updates here ...
*
- * ii. before isolating or dequeueing a balloon page from the balloon device
- * pages list, the page reference counter must be raised by one and the
- * extra refcount must be dropped when the page is enqueued back into
- * the balloon device page list, thus a balloon page keeps its reference
- * counter raised only while it is under our special handling;
- *
- * iii. after the lockless scan step have selected a potential balloon page for
- * isolation, re-test the PageBalloon mark and the PagePrivate flag
- * under the proper page lock, to ensure isolating a valid balloon page
- * (not yet isolated, nor under release procedure)
- *
- * iv. isolation or dequeueing procedure must clear PagePrivate flag under
- * page lock together with removing page from balloon device page list.
+ * ii. isolation or dequeueing procedure must remove the page from balloon
+ * device page list under b_dev_info->pages_lock.
*
* The functions provided by this interface are placed to help on coping with
* the aforementioned balloon page corner case, as well as to ensure the simple
static inline void balloon_page_insert(struct balloon_dev_info *balloon,
struct page *page)
{
- __SetPageBalloon(page);
+ __SetPageOffline(page);
__SetPageMovable(page, balloon->inode->i_mapping);
set_page_private(page, (unsigned long)balloon);
list_add(&page->lru, &balloon->pages);
*/
static inline void balloon_page_delete(struct page *page)
{
- __ClearPageBalloon(page);
+ __ClearPageOffline(page);
__ClearPageMovable(page);
set_page_private(page, 0);
/*
static inline void balloon_page_insert(struct balloon_dev_info *balloon,
struct page *page)
{
- __SetPageBalloon(page);
+ __SetPageOffline(page);
list_add(&page->lru, &balloon->pages);
}
static inline void balloon_page_delete(struct page *page)
{
- __ClearPageBalloon(page);
+ __ClearPageOffline(page);
list_del(&page->lru);
}
struct kernfs_ops;
struct kernfs_open_file;
struct seq_file;
+struct poll_table_struct;
#define MAX_CGROUP_TYPE_NAMELEN 32
#define MAX_CGROUP_ROOT_NAMELEN 64
ssize_t (*write)(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
+ __poll_t (*poll)(struct kernfs_open_file *of,
+ struct poll_table_struct *pt);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lock_class_key lockdep_key;
#endif
extern int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos);
extern int sysctl_extfrag_threshold;
-extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos);
extern int sysctl_compact_unevictable_allowed;
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
unsigned int order, unsigned int alloc_flags,
- const struct alloc_context *ac, enum compact_priority prio);
+ const struct alloc_context *ac, enum compact_priority prio,
+ struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags, int classzone_idx);
#endif /* CONFIG_COMPACTION */
-#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
struct node;
+#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
extern int compaction_register_node(struct node *node);
extern void compaction_unregister_node(struct node *node);
#else
static inline int dev_to_node(struct device *dev)
{
- return -1;
+ return NUMA_NO_NODE;
}
static inline void set_dev_node(struct device *dev, int node)
{
#include <linux/bitops.h>
#include <linux/jump_label.h>
+/*
+ * Return code to denote that requested number of
+ * frontswap pages are unused(moved to page cache).
+ * Used in in shmem_unuse and try_to_unuse.
+ */
+#define FRONTSWAP_PAGES_UNUSED 2
+
struct frontswap_ops {
void (*init)(unsigned); /* this swap type was just swapon'ed */
int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
* I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to
* synchronize competing switching instances and to tell
* wb stat updates to grab the i_pages lock. See
- * inode_switch_wb_work_fn() for details.
+ * inode_switch_wbs_work_fn() for details.
*
* I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper
* and work dirs among overlayfs mounts.
#define ___GFP_HIGH 0x20u
#define ___GFP_IO 0x40u
#define ___GFP_FS 0x80u
-#define ___GFP_WRITE 0x100u
-#define ___GFP_NOWARN 0x200u
-#define ___GFP_RETRY_MAYFAIL 0x400u
-#define ___GFP_NOFAIL 0x800u
-#define ___GFP_NORETRY 0x1000u
-#define ___GFP_MEMALLOC 0x2000u
-#define ___GFP_COMP 0x4000u
-#define ___GFP_ZERO 0x8000u
-#define ___GFP_NOMEMALLOC 0x10000u
-#define ___GFP_HARDWALL 0x20000u
-#define ___GFP_THISNODE 0x40000u
-#define ___GFP_ATOMIC 0x80000u
-#define ___GFP_ACCOUNT 0x100000u
-#define ___GFP_DIRECT_RECLAIM 0x200000u
-#define ___GFP_KSWAPD_RECLAIM 0x400000u
+#define ___GFP_ZERO 0x100u
+#define ___GFP_ATOMIC 0x200u
+#define ___GFP_DIRECT_RECLAIM 0x400u
+#define ___GFP_KSWAPD_RECLAIM 0x800u
+#define ___GFP_WRITE 0x1000u
+#define ___GFP_NOWARN 0x2000u
+#define ___GFP_RETRY_MAYFAIL 0x4000u
+#define ___GFP_NOFAIL 0x8000u
+#define ___GFP_NORETRY 0x10000u
+#define ___GFP_MEMALLOC 0x20000u
+#define ___GFP_COMP 0x40000u
+#define ___GFP_NOMEMALLOC 0x80000u
+#define ___GFP_HARDWALL 0x100000u
+#define ___GFP_THISNODE 0x200000u
+#define ___GFP_ACCOUNT 0x400000u
#ifdef CONFIG_LOCKDEP
#define ___GFP_NOLOCKDEP 0x800000u
#else
nodemask_t *nmask);
struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address);
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask);
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx);
extern int dissolve_free_huge_page(struct page *page);
extern int dissolve_free_huge_pages(unsigned long start_pfn,
unsigned long end_pfn);
-static inline bool hugepage_migration_supported(struct hstate *h)
-{
+
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+#ifndef arch_hugetlb_migration_supported
+static inline bool arch_hugetlb_migration_supported(struct hstate *h)
+{
if ((huge_page_shift(h) == PMD_SHIFT) ||
- (huge_page_shift(h) == PGDIR_SHIFT))
+ (huge_page_shift(h) == PUD_SHIFT) ||
+ (huge_page_shift(h) == PGDIR_SHIFT))
return true;
else
return false;
+}
+#endif
#else
+static inline bool arch_hugetlb_migration_supported(struct hstate *h)
+{
return false;
+}
#endif
+
+static inline bool hugepage_migration_supported(struct hstate *h)
+{
+ return arch_hugetlb_migration_supported(h);
+}
+
+/*
+ * Movability check is different as compared to migration check.
+ * It determines whether or not a huge page should be placed on
+ * movable zone or not. Movability of any huge page should be
+ * required only if huge page size is supported for migration.
+ * There wont be any reason for the huge page to be movable if
+ * it is not migratable to start with. Also the size of the huge
+ * page should be large enough to be placed under a movable zone
+ * and still feasible enough to be migratable. Just the presence
+ * in movable zone does not make the migration feasible.
+ *
+ * So even though large huge page sizes like the gigantic ones
+ * are migratable they should not be movable because its not
+ * feasible to migrate them from movable zone.
+ */
+static inline bool hugepage_movable_supported(struct hstate *h)
+{
+ if (!hugepage_migration_supported(h))
+ return false;
+
+ if (hstate_is_gigantic(h))
+ return false;
+ return true;
}
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
set_huge_pte_at(mm, addr, ptep, pte);
}
#endif
+
+#ifndef huge_ptep_modify_prot_start
+#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
+static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
+}
+#endif
+
+#ifndef huge_ptep_modify_prot_commit
+#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
+static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t old_pte, pte_t pte)
+{
+ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+#endif
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
#define alloc_huge_page(v, a, r) NULL
return false;
}
+static inline bool hugepage_movable_supported(struct hstate *h)
+{
+ return false;
+}
+
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
struct mm_struct *mm, pte_t *pte)
{
#ifndef _LINUX_KASAN_CHECKS_H
#define _LINUX_KASAN_CHECKS_H
-#ifdef CONFIG_KASAN
+#if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL)
void kasan_check_read(const volatile void *p, unsigned int size);
void kasan_check_write(const volatile void *p, unsigned int size);
#else
struct vm_area_struct;
struct super_block;
struct file_system_type;
+struct poll_table_struct;
struct kernfs_open_node;
struct kernfs_iattrs;
ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
loff_t off);
+ __poll_t (*poll)(struct kernfs_open_file *of,
+ struct poll_table_struct *pt);
+
int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns);
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
+__poll_t kernfs_generic_poll(struct kernfs_open_file *of,
+ struct poll_table_struct *pt);
void kernfs_notify(struct kernfs_node *kn);
const void *kernfs_super_ns(struct super_block *sb);
void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma, unsigned long address);
#else /* !CONFIG_KSM */
static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
}
+static inline bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ return false;
+}
#endif /* CONFIG_MMU */
#endif /* !CONFIG_KSM */
head->prev = last;
}
+/**
+ * list_is_first -- tests whether @ list is the first entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_first(const struct list_head *list,
+ const struct list_head *head)
+{
+ return list->prev == head;
+}
+
/**
* list_is_last - tests whether @list is the last entry in list @head
* @list: the entry to test
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);
+static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
+{
+ return mem_cgroup_from_css(seq_css(m));
+}
+
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
struct mem_cgroup_per_node *mz;
return NULL;
}
+static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
+{
+ return NULL;
+}
+
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
return NULL;
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
void memcg_kmem_put_cache(struct kmem_cache *cachep);
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
- struct mem_cgroup *memcg);
#ifdef CONFIG_MEMCG_KMEM
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
-void memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
+void __memcg_kmem_uncharge(struct page *page, int order);
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+ struct mem_cgroup *memcg);
extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
return static_branch_unlikely(&memcg_kmem_enabled_key);
}
+static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ if (memcg_kmem_enabled())
+ return __memcg_kmem_charge(page, gfp, order);
+ return 0;
+}
+
+static inline void memcg_kmem_uncharge(struct page *page, int order)
+{
+ if (memcg_kmem_enabled())
+ __memcg_kmem_uncharge(page, order);
+}
+
+static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp,
+ int order, struct mem_cgroup *memcg)
+{
+ if (memcg_kmem_enabled())
+ return __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ return 0;
+}
/*
* helper for accessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
{
}
+static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void __memcg_kmem_uncharge(struct page *page, int order)
+{
+}
+
#define for_each_memcg_cache_index(_idx) \
for (; NULL; )
unsigned long *valid_start, unsigned long *valid_end);
extern void __offline_isolated_pages(unsigned long, unsigned long);
-typedef void (*online_page_callback_t)(struct page *page);
+typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
extern int set_online_page_callback(online_page_callback_t callback);
extern int restore_online_page_callback(online_page_callback_t callback);
unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
-#ifdef CONFIG_FS_DAX
+
+#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA)
long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas);
struct { /* Page cache and anonymous pages */
/**
* @lru: Pageout list, eg. active_list protected by
- * zone_lru_lock. Sometimes used as a generic list
+ * pgdat->lru_lock. Sometimes used as a generic list
* by the page owner.
*/
struct list_head lru;
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
+ unsigned long compact_init_migrate_pfn;
+ unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
-static inline spinlock_t *zone_lru_lock(struct zone *zone)
-{
- return &zone->zone_pgdat->lru_lock;
-}
static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
{
/*
* If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
- * need to check pfn validility within that MAX_ORDER_NR_PAGES block.
+ * need to check pfn validity within that MAX_ORDER_NR_PAGES block.
* pfn_valid_within() should be used in this case; we optimise this away
* when we have no holes within a MAX_ORDER_NR_PAGES block.
*/
return next_node(nid, node_states[N_MEMORY]);
}
-extern int nr_node_ids;
-extern int nr_online_nodes;
+extern unsigned int nr_node_ids;
+extern unsigned int nr_online_nodes;
static inline void node_set_online(int nid)
{
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
-#define nr_node_ids 1
-#define nr_online_nodes 1
+#define nr_node_ids 1U
+#define nr_online_nodes 1U
#define node_set_online(node) node_set_state((node), N_ONLINE)
#define node_set_offline(node) node_clear_state((node), N_ONLINE)
/*
* Various page->flags bits:
*
- * PG_reserved is set for special pages, which can never be swapped out. Some
- * of them might not even exist...
+ * PG_reserved is set for special pages. The "struct page" of such a page
+ * should in general not be touched (e.g. set dirty) except by its owner.
+ * Pages marked as PG_reserved include:
+ * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
+ * initrd, HW tables)
+ * - Pages reserved or allocated early during boot (before the page allocator
+ * was initialized). This includes (depending on the architecture) the
+ * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
+ * much more. Once (if ever) freed, PG_reserved is cleared and they will
+ * be given to the page allocator.
+ * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
+ * to read/write these pages might end badly. Don't touch!
+ * - The zero page(s)
+ * - Pages not added to the page allocator when onlining a section because
+ * they were excluded via the online_page_callback() or because they are
+ * PG_hwpoison.
+ * - Pages allocated in the context of kexec/kdump (loaded kernel image,
+ * control pages, vmcoreinfo)
+ * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
+ * not marked PG_reserved (as they might be in use by somebody else who does
+ * not respect the caching strategy).
+ * - Pages part of an offline section (struct pages of offline sections should
+ * not be trusted as they will be initialized when first onlined).
+ * - MCA pages on ia64
+ * - Pages holding CPU notes for POWER Firmware Assisted Dump
+ * - Device memory (e.g. PMEM, DAX, HMM)
+ * Some PG_reserved pages will be excluded from the hibernation image.
+ * PG_reserved does in general not hinder anybody from dumping or swapping
+ * and is no longer required for remap_pfn_range(). ioremap might require it.
+ * Consequently, PG_reserved for a page mapped into user space can indicate
+ * the zero page, the vDSO, MMIO pages or device memory.
*
* The PG_private bitflag is set on pagecache pages if they contain filesystem
* specific data (which is normally at page->private). It can be used by
/* Reserve 0x0000007f to catch underflows of page_mapcount */
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
-#define PG_balloon 0x00000100
+#define PG_offline 0x00000100
#define PG_kmemcg 0x00000200
#define PG_table 0x00000400
PAGE_TYPE_OPS(Buddy, buddy)
/*
- * PageBalloon() is true for pages that are on the balloon page list
- * (see mm/balloon_compaction.c).
+ * PageOffline() indicates that the page is logically offline although the
+ * containing section is online. (e.g. inflated in a balloon driver or
+ * not onlined when onlining the section).
+ * The content of these pages is effectively stale. Such pages should not
+ * be touched (read/write/dump/save) except by their owner.
*/
-PAGE_TYPE_OPS(Balloon, balloon)
+PAGE_TYPE_OPS(Offline, offline)
/*
* If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
* will find the page or it will not. Likewise, the old find_get_page could run
* either before the insertion or afterwards, depending on timing.
*/
-static inline int page_cache_get_speculative(struct page *page)
+static inline int __page_cache_add_speculative(struct page *page, int count)
{
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
* SMP requires.
*/
VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_inc(page);
+ page_ref_add(page, count);
#else
- if (unlikely(!get_page_unless_zero(page))) {
+ if (unlikely(!page_ref_add_unless(page, count, 0))) {
/*
* Either the page has been freed, or will be freed.
* In either case, retry here and the caller should
return 1;
}
-/*
- * Same as above, but add instead of inc (could just be merged)
- */
-static inline int page_cache_add_speculative(struct page *page, int count)
+static inline int page_cache_get_speculative(struct page *page)
{
- VM_BUG_ON(in_interrupt());
-
-#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
-# ifdef CONFIG_PREEMPT_COUNT
- VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
- VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_add(page, count);
-
-#else
- if (unlikely(!page_ref_add_unless(page, count, 0)))
- return 0;
-#endif
- VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page);
+ return __page_cache_add_speculative(page, 1);
+}
- return 1;
+static inline int page_cache_add_speculative(struct page *page, int count)
+{
+ return __page_cache_add_speculative(page, count);
}
#ifdef CONFIG_NUMA
*/
#define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA)
-/********** mm/debug-pagealloc.c **********/
+/********** mm/page_poison.c **********/
#ifdef CONFIG_PAGE_POISONING_ZERO
#define PAGE_POISON 0x00
#else
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
+struct capture_control;
struct robust_list_head;
struct sched_attr;
struct sched_param;
struct io_context *io_context;
+#ifdef CONFIG_COMPACTION
+ struct capture_control *capture_control;
+#endif
/* Ptrace state: */
unsigned long ptrace_message;
kernel_siginfo_t *last_siginfo;
#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
+#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
* Applies per-task gfp context to the given allocation flags.
* PF_MEMALLOC_NOIO implies GFP_NOIO
* PF_MEMALLOC_NOFS implies GFP_NOFS
+ * PF_MEMALLOC_NOCMA implies no allocation from CMA region.
*/
static inline gfp_t current_gfp_context(gfp_t flags)
{
- /*
- * NOIO implies both NOIO and NOFS and it is a weaker context
- * so always make sure it makes precedence
- */
- if (unlikely(current->flags & PF_MEMALLOC_NOIO))
- flags &= ~(__GFP_IO | __GFP_FS);
- else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
- flags &= ~__GFP_FS;
+ if (unlikely(current->flags &
+ (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) {
+ /*
+ * NOIO implies both NOIO and NOFS and it is a weaker context
+ * so always make sure it makes precedence
+ */
+ if (current->flags & PF_MEMALLOC_NOIO)
+ flags &= ~(__GFP_IO | __GFP_FS);
+ else if (current->flags & PF_MEMALLOC_NOFS)
+ flags &= ~__GFP_FS;
+#ifdef CONFIG_CMA
+ if (current->flags & PF_MEMALLOC_NOCMA)
+ flags &= ~__GFP_MOVABLE;
+#endif
+ }
return flags;
}
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
+#ifdef CONFIG_CMA
+static inline unsigned int memalloc_nocma_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC_NOCMA;
+
+ current->flags |= PF_MEMALLOC_NOCMA;
+ return flags;
+}
+
+static inline void memalloc_nocma_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags;
+}
+#else
+static inline unsigned int memalloc_nocma_save(void)
+{
+ return 0;
+}
+
+static inline void memalloc_nocma_restore(unsigned int flags)
+{
+}
+#endif
+
#ifdef CONFIG_MEMCG
/**
* memalloc_use_memcg - Starts the remote memcg charging scope.
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
-extern int shmem_unuse(swp_entry_t entry, struct page *page);
+extern int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse);
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
*/
struct kmem_cache {
struct kmem_cache_cpu __percpu *cpu_slab;
- /* Used for retriving partial slabs etc */
+ /* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
unsigned long min_partial;
- unsigned int size; /* The size of an object including meta data */
- unsigned int object_size;/* The size of an object without meta data */
- unsigned int offset; /* Free pointer offset. */
+ unsigned int size; /* The size of an object including metadata */
+ unsigned int object_size;/* The size of an object without metadata */
+ unsigned int offset; /* Free pointer offset */
#ifdef CONFIG_SLUB_CPU_PARTIAL
/* Number of per cpu partial objects to keep around */
unsigned int cpu_partial;
#endif
#ifdef CONFIG_MEMCG
struct memcg_cache_params memcg_params;
- /* for propagation, maximum size of a stored attr */
+ /* For propagation, maximum size of a stored attr */
unsigned int max_attr_size;
#ifdef CONFIG_SYSFS
struct kset *memcg_kset;
#else
#define slub_cpu_partial(s) (0)
#define slub_set_cpu_partial(s, n)
-#endif // CONFIG_SLUB_CPU_PARTIAL
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
#ifdef CONFIG_SYSFS
#define SLAB_SUPPORTS_SYSFS
};
/* linux/mm/workingset.c */
-void *workingset_eviction(struct address_space *mapping, struct page *page);
+void *workingset_eviction(struct page *page);
void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);
return vm_swappiness;
/* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
+ if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
return vm_swappiness;
return memcg->swappiness;
#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
#define F_SEAL_GROW 0x0004 /* prevent file from growing */
#define F_SEAL_WRITE 0x0008 /* prevent writes */
+#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
/* (1U << 31) is reserved for signed error codes */
/*
#define KPF_KSM 21
#define KPF_THP 22
-#define KPF_BALLOON 23
+#define KPF_OFFLINE 23
#define KPF_ZERO_PAGE 24
#define KPF_IDLE 25
#define KPF_PGTABLE 26
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/audit.h>
+#include <linux/numa.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
- .numa_preferred_nid = -1,
+ .numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
#endif
return ret ?: nbytes;
}
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->poll)
+ return cft->poll(of, pt);
+
+ return kernfs_generic_poll(of, pt);
+}
+
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
arch_crash_save_vmcoreinfo();
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/numa.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = -1;
+ int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
- if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
- PageReserved(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page) || PageOffline(page))
return NULL;
if (page_is_guard(page))
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
+ if (PageOffline(page))
+ return NULL;
+
if (PageReserved(page)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
init_numa_balancing(clone_flags, p);
}
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = -1;
+ p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1;
+ int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
return;
}
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = sysctl_extfrag_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
config FRAME_WARN
int "Warn for stack frames larger than (needs gcc 4.4)"
range 0 8192
- default 3072 if KASAN_EXTRA
default 2048 if GCC_PLUGIN_LATENT_ENTROPY
default 1280 if (!64BIT && PARISC)
default 1024 if (!64BIT && !PARISC)
you really need it, and what the merge plan to the mainline kernel for
your module is.
-config PAGE_OWNER
- bool "Track page owner"
- depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
- select DEBUG_FS
- select STACKTRACE
- select STACKDEPOT
- select PAGE_EXTENSION
- help
- This keeps track of what call chain is the owner of a page, may
- help to find bare alloc_page(s) leaks. Even if you include this
- feature on your build, it is disabled in default. You should pass
- "page_owner=on" to boot parameter in order to enable it. Eats
- a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
- for user-space helper.
-
- If unsure, say N.
-
config DEBUG_FS
bool "Debug Filesystem"
help
If unsure, say N.
+config TEST_VMALLOC
+ tristate "Test module for stress/performance analysis of vmalloc allocator"
+ default n
+ depends on MMU
+ depends on m
+ help
+ This builds the "test_vmalloc" module that should be used for
+ stress and performance analysis. So, any new change for vmalloc
+ subsystem can be evaluated from performance and stability point
+ of view.
+
+ If unsure, say N.
+
config TEST_USER_COPY
tristate "Test user/kernel boundary protections"
depends on m
endchoice
-config KASAN_EXTRA
- bool "KASAN: extra checks"
- depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST
- help
- This enables further checks in generic KASAN, for now it only
- includes the address-use-after-scope check that can lead to
- excessive kernel stack usage, frame size warnings and longer
- compile time.
- See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715
-
choice
prompt "Instrumentation type"
depends on KASAN
obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o
obj-$(CONFIG_TEST_LKM) += test_module.o
+obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o
obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o
obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
obj-$(CONFIG_TEST_SORT) += test_sort.o
#include <linux/cpumask.h>
#include <linux/export.h>
#include <linux/memblock.h>
+#include <linux/numa.h>
/**
* cpumask_next - get the next cpu in a cpumask
/* Wrap: we always want a cpu. */
i %= num_online_cpus();
- if (node == -1) {
+ if (node == NUMA_NO_NODE) {
for_each_cpu(cpu, cpu_online_mask)
if (i-- == 0)
return cpu;
kfree(kmem);
}
-static noinline void __init use_after_scope_test(void)
-{
- volatile char *volatile p;
-
- pr_info("use-after-scope on int\n");
- {
- int local = 0;
-
- p = (char *)&local;
- }
- p[0] = 1;
- p[3] = 1;
-
- pr_info("use-after-scope on array\n");
- {
- char local[1024] = {0};
-
- p = local;
- }
- p[0] = 1;
- p[1023] = 1;
-}
-
static noinline void __init kasan_alloca_oob_left(void)
{
volatile int i = 10;
kasan_alloca_oob_right();
ksize_unpoisons_memory();
copy_user_test();
- use_after_scope_test();
kmem_cache_double_free();
kmem_cache_invalid_free();
kasan_memchr();
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test module for stress and analyze performance of vmalloc allocator.
+ * (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/kthread.h>
+#include <linux/moduleparam.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/rwsem.h>
+#include <linux/mm.h>
+
+#define __param(type, name, init, msg) \
+ static type name = init; \
+ module_param(name, type, 0444); \
+ MODULE_PARM_DESC(name, msg) \
+
+__param(bool, single_cpu_test, false,
+ "Use single first online CPU to run tests");
+
+__param(bool, sequential_test_order, false,
+ "Use sequential stress tests order");
+
+__param(int, test_repeat_count, 1,
+ "Set test repeat counter");
+
+__param(int, test_loop_count, 1000000,
+ "Set test loop counter");
+
+__param(int, run_test_mask, INT_MAX,
+ "Set tests specified in the mask.\n\n"
+ "\t\tid: 1, name: fix_size_alloc_test\n"
+ "\t\tid: 2, name: full_fit_alloc_test\n"
+ "\t\tid: 4, name: long_busy_list_alloc_test\n"
+ "\t\tid: 8, name: random_size_alloc_test\n"
+ "\t\tid: 16, name: fix_align_alloc_test\n"
+ "\t\tid: 32, name: random_size_align_alloc_test\n"
+ "\t\tid: 64, name: align_shift_alloc_test\n"
+ "\t\tid: 128, name: pcpu_alloc_test\n"
+ /* Add a new test case description here. */
+);
+
+/*
+ * Depends on single_cpu_test parameter. If it is true, then
+ * use first online CPU to trigger a test on, otherwise go with
+ * all online CPUs.
+ */
+static cpumask_t cpus_run_test_mask = CPU_MASK_NONE;
+
+/*
+ * Read write semaphore for synchronization of setup
+ * phase that is done in main thread and workers.
+ */
+static DECLARE_RWSEM(prepare_for_test_rwsem);
+
+/*
+ * Completion tracking for worker threads.
+ */
+static DECLARE_COMPLETION(test_all_done_comp);
+static atomic_t test_n_undone = ATOMIC_INIT(0);
+
+static inline void
+test_report_one_done(void)
+{
+ if (atomic_dec_and_test(&test_n_undone))
+ complete(&test_all_done_comp);
+}
+
+static int random_size_align_alloc_test(void)
+{
+ unsigned long size, align, rnd;
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ get_random_bytes(&rnd, sizeof(rnd));
+
+ /*
+ * Maximum 1024 pages, if PAGE_SIZE is 4096.
+ */
+ align = 1 << (rnd % 23);
+
+ /*
+ * Maximum 10 pages.
+ */
+ size = ((rnd % 10) + 1) * PAGE_SIZE;
+
+ ptr = __vmalloc_node_range(size, align,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+/*
+ * This test case is supposed to be failed.
+ */
+static int align_shift_alloc_test(void)
+{
+ unsigned long align;
+ void *ptr;
+ int i;
+
+ for (i = 0; i < BITS_PER_LONG; i++) {
+ align = ((unsigned long) 1) << i;
+
+ ptr = __vmalloc_node_range(PAGE_SIZE, align,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int fix_align_alloc_test(void)
+{
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr = __vmalloc_node_range(5 * PAGE_SIZE,
+ THREAD_ALIGN << 1,
+ VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO,
+ PAGE_KERNEL,
+ 0, 0, __builtin_return_address(0));
+
+ if (!ptr)
+ return -1;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int random_size_alloc_test(void)
+{
+ unsigned int n;
+ void *p;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ get_random_bytes(&n, sizeof(i));
+ n = (n % 100) + 1;
+
+ p = vmalloc(n * PAGE_SIZE);
+
+ if (!p)
+ return -1;
+
+ *((__u8 *)p) = 1;
+ vfree(p);
+ }
+
+ return 0;
+}
+
+static int long_busy_list_alloc_test(void)
+{
+ void *ptr_1, *ptr_2;
+ void **ptr;
+ int rv = -1;
+ int i;
+
+ ptr = vmalloc(sizeof(void *) * 15000);
+ if (!ptr)
+ return rv;
+
+ for (i = 0; i < 15000; i++)
+ ptr[i] = vmalloc(1 * PAGE_SIZE);
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr_1 = vmalloc(100 * PAGE_SIZE);
+ if (!ptr_1)
+ goto leave;
+
+ ptr_2 = vmalloc(1 * PAGE_SIZE);
+ if (!ptr_2) {
+ vfree(ptr_1);
+ goto leave;
+ }
+
+ *((__u8 *)ptr_1) = 0;
+ *((__u8 *)ptr_2) = 1;
+
+ vfree(ptr_1);
+ vfree(ptr_2);
+ }
+
+ /* Success */
+ rv = 0;
+
+leave:
+ for (i = 0; i < 15000; i++)
+ vfree(ptr[i]);
+
+ vfree(ptr);
+ return rv;
+}
+
+static int full_fit_alloc_test(void)
+{
+ void **ptr, **junk_ptr, *tmp;
+ int junk_length;
+ int rv = -1;
+ int i;
+
+ junk_length = fls(num_online_cpus());
+ junk_length *= (32 * 1024 * 1024 / PAGE_SIZE);
+
+ ptr = vmalloc(sizeof(void *) * junk_length);
+ if (!ptr)
+ return rv;
+
+ junk_ptr = vmalloc(sizeof(void *) * junk_length);
+ if (!junk_ptr) {
+ vfree(ptr);
+ return rv;
+ }
+
+ for (i = 0; i < junk_length; i++) {
+ ptr[i] = vmalloc(1 * PAGE_SIZE);
+ junk_ptr[i] = vmalloc(1 * PAGE_SIZE);
+ }
+
+ for (i = 0; i < junk_length; i++)
+ vfree(junk_ptr[i]);
+
+ for (i = 0; i < test_loop_count; i++) {
+ tmp = vmalloc(1 * PAGE_SIZE);
+
+ if (!tmp)
+ goto error;
+
+ *((__u8 *)tmp) = 1;
+ vfree(tmp);
+ }
+
+ /* Success */
+ rv = 0;
+
+error:
+ for (i = 0; i < junk_length; i++)
+ vfree(ptr[i]);
+
+ vfree(ptr);
+ vfree(junk_ptr);
+
+ return rv;
+}
+
+static int fix_size_alloc_test(void)
+{
+ void *ptr;
+ int i;
+
+ for (i = 0; i < test_loop_count; i++) {
+ ptr = vmalloc(3 * PAGE_SIZE);
+
+ if (!ptr)
+ return -1;
+
+ *((__u8 *)ptr) = 0;
+
+ vfree(ptr);
+ }
+
+ return 0;
+}
+
+static int
+pcpu_alloc_test(void)
+{
+ int rv = 0;
+#ifndef CONFIG_NEED_PER_CPU_KM
+ void __percpu **pcpu;
+ size_t size, align;
+ int i;
+
+ pcpu = vmalloc(sizeof(void __percpu *) * 35000);
+ if (!pcpu)
+ return -1;
+
+ for (i = 0; i < 35000; i++) {
+ unsigned int r;
+
+ get_random_bytes(&r, sizeof(i));
+ size = (r % (PAGE_SIZE / 4)) + 1;
+
+ /*
+ * Maximum PAGE_SIZE
+ */
+ get_random_bytes(&r, sizeof(i));
+ align = 1 << ((i % 11) + 1);
+
+ pcpu[i] = __alloc_percpu(size, align);
+ if (!pcpu[i])
+ rv = -1;
+ }
+
+ for (i = 0; i < 35000; i++)
+ free_percpu(pcpu[i]);
+
+ vfree(pcpu);
+#endif
+ return rv;
+}
+
+struct test_case_desc {
+ const char *test_name;
+ int (*test_func)(void);
+};
+
+static struct test_case_desc test_case_array[] = {
+ { "fix_size_alloc_test", fix_size_alloc_test },
+ { "full_fit_alloc_test", full_fit_alloc_test },
+ { "long_busy_list_alloc_test", long_busy_list_alloc_test },
+ { "random_size_alloc_test", random_size_alloc_test },
+ { "fix_align_alloc_test", fix_align_alloc_test },
+ { "random_size_align_alloc_test", random_size_align_alloc_test },
+ { "align_shift_alloc_test", align_shift_alloc_test },
+ { "pcpu_alloc_test", pcpu_alloc_test },
+ /* Add a new test case here. */
+};
+
+struct test_case_data {
+ int test_failed;
+ int test_passed;
+ u64 time;
+};
+
+/* Split it to get rid of: WARNING: line over 80 characters */
+static struct test_case_data
+ per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)];
+
+static struct test_driver {
+ struct task_struct *task;
+ unsigned long start;
+ unsigned long stop;
+ int cpu;
+} per_cpu_test_driver[NR_CPUS];
+
+static void shuffle_array(int *arr, int n)
+{
+ unsigned int rnd;
+ int i, j, x;
+
+ for (i = n - 1; i > 0; i--) {
+ get_random_bytes(&rnd, sizeof(rnd));
+
+ /* Cut the range. */
+ j = rnd % i;
+
+ /* Swap indexes. */
+ x = arr[i];
+ arr[i] = arr[j];
+ arr[j] = x;
+ }
+}
+
+static int test_func(void *private)
+{
+ struct test_driver *t = private;
+ cpumask_t newmask = CPU_MASK_NONE;
+ int random_array[ARRAY_SIZE(test_case_array)];
+ int index, i, j, ret;
+ ktime_t kt;
+ u64 delta;
+
+ cpumask_set_cpu(t->cpu, &newmask);
+ set_cpus_allowed_ptr(current, &newmask);
+
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++)
+ random_array[i] = i;
+
+ if (!sequential_test_order)
+ shuffle_array(random_array, ARRAY_SIZE(test_case_array));
+
+ /*
+ * Block until initialization is done.
+ */
+ down_read(&prepare_for_test_rwsem);
+
+ t->start = get_cycles();
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++) {
+ index = random_array[i];
+
+ /*
+ * Skip tests if run_test_mask has been specified.
+ */
+ if (!((run_test_mask & (1 << index)) >> index))
+ continue;
+
+ kt = ktime_get();
+ for (j = 0; j < test_repeat_count; j++) {
+ ret = test_case_array[index].test_func();
+ if (!ret)
+ per_cpu_test_data[t->cpu][index].test_passed++;
+ else
+ per_cpu_test_data[t->cpu][index].test_failed++;
+ }
+
+ /*
+ * Take an average time that test took.
+ */
+ delta = (u64) ktime_us_delta(ktime_get(), kt);
+ do_div(delta, (u32) test_repeat_count);
+
+ per_cpu_test_data[t->cpu][index].time = delta;
+ }
+ t->stop = get_cycles();
+
+ up_read(&prepare_for_test_rwsem);
+ test_report_one_done();
+
+ /*
+ * Wait for the kthread_stop() call.
+ */
+ while (!kthread_should_stop())
+ msleep(10);
+
+ return 0;
+}
+
+static void
+init_test_configurtion(void)
+{
+ /*
+ * Reset all data of all CPUs.
+ */
+ memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data));
+
+ if (single_cpu_test)
+ cpumask_set_cpu(cpumask_first(cpu_online_mask),
+ &cpus_run_test_mask);
+ else
+ cpumask_and(&cpus_run_test_mask, cpu_online_mask,
+ cpu_online_mask);
+
+ if (test_repeat_count <= 0)
+ test_repeat_count = 1;
+
+ if (test_loop_count <= 0)
+ test_loop_count = 1;
+}
+
+static void do_concurrent_test(void)
+{
+ int cpu, ret;
+
+ /*
+ * Set some basic configurations plus sanity check.
+ */
+ init_test_configurtion();
+
+ /*
+ * Put on hold all workers.
+ */
+ down_write(&prepare_for_test_rwsem);
+
+ for_each_cpu(cpu, &cpus_run_test_mask) {
+ struct test_driver *t = &per_cpu_test_driver[cpu];
+
+ t->cpu = cpu;
+ t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu);
+
+ if (!IS_ERR(t->task))
+ /* Success. */
+ atomic_inc(&test_n_undone);
+ else
+ pr_err("Failed to start kthread for %d CPU\n", cpu);
+ }
+
+ /*
+ * Now let the workers do their job.
+ */
+ up_write(&prepare_for_test_rwsem);
+
+ /*
+ * Sleep quiet until all workers are done with 1 second
+ * interval. Since the test can take a lot of time we
+ * can run into a stack trace of the hung task. That is
+ * why we go with completion_timeout and HZ value.
+ */
+ do {
+ ret = wait_for_completion_timeout(&test_all_done_comp, HZ);
+ } while (!ret);
+
+ for_each_cpu(cpu, &cpus_run_test_mask) {
+ struct test_driver *t = &per_cpu_test_driver[cpu];
+ int i;
+
+ if (!IS_ERR(t->task))
+ kthread_stop(t->task);
+
+ for (i = 0; i < ARRAY_SIZE(test_case_array); i++) {
+ if (!((run_test_mask & (1 << i)) >> i))
+ continue;
+
+ pr_info(
+ "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n",
+ test_case_array[i].test_name,
+ per_cpu_test_data[cpu][i].test_passed,
+ per_cpu_test_data[cpu][i].test_failed,
+ test_repeat_count, test_loop_count,
+ per_cpu_test_data[cpu][i].time);
+ }
+
+ pr_info("All test took CPU%d=%lu cycles\n",
+ cpu, t->stop - t->start);
+ }
+}
+
+static int vmalloc_test_init(void)
+{
+ do_concurrent_test();
+ return -EAGAIN; /* Fail will directly unload the module */
+}
+
+static void vmalloc_test_exit(void)
+{
+}
+
+module_init(vmalloc_test_init)
+module_exit(vmalloc_test_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Uladzislau Rezki");
+MODULE_DESCRIPTION("vmalloc test module");
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
+config PAGE_OWNER
+ bool "Track page owner"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
+ select DEBUG_FS
+ select STACKTRACE
+ select STACKDEPOT
+ select PAGE_EXTENSION
+ help
+ This keeps track of what call chain is the owner of a page, may
+ help to find bare alloc_page(s) leaks. Even if you include this
+ feature on your build, it is disabled in default. You should pass
+ "page_owner=on" to boot parameter in order to enable it. Eats
+ a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ for user-space helper.
+
+ If unsure, say N.
+
config PAGE_POISONING
bool "Poison pages after freeing"
select PAGE_POISONING_NO_SANITY if HIBERNATION
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
if (ret)
- goto err;
+ goto free_mem;
pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
&base);
return 0;
+free_mem:
+ memblock_free(base, size);
err:
pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
return ret;
unsigned long n;
};
-static struct dentry *cma_debugfs_root;
-
static int cma_debugfs_get(void *data, u64 *val)
{
unsigned long *p = data;
}
DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
-static void cma_debugfs_add_one(struct cma *cma, int idx)
+static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
char name[16];
scnprintf(name, sizeof(name), "cma-%s", cma->name);
- tmp = debugfs_create_dir(name, cma_debugfs_root);
+ tmp = debugfs_create_dir(name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
static int __init cma_debugfs_init(void)
{
+ struct dentry *cma_debugfs_root;
int i;
cma_debugfs_root = debugfs_create_dir("cma", NULL);
- if (!cma_debugfs_root)
- return -ENOMEM;
for (i = 0; i < cma_area_count; i++)
- cma_debugfs_add_one(&cma_areas[i], i);
+ cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root);
return 0;
}
return high_pfn;
}
-static void map_pages(struct list_head *list)
+static void split_map_pages(struct list_head *list)
{
unsigned int i, order, nr_pages;
struct page *page, *next;
return false;
}
+static bool
+__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
+ bool check_target)
+{
+ struct page *page = pfn_to_online_page(pfn);
+ struct page *end_page;
+ unsigned long block_pfn;
+
+ if (!page)
+ return false;
+ if (zone != page_zone(page))
+ return false;
+ if (pageblock_skip_persistent(page))
+ return false;
+
+ /*
+ * If skip is already cleared do no further checking once the
+ * restart points have been set.
+ */
+ if (check_source && check_target && !get_pageblock_skip(page))
+ return true;
+
+ /*
+ * If clearing skip for the target scanner, do not select a
+ * non-movable pageblock as the starting point.
+ */
+ if (!check_source && check_target &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+ return false;
+
+ /*
+ * Only clear the hint if a sample indicates there is either a
+ * free page or an LRU page in the block. One or other condition
+ * is necessary for the block to be a migration source/target.
+ */
+ block_pfn = pageblock_start_pfn(pfn);
+ pfn = max(block_pfn, zone->zone_start_pfn);
+ page = pfn_to_page(pfn);
+ if (zone != page_zone(page))
+ return false;
+ pfn = block_pfn + pageblock_nr_pages;
+ pfn = min(pfn, zone_end_pfn(zone));
+ end_page = pfn_to_page(pfn);
+
+ do {
+ if (pfn_valid_within(pfn)) {
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+ }
+
+ page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ } while (page < end_page);
+
+ return false;
+}
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
*/
static void __reset_isolation_suitable(struct zone *zone)
{
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
- unsigned long pfn;
+ unsigned long migrate_pfn = zone->zone_start_pfn;
+ unsigned long free_pfn = zone_end_pfn(zone);
+ unsigned long reset_migrate = free_pfn;
+ unsigned long reset_free = migrate_pfn;
+ bool source_set = false;
+ bool free_set = false;
+
+ if (!zone->compact_blockskip_flush)
+ return;
zone->compact_blockskip_flush = false;
- /* Walk the zone and mark every pageblock as suitable for isolation */
- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
- struct page *page;
-
+ /*
+ * Walk the zone and update pageblock skip information. Source looks
+ * for PageLRU while target looks for PageBuddy. When the scanner
+ * is found, both PageBuddy and PageLRU are checked as the pageblock
+ * is suitable as both source and target.
+ */
+ for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
+ free_pfn -= pageblock_nr_pages) {
cond_resched();
- page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- if (zone != page_zone(page))
- continue;
- if (pageblock_skip_persistent(page))
- continue;
+ /* Update the migrate PFN */
+ if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
+ migrate_pfn < reset_migrate) {
+ source_set = true;
+ reset_migrate = migrate_pfn;
+ zone->compact_init_migrate_pfn = reset_migrate;
+ zone->compact_cached_migrate_pfn[0] = reset_migrate;
+ zone->compact_cached_migrate_pfn[1] = reset_migrate;
+ }
- clear_pageblock_skip(page);
+ /* Update the free PFN */
+ if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
+ free_pfn > reset_free) {
+ free_set = true;
+ reset_free = free_pfn;
+ zone->compact_init_free_pfn = reset_free;
+ zone->compact_cached_free_pfn = reset_free;
+ }
}
- reset_cached_positions(zone);
+ /* Leave no distance if no suitable block was reset */
+ if (reset_migrate >= reset_free) {
+ zone->compact_cached_migrate_pfn[0] = migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = migrate_pfn;
+ zone->compact_cached_free_pfn = free_pfn;
+ }
}
void reset_isolation_suitable(pg_data_t *pgdat)
}
}
+/*
+ * Sets the pageblock skip bit if it was clear. Note that this is a hint as
+ * locks are not required for read/writers. Returns true if it was already set.
+ */
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ bool skip;
+
+ /* Do no update if skip hint is being ignored */
+ if (cc->ignore_skip_hint)
+ return false;
+
+ if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ return false;
+
+ skip = get_pageblock_skip(page);
+ if (!skip && !cc->no_set_skip_hint)
+ set_pageblock_skip(page);
+
+ return skip;
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+ struct zone *zone = cc->zone;
+
+ pfn = pageblock_end_pfn(pfn);
+
+ /* Set for isolation rather than compaction */
+ if (cc->no_set_skip_hint)
+ return;
+
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+}
+
/*
* If no pages were isolated then mark this pageblock to be skipped in the
* future. The information is later cleared by __reset_isolation_suitable().
*/
static void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
{
struct zone *zone = cc->zone;
- unsigned long pfn;
if (cc->no_set_skip_hint)
return;
if (!page)
return;
- if (nr_isolated)
- return;
-
set_pageblock_skip(page);
- pfn = page_to_pfn(page);
-
/* Update where async and sync compaction should restart */
- if (migrate_scanner) {
- if (pfn > zone->compact_cached_migrate_pfn[0])
- zone->compact_cached_migrate_pfn[0] = pfn;
- if (cc->mode != MIGRATE_ASYNC &&
- pfn > zone->compact_cached_migrate_pfn[1])
- zone->compact_cached_migrate_pfn[1] = pfn;
- } else {
- if (pfn < zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = pfn;
- }
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
}
static inline void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
+{
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+}
+
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
{
+ return false;
}
#endif /* CONFIG_COMPACTION */
/*
* Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. For async compaction, back out if the lock cannot
- * be taken immediately. For sync compaction, spin on the lock if needed.
+ * very heavily contended. For async compaction, trylock and record if the
+ * lock is contended. The lock will still be acquired but compaction will
+ * abort when the current block is finished regardless of success rate.
+ * Sync compaction acquires the lock.
*
- * Returns true if the lock is held
- * Returns false if the lock is not held and compaction should abort
+ * Always returns true which makes it easier to track lock state in callers.
*/
-static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
{
- if (cc->mode == MIGRATE_ASYNC) {
- if (!spin_trylock_irqsave(lock, *flags)) {
- cc->contended = true;
- return false;
- }
- } else {
- spin_lock_irqsave(lock, *flags);
+ /* Track if the lock is contended in async mode */
+ if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
+ if (spin_trylock_irqsave(lock, *flags))
+ return true;
+
+ cc->contended = true;
}
+ spin_lock_irqsave(lock, *flags);
return true;
}
return true;
}
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
- cond_resched();
- }
-
- return false;
-}
-
-/*
- * Aside from avoiding lock contention, compaction also periodically checks
- * need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_unlock_should_abort() does, but
- * is used where no lock is concerned.
- *
- * Returns false when no scheduling was needed, or sync compaction scheduled.
- * Returns true when async compaction should abort.
- */
-static inline bool compact_should_abort(struct compact_control *cc)
-{
- /* async compaction aborts if contended */
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
-
- cond_resched();
- }
+ cond_resched();
return false;
}
unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
+ unsigned int stride,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor, *valid_page = NULL;
+ struct page *cursor;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
unsigned int order;
+ /* Strict mode is for isolation, speed is secondary */
+ if (strict)
+ stride = 1;
+
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. */
- for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
int isolated;
struct page *page = cursor;
if (!pfn_valid_within(blockpfn))
goto isolate_fail;
- if (!valid_page)
- valid_page = page;
-
/*
* For compound pages such as THP and hugetlbfs, we can save
* potentially a lot of iterations if we skip them at once.
* recheck as well.
*/
if (!locked) {
- /*
- * The zone lock must be held to isolate freepages.
- * Unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock and we acquire the lock as late as
- * possible.
- */
- locked = compact_trylock_irqsave(&cc->zone->lock,
+ locked = compact_lock_irqsave(&cc->zone->lock,
&flags, cc);
- if (!locked)
- break;
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
if (strict && blockpfn < end_pfn)
total_isolated = 0;
- /* Update the pageblock-skip if the whole pageblock was scanned */
- if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, false);
-
cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, &freelist, true);
+ block_end_pfn, &freelist, 0, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
}
/* __isolate_free_page() does not map the pages */
- map_pages(&freelist);
+ split_map_pages(&freelist);
if (pfn < end_pfn) {
/* Loop terminated early, cleanup. */
}
/* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(struct zone *zone)
+static bool too_many_isolated(pg_data_t *pgdat)
{
unsigned long active, inactive, isolated;
- inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
- node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
- active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
- node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
- isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
- node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
+ inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_ANON);
+ active = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_ACTIVE_ANON);
+ isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
+ node_page_state(pgdat, NR_ISOLATED_ANON);
return isolated > (inactive + active) / 2;
}
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
- struct zone *zone = cc->zone;
+ pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
+ bool skip_updated = false;
/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
- while (unlikely(too_many_isolated(zone))) {
+ while (unlikely(too_many_isolated(pgdat))) {
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
return 0;
return 0;
}
- if (compact_should_abort(cc))
- return 0;
+ cond_resched();
if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
skip_on_failure = true;
* if contended.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(zone_lru_lock(zone), flags,
- &locked, cc))
+ && compact_unlock_should_abort(&pgdat->lru_lock,
+ flags, &locked, cc))
break;
if (!pfn_valid_within(low_pfn))
page = pfn_to_page(low_pfn);
- if (!valid_page)
+ /*
+ * Check if the pageblock has already been marked skipped.
+ * Only the aligned PFN is checked as the caller isolates
+ * COMPACT_CLUSTER_MAX at a time so the second call must
+ * not falsely conclude that the block should be skipped.
+ */
+ if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+ if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ low_pfn = end_pfn;
+ goto isolate_abort;
+ }
valid_page = page;
+ }
/*
* Skip if free. We read page order here without zone lock
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(zone_lru_lock(zone),
+ spin_unlock_irqrestore(&pgdat->lru_lock,
flags);
locked = false;
}
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
- locked = compact_trylock_irqsave(zone_lru_lock(zone),
+ locked = compact_lock_irqsave(&pgdat->lru_lock,
&flags, cc);
- if (!locked)
- break;
+
+ /* Try get exclusive access under lock */
+ if (!skip_updated) {
+ skip_updated = true;
+ if (test_and_set_skip(cc, page, low_pfn))
+ goto isolate_abort;
+ }
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
}
}
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
nr_isolated++;
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that was isolated and likely be
- * then freed by migration.
+ * Avoid isolating too much unless this block is being
+ * rescanned (e.g. dirty/writeback pages, parallel allocation)
+ * or a lock is contended. For contention, isolate quickly to
+ * potentially remove one source of contention.
*/
- if (!cc->last_migrated_pfn)
- cc->last_migrated_pfn = low_pfn;
-
- /* Avoid isolating too much */
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+ !cc->rescan && !cc->contended) {
++low_pfn;
break;
}
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
locked = false;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- cc->last_migrated_pfn = 0;
nr_isolated = 0;
}
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
+isolate_abort:
if (locked)
- spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
/*
- * Update the pageblock-skip information and cached scanner pfn,
- * if the whole pageblock was scanned without isolating any page.
+ * Updated the cached scanner pfn once the pageblock has been scanned
+ * Pages will either be migrated in which case there is no point
+ * scanning in the near future or migration failed in which case the
+ * failure reason may persist. The block is marked for skipping if
+ * there were no pages isolated in the block or if the block is
+ * rescanned twice in a row.
*/
- if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated, true);
+ if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
+ if (valid_page && !skip_updated)
+ set_pageblock_skip(valid_page);
+ update_cached_migrate(cc, low_pfn);
+ }
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
{
int block_mt;
+ if (pageblock_skip_persistent(page))
+ return false;
+
if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
return true;
return false;
}
+static inline unsigned int
+freelist_scan_limit(struct compact_control *cc)
+{
+ return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+}
+
/*
* Test whether the free scanner has reached the same or lower pageblock than
* the migration scanner, and compaction should thus terminate.
<= (cc->migrate_pfn >> pageblock_order);
}
+/*
+ * Used when scanning for a suitable migration target which scans freelists
+ * in reverse. Reorders the list such as the unscanned pages are scanned
+ * first on the next iteration of the free scanner
+ */
+static void
+move_freelist_head(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_last(freelist, &freepage->lru)) {
+ list_cut_before(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+/*
+ * Similar to move_freelist_head except used by the migration scanner
+ * when scanning forward. It's possible for these list operations to
+ * move against each other if they search the free list exactly in
+ * lockstep.
+ */
+static void
+move_freelist_tail(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+