#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/interrupt.h>
#include <asm/e820.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
struct cpa_data {
unsigned long vaddr;
pgprot_t mask_set;
int flushtlb;
};
-enum {
- CPA_NO_SPLIT = 0,
- CPA_SPLIT,
-};
-
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
/*
* Only flush present addresses:
*/
- if (pte && pte_present(*pte))
+ if (pte && (pte_val(*pte) & _PAGE_PRESENT))
clflush_cache_range((void *) addr, PAGE_SIZE);
}
}
if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext)))
pgprot_val(forbidden) |= _PAGE_NX;
-
-#ifdef CONFIG_DEBUG_RODATA
/* The .rodata section needs to be read-only */
if (within(address, (unsigned long)__start_rodata,
(unsigned long)__end_rodata))
if (within(address, virt_to_highmap(__start_rodata),
virt_to_highmap(__end_rodata)))
pgprot_val(forbidden) |= _PAGE_RW;
-#endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
return prot;
}
-pte_t *lookup_address(unsigned long address, int *level)
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
if (pgd_none(*pgd))
return NULL;
+
pud = pud_offset(pgd, address);
if (pud_none(*pud))
return NULL;
+
+ *level = PG_LEVEL_1G;
+ if (pud_large(*pud) || !pud_present(*pud))
+ return (pte_t *)pud;
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
*level = PG_LEVEL_2M;
- if (pmd_large(*pmd))
+ if (pmd_large(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
*level = PG_LEVEL_4K;
+
return pte_offset_kernel(pmd, address);
}
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
/* change init_mm */
#endif
}
-static int try_preserve_large_page(pte_t *kpte, unsigned long address,
- struct cpa_data *cpa)
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+ struct cpa_data *cpa)
{
- unsigned long nextpage_addr, numpages, pmask, psize, flags;
+ unsigned long nextpage_addr, numpages, pmask, psize, flags, addr;
pte_t new_pte, old_pte, *tmp;
pgprot_t old_prot, new_prot;
- int level, res = CPA_SPLIT;
+ int i, do_split = 1;
+ unsigned int level;
spin_lock_irqsave(&pgd_lock, flags);
/*
switch (level) {
case PG_LEVEL_2M:
- psize = LARGE_PAGE_SIZE;
- pmask = LARGE_PAGE_MASK;
+ psize = PMD_PAGE_SIZE;
+ pmask = PMD_PAGE_MASK;
break;
+#ifdef CONFIG_X86_64
case PG_LEVEL_1G:
+ psize = PMD_PAGE_SIZE;
+ pmask = PMD_PAGE_MASK;
+ break;
+#endif
default:
- res = -EINVAL;
+ do_split = -EINVAL;
goto out_unlock;
}
pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
new_prot = static_protections(new_prot, address);
+ /*
+ * We need to check the full range, whether
+ * static_protection() requires a different pgprot for one of
+ * the pages in the range we try to preserve:
+ */
+ addr = address + PAGE_SIZE;
+ for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE) {
+ pgprot_t chk_prot = static_protections(new_prot, addr);
+
+ if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+ goto out_unlock;
+ }
+
/*
* If there are no changes, return. maxpages has been updated
* above:
*/
if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
- res = CPA_NO_SPLIT;
+ do_split = 0;
goto out_unlock;
}
new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
__set_pmd_pte(kpte, address, new_pte);
cpa->flushtlb = 1;
- res = CPA_NO_SPLIT;
+ do_split = 0;
}
out_unlock:
spin_unlock_irqrestore(&pgd_lock, flags);
- return res;
+
+ return do_split;
+}
+
+static LIST_HEAD(page_pool);
+static unsigned long pool_size, pool_pages, pool_low;
+static unsigned long pool_used, pool_failed, pool_refill;
+
+static void cpa_fill_pool(void)
+{
+ struct page *p;
+ gfp_t gfp = GFP_KERNEL;
+
+ /* Do not allocate from interrupt context */
+ if (in_irq() || irqs_disabled())
+ return;
+ /*
+ * Check unlocked. I does not matter when we have one more
+ * page in the pool. The bit lock avoids recursive pool
+ * allocations:
+ */
+ if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill))
+ return;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * We could do:
+ * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
+ * but this fails on !PREEMPT kernels
+ */
+ gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
+#endif
+
+ while (pool_pages < pool_size) {
+ p = alloc_pages(gfp, 0);
+ if (!p) {
+ pool_failed++;
+ break;
+ }
+ spin_lock_irq(&pgd_lock);
+ list_add(&p->lru, &page_pool);
+ pool_pages++;
+ spin_unlock_irq(&pgd_lock);
+ }
+ clear_bit_unlock(0, &pool_refill);
+}
+
+#define SHIFT_MB (20 - PAGE_SHIFT)
+#define ROUND_MB_GB ((1 << 10) - 1)
+#define SHIFT_MB_GB 10
+#define POOL_PAGES_PER_GB 16
+
+void __init cpa_init(void)
+{
+ struct sysinfo si;
+ unsigned long gb;
+
+ si_meminfo(&si);
+ /*
+ * Calculate the number of pool pages:
+ *
+ * Convert totalram (nr of pages) to MiB and round to the next
+ * GiB. Shift MiB to Gib and multiply the result by
+ * POOL_PAGES_PER_GB:
+ */
+ gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
+ pool_size = POOL_PAGES_PER_GB * gb;
+ pool_low = pool_size;
+
+ cpa_fill_pool();
+ printk(KERN_DEBUG
+ "CPA: page pool initialized %lu of %lu pages preallocated\n",
+ pool_pages, pool_size);
}
static int split_large_page(pte_t *kpte, unsigned long address)
{
- pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
- gfp_t gfp_flags = GFP_KERNEL;
- unsigned long flags, addr, pfn;
+ unsigned long flags, pfn, pfninc = 1;
+ unsigned int i, level;
pte_t *pbase, *tmp;
+ pgprot_t ref_prot;
struct page *base;
- unsigned int i, level;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
- gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
-#endif
- base = alloc_pages(gfp_flags, 0);
- if (!base)
+ /*
+ * Get a page from the pool. The pool list is protected by the
+ * pgd_lock, which we have to take anyway for the split
+ * operation:
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+ if (list_empty(&page_pool)) {
+ spin_unlock_irqrestore(&pgd_lock, flags);
return -ENOMEM;
+ }
+
+ base = list_first_entry(&page_pool, struct page, lru);
+ list_del(&base->lru);
+ pool_pages--;
+
+ if (pool_pages < pool_low)
+ pool_low = pool_pages;
- spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
* up for us already:
*/
tmp = lookup_address(address, &level);
- if (tmp != kpte) {
- WARN_ON_ONCE(1);
+ if (tmp != kpte)
goto out_unlock;
- }
- address = __pa(address);
- addr = address & LARGE_PAGE_MASK;
pbase = (pte_t *)page_address(base);
#ifdef CONFIG_X86_32
paravirt_alloc_pt(&init_mm, page_to_pfn(base));
#endif
+ ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+
+#ifdef CONFIG_X86_64
+ if (level == PG_LEVEL_1G) {
+ pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+ pgprot_val(ref_prot) |= _PAGE_PSE;
+ }
+#endif
/*
* Get the target pfn from the original entry:
*/
pfn = pte_pfn(*kpte);
- for (i = 0; i < PTRS_PER_PTE; i++, pfn++)
+ for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
/*
- * Install the new, split up pagetable. Important detail here:
+ * Install the new, split up pagetable. Important details here:
*
* On Intel the NX bit of all levels must be cleared to make a
* page executable. See section 4.13.2 of Intel 64 and IA-32
* Architectures Software Developer's Manual).
+ *
+ * Mark the entry present. The current mapping might be
+ * set to not present, which we preserved above.
*/
ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+ pgprot_val(ref_prot) |= _PAGE_PRESENT;
__set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
base = NULL;
out_unlock:
+ /*
+ * If we dropped out via the lookup_address check under
+ * pgd_lock then stick the page back into the pool:
+ */
+ if (base) {
+ list_add(&base->lru, &page_pool);
+ pool_pages++;
+ } else
+ pool_used++;
spin_unlock_irqrestore(&pgd_lock, flags);
- if (base)
- __free_pages(base, 0);
-
return 0;
}
static int __change_page_attr(unsigned long address, struct cpa_data *cpa)
{
+ int do_split, err;
+ unsigned int level;
struct page *kpte_page;
- int level, res;
pte_t *kpte;
repeat:
* Check, whether we can keep the large page intact
* and just change the pte:
*/
- res = try_preserve_large_page(kpte, address, cpa);
- if (res < 0)
- return res;
-
+ do_split = try_preserve_large_page(kpte, address, cpa);
/*
* When the range fits into the existing large page,
* return. cp->numpages and cpa->tlbflush have been updated in
* try_large_page:
*/
- if (res == CPA_NO_SPLIT)
- return 0;
+ if (do_split <= 0)
+ return do_split;
/*
* We have to split the large page:
*/
- res = split_large_page(kpte, address);
- if (res)
- return res;
- cpa->flushtlb = 1;
- goto repeat;
+ err = split_large_page(kpte, address);
+ if (!err) {
+ cpa->flushtlb = 1;
+ goto repeat;
+ }
+
+ return err;
}
/**
*
* Modules and drivers should use the set_memory_* APIs instead.
*/
-
static int change_page_attr_addr(struct cpa_data *cpa)
{
int err;
* Check whether we really changed something:
*/
if (!cpa.flushtlb)
- return ret;
+ goto out;
/*
* No need to flush, when we did not set any of the caching
else
cpa_flush_all(cache);
+out:
+ cpa_fill_pool();
return ret;
}
* but that can deadlock->flush only current cpu:
*/
__flush_tlb_all();
+
+ /*
+ * Try to refill the page pool here. We can do this only after
+ * the tlb flush.
+ */
+ cpa_fill_pool();
}
#endif