From: Linus Torvalds
Date: Wed, 27 Oct 2010 01:20:19 +0000 (-0700)
Subject: Merge branches 'upstream/xenfs' and 'upstream/core' of git://git.kernel.org/pub/scm...
X-Git-Tag: v2.6.37-rc1~93
X-Git-Url: http://git.samba.org/samba.git/?p=sfrench%2Fcifs-2.6.git;a=commitdiff_plain;h=520045db940a381d2bee1c1b2179f7921b40fb10

Merge branches 'upstream/xenfs' and 'upstream/core' of git://git./linux/kernel/git/jeremy/xen

* 'upstream/xenfs' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen:
  xen/privcmd: make privcmd visible in domU
  xen/privcmd: move remap_domain_mfn_range() to core xen code and export.
  privcmd: MMAPBATCH: Fix error handling/reporting
  xenbus: export xen_store_interface for xenfs
  xen/privcmd: make sure vma is ours before doing anything to it
  xen/privcmd: print SIGBUS faults
  xen/xenfs: set_page_dirty is supposed to return true if it dirties
  xen/privcmd: create address space to allow writable mmaps
  xen: add privcmd driver
  xen: add variable hypercall caller
  xen: add xen_set_domain_pte()
  xen: add /proc/xen/xsd_{kva,port} to xenfs

* 'upstream/core' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: (29 commits)
  xen: include xen/xen.h for definition of xen_initial_domain()
  xen: use host E820 map for dom0
  xen: correctly rebuild mfn list list after migration.
  xen: improvements to VIRQ_DEBUG output
  xen: set up IRQ before binding virq to evtchn
  xen: ensure that all event channels start off bound to VCPU 0
  xen/hvc: only notify if we actually sent something
  xen: don't add extra_pages for RAM after mem_end
  xen: add support for PAT
  xen: make sure xen_max_p2m_pfn is up to date
  xen: limit extra memory to a certain ratio of base
  xen: add extra pages for E820 RAM regions, even if beyond mem_end
  xen: make sure xen_extra_mem_start is beyond all non-RAM e820
  xen: implement "extra" memory to reserve space for pages not present at boot
  xen: Use host-provided E820 map
  xen: don't map missing memory
  xen: defer building p2m mfn structures until kernel is mapped
  xen: add return value to set_phys_to_machine()
  xen: convert p2m to a 3 level tree
  xen: make install_p2mtop_page() static
  ...

Fix up trivial conflict in arch/x86/xen/mmu.c, and fix the use of
'reserve_early()' - in the new memblock world order it is now
'memblock_x86_reserve_range()' instead.  Pointed out by Jeremy.
---

520045db940a381d2bee1c1b2179f7921b40fb10
diff --cc arch/x86/xen/mmu.c
index f72d18c69221,f08ea045620f,e41683cf290a..9631c90907eb
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@@@ -56,7 -55,6 -55,7 +56,8 @@@@
   #include
   #include
   #include
++#include
++
   #include
   #include
   #include
@@@@ -171,23 -169,23 -171,52 +173,52 @@@@ DEFINE_PER_CPU(unsigned long, xen_curre
    */
   #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)

++ /*
++  * Xen leaves the responsibility for maintaining p2m mappings to the
++  * guests themselves, but it must also access and update the p2m array
++  * during suspend/resume when all the pages are reallocated.
++  *
++  * The p2m table is logically a flat array, but we implement it as a
++  * three-level tree to allow the address space to be sparse.
++  *
++  *                        Xen
++  *                         |
++  *       p2m_top                p2m_top_mfn
++  *         /   \                  /      \
++  *   p2m_mid  p2m_mid     p2m_mid_mfn  p2m_mid_mfn
++  *     / \      / \           /            /
++  *   p2m p2m p2m p2m       p2m p2m p2m ...
++  *
++  * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
++  *
++  * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
++  * maximum representable pseudo-physical address space is:
++  *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
++  *
++  * P2M_PER_PAGE depends on the architecture, as a mfn is always
++  * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
++  * 512 and 1024 entries respectively.
++  */
+
 - #define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
 - #define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
++ unsigned long xen_max_p2m_pfn __read_mostly;
 - #define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
 - #define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
 - /* Placeholder for holes in the address space */
 - static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
 - 	{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
++ #define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
++ #define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
++ #define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))
 - /* Placeholder for holes in the address space */
 - static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
 - 	{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
 - /* Array of pointers to pages containing p2m entries */
 - static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
 - 	{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
++ #define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
 - /* Array of pointers to pages containing p2m entries */
 - static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
 - 	{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
 - /* Arrays of p2m arrays expressed in mfns used for save/restore */
 - static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
++ /* Placeholders for holes in the address space */
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
 - /* Arrays of p2m arrays expressed in mfns used for save/restore */
 - static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
 - static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
 - 	__page_aligned_bss;
++ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
 - static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
 - 	__page_aligned_bss;
++ RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
++ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));

   static inline unsigned p2m_top_index(unsigned long pfn)
   {
@@@@ -197,23 -195,23 -231,122 +233,122 @@@@ static inline unsigned p2m_mid_index(un
   static inline unsigned p2m_index(unsigned long pfn)
   {
--	return pfn % P2M_ENTRIES_PER_PAGE;
++	return pfn % P2M_PER_PAGE;
+  }
+
 - /* Build the parallel p2m_top_mfn structures */
++ static void p2m_top_init(unsigned long ***top)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++		top[i] = p2m_mid_missing;
++ }
++
++ static void p2m_top_mfn_init(unsigned long *top)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
++ }
++
++ static void p2m_top_mfn_p_init(unsigned long **top)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++		top[i] = p2m_mid_missing_mfn;
++ }
++
++ static void p2m_mid_init(unsigned long **mid)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_MID_PER_PAGE; i++)
++		mid[i] = p2m_missing;
++ }
++
++ static void p2m_mid_mfn_init(unsigned long *mid)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_MID_PER_PAGE; i++)
++		mid[i] = virt_to_mfn(p2m_missing);
+  }
+
 - /* Build the parallel p2m_top_mfn structures */
++ static void p2m_init(unsigned long *p2m)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_MID_PER_PAGE; i++)
++		p2m[i] = INVALID_P2M_ENTRY;
++ }
++
++ /*
++  * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
++  *
++  * This is called both at boot time, and after resuming from suspend:
++  * - At boot time we're called very early, and must use extend_brk()
++  *   to allocate memory.
++  *
++  * - After resume we're called from within stop_machine, but the mfn
++  *   tree should already be completely allocated.
++  */
   void xen_build_mfn_list_list(void)
   {
--	unsigned pfn, idx;
++	unsigned long pfn;

--	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
--		unsigned topidx = p2m_top_index(pfn);
++	/* Pre-initialize p2m_top_mfn to be completely missing */
++	if (p2m_top_mfn == NULL) {
++		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++		p2m_mid_mfn_init(p2m_mid_missing_mfn);
++
++		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++		p2m_top_mfn_p_init(p2m_top_mfn_p);

--		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
++		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++		p2m_top_mfn_init(p2m_top_mfn);
++	} else {
++		/* Reinitialise, mfn's all change after migration */
++		p2m_mid_mfn_init(p2m_mid_missing_mfn);
   	}

--	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
--		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
--		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
++	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
++		unsigned topidx = p2m_top_index(pfn);
++		unsigned mididx = p2m_mid_index(pfn);
++		unsigned long **mid;
++		unsigned long *mid_mfn_p;
++
++		mid = p2m_top[topidx];
++		mid_mfn_p = p2m_top_mfn_p[topidx];
++
++		/* Don't bother allocating any mfn mid levels if
++		 * they're just missing, just update the stored mfn,
++		 * since all could have changed over a migrate.
++		 */
++		if (mid == p2m_mid_missing) {
++			BUG_ON(mididx);
++			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
++			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
++			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
++			continue;
++		}
++
++		if (mid_mfn_p == p2m_mid_missing_mfn) {
++			/*
++			 * XXX boot-time only!  We should never find
++			 * missing parts of the mfn tree after
++			 * runtime.  extend_brk() will BUG if we call
++			 * it too late.
++			 */
++			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++			p2m_mid_mfn_init(mid_mfn_p);
++
++			p2m_top_mfn_p[topidx] = mid_mfn_p;
++		}
++
++		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
++		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
   	}
   }

@@@@ -231,15 -229,15 -364,37 +366,37 @@@@ void __init xen_build_dynamic_phys_to_m
   {
   	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
   	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
--	unsigned pfn;
++	unsigned long pfn;
++
++	xen_max_p2m_pfn = max_pfn;
+
 -	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
++	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++	p2m_init(p2m_missing);
+
 -	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
++	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++	p2m_mid_init(p2m_mid_missing);
++
++	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
++	p2m_top_init(p2m_top);
++
++	/*
++	 * The domain builder gives us a pre-constructed p2m array in
++	 * mfn_list for all the pages initially given to us, so we just
++	 * need to graft that into our tree structure.
++	 */
++	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
   		unsigned topidx = p2m_top_index(pfn);
++		unsigned mididx = p2m_mid_index(pfn);

--		p2m_top[topidx] = &mfn_list[pfn];
--	}
++		if (p2m_top[topidx] == p2m_mid_missing) {
++			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
++			p2m_mid_init(mid);
++
++			p2m_top[topidx] = mid;
++		}

--	xen_build_mfn_list_list();
++		p2m_top[topidx][mididx] = &mfn_list[pfn];
++	}
   }

   unsigned long get_phys_to_machine(unsigned long pfn)
@@@@ -255,36 -253,36 -412,88 +414,88 @@@@
   }
   EXPORT_SYMBOL_GPL(get_phys_to_machine);

-- /* install a new p2m_top page */
-- bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
++ static void *alloc_p2m_page(void)
   {
--	unsigned topidx = p2m_top_index(pfn);
--	unsigned long **pfnp, *mfnp;
--	unsigned i;
++	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
++ }

--	pfnp = &p2m_top[topidx];
--	mfnp = &p2m_top_mfn[topidx];
++ static void free_p2m_page(void *p)
++ {
++	free_page((unsigned long)p);
++ }

--	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
--		p[i] = INVALID_P2M_ENTRY;
++ /*
++  * Fully allocate the p2m structure for a given pfn.  We need to check
++  * that both the top and mid levels are allocated, and make sure the
++  * parallel mfn tree is kept in sync.  We may race with other cpus, so
++  * the new pages are installed with cmpxchg; if we lose the race then
++  * simply free the page we allocated and use the one that's there.
++  */
++ static bool alloc_p2m(unsigned long pfn)
++ {
++	unsigned topidx, mididx;
++	unsigned long ***top_p, **mid;
++	unsigned long *top_mfn_p, *mid_mfn;

--	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
--		*mfnp = virt_to_mfn(p);
--		return true;
++	topidx = p2m_top_index(pfn);
++	mididx = p2m_mid_index(pfn);
++
++	top_p = &p2m_top[topidx];
++	mid = *top_p;
++
++	if (mid == p2m_mid_missing) {
++		/* Mid level is missing, allocate a new one */
++		mid = alloc_p2m_page();
++		if (!mid)
++			return false;
++
++		p2m_mid_init(mid);
++
++		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
++			free_p2m_page(mid);
   	}

--	return false;
-- }
++	top_mfn_p = &p2m_top_mfn[topidx];
++	mid_mfn = p2m_top_mfn_p[topidx];

-- static void alloc_p2m(unsigned long pfn)
-- {
--	unsigned long *p;
++	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
++
++	if (mid_mfn == p2m_mid_missing_mfn) {
++		/* Separately check the mid mfn level */
++		unsigned long missing_mfn;
++		unsigned long mid_mfn_mfn;
+
 -	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 -	BUG_ON(p == NULL);
++		mid_mfn = alloc_p2m_page();
++		if (!mid_mfn)
++			return false;
+
 -	if (!install_p2mtop_page(pfn, p))
 -		free_page((unsigned long)p);
++		p2m_mid_mfn_init(mid_mfn);
++
++		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
++		mid_mfn_mfn = virt_to_mfn(mid_mfn);
++		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
++			free_p2m_page(mid_mfn);
++		else
++			p2m_top_mfn_p[topidx] = mid_mfn;
++	}
++
++	if (p2m_top[topidx][mididx] == p2m_missing) {
++		/* p2m leaf page is missing */
++		unsigned long *p2m;
++
++		p2m = alloc_p2m_page();
++		if (!p2m)
++			return false;
+
 -	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 -	BUG_ON(p == NULL);
++		p2m_init(p2m);
++
++		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
++			free_p2m_page(p2m);
++		else
++			mid_mfn[mididx] = virt_to_mfn(p2m);
++	}
+
 -	if (!install_p2mtop_page(pfn, p))
 -		free_page((unsigned long)p);
++	return true;
   }

   /* Try to install p2m mapping; fail if intermediate bits missing */
diff --cc arch/x86/xen/setup.c
index 9729c903404b,328b00305426,8e2c9f21fa37..105db2501050
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@@@ -34,6 -33,6 -35,39 +36,39 @@@@ extern void xen_sysenter_target(void)
   extern void xen_syscall_target(void);
   extern void xen_syscall32_target(void);

++ /* Amount of extra memory space we add to the e820 ranges */
++ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
++
++ /*
++  * The maximum amount of extra memory compared to the base size.  The
++  * main scaling factor is the size of struct page.  At extreme ratios
++  * of base:extra, all the base memory can be filled with page
++  * structures for the extra memory, leaving no space for anything
++  * else.
++  *
++  * 10x seems like a reasonable balance between scaling flexibility and
++  * leaving a practically usable system.
++  */
++ #define EXTRA_MEM_RATIO		(10)
++
++ static __init void xen_add_extra_mem(unsigned long pages)
++ {
++	u64 size = (u64)pages * PAGE_SIZE;
++	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
++
++	if (!pages)
++		return;
++
++	e820_add_region(extra_start, size, E820_RAM);
++	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
++
  -	reserve_early(extra_start, extra_start + size, "XEN EXTRA");
+++	memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
++
++	xen_extra_mem_size += size;
++
++	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
++ }
++
   static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
   					      phys_addr_t end_addr)
   {