From: Linus Torvalds
Date: Wed, 27 Oct 2010 01:20:19 +0000 (-0700)
Subject: Merge branches 'upstream/xenfs' and 'upstream/core' of git://git.kernel.org/pub/scm...
X-Git-Tag: v2.6.37-rc1~93
X-Git-Url: http://git.samba.org/samba.git/?p=sfrench%2Fcifs-2.6.git;a=commitdiff_plain;h=520045db940a381d2bee1c1b2179f7921b40fb10

Merge branches 'upstream/xenfs' and 'upstream/core' of git://git./linux/kernel/git/jeremy/xen

* 'upstream/xenfs' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen:
  xen/privcmd: make privcmd visible in domU
  xen/privcmd: move remap_domain_mfn_range() to core xen code and export.
  privcmd: MMAPBATCH: Fix error handling/reporting
  xenbus: export xen_store_interface for xenfs
  xen/privcmd: make sure vma is ours before doing anything to it
  xen/privcmd: print SIGBUS faults
  xen/xenfs: set_page_dirty is supposed to return true if it dirties
  xen/privcmd: create address space to allow writable mmaps
  xen: add privcmd driver
  xen: add variable hypercall caller
  xen: add xen_set_domain_pte()
  xen: add /proc/xen/xsd_{kva,port} to xenfs

* 'upstream/core' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: (29 commits)
  xen: include xen/xen.h for definition of xen_initial_domain()
  xen: use host E820 map for dom0
  xen: correctly rebuild mfn list list after migration.
  xen: improvements to VIRQ_DEBUG output
  xen: set up IRQ before binding virq to evtchn
  xen: ensure that all event channels start off bound to VCPU 0
  xen/hvc: only notify if we actually sent something
  xen: don't add extra_pages for RAM after mem_end
  xen: add support for PAT
  xen: make sure xen_max_p2m_pfn is up to date
  xen: limit extra memory to a certain ratio of base
  xen: add extra pages for E820 RAM regions, even if beyond mem_end
  xen: make sure xen_extra_mem_start is beyond all non-RAM e820
  xen: implement "extra" memory to reserve space for pages not present at boot
  xen: Use host-provided E820 map
  xen: don't map missing memory
  xen: defer building p2m mfn structures until kernel is mapped
  xen: add return value to set_phys_to_machine()
  xen: convert p2m to a 3 level tree
  xen: make install_p2mtop_page() static
  ...

Fix up trivial conflict in arch/x86/xen/mmu.c, and fix the use of
'reserve_early()' - in the new memblock world order it is now
'memblock_x86_reserve_range()' instead.  Pointed out by Jeremy.
---

520045db940a381d2bee1c1b2179f7921b40fb10
diff --cc arch/x86/xen/mmu.c
index f72d18c69221,f08ea045620f,e41683cf290a..9631c90907eb
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@@@ -56,7 -55,6 -55,7 +56,8 @@@@
   #include
   #include
   #include
++#include
++
   #include
   #include
   #include
@@@@ -171,23 -169,23 -171,52 +173,52 @@@@ DEFINE_PER_CPU(unsigned long, xen_curre
    */
   #define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)

++ /*
++  * Xen leaves the responsibility for maintaining p2m mappings to the
++  * guests themselves, but it must also access and update the p2m array
++  * during suspend/resume when all the pages are reallocated.
++  *
++  * The p2m table is logically a flat array, but we implement it as a
++  * three-level tree to allow the address space to be sparse.
++  *
++  *                        Xen
++  *                         |
++  *       p2m_top                p2m_top_mfn
++  *         /   \                  /      \
++  *   p2m_mid  p2m_mid     p2m_mid_mfn  p2m_mid_mfn
++  *     / \      / \           /            /
++  *   p2m p2m p2m p2m       p2m p2m p2m ...
++  *
++  * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
++  *
++  * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
++  * maximum representable pseudo-physical address space is:
++  *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
++  *
++  * P2M_PER_PAGE depends on the architecture, as a mfn is always
++  * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
++  * 512 and 1024 entries respectively.
++  */
+
 - #define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
 - #define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
++ unsigned long xen_max_p2m_pfn __read_mostly;
 - #define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
 - #define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
 - /* Placeholder for holes in the address space */
 - static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
 - 	{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
++ #define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
++ #define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
++ #define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))
 - /* Placeholder for holes in the address space */
 - static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
 - 	{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
 - /* Array of pointers to pages containing p2m entries */
 - static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
 - 	{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
++ #define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
 - /* Array of pointers to pages containing p2m entries */
 - static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
 - 	{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
 - /* Arrays of p2m arrays expressed in mfns used for save/restore */
 - static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
++ /* Placeholders for holes in the address space */
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
 - /* Arrays of p2m arrays expressed in mfns used for save/restore */
 - static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
 - static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
 - 	__page_aligned_bss;
++ static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
++ static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
 - static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
 - 	__page_aligned_bss;
++ RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
++ RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));

   static inline unsigned p2m_top_index(unsigned long pfn)
   {
@@@@ -197,23 -195,23 -231,122 +233,122 @@@@ static inline unsigned p2m_mid_index(un
   static inline unsigned p2m_index(unsigned long pfn)
   {
--	return pfn % P2M_ENTRIES_PER_PAGE;
++	return pfn % P2M_PER_PAGE;
+  }
+
 - /* Build the parallel p2m_top_mfn structures */
++ static void p2m_top_init(unsigned long ***top)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++		top[i] = p2m_mid_missing;
++ }
++
++ static void p2m_top_mfn_init(unsigned long *top)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
++ }
++
++ static void p2m_top_mfn_p_init(unsigned long **top)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++		top[i] = p2m_mid_missing_mfn;
++ }
++
++ static void p2m_mid_init(unsigned long **mid)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_MID_PER_PAGE; i++)
++		mid[i] = p2m_missing;
++ }
++
++ static void p2m_mid_mfn_init(unsigned long *mid)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_MID_PER_PAGE; i++)
++		mid[i] = virt_to_mfn(p2m_missing);
+  }
+
 - /* Build the parallel p2m_top_mfn structures */
++ static void p2m_init(unsigned long *p2m)
++ {
++	unsigned i;
++
++	for (i = 0; i < P2M_MID_PER_PAGE; i++)
++		p2m[i] = INVALID_P2M_ENTRY;
++ }
++
++ /*
++  * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
++  *
++  * This is called both at boot time, and after resuming from suspend:
++  * - At boot time we're called very early, and must use extend_brk()
++  *   to allocate memory.
++  *
++  * - After resume we're called from within stop_machine, but the mfn
++  *   tree should already be completely allocated.
++  */
   void xen_build_mfn_list_list(void)
   {
--	unsigned pfn, idx;
++	unsigned long pfn;

--	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
--		unsigned topidx = p2m_top_index(pfn);
++	/* Pre-initialize p2m_top_mfn to be completely missing */
++	if (p2m_top_mfn == NULL) {
++		p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++		p2m_mid_mfn_init(p2m_mid_missing_mfn);
++
++		p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++		p2m_top_mfn_p_init(p2m_top_mfn_p);

--		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
++		p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++		p2m_top_mfn_init(p2m_top_mfn);
++	} else {
++		/* Reinitialise, mfn's all change after migration */
++		p2m_mid_mfn_init(p2m_mid_missing_mfn);
   	}

--	for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
--		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
--		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
++	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
++		unsigned topidx = p2m_top_index(pfn);
++		unsigned mididx = p2m_mid_index(pfn);
++		unsigned long **mid;
++		unsigned long *mid_mfn_p;
++
++		mid = p2m_top[topidx];
++		mid_mfn_p = p2m_top_mfn_p[topidx];
++
++		/* Don't bother allocating any mfn mid levels if
++		 * they're just missing, just update the stored mfn,
++		 * since all could have changed over a migrate.
++		 */
++		if (mid == p2m_mid_missing) {
++			BUG_ON(mididx);
++			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
++			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
++			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
++			continue;
++		}
++
++		if (mid_mfn_p == p2m_mid_missing_mfn) {
++			/*
++			 * XXX boot-time only!  We should never find
++			 * missing parts of the mfn tree after
++			 * runtime.  extend_brk() will BUG if we call
++			 * it too late.
++			 */
++			mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++			p2m_mid_mfn_init(mid_mfn_p);
++
++			p2m_top_mfn_p[topidx] = mid_mfn_p;
++		}
++
++		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
++		mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
   	}
   }

@@@@ -231,15 -229,15 -364,37 +366,37 @@@@ void __init xen_build_dynamic_phys_to_m
   {
   	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
   	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
--	unsigned pfn;
++	unsigned long pfn;
++
++	xen_max_p2m_pfn = max_pfn;
+
 -	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
++	p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++	p2m_init(p2m_missing);
+
 -	for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
++	p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++	p2m_mid_init(p2m_mid_missing);
++
++	p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
++	p2m_top_init(p2m_top);
++
++	/*
++	 * The domain builder gives us a pre-constructed p2m array in
++	 * mfn_list for all the pages initially given to us, so we just
++	 * need to graft that into our tree structure.
++	 */
++	for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
   		unsigned topidx = p2m_top_index(pfn);
++		unsigned mididx = p2m_mid_index(pfn);

--		p2m_top[topidx] = &mfn_list[pfn];
--	}
++		if (p2m_top[topidx] == p2m_mid_missing) {
++			unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
++			p2m_mid_init(mid);
++
++			p2m_top[topidx] = mid;
++		}

--	xen_build_mfn_list_list();
++		p2m_top[topidx][mididx] = &mfn_list[pfn];
++	}
   }

   unsigned long get_phys_to_machine(unsigned long pfn)
@@@@ -255,36 -253,36 -412,88 +414,88 @@@@
   }
   EXPORT_SYMBOL_GPL(get_phys_to_machine);

-- /* install a new p2m_top page */
-- bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
++ static void *alloc_p2m_page(void)
   {
--	unsigned topidx = p2m_top_index(pfn);
--	unsigned long **pfnp, *mfnp;
--	unsigned i;
++	return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
++ }

--	pfnp = &p2m_top[topidx];
--	mfnp = &p2m_top_mfn[topidx];
++ static void free_p2m_page(void *p)
++ {
++	free_page((unsigned long)p);
++ }

--	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
--		p[i] = INVALID_P2M_ENTRY;
++ /*
++  * Fully allocate the p2m structure for a given pfn.  We need to check
++  * that both the top and mid levels are allocated, and make sure the
++  * parallel mfn tree is kept in sync.  We may race with other cpus, so
++  * the new pages are installed with cmpxchg; if we lose the race then
++  * simply free the page we allocated and use the one that's there.
++  */
++ static bool alloc_p2m(unsigned long pfn)
++ {
++	unsigned topidx, mididx;
++	unsigned long ***top_p, **mid;
++	unsigned long *top_mfn_p, *mid_mfn;

--	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
--		*mfnp = virt_to_mfn(p);
--		return true;
++	topidx = p2m_top_index(pfn);
++	mididx = p2m_mid_index(pfn);
++
++	top_p = &p2m_top[topidx];
++	mid = *top_p;
++
++	if (mid == p2m_mid_missing) {
++		/* Mid level is missing, allocate a new one */
++		mid = alloc_p2m_page();
++		if (!mid)
++			return false;
++
++		p2m_mid_init(mid);
++
++		if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
++			free_p2m_page(mid);
   	}

--	return false;
-- }
++	top_mfn_p = &p2m_top_mfn[topidx];
++	mid_mfn = p2m_top_mfn_p[topidx];

-- static void alloc_p2m(unsigned long pfn)
-- {
--	unsigned long *p;
++	BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
++
++	if (mid_mfn == p2m_mid_missing_mfn) {
++		/* Separately check the mid mfn level */
++		unsigned long missing_mfn;
++		unsigned long mid_mfn_mfn;
+
 -	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 -	BUG_ON(p == NULL);
++		mid_mfn = alloc_p2m_page();
++		if (!mid_mfn)
++			return false;
+
 -	if (!install_p2mtop_page(pfn, p))
 -		free_page((unsigned long)p);
++		p2m_mid_mfn_init(mid_mfn);
++
++		missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
++		mid_mfn_mfn = virt_to_mfn(mid_mfn);
++		if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
++			free_p2m_page(mid_mfn);
++		else
++			p2m_top_mfn_p[topidx] = mid_mfn;
++	}
++
++	if (p2m_top[topidx][mididx] == p2m_missing) {
++		/* p2m leaf page is missing */
++		unsigned long *p2m;
++
++		p2m = alloc_p2m_page();
++		if (!p2m)
++			return false;
+
 -	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 -	BUG_ON(p == NULL);
++		p2m_init(p2m);
++
++		if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
++			free_p2m_page(p2m);
++		else
++			mid_mfn[mididx] = virt_to_mfn(p2m);
++	}
+
 -	if (!install_p2mtop_page(pfn, p))
 -		free_page((unsigned long)p);
++	return true;
   }

   /* Try to install p2m mapping; fail if intermediate bits missing */
diff --cc arch/x86/xen/setup.c
index 9729c903404b,328b00305426,8e2c9f21fa37..105db2501050
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@@@ -34,6 -33,6 -35,39 +36,39 @@@@ extern void xen_sysenter_target(void)
   extern void xen_syscall_target(void);
   extern void xen_syscall32_target(void);

++ /* Amount of extra memory space we add to the e820 ranges */
++ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
++
++ /*
++  * The maximum amount of extra memory compared to the base size.  The
++  * main scaling factor is the size of struct page.  At extreme ratios
++  * of base:extra, all the base memory can be filled with page
++  * structures for the extra memory, leaving no space for anything
++  * else.
++  *
++  * 10x seems like a reasonable balance between scaling flexibility and
++  * leaving a practically usable system.
++  */
++ #define EXTRA_MEM_RATIO		(10)
++
++ static __init void xen_add_extra_mem(unsigned long pages)
++ {
++	u64 size = (u64)pages * PAGE_SIZE;
++	u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
++
++	if (!pages)
++		return;
++
++	e820_add_region(extra_start, size, E820_RAM);
++	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
++
  -	reserve_early(extra_start, extra_start + size, "XEN EXTRA");
+++	memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
++
++	xen_extra_mem_size += size;
++
++	xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
++ }
++
   static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
   					      phys_addr_t end_addr)
   {