Merge branch 'x86/mpparse' into x86/devel
[sfrench/cifs-2.6.git] / arch / x86 / mm / discontig_32.c
index 8b4eac0ca07d8265314e0b2551daef03933f6ae2..a2f73ba42b8bcbe6bf1eddc18e2052644d2b0882 100644 (file)
@@ -38,6 +38,7 @@
 #include <asm/setup.h>
 #include <asm/mmzone.h>
 #include <asm/bios_ebda.h>
+#include <asm/proto.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 /*
  * 4) physnode_map     - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id.  so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id.  so,
  * if the first gig is on node 0, and the second gig is on node 1
  * physnode_map will contain:
  *
- *     physnode_map[0-3] = 0;
- *     physnode_map[4-7] = 1;
- *     physnode_map[8- ] = -1;
+ *     physnode_map[0-15] = 0;
+ *     physnode_map[16-31] = 1;
+ *     physnode_map[32- ] = -1;
  */
 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
@@ -81,9 +82,9 @@ void memory_present(int nid, unsigned long start, unsigned long end)
        printk(KERN_DEBUG "  ");
        for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
                physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-               printk("%ld ", pfn);
+               printk(KERN_CONT "%ld ", pfn);
        }
-       printk("\n");
+       printk(KERN_CONT "\n");
 }
 
 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -119,11 +119,11 @@ int __init get_memcfg_numa_flat(void)
 {
        printk("NUMA - single node, flat memory mode\n");
 
-       /* Run the memory configuration and find the top of memory. */
-       propagate_e820_map();
        node_start_pfn[0] = 0;
        node_end_pfn[0] = max_pfn;
+       e820_register_active_regions(0, 0, max_pfn);
        memory_present(0, 0, max_pfn);
+       node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
         /* Indicate there is one node available. */
        nodes_clear(node_online_map);
@@ -159,9 +159,17 @@ static void __init allocate_pgdat(int nid)
        if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
                NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
        else {
-               NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-               min_low_pfn += PFN_UP(sizeof(pg_data_t));
+               unsigned long pgdat_phys;
+               pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+                                (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT,
+                                sizeof(pg_data_t),
+                                PAGE_SIZE);
+               NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+               reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
+                             "NODE_DATA");
        }
+       printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+               nid, (unsigned long)NODE_DATA(nid));
 }
 
 /*
@@ -199,8 +207,12 @@ void __init remap_numa_kva(void)
        int node;
 
        for_each_online_node(node) {
+               printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
                for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
                        vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+                       printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+                               (unsigned long)vaddr,
+                               node_remap_start_pfn[node] + pfn);
                        set_pmd_pfn((ulong) vaddr, 
                                node_remap_start_pfn[node] + pfn, 
                                PAGE_KERNEL_LARGE);
@@ -212,17 +224,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
        int nid;
        unsigned long size, reserve_pages = 0;
-       unsigned long pfn;
 
        for_each_online_node(nid) {
-               unsigned old_end_pfn = node_end_pfn[nid];
+               u64 node_kva_target;
+               u64 node_kva_final;
 
                /*
                 * The acpi/srat node info can show hot-add memroy zones
                 * where memory could be added but not currently present.
                 */
+               printk("node %d pfn: [%lx - %lx]\n",
+                       nid, node_start_pfn[nid], node_end_pfn[nid]);
                if (node_start_pfn[nid] > max_pfn)
                        continue;
+               if (!node_end_pfn[nid])
+                       continue;
                if (node_end_pfn[nid] > max_pfn)
                        node_end_pfn[nid] = max_pfn;
 
@@ -234,39 +250,45 @@ static unsigned long calculate_numa_remap_pages(void)
                /* now the roundup is correct, convert to PAGE_SIZE pages */
                size = size * PTRS_PER_PTE;
 
-               /*
-                * Validate the region we are allocating only contains valid
-                * pages.
-                */
-               for (pfn = node_end_pfn[nid] - size;
-                    pfn < node_end_pfn[nid]; pfn++)
-                       if (!page_is_ram(pfn))
-                               break;
+               node_kva_target = round_down(node_end_pfn[nid] - size,
+                                                PTRS_PER_PTE);
+               node_kva_target <<= PAGE_SHIFT;
+               do {
+                       node_kva_final = find_e820_area(node_kva_target,
+                                       ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+                                               ((u64)size)<<PAGE_SHIFT,
+                                               LARGE_PAGE_BYTES);
+                       node_kva_target -= LARGE_PAGE_BYTES;
+               } while (node_kva_final == -1ULL &&
+                        (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+               if (node_kva_final == -1ULL)
+                       panic("Can not get kva ram\n");
 
-               if (pfn != node_end_pfn[nid])
-                       size = 0;
-
-               printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-                               size, nid);
                node_remap_size[nid] = size;
                node_remap_offset[nid] = reserve_pages;
                reserve_pages += size;
-               printk("Shrinking node %d from %ld pages to %ld pages\n",
-                       nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
-               if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-                       /*
-                        * Align node_end_pfn[] and node_remap_start_pfn[] to
-                        * pmd boundary. remap_numa_kva will barf otherwise.
-                        */
-                       printk("Shrinking node %d further by %ld pages for proper alignment\n",
-                               nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-                       size +=  node_end_pfn[nid] & (PTRS_PER_PTE-1);
-               }
+               printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n",
+                               size, nid, node_kva_final>>PAGE_SHIFT);
+
+               /*
+                *  Prevent the KVA area from falling below max_low_pfn; we
+                *  want it kept there on systems with less memory later.
+                *  The layout will be: KVA address, then KVA RAM.
+                *
+                *  We are supposed to record only the range below max_low_pfn,
+                *  but there may be holes in high memory, and the code only
+                *  checks page_is_ram(pfn) && !page_is_reserved_early(pfn) to
+                *  decide whether a page is free.
+                *  So reserve_early here, and hope we don't run out of that array.
+                */
+               reserve_early(node_kva_final,
+                             node_kva_final+(((u64)size)<<PAGE_SHIFT),
+                             "KVA RAM");
 
-               node_end_pfn[nid] -= size;
-               node_remap_start_pfn[nid] = node_end_pfn[nid];
-               shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+               node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+               remove_active_range(nid, node_remap_start_pfn[nid],
+                                        node_remap_start_pfn[nid] + size);
        }
        printk("Reserving total of %ld pages for numa KVA remap\n",
                        reserve_pages);
@@ -284,8 +306,7 @@ static void init_remap_allocator(int nid)
 
        printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
                (ulong) node_remap_start_vaddr[nid],
-               (ulong) pfn_to_kaddr(highstart_pfn
-                  + node_remap_offset[nid] + node_remap_size[nid]));
+               (ulong) node_remap_end_vaddr[nid]);
 }
 
 extern void setup_bootmem_allocator(void);
@@ -293,7 +314,7 @@ unsigned long __init setup_memory(void)
 {
        int nid;
        unsigned long system_start_pfn, system_max_low_pfn;
-       unsigned long wasted_pages;
+       long kva_target_pfn;
 
        /*
         * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -302,34 +323,38 @@ unsigned long __init setup_memory(void)
         * this space and use it to adjust the boundary between ZONE_NORMAL
         * and ZONE_HIGHMEM.
         */
+
+       /* call find_max_low_pfn at first, it could update max_pfn */
+       system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+
+       remove_all_active_ranges();
        get_memcfg_numa();
 
-       kva_pages = calculate_numa_remap_pages();
+       kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
        /* partially used pages are not usable - thus round upwards */
        system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-       kva_start_pfn = find_max_low_pfn() - kva_pages;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-       /* Numa kva area is below the initrd */
-       if (initrd_start)
-               kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
-                       - kva_pages;
-#endif
+       kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+       do {
+               kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+                                       max_low_pfn<<PAGE_SHIFT,
+                                       kva_pages<<PAGE_SHIFT,
+                                       PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+               kva_target_pfn -= PTRS_PER_PTE;
+       } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
 
-       /*
-        * We waste pages past at the end of the KVA for no good reason other
-        * than how it is located. This is bad.
-        */
-       wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
-       kva_start_pfn -= wasted_pages;
-       kva_pages += wasted_pages;
+       if (kva_start_pfn == -1UL)
+               panic("Can not get kva space\n");
 
-       system_max_low_pfn = max_low_pfn = find_max_low_pfn();
        printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
                kva_start_pfn, max_low_pfn);
        printk("max_pfn = %ld\n", max_pfn);
+
+       /* avoid clash with initrd */
+       reserve_early(kva_start_pfn<<PAGE_SHIFT,
+                     (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+                    "KVA PG");
 #ifdef CONFIG_HIGHMEM
        highstart_pfn = highend_pfn = max_pfn;
        if (max_pfn > system_max_low_pfn)
@@ -365,16 +390,8 @@ unsigned long __init setup_memory(void)
        return max_low_pfn;
 }
 
-void __init numa_kva_reserve(void)
-{
-       if (kva_pages)
-               reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-                               BOOTMEM_DEFAULT);
-}
-
 void __init zone_sizes_init(void)
 {
-       int nid;
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] =
@@ -384,27 +401,18 @@ void __init zone_sizes_init(void)
        max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
 #endif
 
-       /* If SRAT has not registered memory, register it now */
-       if (find_max_pfn_with_active_regions() == 0) {
-               for_each_online_node(nid) {
-                       if (node_has_online_mem(nid))
-                               add_active_range(nid, node_start_pfn[nid],
-                                                       node_end_pfn[nid]);
-               }
-       }
-
        free_area_init_nodes(max_zone_pfns);
        return;
 }
 
-void __init set_highmem_pages_init(int bad_ppro) 
+void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
        struct zone *zone;
-       struct page *page;
+       int nid;
 
        for_each_zone(zone) {
-               unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+               unsigned long zone_start_pfn, zone_end_pfn;
 
                if (!is_highmem(zone))
                        continue;
@@ -412,16 +420,12 @@ void __init set_highmem_pages_init(int bad_ppro)
                zone_start_pfn = zone->zone_start_pfn;
                zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
+               nid = zone_to_nid(zone);
                printk("Initializing %s for node %d (%08lx:%08lx)\n",
-                               zone->name, zone_to_nid(zone),
-                               zone_start_pfn, zone_end_pfn);
-
-               for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-                       if (!pfn_valid(node_pfn))
-                               continue;
-                       page = pfn_to_page(node_pfn);
-                       add_one_highpage_init(page, node_pfn, bad_ppro);
-               }
+                               zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+               add_highpages_with_active_regions(nid, zone_start_pfn,
+                                zone_end_pfn);
        }
        totalram_pages += totalhigh_pages;
 #endif