x86 setup: don't recalculate ss:esp unless really necessary
[sfrench/cifs-2.6.git] / arch / x86 / mm / init_64.c
1 /*
2  *  linux/arch/x86_64/mm/init.c
3  *
4  *  Copyright (C) 1995  Linus Torvalds
5  *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
6  *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7  */
8
9 #include <linux/signal.h>
10 #include <linux/sched.h>
11 #include <linux/kernel.h>
12 #include <linux/errno.h>
13 #include <linux/string.h>
14 #include <linux/types.h>
15 #include <linux/ptrace.h>
16 #include <linux/mman.h>
17 #include <linux/mm.h>
18 #include <linux/swap.h>
19 #include <linux/smp.h>
20 #include <linux/init.h>
21 #include <linux/pagemap.h>
22 #include <linux/bootmem.h>
23 #include <linux/proc_fs.h>
24 #include <linux/pci.h>
25 #include <linux/pfn.h>
26 #include <linux/poison.h>
27 #include <linux/dma-mapping.h>
28 #include <linux/module.h>
29 #include <linux/memory_hotplug.h>
30 #include <linux/nmi.h>
31
32 #include <asm/processor.h>
33 #include <asm/system.h>
34 #include <asm/uaccess.h>
35 #include <asm/pgtable.h>
36 #include <asm/pgalloc.h>
37 #include <asm/dma.h>
38 #include <asm/fixmap.h>
39 #include <asm/e820.h>
40 #include <asm/apic.h>
41 #include <asm/tlb.h>
42 #include <asm/mmu_context.h>
43 #include <asm/proto.h>
44 #include <asm/smp.h>
45 #include <asm/sections.h>
46
47 #ifndef Dprintk
48 #define Dprintk(x...)
49 #endif
50
51 const struct dma_mapping_ops* dma_ops;
52 EXPORT_SYMBOL(dma_ops);
53
54 static unsigned long dma_reserve __initdata;
55
56 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
57
58 /*
59  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
60  * physical space so we can cache the place of the first one and move
61  * around without checking the pgd every time.
62  */
63
64 void show_mem(void)
65 {
66         long i, total = 0, reserved = 0;
67         long shared = 0, cached = 0;
68         pg_data_t *pgdat;
69         struct page *page;
70
71         printk(KERN_INFO "Mem-info:\n");
72         show_free_areas();
73         printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
74
75         for_each_online_pgdat(pgdat) {
76                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
77                         /* this loop can take a while with 256 GB and 4k pages
78                            so update the NMI watchdog */
79                         if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
80                                 touch_nmi_watchdog();
81                         }
82                         if (!pfn_valid(pgdat->node_start_pfn + i))
83                                 continue;
84                         page = pfn_to_page(pgdat->node_start_pfn + i);
85                         total++;
86                         if (PageReserved(page))
87                                 reserved++;
88                         else if (PageSwapCache(page))
89                                 cached++;
90                         else if (page_count(page))
91                                 shared += page_count(page) - 1;
92                }
93         }
94         printk(KERN_INFO "%lu pages of RAM\n", total);
95         printk(KERN_INFO "%lu reserved pages\n",reserved);
96         printk(KERN_INFO "%lu pages shared\n",shared);
97         printk(KERN_INFO "%lu pages swap cached\n",cached);
98 }
99
/*
 * Set to 1 by mem_init() once the bootmem pages have been released; the
 * allocation helpers in this file test it to choose between the early boot
 * paths and the regular page allocator.
 */
int after_bootmem;
101
102 static __init void *spp_getpage(void)
103
104         void *ptr;
105         if (after_bootmem)
106                 ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
107         else
108                 ptr = alloc_bootmem_pages(PAGE_SIZE);
109         if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
110                 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
111
112         Dprintk("spp_getpage %p\n", ptr);
113         return ptr;
114
115
116 static __init void set_pte_phys(unsigned long vaddr,
117                          unsigned long phys, pgprot_t prot)
118 {
119         pgd_t *pgd;
120         pud_t *pud;
121         pmd_t *pmd;
122         pte_t *pte, new_pte;
123
124         Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
125
126         pgd = pgd_offset_k(vaddr);
127         if (pgd_none(*pgd)) {
128                 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
129                 return;
130         }
131         pud = pud_offset(pgd, vaddr);
132         if (pud_none(*pud)) {
133                 pmd = (pmd_t *) spp_getpage(); 
134                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
135                 if (pmd != pmd_offset(pud, 0)) {
136                         printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
137                         return;
138                 }
139         }
140         pmd = pmd_offset(pud, vaddr);
141         if (pmd_none(*pmd)) {
142                 pte = (pte_t *) spp_getpage();
143                 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
144                 if (pte != pte_offset_kernel(pmd, 0)) {
145                         printk("PAGETABLE BUG #02!\n");
146                         return;
147                 }
148         }
149         new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
150
151         pte = pte_offset_kernel(pmd, vaddr);
152         if (!pte_none(*pte) &&
153             pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
154                 pte_ERROR(*pte);
155         set_pte(pte, new_pte);
156
157         /*
158          * It's enough to flush this one mapping.
159          * (PGE mappings get flushed as well)
160          */
161         __flush_tlb_one(vaddr);
162 }
163
164 /* NOTE: this is meant to be run only at boot */
165 void __init 
166 __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
167 {
168         unsigned long address = __fix_to_virt(idx);
169
170         if (idx >= __end_of_fixed_addresses) {
171                 printk("Invalid __set_fixmap\n");
172                 return;
173         }
174         set_pte_phys(address, phys, prot);
175 }
176
177 unsigned long __meminitdata table_start, table_end;
178
179 static __meminit void *alloc_low_page(unsigned long *phys)
180
181         unsigned long pfn = table_end++;
182         void *adr;
183
184         if (after_bootmem) {
185                 adr = (void *)get_zeroed_page(GFP_ATOMIC);
186                 *phys = __pa(adr);
187                 return adr;
188         }
189
190         if (pfn >= end_pfn) 
191                 panic("alloc_low_page: ran out of memory"); 
192
193         adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
194         memset(adr, 0, PAGE_SIZE);
195         *phys  = pfn * PAGE_SIZE;
196         return adr;
197 }
198
199 static __meminit void unmap_low_page(void *adr)
200
201
202         if (after_bootmem)
203                 return;
204
205         early_iounmap(adr, PAGE_SIZE);
206
207
208 /* Must run before zap_low_mappings */
209 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
210 {
211         unsigned long vaddr;
212         pmd_t *pmd, *last_pmd;
213         int i, pmds;
214
215         pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
216         vaddr = __START_KERNEL_map;
217         pmd = level2_kernel_pgt;
218         last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
219         for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
220                 for (i = 0; i < pmds; i++) {
221                         if (pmd_present(pmd[i]))
222                                 goto next;
223                 }
224                 vaddr += addr & ~PMD_MASK;
225                 addr &= PMD_MASK;
226                 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
227                         set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
228                 __flush_tlb();
229                 return (void *)vaddr;
230         next:
231                 ;
232         }
233         printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
234         return NULL;
235 }
236
237 /* To avoid virtual aliases later */
238 __meminit void early_iounmap(void *addr, unsigned long size)
239 {
240         unsigned long vaddr;
241         pmd_t *pmd;
242         int i, pmds;
243
244         vaddr = (unsigned long)addr;
245         pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
246         pmd = level2_kernel_pgt + pmd_index(vaddr);
247         for (i = 0; i < pmds; i++)
248                 pmd_clear(pmd + i);
249         __flush_tlb();
250 }
251
252 static void __meminit
253 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
254 {
255         int i = pmd_index(address);
256
257         for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
258                 unsigned long entry;
259                 pmd_t *pmd = pmd_page + pmd_index(address);
260
261                 if (address >= end) {
262                         if (!after_bootmem)
263                                 for (; i < PTRS_PER_PMD; i++, pmd++)
264                                         set_pmd(pmd, __pmd(0));
265                         break;
266                 }
267
268                 if (pmd_val(*pmd))
269                         continue;
270
271                 entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
272                 entry &= __supported_pte_mask;
273                 set_pmd(pmd, __pmd(entry));
274         }
275 }
276
277 static void __meminit
278 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
279 {
280         pmd_t *pmd = pmd_offset(pud,0);
281         spin_lock(&init_mm.page_table_lock);
282         phys_pmd_init(pmd, address, end);
283         spin_unlock(&init_mm.page_table_lock);
284         __flush_tlb_all();
285 }
286
287 static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
288
289         int i = pud_index(addr);
290
291
292         for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
293                 unsigned long pmd_phys;
294                 pud_t *pud = pud_page + pud_index(addr);
295                 pmd_t *pmd;
296
297                 if (addr >= end)
298                         break;
299
300                 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
301                         set_pud(pud, __pud(0)); 
302                         continue;
303                 } 
304
305                 if (pud_val(*pud)) {
306                         phys_pmd_update(pud, addr, end);
307                         continue;
308                 }
309
310                 pmd = alloc_low_page(&pmd_phys);
311                 spin_lock(&init_mm.page_table_lock);
312                 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313                 phys_pmd_init(pmd, addr, end);
314                 spin_unlock(&init_mm.page_table_lock);
315                 unmap_low_page(pmd);
316         }
317         __flush_tlb();
318
319
320 static void __init find_early_table_space(unsigned long end)
321 {
322         unsigned long puds, pmds, tables, start;
323
324         puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
325         pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
326         tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
327                  round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
328
329         /* RED-PEN putting page tables only on node 0 could
330            cause a hotspot and fill up ZONE_DMA. The page tables
331            need roughly 0.5KB per GB. */
332         start = 0x8000;
333         table_start = find_e820_area(start, end, tables);
334         if (table_start == -1UL)
335                 panic("Cannot find space for the kernel page tables");
336
337         table_start >>= PAGE_SHIFT;
338         table_end = table_start;
339
340         early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
341                 end, table_start << PAGE_SHIFT,
342                 (table_start << PAGE_SHIFT) + tables);
343 }
344
345 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
346    This runs before bootmem is initialized and gets pages directly from the 
347    physical memory. To access them they are temporarily mapped. */
348 void __meminit init_memory_mapping(unsigned long start, unsigned long end)
349
350         unsigned long next; 
351
352         Dprintk("init_memory_mapping\n");
353
354         /* 
355          * Find space for the kernel direct mapping tables.
356          * Later we should allocate these tables in the local node of the memory
357          * mapped.  Unfortunately this is done currently before the nodes are 
358          * discovered.
359          */
360         if (!after_bootmem)
361                 find_early_table_space(end);
362
363         start = (unsigned long)__va(start);
364         end = (unsigned long)__va(end);
365
366         for (; start < end; start = next) {
367                 unsigned long pud_phys; 
368                 pgd_t *pgd = pgd_offset_k(start);
369                 pud_t *pud;
370
371                 if (after_bootmem)
372                         pud = pud_offset(pgd, start & PGDIR_MASK);
373                 else
374                         pud = alloc_low_page(&pud_phys);
375
376                 next = start + PGDIR_SIZE;
377                 if (next > end) 
378                         next = end; 
379                 phys_pud_init(pud, __pa(start), __pa(next));
380                 if (!after_bootmem)
381                         set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
382                 unmap_low_page(pud);
383         } 
384
385         if (!after_bootmem)
386                 mmu_cr4_features = read_cr4();
387         __flush_tlb_all();
388 }
389
390 #ifndef CONFIG_NUMA
391 void __init paging_init(void)
392 {
393         unsigned long max_zone_pfns[MAX_NR_ZONES];
394         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
395         max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
396         max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
397         max_zone_pfns[ZONE_NORMAL] = end_pfn;
398
399         memory_present(0, 0, end_pfn);
400         sparse_init();
401         free_area_init_nodes(max_zone_pfns);
402 }
403 #endif
404
405 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
406    from the CPU leading to inconsistent cache lines. address and size
407    must be aligned to 2MB boundaries. 
408    Does nothing when the mapping doesn't exist. */
409 void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
410 {
411         unsigned long end = address + size;
412
413         BUG_ON(address & ~LARGE_PAGE_MASK);
414         BUG_ON(size & ~LARGE_PAGE_MASK); 
415         
416         for (; address < end; address += LARGE_PAGE_SIZE) { 
417                 pgd_t *pgd = pgd_offset_k(address);
418                 pud_t *pud;
419                 pmd_t *pmd;
420                 if (pgd_none(*pgd))
421                         continue;
422                 pud = pud_offset(pgd, address);
423                 if (pud_none(*pud))
424                         continue; 
425                 pmd = pmd_offset(pud, address);
426                 if (!pmd || pmd_none(*pmd))
427                         continue; 
428                 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
429                         /* Could handle this, but it should not happen currently. */
430                         printk(KERN_ERR 
431                "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
432                         pmd_ERROR(*pmd); 
433                 }
434                 set_pmd(pmd, __pmd(0));                 
435         }
436         __flush_tlb_all();
437
438
439 /*
440  * Memory hotplug specific functions
441  */
442 void online_page(struct page *page)
443 {
444         ClearPageReserved(page);
445         init_page_count(page);
446         __free_page(page);
447         totalram_pages++;
448         num_physpages++;
449 }
450
451 #ifdef CONFIG_MEMORY_HOTPLUG
452 /*
453  * Memory is added always to NORMAL zone. This means you will never get
454  * additional DMA/DMA32 memory.
455  */
456 int arch_add_memory(int nid, u64 start, u64 size)
457 {
458         struct pglist_data *pgdat = NODE_DATA(nid);
459         struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
460         unsigned long start_pfn = start >> PAGE_SHIFT;
461         unsigned long nr_pages = size >> PAGE_SHIFT;
462         int ret;
463
464         init_memory_mapping(start, (start + size -1));
465
466         ret = __add_pages(zone, start_pfn, nr_pages);
467         if (ret)
468                 goto error;
469
470         return ret;
471 error:
472         printk("%s: Problem encountered in __add_pages!\n", __func__);
473         return ret;
474 }
475 EXPORT_SYMBOL_GPL(arch_add_memory);
476
477 #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
478 int memory_add_physaddr_to_nid(u64 start)
479 {
480         return 0;
481 }
482 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
483 #endif
484
485 #endif /* CONFIG_MEMORY_HOTPLUG */
486
487 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
488 /*
489  * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
490  * just online the pages.
491  */
492 int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
493 {
494         int err = -EIO;
495         unsigned long pfn;
496         unsigned long total = 0, mem = 0;
497         for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
498                 if (pfn_valid(pfn)) {
499                         online_page(pfn_to_page(pfn));
500                         err = 0;
501                         mem++;
502                 }
503                 total++;
504         }
505         if (!err) {
506                 z->spanned_pages += total;
507                 z->present_pages += mem;
508                 z->zone_pgdat->node_spanned_pages += total;
509                 z->zone_pgdat->node_present_pages += mem;
510         }
511         return err;
512 }
513 #endif
514
515 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
516                          kcore_vsyscall;
517
518 void __init mem_init(void)
519 {
520         long codesize, reservedpages, datasize, initsize;
521
522         pci_iommu_alloc();
523
524         /* clear the zero-page */
525         memset(empty_zero_page, 0, PAGE_SIZE);
526
527         reservedpages = 0;
528
529         /* this will put all low memory onto the freelists */
530 #ifdef CONFIG_NUMA
531         totalram_pages = numa_free_all_bootmem();
532 #else
533         totalram_pages = free_all_bootmem();
534 #endif
535         reservedpages = end_pfn - totalram_pages -
536                                         absent_pages_in_range(0, end_pfn);
537
538         after_bootmem = 1;
539
540         codesize =  (unsigned long) &_etext - (unsigned long) &_text;
541         datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
542         initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
543
544         /* Register memory areas for /proc/kcore */
545         kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
546         kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
547                    VMALLOC_END-VMALLOC_START);
548         kclist_add(&kcore_kernel, &_stext, _end - _stext);
549         kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
550         kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
551                                  VSYSCALL_END - VSYSCALL_START);
552
553         printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
554                 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
555                 end_pfn << (PAGE_SHIFT-10),
556                 codesize >> 10,
557                 reservedpages << (PAGE_SHIFT-10),
558                 datasize >> 10,
559                 initsize >> 10);
560 }
561
562 void free_init_pages(char *what, unsigned long begin, unsigned long end)
563 {
564         unsigned long addr;
565
566         if (begin >= end)
567                 return;
568
569         printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
570         for (addr = begin; addr < end; addr += PAGE_SIZE) {
571                 ClearPageReserved(virt_to_page(addr));
572                 init_page_count(virt_to_page(addr));
573                 memset((void *)(addr & ~(PAGE_SIZE-1)),
574                         POISON_FREE_INITMEM, PAGE_SIZE);
575                 if (addr >= __START_KERNEL_map)
576                         change_page_attr_addr(addr, 1, __pgprot(0));
577                 free_page(addr);
578                 totalram_pages++;
579         }
580         if (addr > __START_KERNEL_map)
581                 global_flush_tlb();
582 }
583
584 void free_initmem(void)
585 {
586         free_init_pages("unused kernel memory",
587                         (unsigned long)(&__init_begin),
588                         (unsigned long)(&__init_end));
589 }
590
591 #ifdef CONFIG_DEBUG_RODATA
592
593 void mark_rodata_ro(void)
594 {
595         unsigned long start = (unsigned long)_stext, end;
596
597 #ifdef CONFIG_HOTPLUG_CPU
598         /* It must still be possible to apply SMP alternatives. */
599         if (num_possible_cpus() > 1)
600                 start = (unsigned long)_etext;
601 #endif
602
603 #ifdef CONFIG_KPROBES
604         start = (unsigned long)__start_rodata;
605 #endif
606         
607         end = (unsigned long)__end_rodata;
608         start = (start + PAGE_SIZE - 1) & PAGE_MASK;
609         end &= PAGE_MASK;
610         if (end <= start)
611                 return;
612
613         change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
614
615         printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
616                (end - start) >> 10);
617
618         /*
619          * change_page_attr_addr() requires a global_flush_tlb() call after it.
620          * We do this after the printk so that if something went wrong in the
621          * change, the printk gets out at least to give a better debug hint
622          * of who is the culprit.
623          */
624         global_flush_tlb();
625 }
626 #endif
627
628 #ifdef CONFIG_BLK_DEV_INITRD
/* Release the virtual range [start, end) via free_init_pages(). */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
633 #endif
634
635 void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
636
637 #ifdef CONFIG_NUMA
638         int nid = phys_to_nid(phys);
639 #endif
640         unsigned long pfn = phys >> PAGE_SHIFT;
641         if (pfn >= end_pfn) {
642                 /* This can happen with kdump kernels when accessing firmware
643                    tables. */
644                 if (pfn < end_pfn_map)
645                         return;
646                 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
647                                 phys, len);
648                 return;
649         }
650
651         /* Should check here against the e820 map to avoid double free */
652 #ifdef CONFIG_NUMA
653         reserve_bootmem_node(NODE_DATA(nid), phys, len);
654 #else                   
655         reserve_bootmem(phys, len);    
656 #endif
657         if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
658                 dma_reserve += len / PAGE_SIZE;
659                 set_dma_reserve(dma_reserve);
660         }
661 }
662
663 int kern_addr_valid(unsigned long addr) 
664
665         unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
666        pgd_t *pgd;
667        pud_t *pud;
668        pmd_t *pmd;
669        pte_t *pte;
670
671         if (above != 0 && above != -1UL)
672                 return 0; 
673         
674         pgd = pgd_offset_k(addr);
675         if (pgd_none(*pgd))
676                 return 0;
677
678         pud = pud_offset(pgd, addr);
679         if (pud_none(*pud))
680                 return 0; 
681
682         pmd = pmd_offset(pud, addr);
683         if (pmd_none(*pmd))
684                 return 0;
685         if (pmd_large(*pmd))
686                 return pfn_valid(pmd_pfn(*pmd));
687
688         pte = pte_offset_kernel(pmd, addr);
689         if (pte_none(*pte))
690                 return 0;
691         return pfn_valid(pte_pfn(*pte));
692 }
693
694 /* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
695    covers the 64bit vsyscall page now. 32bit has a real VMA now and does
696    not need special handling anymore. */
697
698 static struct vm_area_struct gate_vma = {
699         .vm_start = VSYSCALL_START,
700         .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
701         .vm_page_prot = PAGE_READONLY_EXEC,
702         .vm_flags = VM_READ | VM_EXEC
703 };
704
705 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
706 {
707 #ifdef CONFIG_IA32_EMULATION
708         if (test_tsk_thread_flag(tsk, TIF_IA32))
709                 return NULL;
710 #endif
711         return &gate_vma;
712 }
713
714 int in_gate_area(struct task_struct *task, unsigned long addr)
715 {
716         struct vm_area_struct *vma = get_gate_vma(task);
717         if (!vma)
718                 return 0;
719         return (addr >= vma->vm_start) && (addr < vma->vm_end);
720 }
721
722 /* Use this when you have no reliable task/vma, typically from interrupt
723  * context.  It is less reliable than using the task's vma and may give
724  * false positives.
725  */
726 int in_gate_area_no_task(unsigned long addr)
727 {
728         return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
729 }
730
731 const char *arch_vma_name(struct vm_area_struct *vma)
732 {
733         if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
734                 return "[vdso]";
735         if (vma == &gate_vma)
736                 return "[vsyscall]";
737         return NULL;
738 }
739
740 #ifdef CONFIG_SPARSEMEM_VMEMMAP
741 /*
742  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
743  */
744 int __meminit vmemmap_populate(struct page *start_page,
745                                                 unsigned long size, int node)
746 {
747         unsigned long addr = (unsigned long)start_page;
748         unsigned long end = (unsigned long)(start_page + size);
749         unsigned long next;
750         pgd_t *pgd;
751         pud_t *pud;
752         pmd_t *pmd;
753
754         for (; addr < end; addr = next) {
755                 next = pmd_addr_end(addr, end);
756
757                 pgd = vmemmap_pgd_populate(addr, node);
758                 if (!pgd)
759                         return -ENOMEM;
760                 pud = vmemmap_pud_populate(pgd, addr, node);
761                 if (!pud)
762                         return -ENOMEM;
763
764                 pmd = pmd_offset(pud, addr);
765                 if (pmd_none(*pmd)) {
766                         pte_t entry;
767                         void *p = vmemmap_alloc_block(PMD_SIZE, node);
768                         if (!p)
769                                 return -ENOMEM;
770
771                         entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
772                         mk_pte_huge(entry);
773                         set_pmd(pmd, __pmd(pte_val(entry)));
774
775                         printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
776                                 addr, addr + PMD_SIZE - 1, p, node);
777                 } else
778                         vmemmap_verify((pte_t *)pmd, node, addr, next);
779         }
780
781         return 0;
782 }
783 #endif