arch/x86/mm/pgtable_32.c

   1 /*
   2  *  linux/arch/i386/mm/pgtable.c
   3  */
   4
   5 #include <linux/sched.h>
   6 #include <linux/kernel.h>
   7 #include <linux/errno.h>
   8 #include <linux/mm.h>
   9 #include <linux/nmi.h>
  10 #include <linux/swap.h>
  11 #include <linux/smp.h>
  12 #include <linux/highmem.h>
  13 #include <linux/slab.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/spinlock.h>
  16 #include <linux/module.h>
  17 #include <linux/quicklist.h>
  18
  19 #include <asm/system.h>
  20 #include <asm/pgtable.h>
  21 #include <asm/pgalloc.h>
  22 #include <asm/fixmap.h>
  23 #include <asm/e820.h>
  24 #include <asm/tlb.h>
  25 #include <asm/tlbflush.h>
  26
  27 void show_mem(void)
  28 {
  29         int total = 0, reserved = 0;
  30         int shared = 0, cached = 0;
  31         int highmem = 0;
  32         struct page *page;
  33         pg_data_t *pgdat;
  34         unsigned long i;
  35         unsigned long flags;
  36
  37         printk(KERN_INFO "Mem-info:\n");
  38         show_free_areas();
  39         printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
  40         for_each_online_pgdat(pgdat) {
  41                 pgdat_resize_lock(pgdat, &flags);
  42                 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
  43                         if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
  44                                 touch_nmi_watchdog();
  45                         page = pgdat_page_nr(pgdat, i);
  46                         total++;
  47                         if (PageHighMem(page))
  48                                 highmem++;
  49                         if (PageReserved(page))
  50                                 reserved++;
  51                         else if (PageSwapCache(page))
  52                                 cached++;
  53                         else if (page_count(page))
  54                                 shared += page_count(page) - 1;
  55                 }
  56                 pgdat_resize_unlock(pgdat, &flags);
  57         }
  58         printk(KERN_INFO "%d pages of RAM\n", total);
  59         printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
  60         printk(KERN_INFO "%d reserved pages\n", reserved);
  61         printk(KERN_INFO "%d pages shared\n", shared);
  62         printk(KERN_INFO "%d pages swap cached\n", cached);
  63
  64         printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
  65         printk(KERN_INFO "%lu pages writeback\n",
  66                                         global_page_state(NR_WRITEBACK));
  67         printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
  68         printk(KERN_INFO "%lu pages slab\n",
  69                 global_page_state(NR_SLAB_RECLAIMABLE) +
  70                 global_page_state(NR_SLAB_UNRECLAIMABLE));
  71         printk(KERN_INFO "%lu pages pagetables\n",
  72                                         global_page_state(NR_PAGETABLE));
  73 }
  74
  75 /*
  76  * Associate a virtual page frame with a given physical page frame
  77  * and protection flags for that frame.
  78  */
  79 static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
  80 {
  81         pgd_t *pgd;
  82         pud_t *pud;
  83         pmd_t *pmd;
  84         pte_t *pte;
  85
  86         pgd = swapper_pg_dir + pgd_index(vaddr);
  87         if (pgd_none(*pgd)) {
  88                 BUG();
  89                 return;
  90         }
  91         pud = pud_offset(pgd, vaddr);
  92         if (pud_none(*pud)) {
  93                 BUG();
  94                 return;
  95         }
  96         pmd = pmd_offset(pud, vaddr);
  97         if (pmd_none(*pmd)) {
  98                 BUG();
  99                 return;
 100         }
 101         pte = pte_offset_kernel(pmd, vaddr);
 102         if (pgprot_val(flags))
 103                 set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags));
 104         else
 105                 pte_clear(&init_mm, vaddr, pte);
 106
 107         /*
 108          * It's enough to flush this one mapping.
 109          * (PGE mappings get flushed as well)
 110          */
 111         __flush_tlb_one(vaddr);
 112 }
 113
 114 /*
 115  * Associate a large virtual page frame with a given physical page frame
 116  * and protection flags for that frame. pfn is for the base of the page,
 117  * vaddr is what the page gets mapped to - both must be properly aligned.
 118  * The pmd must already be instantiated. Assumes PAE mode.
 119  */
 120 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
 121 {
 122         pgd_t *pgd;
 123         pud_t *pud;
 124         pmd_t *pmd;
 125
 126         if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
 127                 printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
 128                 return; /* BUG(); */
 129         }
 130         if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
 131                 printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
 132                 return; /* BUG(); */
 133         }
 134         pgd = swapper_pg_dir + pgd_index(vaddr);
 135         if (pgd_none(*pgd)) {
 136                 printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
 137                 return; /* BUG(); */
 138         }
 139         pud = pud_offset(pgd, vaddr);
 140         pmd = pmd_offset(pud, vaddr);
 141         set_pmd(pmd, pfn_pmd(pfn, flags));
 142         /*
 143          * It's enough to flush this one mapping.
 144          * (PGE mappings get flushed as well)
 145          */
 146         __flush_tlb_one(vaddr);
 147 }
 148
 149 static int fixmaps;
 150 unsigned long __FIXADDR_TOP = 0xfffff000;
 151 EXPORT_SYMBOL(__FIXADDR_TOP);
 152
 153 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 154 {
 155         unsigned long address = __fix_to_virt(idx);
 156
 157         if (idx >= __end_of_fixed_addresses) {
 158                 BUG();
 159                 return;
 160         }
 161         set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
 162         fixmaps++;
 163 }
 164
 165 /**
 166  * reserve_top_address - reserves a hole in the top of kernel address space
 167  * @reserve - size of hole to reserve
 168  *
 169  * Can be used to relocate the fixmap area and poke a hole in the top
 170  * of kernel address space to make room for a hypervisor.
 171  */
 172 void reserve_top_address(unsigned long reserve)
 173 {
 174         BUG_ON(fixmaps > 0);
 175         printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
 176                (int)-reserve);
 177         __FIXADDR_TOP = -reserve - PAGE_SIZE;
 178         __VMALLOC_RESERVE += reserve;
 179 }
 180
 181 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 182 {
 183         return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
 184 }
 185
 186 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 187 {
 188         struct page *pte;
 189
 190 #ifdef CONFIG_HIGHPTE
 191         pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
 192 #else
 193         pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
 194 #endif
 195         return pte;
 196 }
 197
 198 /*
 199  * List of all pgd's needed for non-PAE so it can invalidate entries
 200  * in both cached and uncached pgd's; not needed for PAE since the
 201  * kernel pmd is shared. If PAE were not to share the pmd a similar
 202  * tactic would be needed. This is essentially codepath-based locking
 203  * against pageattr.c; it is the unique case in which a valid change
 204  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 205  * vmalloc faults work because attached pagetables are never freed.
 206  * -- wli
 207  */
 208 static inline void pgd_list_add(pgd_t *pgd)
 209 {
 210         struct page *page = virt_to_page(pgd);
 211
 212         list_add(&page->lru, &pgd_list);
 213 }
 214
 215 static inline void pgd_list_del(pgd_t *pgd)
 216 {
 217         struct page *page = virt_to_page(pgd);
 218
 219         list_del(&page->lru);
 220 }
 221
 222
 223
 224 #if (PTRS_PER_PMD == 1)
 225 /* Non-PAE pgd constructor */
 226 static void pgd_ctor(void *pgd)
 227 {
 228         unsigned long flags;
 229
 230         /* !PAE, no pagetable sharing */
 231         memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
 232
 233         spin_lock_irqsave(&pgd_lock, flags);
 234
 235         /* must happen under lock */
 236         clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
 237                         swapper_pg_dir + USER_PTRS_PER_PGD,
 238                         KERNEL_PGD_PTRS);
 239         paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
 240                                 __pa(swapper_pg_dir) >> PAGE_SHIFT,
 241                                 USER_PTRS_PER_PGD,
 242                                 KERNEL_PGD_PTRS);
 243         pgd_list_add(pgd);
 244         spin_unlock_irqrestore(&pgd_lock, flags);
 245 }
 246 #else  /* PTRS_PER_PMD > 1 */
 247 /* PAE pgd constructor */
 248 static void pgd_ctor(void *pgd)
 249 {
 250         /* PAE, kernel PMD may be shared */
 251
 252         if (SHARED_KERNEL_PMD) {
 253                 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
 254                                 swapper_pg_dir + USER_PTRS_PER_PGD,
 255                                 KERNEL_PGD_PTRS);
 256         } else {
 257                 unsigned long flags;
 258
 259                 memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
 260                 spin_lock_irqsave(&pgd_lock, flags);
 261                 pgd_list_add(pgd);
 262                 spin_unlock_irqrestore(&pgd_lock, flags);
 263         }
 264 }
 265 #endif  /* PTRS_PER_PMD */
 266
 267 static void pgd_dtor(void *pgd)
 268 {
 269         unsigned long flags; /* can be called from interrupt context */
 270
 271         if (SHARED_KERNEL_PMD)
 272                 return;
 273
 274         spin_lock_irqsave(&pgd_lock, flags);
 275         pgd_list_del(pgd);
 276         spin_unlock_irqrestore(&pgd_lock, flags);
 277 }
 278
 279 #define UNSHARED_PTRS_PER_PGD                           \
 280         (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
 281
 282 #ifdef CONFIG_X86_PAE
 283 /*
 284  * Mop up any pmd pages which may still be attached to the pgd.
 285  * Normally they will be freed by munmap/exit_mmap, but any pmd we
 286  * preallocate which never got a corresponding vma will need to be
 287  * freed manually.
 288  */
 289 static void pgd_mop_up_pmds(pgd_t *pgdp)
 290 {
 291         int i;
 292
 293         for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
 294                 pgd_t pgd = pgdp[i];
 295
 296                 if (pgd_val(pgd) != 0) {
 297                         pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
 298
 299                         pgdp[i] = native_make_pgd(0);
 300
 301                         paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
 302                         pmd_free(pmd);
 303                 }
 304         }
 305 }
 306
 307 /*
 308  * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 309  * updating the top-level pagetable entries to guarantee the
 310  * processor notices the update.  Since this is expensive, and
 311  * all 4 top-level entries are used almost immediately in a
 312  * new process's life, we just pre-populate them here.
 313  *
 314  * Also, if we're in a paravirt environment where the kernel pmd is
 315  * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
 316  * and initialize the kernel pmds here.
 317  */
 318 static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 319 {
 320         pud_t *pud;
 321         unsigned long addr;
 322         int i;
 323
 324         pud = pud_offset(pgd, 0);
 325         for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
 326              i++, pud++, addr += PUD_SIZE) {
 327                 pmd_t *pmd = pmd_alloc_one(mm, addr);
 328
 329                 if (!pmd) {
 330                         pgd_mop_up_pmds(pgd);
 331                         return 0;
 332                 }
 333
 334                 if (i >= USER_PTRS_PER_PGD)
 335                         memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
 336                                sizeof(pmd_t) * PTRS_PER_PMD);
 337
 338                 pud_populate(mm, pud, pmd);
 339         }
 340
 341         return 1;
 342 }
 343 #else  /* !CONFIG_X86_PAE */
 344 /* No need to prepopulate any pagetable entries in non-PAE modes. */
 345 static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
 346 {
 347         return 1;
 348 }
 349
 350 static void pgd_mop_up_pmds(pgd_t *pgd)
 351 {
 352 }
 353 #endif  /* CONFIG_X86_PAE */
 354
 355 pgd_t *pgd_alloc(struct mm_struct *mm)
 356 {
 357         pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
 358
 359         mm->pgd = pgd;          /* so that alloc_pd can use it */
 360
 361         if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
 362                 quicklist_free(0, pgd_dtor, pgd);
 363                 pgd = NULL;
 364         }
 365
 366         return pgd;
 367 }
 368
 369 void pgd_free(pgd_t *pgd)
 370 {
 371         pgd_mop_up_pmds(pgd);
 372         quicklist_free(0, pgd_dtor, pgd);
 373 }
 374
 375 void check_pgt_cache(void)
 376 {
 377         quicklist_trim(0, pgd_dtor, 25, 16);
 378 }
 379
 380 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 381 {
 382         paravirt_release_pt(page_to_pfn(pte));
 383         tlb_remove_page(tlb, pte);
 384 }
 385
 386 #ifdef CONFIG_X86_PAE
 387
 388 void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 389 {
 390         /* This is called just after the pmd has been detached from
 391            the pgd, which requires a full tlb flush to be recognized
 392            by the CPU.  Rather than incurring multiple tlb flushes
 393            while the address space is being pulled down, make the tlb
 394            gathering machinery do a full flush when we're done. */
 395         tlb->fullmm = 1;
 396
 397         paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
 398         tlb_remove_page(tlb, virt_to_page(pmd));
 399 }
 400
 401 #endif