arch/x86/mm/pgtable.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/mm.h>
   3 #include <linux/gfp.h>
   4 #include <linux/hugetlb.h>
   5 #include <asm/pgalloc.h>
   6 #include <asm/pgtable.h>
   7 #include <asm/tlb.h>
   8 #include <asm/fixmap.h>
   9 #include <asm/mtrr.h>
  10
  11 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
  12 phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
  13 EXPORT_SYMBOL(physical_mask);
  14 #endif
  15
  16 #define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
  17
  18 #ifdef CONFIG_HIGHPTE
  19 #define PGALLOC_USER_GFP __GFP_HIGHMEM
  20 #else
  21 #define PGALLOC_USER_GFP 0
  22 #endif
  23
  24 gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
  25
  26 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
  27 {
  28         return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
  29 }
  30
  31 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
  32 {
  33         struct page *pte;
  34
  35         pte = alloc_pages(__userpte_alloc_gfp, 0);
  36         if (!pte)
  37                 return NULL;
  38         if (!pgtable_page_ctor(pte)) {
  39                 __free_page(pte);
  40                 return NULL;
  41         }
  42         return pte;
  43 }
  44
  45 static int __init setup_userpte(char *arg)
  46 {
  47         if (!arg)
  48                 return -EINVAL;
  49
  50         /*
  51          * "userpte=nohigh" disables allocation of user pagetables in
  52          * high memory.
  53          */
  54         if (strcmp(arg, "nohigh") == 0)
  55                 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
  56         else
  57                 return -EINVAL;
  58         return 0;
  59 }
  60 early_param("userpte", setup_userpte);
  61
  62 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
  63 {
  64         pgtable_page_dtor(pte);
  65         paravirt_release_pte(page_to_pfn(pte));
  66         tlb_remove_table(tlb, pte);
  67 }
  68
  69 #if CONFIG_PGTABLE_LEVELS > 2
  70 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
  71 {
  72         struct page *page = virt_to_page(pmd);
  73         paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
  74         /*
  75          * NOTE! For PAE, any changes to the top page-directory-pointer-table
  76          * entries need a full cr3 reload to flush.
  77          */
  78 #ifdef CONFIG_X86_PAE
  79         tlb->need_flush_all = 1;
  80 #endif
  81         pgtable_pmd_page_dtor(page);
  82         tlb_remove_table(tlb, page);
  83 }
  84
  85 #if CONFIG_PGTABLE_LEVELS > 3
  86 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
  87 {
  88         paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
  89         tlb_remove_table(tlb, virt_to_page(pud));
  90 }
  91
  92 #if CONFIG_PGTABLE_LEVELS > 4
  93 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
  94 {
  95         paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
  96         tlb_remove_table(tlb, virt_to_page(p4d));
  97 }
  98 #endif  /* CONFIG_PGTABLE_LEVELS > 4 */
  99 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 100 #endif  /* CONFIG_PGTABLE_LEVELS > 2 */
 101
 102 static inline void pgd_list_add(pgd_t *pgd)
 103 {
 104         struct page *page = virt_to_page(pgd);
 105
 106         list_add(&page->lru, &pgd_list);
 107 }
 108
 109 static inline void pgd_list_del(pgd_t *pgd)
 110 {
 111         struct page *page = virt_to_page(pgd);
 112
 113         list_del(&page->lru);
 114 }
 115
 116 #define UNSHARED_PTRS_PER_PGD                           \
 117         (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 118
 119
 120 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
 121 {
 122         virt_to_page(pgd)->pt_mm = mm;
 123 }
 124
 125 struct mm_struct *pgd_page_get_mm(struct page *page)
 126 {
 127         return page->pt_mm;
 128 }
 129
 130 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 131 {
 132         /* If the pgd points to a shared pagetable level (either the
 133            ptes in non-PAE, or shared PMD in PAE), then just copy the
 134            references from swapper_pg_dir. */
 135         if (CONFIG_PGTABLE_LEVELS == 2 ||
 136             (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
 137             CONFIG_PGTABLE_LEVELS >= 4) {
 138                 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
 139                                 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
 140                                 KERNEL_PGD_PTRS);
 141         }
 142
 143         /* list required to sync kernel mapping updates */
 144         if (!SHARED_KERNEL_PMD) {
 145                 pgd_set_mm(pgd, mm);
 146                 pgd_list_add(pgd);
 147         }
 148 }
 149
 150 static void pgd_dtor(pgd_t *pgd)
 151 {
 152         if (SHARED_KERNEL_PMD)
 153                 return;
 154
 155         spin_lock(&pgd_lock);
 156         pgd_list_del(pgd);
 157         spin_unlock(&pgd_lock);
 158 }
 159
 160 /*
 161  * List of all pgd's needed for non-PAE so it can invalidate entries
 162  * in both cached and uncached pgd's; not needed for PAE since the
 163  * kernel pmd is shared. If PAE were not to share the pmd a similar
 164  * tactic would be needed. This is essentially codepath-based locking
 165  * against pageattr.c; it is the unique case in which a valid change
 166  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 167  * vmalloc faults work because attached pagetables are never freed.
 168  * -- nyc
 169  */
 170
 171 #ifdef CONFIG_X86_PAE
 172 /*
 173  * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 174  * updating the top-level pagetable entries to guarantee the
 175  * processor notices the update.  Since this is expensive, and
 176  * all 4 top-level entries are used almost immediately in a
 177  * new process's life, we just pre-populate them here.
 178  *
 179  * Also, if we're in a paravirt environment where the kernel pmd is
 180  * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
 181  * and initialize the kernel pmds here.
 182  */
 183 #define PREALLOCATED_PMDS       UNSHARED_PTRS_PER_PGD
 184
 185 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 186 {
 187         paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
 188
 189         /* Note: almost everything apart from _PAGE_PRESENT is
 190            reserved at the pmd (PDPT) level. */
 191         set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
 192
 193         /*
 194          * According to Intel App note "TLBs, Paging-Structure Caches,
 195          * and Their Invalidation", April 2007, document 317080-001,
 196          * section 8.1: in PAE mode we explicitly have to flush the
 197          * TLB via cr3 if the top-level pgd is changed...
 198          */
 199         flush_tlb_mm(mm);
 200 }
 201 #else  /* !CONFIG_X86_PAE */
 202
 203 /* No need to prepopulate any pagetable entries in non-PAE modes. */
 204 #define PREALLOCATED_PMDS       0
 205
 206 #endif  /* CONFIG_X86_PAE */
 207
 208 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
 209 {
 210         int i;
 211
 212         for(i = 0; i < PREALLOCATED_PMDS; i++)
 213                 if (pmds[i]) {
 214                         pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 215                         free_page((unsigned long)pmds[i]);
 216                         mm_dec_nr_pmds(mm);
 217                 }
 218 }
 219
 220 static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
 221 {
 222         int i;
 223         bool failed = false;
 224         gfp_t gfp = PGALLOC_GFP;
 225
 226         if (mm == &init_mm)
 227                 gfp &= ~__GFP_ACCOUNT;
 228
 229         for(i = 0; i < PREALLOCATED_PMDS; i++) {
 230                 pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
 231                 if (!pmd)
 232                         failed = true;
 233                 if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
 234                         free_page((unsigned long)pmd);
 235                         pmd = NULL;
 236                         failed = true;
 237                 }
 238                 if (pmd)
 239                         mm_inc_nr_pmds(mm);
 240                 pmds[i] = pmd;
 241         }
 242
 243         if (failed) {
 244                 free_pmds(mm, pmds);
 245                 return -ENOMEM;
 246         }
 247
 248         return 0;
 249 }
 250
 251 /*
 252  * Mop up any pmd pages which may still be attached to the pgd.
 253  * Normally they will be freed by munmap/exit_mmap, but any pmd we
 254  * preallocate which never got a corresponding vma will need to be
 255  * freed manually.
 256  */
 257 static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 258 {
 259         int i;
 260
 261         for(i = 0; i < PREALLOCATED_PMDS; i++) {
 262                 pgd_t pgd = pgdp[i];
 263
 264                 if (pgd_val(pgd) != 0) {
 265                         pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
 266
 267                         pgdp[i] = native_make_pgd(0);
 268
 269                         paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
 270                         pmd_free(mm, pmd);
 271                         mm_dec_nr_pmds(mm);
 272                 }
 273         }
 274 }
 275
 276 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 277 {
 278         p4d_t *p4d;
 279         pud_t *pud;
 280         int i;
 281
 282         if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
 283                 return;
 284
 285         p4d = p4d_offset(pgd, 0);
 286         pud = pud_offset(p4d, 0);
 287
 288         for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
 289                 pmd_t *pmd = pmds[i];
 290
 291                 if (i >= KERNEL_PGD_BOUNDARY)
 292                         memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
 293                                sizeof(pmd_t) * PTRS_PER_PMD);
 294
 295                 pud_populate(mm, pud, pmd);
 296         }
 297 }
 298
 299 /*
 300  * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
 301  * assumes that pgd should be in one page.
 302  *
 303  * But kernel with PAE paging that is not running as a Xen domain
 304  * only needs to allocate 32 bytes for pgd instead of one page.
 305  */
 306 #ifdef CONFIG_X86_PAE
 307
 308 #include <linux/slab.h>
 309
 310 #define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))
 311 #define PGD_ALIGN       32
 312
 313 static struct kmem_cache *pgd_cache;
 314
 315 static int __init pgd_cache_init(void)
 316 {
 317         /*
 318          * When PAE kernel is running as a Xen domain, it does not use
 319          * shared kernel pmd. And this requires a whole page for pgd.
 320          */
 321         if (!SHARED_KERNEL_PMD)
 322                 return 0;
 323
 324         /*
 325          * when PAE kernel is not running as a Xen domain, it uses
 326          * shared kernel pmd. Shared kernel pmd does not require a whole
 327          * page for pgd. We are able to just allocate a 32-byte for pgd.
 328          * During boot time, we create a 32-byte slab for pgd table allocation.
 329          */
 330         pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
 331                                       SLAB_PANIC, NULL);
 332         if (!pgd_cache)
 333                 return -ENOMEM;
 334
 335         return 0;
 336 }
 337 core_initcall(pgd_cache_init);
 338
 339 static inline pgd_t *_pgd_alloc(void)
 340 {
 341         /*
 342          * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
 343          * We allocate one page for pgd.
 344          */
 345         if (!SHARED_KERNEL_PMD)
 346                 return (pgd_t *)__get_free_pages(PGALLOC_GFP,
 347                                                  PGD_ALLOCATION_ORDER);
 348
 349         /*
 350          * Now PAE kernel is not running as a Xen domain. We can allocate
 351          * a 32-byte slab for pgd to save memory space.
 352          */
 353         return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
 354 }
 355
 356 static inline void _pgd_free(pgd_t *pgd)
 357 {
 358         if (!SHARED_KERNEL_PMD)
 359                 free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 360         else
 361                 kmem_cache_free(pgd_cache, pgd);
 362 }
 363 #else
 364
 365 static inline pgd_t *_pgd_alloc(void)
 366 {
 367         return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 368 }
 369
 370 static inline void _pgd_free(pgd_t *pgd)
 371 {
 372         free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 373 }
 374 #endif /* CONFIG_X86_PAE */
 375
 376 pgd_t *pgd_alloc(struct mm_struct *mm)
 377 {
 378         pgd_t *pgd;
 379         pmd_t *pmds[PREALLOCATED_PMDS];
 380
 381         pgd = _pgd_alloc();
 382
 383         if (pgd == NULL)
 384                 goto out;
 385
 386         mm->pgd = pgd;
 387
 388         if (preallocate_pmds(mm, pmds) != 0)
 389                 goto out_free_pgd;
 390
 391         if (paravirt_pgd_alloc(mm) != 0)
 392                 goto out_free_pmds;
 393
 394         /*
 395          * Make sure that pre-populating the pmds is atomic with
 396          * respect to anything walking the pgd_list, so that they
 397          * never see a partially populated pgd.
 398          */
 399         spin_lock(&pgd_lock);
 400
 401         pgd_ctor(mm, pgd);
 402         pgd_prepopulate_pmd(mm, pgd, pmds);
 403
 404         spin_unlock(&pgd_lock);
 405
 406         return pgd;
 407
 408 out_free_pmds:
 409         free_pmds(mm, pmds);
 410 out_free_pgd:
 411         _pgd_free(pgd);
 412 out:
 413         return NULL;
 414 }
 415
 416 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 417 {
 418         pgd_mop_up_pmds(mm, pgd);
 419         pgd_dtor(pgd);
 420         paravirt_pgd_free(mm, pgd);
 421         _pgd_free(pgd);
 422 }
 423
 424 /*
 425  * Used to set accessed or dirty bits in the page table entries
 426  * on other architectures. On x86, the accessed and dirty bits
 427  * are tracked by hardware. However, do_wp_page calls this function
 428  * to also make the pte writeable at the same time the dirty bit is
 429  * set. In that case we do actually need to write the PTE.
 430  */
 431 int ptep_set_access_flags(struct vm_area_struct *vma,
 432                           unsigned long address, pte_t *ptep,
 433                           pte_t entry, int dirty)
 434 {
 435         int changed = !pte_same(*ptep, entry);
 436
 437         if (changed && dirty)
 438                 *ptep = entry;
 439
 440         return changed;
 441 }
 442
 443 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 444 int pmdp_set_access_flags(struct vm_area_struct *vma,
 445                           unsigned long address, pmd_t *pmdp,
 446                           pmd_t entry, int dirty)
 447 {
 448         int changed = !pmd_same(*pmdp, entry);
 449
 450         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 451
 452         if (changed && dirty) {
 453                 *pmdp = entry;
 454                 /*
 455                  * We had a write-protection fault here and changed the pmd
 456                  * to to more permissive. No need to flush the TLB for that,
 457                  * #PF is architecturally guaranteed to do that and in the
 458                  * worst-case we'll generate a spurious fault.
 459                  */
 460         }
 461
 462         return changed;
 463 }
 464
 465 int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 466                           pud_t *pudp, pud_t entry, int dirty)
 467 {
 468         int changed = !pud_same(*pudp, entry);
 469
 470         VM_BUG_ON(address & ~HPAGE_PUD_MASK);
 471
 472         if (changed && dirty) {
 473                 *pudp = entry;
 474                 /*
 475                  * We had a write-protection fault here and changed the pud
 476                  * to to more permissive. No need to flush the TLB for that,
 477                  * #PF is architecturally guaranteed to do that and in the
 478                  * worst-case we'll generate a spurious fault.
 479                  */
 480         }
 481
 482         return changed;
 483 }
 484 #endif
 485
 486 int ptep_test_and_clear_young(struct vm_area_struct *vma,
 487                               unsigned long addr, pte_t *ptep)
 488 {
 489         int ret = 0;
 490
 491         if (pte_young(*ptep))
 492                 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 493                                          (unsigned long *) &ptep->pte);
 494
 495         return ret;
 496 }
 497
 498 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 499 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 500                               unsigned long addr, pmd_t *pmdp)
 501 {
 502         int ret = 0;
 503
 504         if (pmd_young(*pmdp))
 505                 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 506                                          (unsigned long *)pmdp);
 507
 508         return ret;
 509 }
 510 int pudp_test_and_clear_young(struct vm_area_struct *vma,
 511                               unsigned long addr, pud_t *pudp)
 512 {
 513         int ret = 0;
 514
 515         if (pud_young(*pudp))
 516                 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
 517                                          (unsigned long *)pudp);
 518
 519         return ret;
 520 }
 521 #endif
 522
 523 int ptep_clear_flush_young(struct vm_area_struct *vma,
 524                            unsigned long address, pte_t *ptep)
 525 {
 526         /*
 527          * On x86 CPUs, clearing the accessed bit without a TLB flush
 528          * doesn't cause data corruption. [ It could cause incorrect
 529          * page aging and the (mistaken) reclaim of hot pages, but the
 530          * chance of that should be relatively low. ]
 531          *
 532          * So as a performance optimization don't flush the TLB when
 533          * clearing the accessed bit, it will eventually be flushed by
 534          * a context switch or a VM operation anyway. [ In the rare
 535          * event of it not getting flushed for a long time the delay
 536          * shouldn't really matter because there's no real memory
 537          * pressure for swapout to react to. ]
 538          */
 539         return ptep_test_and_clear_young(vma, address, ptep);
 540 }
 541
 542 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 543 int pmdp_clear_flush_young(struct vm_area_struct *vma,
 544                            unsigned long address, pmd_t *pmdp)
 545 {
 546         int young;
 547
 548         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 549
 550         young = pmdp_test_and_clear_young(vma, address, pmdp);
 551         if (young)
 552                 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 553
 554         return young;
 555 }
 556 #endif
 557
 558 /**
 559  * reserve_top_address - reserves a hole in the top of kernel address space
 560  * @reserve - size of hole to reserve
 561  *
 562  * Can be used to relocate the fixmap area and poke a hole in the top
 563  * of kernel address space to make room for a hypervisor.
 564  */
 565 void __init reserve_top_address(unsigned long reserve)
 566 {
 567 #ifdef CONFIG_X86_32
 568         BUG_ON(fixmaps_set > 0);
 569         __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
 570         printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
 571                -reserve, __FIXADDR_TOP + PAGE_SIZE);
 572 #endif
 573 }
 574
 575 int fixmaps_set;
 576
 577 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
 578 {
 579         unsigned long address = __fix_to_virt(idx);
 580
 581         if (idx >= __end_of_fixed_addresses) {
 582                 BUG();
 583                 return;
 584         }
 585         set_pte_vaddr(address, pte);
 586         fixmaps_set++;
 587 }
 588
 589 void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 590                        pgprot_t flags)
 591 {
 592         /* Sanitize 'prot' against any unsupported bits: */
 593         pgprot_val(flags) &= __default_kernel_pte_mask;
 594
 595         __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
 596 }
 597
 598 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 599 #ifdef CONFIG_X86_5LEVEL
 600 /**
 601  * p4d_set_huge - setup kernel P4D mapping
 602  *
 603  * No 512GB pages yet -- always return 0
 604  */
 605 int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 606 {
 607         return 0;
 608 }
 609
 610 /**
 611  * p4d_clear_huge - clear kernel P4D mapping when it is set
 612  *
 613  * No 512GB pages yet -- always return 0
 614  */
 615 int p4d_clear_huge(p4d_t *p4d)
 616 {
 617         return 0;
 618 }
 619 #endif
 620
 621 /**
 622  * pud_set_huge - setup kernel PUD mapping
 623  *
 624  * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 625  * function sets up a huge page only if any of the following conditions are met:
 626  *
 627  * - MTRRs are disabled, or
 628  *
 629  * - MTRRs are enabled and the range is completely covered by a single MTRR, or
 630  *
 631  * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
 632  *   has no effect on the requested PAT memory type.
 633  *
 634  * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 635  * page mapping attempt fails.
 636  *
 637  * Returns 1 on success and 0 on failure.
 638  */
 639 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 640 {
 641         u8 mtrr, uniform;
 642
 643         mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
 644         if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
 645             (mtrr != MTRR_TYPE_WRBACK))
 646                 return 0;
 647
 648         /* Bail out if we are we on a populated non-leaf entry: */
 649         if (pud_present(*pud) && !pud_huge(*pud))
 650                 return 0;
 651
 652         prot = pgprot_4k_2_large(prot);
 653
 654         set_pte((pte_t *)pud, pfn_pte(
 655                 (u64)addr >> PAGE_SHIFT,
 656                 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
 657
 658         return 1;
 659 }
 660
 661 /**
 662  * pmd_set_huge - setup kernel PMD mapping
 663  *
 664  * See text over pud_set_huge() above.
 665  *
 666  * Returns 1 on success and 0 on failure.
 667  */
 668 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 669 {
 670         u8 mtrr, uniform;
 671
 672         mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
 673         if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
 674             (mtrr != MTRR_TYPE_WRBACK)) {
 675                 pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
 676                              __func__, addr, addr + PMD_SIZE);
 677                 return 0;
 678         }
 679
 680         /* Bail out if we are we on a populated non-leaf entry: */
 681         if (pmd_present(*pmd) && !pmd_huge(*pmd))
 682                 return 0;
 683
 684         prot = pgprot_4k_2_large(prot);
 685
 686         set_pte((pte_t *)pmd, pfn_pte(
 687                 (u64)addr >> PAGE_SHIFT,
 688                 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
 689
 690         return 1;
 691 }
 692
 693 /**
 694  * pud_clear_huge - clear kernel PUD mapping when it is set
 695  *
 696  * Returns 1 on success and 0 on failure (no PUD map is found).
 697  */
 698 int pud_clear_huge(pud_t *pud)
 699 {
 700         if (pud_large(*pud)) {
 701                 pud_clear(pud);
 702                 return 1;
 703         }
 704
 705         return 0;
 706 }
 707
 708 /**
 709  * pmd_clear_huge - clear kernel PMD mapping when it is set
 710  *
 711  * Returns 1 on success and 0 on failure (no PMD map is found).
 712  */
 713 int pmd_clear_huge(pmd_t *pmd)
 714 {
 715         if (pmd_large(*pmd)) {
 716                 pmd_clear(pmd);
 717                 return 1;
 718         }
 719
 720         return 0;
 721 }
 722
 723 /**
 724  * pud_free_pmd_page - Clear pud entry and free pmd page.
 725  * @pud: Pointer to a PUD.
 726  *
 727  * Context: The pud range has been unmaped and TLB purged.
 728  * Return: 1 if clearing the entry succeeded. 0 otherwise.
 729  */
 730 int pud_free_pmd_page(pud_t *pud)
 731 {
 732         pmd_t *pmd;
 733         int i;
 734
 735         if (pud_none(*pud))
 736                 return 1;
 737
 738         pmd = (pmd_t *)pud_page_vaddr(*pud);
 739
 740         for (i = 0; i < PTRS_PER_PMD; i++)
 741                 if (!pmd_free_pte_page(&pmd[i]))
 742                         return 0;
 743
 744         pud_clear(pud);
 745         free_page((unsigned long)pmd);
 746
 747         return 1;
 748 }
 749
 750 /**
 751  * pmd_free_pte_page - Clear pmd entry and free pte page.
 752  * @pmd: Pointer to a PMD.
 753  *
 754  * Context: The pmd range has been unmaped and TLB purged.
 755  * Return: 1 if clearing the entry succeeded. 0 otherwise.
 756  */
 757 int pmd_free_pte_page(pmd_t *pmd)
 758 {
 759         pte_t *pte;
 760
 761         if (pmd_none(*pmd))
 762                 return 1;
 763
 764         pte = (pte_t *)pmd_page_vaddr(*pmd);
 765         pmd_clear(pmd);
 766         free_page((unsigned long)pte);
 767
 768         return 1;
 769 }
 770 #endif  /* CONFIG_HAVE_ARCH_HUGE_VMAP */