mm/memory.c: fix mem_cgroup_oom_disable() call missing
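
The fix named in the subject line is the reordering visible in the handle_mm_fault() hunks near the end of this diff: arch_vma_access_permitted() used to be checked after mem_cgroup_oom_enable(), so an early VM_FAULT_SIGSEGV return skipped the matching mem_cgroup_oom_disable() and left the task's memcg OOM state armed. A simplified sketch of the corrected flow (abridged; the hugetlb branch, mem_cgroup_oom_synchronize(), and the disable half live further down in handle_mm_fault() and are not shown in this diff):

int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                    unsigned int flags)
{
        int ret;

        /* Bail out before the memcg OOM state is armed ... */
        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                       flags & FAULT_FLAG_INSTRUCTION,
                                       flags & FAULT_FLAG_REMOTE))
                return VM_FAULT_SIGSEGV;

        /* ... so that this enable is always balanced by the disable below. */
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_oom_enable();

        ret = __handle_mm_fault(vma, address, flags);

        if (flags & FAULT_FLAG_USER)
                mem_cgroup_oom_disable();

        return ret;
}
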
diff --git a/mm/memory.c b/mm/memory.c
index fe2fba27ded2fab229d0ef7a4908551343d31b89..0bbc1d612a632f5f9077e389f0efa58d5e23bb56 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/memremap.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/export.h>
@@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-                               pte_t pte)
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                            pte_t pte, bool with_public_device)
 {
        unsigned long pfn = pte_pfn(pte);
 
@@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                        return vma->vm_ops->find_special_page(vma, addr);
                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                        return NULL;
-               if (!is_zero_pfn(pfn))
-                       print_bad_pte(vma, addr, pte, NULL);
+               if (is_zero_pfn(pfn))
+                       return NULL;
+
+               /*
+                * Device public pages are special pages (they are ZONE_DEVICE
+                * pages but different from persistent memory). They behave
+                * almost like normal pages. The difference is that they are
+                * not on the lru and thus should never be involved with
+                * anything that involves lru manipulation (mlock, numa
+                * balancing, ...).
+                *
+                * This is why we still want to return NULL for such pages
+                * from vm_normal_page() so that we do not have to special
+                * case all call sites of vm_normal_page().
+                */
+               if (likely(pfn < highest_memmap_pfn)) {
+                       struct page *page = pfn_to_page(pfn);
+
+                       if (is_device_public_page(page)) {
+                               if (with_public_device)
+                                       return page;
+                               return NULL;
+                       }
+               }
+               print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }
 
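
The rename leaves existing callers untouched: it is presumably paired with a header-side wrapper along the following lines (the include/linux/mm.h counterpart is not part of this hunk, so treat it as an assumption):

struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte, bool with_public_device);
#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)

Only call sites that explicitly want device public pages back, such as the zap_pte_range() hunk further down that passes true, use the underscore variant directly.
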
@@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                        pte = pte_swp_mksoft_dirty(pte);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
+               } else if (is_device_private_entry(entry)) {
+                       page = device_private_entry_to_page(entry);
+
+                       /*
+                        * Update rss count even for unaddressable pages, as
+                        * they should be treated just like normal pages in this
+                        * respect.
+                        *
+                        * We will likely want to have some new rss counters
+                        * for unaddressable pages, at some point. But for now
+                        * keep things as they are.
+                        */
+                       get_page(page);
+                       rss[mm_counter(page)]++;
+                       page_dup_rmap(page, false);
+
+                       /*
+                        * We do not preserve soft-dirty information, because so
+                        * far, checkpoint/restore is the only feature that
+                        * requires that. And checkpoint/restore does not work
+                        * when a device driver is involved (you cannot easily
+                        * save and restore device driver state).
+                        */
+                       if (is_write_device_private_entry(entry) &&
+                           is_cow_mapping(vm_flags)) {
+                               make_device_private_entry_read(&entry);
+                               pte = swp_entry_to_pte(entry);
+                               set_pte_at(src_mm, addr, src_pte, pte);
+                       }
                }
                goto out_set_pte;
        }
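
To make the copy-on-write rule above concrete, here is a small user-space toy (struct toy_entry and its fields are invented for illustration). It only models the ordering that make_device_private_entry_read() followed by set_pte_at() on the source implements: downgrade the source entry first, then copy the downgraded form, so both parent and child end up read-only and the next write faults into the driver.

#include <stdbool.h>
#include <stdio.h>

struct toy_entry {              /* stand-in for a device-private swap entry */
        unsigned long pfn;
        bool write;
};

/* Fork-time rule: downgrade the source first, then copy the downgraded form. */
static struct toy_entry copy_entry(struct toy_entry *src, bool cow_mapping)
{
        if (src->write && cow_mapping)
                src->write = false;
        return *src;
}

int main(void)
{
        struct toy_entry parent = { .pfn = 42, .write = true };
        struct toy_entry child = copy_entry(&parent, true);

        /* Both sides are read-only now, so the next write traps into the
         * device driver and forces a copy or migration, like ordinary COW. */
        printf("parent write=%d, child write=%d\n", parent.write, child.write);
        return 0;
}
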
@@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                get_page(page);
                page_dup_rmap(page, false);
                rss[mm_counter(page)]++;
+       } else if (pte_devmap(pte)) {
+               page = pte_page(pte);
+
+               /*
+                * Cache coherent device memory behaves like a regular page
+                * and not like a persistent memory page. For more information
+                * see MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
+                */
+               if (is_device_public_page(page)) {
+                       get_page(page);
+                       page_dup_rmap(page, false);
+                       rss[mm_counter(page)]++;
+               }
        }
 
 out_set_pte:
@@ -1065,7 +1131,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
+               if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
+                       || pmd_devmap(*src_pmd)) {
                        int err;
                        VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
                        err = copy_huge_pmd(dst_mm, src_mm,
@@ -1236,7 +1303,7 @@ again:
                if (pte_present(ptent)) {
                        struct page *page;
 
-                       page = vm_normal_page(vma, addr, ptent);
+                       page = _vm_normal_page(vma, addr, ptent, true);
                        if (unlikely(details) && page) {
                                /*
                                 * unmap_shared_mapping_pages() wants to
@@ -1273,6 +1340,29 @@ again:
                        }
                        continue;
                }
+
+               entry = pte_to_swp_entry(ptent);
+               if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+                       struct page *page = device_private_entry_to_page(entry);
+
+                       if (unlikely(details && details->check_mapping)) {
+                               /*
+                                * unmap_shared_mapping_pages() wants to
+                                * invalidate cache without truncating:
+                                * unmap shared but keep private pages.
+                                */
+                               if (details->check_mapping !=
+                                   page_rmapping(page))
+                                       continue;
+                       }
+
+                       pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                       rss[mm_counter(page)]--;
+                       page_remove_rmap(page, false);
+                       put_page(page);
+                       continue;
+               }
+
                /* If details->check_mapping, we leave swap entries. */
                if (unlikely(details))
                        continue;
@@ -1326,7 +1416,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+               if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
                                    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
@@ -1513,8 +1603,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
        tlb_gather_mmu(&tlb, mm, start, end);
        update_hiwater_rss(mm);
        mmu_notifier_invalidate_range_start(mm, start, end);
-       for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+       for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
                unmap_single_vma(&tlb, vma, start, end, NULL);
+
+               /*
+                * zap_page_range does not specify whether mmap_sem should be
+                * held for read or write. That allows parallel zap_page_range
+                * operations to unmap a PTE and defer a flush, meaning that
+                * this call observes pte_none and fails to flush the TLB.
+                * Rather than adding a complex API, ensure that no stale
+                * TLB entries exist when this call returns.
+                */
+               flush_tlb_range(vma, start, end);
+       }
+
        mmu_notifier_invalidate_range_end(mm, start, end);
        tlb_finish_mmu(&tlb, start, end);
 }
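
The call path the comment above is defending against can be sketched from user space, assuming MADV_DONTNEED still reaches zap_page_range() with mmap_sem held only for read (which is what allows two zaps of the same range to run in parallel). The program below only shows the shape of that scenario, not the stale-TLB window itself; build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (16 * 4096)

static void *zapper(void *mem)
{
        /* madvise(MADV_DONTNEED) ends up in zap_page_range() */
        madvise(mem, LEN, MADV_DONTNEED);
        return NULL;
}

int main(void)
{
        void *mem = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        pthread_t a, b;

        if (mem == MAP_FAILED)
                return 1;
        memset(mem, 0xaa, LEN);         /* fault the pages in */

        /* Two concurrent zaps of the same range, both under mmap_sem read. */
        pthread_create(&a, NULL, zapper, mem);
        pthread_create(&b, NULL, zapper, mem);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        printf("first byte after zap: %#x\n", ((unsigned char *)mem)[0]);
        return 0;
}
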
@@ -1676,7 +1778,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 EXPORT_SYMBOL(vm_insert_page);
 
 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-                       pfn_t pfn, pgprot_t prot)
+                       pfn_t pfn, pgprot_t prot, bool mkwrite)
 {
        struct mm_struct *mm = vma->vm_mm;
        int retval;
@@ -1688,14 +1790,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
        if (!pte)
                goto out;
        retval = -EBUSY;
-       if (!pte_none(*pte))
-               goto out_unlock;
+       if (!pte_none(*pte)) {
+               if (mkwrite) {
+                       /*
+                        * For read faults on private mappings the PFN passed
+                        * in may not match the PFN we have mapped if the
+                        * mapped PFN is a writeable COW page.  In the mkwrite
+                        * case we are creating a writable PTE for a shared
+                        * mapping and we expect the PFNs to match.
+                        */
+                       if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+                               goto out_unlock;
+                       entry = *pte;
+                       goto out_mkwrite;
+               } else
+                       goto out_unlock;
+       }
 
        /* Ok, finally just insert the thing.. */
        if (pfn_t_devmap(pfn))
                entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
        else
                entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+       if (mkwrite) {
+               entry = pte_mkyoung(entry);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       }
+
        set_pte_at(mm, addr, pte, entry);
        update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
 
@@ -1766,14 +1889,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 
        track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
-       ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+       ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+                       false);
 
        return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn_prot);
 
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-                       pfn_t pfn)
+static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+                       pfn_t pfn, bool mkwrite)
 {
        pgprot_t pgprot = vma->vm_page_prot;
 
@@ -1802,10 +1926,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                page = pfn_to_page(pfn_t_to_pfn(pfn));
                return insert_page(vma, addr, page, pgprot);
        }
-       return insert_pfn(vma, addr, pfn, pgprot);
+       return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+}
+
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+                       pfn_t pfn)
+{
+       return __vm_insert_mixed(vma, addr, pfn, false);
+
 }
 EXPORT_SYMBOL(vm_insert_mixed);
 
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+                       pfn_t pfn)
+{
+       return __vm_insert_mixed(vma, addr, pfn, true);
+}
+EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results
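
A hedged usage sketch for the new export: a hypothetical driver fault handler (mydev, its fields, and mydev_fault() are invented; only vm_insert_mixed(), vm_insert_mixed_mkwrite(), and phys_to_pfn_t() are existing kernel APIs) so that write faults, including writes to a pfn already mapped read-only by an earlier read fault, end up with a writable, dirty PTE in a single call:

#include <linux/mm.h>
#include <linux/pfn_t.h>

struct mydev {                          /* hypothetical driver state */
        phys_addr_t base;
        unsigned long size;
};

static int mydev_fault(struct vm_fault *vmf)
{
        struct mydev *dev = vmf->vma->vm_file->private_data;
        phys_addr_t off = (phys_addr_t)vmf->pgoff << PAGE_SHIFT;
        pfn_t pfn;
        int rc;

        if (off >= dev->size)
                return VM_FAULT_SIGBUS;
        pfn = phys_to_pfn_t(dev->base + off, PFN_DEV | PFN_MAP);

        if (vmf->flags & FAULT_FLAG_WRITE)      /* writable + dirty in one go */
                rc = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
        else                                    /* read faults stay read-only */
                rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);

        if (rc == -ENOMEM)
                return VM_FAULT_OOM;
        if (rc < 0 && rc != -EBUSY)
                return VM_FAULT_SIGBUS;
        return VM_FAULT_NOPAGE;
}
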
@@ -2571,7 +2709,7 @@ static int do_wp_page(struct vm_fault *vmf)
         * not dirty accountable.
         */
        if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
-               int total_mapcount;
+               int total_map_swapcount;
                if (!trylock_page(vmf->page)) {
                        get_page(vmf->page);
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2586,8 +2724,8 @@ static int do_wp_page(struct vm_fault *vmf)
                        }
                        put_page(vmf->page);
                }
-               if (reuse_swap_page(vmf->page, &total_mapcount)) {
-                       if (total_mapcount == 1) {
+               if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+                       if (total_map_swapcount == 1) {
                                /*
                                 * The page is all ours. Move it to
                                 * our anon_vma so the rmap code will
@@ -2704,22 +2842,37 @@ EXPORT_SYMBOL(unmap_mapping_range);
 int do_swap_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
-       struct page *page, *swapcache;
+       struct page *page = NULL, *swapcache;
        struct mem_cgroup *memcg;
+       struct vma_swap_readahead swap_ra;
        swp_entry_t entry;
        pte_t pte;
        int locked;
        int exclusive = 0;
        int ret = 0;
+       bool vma_readahead = swap_use_vma_readahead();
 
-       if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+       if (vma_readahead)
+               page = swap_readahead_detect(vmf, &swap_ra);
+       if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
+               if (page)
+                       put_page(page);
                goto out;
+       }
 
        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
+               } else if (is_device_private_entry(entry)) {
+                       /*
+                        * For un-addressable device memory we call the pgmap
+                        * fault handler callback. The callback must migrate
+                        * the page back to some CPU accessible page.
+                        */
+                       ret = device_private_entry_fault(vma, vmf->address, entry,
+                                                vmf->flags, vmf->pmd);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else {
@@ -2729,10 +2882,16 @@ int do_swap_page(struct vm_fault *vmf)
                goto out;
        }
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
-       page = lookup_swap_cache(entry);
+       if (!page)
+               page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
+                                        vmf->address);
        if (!page) {
-               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
-                                       vmf->address);
+               if (vma_readahead)
+                       page = do_swap_page_readahead(entry,
+                               GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
+               else
+                       page = swapin_readahead(entry,
+                               GFP_HIGHUSER_MOVABLE, vma, vmf->address);
                if (!page) {
                        /*
                         * Back out if somebody else faulted in this pte
@@ -3802,6 +3961,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
        };
+       unsigned int dirty = flags & FAULT_FLAG_WRITE;
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        p4d_t *p4d;
@@ -3824,7 +3984,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
                barrier();
                if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
-                       unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
                        /* NUMA case for anonymous PUDs would go here */
 
@@ -3850,12 +4009,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t orig_pmd = *vmf.pmd;
 
                barrier();
+               if (unlikely(is_swap_pmd(orig_pmd))) {
+                       VM_BUG_ON(thp_migration_supported() &&
+                                         !is_pmd_migration_entry(orig_pmd));
+                       if (is_pmd_migration_entry(orig_pmd))
+                               pmd_migration_entry_wait(mm, vmf.pmd);
+                       return 0;
+               }
                if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
                        if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
-                       if ((vmf.flags & FAULT_FLAG_WRITE) &&
-                                       !pmd_write(orig_pmd)) {
+                       if (dirty && !pmd_write(orig_pmd)) {
                                ret = wp_huge_pmd(&vmf, orig_pmd);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
@@ -3888,6 +4053,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        /* do counter updates before entering really critical section. */
        check_sync_rss_stat(current);
 
+       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+                                           flags & FAULT_FLAG_INSTRUCTION,
+                                           flags & FAULT_FLAG_REMOTE))
+               return VM_FAULT_SIGSEGV;
+
        /*
         * Enable the memcg OOM handling for faults triggered in user
         * space.  Kernel faults are handled more gracefully.
@@ -3895,11 +4065,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_oom_enable();
 
-       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
-                                           flags & FAULT_FLAG_INSTRUCTION,
-                                           flags & FAULT_FLAG_REMOTE))
-               return VM_FAULT_SIGSEGV;
-
        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
@@ -4008,7 +4173,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
-               pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+                           unsigned long *start, unsigned long *end,
+                           pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
 {
        pgd_t *pgd;
        p4d_t *p4d;
@@ -4035,17 +4201,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
                if (!pmdpp)
                        goto out;
 
+               if (start && end) {
+                       *start = address & PMD_MASK;
+                       *end = *start + PMD_SIZE;
+                       mmu_notifier_invalidate_range_start(mm, *start, *end);
+               }
                *ptlp = pmd_lock(mm, pmd);
                if (pmd_huge(*pmd)) {
                        *pmdpp = pmd;
                        return 0;
                }
                spin_unlock(*ptlp);
+               if (start && end)
+                       mmu_notifier_invalidate_range_end(mm, *start, *end);
        }
 
        if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
                goto out;
 
+       if (start && end) {
+               *start = address & PAGE_MASK;
+               *end = *start + PAGE_SIZE;
+               mmu_notifier_invalidate_range_start(mm, *start, *end);
+       }
        ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
        if (!pte_present(*ptep))
                goto unlock;
@@ -4053,6 +4231,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
        return 0;
 unlock:
        pte_unmap_unlock(ptep, *ptlp);
+       if (start && end)
+               mmu_notifier_invalidate_range_end(mm, *start, *end);
 out:
        return -EINVAL;
 }
@@ -4064,20 +4244,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
 
        /* (void) is needed to make gcc happy */
        (void) __cond_lock(*ptlp,
-                          !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
-                                          ptlp)));
+                          !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+                                                   ptepp, NULL, ptlp)));
        return res;
 }
 
 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+                            unsigned long *start, unsigned long *end,
                             pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
 {
        int res;
 
        /* (void) is needed to make gcc happy */
        (void) __cond_lock(*ptlp,
-                          !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
-                                          ptlp)));
+                          !(res = __follow_pte_pmd(mm, address, start, end,
+                                                   ptepp, pmdpp, ptlp)));
        return res;
 }
 EXPORT_SYMBOL(follow_pte_pmd);
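
With the new start/end parameters, a successful follow_pte_pmd() now returns with an mmu_notifier invalidation open that the caller must close once it is done with the entry. A hedged caller sketch (the function and its body are illustrative, loosely modeled on the DAX page-cleaning path; only the APIs it calls are real):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void wrprotect_one_mapping(struct mm_struct *mm, unsigned long address)
{
        unsigned long start = 0, end = 0;
        spinlock_t *ptl;
        pmd_t *pmdp = NULL;
        pte_t *ptep = NULL;

        /*
         * On success the helper has already called
         * mmu_notifier_invalidate_range_start(mm, start, end) and returns
         * with the PTE or PMD lock held; on failure it has balanced the
         * notifier itself.
         */
        if (follow_pte_pmd(mm, address, &start, &end, &ptep, &pmdp, &ptl))
                return;

        if (pmdp) {
                /* write-protect / clean the huge PMD here */
                spin_unlock(ptl);
        } else {
                /* write-protect / clean the PTE here */
                pte_unmap_unlock(ptep, ptl);
        }

        /* The caller ends the invalidation begun inside the helper. */
        mmu_notifier_invalidate_range_end(mm, start, end);
}
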
@@ -4340,19 +4521,53 @@ static void clear_gigantic_page(struct page *page,
        }
 }
 void clear_huge_page(struct page *page,
-                    unsigned long addr, unsigned int pages_per_huge_page)
+                    unsigned long addr_hint, unsigned int pages_per_huge_page)
 {
-       int i;
+       int i, n, base, l;
+       unsigned long addr = addr_hint &
+               ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
 
        if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
                clear_gigantic_page(page, addr, pages_per_huge_page);
                return;
        }
 
+       /* Clear the sub-page at addr_hint last to keep its cache lines hot */
        might_sleep();
-       for (i = 0; i < pages_per_huge_page; i++) {
+       n = (addr_hint - addr) / PAGE_SIZE;
+       if (2 * n <= pages_per_huge_page) {
+               /* If sub-page to access in first half of huge page */
+               base = 0;
+               l = n;
+               /* Clear sub-pages at the end of huge page */
+               for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
+                       cond_resched();
+                       clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+               }
+       } else {
+               /* If sub-page to access in second half of huge page */
+               base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
+               l = pages_per_huge_page - n;
+               /* Clear sub-pages at the beginning of the huge page */
+               for (i = 0; i < base; i++) {
+                       cond_resched();
+                       clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+               }
+       }
+       /*
+        * Clear remaining sub-pages in left-right-left-right pattern
+        * towards the sub-page to access
+        */
+       for (i = 0; i < l; i++) {
+               int left_idx = base + i;
+               int right_idx = base + 2 * l - 1 - i;
+
+               cond_resched();
+               clear_user_highpage(page + left_idx,
+                                   addr + left_idx * PAGE_SIZE);
                cond_resched();
-               clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+               clear_user_highpage(page + right_idx,
+                                   addr + right_idx * PAGE_SIZE);
        }
 }
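
The clearing order produced by the loops above is easiest to see with a standalone user-space toy (PAGES_PER_HUGE_PAGE is an illustrative constant; the kernel derives n from addr_hint instead of taking it as a parameter). It reproduces the same index arithmetic and prints, for every possible faulting sub-page, the order in which sub-pages would be cleared:

#include <stdio.h>

#define PAGES_PER_HUGE_PAGE 8

static void show_clear_order(int n)     /* n = index of the faulting sub-page */
{
        int i, base, l;

        printf("fault on sub-page %d:", n);
        if (2 * n <= PAGES_PER_HUGE_PAGE) {
                /* Target in the first half: clear the tail end first. */
                base = 0;
                l = n;
                for (i = PAGES_PER_HUGE_PAGE - 1; i >= 2 * n; i--)
                        printf(" %d", i);
        } else {
                /* Target in the second half: clear the head first. */
                base = PAGES_PER_HUGE_PAGE - 2 * (PAGES_PER_HUGE_PAGE - n);
                l = PAGES_PER_HUGE_PAGE - n;
                for (i = 0; i < base; i++)
                        printf(" %d", i);
        }
        /* Then converge on the target from both sides. */
        for (i = 0; i < l; i++)
                printf(" %d %d", base + i, base + 2 * l - 1 - i);
        printf("\n");
}

int main(void)
{
        for (int n = 0; n < PAGES_PER_HUGE_PAGE; n++)
                show_clear_order(n);
        return 0;
}

With 8 sub-pages and a fault on sub-page 2, for instance, it prints the order 7 6 5 4 0 3 1 2: the faulting sub-page comes last and its neighbours 1 and 3 are cleared just before it, which is what keeps those cache lines hot when the faulting access finally happens.
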