mm/memory.c: fix mem_cgroup_oom_disable() call missing
diff --git a/mm/memory.c b/mm/memory.c
index 13ee83b4387872b325414bf46b8f44710c2230f2..0bbc1d612a632f5f9077e389f0efa58d5e23bb56 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -49,6 +49,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/memremap.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/export.h>
@@ -817,8 +818,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-                               pte_t pte)
+struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                            pte_t pte, bool with_public_device)
 {
        unsigned long pfn = pte_pfn(pte);
 
@@ -829,8 +830,31 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                        return vma->vm_ops->find_special_page(vma, addr);
                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                        return NULL;
-               if (!is_zero_pfn(pfn))
-                       print_bad_pte(vma, addr, pte, NULL);
+               if (is_zero_pfn(pfn))
+                       return NULL;
+
+               /*
+                * Device public pages are special pages (they are ZONE_DEVICE
+                * pages but different from persistent memory). They behave
+                * almost like normal pages. The difference is that they are
+                * not on the lru and thus should never be involved in
+                * anything that involves lru manipulation (mlock, numa
+                * balancing, ...).
+                *
+                * This is why we still want to return NULL for such pages
+                * from vm_normal_page(), so that we do not have to special
+                * case every call site of vm_normal_page().
+                */
+               if (likely(pfn < highest_memmap_pfn)) {
+                       struct page *page = pfn_to_page(pfn);
+
+                       if (is_device_public_page(page)) {
+                               if (with_public_device)
+                                       return page;
+                               return NULL;
+                       }
+               }
+               print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }
 
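Note: the new with_public_device argument is not something ordinary callers
need to see. The companion change in include/linux/mm.h is not part of this
hunk, but vm_normal_page() is presumably kept as a thin wrapper that passes
false, roughly along these lines (a sketch, not quoted from the patch):

    /*
     * Sketch of the assumed wrapper in include/linux/mm.h: ordinary callers
     * keep the old behaviour and never see ZONE_DEVICE public pages; only
     * call sites that opt in (e.g. zap_pte_range() below) pass true.
     */
    struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                                 pte_t pte, bool with_public_device);
    #define vm_normal_page(vma, addr, pte) \
            _vm_normal_page(vma, addr, pte, false)
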
@@ -956,6 +980,35 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                        pte = pte_swp_mksoft_dirty(pte);
                                set_pte_at(src_mm, addr, src_pte, pte);
                        }
+               } else if (is_device_private_entry(entry)) {
+                       page = device_private_entry_to_page(entry);
+
+                       /*
+                        * Update rss count even for unaddressable pages, as
+                        * they should be treated just like normal pages in
+                        * this respect.
+                        *
+                        * We will likely want to have some new rss counters
+                        * for unaddressable pages, at some point. But for now
+                        * keep things as they are.
+                        */
+                       get_page(page);
+                       rss[mm_counter(page)]++;
+                       page_dup_rmap(page, false);
+
+                       /*
+                        * We do not preserve soft-dirty information, because
+                        * so far checkpoint/restore is the only feature that
+                        * requires it, and checkpoint/restore does not work
+                        * when a device driver is involved (you cannot easily
+                        * save and restore device driver state).
+                        */
+                       if (is_write_device_private_entry(entry) &&
+                           is_cow_mapping(vm_flags)) {
+                               make_device_private_entry_read(&entry);
+                               pte = swp_entry_to_pte(entry);
+                               set_pte_at(src_mm, addr, src_pte, pte);
+                       }
                }
                goto out_set_pte;
        }
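For reference, the device private entry helpers used above are defined in
include/linux/swapops.h, not in this file. Assuming the SWP_DEVICE_READ and
SWP_DEVICE_WRITE swap types introduced elsewhere in this series, they are
expected to look roughly like this (a sketch, not part of this patch):

    /*
     * Sketch: a device private page is encoded as a non-swap swap entry
     * whose offset is the page's pfn. Downgrading a write entry to a read
     * entry in parent and child forces a fault on the next write, giving
     * the usual copy-on-write semantics for these pages.
     */
    static inline bool is_write_device_private_entry(swp_entry_t entry)
    {
            return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
    }

    static inline void make_device_private_entry_read(swp_entry_t *entry)
    {
            *entry = swp_entry(SWP_DEVICE_READ, swp_offset(*entry));
    }

    static inline struct page *device_private_entry_to_page(swp_entry_t entry)
    {
            return pfn_to_page(swp_offset(entry));
    }
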
@@ -982,6 +1035,19 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                get_page(page);
                page_dup_rmap(page, false);
                rss[mm_counter(page)]++;
+       } else if (pte_devmap(pte)) {
+               page = pte_page(pte);
+
+               /*
+                * Cache coherent device memory behaves like regular pages and
+                * not like persistent memory pages. For more information see
+                * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
+                */
+               if (is_device_public_page(page)) {
+                       get_page(page);
+                       page_dup_rmap(page, false);
+                       rss[mm_counter(page)]++;
+               }
        }
 
 out_set_pte:
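The pte_devmap() case above covers both persistent memory and device public
pages; only the latter are refcounted and rmap'd like normal pages here. The
is_device_public_page() helper is declared in include/linux/mm.h by this
series; a simplified sketch follows (the MEMORY_DEVICE_PUBLIC name is an
assumption, the comment above refers to the same memory type as
MEMORY_DEVICE_CACHE_COHERENT):

    /*
     * Sketch only: a device public page is a ZONE_DEVICE page whose
     * dev_pagemap marks it as CPU cache coherent device memory.
     */
    static inline bool is_device_public_page(const struct page *page)
    {
            return is_zone_device_page(page) &&
                    page->pgmap->type == MEMORY_DEVICE_PUBLIC;
    }
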
@@ -1065,7 +1131,8 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
+               if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
+                       || pmd_devmap(*src_pmd)) {
                        int err;
                        VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
                        err = copy_huge_pmd(dst_mm, src_mm,
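The is_swap_pmd() check added here (and in zap_pmd_range() below) routes
pmd-level migration entries into copy_huge_pmd(). The helper is expected to
live in include/linux/huge_mm.h and amounts to "populated but not present";
a sketch, assuming the THP migration support added by this series:

    /*
     * Sketch: a "swap pmd" is a pmd entry that is neither empty nor
     * present, i.e. it encodes a swap entry such as a pmd migration entry.
     */
    static inline int is_swap_pmd(pmd_t pmd)
    {
            return !pmd_none(pmd) && !pmd_present(pmd);
    }
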
@@ -1236,7 +1303,7 @@ again:
                if (pte_present(ptent)) {
                        struct page *page;
 
-                       page = vm_normal_page(vma, addr, ptent);
+                       page = _vm_normal_page(vma, addr, ptent, true);
                        if (unlikely(details) && page) {
                                /*
                                 * unmap_shared_mapping_pages() wants to
@@ -1273,6 +1340,29 @@ again:
                        }
                        continue;
                }
+
+               entry = pte_to_swp_entry(ptent);
+               if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+                       struct page *page = device_private_entry_to_page(entry);
+
+                       if (unlikely(details && details->check_mapping)) {
+                               /*
+                                * unmap_shared_mapping_pages() wants to
+                                * invalidate cache without truncating:
+                                * unmap shared but keep private pages.
+                                */
+                               if (details->check_mapping !=
+                                   page_rmapping(page))
+                                       continue;
+                       }
+
+                       pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+                       rss[mm_counter(page)]--;
+                       page_remove_rmap(page, false);
+                       put_page(page);
+                       continue;
+               }
+
                /* If details->check_mapping, we leave swap entries. */
                if (unlikely(details))
                        continue;
@@ -1326,7 +1416,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+               if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
                                    !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
@@ -2775,6 +2865,14 @@ int do_swap_page(struct vm_fault *vmf)
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
+               } else if (is_device_private_entry(entry)) {
+                       /*
+                        * For unaddressable device memory we call the pgmap
+                        * fault handler callback. The callback must migrate
+                        * the page back to a CPU accessible page.
+                        */
+                       ret = device_private_entry_fault(vma, vmf->address, entry,
+                                                vmf->flags, vmf->pmd);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else {
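device_private_entry_fault() itself is not defined in mm/memory.c; it is
provided by the HMM code and dispatches to the owning device driver through
the page's dev_pagemap. Roughly (a sketch, assuming struct hmm_devmem embeds
the dev_pagemap and carries the driver's fault callback):

    /*
     * Sketch: look up the driver that owns this unaddressable page and let
     * its fault callback migrate the data to a CPU accessible page, then
     * return a VM_FAULT_* code.
     */
    int device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr,
                                   swp_entry_t entry, unsigned int flags,
                                   pmd_t *pmdp)
    {
            struct page *page = device_private_entry_to_page(entry);
            struct hmm_devmem *devmem;

            devmem = container_of(page->pgmap, struct hmm_devmem, pagemap);
            return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
    }
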
@@ -3863,6 +3961,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
        };
+       unsigned int dirty = flags & FAULT_FLAG_WRITE;
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
        p4d_t *p4d;
@@ -3885,7 +3984,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
                barrier();
                if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
-                       unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
                        /* NUMA case for anonymous PUDs would go here */
 
@@ -3911,12 +4009,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                pmd_t orig_pmd = *vmf.pmd;
 
                barrier();
+               if (unlikely(is_swap_pmd(orig_pmd))) {
+                       VM_BUG_ON(thp_migration_supported() &&
+                                         !is_pmd_migration_entry(orig_pmd));
+                       if (is_pmd_migration_entry(orig_pmd))
+                               pmd_migration_entry_wait(mm, vmf.pmd);
+                       return 0;
+               }
                if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
                        if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf, orig_pmd);
 
-                       if ((vmf.flags & FAULT_FLAG_WRITE) &&
-                                       !pmd_write(orig_pmd)) {
+                       if (dirty && !pmd_write(orig_pmd)) {
                                ret = wp_huge_pmd(&vmf, orig_pmd);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
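The pmd migration entry handling above relies on helpers from
include/linux/swapops.h; when CONFIG_ARCH_ENABLE_THP_MIGRATION is not set,
is_pmd_migration_entry() is expected to be a stub returning 0, which is why
the VM_BUG_ON() is guarded by thp_migration_supported(). A sketch of the
enabled variant (assumption, not part of this hunk):

    /*
     * Sketch: a pmd migration entry is a non-present pmd whose swap entry
     * is a migration entry; the faulting task waits for the migration to
     * finish and then retries the fault.
     */
    static inline int is_pmd_migration_entry(pmd_t pmd)
    {
            return !pmd_present(pmd) &&
                    is_migration_entry(pmd_to_swp_entry(pmd));
    }
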
@@ -3949,6 +4053,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        /* do counter updates before entering really critical section. */
        check_sync_rss_stat(current);
 
+       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+                                           flags & FAULT_FLAG_INSTRUCTION,
+                                           flags & FAULT_FLAG_REMOTE))
+               return VM_FAULT_SIGSEGV;
+
        /*
         * Enable the memcg OOM handling for faults triggered in user
         * space.  Kernel faults are handled more gracefully.
@@ -3956,11 +4065,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_oom_enable();
 
-       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
-                                           flags & FAULT_FLAG_INSTRUCTION,
-                                           flags & FAULT_FLAG_REMOTE))
-               return VM_FAULT_SIGSEGV;
-
        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
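
The last two hunks are the mem_cgroup_oom_disable() fix from the subject
line: when arch_vma_access_permitted() failed, the old code returned
VM_FAULT_SIGSEGV after mem_cgroup_oom_enable() had already been called, so
the matching mem_cgroup_oom_disable() at the end of handle_mm_fault() was
never reached and the task was left with memcg OOM handling enabled. Moving
the check above mem_cgroup_oom_enable() keeps the pair balanced. For context,
the disable side later in the function looks approximately like this (quoted
from memory, not part of this diff):

    if (flags & FAULT_FLAG_USER) {
            mem_cgroup_oom_disable();
            /*
             * The task may have entered a memcg OOM situation but if the
             * allocation error was handled gracefully (no VM_FAULT_OOM),
             * there is no need to kill anything.
             */
            if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                    mem_cgroup_oom_synchronize(false);
    }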