diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 641cedfc8c0fd0c3d81311ac1bb7abb936e970b3..81718c56b8f5d951cce489e17222bdae1fa2a852 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -740,7 +740,15 @@ void resv_map_release(struct kref *ref)
 
 static inline struct resv_map *inode_resv_map(struct inode *inode)
 {
-       return inode->i_mapping->private_data;
+       /*
+        * At inode evict time, i_mapping may not point to the original
+        * address space within the inode.  This original address space
+        * contains the pointer to the resv_map.  So, always use the
+        * address space embedded within the inode.
+        * The VERY common case is inode->i_mapping == &inode->i_data, but
+        * this may not be true for device special inodes.
+        */
+       return (struct resv_map *)(&inode->i_data)->private_data;
 }
 
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
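The hunk above stops reading the resv_map through inode->i_mapping and uses the address_space embedded in the inode instead. Below is a minimal userspace sketch of that distinction; the struct layouts and inode_resv_map_sketch() are simplified stand-ins, not the real kernel definitions.

/*
 * Simplified sketch, not kernel code: i_mapping is a pointer that can be
 * redirected away from the inode (e.g. for device special inodes), while
 * i_data is the address_space embedded in the inode itself, which is where
 * hugetlbfs stored the resv_map pointer.
 */
#include <stdio.h>

struct address_space { void *private_data; };

struct inode {
	struct address_space *i_mapping;	/* may point somewhere else */
	struct address_space i_data;		/* embedded address space */
};

/* Mirrors the new inode_resv_map(): always use the embedded address space. */
static void *inode_resv_map_sketch(struct inode *inode)
{
	return inode->i_data.private_data;
}

int main(void)
{
	int resv_map;				/* stand-in for the real resv_map */
	struct address_space elsewhere = { .private_data = NULL };
	struct inode ino = { .i_data = { .private_data = &resv_map } };

	ino.i_mapping = &elsewhere;		/* i_mapping no longer points at i_data */
	printf("resv_map found: %s\n",
	       inode_resv_map_sketch(&ino) == &resv_map ? "yes" : "no");
	return 0;
}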
@@ -1059,6 +1067,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
        free_contig_range(page_to_pfn(page), 1 << order);
 }
 
+#ifdef CONFIG_CONTIG_ALLOC
 static int __alloc_gigantic_page(unsigned long start_pfn,
                                unsigned long nr_pages, gfp_t gfp_mask)
 {
@@ -1143,11 +1152,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
 static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+                                       int nid, nodemask_t *nodemask)
+{
+       return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
 
 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
-               int nid, nodemask_t *nodemask) { return NULL; }
+                                       int nid, nodemask_t *nodemask)
+{
+       return NULL;
+}
 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
@@ -1157,7 +1175,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
 
-       if (hstate_is_gigantic(h) && !gigantic_page_supported())
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
        h->nr_huge_pages--;
@@ -1258,12 +1276,23 @@ void free_huge_page(struct page *page)
        ClearPagePrivate(page);
 
        /*
-        * A return code of zero implies that the subpool will be under its
-        * minimum size if the reservation is not restored after page is free.
-        * Therefore, force restore_reserve operation.
+        * If PagePrivate() was set on page, page allocation consumed a
+        * reservation.  If the page was associated with a subpool, there
+        * would have been a page reserved in the subpool before allocation
+        * via hugepage_subpool_get_pages().  Since we are 'restoring' the
+        * reservation, do not call hugepage_subpool_put_pages() as this will
+        * remove the reserved page from the subpool.
         */
-       if (hugepage_subpool_put_pages(spool, 1) == 0)
-               restore_reserve = true;
+       if (!restore_reserve) {
+               /*
+                * A return code of zero implies that the subpool will be
+                * under its minimum size if the reservation is not restored
+                * after page is free.  Therefore, force restore_reserve
+                * operation.
+                */
+               if (hugepage_subpool_put_pages(spool, 1) == 0)
+                       restore_reserve = true;
+       }
 
        spin_lock(&hugetlb_lock);
        clear_page_huge_active(page);
@@ -1574,8 +1603,9 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
         */
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                SetPageHugeTemporary(page);
+               spin_unlock(&hugetlb_lock);
                put_page(page);
-               page = NULL;
+               return NULL;
        } else {
                h->surplus_huge_pages++;
                h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -2277,13 +2307,47 @@ found:
 }
 
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
-                                               nodemask_t *nodes_allowed)
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+                             nodemask_t *nodes_allowed)
 {
        unsigned long min_count, ret;
 
-       if (hstate_is_gigantic(h) && !gigantic_page_supported())
-               return h->max_huge_pages;
+       spin_lock(&hugetlb_lock);
+
+       /*
+        * Check for a node specific request.
+        * Changing node specific huge page count may require a corresponding
+        * change to the global count.  In any case, the passed node mask
+        * (nodes_allowed) will restrict alloc/free to the specified node.
+        */
+       if (nid != NUMA_NO_NODE) {
+               unsigned long old_count = count;
+
+               count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+               /*
+                * User may have specified a large count value which caused the
+                * above calculation to overflow.  In this case, they wanted
+                * to allocate as many huge pages as possible.  Set count to
+                * largest possible value to align with their intention.
+                */
+               if (count < old_count)
+                       count = ULONG_MAX;
+       }
+
+       /*
+        * Gigantic page runtime allocation depends on the capability for
+        * large contiguous page range allocation.
+        * If the system does not provide this feature, return an error when
+        * the user tries to allocate gigantic pages, but still let the user
+        * free gigantic pages that were allocated at boot time.
+        */
+       if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+               if (count > persistent_huge_pages(h)) {
+                       spin_unlock(&hugetlb_lock);
+                       return -EINVAL;
+               }
+               /* Fall through to decrease pool */
+       }
 
        /*
         * Increase the pool size
@@ -2296,7 +2360,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
         * pool might be one hugepage larger than it needs to be, but
         * within all the constraints specified by the sysctls.
         */
-       spin_lock(&hugetlb_lock);
        while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
                if (!adjust_pool_surplus(h, nodes_allowed, -1))
                        break;
@@ -2351,9 +2414,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                        break;
        }
 out:
-       ret = persistent_huge_pages(h);
+       h->max_huge_pages = persistent_huge_pages(h);
        spin_unlock(&hugetlb_lock);
-       return ret;
+
+       return 0;
 }
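In the node-specific branch of set_max_huge_pages() above, folding the per-node request into a global count can wrap the unsigned addition; the count < old_count test clamps the result to ULONG_MAX so an oversized request still means "as many pages as possible". A standalone illustration of that wraparound check follows, using arbitrary example values rather than real pool counters.

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long nr_global = 100, nr_node = 10;	/* total pages / pages on this node */
	unsigned long count = ULONG_MAX - 5;		/* oversized user request */
	unsigned long old_count = count;

	count += nr_global - nr_node;			/* unsigned addition wraps around */
	if (count < old_count)
		count = ULONG_MAX;			/* clamp to "as many as possible" */

	printf("count = %lu (ULONG_MAX = %lu)\n", count, ULONG_MAX);
	return 0;
}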
 
 #define HSTATE_ATTR_RO(_name) \
@@ -2403,41 +2467,32 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
                                           unsigned long count, size_t len)
 {
        int err;
-       NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+       nodemask_t nodes_allowed, *n_mask;
 
-       if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
-               err = -EINVAL;
-               goto out;
-       }
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return -EINVAL;
 
        if (nid == NUMA_NO_NODE) {
                /*
                 * global hstate attribute
                 */
                if (!(obey_mempolicy &&
-                               init_nodemask_of_mempolicy(nodes_allowed))) {
-                       NODEMASK_FREE(nodes_allowed);
-                       nodes_allowed = &node_states[N_MEMORY];
-               }
-       } else if (nodes_allowed) {
+                               init_nodemask_of_mempolicy(&nodes_allowed)))
+                       n_mask = &node_states[N_MEMORY];
+               else
+                       n_mask = &nodes_allowed;
+       } else {
                /*
-                * per node hstate attribute: adjust count to global,
-                * but restrict alloc/free to the specified node.
+                * Node specific request.  count adjustment happens in
+                * set_max_huge_pages() after acquiring hugetlb_lock.
                 */
-               count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
-               init_nodemask_of_node(nodes_allowed, nid);
-       } else
-               nodes_allowed = &node_states[N_MEMORY];
-
-       h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+               init_nodemask_of_node(&nodes_allowed, nid);
+               n_mask = &nodes_allowed;
+       }
 
-       if (nodes_allowed != &node_states[N_MEMORY])
-               NODEMASK_FREE(nodes_allowed);
+       err = set_max_huge_pages(h, count, nid, n_mask);
 
-       return len;
-out:
-       NODEMASK_FREE(nodes_allowed);
-       return err;
+       return err ? err : len;
 }
 
 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
@@ -3247,7 +3302,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
        if (cow) {
-               mmu_notifier_range_init(&range, src, vma->vm_start,
+               mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+                                       vma->vm_start,
                                        vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
        }
@@ -3359,7 +3415,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
        /*
         * If sharing possible, alert mmu notifiers of worst case.
         */
-       mmu_notifier_range_init(&range, mm, start, end);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+                               end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
        mmu_notifier_invalidate_range_start(&range);
        address = start;
@@ -3626,7 +3683,8 @@ retry_avoidcopy:
                            pages_per_huge_page(h));
        __SetPageUptodate(new_page);
 
-       mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+                               haddr + huge_page_size(h));
        mmu_notifier_invalidate_range_start(&range);
 
        /*
@@ -3777,8 +3835,7 @@ retry:
                         * handling userfault.  Reacquire after handling
                         * fault to make calling code simpler.
                         */
-                       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
-                                                       idx, haddr);
+                       hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        ret = handle_userfault(&vmf, VM_UFFD_MISSING);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3886,21 +3943,14 @@ backout_unlocked:
 }
 
 #ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-                           struct vm_area_struct *vma,
-                           struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
                            pgoff_t idx, unsigned long address)
 {
        unsigned long key[2];
        u32 hash;
 
-       if (vma->vm_flags & VM_SHARED) {
-               key[0] = (unsigned long) mapping;
-               key[1] = idx;
-       } else {
-               key[0] = (unsigned long) mm;
-               key[1] = address >> huge_page_shift(h);
-       }
+       key[0] = (unsigned long) mapping;
+       key[1] = idx;
 
        hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
 
@@ -3911,9 +3961,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
  * For uniprocessor systems we always use a single mutex, so just
  * return 0 and avoid the hashing overhead.
  */
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
-                           struct vm_area_struct *vma,
-                           struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
                            pgoff_t idx, unsigned long address)
 {
        return 0;
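With the two hugetlb_fault_mutex_hash() variants above, the hash key is derived from (mapping, index) alone, so any task faulting the same offset of the same file lands on the same mutex regardless of which mm or vma it faults through. The sketch below illustrates that property in userspace; the simple mixing function and the 64-entry table size are stand-ins for the kernel's jhash2 and its real table, not the actual implementation.

#include <stdint.h>
#include <stdio.h>

#define NUM_FAULT_MUTEXES 64U			/* assumed power-of-two table size */

/* Stand-in hash over the (mapping, index) key. */
static uint32_t fault_mutex_hash(const void *mapping, unsigned long idx)
{
	uint64_t h = 5381;

	h = h * 33 + (uintptr_t)mapping;
	h = h * 33 + idx;
	return (uint32_t)h & (NUM_FAULT_MUTEXES - 1);
}

int main(void)
{
	int file_a, file_b;			/* stand-ins for two address_spaces */

	/* Same file and offset from any mm/vma hashes to the same mutex slot. */
	printf("%u %u\n", fault_mutex_hash(&file_a, 10), fault_mutex_hash(&file_a, 10));
	/* A different file (almost always) picks a different slot. */
	printf("%u\n", fault_mutex_hash(&file_b, 10));
	return 0;
}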
@@ -3958,7 +4006,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
-       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+       hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
        entry = huge_ptep_get(ptep);
@@ -4371,7 +4419,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         * start/end.  Set range.start/range.end to cover the maximum possible
         * range if PMD sharing is possible.
         */
-       mmu_notifier_range_init(&range, mm, start, end);
+       mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+                               0, vma, mm, start, end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
 
        BUG_ON(address >= end);
@@ -4477,6 +4526,11 @@ int hugetlb_reserve_pages(struct inode *inode,
         * called to make the mapping read-write. Assume !vma is a shm mapping
         */
        if (!vma || vma->vm_flags & VM_MAYSHARE) {
+               /*
+                * resv_map cannot be NULL as hugetlb_reserve_pages is only
+                * called for inodes for which resv_maps were created (see
+                * hugetlbfs_get_inode).
+                */
                resv_map = inode_resv_map(inode);
 
                chg = region_chg(resv_map, from, to);
@@ -4568,6 +4622,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
        struct hugepage_subpool *spool = subpool_inode(inode);
        long gbl_reserve;
 
+       /*
+        * Since this routine can be called in the evict inode path for all
+        * hugetlbfs inodes, resv_map could be NULL.
+        */
        if (resv_map) {
                chg = region_del(resv_map, start, end);
                /*