hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs mappings until fork()
authorMel Gorman <mel@csn.ul.ie>
Thu, 24 Jul 2008 04:27:23 +0000 (21:27 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Jul 2008 17:47:16 +0000 (10:47 -0700)
This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in
a similar manner to the reservations taken for MAP_SHARED mappings.  The
reserve count is accounted both globally and on a per-VMA basis for
private mappings.  This guarantees that a process that successfully calls
mmap() will successfully fault all pages in the future unless fork() is
called.

The characteristics of private mappings of hugetlbfs files behaviour after
this patch are;

1. The process calling mmap() is guaranteed to succeed all future faults until
   it forks().
2. On fork(), the parent may die due to SIGKILL on writes to the private
   mapping if enough pages are not available for the COW. For reasonably
   reliable behaviour in the face of a small huge page pool, children of
   hugepage-aware processes should not reference the mappings; such as
   might occur when fork()ing to exec().
3. On fork(), the child VMAs inherit no reserves. Reads on pages already
   faulted by the parent will succeed. Successful writes will depend on enough
   huge pages being free in the pool.
4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper
   and at fault time otherwise.

Before this patch, all reads or writes in the child potentially needs page
allocations that can later lead to the death of the parent.  This applies
to reads and writes of uninstantiated pages as well as COW.  After the
patch it is only a write to an instantiated page that causes problems.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/hugetlbfs/inode.c
include/linux/hugetlb.h
kernel/fork.c
mm/hugetlb.c

index aeabf80f81a5d38561768ea95d1b544605e5dcc4..1576bbecd084575e2cb97fffe6846c2039309dd5 100644 (file)
@@ -103,9 +103,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        ret = -ENOMEM;
        len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
        ret = -ENOMEM;
        len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-       if (vma->vm_flags & VM_MAYSHARE &&
-           hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
-                                 len >> HPAGE_SHIFT))
+       if (hugetlb_reserve_pages(inode,
+                               vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+                               len >> HPAGE_SHIFT, vma))
                goto out;
 
        ret = 0;
                goto out;
 
        ret = 0;
@@ -942,7 +942,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
                goto out_dentry;
 
        error = -ENOMEM;
                goto out_dentry;
 
        error = -ENOMEM;
-       if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+       if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT, NULL))
                goto out_inode;
 
        d_instantiate(dentry, inode);
                goto out_inode;
 
        d_instantiate(dentry, inode);
index a79e80b689d803d161a4ca694370aeec1ba25629..185b14c9f021fb9d3fb76ae6a1296f8e71206a73 100644 (file)
@@ -17,6 +17,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
        return vma->vm_flags & VM_HUGETLB;
 }
 
        return vma->vm_flags & VM_HUGETLB;
 }
 
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
@@ -30,7 +31,8 @@ int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access);
-int hugetlb_reserve_pages(struct inode *inode, long from, long to);
+int hugetlb_reserve_pages(struct inode *inode, long from, long to,
+                                               struct vm_area_struct *vma);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
@@ -58,6 +60,11 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
        return 0;
 }
 {
        return 0;
 }
+
+static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+}
+
 static inline unsigned long hugetlb_total_pages(void)
 {
        return 0;
 static inline unsigned long hugetlb_total_pages(void)
 {
        return 0;
index adefc1131f274082960de7f375a67d4949cdfe27..552c8d8e77ad5e3fb43f606081c57023bb25e4c6 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
 #include <linux/security.h>
+#include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -306,6 +307,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                        spin_unlock(&file->f_mapping->i_mmap_lock);
                }
 
                        spin_unlock(&file->f_mapping->i_mmap_lock);
                }
 
+               /*
+                * Clear hugetlb-related page reserves for children. This only
+                * affects MAP_PRIVATE mappings. Faults generated by the child
+                * are not guaranteed to succeed, even if read-only
+                */
+               if (is_vm_hugetlb_page(tmp))
+                       reset_vma_resv_huge_pages(tmp);
+
                /*
                 * Link in the new vma and copy the page table entries.
                 */
                /*
                 * Link in the new vma and copy the page table entries.
                 */
index a4dbba8965f3a19abb88b356e3da4b8f61d26895..0af500db3632e809956b4acc8f8580724efa628d 100644 (file)
@@ -40,6 +40,69 @@ static int hugetlb_next_nid;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ */
+static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       if (!(vma->vm_flags & VM_SHARED))
+               return (unsigned long)vma->vm_private_data;
+       return 0;
+}
+
+static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
+                                                       unsigned long reserve)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+       vma->vm_private_data = (void *)reserve;
+}
+
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED) {
+               /* Shared mappings always use reserves */
+               resv_huge_pages--;
+       } else {
+               /*
+                * Only the process that called mmap() has reserves for
+                * private mappings.
+                */
+               if (vma_resv_huge_pages(vma)) {
+                       resv_huge_pages--;
+                       reserve = (unsigned long)vma->vm_private_data - 1;
+                       vma->vm_private_data = (void *)reserve;
+               }
+       }
+}
+
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       if (!(vma->vm_flags & VM_SHARED))
+               vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static int vma_has_private_reserves(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED)
+               return 0;
+       if (!vma_resv_huge_pages(vma))
+               return 0;
+       return 1;
+}
+
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
        int i;
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
        int i;
@@ -101,6 +164,15 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
        struct zone *zone;
        struct zoneref *z;
 
        struct zone *zone;
        struct zoneref *z;
 
+       /*
+        * A child process with MAP_PRIVATE mappings created by their parent
+        * have no page reserves. This check ensures that reservations are
+        * not "stolen". The child may still get SIGKILLed
+        */
+       if (!vma_has_private_reserves(vma) &&
+                       free_huge_pages - resv_huge_pages == 0)
+               return NULL;
+
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                MAX_NR_ZONES - 1, nodemask) {
                nid = zone_to_nid(zone);
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                MAX_NR_ZONES - 1, nodemask) {
                nid = zone_to_nid(zone);
@@ -111,8 +183,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
                        list_del(&page->lru);
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
                        list_del(&page->lru);
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
-                       if (vma && vma->vm_flags & VM_MAYSHARE)
-                               resv_huge_pages--;
+                       decrement_hugepage_resv_vma(vma);
+
                        break;
                }
        }
                        break;
                }
        }
@@ -461,55 +533,40 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
        }
 }
 
        }
 }
 
-
-static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
-                                               unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+                                   unsigned long addr)
 {
        struct page *page;
 {
        struct page *page;
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct inode *inode = mapping->host;
+       unsigned int chg = 0;
+
+       /*
+        * Processes that did not create the mapping will have no reserves and
+        * will not have accounted against quota. Check that the quota can be
+        * made before satisfying the allocation
+        */
+       if (!vma_has_private_reserves(vma)) {
+               chg = 1;
+               if (hugetlb_get_quota(inode->i_mapping, chg))
+                       return ERR_PTR(-ENOSPC);
+       }
 
        spin_lock(&hugetlb_lock);
        page = dequeue_huge_page_vma(vma, addr);
        spin_unlock(&hugetlb_lock);
 
        spin_lock(&hugetlb_lock);
        page = dequeue_huge_page_vma(vma, addr);
        spin_unlock(&hugetlb_lock);
-       return page ? page : ERR_PTR(-VM_FAULT_OOM);
-}
 
 
-static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
-                                               unsigned long addr)
-{
-       struct page *page = NULL;
-
-       if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
-               return ERR_PTR(-VM_FAULT_SIGBUS);
-
-       spin_lock(&hugetlb_lock);
-       if (free_huge_pages > resv_huge_pages)
-               page = dequeue_huge_page_vma(vma, addr);
-       spin_unlock(&hugetlb_lock);
        if (!page) {
                page = alloc_buddy_huge_page(vma, addr);
                if (!page) {
        if (!page) {
                page = alloc_buddy_huge_page(vma, addr);
                if (!page) {
-                       hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+                       hugetlb_put_quota(inode->i_mapping, chg);
                        return ERR_PTR(-VM_FAULT_OOM);
                }
        }
                        return ERR_PTR(-VM_FAULT_OOM);
                }
        }
-       return page;
-}
 
 
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
-                                   unsigned long addr)
-{
-       struct page *page;
-       struct address_space *mapping = vma->vm_file->f_mapping;
-
-       if (vma->vm_flags & VM_MAYSHARE)
-               page = alloc_huge_page_shared(vma, addr);
-       else
-               page = alloc_huge_page_private(vma, addr);
+       set_page_refcounted(page);
+       set_page_private(page, (unsigned long) mapping);
 
 
-       if (!IS_ERR(page)) {
-               set_page_refcounted(page);
-               set_page_private(page, (unsigned long) mapping);
-       }
        return page;
 }
 
        return page;
 }
 
@@ -757,6 +814,13 @@ out:
        return ret;
 }
 
        return ret;
 }
 
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+       unsigned long reserve = vma_resv_huge_pages(vma);
+       if (reserve)
+               hugetlb_acct_memory(-reserve);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -771,6 +835,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 struct vm_operations_struct hugetlb_vm_ops = {
        .fault = hugetlb_vm_op_fault,
 
 struct vm_operations_struct hugetlb_vm_ops = {
        .fault = hugetlb_vm_op_fault,
+       .close = hugetlb_vm_op_close,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -1289,11 +1354,25 @@ static long region_truncate(struct list_head *head, long end)
        return chg;
 }
 
        return chg;
 }
 
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+int hugetlb_reserve_pages(struct inode *inode,
+                                       long from, long to,
+                                       struct vm_area_struct *vma)
 {
        long ret, chg;
 
 {
        long ret, chg;
 
-       chg = region_chg(&inode->i_mapping->private_list, from, to);
+       /*
+        * Shared mappings base their reservation on the number of pages that
+        * are already allocated on behalf of the file. Private mappings need
+        * to reserve the full area even if read-only as mprotect() may be
+        * called to make the mapping read-write. Assume !vma is a shm mapping
+        */
+       if (!vma || vma->vm_flags & VM_SHARED)
+               chg = region_chg(&inode->i_mapping->private_list, from, to);
+       else {
+               chg = to - from;
+               set_vma_resv_huge_pages(vma, chg);
+       }
+
        if (chg < 0)
                return chg;
 
        if (chg < 0)
                return chg;
 
@@ -1304,7 +1383,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
                hugetlb_put_quota(inode->i_mapping, chg);
                return ret;
        }
                hugetlb_put_quota(inode->i_mapping, chg);
                return ret;
        }
-       region_add(&inode->i_mapping->private_list, from, to);
+       if (!vma || vma->vm_flags & VM_SHARED)
+               region_add(&inode->i_mapping->private_list, from, to);
        return 0;
 }
 
        return 0;
 }