Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 12 Aug 2010 17:15:10 +0000 (10:15 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 12 Aug 2010 17:15:10 +0000 (10:15 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Aug 2010 17:15:10 +0000 (10:15 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 12 Aug 2010 17:15:10 +0000 (10:15 -0700)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h

index 78b4bc64c0064f19bccce02de2862813d56ff7e0..f479700df61b186118d607a0db86b1e47ffa8294 100644 (file)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -2,6 +2,7 @@
  #define _LINUX_HUGETLB_H
  
  #include <linux/fs.h>
+#include <linux/hugetlb_inline.h>
  
  struct ctl_table;
  struct user_struct;
@@ -14,11 +15,6 @@ struct user_struct;
  
  int PageHuge(struct page *page);
  
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
-       return vma->vm_flags & VM_HUGETLB;
-}
-
  void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
  int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
  int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -47,6 +43,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                 struct vm_area_struct *vma,
                                                 int acctflags);
  void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+void __isolate_hwpoisoned_huge_page(struct page *page);
  
  extern unsigned long hugepages_treat_as_movable;
  extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -77,11 +74,6 @@ static inline int PageHuge(struct page *page)
         return 0;
  }
  
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
-{
-       return 0;
-}
-
  static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
  {
  }
@@ -108,6 +100,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
  #define is_hugepage_only_range(mm, addr, len)  0
  #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
  #define hugetlb_fault(mm, vma, addr, flags)    ({ BUG(); 0; })
+#define huge_pte_offset(mm, address)   0
+#define __isolate_hwpoisoned_huge_page(page)   do { } while (0) /* void stub */
  
  #define hugetlb_change_protection(vma, address, end, newprot)
  
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h

new file mode 100644 (file)

index 0000000..6931489
--- /dev/null
+++ b/include/linux/hugetlb_inline.h
@@ -0,0 +1,22 @@
+#ifndef _LINUX_HUGETLB_INLINE_H
+#define _LINUX_HUGETLB_INLINE_H
+
+#ifdef CONFIG_HUGETLB_PAGE
+
+#include <linux/mm.h>
+
+/* Return 1 if @vma has VM_HUGETLB set in its vm_flags, otherwise 0. */
+static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+       return !!(vma->vm_flags & VM_HUGETLB);
+}
+
+#else
+/* Stub used when CONFIG_HUGETLB_PAGE is not set: always returns 0. */
+static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+       return 0;
+}
+
+#endif /* CONFIG_HUGETLB_PAGE */
+#endif /* _LINUX_HUGETLB_INLINE_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h

index 78a702ce4fcb0e194c6f8483d4f79631a4d932ad..e12cdc6d79ee79e80ddbcf6709e97a403a4746f7 100644 (file)
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -13,6 +13,7 @@
  #include <linux/gfp.h>
  #include <linux/bitops.h>
  #include <linux/hardirq.h> /* for in_interrupt() */
+#include <linux/hugetlb_inline.h>
  
  /*
   * Bits in mapping->flags.  The lower __GFP_BITS_SHIFT bits are the page
@@ -281,10 +282,16 @@ static inline loff_t page_offset(struct page *page)
         return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
  }
  
+extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+                                    unsigned long address);
+
  static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
                                         unsigned long address)
  {
-       pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
+       pgoff_t pgoff;
+       if (unlikely(is_vm_hugetlb_page(vma)))
+               return linear_hugepage_index(vma, address);
+       pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
         pgoff += vma->vm_pgoff;
         return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  }
diff --git a/include/linux/poison.h b/include/linux/poison.h

index 34066ffd893d733b8134da1b57915bec314afb48..2110a81c5e2afaab47ec5cb107cf17503d731317 100644 (file)
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -48,15 +48,6 @@
  #define POISON_FREE    0x6b    /* for use-after-free poisoning */
  #define        POISON_END      0xa5    /* end-byte of poisoning */
  
-/********** mm/hugetlb.c **********/
-/*
- * Private mappings of hugetlb pages use this poisoned value for
- * page->mapping. The core VM should not be doing anything with this mapping
- * but futex requires the existence of some page->mapping value even though it
- * is unused if PAGE_MAPPING_ANON is set.
- */
-#define HUGETLB_POISON ((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON))
-
  /********** arch/$ARCH/mm/init.c **********/
  #define POISON_FREE_INITMEM    0xcc
  
diff --git a/include/linux/rmap.h b/include/linux/rmap.h

index d6661de56f303d34421d1662921cdf39d0cded5d..31b2fd75dcbae59d3ed633253c1f5cd3850c201a 100644 (file)
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -168,6 +168,11 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned lon
  void page_add_file_rmap(struct page *);
  void page_remove_rmap(struct page *);
  
+void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
+                           unsigned long);
+void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+                               unsigned long);
+
  static inline void page_dup_rmap(struct page *page)
  {
         atomic_inc(&page->_mapcount);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index b61d2db9f34e6cfeff5f2dbe040bbf84a8c30ee4..cc5be788a39fe132c72cbc1d2fb1c03f71708575 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
  #include <linux/bootmem.h>
  #include <linux/sysfs.h>
  #include <linux/slab.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
  
  #include <asm/page.h>
  #include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
                         (vma->vm_pgoff >> huge_page_order(h));
  }
  
+/* Hugepage-cache index of @address in hugetlb @vma (see linear_page_index). */
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma, unsigned long address)
+{
+       return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
  /*
   * Return the size of the pages allocated when backing a VMA. In the majority
   * cases this will be same size as used by the page table entries.
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page)
         set_page_private(page, 0);
         page->mapping = NULL;
         BUG_ON(page_count(page));
+       BUG_ON(page_mapcount(page));
         INIT_LIST_HEAD(&page->lru);
  
         spin_lock(&hugetlb_lock);
@@ -605,6 +615,8 @@ int PageHuge(struct page *page)
         return dtor == free_huge_page;
  }
  
+EXPORT_SYMBOL_GPL(PageHuge);
+
  static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  {
         struct page *page;
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                         entry = huge_ptep_get(src_pte);
                         ptepage = pte_page(entry);
                         get_page(ptepage);
+                       page_dup_rmap(ptepage);
                         set_huge_pte_at(dst, addr, dst_pte, entry);
                 }
                 spin_unlock(&src->page_table_lock);
@@ -2140,6 +2153,19 @@ nomem:
         return -ENOMEM;
  }
  
+/* Return 1 if @pte holds a hwpoison (non-swap) swap entry, otherwise 0. */
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+       swp_entry_t entry;
+
+       /* Empty and present entries are never reported as hwpoisoned. */
+       if (huge_pte_none(pte) || pte_present(pte))
+               return 0;
+
+       entry = pte_to_swp_entry(pte);
+       return (non_swap_entry(entry) && is_hwpoison_entry(entry)) ? 1 : 0;
+}
+
  void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                             unsigned long end, struct page *ref_page)
  {
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                 if (huge_pte_none(pte))
                         continue;
  
+               /*
+                * HWPoisoned hugepage is already unmapped and dropped reference
+                */
+               if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+                       continue;
+
                 page = pte_page(pte);
                 if (pte_dirty(pte))
                         set_page_dirty(page);
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
         flush_tlb_range(vma, start, end);
         mmu_notifier_invalidate_range_end(mm, start, end);
         list_for_each_entry_safe(page, tmp, &page_list, lru) {
+               page_remove_rmap(page);
                 list_del(&page->lru);
                 put_page(page);
         }
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
         return 1;
  }
  
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
  static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long address, pte_t *ptep, pte_t pte,
                         struct page *pagecache_page)
@@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
  retry_avoidcopy:
         /* If no-one else is actually using this page, avoid the copy
          * and just make the page writable */
-       avoidcopy = (page_count(old_page) == 1);
+       avoidcopy = (page_mapcount(old_page) == 1);
         if (avoidcopy) {
+               if (!trylock_page(old_page)) {
+                       if (PageAnon(old_page))
+                               page_move_anon_rmap(old_page, vma, address);
+               } else
+                       unlock_page(old_page);
                 set_huge_ptep_writable(vma, address, ptep);
                 return 0;
         }
@@ -2338,6 +2379,13 @@ retry_avoidcopy:
                 return -PTR_ERR(new_page);
         }
  
+       /*
+        * When the original hugepage is a shared one, it does not have
+        * an anon_vma prepared.
+        */
+       if (unlikely(anon_vma_prepare(vma)))
+               return VM_FAULT_OOM;
+
         copy_huge_page(new_page, old_page, address, vma);
         __SetPageUptodate(new_page);
  
@@ -2355,6 +2403,8 @@ retry_avoidcopy:
                 huge_ptep_clear_flush(vma, address, ptep);
                 set_huge_pte_at(mm, address, ptep,
                                 make_huge_pte(vma, new_page, 1));
+               page_remove_rmap(old_page);
+               hugepage_add_anon_rmap(new_page, vma, address);
                 /* Make the old page be freed below */
                 new_page = old_page;
                 mmu_notifier_invalidate_range_end(mm,
@@ -2458,10 +2508,29 @@ retry:
                         spin_lock(&inode->i_lock);
                         inode->i_blocks += blocks_per_huge_page(h);
                         spin_unlock(&inode->i_lock);
+                       page_dup_rmap(page);
                 } else {
                         lock_page(page);
-                       page->mapping = HUGETLB_POISON;
+                       if (unlikely(anon_vma_prepare(vma))) {
+                               ret = VM_FAULT_OOM;
+                               goto backout_unlocked;
+                       }
+                       hugepage_add_new_anon_rmap(page, vma, address);
                 }
+       } else {
+               page_dup_rmap(page);
+       }
+
+       /*
+        * The memory error handler replaces the pte with a hwpoison swap
+        * entry at error-handling time, so a process that has reserved the
+        * error hugepage but has not mapped it has no hwpoison swap entry.
+        * We therefore need to block accesses from such a process by
+        * checking the PG_hwpoison bit here.
+        */
+       if (unlikely(PageHWPoison(page))) {
+               ret = VM_FAULT_HWPOISON;
+               goto backout_unlocked;
         }
  
         /*
@@ -2513,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         pte_t *ptep;
         pte_t entry;
         int ret;
+       struct page *page = NULL;
         struct page *pagecache_page = NULL;
         static DEFINE_MUTEX(hugetlb_instantiation_mutex);
         struct hstate *h = hstate_vma(vma);
  
+       ptep = huge_pte_offset(mm, address);
+       if (ptep) {
+               entry = huge_ptep_get(ptep);
+               if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+                       return VM_FAULT_HWPOISON;
+       }
+
         ptep = huge_pte_alloc(mm, address, huge_page_size(h));
         if (!ptep)
                 return VM_FAULT_OOM;
@@ -2554,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                                                 vma, address);
         }
  
+       if (!pagecache_page) {
+               page = pte_page(entry);
+               lock_page(page);
+       }
+
         spin_lock(&mm->page_table_lock);
         /* Check for a racing update before calling hugetlb_cow */
         if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2579,6 +2661,8 @@ out_page_table_lock:
         if (pagecache_page) {
                 unlock_page(pagecache_page);
                 put_page(pagecache_page);
+       } else {
+               unlock_page(page);
         }
  
  out_mutex:
@@ -2791,3 +2875,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         hugetlb_put_quota(inode->i_mapping, (chg - freed));
         hugetlb_acct_memory(h, -(chg - freed));
  }
+
+/*
+ * Unlink @hpage from its lru list and decrement the hstate's free_huge_pages
+ * counters (total and per-node) under hugetlb_lock.  Called from memory
+ * failure code; the caller is assumed to hold the head page's page lock.
+ */
+void __isolate_hwpoisoned_huge_page(struct page *hpage)
+{
+       struct hstate *hs = page_hstate(hpage);
+
+       spin_lock(&hugetlb_lock);
+       list_del(&hpage->lru);
+       hs->free_huge_pages -= 1;
+       hs->free_huge_pages_node[page_to_nid(hpage)] -= 1;
+       spin_unlock(&hugetlb_lock);
+}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c

index 10ea71905c1fbbb8a9582737e05f78fefa86012e..0948f1072d6b150f145a159a89ae1aab364d9f0c 100644 (file)
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
  #include <linux/mm.h>
  #include <linux/swap.h>
  #include <linux/pagemap.h>
+#include <linux/hugetlb.h>
  #include "internal.h"
  
  static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
  {
         unsigned long pfn = val;
         struct page *p;
+       struct page *hpage;
         int err;
  
         if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
                 return -ENXIO;
  
         p = pfn_to_page(pfn);
+       hpage = compound_head(p);
         /*
          * This implies unable to support free buddy pages.
          */
-       if (!get_page_unless_zero(p))
+       if (!get_page_unless_zero(hpage))
                 return 0;
  
-       if (!PageLRU(p))
+       if (!PageLRU(p) && !PageHuge(p))
                 shake_page(p, 0);
         /*
          * This implies unable to support non-LRU pages.
          */
-       if (!PageLRU(p))
+       if (!PageLRU(p) && !PageHuge(p))
                 return 0;
  
         /*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
          * We temporarily take page lock for try_get_mem_cgroup_from_page().
          * __memory_failure() will redo the check reliably inside page lock.
          */
-       lock_page(p);
-       err = hwpoison_filter(p);
-       unlock_page(p);
+       lock_page(hpage);
+       err = hwpoison_filter(hpage);
+       unlock_page(hpage);
         if (err)
                 return 0;
  
diff --git a/mm/memory-failure.c b/mm/memory-failure.c

index 6b44e52cacaa19857de7ba08af0469492dbeb2db..9c26eeca13425886690cddaf6dd45954fd3f0097 100644 (file)
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
  #include <linux/suspend.h>
  #include <linux/slab.h>
  #include <linux/swapops.h>
+#include <linux/hugetlb.h>
  #include "internal.h"
  
  int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  /*
   * Huge pages. Needs work.
   * Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ *   To narrow down kill region to one page, we need to break up pmd.
+ * - To support soft-offlining for hugepage, we need to support hugepage
+ *   migration.
   */
  static int me_huge_page(struct page *p, unsigned long pfn)
  {
-       return FAILED;
+       struct page *hpage = compound_head(p);
+       /*
+        * We can safely recover from error on free or reserved (i.e.
+        * not in-use) hugepage by dequeuing it from freelist.
+        * To check whether a hugepage is in-use or not, we can't use
+        * page->lru because it can be used in other hugepage operations,
+        * such as __unmap_hugepage_range() and gather_surplus_pages().
+        * So instead we use page_mapping() and PageAnon().
+        * We assume that this function is called with page lock held,
+        * so there is no race between isolation and mapping/unmapping.
+        */
+       if (!(page_mapping(hpage) || PageAnon(hpage))) {
+               __isolate_hwpoisoned_huge_page(hpage);
+               return RECOVERED;
+       }
+       return DELAYED;
  }
  
  /*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         int ret;
         int i;
         int kill = 1;
+       struct page *hpage = compound_head(p);
  
         if (PageReserved(p) || PageSlab(p))
                 return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * This check implies we don't kill processes if their pages
          * are in the swap cache early. Those are always late kills.
          */
-       if (!page_mapped(p))
+       if (!page_mapped(hpage))
                 return SWAP_SUCCESS;
  
-       if (PageCompound(p) || PageKsm(p))
+       if (PageKsm(p))
                 return SWAP_FAIL;
  
         if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * XXX: the dirty test could be racy: set_page_dirty() may not always
          * be called inside page lock (it's recommended but not enforced).
          */
-       mapping = page_mapping(p);
-       if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
-               if (page_mkclean(p)) {
-                       SetPageDirty(p);
+       mapping = page_mapping(hpage);
+       if (!PageDirty(hpage) && mapping &&
+           mapping_cap_writeback_dirty(mapping)) {
+               if (page_mkclean(hpage)) {
+                       SetPageDirty(hpage);
                 } else {
                         kill = 0;
                         ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * there's nothing that can be done.
          */
         if (kill)
-               collect_procs(p, &tokill);
+               collect_procs(hpage, &tokill);
  
         /*
          * try_to_unmap can fail temporarily due to races.
          * Try a few times (RED-PEN better strategy?)
          */
         for (i = 0; i < N_UNMAP_TRIES; i++) {
-               ret = try_to_unmap(p, ttu);
+               ret = try_to_unmap(hpage, ttu);
                 if (ret == SWAP_SUCCESS)
                         break;
                 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
  
         if (ret != SWAP_SUCCESS)
                 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-                               pfn, page_mapcount(p));
+                               pfn, page_mapcount(hpage));
  
         /*
          * Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * use a more force-full uncatchable kill to prevent
          * any accesses to the poisoned memory.
          */
-       kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+       kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
                       ret != SWAP_SUCCESS, pfn);
  
         return ret;
  }
  
+/* Set PG_hwpoison on the head and all tail pages of hugepage @hpage. */
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+       struct page *pg, *end = hpage + (1 << compound_order(hpage));
+       for (pg = hpage; pg < end; pg++)
+               SetPageHWPoison(pg);
+}
+
+/* Clear PG_hwpoison on the head and all tail pages of hugepage @hpage. */
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+       struct page *pg, *end = hpage + (1 << compound_order(hpage));
+       for (pg = hpage; pg < end; pg++)
+               ClearPageHWPoison(pg);
+}
+
  int __memory_failure(unsigned long pfn, int trapno, int flags)
  {
         struct page_state *ps;
         struct page *p;
+       struct page *hpage;
         int res;
+       unsigned int nr_pages;
  
         if (!sysctl_memory_failure_recovery)
                 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         }
  
         p = pfn_to_page(pfn);
+       hpage = compound_head(p);
         if (TestSetPageHWPoison(p)) {
                 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
                 return 0;
         }
  
-       atomic_long_add(1, &mce_bad_pages);
+       nr_pages = 1 << compound_order(hpage);
+       atomic_long_add(nr_pages, &mce_bad_pages);
  
         /*
          * We need/can do nothing about count=0 pages.
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
          * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
          */
         if (!(flags & MF_COUNT_INCREASED) &&
-               !get_page_unless_zero(compound_head(p))) {
+               !get_page_unless_zero(hpage)) {
                 if (is_free_buddy_page(p)) {
                         action_result(pfn, "free buddy", DELAYED);
                         return 0;
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
          * The check (unnecessarily) ignores LRU pages being isolated and
          * walked by the page reclaim code, however that's not a big loss.
          */
-       if (!PageLRU(p))
+       if (!PageLRU(p) && !PageHuge(p))
                 shake_page(p, 0);
-       if (!PageLRU(p)) {
+       if (!PageLRU(p) && !PageHuge(p)) {
                 /*
                  * shake_page could have turned it free.
                  */
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
          * It's very difficult to mess with pages currently under IO
          * and in many cases impossible, so we just avoid it here.
          */
-       lock_page_nosync(p);
+       lock_page_nosync(hpage);
  
         /*
          * unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         }
         if (hwpoison_filter(p)) {
                 if (TestClearPageHWPoison(p))
-                       atomic_long_dec(&mce_bad_pages);
-               unlock_page(p);
-               put_page(p);
+                       atomic_long_sub(nr_pages, &mce_bad_pages);
+               unlock_page(hpage);
+               put_page(hpage);
+               return 0;
+       }
+
+       /*
+        * For error on the tail page, we should set PG_hwpoison
+        * on the head page to show that the hugepage is hwpoisoned
+        */
+       if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+               action_result(pfn, "hugepage already hardware poisoned",
+                               IGNORED);
+               unlock_page(hpage);
+               put_page(hpage);
                 return 0;
         }
+       /*
+        * Set PG_hwpoison on all pages in an error hugepage,
+        * because containment is done in hugepage unit for now.
+        * Since we have done TestSetPageHWPoison() for the head page with
+        * page lock held, we can safely set PG_hwpoison bits on tail pages.
+        */
+       if (PageHuge(p))
+               set_page_hwpoison_huge_page(hpage);
  
         wait_on_page_writeback(p);
  
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
                 }
         }
  out:
-       unlock_page(p);
+       unlock_page(hpage);
         return res;
  }
  EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
         struct page *page;
         struct page *p;
         int freeit = 0;
+       unsigned int nr_pages;
  
         if (!pfn_valid(pfn))
                 return -ENXIO;
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
                 return 0;
         }
  
+       nr_pages = 1 << compound_order(page);
+
         if (!get_page_unless_zero(page)) {
                 if (TestClearPageHWPoison(p))
-                       atomic_long_dec(&mce_bad_pages);
+                       atomic_long_sub(nr_pages, &mce_bad_pages);
                 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
                 return 0;
         }
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
          * the PG_hwpoison page will be caught and isolated on the entrance to
          * the free buddy page pool.
          */
-       if (TestClearPageHWPoison(p)) {
+       if (TestClearPageHWPoison(page)) {
                 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
-               atomic_long_dec(&mce_bad_pages);
+               atomic_long_sub(nr_pages, &mce_bad_pages);
                 freeit = 1;
         }
+       if (PageHuge(p))
+               clear_page_hwpoison_huge_page(page);
         unlock_page(page);
  
         put_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c

index a7d0f5482634d00724d91ebcf42cba4073cab20e..87b9e8ad450962afa1159b763f0aa0a977ab9a88 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
  #include <linux/memcontrol.h>
  #include <linux/mmu_notifier.h>
  #include <linux/migrate.h>
+#include <linux/hugetlb.h>
  
  #include <asm/tlbflush.h>
  
@@ -350,6 +351,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
         pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
         unsigned long address;
  
+       if (unlikely(is_vm_hugetlb_page(vma)))
+               pgoff = page->index << huge_page_order(page_hstate(page));
         address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
         if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
                 /* page should be within @vma mapping range */
@@ -394,6 +397,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
         pte_t *pte;
         spinlock_t *ptl;
  
+       if (unlikely(PageHuge(page))) {
+               pte = huge_pte_offset(mm, address);
+               ptl = &mm->page_table_lock;
+               goto check;
+       }
+
         pgd = pgd_offset(mm, address);
         if (!pgd_present(*pgd))
                 return NULL;
@@ -414,6 +423,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
         }
  
         ptl = pte_lockptr(mm, pmd);
+check:
         spin_lock(ptl);
         if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
                 *ptlp = ptl;
@@ -916,6 +926,12 @@ void page_remove_rmap(struct page *page)
                 page_clear_dirty(page);
                 set_page_dirty(page);
         }
+       /*
+        * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+        * and not charged by memcg for now.
+        */
+       if (unlikely(PageHuge(page)))
+               return;
         if (PageAnon(page)) {
                 mem_cgroup_uncharge_page(page);
                 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1524,3 +1540,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
                 return rmap_walk_file(page, rmap_one, arg);
  }
  #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+/* Set @page's anon mapping (anon_vma + PAGE_MAPPING_ANON) and its index. */
+static void __hugepage_set_anon_rmap(struct page *page,
+       struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+       struct anon_vma *av = vma->anon_vma;
+
+       BUG_ON(!av);
+       /* Non-exclusive: use the anon_vma of the last anon_vma_chain entry. */
+       if (!exclusive)
+               av = list_entry(vma->anon_vma_chain.prev,
+                               struct anon_vma_chain, same_vma)->anon_vma;
+       page->mapping = (struct address_space *)
+                       ((void *) av + PAGE_MAPPING_ANON);
+       page->index = linear_page_index(vma, address);
+}
+
+/* Account one more anonymous mapping of hugepage @page at @address in @vma. */
+void hugepage_add_anon_rmap(struct page *page,
+                           struct vm_area_struct *vma, unsigned long address)
+{
+       BUG_ON(!vma->anon_vma);
+       BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
+       /* First mapping (_mapcount reaches zero): set up mapping and index. */
+       if (atomic_inc_and_test(&page->_mapcount))
+               __hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+                       struct vm_area_struct *vma, unsigned long address)
+{
+       BUG_ON(!(vma->vm_start <= address && address < vma->vm_end));
+       atomic_set(&page->_mapcount, 0);        /* fresh page: first mapping */
+       __hugepage_set_anon_rmap(page, vma, address, 1);        /* exclusive */
+}
+#endif /* CONFIG_HUGETLB_PAGE */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 12 Aug 2010 17:15:10 +0000 (10:15 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 12 Aug 2010 17:15:10 +0000 (10:15 -0700)
include/linux/hugetlb.h		patch \| blob \| history
include/linux/hugetlb_inline.h	[new file with mode: 0644]	patch \| blob
include/linux/pagemap.h		patch \| blob \| history
include/linux/poison.h		patch \| blob \| history
include/linux/rmap.h		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/hwpoison-inject.c		patch \| blob \| history
mm/memory-failure.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history