mm: rid swapoff of quadratic complexity
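As a rough guide to the subject line before the diff itself: the old try_to_unuse() walked every mm's page tables once per in-use swap slot, while the new code walks each mm a single time, unuses every slot of the target device that it finds, and then drains whatever is still sitting in the swap cache. The standalone userspace sketch below models only that change in loop structure; every name and size in it (toy_mm, unuse_old, unuse_new, NR_MMS, NR_PTES) is invented for illustration and none of it is kernel code.

/*
 * Standalone userspace model, not kernel code: a "pte" is just an int,
 * and a non-zero value means a slot on the device being swapped off.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_MMS	64
#define NR_PTES	512

struct toy_mm { int pte[NR_PTES]; };

static long work;	/* ptes examined */

/* Old scheme: for every swap slot, walk every mm looking for that slot. */
static void unuse_old(struct toy_mm *mms, int nr_slots)
{
	for (int slot = 1; slot <= nr_slots; slot++)
		for (int m = 0; m < NR_MMS; m++)
			for (int p = 0; p < NR_PTES; p++) {
				work++;
				if (mms[m].pte[p] == slot)
					mms[m].pte[p] = 0;	/* "unuse" it */
			}
}

/* New scheme: walk each mm once and unuse every slot it references. */
static void unuse_new(struct toy_mm *mms)
{
	for (int m = 0; m < NR_MMS; m++)
		for (int p = 0; p < NR_PTES; p++) {
			work++;
			if (mms[m].pte[p])
				mms[m].pte[p] = 0;
		}
}

int main(void)
{
	int nr_slots = NR_MMS * NR_PTES / 4;
	struct toy_mm *a = calloc(NR_MMS, sizeof(*a));
	struct toy_mm *b = calloc(NR_MMS, sizeof(*b));

	if (!a || !b)
		return 1;
	/* swap out a quarter of the ptes in two identical sets of mms */
	for (int m = 0; m < NR_MMS; m++)
		for (int p = 0; p < NR_PTES; p += 4)
			a[m].pte[p] = b[m].pte[p] = 1 + rand() % nr_slots;

	work = 0;
	unuse_old(a, nr_slots);
	printf("old: %ld pte visits\n", work);	/* nr_slots * NR_MMS * NR_PTES */
	work = 0;
	unuse_new(b);
	printf("new: %ld pte visits\n", work);	/* NR_MMS * NR_PTES */
	free(a);
	free(b);
	return 0;
}

With the toy sizes the old strategy performs nr_slots * NR_MMS * NR_PTES pte visits and the new one NR_MMS * NR_PTES, which is the quadratic-to-linear reduction the patch title refers to.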
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8688ae65ef58ac639b0b2202039fa22577309350..6de46984d59d240e83987f95b0be37a5af8438cb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1799,44 +1799,77 @@ out_nolock:
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
-                               unsigned long addr, unsigned long end,
-                               swp_entry_t entry, struct page *page)
+                       unsigned long addr, unsigned long end,
+                       unsigned int type, bool frontswap,
+                       unsigned long *fs_pages_to_unuse)
 {
-       pte_t swp_pte = swp_entry_to_pte(entry);
+       struct page *page;
+       swp_entry_t entry;
        pte_t *pte;
+       struct swap_info_struct *si;
+       unsigned long offset;
        int ret = 0;
+       volatile unsigned char *swap_map;
 
-       /*
-        * We don't actually need pte lock while scanning for swp_pte: since
-        * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
-        * page table while we're scanning; though it could get zapped, and on
-        * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
-        * of unmatched parts which look like swp_pte, so unuse_pte must
-        * recheck under pte lock.  Scanning without pte lock lets it be
-        * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
-        */
+       si = swap_info[type];
        pte = pte_offset_map(pmd, addr);
        do {
-               /*
-                * swapoff spends a _lot_ of time in this loop!
-                * Test inline before going to call unuse_pte.
-                */
-               if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
-                       pte_unmap(pte);
-                       ret = unuse_pte(vma, pmd, addr, entry, page);
-                       if (ret)
-                               goto out;
-                       pte = pte_offset_map(pmd, addr);
+               struct vm_fault vmf;
+
+               if (!is_swap_pte(*pte))
+                       continue;
+
+               entry = pte_to_swp_entry(*pte);
+               if (swp_type(entry) != type)
+                       continue;
+
+               offset = swp_offset(entry);
+               if (frontswap && !frontswap_test(si, offset))
+                       continue;
+
+               pte_unmap(pte);
+               swap_map = &si->swap_map[offset];
+               vmf.vma = vma;
+               vmf.address = addr;
+               vmf.pmd = pmd;
+               page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+               if (!page) {
+                       if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+                               goto try_next;
+                       return -ENOMEM;
+               }
+
+               lock_page(page);
+               wait_on_page_writeback(page);
+               ret = unuse_pte(vma, pmd, addr, entry, page);
+               if (ret < 0) {
+                       unlock_page(page);
+                       put_page(page);
+                       goto out;
+               }
+
+               try_to_free_swap(page);
+               unlock_page(page);
+               put_page(page);
+
+               if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
+                       ret = FRONTSWAP_PAGES_UNUSED;
+                       goto out;
                }
+try_next:
+               pte = pte_offset_map(pmd, addr);
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap(pte - 1);
+
+       ret = 0;
 out:
        return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
-                               swp_entry_t entry, struct page *page)
+                               unsigned int type, bool frontswap,
+                               unsigned long *fs_pages_to_unuse)
 {
        pmd_t *pmd;
        unsigned long next;
@@ -1848,7 +1881,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        continue;
-               ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+               ret = unuse_pte_range(vma, pmd, addr, next, type,
+                                     frontswap, fs_pages_to_unuse);
                if (ret)
                        return ret;
        } while (pmd++, addr = next, addr != end);
@@ -1857,7 +1891,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 
 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
-                               swp_entry_t entry, struct page *page)
+                               unsigned int type, bool frontswap,
+                               unsigned long *fs_pages_to_unuse)
 {
        pud_t *pud;
        unsigned long next;
@@ -1868,7 +1903,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+               ret = unuse_pmd_range(vma, pud, addr, next, type,
+                                     frontswap, fs_pages_to_unuse);
                if (ret)
                        return ret;
        } while (pud++, addr = next, addr != end);
@@ -1877,7 +1913,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
 
 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
-                               swp_entry_t entry, struct page *page)
+                               unsigned int type, bool frontswap,
+                               unsigned long *fs_pages_to_unuse)
 {
        p4d_t *p4d;
        unsigned long next;
@@ -1888,78 +1925,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d))
                        continue;
-               ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
+               ret = unuse_pud_range(vma, p4d, addr, next, type,
+                                     frontswap, fs_pages_to_unuse);
                if (ret)
                        return ret;
        } while (p4d++, addr = next, addr != end);
        return 0;
 }
 
-static int unuse_vma(struct vm_area_struct *vma,
-                               swp_entry_t entry, struct page *page)
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
+                    bool frontswap, unsigned long *fs_pages_to_unuse)
 {
        pgd_t *pgd;
        unsigned long addr, end, next;
        int ret;
 
-       if (page_anon_vma(page)) {
-               addr = page_address_in_vma(page, vma);
-               if (addr == -EFAULT)
-                       return 0;
-               else
-                       end = addr + PAGE_SIZE;
-       } else {
-               addr = vma->vm_start;
-               end = vma->vm_end;
-       }
+       addr = vma->vm_start;
+       end = vma->vm_end;
 
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
+               ret = unuse_p4d_range(vma, pgd, addr, next, type,
+                                     frontswap, fs_pages_to_unuse);
                if (ret)
                        return ret;
        } while (pgd++, addr = next, addr != end);
        return 0;
 }
 
-static int unuse_mm(struct mm_struct *mm,
-                               swp_entry_t entry, struct page *page)
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
+                   bool frontswap, unsigned long *fs_pages_to_unuse)
 {
        struct vm_area_struct *vma;
        int ret = 0;
 
-       if (!down_read_trylock(&mm->mmap_sem)) {
-               /*
-                * Activate page so shrink_inactive_list is unlikely to unmap
-                * its ptes while lock is dropped, so swapoff can make progress.
-                */
-               activate_page(page);
-               unlock_page(page);
-               down_read(&mm->mmap_sem);
-               lock_page(page);
-       }
+       down_read(&mm->mmap_sem);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
-                       break;
+               if (vma->anon_vma) {
+                       ret = unuse_vma(vma, type, frontswap,
+                                       fs_pages_to_unuse);
+                       if (ret)
+                               break;
+               }
                cond_resched();
        }
        up_read(&mm->mmap_sem);
-       return (ret < 0)? ret: 0;
+       return ret;
 }
 
 /*
  * Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use.
- * Recycle to start on reaching the end, returning 0 when empty.
+ * from current position to next entry still in use. Return 0
+ * if there are no in-use entries after prev until the end of the map.
  */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                                        unsigned int prev, bool frontswap)
 {
-       unsigned int max = si->max;
-       unsigned int i = prev;
+       unsigned int i;
        unsigned char count;
 
        /*
@@ -1968,20 +1993,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
         * hits are okay, and sys_swapoff() has already prevented new
         * allocations from this area (while holding swap_lock).
         */
-       for (;;) {
-               if (++i >= max) {
-                       if (!prev) {
-                               i = 0;
-                               break;
-                       }
-                       /*
-                        * No entries in use at top of swap_map,
-                        * loop back to start and recheck there.
-                        */
-                       max = prev + 1;
-                       prev = 0;
-                       i = 1;
-               }
+       for (i = prev + 1; i < si->max; i++) {
                count = READ_ONCE(si->swap_map[i]);
                if (count && swap_count(count) != SWAP_MAP_BAD)
                        if (!frontswap || frontswap_test(si, i))
@@ -1989,239 +2001,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                if ((i % LATENCY_LIMIT) == 0)
                        cond_resched();
        }
+
+       if (i == si->max)
+               i = 0;
+
        return i;
 }
 
 /*
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it.  All the necessary
- * page table adjustments can then be made atomically.
- *
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
  * pages_to_unuse==0 means all pages; ignored if frontswap is false
  */
+#define SWAP_UNUSE_MAX_TRIES 3
 int try_to_unuse(unsigned int type, bool frontswap,
                 unsigned long pages_to_unuse)
 {
+       struct mm_struct *prev_mm;
+       struct mm_struct *mm;
+       struct list_head *p;
+       int retval = 0;
        struct swap_info_struct *si = swap_info[type];
-       struct mm_struct *start_mm;
-       volatile unsigned char *swap_map; /* swap_map is accessed without
-                                          * locking. Mark it as volatile
-                                          * to prevent compiler doing
-                                          * something odd.
-                                          */
-       unsigned char swcount;
        struct page *page;
        swp_entry_t entry;
-       unsigned int i = 0;
-       int retval = 0;
+       unsigned int i;
+       int retries = 0;
 
-       /*
-        * When searching mms for an entry, a good strategy is to
-        * start at the first mm we freed the previous entry from
-        * (though actually we don't notice whether we or coincidence
-        * freed the entry).  Initialize this start_mm with a hold.
-        *
-        * A simpler strategy would be to start at the last mm we
-        * freed the previous entry from; but that would take less
-        * advantage of mmlist ordering, which clusters forked mms
-        * together, child after parent.  If we race with dup_mmap(), we
-        * prefer to resolve parent before child, lest we miss entries
-        * duplicated after we scanned child: using last mm would invert
-        * that.
-        */
-       start_mm = &init_mm;
-       mmget(&init_mm);
+       if (!si->inuse_pages)
+               return 0;
 
-       /*
-        * Keep on scanning until all entries have gone.  Usually,
-        * one pass through swap_map is enough, but not necessarily:
-        * there are races when an instance of an entry might be missed.
-        */
-       while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
+       if (!frontswap)
+               pages_to_unuse = 0;
+
+retry:
+       retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+       if (retval)
+               goto out;
+
+       prev_mm = &init_mm;
+       mmget(prev_mm);
+
+       spin_lock(&mmlist_lock);
+       p = &init_mm.mmlist;
+       while ((p = p->next) != &init_mm.mmlist) {
                if (signal_pending(current)) {
                        retval = -EINTR;
                        break;
                }
 
-               /*
-                * Get a page for the entry, using the existing swap
-                * cache page if there is one.  Otherwise, get a clean
-                * page and read the swap into it.
-                */
-               swap_map = &si->swap_map[i];
-               entry = swp_entry(type, i);
-               page = read_swap_cache_async(entry,
-                                       GFP_HIGHUSER_MOVABLE, NULL, 0, false);
-               if (!page) {
-                       /*
-                        * Either swap_duplicate() failed because entry
-                        * has been freed independently, and will not be
-                        * reused since sys_swapoff() already disabled
-                        * allocation from here, or alloc_page() failed.
-                        */
-                       swcount = *swap_map;
-                       /*
-                        * We don't hold lock here, so the swap entry could be
-                        * SWAP_MAP_BAD (when the cluster is discarding).
-                        * Instead of fail out, We can just skip the swap
-                        * entry because swapoff will wait for discarding
-                        * finish anyway.
-                        */
-                       if (!swcount || swcount == SWAP_MAP_BAD)
-                               continue;
-                       retval = -ENOMEM;
-                       break;
-               }
+               mm = list_entry(p, struct mm_struct, mmlist);
+               if (!mmget_not_zero(mm))
+                       continue;
+               spin_unlock(&mmlist_lock);
+               mmput(prev_mm);
+               prev_mm = mm;
+               retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
 
-               /*
-                * Don't hold on to start_mm if it looks like exiting.
-                */
-               if (atomic_read(&start_mm->mm_users) == 1) {
-                       mmput(start_mm);
-                       start_mm = &init_mm;
-                       mmget(&init_mm);
+               if (retval) {
+                       mmput(prev_mm);
+                       goto out;
                }
 
                /*
-                * Wait for and lock page.  When do_swap_page races with
-                * try_to_unuse, do_swap_page can handle the fault much
-                * faster than try_to_unuse can locate the entry.  This
-                * apparently redundant "wait_on_page_locked" lets try_to_unuse
-                * defer to do_swap_page in such a case - in some tests,
-                * do_swap_page and try_to_unuse repeatedly compete.
-                */
-               wait_on_page_locked(page);
-               wait_on_page_writeback(page);
-               lock_page(page);
-               wait_on_page_writeback(page);
-
-               /*
-                * Remove all references to entry.
+                * Make sure that we aren't completely killing
+                * interactive performance.
                 */
-               swcount = *swap_map;
-               if (swap_count(swcount) == SWAP_MAP_SHMEM) {
-                       retval = shmem_unuse(entry, page);
-                       /* page has already been unlocked and released */
-                       if (retval < 0)
-                               break;
-                       continue;
-               }
-               if (swap_count(swcount) && start_mm != &init_mm)
-                       retval = unuse_mm(start_mm, entry, page);
-
-               if (swap_count(*swap_map)) {
-                       int set_start_mm = (*swap_map >= swcount);
-                       struct list_head *p = &start_mm->mmlist;
-                       struct mm_struct *new_start_mm = start_mm;
-                       struct mm_struct *prev_mm = start_mm;
-                       struct mm_struct *mm;
-
-                       mmget(new_start_mm);
-                       mmget(prev_mm);
-                       spin_lock(&mmlist_lock);
-                       while (swap_count(*swap_map) && !retval &&
-                                       (p = p->next) != &start_mm->mmlist) {
-                               mm = list_entry(p, struct mm_struct, mmlist);
-                               if (!mmget_not_zero(mm))
-                                       continue;
-                               spin_unlock(&mmlist_lock);
-                               mmput(prev_mm);
-                               prev_mm = mm;
+               cond_resched();
+               spin_lock(&mmlist_lock);
+       }
+       spin_unlock(&mmlist_lock);
 
-                               cond_resched();
+       mmput(prev_mm);
 
-                               swcount = *swap_map;
-                               if (!swap_count(swcount)) /* any usage ? */
-                                       ;
-                               else if (mm == &init_mm)
-                                       set_start_mm = 1;
-                               else
-                                       retval = unuse_mm(mm, entry, page);
-
-                               if (set_start_mm && *swap_map < swcount) {
-                                       mmput(new_start_mm);
-                                       mmget(mm);
-                                       new_start_mm = mm;
-                                       set_start_mm = 0;
-                               }
-                               spin_lock(&mmlist_lock);
-                       }
-                       spin_unlock(&mmlist_lock);
-                       mmput(prev_mm);
-                       mmput(start_mm);
-                       start_mm = new_start_mm;
-               }
-               if (retval) {
-                       unlock_page(page);
-                       put_page(page);
-                       break;
-               }
+       i = 0;
+       while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
 
-               /*
-                * If a reference remains (rare), we would like to leave
-                * the page in the swap cache; but try_to_unmap could
-                * then re-duplicate the entry once we drop page lock,
-                * so we might loop indefinitely; also, that page could
-                * not be swapped out to other storage meanwhile.  So:
-                * delete from cache even if there's another reference,
-                * after ensuring that the data has been saved to disk -
-                * since if the reference remains (rarer), it will be
-                * read from disk into another page.  Splitting into two
-                * pages would be incorrect if swap supported "shared
-                * private" pages, but they are handled by tmpfs files.
-                *
-                * Given how unuse_vma() targets one particular offset
-                * in an anon_vma, once the anon_vma has been determined,
-                * this splitting happens to be just what is needed to
-                * handle where KSM pages have been swapped out: re-reading
-                * is unnecessarily slow, but we can fix that later on.
-                */
-               if (swap_count(*swap_map) &&
-                    PageDirty(page) && PageSwapCache(page)) {
-                       struct writeback_control wbc = {
-                               .sync_mode = WB_SYNC_NONE,
-                       };
-
-                       swap_writepage(compound_head(page), &wbc);
-                       lock_page(page);
-                       wait_on_page_writeback(page);
-               }
+               entry = swp_entry(type, i);
+               page = find_get_page(swap_address_space(entry), i);
+               if (!page)
+                       continue;
 
                /*
                 * It is conceivable that a racing task removed this page from
-                * swap cache just before we acquired the page lock at the top,
-                * or while we dropped it in unuse_mm().  The page might even
-                * be back in swap cache on another swap area: that we must not
-                * delete, since it may not have been written out to swap yet.
-                */
-               if (PageSwapCache(page) &&
-                   likely(page_private(page) == entry.val) &&
-                   !page_swapped(page))
-                       delete_from_swap_cache(compound_head(page));
-
-               /*
-                * So we could skip searching mms once swap count went
-                * to 1, we did not mark any present ptes as dirty: must
-                * mark page dirty so shrink_page_list will preserve it.
+                * swap cache just before we acquired the page lock. The page
+                * might even be back in swap cache on another swap area. But
+                * that is okay: try_to_free_swap() only removes stale pages.
                 */
-               SetPageDirty(page);
+               lock_page(page);
+               wait_on_page_writeback(page);
+               try_to_free_swap(page);
                unlock_page(page);
                put_page(page);
 
                /*
-                * Make sure that we aren't completely killing
-                * interactive performance.
+                * For frontswap, we just need to unuse pages_to_unuse, if it
+                * was specified. There is no need to check frontswap again
+                * here: pages_to_unuse was already zeroed if not frontswap.
                 */
-               cond_resched();
-               if (frontswap && pages_to_unuse > 0) {
-                       if (!--pages_to_unuse)
-                               break;
-               }
+               if (pages_to_unuse && --pages_to_unuse == 0)
+                       goto out;
        }
 
-       mmput(start_mm);
-       return retval;
+       /*
+        * Let's check again to see if there are still swap entries in the map.
+        * If yes, we would need to retry the unuse logic again.
+        * Under global memory pressure, swap entries can be reinserted back
+        * into process space after the mmlist loop above passes over them.
+        * It's not worth continuously retrying to unuse the swap in this case.
+        * So we try SWAP_UNUSE_MAX_TRIES times.
+        */
+       if (++retries >= SWAP_UNUSE_MAX_TRIES)
+               retval = -EBUSY;
+       else if (si->inuse_pages)
+               goto retry;
+
+out:
+       return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
 }
 
 /*
@@ -2812,8 +2706,9 @@ static struct swap_info_struct *alloc_swap_info(void)
        struct swap_info_struct *p;
        unsigned int type;
        int i;
+       int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
 
-       p = kvzalloc(sizeof(*p), GFP_KERNEL);
+       p = kvzalloc(size, GFP_KERNEL);
        if (!p)
                return ERR_PTR(-ENOMEM);
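
A side note on the final hunk: alloc_swap_info() now sizes its allocation as sizeof(*p) plus nr_node_ids plist_nodes, presumably because struct swap_info_struct keeps a per-node array as its trailing member. The userspace sketch below shows just that trailing-array sizing pattern; node_entry, dev_info and alloc_dev_info are invented stand-ins, not the kernel types.

/* Standalone illustration of sizing an allocation for a flexible array member. */
#include <stdlib.h>
#include <string.h>

struct node_entry { int prio; };	/* stands in for struct plist_node */

struct dev_info {
	int flags;
	/* ...other fields... */
	struct node_entry avail[];	/* one entry per node, sized at alloc time */
};

static struct dev_info *alloc_dev_info(int nr_nodes)
{
	/* mirrors: kvzalloc(sizeof(*p) + nr_node_ids * sizeof(struct plist_node), ...) */
	size_t size = sizeof(struct dev_info) + nr_nodes * sizeof(struct node_entry);
	struct dev_info *p = malloc(size);

	if (p)
		memset(p, 0, size);	/* kvzalloc() zeroes the allocation too */
	return p;
}

int main(void)
{
	struct dev_info *p = alloc_dev_info(4);

	if (p)
		p->avail[3].prio = -1;	/* last per-node entry lies inside the allocation */
	free(p);
	return 0;
}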