mm, swap: don't use VMA based swap readahead if HDD is used as swap
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6ba4aab2db0b570a241abb8935f901833c65b862..4f8b3e08a5476e867366f1bf91f74751bb7fc8e7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
 /* Activity counter to indicate that a swapon or swapoff has occurred */
 static atomic_t proc_poll_event = ATOMIC_INIT(0);
 
+atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+
 static inline unsigned char swap_count(unsigned char ent)
 {
        return ent & ~SWAP_HAS_CACHE;   /* may include SWAP_HAS_CONT flag */
@@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
        info->data = 0;
 }
 
+static inline bool cluster_is_huge(struct swap_cluster_info *info)
+{
+       return info->flags & CLUSTER_FLAG_HUGE;
+}
+
+static inline void cluster_clear_huge(struct swap_cluster_info *info)
+{
+       info->flags &= ~CLUSTER_FLAG_HUGE;
+}
+
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
                                                     unsigned long offset)
 {
@@ -846,7 +858,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
        offset = idx * SWAPFILE_CLUSTER;
        ci = lock_cluster(si, offset);
        alloc_cluster(si, idx);
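+       /* Mark the whole cluster as backing a single huge (THP) swap entry */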
-       cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+       cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
 
        map = si->swap_map + offset;
        for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -938,9 +950,10 @@ start_over:
                        spin_unlock(&si->lock);
                        goto nextsi;
                }
-               if (cluster)
-                       n_ret = swap_alloc_cluster(si, swp_entries);
-               else
+               if (cluster) {
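+                       /* Huge cluster allocation is not supported for SWP_FILE swap devices */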
+                       if (!(si->flags & SWP_FILE))
+                               n_ret = swap_alloc_cluster(si, swp_entries);
+               } else
                        n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
                                                    n_goal, swp_entries);
                spin_unlock(&si->lock);
@@ -1168,22 +1181,57 @@ static void swapcache_free_cluster(swp_entry_t entry)
        struct swap_cluster_info *ci;
        struct swap_info_struct *si;
        unsigned char *map;
-       unsigned int i;
+       unsigned int i, free_entries = 0;
+       unsigned char val;
 
-       si = swap_info_get(entry);
+       si = _swap_info_get(entry);
        if (!si)
                return;
 
        ci = lock_cluster(si, offset);
+       VM_BUG_ON(!cluster_is_huge(ci));
        map = si->swap_map + offset;
        for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-               VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
-               map[i] = 0;
+               val = map[i];
+               VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+               if (val == SWAP_HAS_CACHE)
+                       free_entries++;
        }
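+       /*
+        * No entry has only the cache reference left: every slot still
+        * carries a swap count, so just drop the SWAP_HAS_CACHE bit and
+        * leave the entries allocated.
+        */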
+       if (!free_entries) {
+               for (i = 0; i < SWAPFILE_CLUSTER; i++)
+                       map[i] &= ~SWAP_HAS_CACHE;
+       }
+       cluster_clear_huge(ci);
        unlock_cluster(ci);
-       mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-       swap_free_cluster(si, idx);
-       spin_unlock(&si->lock);
+       if (free_entries == SWAPFILE_CLUSTER) {
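+               /*
+                * All entries are held only by the swap cache: free the
+                * whole huge cluster at once.
+                */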
+               spin_lock(&si->lock);
+               ci = lock_cluster(si, offset);
+               memset(map, 0, SWAPFILE_CLUSTER);
+               unlock_cluster(ci);
+               mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+               swap_free_cluster(si, idx);
+               spin_unlock(&si->lock);
+       } else if (free_entries) {
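+               /*
+                * Only some entries are held solely by the swap cache: drop
+                * the cache reference entry by entry and free those whose
+                * count reaches zero.
+                */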
+               for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+                       if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
+                               free_swap_slot(entry);
+               }
+       }
+}
+
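+/*
+ * Drop the huge marking from the cluster backing @entry so that its swap
+ * entries are handled as individual (order 0) entries from now on.
+ */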
+int split_swap_cluster(swp_entry_t entry)
+{
+       struct swap_info_struct *si;
+       struct swap_cluster_info *ci;
+       unsigned long offset = swp_offset(entry);
+
+       si = _swap_info_get(entry);
+       if (!si)
+               return -EBUSY;
+       ci = lock_cluster(si, offset);
+       cluster_clear_huge(ci);
+       unlock_cluster(ci);
+       return 0;
 }
 #else
 static inline void swapcache_free_cluster(swp_entry_t entry)
@@ -1332,29 +1380,161 @@ out:
        return count;
 }
 
+#ifdef CONFIG_THP_SWAP
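+/*
+ * Check whether the swap entry, or any entry of the huge cluster backing it,
+ * is still referenced by something other than the swap cache.
+ */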
+static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+                                        swp_entry_t entry)
+{
+       struct swap_cluster_info *ci;
+       unsigned char *map = si->swap_map;
+       unsigned long roffset = swp_offset(entry);
+       unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+       int i;
+       bool ret = false;
+
+       ci = lock_cluster_or_swap_info(si, offset);
+       if (!ci || !cluster_is_huge(ci)) {
+               if (map[roffset] != SWAP_HAS_CACHE)
+                       ret = true;
+               goto unlock_out;
+       }
+       for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+               if (map[offset + i] != SWAP_HAS_CACHE) {
+                       ret = true;
+                       break;
+               }
+       }
+unlock_out:
+       unlock_cluster_or_swap_info(si, ci);
+       return ret;
+}
+
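+/*
+ * Like page_swapcount() != 0, but a THP swapped out as a whole is considered
+ * swapped if any of its swap entries is still in use.
+ */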
+static bool page_swapped(struct page *page)
+{
+       swp_entry_t entry;
+       struct swap_info_struct *si;
+
+       if (likely(!PageTransCompound(page)))
+               return page_swapcount(page) != 0;
+
+       page = compound_head(page);
+       entry.val = page_private(page);
+       si = _swap_info_get(entry);
+       if (si)
+               return swap_page_trans_huge_swapped(si, entry);
+       return false;
+}
+
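+/*
+ * For a THP, return the largest (map count + swap count) over all subpages,
+ * including the compound mapping; optionally also report the total map count
+ * and total swap count.  Non-compound pages fall back to the simple sums.
+ */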
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+                                        int *total_swapcount)
+{
+       int i, map_swapcount, _total_mapcount, _total_swapcount;
+       unsigned long offset = 0;
+       struct swap_info_struct *si;
+       struct swap_cluster_info *ci = NULL;
+       unsigned char *map = NULL;
+       int mapcount, swapcount = 0;
+
+       /* hugetlbfs shouldn't call it */
+       VM_BUG_ON_PAGE(PageHuge(page), page);
+
+       if (likely(!PageTransCompound(page))) {
+               mapcount = atomic_read(&page->_mapcount) + 1;
+               if (total_mapcount)
+                       *total_mapcount = mapcount;
+               if (PageSwapCache(page))
+                       swapcount = page_swapcount(page);
+               if (total_swapcount)
+                       *total_swapcount = swapcount;
+               return mapcount + swapcount;
+       }
+
+       page = compound_head(page);
+
+       _total_mapcount = _total_swapcount = map_swapcount = 0;
+       if (PageSwapCache(page)) {
+               swp_entry_t entry;
+
+               entry.val = page_private(page);
+               si = _swap_info_get(entry);
+               if (si) {
+                       map = si->swap_map;
+                       offset = swp_offset(entry);
+               }
+       }
+       if (map)
+               ci = lock_cluster(si, offset);
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               mapcount = atomic_read(&page[i]._mapcount) + 1;
+               _total_mapcount += mapcount;
+               if (map) {
+                       swapcount = swap_count(map[offset + i]);
+                       _total_swapcount += swapcount;
+               }
+               map_swapcount = max(map_swapcount, mapcount + swapcount);
+       }
+       unlock_cluster(ci);
+       if (PageDoubleMap(page)) {
+               map_swapcount -= 1;
+               _total_mapcount -= HPAGE_PMD_NR;
+       }
+       mapcount = compound_mapcount(page);
+       map_swapcount += mapcount;
+       _total_mapcount += mapcount;
+       if (total_mapcount)
+               *total_mapcount = _total_mapcount;
+       if (total_swapcount)
+               *total_swapcount = _total_swapcount;
+
+       return map_swapcount;
+}
+#else
+#define swap_page_trans_huge_swapped(si, entry)        swap_swapcount(si, entry)
+#define page_swapped(page)                     (page_swapcount(page) != 0)
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+                                        int *total_swapcount)
+{
+       int mapcount, swapcount = 0;
+
+       /* hugetlbfs shouldn't call it */
+       VM_BUG_ON_PAGE(PageHuge(page), page);
+
+       mapcount = page_trans_huge_mapcount(page, total_mapcount);
+       if (PageSwapCache(page))
+               swapcount = page_swapcount(page);
+       if (total_swapcount)
+               *total_swapcount = swapcount;
+       return mapcount + swapcount;
+}
+#endif
+
 /*
  * We can write to an anon page without COW if there are no other references
  * to it.  And as a side-effect, free up its swap: because the old content
  * on disk will never be read, and seeking back there to write new content
  * later would only waste time away from clustering.
  *
- * NOTE: total_mapcount should not be relied upon by the caller if
+ * NOTE: total_map_swapcount should not be relied upon by the caller if
  * reuse_swap_page() returns false, but it may be always overwritten
  * (see the other implementation for CONFIG_SWAP=n).
  */
-bool reuse_swap_page(struct page *page, int *total_mapcount)
+bool reuse_swap_page(struct page *page, int *total_map_swapcount)
 {
-       int count;
+       int count, total_mapcount, total_swapcount;
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        if (unlikely(PageKsm(page)))
                return false;
-       count = page_trans_huge_mapcount(page, total_mapcount);
-       if (count <= 1 && PageSwapCache(page)) {
-               count += page_swapcount(page);
-               if (count != 1)
-                       goto out;
+       count = page_trans_huge_map_swapcount(page, &total_mapcount,
+                                             &total_swapcount);
+       if (total_map_swapcount)
+               *total_map_swapcount = total_mapcount + total_swapcount;
+       if (count == 1 && PageSwapCache(page) &&
+           (likely(!PageTransCompound(page)) ||
+            /* The remaining swap count will be freed soon */
+            total_swapcount == page_swapcount(page))) {
                if (!PageWriteback(page)) {
+                       page = compound_head(page);
                        delete_from_swap_cache(page);
                        SetPageDirty(page);
                } else {
@@ -1370,7 +1550,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
                        spin_unlock(&p->lock);
                }
        }
-out:
+
        return count <= 1;
 }
 
@@ -1386,7 +1566,7 @@ int try_to_free_swap(struct page *page)
                return 0;
        if (PageWriteback(page))
                return 0;
-       if (page_swapcount(page))
+       if (page_swapped(page))
                return 0;
 
        /*
@@ -1407,6 +1587,7 @@ int try_to_free_swap(struct page *page)
        if (pm_suspended_storage())
                return 0;
 
+       page = compound_head(page);
        delete_from_swap_cache(page);
        SetPageDirty(page);
        return 1;
@@ -1428,7 +1609,8 @@ int free_swap_and_cache(swp_entry_t entry)
        p = _swap_info_get(entry);
        if (p) {
                count = __swap_entry_free(p, entry, 1);
-               if (count == SWAP_HAS_CACHE) {
+               if (count == SWAP_HAS_CACHE &&
+                   !swap_page_trans_huge_swapped(p, entry)) {
                        page = find_get_page(swap_address_space(entry),
                                             swp_offset(entry));
                        if (page && !trylock_page(page)) {
@@ -1445,7 +1627,8 @@ int free_swap_and_cache(swp_entry_t entry)
                 */
                if (PageSwapCache(page) && !PageWriteback(page) &&
                    (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
-                   !swap_swapcount(p, entry)) {
+                   !swap_page_trans_huge_swapped(p, entry)) {
+                       page = compound_head(page);
                        delete_from_swap_cache(page);
                        SetPageDirty(page);
                }
@@ -1999,7 +2182,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
                                .sync_mode = WB_SYNC_NONE,
                        };
 
-                       swap_writepage(page, &wbc);
+                       swap_writepage(compound_head(page), &wbc);
                        lock_page(page);
                        wait_on_page_writeback(page);
                }
@@ -2012,8 +2195,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
                 * delete, since it may not have been written out to swap yet.
                 */
                if (PageSwapCache(page) &&
-                   likely(page_private(page) == entry.val))
-                       delete_from_swap_cache(page);
+                   likely(page_private(page) == entry.val) &&
+                   !page_swapped(page))
+                       delete_from_swap_cache(compound_head(page));
 
                /*
                 * So we could skip searching mms once swap count went
@@ -2387,6 +2571,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        if (p->flags & SWP_CONTINUED)
                free_swap_count_continuations(p);
 
+       if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+               atomic_dec(&nr_rotate_swap);
+
        mutex_lock(&swapon_mutex);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
@@ -2963,7 +3150,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                        cluster = per_cpu_ptr(p->percpu_cluster, cpu);
                        cluster_set_null(&cluster->index);
                }
-       }
+       } else
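+               /* Rotational device: account it so VMA based readahead can be disabled */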
+               atomic_inc(&nr_rotate_swap);
 
        error = swap_cgroup_swapon(p->type, maxpages);
        if (error)
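
The consumer of nr_rotate_swap lives in mm/swap_state.c and is not part of this
diff.  A minimal sketch of how the readahead path might gate VMA based swap
readahead on it is below; the swap_use_vma_readahead() helper and the
swap_vma_readahead knob are assumptions here, not taken from the hunks above.

/* Sketch only: everything except nr_rotate_swap is assumed, not shown above */
extern bool swap_vma_readahead;         /* assumed sysfs-controlled policy knob */
extern atomic_t nr_rotate_swap;         /* defined by this patch in swapfile.c */

static inline bool swap_use_vma_readahead(void)
{
        /* Any rotational swap device active: fall back to cluster readahead */
        return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap);
}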