mm: kvfree the swap cluster info if the swap file is unsatisfactory
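cluster_info is allocated elsewhere in swapon() with kvzalloc(), which may satisfy the request from kmalloc() or fall back to vmalloc().  Memory obtained that way must be released with kvfree(); vfree() is only valid for vmalloc-backed pointers, so the bad_swap error path is switched from vfree(cluster_info) to kvfree(cluster_info).  A minimal sketch of the intended pairing (illustrative names only, the real allocation site is not shown in the hunks below):

	struct swap_cluster_info *cluster_info;

	/* backing may be kmalloc or vmalloc, depending on size and memory pressure */
	cluster_info = kvzalloc(nr_clusters * sizeof(*cluster_info), GFP_KERNEL);
	if (!cluster_info)
		return -ENOMEM;

	/* ... setting up the swap file fails ... */

	kvfree(cluster_info);	/* safe for either backing; vfree() is not */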
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6ba4aab2db0b570a241abb8935f901833c65b862..12251d386dc76b3ad8db17014086538f80ccf686 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
 EXPORT_SYMBOL_GPL(nr_swap_pages);
 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
-static int least_priority;
+static int least_priority = -1;
 
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
  * is held and the locking order requires swap_lock to be taken
  * before any swap_info_struct->lock.
  */
-static PLIST_HEAD(swap_avail_head);
+struct plist_head *swap_avail_heads;
 static DEFINE_SPINLOCK(swap_avail_lock);
 
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
 /* Activity counter to indicate that a swapon or swapoff has occurred */
 static atomic_t proc_poll_event = ATOMIC_INIT(0);
 
+atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+
 static inline unsigned char swap_count(unsigned char ent)
 {
        return ent & ~SWAP_HAS_CACHE;   /* may include SWAP_HAS_CONT flag */
@@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
        info->data = 0;
 }
 
+static inline bool cluster_is_huge(struct swap_cluster_info *info)
+{
+       return info->flags & CLUSTER_FLAG_HUGE;
+}
+
+static inline void cluster_clear_huge(struct swap_cluster_info *info)
+{
+       info->flags &= ~CLUSTER_FLAG_HUGE;
+}
+
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
                                                     unsigned long offset)
 {
@@ -580,6 +592,21 @@ new_cluster:
        return found_free;
 }
 
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+       int nid;
+
+       for_each_node(nid)
+               plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+       spin_lock(&swap_avail_lock);
+       __del_from_avail_list(p);
+       spin_unlock(&swap_avail_lock);
+}
+
 static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
                             unsigned int nr_entries)
 {
@@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
        if (si->inuse_pages == si->pages) {
                si->lowest_bit = si->max;
                si->highest_bit = 0;
-               spin_lock(&swap_avail_lock);
-               plist_del(&si->avail_list, &swap_avail_head);
-               spin_unlock(&swap_avail_lock);
+               del_from_avail_list(si);
+       }
+}
+
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+       int nid;
+
+       spin_lock(&swap_avail_lock);
+       for_each_node(nid) {
+               WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+               plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
        }
+       spin_unlock(&swap_avail_lock);
 }
 
 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
                bool was_full = !si->highest_bit;
 
                si->highest_bit = end;
-               if (was_full && (si->flags & SWP_WRITEOK)) {
-                       spin_lock(&swap_avail_lock);
-                       WARN_ON(!plist_node_empty(&si->avail_list));
-                       if (plist_node_empty(&si->avail_list))
-                               plist_add(&si->avail_list, &swap_avail_head);
-                       spin_unlock(&swap_avail_lock);
-               }
+               if (was_full && (si->flags & SWP_WRITEOK))
+                       add_to_avail_list(si);
        }
        atomic_long_add(nr_entries, &nr_swap_pages);
        si->inuse_pages -= nr_entries;
@@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
        offset = idx * SWAPFILE_CLUSTER;
        ci = lock_cluster(si, offset);
        alloc_cluster(si, idx);
-       cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+       cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
 
        map = si->swap_map + offset;
        for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
        struct swap_info_struct *si, *next;
        long avail_pgs;
        int n_ret = 0;
+       int node;
 
        /* Only single cluster request supported */
        WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
        spin_lock(&swap_avail_lock);
 
 start_over:
-       plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+       node = numa_node_id();
+       plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
                /* requeue si to after same-priority siblings */
-               plist_requeue(&si->avail_list, &swap_avail_head);
+               plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
                spin_unlock(&swap_avail_lock);
                spin_lock(&si->lock);
                if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
                        spin_lock(&swap_avail_lock);
-                       if (plist_node_empty(&si->avail_list)) {
+                       if (plist_node_empty(&si->avail_lists[node])) {
                                spin_unlock(&si->lock);
                                goto nextsi;
                        }
@@ -934,13 +968,14 @@ start_over:
                        WARN(!(si->flags & SWP_WRITEOK),
                             "swap_info %d in list but !SWP_WRITEOK\n",
                             si->type);
-                       plist_del(&si->avail_list, &swap_avail_head);
+                       __del_from_avail_list(si);
                        spin_unlock(&si->lock);
                        goto nextsi;
                }
-               if (cluster)
-                       n_ret = swap_alloc_cluster(si, swp_entries);
-               else
+               if (cluster) {
+                       if (!(si->flags & SWP_FILE))
+                               n_ret = swap_alloc_cluster(si, swp_entries);
+               } else
                        n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
                                                    n_goal, swp_entries);
                spin_unlock(&si->lock);
@@ -962,7 +997,7 @@ nextsi:
                 * swap_avail_head list then try it, otherwise start over
                 * if we have not gotten any slots.
                 */
-               if (plist_node_empty(&next->avail_list))
+               if (plist_node_empty(&next->avail_lists[node]))
                        goto start_over;
        }
 
@@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry)
        struct swap_cluster_info *ci;
        struct swap_info_struct *si;
        unsigned char *map;
-       unsigned int i;
+       unsigned int i, free_entries = 0;
+       unsigned char val;
 
-       si = swap_info_get(entry);
+       si = _swap_info_get(entry);
        if (!si)
                return;
 
        ci = lock_cluster(si, offset);
+       VM_BUG_ON(!cluster_is_huge(ci));
        map = si->swap_map + offset;
        for (i = 0; i < SWAPFILE_CLUSTER; i++) {
-               VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
-               map[i] = 0;
+               val = map[i];
+               VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+               if (val == SWAP_HAS_CACHE)
+                       free_entries++;
+       }
+       if (!free_entries) {
+               for (i = 0; i < SWAPFILE_CLUSTER; i++)
+                       map[i] &= ~SWAP_HAS_CACHE;
        }
+       cluster_clear_huge(ci);
        unlock_cluster(ci);
-       mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
-       swap_free_cluster(si, idx);
-       spin_unlock(&si->lock);
+       if (free_entries == SWAPFILE_CLUSTER) {
+               spin_lock(&si->lock);
+               ci = lock_cluster(si, offset);
+               memset(map, 0, SWAPFILE_CLUSTER);
+               unlock_cluster(ci);
+               mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+               swap_free_cluster(si, idx);
+               spin_unlock(&si->lock);
+       } else if (free_entries) {
+               for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+                       if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
+                               free_swap_slot(entry);
+               }
+       }
+}
+
+int split_swap_cluster(swp_entry_t entry)
+{
+       struct swap_info_struct *si;
+       struct swap_cluster_info *ci;
+       unsigned long offset = swp_offset(entry);
+
+       si = _swap_info_get(entry);
+       if (!si)
+               return -EBUSY;
+       ci = lock_cluster(si, offset);
+       cluster_clear_huge(ci);
+       unlock_cluster(ci);
+       return 0;
 }
 #else
 static inline void swapcache_free_cluster(swp_entry_t entry)
@@ -1332,29 +1402,161 @@ out:
        return count;
 }
 
+#ifdef CONFIG_THP_SWAP
+static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+                                        swp_entry_t entry)
+{
+       struct swap_cluster_info *ci;
+       unsigned char *map = si->swap_map;
+       unsigned long roffset = swp_offset(entry);
+       unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+       int i;
+       bool ret = false;
+
+       ci = lock_cluster_or_swap_info(si, offset);
+       if (!ci || !cluster_is_huge(ci)) {
+               if (map[roffset] != SWAP_HAS_CACHE)
+                       ret = true;
+               goto unlock_out;
+       }
+       for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+               if (map[offset + i] != SWAP_HAS_CACHE) {
+                       ret = true;
+                       break;
+               }
+       }
+unlock_out:
+       unlock_cluster_or_swap_info(si, ci);
+       return ret;
+}
+
+static bool page_swapped(struct page *page)
+{
+       swp_entry_t entry;
+       struct swap_info_struct *si;
+
+       if (likely(!PageTransCompound(page)))
+               return page_swapcount(page) != 0;
+
+       page = compound_head(page);
+       entry.val = page_private(page);
+       si = _swap_info_get(entry);
+       if (si)
+               return swap_page_trans_huge_swapped(si, entry);
+       return false;
+}
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+                                        int *total_swapcount)
+{
+       int i, map_swapcount, _total_mapcount, _total_swapcount;
+       unsigned long offset = 0;
+       struct swap_info_struct *si;
+       struct swap_cluster_info *ci = NULL;
+       unsigned char *map = NULL;
+       int mapcount, swapcount = 0;
+
+       /* hugetlbfs shouldn't call it */
+       VM_BUG_ON_PAGE(PageHuge(page), page);
+
+       if (likely(!PageTransCompound(page))) {
+               mapcount = atomic_read(&page->_mapcount) + 1;
+               if (total_mapcount)
+                       *total_mapcount = mapcount;
+               if (PageSwapCache(page))
+                       swapcount = page_swapcount(page);
+               if (total_swapcount)
+                       *total_swapcount = swapcount;
+               return mapcount + swapcount;
+       }
+
+       page = compound_head(page);
+
+       _total_mapcount = _total_swapcount = map_swapcount = 0;
+       if (PageSwapCache(page)) {
+               swp_entry_t entry;
+
+               entry.val = page_private(page);
+               si = _swap_info_get(entry);
+               if (si) {
+                       map = si->swap_map;
+                       offset = swp_offset(entry);
+               }
+       }
+       if (map)
+               ci = lock_cluster(si, offset);
+       for (i = 0; i < HPAGE_PMD_NR; i++) {
+               mapcount = atomic_read(&page[i]._mapcount) + 1;
+               _total_mapcount += mapcount;
+               if (map) {
+                       swapcount = swap_count(map[offset + i]);
+                       _total_swapcount += swapcount;
+               }
+               map_swapcount = max(map_swapcount, mapcount + swapcount);
+       }
+       unlock_cluster(ci);
+       if (PageDoubleMap(page)) {
+               map_swapcount -= 1;
+               _total_mapcount -= HPAGE_PMD_NR;
+       }
+       mapcount = compound_mapcount(page);
+       map_swapcount += mapcount;
+       _total_mapcount += mapcount;
+       if (total_mapcount)
+               *total_mapcount = _total_mapcount;
+       if (total_swapcount)
+               *total_swapcount = _total_swapcount;
+
+       return map_swapcount;
+}
+#else
+#define swap_page_trans_huge_swapped(si, entry)        swap_swapcount(si, entry)
+#define page_swapped(page)                     (page_swapcount(page) != 0)
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+                                        int *total_swapcount)
+{
+       int mapcount, swapcount = 0;
+
+       /* hugetlbfs shouldn't call it */
+       VM_BUG_ON_PAGE(PageHuge(page), page);
+
+       mapcount = page_trans_huge_mapcount(page, total_mapcount);
+       if (PageSwapCache(page))
+               swapcount = page_swapcount(page);
+       if (total_swapcount)
+               *total_swapcount = swapcount;
+       return mapcount + swapcount;
+}
+#endif
+
 /*
  * We can write to an anon page without COW if there are no other references
  * to it.  And as a side-effect, free up its swap: because the old content
  * on disk will never be read, and seeking back there to write new content
  * later would only waste time away from clustering.
  *
- * NOTE: total_mapcount should not be relied upon by the caller if
+ * NOTE: total_map_swapcount should not be relied upon by the caller if
  * reuse_swap_page() returns false, but it may be always overwritten
  * (see the other implementation for CONFIG_SWAP=n).
  */
-bool reuse_swap_page(struct page *page, int *total_mapcount)
+bool reuse_swap_page(struct page *page, int *total_map_swapcount)
 {
-       int count;
+       int count, total_mapcount, total_swapcount;
 
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        if (unlikely(PageKsm(page)))
                return false;
-       count = page_trans_huge_mapcount(page, total_mapcount);
-       if (count <= 1 && PageSwapCache(page)) {
-               count += page_swapcount(page);
-               if (count != 1)
-                       goto out;
+       count = page_trans_huge_map_swapcount(page, &total_mapcount,
+                                             &total_swapcount);
+       if (total_map_swapcount)
+               *total_map_swapcount = total_mapcount + total_swapcount;
+       if (count == 1 && PageSwapCache(page) &&
+           (likely(!PageTransCompound(page)) ||
+            /* The remaining swap count will be freed soon */
+            total_swapcount == page_swapcount(page))) {
                if (!PageWriteback(page)) {
+                       page = compound_head(page);
                        delete_from_swap_cache(page);
                        SetPageDirty(page);
                } else {
@@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
                        spin_unlock(&p->lock);
                }
        }
-out:
+
        return count <= 1;
 }
 
@@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page)
                return 0;
        if (PageWriteback(page))
                return 0;
-       if (page_swapcount(page))
+       if (page_swapped(page))
                return 0;
 
        /*
@@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page)
        if (pm_suspended_storage())
                return 0;
 
+       page = compound_head(page);
        delete_from_swap_cache(page);
        SetPageDirty(page);
        return 1;
@@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry)
        p = _swap_info_get(entry);
        if (p) {
                count = __swap_entry_free(p, entry, 1);
-               if (count == SWAP_HAS_CACHE) {
+               if (count == SWAP_HAS_CACHE &&
+                   !swap_page_trans_huge_swapped(p, entry)) {
                        page = find_get_page(swap_address_space(entry),
                                             swp_offset(entry));
                        if (page && !trylock_page(page)) {
@@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry)
                 */
                if (PageSwapCache(page) && !PageWriteback(page) &&
                    (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
-                   !swap_swapcount(p, entry)) {
+                   !swap_page_trans_huge_swapped(p, entry)) {
+                       page = compound_head(page);
                        delete_from_swap_cache(page);
                        SetPageDirty(page);
                }
@@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
                                .sync_mode = WB_SYNC_NONE,
                        };
 
-                       swap_writepage(page, &wbc);
+                       swap_writepage(compound_head(page), &wbc);
                        lock_page(page);
                        wait_on_page_writeback(page);
                }
@@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
                 * delete, since it may not have been written out to swap yet.
                 */
                if (PageSwapCache(page) &&
-                   likely(page_private(page) == entry.val))
-                       delete_from_swap_cache(page);
+                   likely(page_private(page) == entry.val) &&
+                   !page_swapped(page))
+                       delete_from_swap_cache(compound_head(page));
 
                /*
                 * So we could skip searching mms once swap count went
@@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
        return generic_swapfile_activate(sis, swap_file, span);
 }
 
+static int swap_node(struct swap_info_struct *p)
+{
+       struct block_device *bdev;
+
+       if (p->bdev)
+               bdev = p->bdev;
+       else
+               bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+       return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+
 static void _enable_swap_info(struct swap_info_struct *p, int prio,
                                unsigned char *swap_map,
                                struct swap_cluster_info *cluster_info)
 {
+       int i;
+
        if (prio >= 0)
                p->prio = prio;
        else
@@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
         * low-to-high, while swap ordering is high-to-low
         */
        p->list.prio = -p->prio;
-       p->avail_list.prio = -p->prio;
+       for_each_node(i) {
+               if (p->prio >= 0)
+                       p->avail_lists[i].prio = -p->prio;
+               else {
+                       if (swap_node(p) == i)
+                               p->avail_lists[i].prio = 1;
+                       else
+                               p->avail_lists[i].prio = -p->prio;
+               }
+       }
        p->swap_map = swap_map;
        p->cluster_info = cluster_info;
        p->flags |= SWP_WRITEOK;
@@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
         * swap_info_struct.
         */
        plist_add(&p->list, &swap_active_head);
-       spin_lock(&swap_avail_lock);
-       plist_add(&p->avail_list, &swap_avail_head);
-       spin_unlock(&swap_avail_lock);
+       add_to_avail_list(p);
 }
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                spin_unlock(&swap_lock);
                goto out_dput;
        }
-       spin_lock(&swap_avail_lock);
-       plist_del(&p->avail_list, &swap_avail_head);
-       spin_unlock(&swap_avail_lock);
+       del_from_avail_list(p);
        spin_lock(&p->lock);
        if (p->prio < 0) {
                struct swap_info_struct *si = p;
+               int nid;
 
                plist_for_each_entry_continue(si, &swap_active_head, list) {
                        si->prio++;
                        si->list.prio--;
-                       si->avail_list.prio--;
+                       for_each_node(nid) {
+                               if (si->avail_lists[nid].prio != 1)
+                                       si->avail_lists[nid].prio--;
+                       }
                }
                least_priority++;
        }
@@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        if (p->flags & SWP_CONTINUED)
                free_swap_count_continuations(p);
 
+       if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+               atomic_dec(&nr_rotate_swap);
+
        mutex_lock(&swapon_mutex);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
@@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 {
        struct swap_info_struct *p;
        unsigned int type;
+       int i;
 
        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p)
@@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
        }
        INIT_LIST_HEAD(&p->first_swap_extent.list);
        plist_node_init(&p->list, 0);
-       plist_node_init(&p->avail_list, 0);
+       for_each_node(i)
+               plist_node_init(&p->avail_lists[i], 0);
        p->flags = SWP_USED;
        spin_unlock(&swap_lock);
        spin_lock_init(&p->lock);
@@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       if (!swap_avail_heads)
+               return -ENOMEM;
+
        p = alloc_swap_info();
        if (IS_ERR(p))
                return PTR_ERR(p);
@@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                        cluster = per_cpu_ptr(p->percpu_cluster, cpu);
                        cluster_set_null(&cluster->index);
                }
-       }
+       } else
+               atomic_inc(&nr_rotate_swap);
 
        error = swap_cgroup_swapon(p->type, maxpages);
        if (error)
@@ -3052,7 +3290,7 @@ bad_swap:
        p->flags = 0;
        spin_unlock(&swap_lock);
        vfree(swap_map);
-       vfree(cluster_info);
+       kvfree(cluster_info);
        if (swap_file) {
                if (inode && S_ISREG(inode->i_mode)) {
                        inode_unlock(inode);
@@ -3457,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
                }
        }
 }
+
+static int __init swapfile_init(void)
+{
+       int nid;
+
+       swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+                                        GFP_KERNEL);
+       if (!swap_avail_heads) {
+               pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+               return -ENOMEM;
+       }
+
+       for_each_node(nid)
+               plist_head_init(&swap_avail_heads[nid]);
+
+       return 0;
+}
+subsys_initcall(swapfile_init);