mm/zsmalloc.c: fix race condition in zs_destroy_pool
[sfrench/cifs-2.6.git] / mm / zsmalloc.c
index db09eb3669c5d9ca630d1cea5ea66d85e2dea70d..08def3a0d2007c3030384e4cd4d04f83b5ac7c74 100644 (file)
@@ -52,7 +52,9 @@
 #include <linux/zsmalloc.h>
 #include <linux/zpool.h>
 #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
 #include <linux/migrate.h>
+#include <linux/wait.h>
 #include <linux/pagemap.h>
 #include <linux/fs.h>
 
@@ -267,6 +269,10 @@ struct zs_pool {
 #ifdef CONFIG_COMPACTION
        struct inode *inode;
        struct work_struct free_work;
+       /* A wait queue for when migration races with async_free_zspage() */
+       struct wait_queue_head migration_wait;
+       atomic_long_t isolated_pages;
+       bool destroying;
 #endif
 };
 
@@ -1798,19 +1804,14 @@ static void lock_zspage(struct zspage *zspage)
        } while ((page = get_next_page(page)) != NULL);
 }
 
-static struct dentry *zs_mount(struct file_system_type *fs_type,
-                               int flags, const char *dev_name, void *data)
+static int zs_init_fs_context(struct fs_context *fc)
 {
-       static const struct dentry_operations ops = {
-               .d_dname = simple_dname,
-       };
-
-       return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+       return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
 }
 
 static struct file_system_type zsmalloc_fs = {
        .name           = "zsmalloc",
-       .mount          = zs_mount,
+       .init_fs_context = zs_init_fs_context,
        .kill_sb        = kill_anon_super,
 };
 
@@ -1866,6 +1867,31 @@ static void dec_zspage_isolation(struct zspage *zspage)
        zspage->isolated--;
 }
 
+static void putback_zspage_deferred(struct zs_pool *pool,
+                                   struct size_class *class,
+                                   struct zspage *zspage)
+{
+       enum fullness_group fg;
+
+       fg = putback_zspage(class, zspage);
+       if (fg == ZS_EMPTY)
+               schedule_work(&pool->free_work);
+
+}
+
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+       VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+       atomic_long_dec(&pool->isolated_pages);
+       /*
+        * There's no possibility of racing, since wait_for_isolated_drain()
+        * checks the isolated count under &class->lock after enqueuing
+        * on migration_wait.
+        */
+       if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+               wake_up_all(&pool->migration_wait);
+}
+
 static void replace_sub_page(struct size_class *class, struct zspage *zspage,
                                struct page *newpage, struct page *oldpage)
 {
@@ -1935,6 +1961,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
         */
        if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
                get_zspage_mapping(zspage, &class_idx, &fullness);
+               atomic_long_inc(&pool->isolated_pages);
                remove_zspage(class, zspage, fullness);
        }
 
@@ -2034,8 +2061,16 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
         * Page migration is done so let's putback isolated zspage to
         * the list if @page is final isolated subpage in the zspage.
         */
-       if (!is_zspage_isolated(zspage))
-               putback_zspage(class, zspage);
+       if (!is_zspage_isolated(zspage)) {
+               /*
+                * We cannot race with zs_destroy_pool() here because we wait
+                * for isolation to hit zero before we start destroying.
+                * Also, we ensure that everyone can see pool->destroying before
+                * we start waiting.
+                */
+               putback_zspage_deferred(pool, class, zspage);
+               zs_pool_dec_isolated(pool);
+       }
 
        reset_page(page);
        put_page(page);
@@ -2081,13 +2116,12 @@ static void zs_page_putback(struct page *page)
        spin_lock(&class->lock);
        dec_zspage_isolation(zspage);
        if (!is_zspage_isolated(zspage)) {
-               fg = putback_zspage(class, zspage);
                /*
                 * Due to page_lock, we cannot free zspage immediately
                 * so let's defer.
                 */
-               if (fg == ZS_EMPTY)
-                       schedule_work(&pool->free_work);
+               putback_zspage_deferred(pool, class, zspage);
+               zs_pool_dec_isolated(pool);
        }
        spin_unlock(&class->lock);
 }
@@ -2111,8 +2145,36 @@ static int zs_register_migration(struct zs_pool *pool)
        return 0;
 }
 
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+       return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+       /*
+        * We're in the process of destroying the pool, so there are no
+        * active allocations. zs_page_isolate() fails for completely free
+        * zspages, so we need only wait for the zs_pool's isolated
+        * count to hit zero.
+        */
+       wait_event(pool->migration_wait,
+                  pool_isolated_are_drained(pool));
+}
+
 static void zs_unregister_migration(struct zs_pool *pool)
 {
+       pool->destroying = true;
+       /*
+        * We need a memory barrier here to ensure global visibility of
+        * pool->destroying. Thus pool->isolated pages will either be 0 in which
+        * case we don't care, or it will be > 0 and pool->destroying will
+        * ensure that we wake up once isolation hits 0.
+        */
+       smp_mb();
+       wait_for_isolated_drain(pool); /* This can block */
        flush_work(&pool->free_work);
        iput(pool->inode);
 }
@@ -2350,6 +2412,8 @@ struct zs_pool *zs_create_pool(const char *name)
        if (!pool->name)
                goto err;
 
+       init_waitqueue_head(&pool->migration_wait);
+
        if (create_cache(pool))
                goto err;