mm/zsmalloc.c: fix race condition in zs_destroy_pool

[sfrench/cifs-2.6.git] / mm / zsmalloc.c
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c

index db09eb3669c5d9ca630d1cea5ea66d85e2dea70d..08def3a0d2007c3030384e4cd4d04f83b5ac7c74 100644 (file)
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -52,7 +52,9 @@
  #include <linux/zsmalloc.h>
  #include <linux/zpool.h>
  #include <linux/mount.h>
+#include <linux/pseudo_fs.h>
  #include <linux/migrate.h>
+#include <linux/wait.h>
  #include <linux/pagemap.h>
  #include <linux/fs.h>
  
@@ -267,6 +269,10 @@ struct zs_pool {
  #ifdef CONFIG_COMPACTION
         struct inode *inode;
         struct work_struct free_work;
+       /* A wait queue for when migration races with async_free_zspage() */
+       struct wait_queue_head migration_wait;
+       atomic_long_t isolated_pages;
+       bool destroying;
  #endif
  };
  
@@ -1798,19 +1804,14 @@ static void lock_zspage(struct zspage *zspage)
         } while ((page = get_next_page(page)) != NULL);
  }
  
-static struct dentry *zs_mount(struct file_system_type *fs_type,
-                               int flags, const char *dev_name, void *data)
+static int zs_init_fs_context(struct fs_context *fc)
  {
-       static const struct dentry_operations ops = {
-               .d_dname = simple_dname,
-       };
-
-       return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+       return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
  }
  
  static struct file_system_type zsmalloc_fs = {
         .name           = "zsmalloc",
-       .mount          = zs_mount,
+       .init_fs_context = zs_init_fs_context,
         .kill_sb        = kill_anon_super,
  };
  
@@ -1866,6 +1867,31 @@ static void dec_zspage_isolation(struct zspage *zspage)
         zspage->isolated--;
  }
  
+static void putback_zspage_deferred(struct zs_pool *pool,
+                                   struct size_class *class,
+                                   struct zspage *zspage)
+{
+       enum fullness_group fg;
+
+       fg = putback_zspage(class, zspage);
+       if (fg == ZS_EMPTY)
+               schedule_work(&pool->free_work);
+
+}
+
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+       VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+       atomic_long_dec(&pool->isolated_pages);
+       /*
+        * The waiter should not miss this wakeup: wait_for_isolated_drain()
+        * sleeps in wait_event(), which re-checks the isolated count after
+        * enqueuing on migration_wait; it does not take class->lock.
+        */
+       if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+               wake_up_all(&pool->migration_wait);
+}
+
  static void replace_sub_page(struct size_class *class, struct zspage *zspage,
                                 struct page *newpage, struct page *oldpage)
  {
@@ -1935,6 +1961,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
          */
         if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
                 get_zspage_mapping(zspage, &class_idx, &fullness);
+               atomic_long_inc(&pool->isolated_pages);
                 remove_zspage(class, zspage, fullness);
         }
  
@@ -2034,8 +2061,16 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
          * Page migration is done so let's putback isolated zspage to
          * the list if @page is final isolated subpage in the zspage.
          */
-       if (!is_zspage_isolated(zspage))
-               putback_zspage(class, zspage);
+       if (!is_zspage_isolated(zspage)) {
+               /*
+                * We cannot race with zs_destroy_pool() here because we wait
+                * for isolation to hit zero before we start destroying.
+                * Also, we ensure that everyone can see pool->destroying before
+                * we start waiting.
+                */
+               putback_zspage_deferred(pool, class, zspage);
+               zs_pool_dec_isolated(pool);
+       }
  
         reset_page(page);
         put_page(page);
@@ -2081,13 +2116,12 @@ static void zs_page_putback(struct page *page)
         spin_lock(&class->lock);
         dec_zspage_isolation(zspage);
         if (!is_zspage_isolated(zspage)) {
-               fg = putback_zspage(class, zspage);
                 /*
                  * Due to page_lock, we cannot free zspage immediately
                  * so let's defer.
                  */
-               if (fg == ZS_EMPTY)
-                       schedule_work(&pool->free_work);
+               putback_zspage_deferred(pool, class, zspage);
+               zs_pool_dec_isolated(pool);
         }
         spin_unlock(&class->lock);
  }
@@ -2111,8 +2145,36 @@ static int zs_register_migration(struct zs_pool *pool)
         return 0;
  }
  
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+       return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Sleep until no zspage in @pool remains isolated for migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+       /*
+        * We're in the process of destroying the pool, so there are no
+        * active allocations. zs_page_isolate() fails for completely free
+        * zspages, so we need only wait for the zs_pool's isolated
+        * count to hit zero.
+        */
+       wait_event(pool->migration_wait,
+                  pool_isolated_are_drained(pool));
+}
+
  static void zs_unregister_migration(struct zs_pool *pool)
  {
+       pool->destroying = true;
+       /*
+        * We need a memory barrier here to ensure global visibility of
+        * pool->destroying. Thus pool->isolated_pages will either be 0, in
+        * which case we don't care, or it will be > 0 and pool->destroying
+        * will ensure that we wake up once isolation hits 0.
+        */
+       smp_mb();
+       wait_for_isolated_drain(pool); /* This can block */
         flush_work(&pool->free_work);
         iput(pool->inode);
  }
@@ -2350,6 +2412,11 @@ struct zs_pool *zs_create_pool(const char *name)
         if (!pool->name)
                 goto err;
  
+#ifdef CONFIG_COMPACTION
+       /* migration_wait is only a member of zs_pool under CONFIG_COMPACTION */
+       init_waitqueue_head(&pool->migration_wait);
+#endif
+
         if (create_cache(pool))
                 goto err;