bcache: move closure debug file into debug directory
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 133b81225ea9c6ac61934d8593e09e94749c3152..f1f64853114b92268cbc1b4133e06cde9d6e37da 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -47,6 +47,14 @@ const char * const bch_cache_modes[] = {
        NULL
 };
 
+/* Default is -1; we skip past it for stop_when_cache_set_failed */
+const char * const bch_stop_on_failure_modes[] = {
+       "default",
+       "auto",
+       "always",
+       NULL
+};
+
 static struct kobject *bcache_kobj;
 struct mutex bch_register_lock;
 LIST_HEAD(bch_cache_sets);
@@ -521,7 +529,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
        bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
        bch_bio_map(bio, ca->disk_buckets);
 
-       closure_bio_submit(bio, &ca->prio);
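+       /*
+        * closure_bio_submit() needs the cache set so it can fail the bio
+        * early when CACHE_SET_IO_DISABLE is set on it.
+        */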
+       closure_bio_submit(ca->set, bio, &ca->prio);
        closure_sync(cl);
 }
 
@@ -833,9 +841,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
        q->limits.io_min                = block_size;
        q->limits.logical_block_size    = block_size;
        q->limits.physical_block_size   = block_size;
-       set_bit(QUEUE_FLAG_NONROT,      &d->disk->queue->queue_flags);
-       clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
-       set_bit(QUEUE_FLAG_DISCARD,     &d->disk->queue->queue_flags);
+       blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
+       blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
+       blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
 
        blk_queue_write_cache(q, true, true);
 
@@ -899,6 +907,31 @@ void bch_cached_dev_run(struct cached_dev *dc)
                pr_debug("error creating sysfs link");
 }
 
+/*
+ * If BCACHE_DEV_RATE_DW_RUNNING is set, the routine of the delayed work
+ * dc->writeback_rate_update is still running. Wait until the routine
+ * quits (BCACHE_DEV_RATE_DW_RUNNING is cleared), then cancel the delayed
+ * work. If the bit is still set after WRITEBACK_RATE_UPDATE_SECS_MAX
+ * seconds, give up waiting and cancel the work anyway.
+ */
+static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
+{
+       int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
+
+       do {
+               if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
+                             &dc->disk.flags))
+                       break;
+               time_out--;
+               schedule_timeout_interruptible(1);
+       } while (time_out > 0);
+
+       if (time_out == 0)
+               pr_warn("give up waiting for dc->writeback_rate_update to quit");
+
+       cancel_delayed_work_sync(&dc->writeback_rate_update);
+}
+
 static void cached_dev_detach_finish(struct work_struct *w)
 {
        struct cached_dev *dc = container_of(w, struct cached_dev, detach);
@@ -911,7 +944,9 @@ static void cached_dev_detach_finish(struct work_struct *w)
 
        mutex_lock(&bch_register_lock);
 
-       cancel_delayed_work_sync(&dc->writeback_rate_update);
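+       /*
+        * Only tear down the writeback rate update work if it was set up
+        * (BCACHE_DEV_WB_RUNNING); cancel_writeback_rate_update_dwork()
+        * waits for a running update to quit before cancelling it.
+        */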
+       if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+               cancel_writeback_rate_update_dwork(dc);
+
        if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
                kthread_stop(dc->writeback_thread);
                dc->writeback_thread = NULL;
@@ -954,10 +989,12 @@ void bch_cached_dev_detach(struct cached_dev *dc)
        closure_get(&dc->disk.cl);
 
        bch_writeback_queue(dc);
+
        cached_dev_put(dc);
 }
 
-int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
+                         uint8_t *set_uuid)
 {
        uint32_t rtime = cpu_to_le32(get_seconds());
        struct uuid_entry *u;
@@ -965,7 +1002,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 
        bdevname(dc->bdev, buf);
 
-       if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
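+       /*
+        * If the caller passed an explicit set_uuid, only attach to the
+        * cache set with that UUID; otherwise fall back to the set UUID
+        * recorded in the backing device's superblock.
+        */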
+       if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
+           (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
                return -ENOENT;
 
        if (dc->disk.c) {
@@ -1052,7 +1090,6 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
        if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
                bch_sectors_dirty_init(&dc->disk);
                atomic_set(&dc->has_dirty, 1);
-               refcount_inc(&dc->count);
                bch_writeback_queue(dc);
        }
 
@@ -1080,14 +1117,16 @@ static void cached_dev_free(struct closure *cl)
 {
        struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
 
-       cancel_delayed_work_sync(&dc->writeback_rate_update);
+       mutex_lock(&bch_register_lock);
+
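+       /*
+        * Stop the writeback rate update work, as in
+        * cached_dev_detach_finish().
+        */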
+       if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+               cancel_writeback_rate_update_dwork(dc);
+
        if (!IS_ERR_OR_NULL(dc->writeback_thread))
                kthread_stop(dc->writeback_thread);
        if (dc->writeback_write_wq)
                destroy_workqueue(dc->writeback_write_wq);
 
-       mutex_lock(&bch_register_lock);
-
        if (atomic_read(&dc->running))
                bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
        bcache_device_free(&dc->disk);
@@ -1157,6 +1196,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
                max(dc->disk.disk->queue->backing_dev_info->ra_pages,
                    q->backing_dev_info->ra_pages);
 
+       /* default to auto */
+       dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
+
        bch_cached_dev_request_init(dc);
        bch_cached_dev_writeback_init(dc);
        return 0;
@@ -1194,7 +1236,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
 
        list_add(&dc->list, &uncached_devices);
        list_for_each_entry(c, &bch_cache_sets, list)
-               bch_cached_dev_attach(dc, c);
+               bch_cached_dev_attach(dc, c, NULL);
 
        if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
            BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
@@ -1272,7 +1314,7 @@ static int flash_devs_run(struct cache_set *c)
        struct uuid_entry *u;
 
        for (u = c->uuids;
-            u < c->uuids + c->devices_max_used && !ret;
+            u < c->uuids + c->nr_uuids && !ret;
             u++)
                if (UUID_FLASH_ONLY(u))
                        ret = flash_dev_run(c, u);
@@ -1319,6 +1361,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
            test_bit(CACHE_SET_STOPPING, &c->flags))
                return false;
 
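+       /*
+        * Disable further I/O on the failing cache set while it is shut
+        * down; warn if the flag was somehow already set.
+        */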
+       if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
+               pr_warn("CACHE_SET_IO_DISABLE already set");
+
        /* XXX: we can be called from atomic context
        acquire_console_sem();
        */
@@ -1430,25 +1475,72 @@ static void cache_set_flush(struct closure *cl)
        closure_return(cl);
 }
 
+/*
+ * This function is only called when CACHE_SET_IO_DISABLE is set, which means
+ * the cache set is unregistering due to too many I/O errors. Whether the
+ * bcache device is stopped depends on the stop_when_cache_set_failed setting
+ * and on whether the broken cache holds dirty data:
+ *
+ * dc->stop_when_cache_set_failed    dc->has_dirty   stop bcache device
+ *  BCH_CACHED_DEV_STOP_AUTO            0               NO
+ *  BCH_CACHED_DEV_STOP_AUTO            1               YES
+ *  BCH_CACHED_DEV_STOP_ALWAYS          0               YES
+ *  BCH_CACHED_DEV_STOP_ALWAYS          1               YES
+ *
+ * The expected behavior is: if stop_when_cache_set_failed is set to "auto"
+ * via the sysfs interface, the bcache device is not stopped as long as the
+ * backing device is clean (no dirty data left on the broken cache set).
+ */
+static void conditional_stop_bcache_device(struct cache_set *c,
+                                          struct bcache_device *d,
+                                          struct cached_dev *dc)
+{
+       if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
+               pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
+                       d->disk->disk_name, c->sb.set_uuid);
+               bcache_device_stop(d);
+       } else if (atomic_read(&dc->has_dirty)) {
+               /*
+                * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
+                * and dc->has_dirty == 1
+                */
+               pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
+                       d->disk->disk_name);
+               bcache_device_stop(d);
+       } else {
+               /*
+                * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
+                * and dc->has_dirty == 0
+                */
+               pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
+                       d->disk->disk_name);
+       }
+}
+
 static void __cache_set_unregister(struct closure *cl)
 {
        struct cache_set *c = container_of(cl, struct cache_set, caching);
        struct cached_dev *dc;
+       struct bcache_device *d;
        size_t i;
 
        mutex_lock(&bch_register_lock);
 
-       for (i = 0; i < c->devices_max_used; i++)
-               if (c->devices[i]) {
-                       if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
-                           test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
-                               dc = container_of(c->devices[i],
-                                                 struct cached_dev, disk);
-                               bch_cached_dev_detach(dc);
-                       } else {
-                               bcache_device_stop(c->devices[i]);
-                       }
+       for (i = 0; i < c->devices_max_used; i++) {
+               d = c->devices[i];
+               if (!d)
+                       continue;
+
+               if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
+                   test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
+                       dc = container_of(d, struct cached_dev, disk);
+                       bch_cached_dev_detach(dc);
+                       if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+                               conditional_stop_bcache_device(c, d, dc);
+               } else {
+                       bcache_device_stop(d);
                }
+       }
 
        mutex_unlock(&bch_register_lock);
 
@@ -1553,7 +1645,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
        c->congested_read_threshold_us  = 2000;
        c->congested_write_threshold_us = 20000;
-       c->error_limit  = 8 << IO_ERROR_SHIFT;
+       c->error_limit  = DEFAULT_IO_ERROR_LIMIT;
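+       /* A freshly allocated cache set must not start with IO disabled */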
+       WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
        return c;
 err:
@@ -1716,7 +1809,7 @@ static void run_cache_set(struct cache_set *c)
        bcache_write_super(c);
 
        list_for_each_entry_safe(dc, t, &uncached_devices, list)
-               bch_cached_dev_attach(dc, c);
+               bch_cached_dev_attach(dc, c, NULL);
 
        flash_devs_run(c);
 
@@ -1833,6 +1926,7 @@ void bch_cache_release(struct kobject *kobj)
 static int cache_alloc(struct cache *ca)
 {
        size_t free;
+       size_t btree_buckets;
        struct bucket *b;
 
        __module_get(THIS_MODULE);
@@ -1840,9 +1934,19 @@ static int cache_alloc(struct cache *ca)
 
        bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
 
+       /*
+        * When ca->sb.njournal_buckets is not zero, a journal exists and
+        * btree nodes may split while it is replayed in bch_journal_replay(),
+        * so buckets of type RESERVE_BTREE are needed. In the worst case all
+        * journal buckets contain valid journal entries and every key must be
+        * replayed, so reserve as many RESERVE_BTREE buckets as there are
+        * journal buckets.
+        */
+       btree_buckets = ca->sb.njournal_buckets ?: 8;
        free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
 
-       if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+       if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
            !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
@@ -2120,7 +2224,6 @@ static int __init bcache_init(void)
        mutex_init(&bch_register_lock);
        init_waitqueue_head(&unregister_wait);
        register_reboot_notifier(&reboot);
-       closure_debug_init();
 
        bcache_major = register_blkdev(0, "bcache");
        if (bcache_major < 0) {
@@ -2132,7 +2235,7 @@ static int __init bcache_init(void)
        if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
            !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
            bch_request_init() ||
-           bch_debug_init(bcache_kobj) ||
+           bch_debug_init(bcache_kobj) || closure_debug_init() ||
            sysfs_create_files(bcache_kobj, files))
                goto err;