xfs: CIL work is serialised, not pipelined
author Dave Chinner <dchinner@redhat.com>
Wed, 11 Aug 2021 01:00:45 +0000 (18:00 -0700)
committer Darrick J. Wong <djwong@kernel.org>
Mon, 16 Aug 2021 19:09:30 +0000 (12:09 -0700)
Because we use a single work structure attached to the CIL rather
than the CIL context, we can only queue a single work item at a
time. This results in the CIL being single threaded and limits
performance when it becomes CPU bound.

The design of the CIL is that it is pipelined and multiple commits
can be running concurrently, but the way the work is currently
implemented means that it is not pipelining as it was intended. The
critical work to switch the CIL context can take a few milliseconds
to run, but the rest of the CIL context flush can take hundreds of
milliseconds to complete. The context switching is the serialisation
point of the CIL; once the context has been switched, the rest of the
context push can run asynchronously with all other context pushes.

Hence we can move the work to the CIL context so that we can run
multiple CIL pushes at the same time and spread the majority of
the work out over multiple CPUs. We can keep the per-cpu CIL commit
state on the CIL rather than the context, because the context is
pinned to the CIL until the switch is done and we aggregate and
drain the per-cpu state held on the CIL during the context switch.

However, because we no longer serialise the CIL work, we can have
effectively unlimited CIL pushes in progress. We don't want to do
this - not only does it create contention on the iclogs and the
state machine locks, we can run the log right out of space with
outstanding pushes. Instead, limit the work concurrency to 4
concurrent works being processed at a time. This is enough
concurrency to remove the CIL from being a CPU bound bottleneck but
not enough to create new contention points or unbound concurrency
issues.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
fs/xfs/xfs_log_cil.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_super.c

index 59d3bd45543bdbbe34dd596f332881ac9b5153d4..17785f4d50f7db2cddd219a20a51c612b8335015 100644 (file)
@@ -47,6 +47,34 @@ xlog_cil_ticket_alloc(
        return tic;
 }
 
+/*
+ * Unavoidable forward declaration - xlog_cil_push_work() calls
+ * xlog_cil_ctx_alloc() itself.
+ */
+static void xlog_cil_push_work(struct work_struct *work);
+
+static struct xfs_cil_ctx *
+xlog_cil_ctx_alloc(void)
+{
+       struct xfs_cil_ctx      *ctx;
+
+       ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
+       INIT_LIST_HEAD(&ctx->committing);
+       INIT_LIST_HEAD(&ctx->busy_extents);
+       INIT_WORK(&ctx->push_work, xlog_cil_push_work);
+       return ctx;
+}
+
+static void
+xlog_cil_ctx_switch(
+       struct xfs_cil          *cil,
+       struct xfs_cil_ctx      *ctx)
+{
+       ctx->sequence = ++cil->xc_current_sequence;
+       ctx->cil = cil;
+       cil->xc_ctx = ctx;
+}
+
 /*
  * After the first stage of log recovery is done, we know where the head and
  * tail of the log are. We need this log initialisation done before we can
@@ -824,11 +852,11 @@ static void
 xlog_cil_push_work(
        struct work_struct      *work)
 {
-       struct xfs_cil          *cil =
-               container_of(work, struct xfs_cil, xc_push_work);
+       struct xfs_cil_ctx      *ctx =
+               container_of(work, struct xfs_cil_ctx, push_work);
+       struct xfs_cil          *cil = ctx->cil;
        struct xlog             *log = cil->xc_log;
        struct xfs_log_vec      *lv;
-       struct xfs_cil_ctx      *ctx;
        struct xfs_cil_ctx      *new_ctx;
        struct xlog_ticket      *tic;
        int                     num_iovecs;
@@ -842,11 +870,10 @@ xlog_cil_push_work(
        DECLARE_COMPLETION_ONSTACK(bdev_flush);
        bool                    push_commit_stable;
 
-       new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);
+       new_ctx = xlog_cil_ctx_alloc();
        new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
        down_write(&cil->xc_ctx_lock);
-       ctx = cil->xc_ctx;
 
        spin_lock(&cil->xc_push_lock);
        push_seq = cil->xc_push_seq;
@@ -878,7 +905,7 @@ xlog_cil_push_work(
 
 
        /* check for a previously pushed sequence */
-       if (push_seq < cil->xc_ctx->sequence) {
+       if (push_seq < ctx->sequence) {
                spin_unlock(&cil->xc_push_lock);
                goto out_skip;
        }
@@ -951,19 +978,7 @@ xlog_cil_push_work(
        }
 
        /*
-        * initialise the new context and attach it to the CIL. Then attach
-        * the current context to the CIL committing list so it can be found
-        * during log forces to extract the commit lsn of the sequence that
-        * needs to be forced.
-        */
-       INIT_LIST_HEAD(&new_ctx->committing);
-       INIT_LIST_HEAD(&new_ctx->busy_extents);
-       new_ctx->sequence = ctx->sequence + 1;
-       new_ctx->cil = cil;
-       cil->xc_ctx = new_ctx;
-
-       /*
-        * The switch is now done, so we can drop the context lock and move out
+        * Switch the contexts so we can drop the context lock and move out
         * of a shared context. We can't just go straight to the commit record,
         * though - we need to synchronise with previous and future commits so
         * that the commit records are correctly ordered in the log to ensure
@@ -988,7 +1003,7 @@ xlog_cil_push_work(
         * deferencing a freed context pointer.
         */
        spin_lock(&cil->xc_push_lock);
-       cil->xc_current_sequence = new_ctx->sequence;
+       xlog_cil_ctx_switch(cil, new_ctx);
        spin_unlock(&cil->xc_push_lock);
        up_write(&cil->xc_ctx_lock);
 
@@ -1136,7 +1151,7 @@ xlog_cil_push_background(
        spin_lock(&cil->xc_push_lock);
        if (cil->xc_push_seq < cil->xc_current_sequence) {
                cil->xc_push_seq = cil->xc_current_sequence;
-               queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+               queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work);
        }
 
        /*
@@ -1202,7 +1217,7 @@ xlog_cil_push_now(
 
        /* start on any pending background push to minimise wait time on it */
        if (!async)
-               flush_work(&cil->xc_push_work);
+               flush_workqueue(log->l_mp->m_cil_workqueue);
 
        /*
         * If the CIL is empty or we've already pushed the sequence then
@@ -1216,7 +1231,7 @@ xlog_cil_push_now(
 
        cil->xc_push_seq = push_seq;
        cil->xc_push_commit_stable = async;
-       queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
+       queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work);
        spin_unlock(&cil->xc_push_lock);
 }
 
@@ -1456,13 +1471,6 @@ xlog_cil_init(
        if (!cil)
                return -ENOMEM;
 
-       ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
-       if (!ctx) {
-               kmem_free(cil);
-               return -ENOMEM;
-       }
-
-       INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
        INIT_LIST_HEAD(&cil->xc_cil);
        INIT_LIST_HEAD(&cil->xc_committing);
        spin_lock_init(&cil->xc_cil_lock);
@@ -1471,16 +1479,12 @@ xlog_cil_init(
        init_rwsem(&cil->xc_ctx_lock);
        init_waitqueue_head(&cil->xc_start_wait);
        init_waitqueue_head(&cil->xc_commit_wait);
-
-       INIT_LIST_HEAD(&ctx->committing);
-       INIT_LIST_HEAD(&ctx->busy_extents);
-       ctx->sequence = 1;
-       ctx->cil = cil;
-       cil->xc_ctx = ctx;
-       cil->xc_current_sequence = ctx->sequence;
-
        cil->xc_log = log;
        log->l_cilp = cil;
+
+       ctx = xlog_cil_ctx_alloc();
+       xlog_cil_ctx_switch(cil, ctx);
+
        return 0;
 }
 
index 014e0dc0ba9748c8ce2400170608c1a0ae5c165f..5aaaf5f0b35cb19f3582205e81c55234b1cd9784 100644 (file)
@@ -249,6 +249,7 @@ struct xfs_cil_ctx {
        struct list_head        iclog_entry;
        struct list_head        committing;     /* ctx committing list */
        struct work_struct      discard_endio_work;
+       struct work_struct      push_work;
 };
 
 /*
@@ -282,7 +283,6 @@ struct xfs_cil {
        wait_queue_head_t       xc_commit_wait;
        wait_queue_head_t       xc_start_wait;
        xfs_csn_t               xc_current_sequence;
-       struct work_struct      xc_push_work;
        wait_queue_head_t       xc_push_wait;   /* background push throttle */
 } ____cacheline_aligned_in_smp;
 
index 53ce250089480205930d5d0cf39a898f8a62dbb9..6d42883b8fae58c482e98dec4eaccdd4a453bdae 100644 (file)
@@ -518,9 +518,13 @@ xfs_init_mount_workqueues(
        if (!mp->m_unwritten_workqueue)
                goto out_destroy_buf;
 
+       /*
+        * Limit the CIL pipeline depth to 4 concurrent works to bound the
+        * concurrency the log spinlocks will be exposed to.
+        */
        mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
                        XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
-                       0, mp->m_super->s_id);
+                       4, mp->m_super->s_id);
        if (!mp->m_cil_workqueue)
                goto out_destroy_unwritten;