workqueue: Drain BH work items on hot-unplugged CPUs
author    Tejun Heo <tj@kernel.org>
          Tue, 27 Feb 2024 01:38:55 +0000 (15:38 -1000)
committer Tejun Heo <tj@kernel.org>
          Thu, 29 Feb 2024 21:51:24 +0000 (11:51 -1000)
Boqun pointed out that workqueues aren't handling BH work items on offlined
CPUs. Unlike tasklets, which transfer out their pending tasks in
CPUHP_SOFTIRQ_DEAD, a BH workqueue would just leave them pending, which is
problematic. Note that this behavior is specific to BH workqueues, as the
non-BH per-CPU workers simply become unbound when the CPU goes offline.
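
For example, a BH work item queued roughly as follows (a hypothetical caller;
my_bh_workfn and my_bh_work are illustrative names, system_bh_wq is the BH
workqueue added earlier in this series) would previously sit on the dead CPU's
BH pool forever if the CPU was hot-unplugged before the softirq got to run:

	static void my_bh_workfn(struct work_struct *work)
	{
		/* executes in BH (softirq) context on the queueing CPU */
	}
	static DECLARE_WORK(my_bh_work, my_bh_workfn);

	/* queued on the soon-to-be-offlined local CPU, e.g. from an irq handler */
	queue_work(system_bh_wq, &my_bh_work);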

This patch fixes the issue by draining the pending BH work items of an
offlined CPU from CPUHP_SOFTIRQ_DEAD. Because work items carry more context,
it's not as easy to transfer the pending work items from one pool to another.
Instead, special BH work items are queued on an online CPU to execute the
offlined pools' pending work items there.
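
With this in place, the hot-unplug path becomes roughly the following (a
sketch of the call flow added by this patch, not part of the diff itself):

	/* CPUHP_SOFTIRQ_DEAD callback, runs on an online CPU */
	takeover_tasklets(cpu)
	  workqueue_softirq_dead(cpu)
	    for each BH worker_pool of the dead CPU that still has work:
	      queue drain_dead_softirq_workfn() on this CPU's system_bh_wq
	      (or system_bh_highpri_wq for the highpri pool) and
	      wait_for_completion() until bh_worker() has drained the dead pool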

Note that this assumes no further BH work items will be queued on the
offlined CPUs. The assumption is shared with tasklets and should be fine for
conversions. However, the same issue also exists for per-CPU workqueues, which
will just keep executing, on unbound workers, work items queued after the CPU
went offline; workqueue should reject per-CPU and BH work items queued on
offline CPUs. This will be addressed separately later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Link: http://lkml.kernel.org/r/Zdvw0HdSXcU3JZ4g@boqun-archlinux
include/linux/workqueue.h
kernel/softirq.c
kernel/workqueue.c

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 64a60b9232d30f21437dada0e54f08e3d8ccc433..158784dd189ab2a424b02fca0760f95f0af7de2d 100644
@@ -458,6 +458,7 @@ extern struct workqueue_struct *system_bh_wq;
 extern struct workqueue_struct *system_bh_highpri_wq;
 
 void workqueue_softirq_action(bool highpri);
+void workqueue_softirq_dead(unsigned int cpu);
 
 /**
  * alloc_workqueue - allocate a workqueue
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 547d282548a8e3543763eed9ce0f01633ff8dcd7..b315b21fb28cd281fc38d1a8a00ed9f9527e2f53 100644
@@ -932,6 +932,8 @@ static void run_ksoftirqd(unsigned int cpu)
 #ifdef CONFIG_HOTPLUG_CPU
 static int takeover_tasklets(unsigned int cpu)
 {
+       workqueue_softirq_dead(cpu);
+
        /* CPU is dead, so no lock needed. */
        local_irq_disable();
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 38783e3a60bbfa627c34a308c5fb87b19615d533..a60eb65955e739108c878229ee3042dcf311363e 100644
@@ -81,6 +81,7 @@ enum worker_pool_flags {
        POOL_BH                 = 1 << 0,       /* is a BH pool */
        POOL_MANAGER_ACTIVE     = 1 << 1,       /* being managed */
        POOL_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
+       POOL_BH_DRAINING        = 1 << 3,       /* draining after CPU offline */
 };
 
 enum worker_flags {
@@ -1218,7 +1219,9 @@ static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
 static void kick_bh_pool(struct worker_pool *pool)
 {
 #ifdef CONFIG_SMP
-       if (unlikely(pool->cpu != smp_processor_id())) {
+       /* see drain_dead_softirq_workfn() for BH_DRAINING */
+       if (unlikely(pool->cpu != smp_processor_id() &&
+                    !(pool->flags & POOL_BH_DRAINING))) {
                irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
                return;
        }
@@ -3155,6 +3158,7 @@ __acquires(&pool->lock)
        struct worker_pool *pool = worker->pool;
        unsigned long work_data;
        int lockdep_start_depth, rcu_start_depth;
+       bool bh_draining = pool->flags & POOL_BH_DRAINING;
 #ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the struct work_struct from
@@ -3220,7 +3224,9 @@ __acquires(&pool->lock)
 
        rcu_start_depth = rcu_preempt_depth();
        lockdep_start_depth = lockdep_depth(current);
-       lock_map_acquire(&pwq->wq->lockdep_map);
+       /* see drain_dead_softirq_workfn() */
+       if (!bh_draining)
+               lock_map_acquire(&pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        /*
         * Strictly speaking we should mark the invariant state without holding
@@ -3253,7 +3259,8 @@ __acquires(&pool->lock)
        trace_workqueue_execute_end(work, worker->current_func);
        pwq->stats[PWQ_STAT_COMPLETED]++;
        lock_map_release(&lockdep_map);
-       lock_map_release(&pwq->wq->lockdep_map);
+       if (!bh_draining)
+               lock_map_release(&pwq->wq->lockdep_map);
 
        if (unlikely((worker->task && in_atomic()) ||
                     lockdep_depth(current) != lockdep_start_depth ||
@@ -3615,6 +3622,84 @@ void workqueue_softirq_action(bool highpri)
                bh_worker(list_first_entry(&pool->workers, struct worker, node));
 }
 
+struct wq_drain_dead_softirq_work {
+       struct work_struct      work;
+       struct worker_pool      *pool;
+       struct completion       done;
+};
+
+static void drain_dead_softirq_workfn(struct work_struct *work)
+{
+       struct wq_drain_dead_softirq_work *dead_work =
+               container_of(work, struct wq_drain_dead_softirq_work, work);
+       struct worker_pool *pool = dead_work->pool;
+       bool repeat;
+
+       /*
+        * @pool's CPU is dead and we want to execute its still pending work
+        * items from this BH work item which is running on a different CPU. As
+        * its CPU is dead, @pool can't be kicked and, as work execution path
+        * will be nested, a lockdep annotation needs to be suppressed. Mark
+        * @pool with %POOL_BH_DRAINING for the special treatments.
+        */
+       raw_spin_lock_irq(&pool->lock);
+       pool->flags |= POOL_BH_DRAINING;
+       raw_spin_unlock_irq(&pool->lock);
+
+       bh_worker(list_first_entry(&pool->workers, struct worker, node));
+
+       raw_spin_lock_irq(&pool->lock);
+       pool->flags &= ~POOL_BH_DRAINING;
+       repeat = need_more_worker(pool);
+       raw_spin_unlock_irq(&pool->lock);
+
+       /*
+        * bh_worker() might hit consecutive execution limit and bail. If there
+        * still are pending work items, reschedule self and return so that we
+        * don't hog this CPU's BH.
+        */
+       if (repeat) {
+               if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+                       queue_work(system_bh_highpri_wq, work);
+               else
+                       queue_work(system_bh_wq, work);
+       } else {
+               complete(&dead_work->done);
+       }
+}
+
+/*
+ * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
+ * possible to allocate dead_work per CPU and avoid flushing. However, then we
+ * have to worry about draining overlapping with CPU coming back online or
+ * nesting (one CPU's dead_work queued on another CPU which is also dead and so
+ * on). Let's keep it simple and drain them synchronously. These are BH work
+ * items which shouldn't be requeued on the same pool. Shouldn't take long.
+ */
+void workqueue_softirq_dead(unsigned int cpu)
+{
+       int i;
+
+       for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
+               struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i];
+               struct wq_drain_dead_softirq_work dead_work;
+
+               if (!need_more_worker(pool))
+                       continue;
+
+               INIT_WORK(&dead_work.work, drain_dead_softirq_workfn);
+               dead_work.pool = pool;
+               init_completion(&dead_work.done);
+
+               if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+                       queue_work(system_bh_highpri_wq, &dead_work.work);
+               else
+                       queue_work(system_bh_wq, &dead_work.work);
+
+               wait_for_completion(&dead_work.done);
+       }
+}
+
 /**
  * check_flush_dependency - check for flush dependency sanity
  * @target_wq: workqueue being flushed