blkcg: implement REQ_CGROUP_PUNT

author Tejun Heo <tj@kernel.org>

Thu, 27 Jun 2019 20:39:52 +0000 (13:39 -0700)

committer Jens Axboe <axboe@kernel.dk>

Wed, 10 Jul 2019 15:00:57 +0000 (09:00 -0600)
author Tejun Heo <tj@kernel.org>
Thu, 27 Jun 2019 20:39:52 +0000 (13:39 -0700)
committer Jens Axboe <axboe@kernel.dk>
Wed, 10 Jul 2019 15:00:57 +0000 (09:00 -0600)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index ad7a91dec934624418f83c1a0820dba00ff55584..24ed26957367e2cbba91590040c9454612edad98 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -55,6 +55,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  static LIST_HEAD(all_blkcgs);          /* protected by blkcg_pol_mutex */
  
  static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
  
  static bool blkcg_policy_enabled(struct request_queue *q,
                                  const struct blkcg_policy *pol)
@@ -89,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
  {
         struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
  
+       WARN_ON(!bio_list_empty(&blkg->async_bios));
+
         /* release the blkcg and parent blkg refs this blkg has been holding */
         css_put(&blkg->blkcg->css);
         if (blkg->parent)
@@ -114,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
         call_rcu(&blkg->rcu_head, __blkg_release);
  }
  
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+       struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+                                            async_bio_work);
+       struct bio_list bios = BIO_EMPTY_LIST;
+       struct bio *bio;
+
+       /*
+        * Work function installed on blkg->async_bio_work by blkg_alloc():
+        * issues the bios that __blkcg_punt_bio_submit() queued on
+        * blkg->async_bios.
+        *
+        * As long as there are pending bios, @blkg can't go away.
+        *
+        * Move every pending bio onto the local @bios list and reset the
+        * shared list while holding async_bio_lock, so that the lock is not
+        * held across the submit_bio() calls below.
+        */
+       spin_lock_bh(&blkg->async_bio_lock);
+       bio_list_merge(&bios, &blkg->async_bios);
+       bio_list_init(&blkg->async_bios);
+       spin_unlock_bh(&blkg->async_bio_lock);
+
+       while ((bio = bio_list_pop(&bios)))
+               submit_bio(bio);        /* REQ_CGROUP_PUNT was cleared at queue time, so no re-punt */
+}
+
  /**
   * blkg_alloc - allocate a blkg
   * @blkcg: block cgroup the new blkg is associated with
@@ -142,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
  
         blkg->q = q;
         INIT_LIST_HEAD(&blkg->q_node);
+       spin_lock_init(&blkg->async_bio_lock);
+       bio_list_init(&blkg->async_bios);
+       INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
         blkg->blkcg = blkcg;
  
         for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1528,6 +1551,25 @@ out_unlock:
  }
  EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
  
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+       struct blkcg_gq *blkg = bio->bi_blkg;
+
+       /*
+        * Out-of-line half of blkcg_punt_bio_submit(), called for a @bio
+        * that has REQ_CGROUP_PUNT set.  Returns true when @bio has been
+        * queued on its blkg for blkg_async_bio_workfn() to submit, in which
+        * case submit_bio() returns without issuing it; returns false when
+        * the caller should carry on and issue @bio itself.
+        *
+        * Consume the flag first: it is cleared on both return paths, so
+        * the later submit_bio() of this bio does not punt it again.
+        */
+       bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+       /* never bounce for the root cgroup, i.e. when blkg->parent is NULL */
+       if (!blkg->parent)
+               return false;
+
+       spin_lock_bh(&blkg->async_bio_lock);
+       bio_list_add(&blkg->async_bios, bio);
+       spin_unlock_bh(&blkg->async_bio_lock);
+
+       queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);   /* schedules blkg_async_bio_workfn() */
+       return true;
+}
+
  /*
   * Scale the accumulated delay based on how long it has been since we updated
   * the delay.  We only call this when we are adding delay, in case it's been a
@@ -1729,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
         atomic64_add(delta, &blkg->delay_nsec);
  }
  
+static int __init blkcg_init(void)     /* registered with subsys_initcall() below */
+{
+       blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",   /* wq that __blkcg_punt_bio_submit() queues work on */
+                                           WQ_MEM_RECLAIM | WQ_FREEZABLE |
+                                           WQ_UNBOUND | WQ_SYSFS, 0);
+       if (!blkcg_punt_bio_wq)         /* workqueue could not be allocated */
+               return -ENOMEM;
+       return 0;
+}
+subsys_initcall(blkcg_init);
+
  module_param(blkcg_debug_stats, bool, 0644);
  MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c

index edd009213f5b62473e597fd0c176d6df1b0aacdb..260e36a2c34314e979875dd73c2e15a3c1354528 100644 (file)
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1128,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
   */
  blk_qc_t submit_bio(struct bio *bio)
  {
+       if (blkcg_punt_bio_submit(bio))
+               return BLK_QC_T_NONE;
+
         /*
          * If it's a regular read/write or a barrier with data attached,
          * go through the normal accounting stuff before submission.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h

index f9b02918024123aac03fef8312af8a3a18f19973..35b31d176f7475273704192bf9dc448ea09b3cd3 100644 (file)
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
  extern struct list_head bdi_list;
  
  extern struct workqueue_struct *bdi_wq;
+extern struct workqueue_struct *bdi_async_bio_wq;      /* NOTE(review): no definition or user of this symbol appears in this patch -- confirm it is intended */
  
  static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
  {
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h

index 33f23a8584387b1baa3aeb107acc1fd8086cffa0..689a582312887d05b1d81212454e02560e36c4de 100644 (file)
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -132,13 +132,17 @@ struct blkcg_gq {
  
         struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
  
-       struct rcu_head                 rcu_head;
+       spinlock_t                      async_bio_lock;
+       struct bio_list                 async_bios;
+       struct work_struct              async_bio_work;
  
         atomic_t                        use_delay;
         atomic64_t                      delay_nsec;
         atomic64_t                      delay_start;
         u64                             last_delay;
         int                             last_use;
+
+       struct rcu_head                 rcu_head;
  };
  
  typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
                                   struct bio *bio) { return false; }
  #endif
  
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)      /* true: @bio was punted, caller must not issue it */
+{
+       if (bio->bi_opf & REQ_CGROUP_PUNT)      /* only flagged bios take the out-of-line path */
+               return __blkcg_punt_bio_submit(bio);
+       else
+               return false;   /* unflagged bio: caller submits it as usual */
+}
  
  static inline void blkcg_bio_issue_init(struct bio *bio)
  {
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
  static inline void blkg_get(struct blkcg_gq *blkg) { }
  static inline void blkg_put(struct blkcg_gq *blkg) { }
  
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
  static inline void blkcg_bio_issue_init(struct bio *bio) { }
  static inline bool blkcg_bio_issue_check(struct request_queue *q,
                                          struct bio *bio) { return true; }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h

index 6a53799c3fe2d9f1d9fd0e66adc1bbbffcb24125..feff3fe4467ec97ce92aa7e6b5b13ef05012bb47 100644 (file)
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,14 @@ enum req_flag_bits {
         __REQ_RAHEAD,           /* read ahead, can fail anytime */
         __REQ_BACKGROUND,       /* background IO */
         __REQ_NOWAIT,           /* Don't wait if request will block */
+       /*
+        * When a shared kthread needs to issue a bio for a cgroup, doing
+        * so synchronously can lead to priority inversions as the kthread
+        * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
+        * submit_bio() punt the actual issuing to a dedicated per-blkcg
+        * work item to avoid such priority inversions.
+        */
+       __REQ_CGROUP_PUNT,
  
         /* command specific flags for REQ_OP_WRITE_ZEROES: */
         __REQ_NOUNMAP,          /* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
  #define REQ_RAHEAD             (1ULL << __REQ_RAHEAD)
  #define REQ_BACKGROUND         (1ULL << __REQ_BACKGROUND)
  #define REQ_NOWAIT             (1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT                (1ULL << __REQ_CGROUP_PUNT)
+
  #define REQ_NOUNMAP            (1ULL << __REQ_NOUNMAP)
  #define REQ_HIPRI              (1ULL << __REQ_HIPRI)
  
diff --git a/include/linux/writeback.h b/include/linux/writeback.h

index e056a22075cf4786d3a087e204967a410b03246d..8945aac313929ca1afa831a58a4ea0661a02fd11 100644 (file)
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -78,6 +78,8 @@ struct writeback_control {
          */
         unsigned no_cgroup_owner:1;
  
+       unsigned punt_to_cgroup:1;      /* cgrp punting, see __REQ_CGROUP_PUNT */
+
  #ifdef CONFIG_CGROUP_WRITEBACK
         struct bdi_writeback *wb;       /* wb this writeback is issued under */
         struct inode *inode;            /* inode being written out */
@@ -94,12 +96,17 @@ struct writeback_control {
  
  static inline int wbc_to_write_flags(struct writeback_control *wbc)
  {
+       int flags = 0;
+
+       if (wbc->punt_to_cgroup)
+               flags = REQ_CGROUP_PUNT;
+
         if (wbc->sync_mode == WB_SYNC_ALL)
-               return REQ_SYNC;
+               flags |= REQ_SYNC;
         else if (wbc->for_kupdate || wbc->for_background)
-               return REQ_BACKGROUND;
+               flags |= REQ_BACKGROUND;
  
-       return 0;
+       return flags;
  }
  
  static inline struct cgroup_subsys_state *
author	Tejun Heo <tj@kernel.org>
	Thu, 27 Jun 2019 20:39:52 +0000 (13:39 -0700)
committer	Jens Axboe <axboe@kernel.dk>
	Wed, 10 Jul 2019 15:00:57 +0000 (09:00 -0600)
block/blk-cgroup.c		patch \| blob \| history
block/blk-core.c		patch \| blob \| history
include/linux/backing-dev.h		patch \| blob \| history
include/linux/blk-cgroup.h		patch \| blob \| history
include/linux/blk_types.h		patch \| blob \| history
include/linux/writeback.h		patch \| blob \| history