Merge tag 'for-linus-20190125' of git://git.kernel.dk/linux-block
authorLinus Torvalds <torvalds@linux-foundation.org>
Sat, 26 Jan 2019 20:42:41 +0000 (12:42 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sat, 26 Jan 2019 20:42:41 +0000 (12:42 -0800)
Pull block fixes from Jens Axboe:
 "A collection of fixes for this release. This contains:

   - Silence sparse rightfully complaining about non-static wbt
     functions (Bart)

   - Fixes for the zoned comments/ioctl documentation (Damien)

   - direct-io fix that's been lingering for a while (Ernesto)

   - cgroup writeback fix (Tejun)

   - Set of NVMe patches for nvme-rdma/tcp (Sagi, Hannes, Raju)

   - Block recursion tracking fix (Ming)

   - Fix debugfs command flag naming for a few flags (Jianchao)"

* tag 'for-linus-20190125' of git://git.kernel.dk/linux-block:
  block: Fix comment typo
  uapi: fix ioctl documentation
  blk-wbt: Declare local functions static
  blk-mq: fix the cmd_flag_name array
  nvme-multipath: drop optimization for static ANA group IDs
  nvmet-rdma: fix null dereference under heavy load
  nvme-rdma: rework queue maps handling
  nvme-tcp: fix timeout handler
  nvme-rdma: fix timeout handler
  writeback: synchronize sync(2) against cgroup writeback membership switches
  block: cover another queue enter recursion via BIO_QUEUE_ENTERED
  direct-io: allow direct writes to empty inodes

14 files changed:
block/blk-core.c
block/blk-merge.c
block/blk-mq-debugfs.c
block/blk-wbt.c
drivers/nvme/host/multipath.c
drivers/nvme/host/rdma.c
drivers/nvme/host/tcp.c
drivers/nvme/target/rdma.c
fs/direct-io.c
fs/fs-writeback.c
include/linux/backing-dev-defs.h
include/linux/blk_types.h
include/uapi/linux/blkzoned.h
mm/backing-dev.c

index 3c5f61ceeb671ee1c9181eb7fbc3d1dc65327917..1ccec27d20c38d8d2613893092e67292d3baae0f 100644 (file)
@@ -1083,7 +1083,18 @@ blk_qc_t generic_make_request(struct bio *bio)
                        /* Create a fresh bio_list for all subordinate requests */
                        bio_list_on_stack[1] = bio_list_on_stack[0];
                        bio_list_init(&bio_list_on_stack[0]);
+
+                       /*
+                        * Since we're recursing into make_request here, ensure
+                        * that we mark this bio as already having entered the queue.
+                        * If not, and the queue is going away, we can get stuck
+                        * forever on waiting for the queue reference to drop. But
+                        * that will never happen, as we're already holding a
+                        * reference to it.
+                        */
+                       bio_set_flag(bio, BIO_QUEUE_ENTERED);
                        ret = q->make_request_fn(q, bio);
+                       bio_clear_flag(bio, BIO_QUEUE_ENTERED);
 
                        /* sort new bios into those for a lower level
                         * and those for the same level
index 71e9ac03f62187a37abf5eae88f0008a4c18e5a4..d79a22f111d132f4820188429dc2c321d308c371 100644 (file)
@@ -272,16 +272,6 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
                /* there isn't chance to merge the splitted bio */
                split->bi_opf |= REQ_NOMERGE;
 
-               /*
-                * Since we're recursing into make_request here, ensure
-                * that we mark this bio as already having entered the queue.
-                * If not, and the queue is going away, we can get stuck
-                * forever on waiting for the queue reference to drop. But
-                * that will never happen, as we're already holding a
-                * reference to it.
-                */
-               bio_set_flag(*bio, BIO_QUEUE_ENTERED);
-
                bio_chain(split, *bio);
                trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
                generic_make_request(*bio);
index 90d68760af086bd6de78c71dc4a1f6954ceb7a5b..f8120832ca7b8ea44cf872c3810dd373a662656b 100644 (file)
@@ -308,8 +308,9 @@ static const char *const cmd_flag_name[] = {
        CMD_FLAG_NAME(PREFLUSH),
        CMD_FLAG_NAME(RAHEAD),
        CMD_FLAG_NAME(BACKGROUND),
-       CMD_FLAG_NAME(NOUNMAP),
        CMD_FLAG_NAME(NOWAIT),
+       CMD_FLAG_NAME(NOUNMAP),
+       CMD_FLAG_NAME(HIPRI),
 };
 #undef CMD_FLAG_NAME
 
index f0c56649775fcb35ae978b9dac27bcd4c4001fb3..fd166fbb0f6587c494e6095b8bf6e58de0c67360 100644 (file)
@@ -597,7 +597,7 @@ static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
        rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
 }
 
-void wbt_issue(struct rq_qos *rqos, struct request *rq)
+static void wbt_issue(struct rq_qos *rqos, struct request *rq)
 {
        struct rq_wb *rwb = RQWB(rqos);
 
@@ -617,7 +617,7 @@ void wbt_issue(struct rq_qos *rqos, struct request *rq)
        }
 }
 
-void wbt_requeue(struct rq_qos *rqos, struct request *rq)
+static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
 {
        struct rq_wb *rwb = RQWB(rqos);
        if (!rwb_enabled(rwb))
index df4b3a6db51bfdf8307e38da3cd568209266f0a5..b9fff3b8ed1b1dd180b50de141bbc3d2af73a485 100644 (file)
@@ -545,8 +545,7 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
        timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
        ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
                ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
-       if (!(ctrl->anacap & (1 << 6)))
-               ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
+       ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
 
        if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
                dev_err(ctrl->device,
index 0a2fd2949ad788b9f5daae0acf3d7d54b8a6ad18..52abc3a6de129cab702ee1ca488bf8946940657e 100644 (file)
@@ -119,6 +119,7 @@ struct nvme_rdma_ctrl {
 
        struct nvme_ctrl        ctrl;
        bool                    use_inline_data;
+       u32                     io_queues[HCTX_MAX_TYPES];
 };
 
 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -165,8 +166,8 @@ static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
 static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
 {
        return nvme_rdma_queue_idx(queue) >
-               queue->ctrl->ctrl.opts->nr_io_queues +
-               queue->ctrl->ctrl.opts->nr_write_queues;
+               queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+               queue->ctrl->io_queues[HCTX_TYPE_READ];
 }
 
 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
@@ -661,8 +662,21 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
        nr_io_queues = min_t(unsigned int, nr_io_queues,
                                ibdev->num_comp_vectors);
 
-       nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
-       nr_io_queues += min(opts->nr_poll_queues, num_online_cpus());
+       if (opts->nr_write_queues) {
+               ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+                               min(opts->nr_write_queues, nr_io_queues);
+               nr_io_queues += ctrl->io_queues[HCTX_TYPE_DEFAULT];
+       } else {
+               ctrl->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues;
+       }
+
+       ctrl->io_queues[HCTX_TYPE_READ] = nr_io_queues;
+
+       if (opts->nr_poll_queues) {
+               ctrl->io_queues[HCTX_TYPE_POLL] =
+                       min(opts->nr_poll_queues, num_online_cpus());
+               nr_io_queues += ctrl->io_queues[HCTX_TYPE_POLL];
+       }
 
        ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
        if (ret)
@@ -1689,18 +1703,28 @@ static enum blk_eh_timer_return
 nvme_rdma_timeout(struct request *rq, bool reserved)
 {
        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+       struct nvme_rdma_queue *queue = req->queue;
+       struct nvme_rdma_ctrl *ctrl = queue->ctrl;
 
-       dev_warn(req->queue->ctrl->ctrl.device,
-                "I/O %d QID %d timeout, reset controller\n",
-                rq->tag, nvme_rdma_queue_idx(req->queue));
+       dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
+                rq->tag, nvme_rdma_queue_idx(queue));
 
-       /* queue error recovery */
-       nvme_rdma_error_recovery(req->queue->ctrl);
+       if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+               /*
+                * Teardown immediately if controller times out while starting
+                * or we are already started error recovery. all outstanding
+                * requests are completed on shutdown, so we return BLK_EH_DONE.
+                */
+               flush_work(&ctrl->err_work);
+               nvme_rdma_teardown_io_queues(ctrl, false);
+               nvme_rdma_teardown_admin_queue(ctrl, false);
+               return BLK_EH_DONE;
+       }
 
-       /* fail with DNR on cmd timeout */
-       nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+       dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+       nvme_rdma_error_recovery(ctrl);
 
-       return BLK_EH_DONE;
+       return BLK_EH_RESET_TIMER;
 }
 
 static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1779,17 +1803,15 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
        struct nvme_rdma_ctrl *ctrl = set->driver_data;
 
        set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
-       set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
+       set->map[HCTX_TYPE_DEFAULT].nr_queues =
+                       ctrl->io_queues[HCTX_TYPE_DEFAULT];
+       set->map[HCTX_TYPE_READ].nr_queues = ctrl->io_queues[HCTX_TYPE_READ];
        if (ctrl->ctrl.opts->nr_write_queues) {
                /* separate read/write queues */
-               set->map[HCTX_TYPE_DEFAULT].nr_queues =
-                               ctrl->ctrl.opts->nr_write_queues;
                set->map[HCTX_TYPE_READ].queue_offset =
-                               ctrl->ctrl.opts->nr_write_queues;
+                               ctrl->io_queues[HCTX_TYPE_DEFAULT];
        } else {
                /* mixed read/write queues */
-               set->map[HCTX_TYPE_DEFAULT].nr_queues =
-                               ctrl->ctrl.opts->nr_io_queues;
                set->map[HCTX_TYPE_READ].queue_offset = 0;
        }
        blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
@@ -1799,12 +1821,12 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
 
        if (ctrl->ctrl.opts->nr_poll_queues) {
                set->map[HCTX_TYPE_POLL].nr_queues =
-                               ctrl->ctrl.opts->nr_poll_queues;
+                               ctrl->io_queues[HCTX_TYPE_POLL];
                set->map[HCTX_TYPE_POLL].queue_offset =
-                               ctrl->ctrl.opts->nr_io_queues;
+                               ctrl->io_queues[HCTX_TYPE_DEFAULT];
                if (ctrl->ctrl.opts->nr_write_queues)
                        set->map[HCTX_TYPE_POLL].queue_offset +=
-                               ctrl->ctrl.opts->nr_write_queues;
+                               ctrl->io_queues[HCTX_TYPE_READ];
                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
        }
        return 0;
index 265a0543b381c318f792e5088d8b2f1549cd9556..5f0a004252422f970c2f90ced7a01e4f92dd0041 100644 (file)
@@ -1948,20 +1948,23 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
        struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
 
-       dev_dbg(ctrl->ctrl.device,
+       dev_warn(ctrl->ctrl.device,
                "queue %d: timeout request %#x type %d\n",
-               nvme_tcp_queue_id(req->queue), rq->tag,
-               pdu->hdr.type);
+               nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
 
        if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
-               union nvme_result res = {};
-
-               nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
-               nvme_end_request(rq, cpu_to_le16(NVME_SC_ABORT_REQ), res);
+               /*
+                * Teardown immediately if controller times out while starting
+                * or we are already started error recovery. all outstanding
+                * requests are completed on shutdown, so we return BLK_EH_DONE.
+                */
+               flush_work(&ctrl->err_work);
+               nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
+               nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
                return BLK_EH_DONE;
        }
 
-       /* queue error recovery */
+       dev_warn(ctrl->ctrl.device, "starting error recovery\n");
        nvme_tcp_error_recovery(&ctrl->ctrl);
 
        return BLK_EH_RESET_TIMER;
index a8d23eb80192024e2fe101f91fa58d222a17b698..a884e3a0e8afee4cf0d6bf3b8a8848d1f659c7c0 100644 (file)
@@ -139,6 +139,10 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
+static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
+                               struct nvmet_rdma_rsp *r);
+static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
+                               struct nvmet_rdma_rsp *r);
 
 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
 
@@ -182,9 +186,17 @@ nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
        spin_unlock_irqrestore(&queue->rsps_lock, flags);
 
        if (unlikely(!rsp)) {
-               rsp = kmalloc(sizeof(*rsp), GFP_KERNEL);
+               int ret;
+
+               rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
                if (unlikely(!rsp))
                        return NULL;
+               ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
+               if (unlikely(ret)) {
+                       kfree(rsp);
+                       return NULL;
+               }
+
                rsp->allocated = true;
        }
 
@@ -197,6 +209,7 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
        unsigned long flags;
 
        if (unlikely(rsp->allocated)) {
+               nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
                kfree(rsp);
                return;
        }
index dbc1a1f080ceb231f0553ff270ee7663a5479bfd..ec2fb6fe6d37c628d5335b7a0971b464824b0623 100644 (file)
@@ -679,6 +679,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
        unsigned long fs_count; /* Number of filesystem-sized blocks */
        int create;
        unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
+       loff_t i_size;
 
        /*
         * If there was a memory error and we've overwritten all the
@@ -708,8 +709,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
                 */
                create = dio->op == REQ_OP_WRITE;
                if (dio->flags & DIO_SKIP_HOLES) {
-                       if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
-                                                       i_blkbits))
+                       i_size = i_size_read(dio->inode);
+                       if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
                                create = 0;
                }
 
index b40168fcc94a6ae6383600b443e67163f820abbb..36855c1f8dafdce42422e0b31ce806d6d9973979 100644 (file)
@@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
        struct work_struct      work;
 };
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+       down_write(&bdi->wb_switch_rwsem);
+}
+
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+       up_write(&bdi->wb_switch_rwsem);
+}
+
 static void inode_switch_wbs_work_fn(struct work_struct *work)
 {
        struct inode_switch_wbs_context *isw =
                container_of(work, struct inode_switch_wbs_context, work);
        struct inode *inode = isw->inode;
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct address_space *mapping = inode->i_mapping;
        struct bdi_writeback *old_wb = inode->i_wb;
        struct bdi_writeback *new_wb = isw->new_wb;
@@ -343,6 +354,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
        struct page *page;
        bool switched = false;
 
+       /*
+        * If @inode switches cgwb membership while sync_inodes_sb() is
+        * being issued, sync_inodes_sb() might miss it.  Synchronize.
+        */
+       down_read(&bdi->wb_switch_rwsem);
+
        /*
         * By the time control reaches here, RCU grace period has passed
         * since I_WB_SWITCH assertion and all wb stat update transactions
@@ -428,6 +445,8 @@ skip_switch:
        spin_unlock(&new_wb->list_lock);
        spin_unlock(&old_wb->list_lock);
 
+       up_read(&bdi->wb_switch_rwsem);
+
        if (switched) {
                wb_wakeup(new_wb);
                wb_put(old_wb);
@@ -468,9 +487,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
        if (inode->i_state & I_WB_SWITCH)
                return;
 
+       /*
+        * Avoid starting new switches while sync_inodes_sb() is in
+        * progress.  Otherwise, if the down_write protected issue path
+        * blocks heavily, we might end up starting a large number of
+        * switches which will block on the rwsem.
+        */
+       if (!down_read_trylock(&bdi->wb_switch_rwsem))
+               return;
+
        isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
        if (!isw)
-               return;
+               goto out_unlock;
 
        /* find and pin the new wb */
        rcu_read_lock();
@@ -504,12 +532,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
         * Let's continue after I_WB_SWITCH is guaranteed to be visible.
         */
        call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-       return;
+       goto out_unlock;
 
 out_free:
        if (isw->new_wb)
                wb_put(isw->new_wb);
        kfree(isw);
+out_unlock:
+       up_read(&bdi->wb_switch_rwsem);
 }
 
 /**
@@ -887,6 +917,9 @@ fs_initcall(cgroup_writeback_init);
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+
 static struct bdi_writeback *
 locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
@@ -2413,8 +2446,11 @@ void sync_inodes_sb(struct super_block *sb)
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+       /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
+       bdi_down_write_wb_switch_rwsem(bdi);
        bdi_split_work_to_wbs(bdi, &work, false);
        wb_wait_for_completion(bdi, &done);
+       bdi_up_write_wb_switch_rwsem(bdi);
 
        wait_sb_inodes(sb);
 }
index c31157135598150332d3aa193764c39e18f247b2..07e02d6df5ad9f24b262fe2d852e21235de8119f 100644 (file)
@@ -190,6 +190,7 @@ struct backing_dev_info {
        struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
        struct rb_root cgwb_congested_tree; /* their congested states */
        struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
+       struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
 #else
        struct bdi_writeback_congested *wb_congested;
 #endif
index 5c7e7f859a2493f58ac65a6f10aee7e611b01458..d66bf5f32610adce133e522b7f3852bd08817ff7 100644 (file)
@@ -287,7 +287,7 @@ enum req_opf {
        REQ_OP_DISCARD          = 3,
        /* securely erase sectors */
        REQ_OP_SECURE_ERASE     = 5,
-       /* seset a zone write pointer */
+       /* reset a zone write pointer */
        REQ_OP_ZONE_RESET       = 6,
        /* write the same sector many times */
        REQ_OP_WRITE_SAME       = 7,
index 6fa38d001d84ff5af90fe6014e7874d18f8e2bc1..498eec813494c6ccbef5d939211088b90bb534d9 100644 (file)
@@ -138,6 +138,7 @@ struct blk_zone_range {
  * @BLKRESETZONE: Reset the write pointer of the zones in the specified
  *                sector range. The sector range must be zone aligned.
  * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
+ * @BLKGETNRZONES: Get the total number of zones of the device.
  */
 #define BLKREPORTZONE  _IOWR(0x12, 130, struct blk_zone_report)
 #define BLKRESETZONE   _IOW(0x12, 131, struct blk_zone_range)
index 8a8bb8796c6c43cb711c0f618c488df188853623..72e6d0c55cfad9b51b173cca10c34011b035879e 100644 (file)
@@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
        INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
        bdi->cgwb_congested_tree = RB_ROOT;
        mutex_init(&bdi->cgwb_release_mutex);
+       init_rwsem(&bdi->wb_switch_rwsem);
 
        ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
        if (!ret) {