diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 962012135b62acf7e956df6fc5780780c3f8a9c4..150e49723c15af38167f3d6aa6a3ddb9a745ed4b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -97,7 +97,6 @@ static dev_t nvme_chr_devt;
 static struct class *nvme_class;
 static struct class *nvme_subsys_class;
 
-static void nvme_ns_remove(struct nvme_ns *ns);
 static int nvme_revalidate_disk(struct gendisk *disk);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -245,12 +244,31 @@ static inline bool nvme_req_needs_retry(struct request *req)
        return true;
 }
 
+static void nvme_retry_req(struct request *req)
+{
+       struct nvme_ns *ns = req->q->queuedata;
+       unsigned long delay = 0;
+       u16 crd;
+
+       /* CRD occupies status bits 12:11, so the mask-and-shift result is at most 3 */
+       crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
+       if (ns && crd)
+               delay = ns->ctrl->crdt[crd - 1] * 100;
+
+       nvme_req(req)->retries++;
+       blk_mq_requeue_request(req, false);
+       blk_mq_delay_kick_requeue_list(req->q, delay);
+}
+
 void nvme_complete_rq(struct request *req)
 {
        blk_status_t status = nvme_error_status(req);
 
        trace_nvme_complete_rq(req);
 
+       if (nvme_req(req)->ctrl->kas)
+               nvme_req(req)->ctrl->comp_seen = true;
+
        if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
                if ((req->cmd_flags & REQ_NVME_MPATH) &&
                    blk_path_error(status)) {
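The delayed requeue above decodes the Command Retry Delay (CRD) field from the
completion status. A minimal sketch of the decoding, assuming the usual
NVME_SC_CRD mask of 0x1800 (status bits 12:11) and CRDT values reported by
Identify Controller in units of 100 milliseconds:

	u16 crd = (status & NVME_SC_CRD) >> 11;	/* 0..3 by construction */
	if (crd)				/* 0 means retry immediately */
		delay_ms = ctrl->crdt[crd - 1] * 100;

For example, a status word of 0x0800 yields crd == 1, so the request is
requeued after crdt[0] * 100 milliseconds.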
@@ -259,8 +277,7 @@ void nvme_complete_rq(struct request *req)
                }
 
                if (!blk_queue_dying(req->q)) {
-                       nvme_req(req)->retries++;
-                       blk_mq_requeue_request(req, true);
+                       nvme_retry_req(req);
                        return;
                }
        }
@@ -268,14 +285,14 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
-void nvme_cancel_request(struct request *req, void *data, bool reserved)
+bool nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
        dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
                                "Cancelling I/O %d", req->tag);
 
        nvme_req(req)->status = NVME_SC_ABORT_REQ;
        blk_mq_complete_request(req);
-
+       return true;
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);
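The void-to-bool change tracks the block layer's busy_tag_iter_fn type, which
now returns whether blk_mq_tagset_busy_iter() should keep walking the tag set;
returning true continues the iteration. A typical teardown call, sketched
after the PCIe transport (dev is hypothetical shorthand for the transport's
device structure):

	/* cancel every in-flight request on the tag set */
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);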
 
@@ -536,7 +553,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 static inline void nvme_setup_flush(struct nvme_ns *ns,
                struct nvme_command *cmnd)
 {
-       memset(cmnd, 0, sizeof(*cmnd));
        cmnd->common.opcode = nvme_cmd_flush;
        cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 }
@@ -548,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
        struct nvme_dsm_range *range;
        struct bio *bio;
 
-       range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
-       if (!range)
-               return BLK_STS_RESOURCE;
+       range = kmalloc_array(segments, sizeof(*range),
+                               GFP_ATOMIC | __GFP_NOWARN);
+       if (!range) {
+               /*
+                * If we fail to allocate our range, fall back to the
+                * controller discard page. If that is also busy, it is
+                * safe to return busy, as we know we can make progress
+                * once the page is freed.
+                */
+               if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
+                       return BLK_STS_RESOURCE;
+
+               range = page_address(ns->ctrl->discard_page);
+       }
 
        __rq_for_each_bio(bio, req) {
                u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
@@ -565,11 +591,13 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
        }
 
        if (WARN_ON_ONCE(n != segments)) {
-               kfree(range);
+               if (virt_to_page(range) == ns->ctrl->discard_page)
+                       clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+               else
+                       kfree(range);
                return BLK_STS_IOERR;
        }
 
-       memset(cmnd, 0, sizeof(*cmnd));
        cmnd->dsm.opcode = nvme_cmd_dsm;
        cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
        cmnd->dsm.nr = cpu_to_le32(segments - 1);
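Each discarded bio becomes one Dataset Management range descriptor, so the
fallback page has to hold up to NVME_DSM_MAX_RANGES of them. The descriptor is
16 bytes; a sketch of its layout, following the naming in the kernel's nvme.h:

	struct nvme_dsm_range {
		__le32	cattr;	/* context attributes */
		__le32	nlb;	/* number of logical blocks */
		__le64	slba;	/* starting LBA */
	};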
@@ -598,7 +626,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
        if (req->cmd_flags & REQ_RAHEAD)
                dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
-       memset(cmnd, 0, sizeof(*cmnd));
        cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
        cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
        cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
@@ -650,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req)
                                blk_rq_bytes(req) >> ns->lba_shift);
        }
        if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-               kfree(page_address(req->special_vec.bv_page) +
-                     req->special_vec.bv_offset);
+               struct nvme_ns *ns = req->rq_disk->private_data;
+               struct page *page = req->special_vec.bv_page;
+
+               if (page == ns->ctrl->discard_page)
+                       clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+               else
+                       kfree(page_address(page) + req->special_vec.bv_offset);
        }
 }
 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
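Bit 0 of discard_page_busy works as a try-lock on the fallback page: the
submission path takes it with test_and_set_bit_lock() and this cleanup path
drops it with clear_bit_unlock(), so ownership travels with the request.
Reduced to a skeleton:

	/* submission */
	if (test_and_set_bit_lock(0, &ctrl->discard_page_busy))
		return BLK_STS_RESOURCE;	/* page owned elsewhere; retried later */
	/* cleanup */
	clear_bit_unlock(0, &ctrl->discard_page_busy);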
@@ -663,6 +695,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 
        nvme_clear_nvme_request(req);
 
+       memset(cmd, 0, sizeof(*cmd));
        switch (req_op(req)) {
        case REQ_OP_DRV_IN:
        case REQ_OP_DRV_OUT:
@@ -691,6 +724,31 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 }
 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
 
+static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
+{
+       struct completion *waiting = rq->end_io_data;
+
+       rq->end_io_data = NULL;
+       complete(waiting);
+}
+
+static void nvme_execute_rq_polled(struct request_queue *q,
+               struct gendisk *bd_disk, struct request *rq, int at_head)
+{
+       DECLARE_COMPLETION_ONSTACK(wait);
+
+       WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
+
+       rq->cmd_flags |= REQ_HIPRI;
+       rq->end_io_data = &wait;
+       blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
+
+       while (!completion_done(&wait)) {
+               blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
+               cond_resched();
+       }
+}
+
 /*
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
@@ -698,7 +756,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
                union nvme_result *result, void *buffer, unsigned bufflen,
                unsigned timeout, int qid, int at_head,
-               blk_mq_req_flags_t flags)
+               blk_mq_req_flags_t flags, bool poll)
 {
        struct request *req;
        int ret;
@@ -715,7 +773,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
                        goto out;
        }
 
-       blk_execute_rq(req->q, NULL, req, at_head);
+       if (poll)
+               nvme_execute_rq_polled(req->q, NULL, req, at_head);
+       else
+               blk_execute_rq(req->q, NULL, req, at_head);
        if (result)
                *result = nvme_req(req)->result;
        if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
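A caller that passes poll == true must know its queue has QUEUE_FLAG_POLL set;
the request is then tagged REQ_HIPRI and its completion reaped by spinning on
blk_poll() rather than sleeping on an interrupt. A hedged caller sketch
(buffer, bufflen, and timeout zeroed for brevity):

	ret = __nvme_submit_sync_cmd(q, &cmd, &res, NULL, 0, 0,
				     NVME_QID_ANY, 0, 0, /* poll */ true);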
@@ -732,7 +793,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
                void *buffer, unsigned bufflen)
 {
        return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
-                       NVME_QID_ANY, 0, 0);
+                       NVME_QID_ANY, 0, 0, false);
 }
 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
 
@@ -843,6 +904,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
                return;
        }
 
+       ctrl->comp_seen = false;
        spin_lock_irqsave(&ctrl->lock, flags);
        if (ctrl->state == NVME_CTRL_LIVE ||
            ctrl->state == NVME_CTRL_CONNECTING)
@@ -873,6 +935,15 @@ static void nvme_keep_alive_work(struct work_struct *work)
 {
        struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
                        struct nvme_ctrl, ka_work);
+       bool comp_seen = ctrl->comp_seen;
+
+       if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
+               dev_dbg(ctrl->device,
+                       "reschedule traffic based keep-alive timer\n");
+               ctrl->comp_seen = false;
+               schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+               return;
+       }
 
        if (nvme_keep_alive(ctrl)) {
                /* allocation failure, reset the controller */
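Traffic-based keep-alive (TBKAS) lets ordinary command completions stand in
for keep-alive commands: when the controller advertises it and comp_seen shows
traffic during the last interval, the work item rearms itself without sending
anything. The capability check reads a CTRATT bit from Identify Controller; a
sketch, assuming the bit assignment used by the kernel headers:

	#define NVME_CTRL_ATTR_TBKAS	(1 << 6)	/* assumed: CTRATT bit 6 */

	bool tbkas = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS;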
@@ -1041,7 +1112,7 @@ static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword
        c.features.dword11 = cpu_to_le32(dword11);
 
        ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
-                       buffer, buflen, 0, NVME_QID_ANY, 0, 0);
+                       buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
        if (ret >= 0 && result)
                *result = le32_to_cpu(res.u32);
        return ret;
@@ -1240,12 +1311,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
        c.common.nsid = cpu_to_le32(cmd.nsid);
        c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
        c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-       c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
-       c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
-       c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
-       c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
-       c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
-       c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+       c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+       c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+       c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+       c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+       c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+       c.common.cdw15 = cpu_to_le32(cmd.cdw15);
 
        if (cmd.timeout_ms)
                timeout = msecs_to_jiffies(cmd.timeout_ms);
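Replacing the cdw10[6] array with named members keeps the code aligned with
the spec's dword names and removes easy-to-misread index arithmetic. The
resulting layout, sketched (abridged) from struct nvme_common_command in the
kernel's nvme.h:

	struct nvme_common_command {
		__u8			opcode;
		__u8			flags;
		__u16			command_id;
		__le32			nsid;
		__le32			cdw2[2];
		__le64			metadata;
		union nvme_data_ptr	dptr;
		__le32			cdw10;
		__le32			cdw11;
		__le32			cdw12;
		__le32			cdw13;
		__le32			cdw14;
		__le32			cdw15;
	};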
@@ -1524,8 +1595,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
        if (ns->noiob)
                nvme_set_chunk_size(ns);
        nvme_update_disk_info(disk, ns, id);
-       if (ns->ndev)
-               nvme_nvm_update_nvm_info(ns);
 #ifdef CONFIG_NVME_MULTIPATH
        if (ns->head->disk) {
                nvme_update_disk_info(ns->head->disk, ns, id);
@@ -1608,7 +1677,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
        memset(&c, 0, sizeof(c));
        c.common.opcode = op;
        c.common.nsid = cpu_to_le32(ns->head->ns_id);
-       c.common.cdw10[0] = cpu_to_le32(cdw10);
+       c.common.cdw10 = cpu_to_le32(cdw10);
 
        ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
        nvme_put_ns_from_disk(head, srcu_idx);
@@ -1682,11 +1751,11 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
        else
                cmd.common.opcode = nvme_admin_security_recv;
        cmd.common.nsid = 0;
-       cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
-       cmd.common.cdw10[1] = cpu_to_le32(len);
+       cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+       cmd.common.cdw11 = cpu_to_le32(len);
 
        return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
-                                     ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
+                                     ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
 }
 EXPORT_SYMBOL_GPL(nvme_sec_submit);
 #endif /* CONFIG_BLK_SED_OPAL */
@@ -1881,6 +1950,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
        return ret;
 }
 
+static int nvme_configure_acre(struct nvme_ctrl *ctrl)
+{
+       struct nvme_feat_host_behavior *host;
+       int ret;
+
+       /* Don't bother enabling the feature if retry delay is not reported */
+       if (!ctrl->crdt[0])
+               return 0;
+
+       host = kzalloc(sizeof(*host), GFP_KERNEL);
+       if (!host)
+               return 0;
+
+       host->acre = NVME_ENABLE_ACRE;
+       ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
+                               host, sizeof(*host), NULL);
+       kfree(host);
+       return ret;
+}
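ACRE (Advanced Command Retry Enable) is switched on through the Host Behavior
Support feature; until the host enables it, the controller should not report
nonzero CRD values, which is why the function bails out early when crdt1 is
zero. A sketch of the feature payload as the kernel defines it (a full
512-byte feature page):

	struct nvme_feat_host_behavior {
		__u8	acre;		/* NVME_ENABLE_ACRE == 1 */
		__u8	resv1[511];
	};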
+
 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
 {
        /*
@@ -2084,18 +2173,20 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct
        size_t nqnlen;
        int off;
 
-       nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
-       if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
-               strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
-               return;
-       }
+       if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
+               nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
+               if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
+                       strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
+                       return;
+               }
 
-       if (ctrl->vs >= NVME_VS(1, 2, 1))
-               dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
+               if (ctrl->vs >= NVME_VS(1, 2, 1))
+                       dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
+       }
 
        /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
        off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
-                       "nqn.2014.08.org.nvmexpress:%4x%4x",
+                       "nqn.2014.08.org.nvmexpress:%04x%04x",
                        le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
        memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
        off += sizeof(id->sn);
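The %04x fix matters for small IDs: %4x pads with spaces, embedding blanks in
the generated NQN, while %04x zero-pads as the fixed-width format requires.
Illustrative values:

	snprintf(buf, len, "...:%4x%4x",   0x2a, 0x1);	/* "...:  2a   1" — broken */
	snprintf(buf, len, "...:%04x%04x", 0x2a, 0x1);	/* "...:002a0001" — valid  */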
@@ -2402,12 +2493,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
                ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
        }
 
+       ctrl->crdt[0] = le16_to_cpu(id->crdt1);
+       ctrl->crdt[1] = le16_to_cpu(id->crdt2);
+       ctrl->crdt[2] = le16_to_cpu(id->crdt3);
+
        ctrl->oacs = le16_to_cpu(id->oacs);
        ctrl->oncs = le16_to_cpup(&id->oncs);
        ctrl->oaes = le32_to_cpu(id->oaes);
        atomic_set(&ctrl->abort_limit, id->acl + 1);
        ctrl->vwc = id->vwc;
-       ctrl->cntlid = le16_to_cpup(&id->cntlid);
        if (id->mdts)
                max_hw_sectors = 1 << (id->mdts + page_shift - 9);
        else
@@ -2419,6 +2513,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
        ctrl->sgls = le32_to_cpu(id->sgls);
        ctrl->kas = le16_to_cpu(id->kas);
        ctrl->max_namespaces = le32_to_cpu(id->mnan);
+       ctrl->ctratt = le32_to_cpu(id->ctratt);
 
        if (id->rtd3e) {
                /* us -> s */
@@ -2501,6 +2596,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
        if (ret < 0)
                return ret;
 
+       ret = nvme_configure_acre(ctrl);
+       if (ret < 0)
+               return ret;
+
        ctrl->identified = true;
 
        return 0;
@@ -2776,6 +2875,7 @@ static ssize_t  field##_show(struct device *dev,                          \
 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
 
 nvme_show_int_function(cntlid);
+nvme_show_int_function(numa_node);
 
 static ssize_t nvme_sysfs_delete(struct device *dev,
                                struct device_attribute *attr, const char *buf,
@@ -2855,6 +2955,7 @@ static struct attribute *nvme_dev_attrs[] = {
        &dev_attr_subsysnqn.attr,
        &dev_attr_address.attr,
        &dev_attr_state.attr,
+       &dev_attr_numa_node.attr,
        NULL
 };
 
@@ -3065,7 +3166,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        struct gendisk *disk;
        struct nvme_id_ns *id;
        char disk_name[DISK_NAME_LEN];
-       int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+       int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
 
        ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
        if (!ns)
@@ -3100,13 +3201,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        nvme_setup_streams_ns(ctrl, ns);
        nvme_set_disk_name(disk_name, ns, ctrl, &flags);
 
-       if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
-               if (nvme_nvm_register(ns, disk_name, node)) {
-                       dev_warn(ctrl->device, "LightNVM init failure\n");
-                       goto out_unlink_ns;
-               }
-       }
-
        disk = alloc_disk_node(0, node);
        if (!disk)
                goto out_unlink_ns;
@@ -3120,6 +3214,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
        __nvme_revalidate_disk(disk, id);
 
+       if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+               if (nvme_nvm_register(ns, disk_name, node)) {
+                       dev_warn(ctrl->device, "LightNVM init failure\n");
+                       goto out_put_disk;
+               }
+       }
+
        down_write(&ctrl->namespaces_rwsem);
        list_add_tail(&ns->list, &ctrl->namespaces);
        up_write(&ctrl->namespaces_rwsem);
@@ -3133,6 +3234,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
        kfree(id);
 
        return;
+ out_put_disk:
+       put_disk(ns->disk);
  out_unlink_ns:
        mutex_lock(&ctrl->subsys->lock);
        list_del_rcu(&ns->siblings);
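Moving the LightNVM registration after __nvme_revalidate_disk() lets it run
against the revalidated namespace state, and since the gendisk now exists at
that point the failure path needs its own unwind step. The labels release
resources in reverse order of acquisition:

	/* unwind order (sketch) */
	out_put_disk:	put_disk(ns->disk);		/* undoes alloc_disk_node() */
	out_unlink_ns:	list_del_rcu(&ns->siblings);	/* undoes subsystem linkage */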
@@ -3522,6 +3625,7 @@ static void nvme_free_ctrl(struct device *dev)
        ida_simple_remove(&nvme_instance_ida, ctrl->instance);
        kfree(ctrl->effects);
        nvme_mpath_uninit(ctrl);
+       __free_page(ctrl->discard_page);
 
        if (subsys) {
                mutex_lock(&subsys->lock);
@@ -3562,6 +3666,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
        memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
        ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 
+       BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
+                       PAGE_SIZE);
+       ctrl->discard_page = alloc_page(GFP_KERNEL);
+       if (!ctrl->discard_page) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
        ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
        if (ret < 0)
                goto out;
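The BUILD_BUG_ON pins down the sizing assumption behind the single fallback
page: with the usual NVME_DSM_MAX_RANGES of 256 and a 16-byte
struct nvme_dsm_range, the worst-case discard payload is 256 * 16 = 4096
bytes, exactly one 4 KiB page, so the assertion can only fire on a
configuration with a smaller PAGE_SIZE.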
@@ -3599,6 +3711,8 @@ out_free_name:
 out_release_instance:
        ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 out:
+       if (ctrl->discard_page)
+               __free_page(ctrl->discard_page);
        return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -3746,7 +3860,7 @@ out:
        return result;
 }
 
-void nvme_core_exit(void)
+void __exit nvme_core_exit(void)
 {
        ida_destroy(&nvme_subsystems_ida);
        class_destroy(nvme_subsys_class);
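The __exit annotation allows nvme_core_exit() to be discarded entirely when
the NVMe core is built into the kernel, since built-in code can never be
unloaded; only modular builds keep the teardown path.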