drivers/nvme/host/rdma.c
1 /*
2  * NVMe over Fabrics RDMA host code.
3  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  */
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/module.h>
16 #include <linux/init.h>
17 #include <linux/slab.h>
18 #include <linux/err.h>
19 #include <linux/string.h>
20 #include <linux/atomic.h>
21 #include <linux/blk-mq.h>
22 #include <linux/types.h>
23 #include <linux/list.h>
24 #include <linux/mutex.h>
25 #include <linux/scatterlist.h>
26 #include <linux/nvme.h>
27 #include <asm/unaligned.h>
28
29 #include <rdma/ib_verbs.h>
30 #include <rdma/rdma_cm.h>
31 #include <linux/nvme-rdma.h>
32
33 #include "nvme.h"
34 #include "fabrics.h"
35
36
37 #define NVME_RDMA_CONNECT_TIMEOUT_MS    1000            /* 1 second */
38
39 #define NVME_RDMA_MAX_SEGMENT_SIZE      0xffffff        /* 24-bit SGL field */
40
41 #define NVME_RDMA_MAX_SEGMENTS          256
42
43 #define NVME_RDMA_MAX_INLINE_SEGMENTS   1
44
45 /*
46  * We handle AEN commands ourselves and don't even let the
47  * block layer know about them.
48  */
49 #define NVME_RDMA_NR_AEN_COMMANDS      1
50 #define NVME_RDMA_AQ_BLKMQ_DEPTH       \
51         (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
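/*
 * Note: the admin blk-mq queue depth above is the fabrics AQ depth minus
 * the slot reserved for the driver-issued AEN command, which sits outside
 * the blk-mq tag space (its command_id is NVME_RDMA_AQ_BLKMQ_DEPTH, see
 * nvme_rdma_submit_async_event).
 */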
52
53 struct nvme_rdma_device {
54         struct ib_device       *dev;
55         struct ib_pd           *pd;
56         struct kref             ref;
57         struct list_head        entry;
58 };
59
60 struct nvme_rdma_qe {
61         struct ib_cqe           cqe;
62         void                    *data;
63         u64                     dma;
64 };
65
66 struct nvme_rdma_queue;
67 struct nvme_rdma_request {
68         struct nvme_request     req;
69         struct ib_mr            *mr;
70         struct nvme_rdma_qe     sqe;
71         struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
72         u32                     num_sge;
73         int                     nents;
74         bool                    inline_data;
75         struct ib_reg_wr        reg_wr;
76         struct ib_cqe           reg_cqe;
77         struct nvme_rdma_queue  *queue;
78         struct sg_table         sg_table;
79         struct scatterlist      first_sgl[];
80 };
81
82 enum nvme_rdma_queue_flags {
83         NVME_RDMA_Q_CONNECTED = (1 << 0),
84         NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1),
85         NVME_RDMA_Q_DELETING = (1 << 2),
86         NVME_RDMA_Q_LIVE = (1 << 3),
87 };
88
89 struct nvme_rdma_queue {
90         struct nvme_rdma_qe     *rsp_ring;
91         u8                      sig_count;
92         int                     queue_size;
93         size_t                  cmnd_capsule_len;
94         struct nvme_rdma_ctrl   *ctrl;
95         struct nvme_rdma_device *device;
96         struct ib_cq            *ib_cq;
97         struct ib_qp            *qp;
98
99         unsigned long           flags;
100         struct rdma_cm_id       *cm_id;
101         int                     cm_error;
102         struct completion       cm_done;
103 };
104
105 struct nvme_rdma_ctrl {
106         /* read and written in the hot path */
107         spinlock_t              lock;
108
109         /* read only in the hot path */
110         struct nvme_rdma_queue  *queues;
111         u32                     queue_count;
112
113         /* other member variables */
114         struct blk_mq_tag_set   tag_set;
115         struct work_struct      delete_work;
116         struct work_struct      reset_work;
117         struct work_struct      err_work;
118
119         struct nvme_rdma_qe     async_event_sqe;
120
121         int                     reconnect_delay;
122         struct delayed_work     reconnect_work;
123
124         struct list_head        list;
125
126         struct blk_mq_tag_set   admin_tag_set;
127         struct nvme_rdma_device *device;
128
129         u64                     cap;
130         u32                     max_fr_pages;
131
132         union {
133                 struct sockaddr addr;
134                 struct sockaddr_in addr_in;
135         };
136         union {
137                 struct sockaddr src_addr;
138                 struct sockaddr_in src_addr_in;
139         };
140
141         struct nvme_ctrl        ctrl;
142 };
143
144 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
145 {
146         return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
147 }
148
149 static LIST_HEAD(device_list);
150 static DEFINE_MUTEX(device_list_mutex);
151
152 static LIST_HEAD(nvme_rdma_ctrl_list);
153 static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
154
155 static struct workqueue_struct *nvme_rdma_wq;
156
157 /*
158  * Disabling this option makes small I/O go faster, but is fundamentally
159  * unsafe.  With it turned off we will have to register a global rkey that
160  * allows read and write access to all physical memory.
161  */
162 static bool register_always = true;
163 module_param(register_always, bool, 0444);
164 MODULE_PARM_DESC(register_always,
165          "Use memory registration even for contiguous memory regions");
166
167 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
168                 struct rdma_cm_event *event);
169 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
170
171 /* XXX: really should move to a generic header sooner or later.. */
172 static inline void put_unaligned_le24(u32 val, u8 *p)
173 {
174         *p++ = val;
175         *p++ = val >> 8;
176         *p++ = val >> 16;
177 }
178
179 static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
180 {
181         return queue - queue->ctrl->queues;
182 }
183
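/*
 * Amount of data that can be carried inline in the command capsule:
 * whatever the capsule holds beyond the 64-byte SQE itself.
 */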
184 static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
185 {
186         return queue->cmnd_capsule_len - sizeof(struct nvme_command);
187 }
188
189 static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
190                 size_t capsule_size, enum dma_data_direction dir)
191 {
192         ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
193         kfree(qe->data);
194 }
195
196 static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
197                 size_t capsule_size, enum dma_data_direction dir)
198 {
199         qe->data = kzalloc(capsule_size, GFP_KERNEL);
200         if (!qe->data)
201                 return -ENOMEM;
202
203         qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
204         if (ib_dma_mapping_error(ibdev, qe->dma)) {
205                 kfree(qe->data);
206                 return -ENOMEM;
207         }
208
209         return 0;
210 }
211
212 static void nvme_rdma_free_ring(struct ib_device *ibdev,
213                 struct nvme_rdma_qe *ring, size_t ib_queue_size,
214                 size_t capsule_size, enum dma_data_direction dir)
215 {
216         int i;
217
218         for (i = 0; i < ib_queue_size; i++)
219                 nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
220         kfree(ring);
221 }
222
223 static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
224                 size_t ib_queue_size, size_t capsule_size,
225                 enum dma_data_direction dir)
226 {
227         struct nvme_rdma_qe *ring;
228         int i;
229
230         ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
231         if (!ring)
232                 return NULL;
233
234         for (i = 0; i < ib_queue_size; i++) {
235                 if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
236                         goto out_free_ring;
237         }
238
239         return ring;
240
241 out_free_ring:
242         nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
243         return NULL;
244 }
245
246 static void nvme_rdma_qp_event(struct ib_event *event, void *context)
247 {
248         pr_debug("QP event %s (%d)\n",
249                  ib_event_msg(event->event), event->event);
250
251 }
252
253 static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
254 {
255         wait_for_completion_interruptible_timeout(&queue->cm_done,
256                         msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
257         return queue->cm_error;
258 }
259
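/*
 * Create the RC QP for a queue.  The send queue is sized as "factor"
 * work requests per command (see send_wr_factor in
 * nvme_rdma_create_queue_ib: MR registration, SEND and LOCAL_INV),
 * plus one extra WR so the queue can be drained.
 */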
260 static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
261 {
262         struct nvme_rdma_device *dev = queue->device;
263         struct ib_qp_init_attr init_attr;
264         int ret;
265
266         memset(&init_attr, 0, sizeof(init_attr));
267         init_attr.event_handler = nvme_rdma_qp_event;
268         /* +1 for drain */
269         init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
270         /* +1 for drain */
271         init_attr.cap.max_recv_wr = queue->queue_size + 1;
272         init_attr.cap.max_recv_sge = 1;
273         init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
274         init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
275         init_attr.qp_type = IB_QPT_RC;
276         init_attr.send_cq = queue->ib_cq;
277         init_attr.recv_cq = queue->ib_cq;
278
279         ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
280
281         queue->qp = queue->cm_id->qp;
282         return ret;
283 }
284
285 static int nvme_rdma_reinit_request(void *data, struct request *rq)
286 {
287         struct nvme_rdma_ctrl *ctrl = data;
288         struct nvme_rdma_device *dev = ctrl->device;
289         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
290         int ret = 0;
291
292         if (!req->mr->need_inval)
293                 goto out;
294
295         ib_dereg_mr(req->mr);
296
297         req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
298                         ctrl->max_fr_pages);
299         if (IS_ERR(req->mr)) {
300                 ret = PTR_ERR(req->mr);
301                 req->mr = NULL;
302                 goto out;
303         }
304
305         req->mr->need_inval = false;
306
307 out:
308         return ret;
309 }
310
311 static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl,
312                 struct request *rq, unsigned int queue_idx)
313 {
314         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
315         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
316         struct nvme_rdma_device *dev = queue->device;
317
318         if (req->mr)
319                 ib_dereg_mr(req->mr);
320
321         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
322                         DMA_TO_DEVICE);
323 }
324
325 static void nvme_rdma_exit_request(void *data, struct request *rq,
326                                 unsigned int hctx_idx, unsigned int rq_idx)
327 {
328         return __nvme_rdma_exit_request(data, rq, hctx_idx + 1);
329 }
330
331 static void nvme_rdma_exit_admin_request(void *data, struct request *rq,
332                                 unsigned int hctx_idx, unsigned int rq_idx)
333 {
334         return __nvme_rdma_exit_request(data, rq, 0);
335 }
336
337 static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl,
338                 struct request *rq, unsigned int queue_idx)
339 {
340         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
341         struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
342         struct nvme_rdma_device *dev = queue->device;
343         struct ib_device *ibdev = dev->dev;
344         int ret;
345
346         ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
347                         DMA_TO_DEVICE);
348         if (ret)
349                 return ret;
350
351         req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
352                         ctrl->max_fr_pages);
353         if (IS_ERR(req->mr)) {
354                 ret = PTR_ERR(req->mr);
355                 goto out_free_qe;
356         }
357
358         req->queue = queue;
359
360         return 0;
361
362 out_free_qe:
363         nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
364                         DMA_TO_DEVICE);
365         return ret;
366 }
367
368 static int nvme_rdma_init_request(void *data, struct request *rq,
369                                 unsigned int hctx_idx, unsigned int rq_idx,
370                                 unsigned int numa_node)
371 {
372         return __nvme_rdma_init_request(data, rq, hctx_idx + 1);
373 }
374
375 static int nvme_rdma_init_admin_request(void *data, struct request *rq,
376                                 unsigned int hctx_idx, unsigned int rq_idx,
377                                 unsigned int numa_node)
378 {
379         return __nvme_rdma_init_request(data, rq, 0);
380 }
381
382 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
383                 unsigned int hctx_idx)
384 {
385         struct nvme_rdma_ctrl *ctrl = data;
386         struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
387
388         BUG_ON(hctx_idx >= ctrl->queue_count);
389
390         hctx->driver_data = queue;
391         return 0;
392 }
393
394 static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
395                 unsigned int hctx_idx)
396 {
397         struct nvme_rdma_ctrl *ctrl = data;
398         struct nvme_rdma_queue *queue = &ctrl->queues[0];
399
400         BUG_ON(hctx_idx != 0);
401
402         hctx->driver_data = queue;
403         return 0;
404 }
405
406 static void nvme_rdma_free_dev(struct kref *ref)
407 {
408         struct nvme_rdma_device *ndev =
409                 container_of(ref, struct nvme_rdma_device, ref);
410
411         mutex_lock(&device_list_mutex);
412         list_del(&ndev->entry);
413         mutex_unlock(&device_list_mutex);
414
415         ib_dealloc_pd(ndev->pd);
416         kfree(ndev);
417 }
418
419 static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
420 {
421         kref_put(&dev->ref, nvme_rdma_free_dev);
422 }
423
424 static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
425 {
426         return kref_get_unless_zero(&dev->ref);
427 }
428
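/*
 * Find the nvme_rdma_device (shared PD + refcount) for this CM ID's
 * ib_device, matched by node GUID, or allocate a new one.  Devices that
 * lack IB_DEVICE_MEM_MGT_EXTENSIONS are rejected since we rely on
 * fast registration MRs.
 */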
429 static struct nvme_rdma_device *
430 nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
431 {
432         struct nvme_rdma_device *ndev;
433
434         mutex_lock(&device_list_mutex);
435         list_for_each_entry(ndev, &device_list, entry) {
436                 if (ndev->dev->node_guid == cm_id->device->node_guid &&
437                     nvme_rdma_dev_get(ndev))
438                         goto out_unlock;
439         }
440
441         ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
442         if (!ndev)
443                 goto out_err;
444
445         ndev->dev = cm_id->device;
446         kref_init(&ndev->ref);
447
448         ndev->pd = ib_alloc_pd(ndev->dev,
449                 register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
450         if (IS_ERR(ndev->pd))
451                 goto out_free_dev;
452
453         if (!(ndev->dev->attrs.device_cap_flags &
454               IB_DEVICE_MEM_MGT_EXTENSIONS)) {
455                 dev_err(&ndev->dev->dev,
456                         "Memory registrations not supported.\n");
457                 goto out_free_pd;
458         }
459
460         list_add(&ndev->entry, &device_list);
461 out_unlock:
462         mutex_unlock(&device_list_mutex);
463         return ndev;
464
465 out_free_pd:
466         ib_dealloc_pd(ndev->pd);
467 out_free_dev:
468         kfree(ndev);
469 out_err:
470         mutex_unlock(&device_list_mutex);
471         return NULL;
472 }
473
474 static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
475 {
476         struct nvme_rdma_device *dev;
477         struct ib_device *ibdev;
478
479         if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags))
480                 return;
481
482         dev = queue->device;
483         ibdev = dev->dev;
484         rdma_destroy_qp(queue->cm_id);
485         ib_free_cq(queue->ib_cq);
486
487         nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
488                         sizeof(struct nvme_completion), DMA_FROM_DEVICE);
489
490         nvme_rdma_dev_put(dev);
491 }
492
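/*
 * Allocate the per-queue IB resources: a CQ sized for the send and
 * receive work requests, the QP, and the ring of response buffers that
 * get posted as receives once the connection is established.
 */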
493 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue,
494                 struct nvme_rdma_device *dev)
495 {
496         struct ib_device *ibdev = dev->dev;
497         const int send_wr_factor = 3;                   /* MR, SEND, INV */
498         const int cq_factor = send_wr_factor + 1;       /* + RECV */
499         int comp_vector, idx = nvme_rdma_queue_idx(queue);
500
501         int ret;
502
503         queue->device = dev;
504
505         /*
506          * The admin queue is barely used once the controller is live, so don't
507          * bother to spread it out.
508          */
509         if (idx == 0)
510                 comp_vector = 0;
511         else
512                 comp_vector = idx % ibdev->num_comp_vectors;
513
514
515         /* +1 for drain */
516         queue->ib_cq = ib_alloc_cq(dev->dev, queue,
517                                 cq_factor * queue->queue_size + 1, comp_vector,
518                                 IB_POLL_SOFTIRQ);
519         if (IS_ERR(queue->ib_cq)) {
520                 ret = PTR_ERR(queue->ib_cq);
521                 goto out;
522         }
523
524         ret = nvme_rdma_create_qp(queue, send_wr_factor);
525         if (ret)
526                 goto out_destroy_ib_cq;
527
528         queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
529                         sizeof(struct nvme_completion), DMA_FROM_DEVICE);
530         if (!queue->rsp_ring) {
531                 ret = -ENOMEM;
532                 goto out_destroy_qp;
533         }
534         set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags);
535
536         return 0;
537
538 out_destroy_qp:
539         ib_destroy_qp(queue->qp);
540 out_destroy_ib_cq:
541         ib_free_cq(queue->ib_cq);
542 out:
543         return ret;
544 }
545
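/*
 * Set up one host queue: create the RDMA CM ID and kick off address
 * resolution; the rest of the setup (route resolution, IB resource
 * allocation, rdma_connect) runs from nvme_rdma_cm_handler, and we wait
 * here for the CM state machine to finish.
 */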
546 static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
547                 int idx, size_t queue_size)
548 {
549         struct nvme_rdma_queue *queue;
550         struct sockaddr *src_addr = NULL;
551         int ret;
552
553         queue = &ctrl->queues[idx];
554         queue->ctrl = ctrl;
555         init_completion(&queue->cm_done);
556
557         if (idx > 0)
558                 queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
559         else
560                 queue->cmnd_capsule_len = sizeof(struct nvme_command);
561
562         queue->queue_size = queue_size;
563
564         queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
565                         RDMA_PS_TCP, IB_QPT_RC);
566         if (IS_ERR(queue->cm_id)) {
567                 dev_info(ctrl->ctrl.device,
568                         "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
569                 return PTR_ERR(queue->cm_id);
570         }
571
572         queue->cm_error = -ETIMEDOUT;
573         if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
574                 src_addr = &ctrl->src_addr;
575
576         ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr,
577                         NVME_RDMA_CONNECT_TIMEOUT_MS);
578         if (ret) {
579                 dev_info(ctrl->ctrl.device,
580                         "rdma_resolve_addr failed (%d).\n", ret);
581                 goto out_destroy_cm_id;
582         }
583
584         ret = nvme_rdma_wait_for_cm(queue);
585         if (ret) {
586                 dev_info(ctrl->ctrl.device,
587                         "rdma_resolve_addr wait failed (%d).\n", ret);
588                 goto out_destroy_cm_id;
589         }
590
591         clear_bit(NVME_RDMA_Q_DELETING, &queue->flags);
592         set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags);
593
594         return 0;
595
596 out_destroy_cm_id:
597         nvme_rdma_destroy_queue_ib(queue);
598         rdma_destroy_id(queue->cm_id);
599         return ret;
600 }
601
602 static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
603 {
604         rdma_disconnect(queue->cm_id);
605         ib_drain_qp(queue->qp);
606 }
607
608 static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
609 {
610         nvme_rdma_destroy_queue_ib(queue);
611         rdma_destroy_id(queue->cm_id);
612 }
613
614 static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue)
615 {
616         if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags))
617                 return;
618         nvme_rdma_stop_queue(queue);
619         nvme_rdma_free_queue(queue);
620 }
621
622 static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
623 {
624         int i;
625
626         for (i = 1; i < ctrl->queue_count; i++)
627                 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
628 }
629
630 static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl)
631 {
632         int i, ret = 0;
633
634         for (i = 1; i < ctrl->queue_count; i++) {
635                 ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
636                 if (ret) {
637                         dev_info(ctrl->ctrl.device,
638                                 "failed to connect i/o queue: %d\n", ret);
639                         goto out_free_queues;
640                 }
641                 set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags);
642         }
643
644         return 0;
645
646 out_free_queues:
647         nvme_rdma_free_io_queues(ctrl);
648         return ret;
649 }
650
651 static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
652 {
653         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
654         unsigned int nr_io_queues;
655         int i, ret;
656
657         nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
658         ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
659         if (ret)
660                 return ret;
661
662         ctrl->queue_count = nr_io_queues + 1;
663         if (ctrl->queue_count < 2)
664                 return 0;
665
666         dev_info(ctrl->ctrl.device,
667                 "creating %d I/O queues.\n", nr_io_queues);
668
669         for (i = 1; i < ctrl->queue_count; i++) {
670                 ret = nvme_rdma_init_queue(ctrl, i,
671                                            ctrl->ctrl.opts->queue_size);
672                 if (ret) {
673                         dev_info(ctrl->ctrl.device,
674                                 "failed to initialize i/o queue: %d\n", ret);
675                         goto out_free_queues;
676                 }
677         }
678
679         return 0;
680
681 out_free_queues:
682         for (i--; i >= 1; i--)
683                 nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
684
685         return ret;
686 }
687
688 static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
689 {
690         nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe,
691                         sizeof(struct nvme_command), DMA_TO_DEVICE);
692         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
693         blk_cleanup_queue(ctrl->ctrl.admin_q);
694         blk_mq_free_tag_set(&ctrl->admin_tag_set);
695         nvme_rdma_dev_put(ctrl->device);
696 }
697
698 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
699 {
700         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
701
702         if (list_empty(&ctrl->list))
703                 goto free_ctrl;
704
705         mutex_lock(&nvme_rdma_ctrl_mutex);
706         list_del(&ctrl->list);
707         mutex_unlock(&nvme_rdma_ctrl_mutex);
708
709         kfree(ctrl->queues);
710         nvmf_free_options(nctrl->opts);
711 free_ctrl:
712         kfree(ctrl);
713 }
714
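/*
 * Periodic reconnect worker: tear down and re-create the admin (and, if
 * present, I/O) queues, re-issue the fabrics connect commands, and move
 * the controller back to LIVE.  On failure the work re-queues itself
 * after reconnect_delay seconds, as long as we are still RECONNECTING.
 */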
715 static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
716 {
717         struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
718                         struct nvme_rdma_ctrl, reconnect_work);
719         bool changed;
720         int ret;
721
722         if (ctrl->queue_count > 1) {
723                 nvme_rdma_free_io_queues(ctrl);
724
725                 ret = blk_mq_reinit_tagset(&ctrl->tag_set);
726                 if (ret)
727                         goto requeue;
728         }
729
730         nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
731
732         ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
733         if (ret)
734                 goto requeue;
735
736         ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
737         if (ret)
738                 goto requeue;
739
740         blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
741
742         ret = nvmf_connect_admin_queue(&ctrl->ctrl);
743         if (ret)
744                 goto stop_admin_q;
745
746         set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
747
748         ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
749         if (ret)
750                 goto stop_admin_q;
751
752         nvme_start_keep_alive(&ctrl->ctrl);
753
754         if (ctrl->queue_count > 1) {
755                 ret = nvme_rdma_init_io_queues(ctrl);
756                 if (ret)
757                         goto stop_admin_q;
758
759                 ret = nvme_rdma_connect_io_queues(ctrl);
760                 if (ret)
761                         goto stop_admin_q;
762         }
763
764         changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
765         WARN_ON_ONCE(!changed);
766
767         if (ctrl->queue_count > 1) {
768                 nvme_start_queues(&ctrl->ctrl);
769                 nvme_queue_scan(&ctrl->ctrl);
770                 nvme_queue_async_events(&ctrl->ctrl);
771         }
772
773         dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
774
775         return;
776
777 stop_admin_q:
778         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
779 requeue:
780         /* Make sure we are not resetting/deleting */
781         if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
782                 dev_info(ctrl->ctrl.device,
783                         "Failed reconnect attempt, requeueing...\n");
784                 queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
785                                         ctrl->reconnect_delay * HZ);
786         }
787 }
788
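/*
 * Error recovery: mark every queue as no longer connected/live, quiesce
 * the block layer queues, cancel all in-flight requests, and schedule a
 * reconnect attempt after reconnect_delay seconds.
 */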
789 static void nvme_rdma_error_recovery_work(struct work_struct *work)
790 {
791         struct nvme_rdma_ctrl *ctrl = container_of(work,
792                         struct nvme_rdma_ctrl, err_work);
793         int i;
794
795         nvme_stop_keep_alive(&ctrl->ctrl);
796
797         for (i = 0; i < ctrl->queue_count; i++) {
798                 clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags);
799                 clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags);
800         }
801
802         if (ctrl->queue_count > 1)
803                 nvme_stop_queues(&ctrl->ctrl);
804         blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
805
806         /* Fast-fail or requeue all our in-flight requests */
807         if (ctrl->queue_count > 1)
808                 blk_mq_tagset_busy_iter(&ctrl->tag_set,
809                                         nvme_cancel_request, &ctrl->ctrl);
810         blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
811                                 nvme_cancel_request, &ctrl->ctrl);
812
813         dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
814                 ctrl->reconnect_delay);
815
816         queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
817                                 ctrl->reconnect_delay * HZ);
818 }
819
820 static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
821 {
822         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
823                 return;
824
825         queue_work(nvme_rdma_wq, &ctrl->err_work);
826 }
827
828 static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
829                 const char *op)
830 {
831         struct nvme_rdma_queue *queue = cq->cq_context;
832         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
833
834         if (ctrl->ctrl.state == NVME_CTRL_LIVE)
835                 dev_info(ctrl->ctrl.device,
836                              "%s for CQE 0x%p failed with status %s (%d)\n",
837                              op, wc->wr_cqe,
838                              ib_wc_status_msg(wc->status), wc->status);
839         nvme_rdma_error_recovery(ctrl);
840 }
841
842 static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
843 {
844         if (unlikely(wc->status != IB_WC_SUCCESS))
845                 nvme_rdma_wr_error(cq, wc, "MEMREG");
846 }
847
848 static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
849 {
850         if (unlikely(wc->status != IB_WC_SUCCESS))
851                 nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
852 }
853
854 static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
855                 struct nvme_rdma_request *req)
856 {
857         struct ib_send_wr *bad_wr;
858         struct ib_send_wr wr = {
859                 .opcode             = IB_WR_LOCAL_INV,
860                 .next               = NULL,
861                 .num_sge            = 0,
862                 .send_flags         = 0,
863                 .ex.invalidate_rkey = req->mr->rkey,
864         };
865
866         req->reg_cqe.done = nvme_rdma_inv_rkey_done;
867         wr.wr_cqe = &req->reg_cqe;
868
869         return ib_post_send(queue->qp, &wr, &bad_wr);
870 }
871
872 static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
873                 struct request *rq)
874 {
875         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
876         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
877         struct nvme_rdma_device *dev = queue->device;
878         struct ib_device *ibdev = dev->dev;
879         int res;
880
881         if (!blk_rq_bytes(rq))
882                 return;
883
884         if (req->mr->need_inval) {
885                 res = nvme_rdma_inv_rkey(queue, req);
886                 if (res < 0) {
887                         dev_err(ctrl->ctrl.device,
888                                 "Queueing INV WR for rkey %#x failed (%d)\n",
889                                 req->mr->rkey, res);
890                         nvme_rdma_error_recovery(queue->ctrl);
891                 }
892         }
893
894         ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
895                         req->nents, rq_data_dir(rq) ==
896                                     WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
897
898         nvme_cleanup_cmd(rq);
899         sg_free_table_chained(&req->sg_table, true);
900 }
901
902 static int nvme_rdma_set_sg_null(struct nvme_command *c)
903 {
904         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
905
906         sg->addr = 0;
907         put_unaligned_le24(0, sg->length);
908         put_unaligned_le32(0, sg->key);
909         sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
910         return 0;
911 }
912
913 static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
914                 struct nvme_rdma_request *req, struct nvme_command *c)
915 {
916         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
917
918         req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
919         req->sge[1].length = sg_dma_len(req->sg_table.sgl);
920         req->sge[1].lkey = queue->device->pd->local_dma_lkey;
921
922         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
923         sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
924         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
925
926         req->inline_data = true;
927         req->num_sge++;
928         return 0;
929 }
930
931 static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
932                 struct nvme_rdma_request *req, struct nvme_command *c)
933 {
934         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
935
936         sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
937         put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
938         put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
939         sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
940         return 0;
941 }
942
943 static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
944                 struct nvme_rdma_request *req, struct nvme_command *c,
945                 int count)
946 {
947         struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
948         int nr;
949
950         nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
951         if (nr < count) {
952                 if (nr < 0)
953                         return nr;
954                 return -EINVAL;
955         }
956
957         ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
958
959         req->reg_cqe.done = nvme_rdma_memreg_done;
960         memset(&req->reg_wr, 0, sizeof(req->reg_wr));
961         req->reg_wr.wr.opcode = IB_WR_REG_MR;
962         req->reg_wr.wr.wr_cqe = &req->reg_cqe;
963         req->reg_wr.wr.num_sge = 0;
964         req->reg_wr.mr = req->mr;
965         req->reg_wr.key = req->mr->rkey;
966         req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
967                              IB_ACCESS_REMOTE_READ |
968                              IB_ACCESS_REMOTE_WRITE;
969
970         req->mr->need_inval = true;
971
972         sg->addr = cpu_to_le64(req->mr->iova);
973         put_unaligned_le24(req->mr->length, sg->length);
974         put_unaligned_le32(req->mr->rkey, sg->key);
975         sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
976                         NVME_SGL_FMT_INVALIDATE;
977
978         return 0;
979 }
980
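/*
 * Build the command's data descriptor.  Requests without data get a NULL
 * SGL; small single-segment writes on I/O queues are sent inline in the
 * capsule; a single segment can also use the unsafe global rkey when
 * registration is disabled; everything else goes through a fast
 * registration MR.
 */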
981 static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
982                 struct request *rq, struct nvme_command *c)
983 {
984         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
985         struct nvme_rdma_device *dev = queue->device;
986         struct ib_device *ibdev = dev->dev;
987         int count, ret;
988
989         req->num_sge = 1;
990         req->inline_data = false;
991         req->mr->need_inval = false;
992
993         c->common.flags |= NVME_CMD_SGL_METABUF;
994
995         if (!blk_rq_bytes(rq))
996                 return nvme_rdma_set_sg_null(c);
997
998         req->sg_table.sgl = req->first_sgl;
999         ret = sg_alloc_table_chained(&req->sg_table,
1000                         blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
1001         if (ret)
1002                 return -ENOMEM;
1003
1004         req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
1005
1006         count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
1007                     rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1008         if (unlikely(count <= 0)) {
1009                 sg_free_table_chained(&req->sg_table, true);
1010                 return -EIO;
1011         }
1012
1013         if (count == 1) {
1014                 if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
1015                     blk_rq_payload_bytes(rq) <=
1016                                 nvme_rdma_inline_data_size(queue))
1017                         return nvme_rdma_map_sg_inline(queue, req, c);
1018
1019                 if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
1020                         return nvme_rdma_map_sg_single(queue, req, c);
1021         }
1022
1023         return nvme_rdma_map_sg_fr(queue, req, c, count);
1024 }
1025
1026 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1027 {
1028         if (unlikely(wc->status != IB_WC_SUCCESS))
1029                 nvme_rdma_wr_error(cq, wc, "SEND");
1030 }
1031
1032 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1033                 struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1034                 struct ib_send_wr *first, bool flush)
1035 {
1036         struct ib_send_wr wr, *bad_wr;
1037         int ret;
1038
1039         sge->addr   = qe->dma;
1040         sge->length = sizeof(struct nvme_command);
1041         sge->lkey   = queue->device->pd->local_dma_lkey;
1042
1043         qe->cqe.done = nvme_rdma_send_done;
1044
1045         wr.next       = NULL;
1046         wr.wr_cqe     = &qe->cqe;
1047         wr.sg_list    = sge;
1048         wr.num_sge    = num_sge;
1049         wr.opcode     = IB_WR_SEND;
1050         wr.send_flags = 0;
1051
1052         /*
1053          * Unsignalled send completions are another giant disaster in the
1054          * IB Verbs spec:  If we don't regularly post signalled sends
1055          * the send queue will fill up and only a QP reset will rescue us.
1056          * It would have been way too obvious to handle this in hardware or
1057          * at least in the RDMA stack...
1058          *
1059          * This messy and racy code snippet is copied and pasted from the iSER
1060          * initiator, and the magic '32' comes from there as well.
1061          *
1062          * Always signal the flushes. The magic request used for the flush
1063          * sequencer is not allocated in our driver's tagset and it's
1064          * triggered to be freed by blk_cleanup_queue(). So we need to
1065          * always mark it as signaled to ensure that the "wr_cqe", which is
1066          * embedded in request's payload, is not freed when __ib_process_cq()
1067          * calls wr_cqe->done().
1068          */
1069         if ((++queue->sig_count % 32) == 0 || flush)
1070                 wr.send_flags |= IB_SEND_SIGNALED;
1071
1072         if (first)
1073                 first->next = &wr;
1074         else
1075                 first = &wr;
1076
1077         ret = ib_post_send(queue->qp, first, &bad_wr);
1078         if (ret) {
1079                 dev_err(queue->ctrl->ctrl.device,
1080                              "%s failed with error code %d\n", __func__, ret);
1081         }
1082         return ret;
1083 }
1084
1085 static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1086                 struct nvme_rdma_qe *qe)
1087 {
1088         struct ib_recv_wr wr, *bad_wr;
1089         struct ib_sge list;
1090         int ret;
1091
1092         list.addr   = qe->dma;
1093         list.length = sizeof(struct nvme_completion);
1094         list.lkey   = queue->device->pd->local_dma_lkey;
1095
1096         qe->cqe.done = nvme_rdma_recv_done;
1097
1098         wr.next     = NULL;
1099         wr.wr_cqe   = &qe->cqe;
1100         wr.sg_list  = &list;
1101         wr.num_sge  = 1;
1102
1103         ret = ib_post_recv(queue->qp, &wr, &bad_wr);
1104         if (ret) {
1105                 dev_err(queue->ctrl->ctrl.device,
1106                         "%s failed with error code %d\n", __func__, ret);
1107         }
1108         return ret;
1109 }
1110
1111 static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1112 {
1113         u32 queue_idx = nvme_rdma_queue_idx(queue);
1114
1115         if (queue_idx == 0)
1116                 return queue->ctrl->admin_tag_set.tags[queue_idx];
1117         return queue->ctrl->tag_set.tags[queue_idx - 1];
1118 }
1119
1120 static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
1121 {
1122         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1123         struct nvme_rdma_queue *queue = &ctrl->queues[0];
1124         struct ib_device *dev = queue->device->dev;
1125         struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1126         struct nvme_command *cmd = sqe->data;
1127         struct ib_sge sge;
1128         int ret;
1129
1130         if (WARN_ON_ONCE(aer_idx != 0))
1131                 return;
1132
1133         ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1134
1135         memset(cmd, 0, sizeof(*cmd));
1136         cmd->common.opcode = nvme_admin_async_event;
1137         cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH;
1138         cmd->common.flags |= NVME_CMD_SGL_METABUF;
1139         nvme_rdma_set_sg_null(cmd);
1140
1141         ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1142                         DMA_TO_DEVICE);
1143
1144         ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
1145         WARN_ON_ONCE(ret);
1146 }
1147
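/*
 * Handle a completion for a block layer request: look the request up by
 * command_id, note whether the completion carried a remote invalidation
 * of our rkey, and complete the request.  Returns 1 if the completed
 * request matches the tag being polled for.
 */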
1148 static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1149                 struct nvme_completion *cqe, struct ib_wc *wc, int tag)
1150 {
1151         struct request *rq;
1152         struct nvme_rdma_request *req;
1153         int ret = 0;
1154
1155         rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1156         if (!rq) {
1157                 dev_err(queue->ctrl->ctrl.device,
1158                         "tag 0x%x on QP %#x not found\n",
1159                         cqe->command_id, queue->qp->qp_num);
1160                 nvme_rdma_error_recovery(queue->ctrl);
1161                 return ret;
1162         }
1163         req = blk_mq_rq_to_pdu(rq);
1164
1165         if (rq->tag == tag)
1166                 ret = 1;
1167
1168         if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
1169             wc->ex.invalidate_rkey == req->mr->rkey)
1170                 req->mr->need_inval = false;
1171
1172         req->req.result = cqe->result;
1173         blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
1174         return ret;
1175 }
1176
1177 static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1178 {
1179         struct nvme_rdma_qe *qe =
1180                 container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1181         struct nvme_rdma_queue *queue = cq->cq_context;
1182         struct ib_device *ibdev = queue->device->dev;
1183         struct nvme_completion *cqe = qe->data;
1184         const size_t len = sizeof(struct nvme_completion);
1185         int ret = 0;
1186
1187         if (unlikely(wc->status != IB_WC_SUCCESS)) {
1188                 nvme_rdma_wr_error(cq, wc, "RECV");
1189                 return 0;
1190         }
1191
1192         ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1193         /*
1194          * AEN requests are special as they don't time out and can
1195          * survive any kind of queue freeze and often don't respond to
1196          * aborts.  We don't even bother to allocate a struct request
1197          * for them but rather special case them here.
1198          */
1199         if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1200                         cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
1201                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
1202                                 &cqe->result);
1203         else
1204                 ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
1205         ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1206
1207         nvme_rdma_post_recv(queue, qe);
1208         return ret;
1209 }
1210
1211 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1212 {
1213         __nvme_rdma_recv_done(cq, wc, -1);
1214 }
1215
1216 static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1217 {
1218         int ret, i;
1219
1220         for (i = 0; i < queue->queue_size; i++) {
1221                 ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1222                 if (ret)
1223                         goto out_destroy_queue_ib;
1224         }
1225
1226         return 0;
1227
1228 out_destroy_queue_ib:
1229         nvme_rdma_destroy_queue_ib(queue);
1230         return ret;
1231 }
1232
1233 static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1234                 struct rdma_cm_event *ev)
1235 {
1236         struct rdma_cm_id *cm_id = queue->cm_id;
1237         int status = ev->status;
1238         const char *rej_msg;
1239         const struct nvme_rdma_cm_rej *rej_data;
1240         u8 rej_data_len;
1241
1242         rej_msg = rdma_reject_msg(cm_id, status);
1243         rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len);
1244
1245         if (rej_data && rej_data_len >= sizeof(u16)) {
1246                 u16 sts = le16_to_cpu(rej_data->sts);
1247
1248                 dev_err(queue->ctrl->ctrl.device,
1249                       "Connect rejected: status %d (%s) nvme status %d (%s).\n",
1250                       status, rej_msg, sts, nvme_rdma_cm_msg(sts));
1251         } else {
1252                 dev_err(queue->ctrl->ctrl.device,
1253                         "Connect rejected: status %d (%s).\n", status, rej_msg);
1254         }
1255
1256         return -ECONNRESET;
1257 }
1258
1259 static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1260 {
1261         struct nvme_rdma_device *dev;
1262         int ret;
1263
1264         dev = nvme_rdma_find_get_device(queue->cm_id);
1265         if (!dev) {
1266                 dev_err(queue->cm_id->device->dev.parent,
1267                         "no client data found!\n");
1268                 return -ECONNREFUSED;
1269         }
1270
1271         ret = nvme_rdma_create_queue_ib(queue, dev);
1272         if (ret) {
1273                 nvme_rdma_dev_put(dev);
1274                 goto out;
1275         }
1276
1277         ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1278         if (ret) {
1279                 dev_err(queue->ctrl->ctrl.device,
1280                         "rdma_resolve_route failed (%d).\n",
1281                         queue->cm_error);
1282                 goto out_destroy_queue;
1283         }
1284
1285         return 0;
1286
1287 out_destroy_queue:
1288         nvme_rdma_destroy_queue_ib(queue);
1289 out:
1290         return ret;
1291 }
1292
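/*
 * Route is resolved: issue rdma_connect() with the NVMe/RDMA CM request
 * as private data, carrying the queue ID and the host's receive/send
 * queue sizes (fixed to the fabrics AQ depth for the admin queue).
 */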
1293 static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1294 {
1295         struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1296         struct rdma_conn_param param = { };
1297         struct nvme_rdma_cm_req priv = { };
1298         int ret;
1299
1300         param.qp_num = queue->qp->qp_num;
1301         param.flow_control = 1;
1302
1303         param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1304         /* maximum retry count */
1305         param.retry_count = 7;
1306         param.rnr_retry_count = 7;
1307         param.private_data = &priv;
1308         param.private_data_len = sizeof(priv);
1309
1310         priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1311         priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1312         /*
1313          * set the admin queue depth to the minimum size
1314          * specified by the Fabrics standard.
1315          */
1316         if (priv.qid == 0) {
1317                 priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH);
1318                 priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1);
1319         } else {
1320                 /*
1321                  * The current interpretation of the fabrics spec is that,
1322                  * at a minimum, hrqsize should be sqsize + 1, i.e. the 1's
1323                  * based representation of sqsize.
1324                  */
1325                 priv.hrqsize = cpu_to_le16(queue->queue_size);
1326                 priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
1327         }
1328
1329         ret = rdma_connect(queue->cm_id, &param);
1330         if (ret) {
1331                 dev_err(ctrl->ctrl.device,
1332                         "rdma_connect failed (%d).\n", ret);
1333                 goto out_destroy_queue_ib;
1334         }
1335
1336         return 0;
1337
1338 out_destroy_queue_ib:
1339         nvme_rdma_destroy_queue_ib(queue);
1340         return ret;
1341 }
1342
1343 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1344                 struct rdma_cm_event *ev)
1345 {
1346         struct nvme_rdma_queue *queue = cm_id->context;
1347         int cm_error = 0;
1348
1349         dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1350                 rdma_event_msg(ev->event), ev->event,
1351                 ev->status, cm_id);
1352
1353         switch (ev->event) {
1354         case RDMA_CM_EVENT_ADDR_RESOLVED:
1355                 cm_error = nvme_rdma_addr_resolved(queue);
1356                 break;
1357         case RDMA_CM_EVENT_ROUTE_RESOLVED:
1358                 cm_error = nvme_rdma_route_resolved(queue);
1359                 break;
1360         case RDMA_CM_EVENT_ESTABLISHED:
1361                 queue->cm_error = nvme_rdma_conn_established(queue);
1362                 /* complete cm_done regardless of success/failure */
1363                 complete(&queue->cm_done);
1364                 return 0;
1365         case RDMA_CM_EVENT_REJECTED:
1366                 cm_error = nvme_rdma_conn_rejected(queue, ev);
1367                 break;
1368         case RDMA_CM_EVENT_ADDR_ERROR:
1369         case RDMA_CM_EVENT_ROUTE_ERROR:
1370         case RDMA_CM_EVENT_CONNECT_ERROR:
1371         case RDMA_CM_EVENT_UNREACHABLE:
1372                 dev_dbg(queue->ctrl->ctrl.device,
1373                         "CM error event %d\n", ev->event);
1374                 cm_error = -ECONNRESET;
1375                 break;
1376         case RDMA_CM_EVENT_DISCONNECTED:
1377         case RDMA_CM_EVENT_ADDR_CHANGE:
1378         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1379                 dev_dbg(queue->ctrl->ctrl.device,
1380                         "disconnect received - connection closed\n");
1381                 nvme_rdma_error_recovery(queue->ctrl);
1382                 break;
1383         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1384                 /* device removal is handled via the ib_client API */
1385                 break;
1386         default:
1387                 dev_err(queue->ctrl->ctrl.device,
1388                         "Unexpected RDMA CM event (%d)\n", ev->event);
1389                 nvme_rdma_error_recovery(queue->ctrl);
1390                 break;
1391         }
1392
1393         if (cm_error) {
1394                 queue->cm_error = cm_error;
1395                 complete(&queue->cm_done);
1396         }
1397
1398         return 0;
1399 }
1400
1401 static enum blk_eh_timer_return
1402 nvme_rdma_timeout(struct request *rq, bool reserved)
1403 {
1404         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1405
1406         /* queue error recovery */
1407         nvme_rdma_error_recovery(req->queue->ctrl);
1408
1409         /* fail with DNR on cmd timeout */
1410         rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
1411
1412         return BLK_EH_HANDLED;
1413 }
1414
1415 /*
1416  * We cannot accept any other command until the Connect command has completed.
1417  */
1418 static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
1419                 struct request *rq)
1420 {
1421         if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
1422                 struct nvme_command *cmd = nvme_req(rq)->cmd;
1423
1424                 if (!blk_rq_is_passthrough(rq) ||
1425                     cmd->common.opcode != nvme_fabrics_command ||
1426                     cmd->fabrics.fctype != nvme_fabrics_type_connect)
1427                         return false;
1428         }
1429
1430         return true;
1431 }
1432
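/*
 * blk-mq .queue_rq handler: sync the pre-allocated SQE for CPU access,
 * build the NVMe command, map the request data, sync the SQE back for
 * the device, and post the SEND, chained behind an MR registration WR
 * when one is needed.
 */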
1433 static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1434                 const struct blk_mq_queue_data *bd)
1435 {
1436         struct nvme_ns *ns = hctx->queue->queuedata;
1437         struct nvme_rdma_queue *queue = hctx->driver_data;
1438         struct request *rq = bd->rq;
1439         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1440         struct nvme_rdma_qe *sqe = &req->sqe;
1441         struct nvme_command *c = sqe->data;
1442         bool flush = false;
1443         struct ib_device *dev;
1444         int ret;
1445
1446         WARN_ON_ONCE(rq->tag < 0);
1447
1448         if (!nvme_rdma_queue_is_ready(queue, rq))
1449                 return BLK_MQ_RQ_QUEUE_BUSY;
1450
1451         dev = queue->device->dev;
1452         ib_dma_sync_single_for_cpu(dev, sqe->dma,
1453                         sizeof(struct nvme_command), DMA_TO_DEVICE);
1454
1455         ret = nvme_setup_cmd(ns, rq, c);
1456         if (ret != BLK_MQ_RQ_QUEUE_OK)
1457                 return ret;
1458
1459         blk_mq_start_request(rq);
1460
1461         ret = nvme_rdma_map_data(queue, rq, c);
1462         if (ret < 0) {
1463                 dev_err(queue->ctrl->ctrl.device,
1464                              "Failed to map data (%d)\n", ret);
1465                 nvme_cleanup_cmd(rq);
1466                 goto err;
1467         }
1468
1469         ib_dma_sync_single_for_device(dev, sqe->dma,
1470                         sizeof(struct nvme_command), DMA_TO_DEVICE);
1471
1472         if (req_op(rq) == REQ_OP_FLUSH)
1473                 flush = true;
1474         ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1475                         req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
1476         if (ret) {
1477                 nvme_rdma_unmap_data(queue, rq);
1478                 goto err;
1479         }
1480
1481         return BLK_MQ_RQ_QUEUE_OK;
1482 err:
1483         return (ret == -ENOMEM || ret == -EAGAIN) ?
1484                 BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
1485 }
1486
1487 static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1488 {
1489         struct nvme_rdma_queue *queue = hctx->driver_data;
1490         struct ib_cq *cq = queue->ib_cq;
1491         struct ib_wc wc;
1492         int found = 0;
1493
1494         ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
1495         while (ib_poll_cq(cq, 1, &wc) > 0) {
1496                 struct ib_cqe *cqe = wc.wr_cqe;
1497
1498                 if (cqe) {
1499                         if (cqe->done == nvme_rdma_recv_done)
1500                                 found |= __nvme_rdma_recv_done(cq, &wc, tag);
1501                         else
1502                                 cqe->done(cq, &wc);
1503                 }
1504         }
1505
1506         return found;
1507 }
1508
1509 static void nvme_rdma_complete_rq(struct request *rq)
1510 {
1511         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1512         struct nvme_rdma_queue *queue = req->queue;
1513         int error = 0;
1514
1515         nvme_rdma_unmap_data(queue, rq);
1516
1517         if (unlikely(rq->errors)) {
1518                 if (nvme_req_needs_retry(rq, rq->errors)) {
1519                         nvme_requeue_req(rq);
1520                         return;
1521                 }
1522
1523                 if (blk_rq_is_passthrough(rq))
1524                         error = rq->errors;
1525                 else
1526                         error = nvme_error_status(rq->errors);
1527         }
1528
1529         blk_mq_end_request(rq, error);
1530 }
1531
1532 static struct blk_mq_ops nvme_rdma_mq_ops = {
1533         .queue_rq       = nvme_rdma_queue_rq,
1534         .complete       = nvme_rdma_complete_rq,
1535         .init_request   = nvme_rdma_init_request,
1536         .exit_request   = nvme_rdma_exit_request,
1537         .reinit_request = nvme_rdma_reinit_request,
1538         .init_hctx      = nvme_rdma_init_hctx,
1539         .poll           = nvme_rdma_poll,
1540         .timeout        = nvme_rdma_timeout,
1541 };
1542
1543 static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
1544         .queue_rq       = nvme_rdma_queue_rq,
1545         .complete       = nvme_rdma_complete_rq,
1546         .init_request   = nvme_rdma_init_admin_request,
1547         .exit_request   = nvme_rdma_exit_admin_request,
1548         .reinit_request = nvme_rdma_reinit_request,
1549         .init_hctx      = nvme_rdma_init_admin_hctx,
1550         .timeout        = nvme_rdma_timeout,
1551 };
1552
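/*
 * Bring up the admin queue: RDMA queue 0, the admin tag set and request
 * queue, the fabrics connect, controller enable and identify, plus the
 * SQE used for the driver-issued async event command.
 */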
static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
{
        int error;

        error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
        if (error)
                return error;

        ctrl->device = ctrl->queues[0].device;

        /*
         * We need a reference on the device as long as the tag_set is alive,
         * as the MRs in the request structures need a valid ib_device.
         */
        error = -EINVAL;
        if (!nvme_rdma_dev_get(ctrl->device))
                goto out_free_queue;

        ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
                ctrl->device->dev->attrs.max_fast_reg_page_list_len);

        memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
        ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops;
        ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH;
        ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
        ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
        ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
                SG_CHUNK_SIZE * sizeof(struct scatterlist);
        ctrl->admin_tag_set.driver_data = ctrl;
        ctrl->admin_tag_set.nr_hw_queues = 1;
        ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;

        error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
        if (error)
                goto out_put_dev;

        ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
        if (IS_ERR(ctrl->ctrl.admin_q)) {
                error = PTR_ERR(ctrl->ctrl.admin_q);
                goto out_free_tagset;
        }

        error = nvmf_connect_admin_queue(&ctrl->ctrl);
        if (error)
                goto out_cleanup_queue;

        set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);

        error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
        if (error) {
                dev_err(ctrl->ctrl.device,
                        "prop_get NVME_REG_CAP failed\n");
                goto out_cleanup_queue;
        }

        ctrl->ctrl.sqsize =
                min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize);

        error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
        if (error)
                goto out_cleanup_queue;

        ctrl->ctrl.max_hw_sectors =
                (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);

        error = nvme_init_identify(&ctrl->ctrl);
        if (error)
                goto out_cleanup_queue;

        error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
                        &ctrl->async_event_sqe, sizeof(struct nvme_command),
                        DMA_TO_DEVICE);
        if (error)
                goto out_cleanup_queue;

        nvme_start_keep_alive(&ctrl->ctrl);

        return 0;

out_cleanup_queue:
        blk_cleanup_queue(ctrl->ctrl.admin_q);
out_free_tagset:
        /* disconnect and drain the queue before freeing the tagset */
        nvme_rdma_stop_queue(&ctrl->queues[0]);
        blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_put_dev:
        nvme_rdma_dev_put(ctrl->device);
out_free_queue:
        nvme_rdma_free_queue(&ctrl->queues[0]);
        return error;
}

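/*
 * Tear down a live controller: stop keep-alive and the error/reconnect
 * work, cancel outstanding I/O and admin requests, shut the controller
 * down if the admin queue is still connected, and free all queues.
 */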
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
{
        nvme_stop_keep_alive(&ctrl->ctrl);
        cancel_work_sync(&ctrl->err_work);
        cancel_delayed_work_sync(&ctrl->reconnect_work);

        if (ctrl->queue_count > 1) {
                nvme_stop_queues(&ctrl->ctrl);
                blk_mq_tagset_busy_iter(&ctrl->tag_set,
                                        nvme_cancel_request, &ctrl->ctrl);
                nvme_rdma_free_io_queues(ctrl);
        }

        if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags))
                nvme_shutdown_ctrl(&ctrl->ctrl);

        blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_cancel_request, &ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl);
}

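/*
 * Common removal path for delete and failed reset: uninitialize the core
 * controller, optionally shut down the transport queues, release the I/O
 * tag set and drop the controller reference.
 */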
static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
        nvme_uninit_ctrl(&ctrl->ctrl);
        if (shutdown)
                nvme_rdma_shutdown_ctrl(ctrl);

        if (ctrl->ctrl.tagset) {
                blk_cleanup_queue(ctrl->ctrl.connect_q);
                blk_mq_free_tag_set(&ctrl->tag_set);
                nvme_rdma_dev_put(ctrl->device);
        }

        nvme_put_ctrl(&ctrl->ctrl);
}

static void nvme_rdma_del_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                struct nvme_rdma_ctrl, delete_work);

        __nvme_rdma_remove_ctrl(ctrl, true);
}

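/*
 * Move the controller to DELETING and schedule the delete work; failure
 * of either step is reported as -EBUSY.
 */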
static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
{
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
                return -EBUSY;

        if (!queue_work(nvme_rdma_wq, &ctrl->delete_work))
                return -EBUSY;

        return 0;
}

static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
{
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
        int ret = 0;

        /*
         * Keep a reference until all work is flushed since
         * __nvme_rdma_del_ctrl can free the ctrl mem
         */
        if (!kref_get_unless_zero(&ctrl->ctrl.kref))
                return -EBUSY;
        ret = __nvme_rdma_del_ctrl(ctrl);
        if (!ret)
                flush_work(&ctrl->delete_work);
        nvme_put_ctrl(&ctrl->ctrl);
        return ret;
}

static void nvme_rdma_remove_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                struct nvme_rdma_ctrl, delete_work);

        __nvme_rdma_remove_ctrl(ctrl, false);
}

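/*
 * Controller reset: shut everything down, then rebuild the admin queue
 * and, if present, the I/O queues.  If any step fails the controller is
 * queued for removal instead, skipping a second shutdown.
 */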
static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                        struct nvme_rdma_ctrl, reset_work);
        int ret;
        bool changed;

        nvme_rdma_shutdown_ctrl(ctrl);

        ret = nvme_rdma_configure_admin_queue(ctrl);
        if (ret) {
                /* ctrl is already shutdown, just remove the ctrl */
                INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work);
                goto del_dead_ctrl;
        }

        if (ctrl->queue_count > 1) {
                ret = blk_mq_reinit_tagset(&ctrl->tag_set);
                if (ret)
                        goto del_dead_ctrl;

                ret = nvme_rdma_init_io_queues(ctrl);
                if (ret)
                        goto del_dead_ctrl;

                ret = nvme_rdma_connect_io_queues(ctrl);
                if (ret)
                        goto del_dead_ctrl;
        }

        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);

        if (ctrl->queue_count > 1) {
                nvme_start_queues(&ctrl->ctrl);
                nvme_queue_scan(&ctrl->ctrl);
                nvme_queue_async_events(&ctrl->ctrl);
        }

        return;

del_dead_ctrl:
        /* Deleting this dead controller... */
        dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
        WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work));
}

static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl)
{
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);

        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
                return -EBUSY;

        if (!queue_work(nvme_rdma_wq, &ctrl->reset_work))
                return -EBUSY;

        flush_work(&ctrl->reset_work);

        return 0;
}

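/*
 * Fabrics controller ops: register access goes through the fabrics
 * property commands; reset and delete are backed by the work items above.
 */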
static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
        .name                   = "rdma",
        .module                 = THIS_MODULE,
        .is_fabrics             = true,
        .reg_read32             = nvmf_reg_read32,
        .reg_read64             = nvmf_reg_read64,
        .reg_write32            = nvmf_reg_write32,
        .reset_ctrl             = nvme_rdma_reset_ctrl,
        .free_ctrl              = nvme_rdma_free_ctrl,
        .submit_async_event     = nvme_rdma_submit_async_event,
        .delete_ctrl            = nvme_rdma_del_ctrl,
        .get_subsysnqn          = nvmf_get_subsysnqn,
        .get_address            = nvmf_get_address,
};

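/*
 * Allocate and connect the I/O queues, and set up the shared blk-mq tag
 * set plus the connect_q used to issue the fabrics connect commands.
 */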
static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
{
        int ret;

        ret = nvme_rdma_init_io_queues(ctrl);
        if (ret)
                return ret;

        /*
         * We need a reference on the device as long as the tag_set is alive,
         * as the MRs in the request structures need a valid ib_device.
         */
        ret = -EINVAL;
        if (!nvme_rdma_dev_get(ctrl->device))
                goto out_free_io_queues;

        memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
        ctrl->tag_set.ops = &nvme_rdma_mq_ops;
        ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
        ctrl->tag_set.reserved_tags = 1; /* fabric connect */
        ctrl->tag_set.numa_node = NUMA_NO_NODE;
        ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
                SG_CHUNK_SIZE * sizeof(struct scatterlist);
        ctrl->tag_set.driver_data = ctrl;
        ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
        ctrl->tag_set.timeout = NVME_IO_TIMEOUT;

        ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
        if (ret)
                goto out_put_dev;
        ctrl->ctrl.tagset = &ctrl->tag_set;

        ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
        if (IS_ERR(ctrl->ctrl.connect_q)) {
                ret = PTR_ERR(ctrl->ctrl.connect_q);
                goto out_free_tag_set;
        }

        ret = nvme_rdma_connect_io_queues(ctrl);
        if (ret)
                goto out_cleanup_connect_q;

        return 0;

out_cleanup_connect_q:
        blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
        blk_mq_free_tag_set(&ctrl->tag_set);
out_put_dev:
        nvme_rdma_dev_put(ctrl->device);
out_free_io_queues:
        nvme_rdma_free_io_queues(ctrl);
        return ret;
}

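/*
 * Parse a dotted-quad IPv4 transport address into a sockaddr_in; IPv6 is
 * not handled yet (see the XXX below).
 */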
static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
{
        u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
        size_t buflen = strlen(p);

        /* XXX: handle IPv6 addresses */

        if (buflen > INET_ADDRSTRLEN)
                return -EINVAL;
        if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
                return -EINVAL;
        in_addr->sin_family = AF_INET;
        return 0;
}

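/*
 * ->create_ctrl entry point for the "rdma" transport: parse the target
 * (and optional host) address, initialize the core controller, bring up
 * the admin queue, sanity-check the reported capabilities and finally
 * create the I/O queues before exposing the controller.
 */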
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                struct nvmf_ctrl_options *opts)
{
        struct nvme_rdma_ctrl *ctrl;
        int ret;
        bool changed;

        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl)
                return ERR_PTR(-ENOMEM);
        ctrl->ctrl.opts = opts;
        INIT_LIST_HEAD(&ctrl->list);

        ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
        if (ret) {
                pr_err("malformed IP address passed: %s\n", opts->traddr);
                goto out_free_ctrl;
        }

        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
                ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in,
                                opts->host_traddr);
                if (ret) {
                        pr_err("malformed src IP address passed: %s\n",
                               opts->host_traddr);
                        goto out_free_ctrl;
                }
        }

        if (opts->mask & NVMF_OPT_TRSVCID) {
                u16 port;

                ret = kstrtou16(opts->trsvcid, 0, &port);
                if (ret)
                        goto out_free_ctrl;

                ctrl->addr_in.sin_port = cpu_to_be16(port);
        } else {
                ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
        }

        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
                                0 /* no quirks, we're perfect! */);
        if (ret)
                goto out_free_ctrl;

        ctrl->reconnect_delay = opts->reconnect_delay;
        INIT_DELAYED_WORK(&ctrl->reconnect_work,
                        nvme_rdma_reconnect_ctrl_work);
        INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
        INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
        INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work);
        spin_lock_init(&ctrl->lock);

        ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
        ctrl->ctrl.sqsize = opts->queue_size - 1;
        ctrl->ctrl.kato = opts->kato;

        ret = -ENOMEM;
        ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues),
                                GFP_KERNEL);
        if (!ctrl->queues)
                goto out_uninit_ctrl;

        ret = nvme_rdma_configure_admin_queue(ctrl);
        if (ret)
                goto out_kfree_queues;

        /* sanity check icdoff */
        if (ctrl->ctrl.icdoff) {
                dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
                ret = -EINVAL;
                goto out_remove_admin_queue;
        }

        /* sanity check keyed sgls */
        if (!(ctrl->ctrl.sgls & (1 << 20))) {
                dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported!\n");
                ret = -EINVAL;
                goto out_remove_admin_queue;
        }

        if (opts->queue_size > ctrl->ctrl.maxcmd) {
                /* warn if maxcmd is lower than queue_size */
                dev_warn(ctrl->ctrl.device,
                        "queue_size %zu > ctrl maxcmd %u, clamping down\n",
                        opts->queue_size, ctrl->ctrl.maxcmd);
                opts->queue_size = ctrl->ctrl.maxcmd;
        }

        if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
                /* warn if sqsize is lower than queue_size */
                dev_warn(ctrl->ctrl.device,
                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
                        opts->queue_size, ctrl->ctrl.sqsize + 1);
                opts->queue_size = ctrl->ctrl.sqsize + 1;
        }

        if (opts->nr_io_queues) {
                ret = nvme_rdma_create_io_queues(ctrl);
                if (ret)
                        goto out_remove_admin_queue;
        }

        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);

        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);

        kref_get(&ctrl->ctrl.kref);

        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
        mutex_unlock(&nvme_rdma_ctrl_mutex);

        if (opts->nr_io_queues) {
                nvme_queue_scan(&ctrl->ctrl);
                nvme_queue_async_events(&ctrl->ctrl);
        }

        return &ctrl->ctrl;

out_remove_admin_queue:
        nvme_stop_keep_alive(&ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl);
out_kfree_queues:
        kfree(ctrl->queues);
out_uninit_ctrl:
        nvme_uninit_ctrl(&ctrl->ctrl);
        nvme_put_ctrl(&ctrl->ctrl);
        if (ret > 0)
                ret = -EIO;
        return ERR_PTR(ret);
out_free_ctrl:
        kfree(ctrl);
        return ERR_PTR(ret);
}

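/*
 * Transport registration: "rdma" requires a target address and optionally
 * accepts a service ID, a reconnect delay and a host transport address.
 */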
static struct nvmf_transport_ops nvme_rdma_transport = {
        .name           = "rdma",
        .required_opts  = NVMF_OPT_TRADDR,
        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
                          NVMF_OPT_HOST_TRADDR,
        .create_ctrl    = nvme_rdma_create_ctrl,
};

static void nvme_rdma_add_one(struct ib_device *ib_device)
{
}

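/*
 * IB client callbacks: ->add is a no-op, ->remove deletes every controller
 * bound to the departing device and waits for the delete work to finish.
 */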
static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
        struct nvme_rdma_ctrl *ctrl;

        /* Delete all controllers using this device */
        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
                if (ctrl->device->dev != ib_device)
                        continue;
                dev_info(ctrl->ctrl.device,
                        "Removing ctrl: NQN \"%s\", addr %pISp\n",
                        ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
                __nvme_rdma_del_ctrl(ctrl);
        }
        mutex_unlock(&nvme_rdma_ctrl_mutex);

        flush_workqueue(nvme_rdma_wq);
}

static struct ib_client nvme_rdma_ib_client = {
        .name   = "nvme_rdma",
        .add = nvme_rdma_add_one,
        .remove = nvme_rdma_remove_one
};

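/*
 * Module init: create the workqueue the controller work items are queued
 * on, register as an IB client, then register the "rdma" fabrics
 * transport, unwinding on failure.
 */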
static int __init nvme_rdma_init_module(void)
{
        int ret;

        nvme_rdma_wq = create_workqueue("nvme_rdma_wq");
        if (!nvme_rdma_wq)
                return -ENOMEM;

        ret = ib_register_client(&nvme_rdma_ib_client);
        if (ret)
                goto err_destroy_wq;

        ret = nvmf_register_transport(&nvme_rdma_transport);
        if (ret)
                goto err_unreg_client;

        return 0;

err_unreg_client:
        ib_unregister_client(&nvme_rdma_ib_client);
err_destroy_wq:
        destroy_workqueue(nvme_rdma_wq);
        return ret;
}

static void __exit nvme_rdma_cleanup_module(void)
{
        nvmf_unregister_transport(&nvme_rdma_transport);
        ib_unregister_client(&nvme_rdma_ib_client);
        destroy_workqueue(nvme_rdma_wq);
}

module_init(nvme_rdma_init_module);
module_exit(nvme_rdma_cleanup_module);

MODULE_LICENSE("GPL v2");