drivers/nvme/host/tcp.c (as of merge tag 'block-6.3-2023-03-03' of git://git.kernel.dk/linux)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * NVMe over Fabrics TCP host.
4  * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5  */
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 #include <linux/module.h>
8 #include <linux/init.h>
9 #include <linux/slab.h>
10 #include <linux/err.h>
11 #include <linux/nvme-tcp.h>
12 #include <net/sock.h>
13 #include <net/tcp.h>
14 #include <linux/blk-mq.h>
15 #include <crypto/hash.h>
16 #include <net/busy_poll.h>
17 #include <trace/events/sock.h>
18
19 #include "nvme.h"
20 #include "fabrics.h"
21
22 struct nvme_tcp_queue;
23
24 /* Define the socket priority to use for connections where it is desirable
25  * that the NIC consider performing optimized packet processing or filtering.
26  * A non-zero value is sufficient to indicate general consideration of any
27  * possible optimization.  Making it a module param allows for alternative
28  * values that may be unique to some NIC implementations.
29  */
30 static int so_priority;
31 module_param(so_priority, int, 0644);
32 MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
33
34 #ifdef CONFIG_DEBUG_LOCK_ALLOC
35 /* lockdep can detect a circular dependency of the form
36  *   sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
37  * because dependencies are tracked for both nvme-tcp and user contexts. Using
38  * a separate class prevents lockdep from conflating nvme-tcp socket use with
39  * user-space socket API use.
40  */
41 static struct lock_class_key nvme_tcp_sk_key[2];
42 static struct lock_class_key nvme_tcp_slock_key[2];
43
44 static void nvme_tcp_reclassify_socket(struct socket *sock)
45 {
46         struct sock *sk = sock->sk;
47
48         if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
49                 return;
50
51         switch (sk->sk_family) {
52         case AF_INET:
53                 sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
54                                               &nvme_tcp_slock_key[0],
55                                               "sk_lock-AF_INET-NVME",
56                                               &nvme_tcp_sk_key[0]);
57                 break;
58         case AF_INET6:
59                 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
60                                               &nvme_tcp_slock_key[1],
61                                               "sk_lock-AF_INET6-NVME",
62                                               &nvme_tcp_sk_key[1]);
63                 break;
64         default:
65                 WARN_ON_ONCE(1);
66         }
67 }
68 #else
69 static void nvme_tcp_reclassify_socket(struct socket *sock) { }
70 #endif
71
72 enum nvme_tcp_send_state {
73         NVME_TCP_SEND_CMD_PDU = 0,
74         NVME_TCP_SEND_H2C_PDU,
75         NVME_TCP_SEND_DATA,
76         NVME_TCP_SEND_DDGST,
77 };
78
79 struct nvme_tcp_request {
80         struct nvme_request     req;
81         void                    *pdu;
82         struct nvme_tcp_queue   *queue;
83         u32                     data_len;
84         u32                     pdu_len;
85         u32                     pdu_sent;
86         u32                     h2cdata_left;
87         u32                     h2cdata_offset;
88         u16                     ttag;
89         __le16                  status;
90         struct list_head        entry;
91         struct llist_node       lentry;
92         __le32                  ddgst;
93
94         struct bio              *curr_bio;
95         struct iov_iter         iter;
96
97         /* send state */
98         size_t                  offset;
99         size_t                  data_sent;
100         enum nvme_tcp_send_state state;
101 };
102
103 enum nvme_tcp_queue_flags {
104         NVME_TCP_Q_ALLOCATED    = 0,
105         NVME_TCP_Q_LIVE         = 1,
106         NVME_TCP_Q_POLLING      = 2,
107 };
108
109 enum nvme_tcp_recv_state {
110         NVME_TCP_RECV_PDU = 0,
111         NVME_TCP_RECV_DATA,
112         NVME_TCP_RECV_DDGST,
113 };
114
115 struct nvme_tcp_ctrl;
116 struct nvme_tcp_queue {
117         struct socket           *sock;
118         struct work_struct      io_work;
119         int                     io_cpu;
120
121         struct mutex            queue_lock;
122         struct mutex            send_mutex;
123         struct llist_head       req_list;
124         struct list_head        send_list;
125
126         /* recv state */
127         void                    *pdu;
128         int                     pdu_remaining;
129         int                     pdu_offset;
130         size_t                  data_remaining;
131         size_t                  ddgst_remaining;
132         unsigned int            nr_cqe;
133
134         /* send state */
135         struct nvme_tcp_request *request;
136
137         u32                     maxh2cdata;
138         size_t                  cmnd_capsule_len;
139         struct nvme_tcp_ctrl    *ctrl;
140         unsigned long           flags;
141         bool                    rd_enabled;
142
143         bool                    hdr_digest;
144         bool                    data_digest;
145         struct ahash_request    *rcv_hash;
146         struct ahash_request    *snd_hash;
147         __le32                  exp_ddgst;
148         __le32                  recv_ddgst;
149
150         struct page_frag_cache  pf_cache;
151
152         void (*state_change)(struct sock *);
153         void (*data_ready)(struct sock *);
154         void (*write_space)(struct sock *);
155 };
156
157 struct nvme_tcp_ctrl {
158         /* read only in the hot path */
159         struct nvme_tcp_queue   *queues;
160         struct blk_mq_tag_set   tag_set;
161
162         /* other member variables */
163         struct list_head        list;
164         struct blk_mq_tag_set   admin_tag_set;
165         struct sockaddr_storage addr;
166         struct sockaddr_storage src_addr;
167         struct nvme_ctrl        ctrl;
168
169         struct work_struct      err_work;
170         struct delayed_work     connect_work;
171         struct nvme_tcp_request async_req;
172         u32                     io_queues[HCTX_MAX_TYPES];
173 };
174
175 static LIST_HEAD(nvme_tcp_ctrl_list);
176 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
177 static struct workqueue_struct *nvme_tcp_wq;
178 static const struct blk_mq_ops nvme_tcp_mq_ops;
179 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
180 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
181
182 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
183 {
184         return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
185 }
186
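/*
 * ctrl->queues[0] is the admin queue and I/O queues occupy indices 1..N,
 * so the queue id is simply the offset into the queues array.
 */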
187 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
188 {
189         return queue - queue->ctrl->queues;
190 }
191
192 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
193 {
194         u32 queue_idx = nvme_tcp_queue_id(queue);
195
196         if (queue_idx == 0)
197                 return queue->ctrl->admin_tag_set.tags[queue_idx];
198         return queue->ctrl->tag_set.tags[queue_idx - 1];
199 }
200
201 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
202 {
203         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
204 }
205
206 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
207 {
208         return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
209 }
210
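/*
 * In-capsule data budget: fabrics commands always use the fixed admin
 * capsule size, everything else gets the queue's command capsule length
 * minus the SQE itself.
 */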
211 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
212 {
213         if (nvme_is_fabrics(req->req.cmd))
214                 return NVME_TCP_ADMIN_CCSZ;
215         return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
216 }
217
218 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
219 {
220         return req == &req->queue->ctrl->async_req;
221 }
222
223 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
224 {
225         struct request *rq;
226
227         if (unlikely(nvme_tcp_async_req(req)))
228                 return false; /* async events don't have a request */
229
230         rq = blk_mq_rq_from_pdu(req);
231
232         return rq_data_dir(rq) == WRITE && req->data_len &&
233                 req->data_len <= nvme_tcp_inline_data_size(req);
234 }
235
236 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
237 {
238         return req->iter.bvec->bv_page;
239 }
240
241 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
242 {
243         return req->iter.bvec->bv_offset + req->iter.iov_offset;
244 }
245
246 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
247 {
248         return min_t(size_t, iov_iter_single_seg_count(&req->iter),
249                         req->pdu_len - req->pdu_sent);
250 }
251
252 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
253 {
254         return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
255                         req->pdu_len - req->pdu_sent : 0;
256 }
257
258 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
259                 int len)
260 {
261         return nvme_tcp_pdu_data_left(req) <= len;
262 }
263
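/*
 * (Re)build req->iter over either the request's special payload bvec or
 * the bvecs of the current bio, in the given data direction, so the send
 * and receive paths can walk the payload with iov_iter helpers.
 */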
264 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
265                 unsigned int dir)
266 {
267         struct request *rq = blk_mq_rq_from_pdu(req);
268         struct bio_vec *vec;
269         unsigned int size;
270         int nr_bvec;
271         size_t offset;
272
273         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
274                 vec = &rq->special_vec;
275                 nr_bvec = 1;
276                 size = blk_rq_payload_bytes(rq);
277                 offset = 0;
278         } else {
279                 struct bio *bio = req->curr_bio;
280                 struct bvec_iter bi;
281                 struct bio_vec bv;
282
283                 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
284                 nr_bvec = 0;
285                 bio_for_each_bvec(bv, bio, bi) {
286                         nr_bvec++;
287                 }
288                 size = bio->bi_iter.bi_size;
289                 offset = bio->bi_iter.bi_bvec_done;
290         }
291
292         iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
293         req->iter.iov_offset = offset;
294 }
295
296 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
297                 int len)
298 {
299         req->data_sent += len;
300         req->pdu_sent += len;
301         iov_iter_advance(&req->iter, len);
302         if (!iov_iter_count(&req->iter) &&
303             req->data_sent < req->data_len) {
304                 req->curr_bio = req->curr_bio->bi_next;
305                 nvme_tcp_init_iter(req, ITER_SOURCE);
306         }
307 }
308
309 static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
310 {
311         int ret;
312
313         /* drain the send queue as much as we can... */
314         do {
315                 ret = nvme_tcp_try_send(queue);
316         } while (ret > 0);
317 }
318
319 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
320 {
321         return !list_empty(&queue->send_list) ||
322                 !llist_empty(&queue->req_list);
323 }
324
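/*
 * Queue a request for transmission on the lockless req_list. If it is the
 * only pending request, the caller allows it (sync) and we are already on
 * the queue's io_cpu, send it inline; otherwise io_work picks it up.
 */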
325 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
326                 bool sync, bool last)
327 {
328         struct nvme_tcp_queue *queue = req->queue;
329         bool empty;
330
331         empty = llist_add(&req->lentry, &queue->req_list) &&
332                 list_empty(&queue->send_list) && !queue->request;
333
334         /*
335          * If we're the first on the send_list, try to send directly;
336          * otherwise queue io_work. Also, only do that if we are on the
337          * same cpu, so we don't introduce contention.
338          */
339         if (queue->io_cpu == raw_smp_processor_id() &&
340             sync && empty && mutex_trylock(&queue->send_mutex)) {
341                 nvme_tcp_send_all(queue);
342                 mutex_unlock(&queue->send_mutex);
343         }
344
345         if (last && nvme_tcp_queue_more(queue))
346                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
347 }
348
349 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
350 {
351         struct nvme_tcp_request *req;
352         struct llist_node *node;
353
354         for (node = llist_del_all(&queue->req_list); node; node = node->next) {
355                 req = llist_entry(node, struct nvme_tcp_request, lentry);
356                 list_add(&req->entry, &queue->send_list);
357         }
358 }
359
360 static inline struct nvme_tcp_request *
361 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
362 {
363         struct nvme_tcp_request *req;
364
365         req = list_first_entry_or_null(&queue->send_list,
366                         struct nvme_tcp_request, entry);
367         if (!req) {
368                 nvme_tcp_process_req_list(queue);
369                 req = list_first_entry_or_null(&queue->send_list,
370                                 struct nvme_tcp_request, entry);
371                 if (unlikely(!req))
372                         return NULL;
373         }
374
375         list_del(&req->entry);
376         return req;
377 }
378
379 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
380                 __le32 *dgst)
381 {
382         ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
383         crypto_ahash_final(hash);
384 }
385
386 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
387                 struct page *page, off_t off, size_t len)
388 {
389         struct scatterlist sg;
390
391         sg_init_table(&sg, 1);
392         sg_set_page(&sg, page, len, off);
393         ahash_request_set_crypt(hash, &sg, NULL, len);
394         crypto_ahash_update(hash);
395 }
396
397 static inline void nvme_tcp_hdgst(struct ahash_request *hash,
398                 void *pdu, size_t len)
399 {
400         struct scatterlist sg;
401
402         sg_init_one(&sg, pdu, len);
403         ahash_request_set_crypt(hash, &sg, pdu + len, len);
404         crypto_ahash_digest(hash);
405 }
406
407 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
408                 void *pdu, size_t pdu_len)
409 {
410         struct nvme_tcp_hdr *hdr = pdu;
411         __le32 recv_digest;
412         __le32 exp_digest;
413
414         if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
415                 dev_err(queue->ctrl->ctrl.device,
416                         "queue %d: header digest flag is cleared\n",
417                         nvme_tcp_queue_id(queue));
418                 return -EPROTO;
419         }
420
421         recv_digest = *(__le32 *)(pdu + hdr->hlen);
422         nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
423         exp_digest = *(__le32 *)(pdu + hdr->hlen);
424         if (recv_digest != exp_digest) {
425                 dev_err(queue->ctrl->ctrl.device,
426                         "header digest error: recv %#x expected %#x\n",
427                         le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
428                 return -EIO;
429         }
430
431         return 0;
432 }
433
434 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
435 {
436         struct nvme_tcp_hdr *hdr = pdu;
437         u8 digest_len = nvme_tcp_hdgst_len(queue);
438         u32 len;
439
440         len = le32_to_cpu(hdr->plen) - hdr->hlen -
441                 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
442
443         if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
444                 dev_err(queue->ctrl->ctrl.device,
445                         "queue %d: data digest flag is cleared\n",
446                         nvme_tcp_queue_id(queue));
447                 return -EPROTO;
448         }
449         crypto_ahash_init(queue->rcv_hash);
450
451         return 0;
452 }
453
454 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
455                 struct request *rq, unsigned int hctx_idx)
456 {
457         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
458
459         page_frag_free(req->pdu);
460 }
461
462 static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
463                 struct request *rq, unsigned int hctx_idx,
464                 unsigned int numa_node)
465 {
466         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
467         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
468         struct nvme_tcp_cmd_pdu *pdu;
469         int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
470         struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
471         u8 hdgst = nvme_tcp_hdgst_len(queue);
472
473         req->pdu = page_frag_alloc(&queue->pf_cache,
474                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
475                 GFP_KERNEL | __GFP_ZERO);
476         if (!req->pdu)
477                 return -ENOMEM;
478
479         pdu = req->pdu;
480         req->queue = queue;
481         nvme_req(rq)->ctrl = &ctrl->ctrl;
482         nvme_req(rq)->cmd = &pdu->cmd;
483
484         return 0;
485 }
486
487 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
488                 unsigned int hctx_idx)
489 {
490         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
491         struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
492
493         hctx->driver_data = queue;
494         return 0;
495 }
496
497 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
498                 unsigned int hctx_idx)
499 {
500         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
501         struct nvme_tcp_queue *queue = &ctrl->queues[0];
502
503         hctx->driver_data = queue;
504         return 0;
505 }
506
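/*
 * Receive state machine: PDU header -> data -> data digest. The current
 * state is derived from the byte counters set up in nvme_tcp_init_recv_ctx().
 */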
507 static enum nvme_tcp_recv_state
508 nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
509 {
510         return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
511                 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
512                 NVME_TCP_RECV_DATA;
513 }
514
515 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
516 {
517         queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
518                                 nvme_tcp_hdgst_len(queue);
519         queue->pdu_offset = 0;
520         queue->data_remaining = -1;
521         queue->ddgst_remaining = 0;
522 }
523
524 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
525 {
526         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
527                 return;
528
529         dev_warn(ctrl->device, "starting error recovery\n");
530         queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
531 }
532
533 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
534                 struct nvme_completion *cqe)
535 {
536         struct nvme_tcp_request *req;
537         struct request *rq;
538
539         rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
540         if (!rq) {
541                 dev_err(queue->ctrl->ctrl.device,
542                         "got bad cqe.command_id %#x on queue %d\n",
543                         cqe->command_id, nvme_tcp_queue_id(queue));
544                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
545                 return -EINVAL;
546         }
547
548         req = blk_mq_rq_to_pdu(rq);
549         if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
550                 req->status = cqe->status;
551
552         if (!nvme_try_complete_req(rq, req->status, cqe->result))
553                 nvme_complete_rq(rq);
554         queue->nr_cqe++;
555
556         return 0;
557 }
558
559 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
560                 struct nvme_tcp_data_pdu *pdu)
561 {
562         struct request *rq;
563
564         rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
565         if (!rq) {
566                 dev_err(queue->ctrl->ctrl.device,
567                         "got bad c2hdata.command_id %#x on queue %d\n",
568                         pdu->command_id, nvme_tcp_queue_id(queue));
569                 return -ENOENT;
570         }
571
572         if (!blk_rq_payload_bytes(rq)) {
573                 dev_err(queue->ctrl->ctrl.device,
574                         "queue %d tag %#x unexpected data\n",
575                         nvme_tcp_queue_id(queue), rq->tag);
576                 return -EIO;
577         }
578
579         queue->data_remaining = le32_to_cpu(pdu->data_length);
580
581         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
582             unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
583                 dev_err(queue->ctrl->ctrl.device,
584                         "queue %d tag %#x SUCCESS set but not last PDU\n",
585                         nvme_tcp_queue_id(queue), rq->tag);
586                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
587                 return -EPROTO;
588         }
589
590         return 0;
591 }
592
593 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
594                 struct nvme_tcp_rsp_pdu *pdu)
595 {
596         struct nvme_completion *cqe = &pdu->cqe;
597         int ret = 0;
598
599         /*
600          * AEN requests are special as they don't time out and can
601          * survive any kind of queue freeze and often don't respond to
602          * aborts.  We don't even bother to allocate a struct request
603          * for them but rather special case them here.
604          */
605         if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
606                                      cqe->command_id)))
607                 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
608                                 &cqe->result);
609         else
610                 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
611
612         return ret;
613 }
614
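/*
 * Build the next H2CData PDU for an outstanding R2T. Transfers larger than
 * the controller's MAXH2CDATA are split into several PDUs; only the final
 * one carries the DATA_LAST flag.
 */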
615 static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
616 {
617         struct nvme_tcp_data_pdu *data = req->pdu;
618         struct nvme_tcp_queue *queue = req->queue;
619         struct request *rq = blk_mq_rq_from_pdu(req);
620         u32 h2cdata_sent = req->pdu_len;
621         u8 hdgst = nvme_tcp_hdgst_len(queue);
622         u8 ddgst = nvme_tcp_ddgst_len(queue);
623
624         req->state = NVME_TCP_SEND_H2C_PDU;
625         req->offset = 0;
626         req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
627         req->pdu_sent = 0;
628         req->h2cdata_left -= req->pdu_len;
629         req->h2cdata_offset += h2cdata_sent;
630
631         memset(data, 0, sizeof(*data));
632         data->hdr.type = nvme_tcp_h2c_data;
633         if (!req->h2cdata_left)
634                 data->hdr.flags = NVME_TCP_F_DATA_LAST;
635         if (queue->hdr_digest)
636                 data->hdr.flags |= NVME_TCP_F_HDGST;
637         if (queue->data_digest)
638                 data->hdr.flags |= NVME_TCP_F_DDGST;
639         data->hdr.hlen = sizeof(*data);
640         data->hdr.pdo = data->hdr.hlen + hdgst;
641         data->hdr.plen =
642                 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
643         data->ttag = req->ttag;
644         data->command_id = nvme_cid(rq);
645         data->data_offset = cpu_to_le32(req->h2cdata_offset);
646         data->data_length = cpu_to_le32(req->pdu_len);
647 }
648
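/*
 * Handle an R2T from the controller: sanity check the requested offset and
 * length against what has already been sent, then queue the first H2CData
 * PDU for this transfer.
 */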
649 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
650                 struct nvme_tcp_r2t_pdu *pdu)
651 {
652         struct nvme_tcp_request *req;
653         struct request *rq;
654         u32 r2t_length = le32_to_cpu(pdu->r2t_length);
655         u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);
656
657         rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
658         if (!rq) {
659                 dev_err(queue->ctrl->ctrl.device,
660                         "got bad r2t.command_id %#x on queue %d\n",
661                         pdu->command_id, nvme_tcp_queue_id(queue));
662                 return -ENOENT;
663         }
664         req = blk_mq_rq_to_pdu(rq);
665
666         if (unlikely(!r2t_length)) {
667                 dev_err(queue->ctrl->ctrl.device,
668                         "req %d r2t len is %u, probably a bug...\n",
669                         rq->tag, r2t_length);
670                 return -EPROTO;
671         }
672
673         if (unlikely(req->data_sent + r2t_length > req->data_len)) {
674                 dev_err(queue->ctrl->ctrl.device,
675                         "req %d r2t len %u exceeded data len %u (%zu sent)\n",
676                         rq->tag, r2t_length, req->data_len, req->data_sent);
677                 return -EPROTO;
678         }
679
680         if (unlikely(r2t_offset < req->data_sent)) {
681                 dev_err(queue->ctrl->ctrl.device,
682                         "req %d unexpected r2t offset %u (expected %zu)\n",
683                         rq->tag, r2t_offset, req->data_sent);
684                 return -EPROTO;
685         }
686
687         req->pdu_len = 0;
688         req->h2cdata_left = r2t_length;
689         req->h2cdata_offset = r2t_offset;
690         req->ttag = pdu->ttag;
691
692         nvme_tcp_setup_h2c_data_pdu(req);
693         nvme_tcp_queue_request(req, false, true);
694
695         return 0;
696 }
697
698 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
699                 unsigned int *offset, size_t *len)
700 {
701         struct nvme_tcp_hdr *hdr;
702         char *pdu = queue->pdu;
703         size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
704         int ret;
705
706         ret = skb_copy_bits(skb, *offset,
707                 &pdu[queue->pdu_offset], rcv_len);
708         if (unlikely(ret))
709                 return ret;
710
711         queue->pdu_remaining -= rcv_len;
712         queue->pdu_offset += rcv_len;
713         *offset += rcv_len;
714         *len -= rcv_len;
715         if (queue->pdu_remaining)
716                 return 0;
717
718         hdr = queue->pdu;
719         if (queue->hdr_digest) {
720                 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
721                 if (unlikely(ret))
722                         return ret;
723         }
724
725
726         if (queue->data_digest) {
727                 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
728                 if (unlikely(ret))
729                         return ret;
730         }
731
732         switch (hdr->type) {
733         case nvme_tcp_c2h_data:
734                 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
735         case nvme_tcp_rsp:
736                 nvme_tcp_init_recv_ctx(queue);
737                 return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
738         case nvme_tcp_r2t:
739                 nvme_tcp_init_recv_ctx(queue);
740                 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
741         default:
742                 dev_err(queue->ctrl->ctrl.device,
743                         "unsupported pdu type (%d)\n", hdr->type);
744                 return -EINVAL;
745         }
746 }
747
748 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
749 {
750         union nvme_result res = {};
751
752         if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
753                 nvme_complete_rq(rq);
754 }
755
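/*
 * Copy C2HData payload from the skb into the request's bio pages, updating
 * the running data digest when enabled. The request is completed here only
 * when the controller used the SUCCESS optimization and no data digest is
 * still pending.
 */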
756 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
757                               unsigned int *offset, size_t *len)
758 {
759         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
760         struct request *rq =
761                 nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
762         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
763
764         while (true) {
765                 int recv_len, ret;
766
767                 recv_len = min_t(size_t, *len, queue->data_remaining);
768                 if (!recv_len)
769                         break;
770
771                 if (!iov_iter_count(&req->iter)) {
772                         req->curr_bio = req->curr_bio->bi_next;
773
774                         /*
775                          * If we don't have any bios it means that the controller
776                          * sent more data than we requested, hence error out.
777                          */
778                         if (!req->curr_bio) {
779                                 dev_err(queue->ctrl->ctrl.device,
780                                         "queue %d no space in request %#x",
781                                         nvme_tcp_queue_id(queue), rq->tag);
782                                 nvme_tcp_init_recv_ctx(queue);
783                                 return -EIO;
784                         }
785                         nvme_tcp_init_iter(req, ITER_DEST);
786                 }
787
788                 /* we can read only from what is left in this bio */
789                 recv_len = min_t(size_t, recv_len,
790                                 iov_iter_count(&req->iter));
791
792                 if (queue->data_digest)
793                         ret = skb_copy_and_hash_datagram_iter(skb, *offset,
794                                 &req->iter, recv_len, queue->rcv_hash);
795                 else
796                         ret = skb_copy_datagram_iter(skb, *offset,
797                                         &req->iter, recv_len);
798                 if (ret) {
799                         dev_err(queue->ctrl->ctrl.device,
800                                 "queue %d failed to copy request %#x data",
801                                 nvme_tcp_queue_id(queue), rq->tag);
802                         return ret;
803                 }
804
805                 *len -= recv_len;
806                 *offset += recv_len;
807                 queue->data_remaining -= recv_len;
808         }
809
810         if (!queue->data_remaining) {
811                 if (queue->data_digest) {
812                         nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
813                         queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
814                 } else {
815                         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
816                                 nvme_tcp_end_request(rq,
817                                                 le16_to_cpu(req->status));
818                                 queue->nr_cqe++;
819                         }
820                         nvme_tcp_init_recv_ctx(queue);
821                 }
822         }
823
824         return 0;
825 }
826
827 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
828                 struct sk_buff *skb, unsigned int *offset, size_t *len)
829 {
830         struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
831         char *ddgst = (char *)&queue->recv_ddgst;
832         size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
833         off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
834         int ret;
835
836         ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
837         if (unlikely(ret))
838                 return ret;
839
840         queue->ddgst_remaining -= recv_len;
841         *offset += recv_len;
842         *len -= recv_len;
843         if (queue->ddgst_remaining)
844                 return 0;
845
846         if (queue->recv_ddgst != queue->exp_ddgst) {
847                 struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
848                                         pdu->command_id);
849                 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
850
851                 req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
852
853                 dev_err(queue->ctrl->ctrl.device,
854                         "data digest error: recv %#x expected %#x\n",
855                         le32_to_cpu(queue->recv_ddgst),
856                         le32_to_cpu(queue->exp_ddgst));
857         }
858
859         if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
860                 struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
861                                         pdu->command_id);
862                 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
863
864                 nvme_tcp_end_request(rq, le16_to_cpu(req->status));
865                 queue->nr_cqe++;
866         }
867
868         nvme_tcp_init_recv_ctx(queue);
869         return 0;
870 }
871
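/*
 * ->read_sock() callback: consume as much of the skb as possible,
 * dispatching to the PDU/DATA/DDGST handlers according to the current
 * receive state.
 */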
872 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
873                              unsigned int offset, size_t len)
874 {
875         struct nvme_tcp_queue *queue = desc->arg.data;
876         size_t consumed = len;
877         int result;
878
879         while (len) {
880                 switch (nvme_tcp_recv_state(queue)) {
881                 case NVME_TCP_RECV_PDU:
882                         result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
883                         break;
884                 case NVME_TCP_RECV_DATA:
885                         result = nvme_tcp_recv_data(queue, skb, &offset, &len);
886                         break;
887                 case NVME_TCP_RECV_DDGST:
888                         result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
889                         break;
890                 default:
891                         result = -EFAULT;
892                 }
893                 if (result) {
894                         dev_err(queue->ctrl->ctrl.device,
895                                 "receive failed: %d\n", result);
896                         queue->rd_enabled = false;
897                         nvme_tcp_error_recovery(&queue->ctrl->ctrl);
898                         return result;
899                 }
900         }
901
902         return consumed;
903 }
904
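/*
 * Socket callbacks (data_ready/write_space/state_change below) run in
 * softirq context and only schedule io_work or error recovery; all real
 * work is done in process context.
 */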
905 static void nvme_tcp_data_ready(struct sock *sk)
906 {
907         struct nvme_tcp_queue *queue;
908
909         trace_sk_data_ready(sk);
910
911         read_lock_bh(&sk->sk_callback_lock);
912         queue = sk->sk_user_data;
913         if (likely(queue && queue->rd_enabled) &&
914             !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
915                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
916         read_unlock_bh(&sk->sk_callback_lock);
917 }
918
919 static void nvme_tcp_write_space(struct sock *sk)
920 {
921         struct nvme_tcp_queue *queue;
922
923         read_lock_bh(&sk->sk_callback_lock);
924         queue = sk->sk_user_data;
925         if (likely(queue && sk_stream_is_writeable(sk))) {
926                 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
927                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
928         }
929         read_unlock_bh(&sk->sk_callback_lock);
930 }
931
932 static void nvme_tcp_state_change(struct sock *sk)
933 {
934         struct nvme_tcp_queue *queue;
935
936         read_lock_bh(&sk->sk_callback_lock);
937         queue = sk->sk_user_data;
938         if (!queue)
939                 goto done;
940
941         switch (sk->sk_state) {
942         case TCP_CLOSE:
943         case TCP_CLOSE_WAIT:
944         case TCP_LAST_ACK:
945         case TCP_FIN_WAIT1:
946         case TCP_FIN_WAIT2:
947                 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
948                 break;
949         default:
950                 dev_info(queue->ctrl->ctrl.device,
951                         "queue %d socket state %d\n",
952                         nvme_tcp_queue_id(queue), sk->sk_state);
953         }
954
955         queue->state_change(sk);
956 done:
957         read_unlock_bh(&sk->sk_callback_lock);
958 }
959
960 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
961 {
962         queue->request = NULL;
963 }
964
965 static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
966 {
967         if (nvme_tcp_async_req(req)) {
968                 union nvme_result res = {};
969
970                 nvme_complete_async_event(&req->queue->ctrl->ctrl,
971                                 cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
972         } else {
973                 nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
974                                 NVME_SC_HOST_PATH_ERROR);
975         }
976 }
977
978 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
979 {
980         struct nvme_tcp_queue *queue = req->queue;
981         int req_data_len = req->data_len;
982         u32 h2cdata_left = req->h2cdata_left;
983
984         while (true) {
985                 struct page *page = nvme_tcp_req_cur_page(req);
986                 size_t offset = nvme_tcp_req_cur_offset(req);
987                 size_t len = nvme_tcp_req_cur_length(req);
988                 bool last = nvme_tcp_pdu_last_send(req, len);
989                 int req_data_sent = req->data_sent;
990                 int ret, flags = MSG_DONTWAIT;
991
992                 if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
993                         flags |= MSG_EOR;
994                 else
995                         flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
996
997                 if (sendpage_ok(page)) {
998                         ret = kernel_sendpage(queue->sock, page, offset, len,
999                                         flags);
1000                 } else {
1001                         ret = sock_no_sendpage(queue->sock, page, offset, len,
1002                                         flags);
1003                 }
1004                 if (ret <= 0)
1005                         return ret;
1006
1007                 if (queue->data_digest)
1008                         nvme_tcp_ddgst_update(queue->snd_hash, page,
1009                                         offset, ret);
1010
1011                 /*
1012                  * Update the request iterator except for the last payload send
1013                  * in the request, where we don't want to modify it as we may
1014                  * compete with the RX path completing the request.
1015                  */
1016                 if (req_data_sent + ret < req_data_len)
1017                         nvme_tcp_advance_req(req, ret);
1018
1019                 /* fully successful last send in current PDU */
1020                 if (last && ret == len) {
1021                         if (queue->data_digest) {
1022                                 nvme_tcp_ddgst_final(queue->snd_hash,
1023                                         &req->ddgst);
1024                                 req->state = NVME_TCP_SEND_DDGST;
1025                                 req->offset = 0;
1026                         } else {
1027                                 if (h2cdata_left)
1028                                         nvme_tcp_setup_h2c_data_pdu(req);
1029                                 else
1030                                         nvme_tcp_done_send_req(queue);
1031                         }
1032                         return 1;
1033                 }
1034         }
1035         return -EAGAIN;
1036 }
1037
1038 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
1039 {
1040         struct nvme_tcp_queue *queue = req->queue;
1041         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1042         bool inline_data = nvme_tcp_has_inline_data(req);
1043         u8 hdgst = nvme_tcp_hdgst_len(queue);
1044         int len = sizeof(*pdu) + hdgst - req->offset;
1045         int flags = MSG_DONTWAIT;
1046         int ret;
1047
1048         if (inline_data || nvme_tcp_queue_more(queue))
1049                 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
1050         else
1051                 flags |= MSG_EOR;
1052
1053         if (queue->hdr_digest && !req->offset)
1054                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1055
1056         ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1057                         offset_in_page(pdu) + req->offset, len,  flags);
1058         if (unlikely(ret <= 0))
1059                 return ret;
1060
1061         len -= ret;
1062         if (!len) {
1063                 if (inline_data) {
1064                         req->state = NVME_TCP_SEND_DATA;
1065                         if (queue->data_digest)
1066                                 crypto_ahash_init(queue->snd_hash);
1067                 } else {
1068                         nvme_tcp_done_send_req(queue);
1069                 }
1070                 return 1;
1071         }
1072         req->offset += ret;
1073
1074         return -EAGAIN;
1075 }
1076
1077 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1078 {
1079         struct nvme_tcp_queue *queue = req->queue;
1080         struct nvme_tcp_data_pdu *pdu = req->pdu;
1081         u8 hdgst = nvme_tcp_hdgst_len(queue);
1082         int len = sizeof(*pdu) - req->offset + hdgst;
1083         int ret;
1084
1085         if (queue->hdr_digest && !req->offset)
1086                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1087
1088         if (!req->h2cdata_left)
1089                 ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1090                                 offset_in_page(pdu) + req->offset, len,
1091                                 MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1092         else
1093                 ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
1094                                 offset_in_page(pdu) + req->offset, len,
1095                                 MSG_DONTWAIT | MSG_MORE);
1096         if (unlikely(ret <= 0))
1097                 return ret;
1098
1099         len -= ret;
1100         if (!len) {
1101                 req->state = NVME_TCP_SEND_DATA;
1102                 if (queue->data_digest)
1103                         crypto_ahash_init(queue->snd_hash);
1104                 return 1;
1105         }
1106         req->offset += ret;
1107
1108         return -EAGAIN;
1109 }
1110
1111 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1112 {
1113         struct nvme_tcp_queue *queue = req->queue;
1114         size_t offset = req->offset;
1115         u32 h2cdata_left = req->h2cdata_left;
1116         int ret;
1117         struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1118         struct kvec iov = {
1119                 .iov_base = (u8 *)&req->ddgst + req->offset,
1120                 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1121         };
1122
1123         if (nvme_tcp_queue_more(queue))
1124                 msg.msg_flags |= MSG_MORE;
1125         else
1126                 msg.msg_flags |= MSG_EOR;
1127
1128         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1129         if (unlikely(ret <= 0))
1130                 return ret;
1131
1132         if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
1133                 if (h2cdata_left)
1134                         nvme_tcp_setup_h2c_data_pdu(req);
1135                 else
1136                         nvme_tcp_done_send_req(queue);
1137                 return 1;
1138         }
1139
1140         req->offset += ret;
1141         return -EAGAIN;
1142 }
1143
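/*
 * Advance the current request through its send states
 * (CMD_PDU -> [H2C_PDU] -> DATA -> DDGST). Returns a positive value when
 * progress was made, 0 when there is nothing more to send right now, and a
 * negative errno on a send failure (-EAGAIN is folded into 0).
 */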
1144 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1145 {
1146         struct nvme_tcp_request *req;
1147         unsigned int noreclaim_flag;
1148         int ret = 1;
1149
1150         if (!queue->request) {
1151                 queue->request = nvme_tcp_fetch_request(queue);
1152                 if (!queue->request)
1153                         return 0;
1154         }
1155         req = queue->request;
1156
1157         noreclaim_flag = memalloc_noreclaim_save();
1158         if (req->state == NVME_TCP_SEND_CMD_PDU) {
1159                 ret = nvme_tcp_try_send_cmd_pdu(req);
1160                 if (ret <= 0)
1161                         goto done;
1162                 if (!nvme_tcp_has_inline_data(req))
1163                         goto out;
1164         }
1165
1166         if (req->state == NVME_TCP_SEND_H2C_PDU) {
1167                 ret = nvme_tcp_try_send_data_pdu(req);
1168                 if (ret <= 0)
1169                         goto done;
1170         }
1171
1172         if (req->state == NVME_TCP_SEND_DATA) {
1173                 ret = nvme_tcp_try_send_data(req);
1174                 if (ret <= 0)
1175                         goto done;
1176         }
1177
1178         if (req->state == NVME_TCP_SEND_DDGST)
1179                 ret = nvme_tcp_try_send_ddgst(req);
1180 done:
1181         if (ret == -EAGAIN) {
1182                 ret = 0;
1183         } else if (ret < 0) {
1184                 dev_err(queue->ctrl->ctrl.device,
1185                         "failed to send request %d\n", ret);
1186                 nvme_tcp_fail_request(queue->request);
1187                 nvme_tcp_done_send_req(queue);
1188         }
1189 out:
1190         memalloc_noreclaim_restore(noreclaim_flag);
1191         return ret;
1192 }
1193
1194 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1195 {
1196         struct socket *sock = queue->sock;
1197         struct sock *sk = sock->sk;
1198         read_descriptor_t rd_desc;
1199         int consumed;
1200
1201         rd_desc.arg.data = queue;
1202         rd_desc.count = 1;
1203         lock_sock(sk);
1204         queue->nr_cqe = 0;
1205         consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1206         release_sock(sk);
1207         return consumed;
1208 }
1209
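/*
 * Per-queue I/O worker: alternate between sending (under send_mutex) and
 * receiving for roughly one millisecond, then requeue itself once the time
 * quota is exhausted. Bails out early when no work is pending or the socket
 * was disabled after a receive error.
 */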
1210 static void nvme_tcp_io_work(struct work_struct *w)
1211 {
1212         struct nvme_tcp_queue *queue =
1213                 container_of(w, struct nvme_tcp_queue, io_work);
1214         unsigned long deadline = jiffies + msecs_to_jiffies(1);
1215
1216         do {
1217                 bool pending = false;
1218                 int result;
1219
1220                 if (mutex_trylock(&queue->send_mutex)) {
1221                         result = nvme_tcp_try_send(queue);
1222                         mutex_unlock(&queue->send_mutex);
1223                         if (result > 0)
1224                                 pending = true;
1225                         else if (unlikely(result < 0))
1226                                 break;
1227                 }
1228
1229                 result = nvme_tcp_try_recv(queue);
1230                 if (result > 0)
1231                         pending = true;
1232                 else if (unlikely(result < 0))
1233                         return;
1234
1235                 if (!pending || !queue->rd_enabled)
1236                         return;
1237
1238         } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1239
1240         queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1241 }
1242
1243 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1244 {
1245         struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1246
1247         ahash_request_free(queue->rcv_hash);
1248         ahash_request_free(queue->snd_hash);
1249         crypto_free_ahash(tfm);
1250 }
1251
1252 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1253 {
1254         struct crypto_ahash *tfm;
1255
1256         tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1257         if (IS_ERR(tfm))
1258                 return PTR_ERR(tfm);
1259
1260         queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1261         if (!queue->snd_hash)
1262                 goto free_tfm;
1263         ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1264
1265         queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1266         if (!queue->rcv_hash)
1267                 goto free_snd_hash;
1268         ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1269
1270         return 0;
1271 free_snd_hash:
1272         ahash_request_free(queue->snd_hash);
1273 free_tfm:
1274         crypto_free_ahash(tfm);
1275         return -ENOMEM;
1276 }
1277
1278 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1279 {
1280         struct nvme_tcp_request *async = &ctrl->async_req;
1281
1282         page_frag_free(async->pdu);
1283 }
1284
1285 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1286 {
1287         struct nvme_tcp_queue *queue = &ctrl->queues[0];
1288         struct nvme_tcp_request *async = &ctrl->async_req;
1289         u8 hdgst = nvme_tcp_hdgst_len(queue);
1290
1291         async->pdu = page_frag_alloc(&queue->pf_cache,
1292                 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1293                 GFP_KERNEL | __GFP_ZERO);
1294         if (!async->pdu)
1295                 return -ENOMEM;
1296
1297         async->queue = &ctrl->queues[0];
1298         return 0;
1299 }
1300
1301 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1302 {
1303         struct page *page;
1304         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1305         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1306         unsigned int noreclaim_flag;
1307
1308         if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1309                 return;
1310
1311         if (queue->hdr_digest || queue->data_digest)
1312                 nvme_tcp_free_crypto(queue);
1313
1314         if (queue->pf_cache.va) {
1315                 page = virt_to_head_page(queue->pf_cache.va);
1316                 __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
1317                 queue->pf_cache.va = NULL;
1318         }
1319
1320         noreclaim_flag = memalloc_noreclaim_save();
1321         sock_release(queue->sock);
1322         memalloc_noreclaim_restore(noreclaim_flag);
1323
1324         kfree(queue->pdu);
1325         mutex_destroy(&queue->send_mutex);
1326         mutex_destroy(&queue->queue_lock);
1327 }
1328
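/*
 * NVMe/TCP connection establishment: send an ICReq and validate the ICResp
 * (PDU type and length, PFV, digest settings, CPDA, MAXH2CDATA) before the
 * queue is used for command traffic.
 */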
1329 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1330 {
1331         struct nvme_tcp_icreq_pdu *icreq;
1332         struct nvme_tcp_icresp_pdu *icresp;
1333         struct msghdr msg = {};
1334         struct kvec iov;
1335         bool ctrl_hdgst, ctrl_ddgst;
1336         u32 maxh2cdata;
1337         int ret;
1338
1339         icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1340         if (!icreq)
1341                 return -ENOMEM;
1342
1343         icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1344         if (!icresp) {
1345                 ret = -ENOMEM;
1346                 goto free_icreq;
1347         }
1348
1349         icreq->hdr.type = nvme_tcp_icreq;
1350         icreq->hdr.hlen = sizeof(*icreq);
1351         icreq->hdr.pdo = 0;
1352         icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1353         icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1354         icreq->maxr2t = 0; /* single inflight r2t supported */
1355         icreq->hpda = 0; /* no alignment constraint */
1356         if (queue->hdr_digest)
1357                 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1358         if (queue->data_digest)
1359                 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1360
1361         iov.iov_base = icreq;
1362         iov.iov_len = sizeof(*icreq);
1363         ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1364         if (ret < 0)
1365                 goto free_icresp;
1366
1367         memset(&msg, 0, sizeof(msg));
1368         iov.iov_base = icresp;
1369         iov.iov_len = sizeof(*icresp);
1370         ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1371                         iov.iov_len, msg.msg_flags);
1372         if (ret < 0)
1373                 goto free_icresp;
1374
1375         ret = -EINVAL;
1376         if (icresp->hdr.type != nvme_tcp_icresp) {
1377                 pr_err("queue %d: bad type returned %d\n",
1378                         nvme_tcp_queue_id(queue), icresp->hdr.type);
1379                 goto free_icresp;
1380         }
1381
1382         if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1383                 pr_err("queue %d: bad pdu length returned %d\n",
1384                         nvme_tcp_queue_id(queue), icresp->hdr.plen);
1385                 goto free_icresp;
1386         }
1387
1388         if (icresp->pfv != NVME_TCP_PFV_1_0) {
1389                 pr_err("queue %d: bad pfv returned %d\n",
1390                         nvme_tcp_queue_id(queue), icresp->pfv);
1391                 goto free_icresp;
1392         }
1393
1394         ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1395         if ((queue->data_digest && !ctrl_ddgst) ||
1396             (!queue->data_digest && ctrl_ddgst)) {
1397                 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1398                         nvme_tcp_queue_id(queue),
1399                         queue->data_digest ? "enabled" : "disabled",
1400                         ctrl_ddgst ? "enabled" : "disabled");
1401                 goto free_icresp;
1402         }
1403
1404         ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1405         if ((queue->hdr_digest && !ctrl_hdgst) ||
1406             (!queue->hdr_digest && ctrl_hdgst)) {
1407                 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1408                         nvme_tcp_queue_id(queue),
1409                         queue->hdr_digest ? "enabled" : "disabled",
1410                         ctrl_hdgst ? "enabled" : "disabled");
1411                 goto free_icresp;
1412         }
1413
1414         if (icresp->cpda != 0) {
1415                 pr_err("queue %d: unsupported cpda returned %d\n",
1416                         nvme_tcp_queue_id(queue), icresp->cpda);
1417                 goto free_icresp;
1418         }
1419
1420         maxh2cdata = le32_to_cpu(icresp->maxdata);
1421         if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
1422                 pr_err("queue %d: invalid maxh2cdata returned %u\n",
1423                        nvme_tcp_queue_id(queue), maxh2cdata);
1424                 goto free_icresp;
1425         }
1426         queue->maxh2cdata = maxh2cdata;
1427
1428         ret = 0;
1429 free_icresp:
1430         kfree(icresp);
1431 free_icreq:
1432         kfree(icreq);
1433         return ret;
1434 }
1435
1436 static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1437 {
1438         return nvme_tcp_queue_id(queue) == 0;
1439 }
1440
1441 static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1442 {
1443         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1444         int qid = nvme_tcp_queue_id(queue);
1445
1446         return !nvme_tcp_admin_queue(queue) &&
1447                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1448 }
1449
1450 static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1451 {
1452         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1453         int qid = nvme_tcp_queue_id(queue);
1454
1455         return !nvme_tcp_admin_queue(queue) &&
1456                 !nvme_tcp_default_queue(queue) &&
1457                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1458                           ctrl->io_queues[HCTX_TYPE_READ];
1459 }
1460
1461 static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1462 {
1463         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1464         int qid = nvme_tcp_queue_id(queue);
1465
1466         return !nvme_tcp_admin_queue(queue) &&
1467                 !nvme_tcp_default_queue(queue) &&
1468                 !nvme_tcp_read_queue(queue) &&
1469                 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1470                           ctrl->io_queues[HCTX_TYPE_READ] +
1471                           ctrl->io_queues[HCTX_TYPE_POLL];
1472 }
1473
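/*
 * Pick the CPU that io_work runs on: the queue's offset within its group
 * (default, read or poll queues) is wrapped around the online CPU mask.
 */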
1474 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1475 {
1476         struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1477         int qid = nvme_tcp_queue_id(queue);
1478         int n = 0;
1479
1480         if (nvme_tcp_default_queue(queue))
1481                 n = qid - 1;
1482         else if (nvme_tcp_read_queue(queue))
1483                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1484         else if (nvme_tcp_poll_queue(queue))
1485                 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1486                                 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1487         queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1488 }
1489
1490 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
1491 {
1492         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1493         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1494         int ret, rcv_pdu_size;
1495
1496         mutex_init(&queue->queue_lock);
1497         queue->ctrl = ctrl;
1498         init_llist_head(&queue->req_list);
1499         INIT_LIST_HEAD(&queue->send_list);
1500         mutex_init(&queue->send_mutex);
1501         INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1502
1503         if (qid > 0)
1504                 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1505         else
1506                 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1507                                                 NVME_TCP_ADMIN_CCSZ;
1508
1509         ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1510                         IPPROTO_TCP, &queue->sock);
1511         if (ret) {
1512                 dev_err(nctrl->device,
1513                         "failed to create socket: %d\n", ret);
1514                 goto err_destroy_mutex;
1515         }
1516
1517         nvme_tcp_reclassify_socket(queue->sock);
1518
1519         /* Single syn retry */
1520         tcp_sock_set_syncnt(queue->sock->sk, 1);
1521
1522         /* Set TCP no delay */
1523         tcp_sock_set_nodelay(queue->sock->sk);
1524
1525         /*
1526          * Cleanup whatever is sitting in the TCP transmit queue on socket
1527          * close. This is done to prevent stale data from being sent should
1528          * the network connection be restored before TCP times out.
1529          */
1530         sock_no_linger(queue->sock->sk);
1531
1532         if (so_priority > 0)
1533                 sock_set_priority(queue->sock->sk, so_priority);
1534
1535         /* Set socket type of service */
1536         if (nctrl->opts->tos >= 0)
1537                 ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1538
1539         /* Set a 10 second timeout for icresp recvmsg */
1540         queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1541
1542         queue->sock->sk->sk_allocation = GFP_ATOMIC;
1543         queue->sock->sk->sk_use_task_frag = false;
1544         nvme_tcp_set_queue_io_cpu(queue);
1545         queue->request = NULL;
1546         queue->data_remaining = 0;
1547         queue->ddgst_remaining = 0;
1548         queue->pdu_remaining = 0;
1549         queue->pdu_offset = 0;
1550         sk_set_memalloc(queue->sock->sk);
1551
1552         if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1553                 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1554                         sizeof(ctrl->src_addr));
1555                 if (ret) {
1556                         dev_err(nctrl->device,
1557                                 "failed to bind queue %d socket %d\n",
1558                                 qid, ret);
1559                         goto err_sock;
1560                 }
1561         }
1562
1563         if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1564                 char *iface = nctrl->opts->host_iface;
1565                 sockptr_t optval = KERNEL_SOCKPTR(iface);
1566
1567                 ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1568                                       optval, strlen(iface));
1569                 if (ret) {
1570                         dev_err(nctrl->device,
1571                           "failed to bind to interface %s queue %d err %d\n",
1572                           iface, qid, ret);
1573                         goto err_sock;
1574                 }
1575         }
1576
1577         queue->hdr_digest = nctrl->opts->hdr_digest;
1578         queue->data_digest = nctrl->opts->data_digest;
1579         if (queue->hdr_digest || queue->data_digest) {
1580                 ret = nvme_tcp_alloc_crypto(queue);
1581                 if (ret) {
1582                         dev_err(nctrl->device,
1583                                 "failed to allocate queue %d crypto\n", qid);
1584                         goto err_sock;
1585                 }
1586         }
1587
1588         rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1589                         nvme_tcp_hdgst_len(queue);
1590         queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1591         if (!queue->pdu) {
1592                 ret = -ENOMEM;
1593                 goto err_crypto;
1594         }
1595
1596         dev_dbg(nctrl->device, "connecting queue %d\n",
1597                         nvme_tcp_queue_id(queue));
1598
1599         ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1600                 sizeof(ctrl->addr), 0);
1601         if (ret) {
1602                 dev_err(nctrl->device,
1603                         "failed to connect socket: %d\n", ret);
1604                 goto err_rcv_pdu;
1605         }
1606
1607         ret = nvme_tcp_init_connection(queue);
1608         if (ret)
1609                 goto err_init_connect;
1610
1611         queue->rd_enabled = true;
1612         set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1613         nvme_tcp_init_recv_ctx(queue);
1614
1615         write_lock_bh(&queue->sock->sk->sk_callback_lock);
1616         queue->sock->sk->sk_user_data = queue;
1617         queue->state_change = queue->sock->sk->sk_state_change;
1618         queue->data_ready = queue->sock->sk->sk_data_ready;
1619         queue->write_space = queue->sock->sk->sk_write_space;
1620         queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1621         queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1622         queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1623 #ifdef CONFIG_NET_RX_BUSY_POLL
1624         queue->sock->sk->sk_ll_usec = 1;
1625 #endif
1626         write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1627
1628         return 0;
1629
1630 err_init_connect:
1631         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1632 err_rcv_pdu:
1633         kfree(queue->pdu);
1634 err_crypto:
1635         if (queue->hdr_digest || queue->data_digest)
1636                 nvme_tcp_free_crypto(queue);
1637 err_sock:
1638         sock_release(queue->sock);
1639         queue->sock = NULL;
1640 err_destroy_mutex:
1641         mutex_destroy(&queue->send_mutex);
1642         mutex_destroy(&queue->queue_lock);
1643         return ret;
1644 }
1645
1646 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1647 {
1648         struct socket *sock = queue->sock;
1649
1650         write_lock_bh(&sock->sk->sk_callback_lock);
1651         sock->sk->sk_user_data  = NULL;
1652         sock->sk->sk_data_ready = queue->data_ready;
1653         sock->sk->sk_state_change = queue->state_change;
1654         sock->sk->sk_write_space  = queue->write_space;
1655         write_unlock_bh(&sock->sk->sk_callback_lock);
1656 }
1657
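/*
 * Shut the socket down both ways, restore the original socket callbacks
 * so no new work gets scheduled, and wait for in-flight io_work to
 * finish.
 */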
1658 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1659 {
1660         kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1661         nvme_tcp_restore_sock_calls(queue);
1662         cancel_work_sync(&queue->io_work);
1663 }
1664
1665 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1666 {
1667         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1668         struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1669
1670         if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1671                 return;
1672
1673         mutex_lock(&queue->queue_lock);
1674         if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1675                 __nvme_tcp_stop_queue(queue);
1676         mutex_unlock(&queue->queue_lock);
1677 }
1678
1679 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1680 {
1681         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1682         int ret;
1683
1684         if (idx)
1685                 ret = nvmf_connect_io_queue(nctrl, idx);
1686         else
1687                 ret = nvmf_connect_admin_queue(nctrl);
1688
1689         if (!ret) {
1690                 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1691         } else {
1692                 if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1693                         __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1694                 dev_err(nctrl->device,
1695                         "failed to connect queue: %d ret=%d\n", idx, ret);
1696         }
1697         return ret;
1698 }
1699
1700 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1701 {
1702         if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1703                 cancel_work_sync(&ctrl->async_event_work);
1704                 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1705                 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1706         }
1707
1708         nvme_tcp_free_queue(ctrl, 0);
1709 }
1710
1711 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1712 {
1713         int i;
1714
1715         for (i = 1; i < ctrl->queue_count; i++)
1716                 nvme_tcp_free_queue(ctrl, i);
1717 }
1718
1719 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1720 {
1721         int i;
1722
1723         for (i = 1; i < ctrl->queue_count; i++)
1724                 nvme_tcp_stop_queue(ctrl, i);
1725 }
1726
1727 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
1728                                     int first, int last)
1729 {
1730         int i, ret;
1731
1732         for (i = first; i < last; i++) {
1733                 ret = nvme_tcp_start_queue(ctrl, i);
1734                 if (ret)
1735                         goto out_stop_queues;
1736         }
1737
1738         return 0;
1739
1740 out_stop_queues:
1741         for (i--; i >= first; i--)
1742                 nvme_tcp_stop_queue(ctrl, i);
1743         return ret;
1744 }
1745
1746 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1747 {
1748         int ret;
1749
1750         ret = nvme_tcp_alloc_queue(ctrl, 0);
1751         if (ret)
1752                 return ret;
1753
1754         ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1755         if (ret)
1756                 goto out_free_queue;
1757
1758         return 0;
1759
1760 out_free_queue:
1761         nvme_tcp_free_queue(ctrl, 0);
1762         return ret;
1763 }
1764
1765 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1766 {
1767         int i, ret;
1768
1769         for (i = 1; i < ctrl->queue_count; i++) {
1770                 ret = nvme_tcp_alloc_queue(ctrl, i);
1771                 if (ret)
1772                         goto out_free_queues;
1773         }
1774
1775         return 0;
1776
1777 out_free_queues:
1778         for (i--; i >= 1; i--)
1779                 nvme_tcp_free_queue(ctrl, i);
1780
1781         return ret;
1782 }
1783
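/*
 * Cap each requested queue class at the number of online CPUs; the sum
 * is what nvme_set_queue_count() will request from the controller.
 */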
1784 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1785 {
1786         unsigned int nr_io_queues;
1787
1788         nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1789         nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1790         nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1791
1792         return nr_io_queues;
1793 }
1794
1795 static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1796                 unsigned int nr_io_queues)
1797 {
1798         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1799         struct nvmf_ctrl_options *opts = nctrl->opts;
1800
1801         if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1802                 /*
1803                  * separate read/write queues:
1804                  * hand out dedicated default queues only after we have
1805                  * sufficient read queues.
1806                  */
1807                 ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1808                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1809                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1810                         min(opts->nr_write_queues, nr_io_queues);
1811                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1812         } else {
1813                 /*
1814                  * shared read/write queues:
1815                  * either no write queues were requested, or we don't have
1816                  * sufficient queue count to have dedicated default queues.
1817                  */
1818                 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1819                         min(opts->nr_io_queues, nr_io_queues);
1820                 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1821         }
1822
1823         if (opts->nr_poll_queues && nr_io_queues) {
1824                 /* map dedicated poll queues only if we have queues left */
1825                 ctrl->io_queues[HCTX_TYPE_POLL] =
1826                         min(opts->nr_poll_queues, nr_io_queues);
1827         }
1828 }
1829
1830 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1831 {
1832         unsigned int nr_io_queues;
1833         int ret;
1834
1835         nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1836         ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1837         if (ret)
1838                 return ret;
1839
1840         if (nr_io_queues == 0) {
1841                 dev_err(ctrl->device,
1842                         "unable to set any I/O queues\n");
1843                 return -ENOMEM;
1844         }
1845
1846         ctrl->queue_count = nr_io_queues + 1;
1847         dev_info(ctrl->device,
1848                 "creating %d I/O queues.\n", nr_io_queues);
1849
1850         nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1851
1852         return __nvme_tcp_alloc_io_queues(ctrl);
1853 }
1854
1855 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1856 {
1857         nvme_tcp_stop_io_queues(ctrl);
1858         if (remove)
1859                 nvme_remove_io_tag_set(ctrl);
1860         nvme_tcp_free_io_queues(ctrl);
1861 }
1862
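/*
 * Bring up the I/O queues: allocate them, allocate the I/O tag set on
 * first-time setup, start the queues covered by the current tag set,
 * and on reconnect unfreeze and resize the tag set before starting any
 * queues that were added since the last connection.
 */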
1863 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1864 {
1865         int ret, nr_queues;
1866
1867         ret = nvme_tcp_alloc_io_queues(ctrl);
1868         if (ret)
1869                 return ret;
1870
1871         if (new) {
1872                 ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
1873                                 &nvme_tcp_mq_ops,
1874                                 ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
1875                                 sizeof(struct nvme_tcp_request));
1876                 if (ret)
1877                         goto out_free_io_queues;
1878         }
1879
1880         /*
1881          * Only start IO queues for which we have allocated the tagset
1882          * and limited it to the available queues. On reconnects, the
1883          * queue number might have changed.
1884          */
1885         nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
1886         ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
1887         if (ret)
1888                 goto out_cleanup_connect_q;
1889
1890         if (!new) {
1891                 nvme_unquiesce_io_queues(ctrl);
1892                 if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
1893                         /*
1894                          * If we timed out waiting for freeze we are likely to
1895                          * be stuck.  Fail the controller initialization just
1896                          * to be safe.
1897                          */
1898                         ret = -ENODEV;
1899                         goto out_wait_freeze_timed_out;
1900                 }
1901                 blk_mq_update_nr_hw_queues(ctrl->tagset,
1902                         ctrl->queue_count - 1);
1903                 nvme_unfreeze(ctrl);
1904         }
1905
1906         /*
1907          * If the number of queues has increased (reconnect case),
1908          * start all new queues now.
1909          */
1910         ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
1911                                        ctrl->tagset->nr_hw_queues + 1);
1912         if (ret)
1913                 goto out_wait_freeze_timed_out;
1914
1915         return 0;
1916
1917 out_wait_freeze_timed_out:
1918         nvme_quiesce_io_queues(ctrl);
1919         nvme_sync_io_queues(ctrl);
1920         nvme_tcp_stop_io_queues(ctrl);
1921 out_cleanup_connect_q:
1922         nvme_cancel_tagset(ctrl);
1923         if (new)
1924                 nvme_remove_io_tag_set(ctrl);
1925 out_free_io_queues:
1926         nvme_tcp_free_io_queues(ctrl);
1927         return ret;
1928 }
1929
1930 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1931 {
1932         nvme_tcp_stop_queue(ctrl, 0);
1933         if (remove)
1934                 nvme_remove_admin_tag_set(ctrl);
1935         nvme_tcp_free_admin_queue(ctrl);
1936 }
1937
1938 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1939 {
1940         int error;
1941
1942         error = nvme_tcp_alloc_admin_queue(ctrl);
1943         if (error)
1944                 return error;
1945
1946         if (new) {
1947                 error = nvme_alloc_admin_tag_set(ctrl,
1948                                 &to_tcp_ctrl(ctrl)->admin_tag_set,
1949                                 &nvme_tcp_admin_mq_ops,
1950                                 sizeof(struct nvme_tcp_request));
1951                 if (error)
1952                         goto out_free_queue;
1953         }
1954
1955         error = nvme_tcp_start_queue(ctrl, 0);
1956         if (error)
1957                 goto out_cleanup_tagset;
1958
1959         error = nvme_enable_ctrl(ctrl);
1960         if (error)
1961                 goto out_stop_queue;
1962
1963         nvme_unquiesce_admin_queue(ctrl);
1964
1965         error = nvme_init_ctrl_finish(ctrl, false);
1966         if (error)
1967                 goto out_quiesce_queue;
1968
1969         return 0;
1970
1971 out_quiesce_queue:
1972         nvme_quiesce_admin_queue(ctrl);
1973         blk_sync_queue(ctrl->admin_q);
1974 out_stop_queue:
1975         nvme_tcp_stop_queue(ctrl, 0);
1976         nvme_cancel_admin_tagset(ctrl);
1977 out_cleanup_tagset:
1978         if (new)
1979                 nvme_remove_admin_tag_set(ctrl);
1980 out_free_queue:
1981         nvme_tcp_free_admin_queue(ctrl);
1982         return error;
1983 }
1984
1985 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1986                 bool remove)
1987 {
1988         nvme_quiesce_admin_queue(ctrl);
1989         blk_sync_queue(ctrl->admin_q);
1990         nvme_tcp_stop_queue(ctrl, 0);
1991         nvme_cancel_admin_tagset(ctrl);
1992         if (remove)
1993                 nvme_unquiesce_admin_queue(ctrl);
1994         nvme_tcp_destroy_admin_queue(ctrl, remove);
1995 }
1996
1997 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1998                 bool remove)
1999 {
2000         if (ctrl->queue_count <= 1)
2001                 return;
2002         nvme_quiesce_admin_queue(ctrl);
2003         nvme_start_freeze(ctrl);
2004         nvme_quiesce_io_queues(ctrl);
2005         nvme_sync_io_queues(ctrl);
2006         nvme_tcp_stop_io_queues(ctrl);
2007         nvme_cancel_tagset(ctrl);
2008         if (remove)
2009                 nvme_unquiesce_io_queues(ctrl);
2010         nvme_tcp_destroy_io_queues(ctrl, remove);
2011 }
2012
2013 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
2014 {
2015         /* If we are resetting/deleting then do nothing */
2016         if (ctrl->state != NVME_CTRL_CONNECTING) {
2017                 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
2018                         ctrl->state == NVME_CTRL_LIVE);
2019                 return;
2020         }
2021
2022         if (nvmf_should_reconnect(ctrl)) {
2023                 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
2024                         ctrl->opts->reconnect_delay);
2025                 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
2026                                 ctrl->opts->reconnect_delay * HZ);
2027         } else {
2028                 dev_info(ctrl->device, "Removing controller...\n");
2029                 nvme_delete_ctrl(ctrl);
2030         }
2031 }
2032
2033 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
2034 {
2035         struct nvmf_ctrl_options *opts = ctrl->opts;
2036         int ret;
2037
2038         ret = nvme_tcp_configure_admin_queue(ctrl, new);
2039         if (ret)
2040                 return ret;
2041
2042         if (ctrl->icdoff) {
2043                 ret = -EOPNOTSUPP;
2044                 dev_err(ctrl->device, "icdoff is not supported!\n");
2045                 goto destroy_admin;
2046         }
2047
2048         if (!nvme_ctrl_sgl_supported(ctrl)) {
2049                 ret = -EOPNOTSUPP;
2050                 dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
2051                 goto destroy_admin;
2052         }
2053
2054         if (opts->queue_size > ctrl->sqsize + 1)
2055                 dev_warn(ctrl->device,
2056                         "queue_size %zu > ctrl sqsize %u, clamping down\n",
2057                         opts->queue_size, ctrl->sqsize + 1);
2058
2059         if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2060                 dev_warn(ctrl->device,
2061                         "sqsize %u > ctrl maxcmd %u, clamping down\n",
2062                         ctrl->sqsize + 1, ctrl->maxcmd);
2063                 ctrl->sqsize = ctrl->maxcmd - 1;
2064         }
2065
2066         if (ctrl->queue_count > 1) {
2067                 ret = nvme_tcp_configure_io_queues(ctrl, new);
2068                 if (ret)
2069                         goto destroy_admin;
2070         }
2071
2072         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2073                 /*
2074                  * state change failure is ok if we started ctrl delete,
2075          * unless we're in the middle of creating a new controller,
2076          * in which case we must avoid racing with the teardown flow.
2077                  */
2078                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2079                              ctrl->state != NVME_CTRL_DELETING_NOIO);
2080                 WARN_ON_ONCE(new);
2081                 ret = -EINVAL;
2082                 goto destroy_io;
2083         }
2084
2085         nvme_start_ctrl(ctrl);
2086         return 0;
2087
2088 destroy_io:
2089         if (ctrl->queue_count > 1) {
2090                 nvme_quiesce_io_queues(ctrl);
2091                 nvme_sync_io_queues(ctrl);
2092                 nvme_tcp_stop_io_queues(ctrl);
2093                 nvme_cancel_tagset(ctrl);
2094                 nvme_tcp_destroy_io_queues(ctrl, new);
2095         }
2096 destroy_admin:
2097         nvme_quiesce_admin_queue(ctrl);
2098         blk_sync_queue(ctrl->admin_q);
2099         nvme_tcp_stop_queue(ctrl, 0);
2100         nvme_cancel_admin_tagset(ctrl);
2101         nvme_tcp_destroy_admin_queue(ctrl, new);
2102         return ret;
2103 }
2104
2105 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2106 {
2107         struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2108                         struct nvme_tcp_ctrl, connect_work);
2109         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2110
2111         ++ctrl->nr_reconnects;
2112
2113         if (nvme_tcp_setup_ctrl(ctrl, false))
2114                 goto requeue;
2115
2116         dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
2117                         ctrl->nr_reconnects);
2118
2119         ctrl->nr_reconnects = 0;
2120
2121         return;
2122
2123 requeue:
2124         dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2125                         ctrl->nr_reconnects);
2126         nvme_tcp_reconnect_or_remove(ctrl);
2127 }
2128
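/*
 * Error recovery: stop keep-alive and the AER work, tear down the I/O
 * and admin queues while letting pending requests fail fast, then move
 * to CONNECTING and either schedule a reconnect or remove the
 * controller.
 */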
2129 static void nvme_tcp_error_recovery_work(struct work_struct *work)
2130 {
2131         struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2132                                 struct nvme_tcp_ctrl, err_work);
2133         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2134
2135         nvme_stop_keep_alive(ctrl);
2136         flush_work(&ctrl->async_event_work);
2137         nvme_tcp_teardown_io_queues(ctrl, false);
2138         /* unquiesce so that pending requests fail fast */
2139         nvme_unquiesce_io_queues(ctrl);
2140         nvme_tcp_teardown_admin_queue(ctrl, false);
2141         nvme_unquiesce_admin_queue(ctrl);
2142         nvme_auth_stop(ctrl);
2143
2144         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2145                 /* state change failure is ok if we started ctrl delete */
2146                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2147                              ctrl->state != NVME_CTRL_DELETING_NOIO);
2148                 return;
2149         }
2150
2151         nvme_tcp_reconnect_or_remove(ctrl);
2152 }
2153
2154 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2155 {
2156         nvme_tcp_teardown_io_queues(ctrl, shutdown);
2157         nvme_quiesce_admin_queue(ctrl);
2158         nvme_disable_ctrl(ctrl, shutdown);
2159         nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2160 }
2161
2162 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2163 {
2164         nvme_tcp_teardown_ctrl(ctrl, true);
2165 }
2166
2167 static void nvme_reset_ctrl_work(struct work_struct *work)
2168 {
2169         struct nvme_ctrl *ctrl =
2170                 container_of(work, struct nvme_ctrl, reset_work);
2171
2172         nvme_stop_ctrl(ctrl);
2173         nvme_tcp_teardown_ctrl(ctrl, false);
2174
2175         if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2176                 /* state change failure is ok if we started ctrl delete */
2177                 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2178                              ctrl->state != NVME_CTRL_DELETING_NOIO);
2179                 return;
2180         }
2181
2182         if (nvme_tcp_setup_ctrl(ctrl, false))
2183                 goto out_fail;
2184
2185         return;
2186
2187 out_fail:
2188         ++ctrl->nr_reconnects;
2189         nvme_tcp_reconnect_or_remove(ctrl);
2190 }
2191
2192 static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
2193 {
2194         flush_work(&to_tcp_ctrl(ctrl)->err_work);
2195         cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2196 }
2197
2198 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2199 {
2200         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2201
2202         if (list_empty(&ctrl->list))
2203                 goto free_ctrl;
2204
2205         mutex_lock(&nvme_tcp_ctrl_mutex);
2206         list_del(&ctrl->list);
2207         mutex_unlock(&nvme_tcp_ctrl_mutex);
2208
2209         nvmf_free_options(nctrl->opts);
2210 free_ctrl:
2211         kfree(ctrl->queues);
2212         kfree(ctrl);
2213 }
2214
2215 static void nvme_tcp_set_sg_null(struct nvme_command *c)
2216 {
2217         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2218
2219         sg->addr = 0;
2220         sg->length = 0;
2221         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2222                         NVME_SGL_FMT_TRANSPORT_A;
2223 }
2224
2225 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2226                 struct nvme_command *c, u32 data_len)
2227 {
2228         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2229
2230         sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2231         sg->length = cpu_to_le32(data_len);
2232         sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2233 }
2234
2235 static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2236                 u32 data_len)
2237 {
2238         struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2239
2240         sg->addr = 0;
2241         sg->length = cpu_to_le32(data_len);
2242         sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2243                         NVME_SGL_FMT_TRANSPORT_A;
2244 }
2245
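/*
 * Build a bare command capsule for the Asynchronous Event Request on
 * the admin queue: no data, a NULL SGL and the command id reserved for
 * AERs (NVME_AQ_BLK_MQ_DEPTH), then queue it for transmission.
 */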
2246 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2247 {
2248         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2249         struct nvme_tcp_queue *queue = &ctrl->queues[0];
2250         struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2251         struct nvme_command *cmd = &pdu->cmd;
2252         u8 hdgst = nvme_tcp_hdgst_len(queue);
2253
2254         memset(pdu, 0, sizeof(*pdu));
2255         pdu->hdr.type = nvme_tcp_cmd;
2256         if (queue->hdr_digest)
2257                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2258         pdu->hdr.hlen = sizeof(*pdu);
2259         pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2260
2261         cmd->common.opcode = nvme_admin_async_event;
2262         cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2263         cmd->common.flags |= NVME_CMD_SGL_METABUF;
2264         nvme_tcp_set_sg_null(cmd);
2265
2266         ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2267         ctrl->async_req.offset = 0;
2268         ctrl->async_req.curr_bio = NULL;
2269         ctrl->async_req.data_len = 0;
2270
2271         nvme_tcp_queue_request(&ctrl->async_req, true, true);
2272 }
2273
2274 static void nvme_tcp_complete_timed_out(struct request *rq)
2275 {
2276         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2277         struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2278
2279         nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2280         nvmf_complete_timed_out_request(rq);
2281 }
2282
2283 static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
2284 {
2285         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2286         struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2287         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2288         u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
2289         int qid = nvme_tcp_queue_id(req->queue);
2290
2291         dev_warn(ctrl->device,
2292                 "queue %d: timeout cid %#x type %d opcode %#x (%s)\n",
2293                 nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type,
2294                 opc, nvme_opcode_str(qid, opc, fctype));
2295
2296         if (ctrl->state != NVME_CTRL_LIVE) {
2297                 /*
2298                  * If we are resetting, connecting or deleting we should
2299                  * complete immediately because we may block the controller
2300                  * teardown or setup sequence:
2301                  * - ctrl disable/shutdown fabrics requests
2302                  * - connect requests
2303                  * - initialization admin requests
2304                  * - I/O requests that entered after unquiescing and
2305                  *   the controller stopped responding
2306                  *
2307                  * All other requests should be cancelled by the error
2308                  * recovery work, so it's fine that we fail it here.
2309                  */
2310                 nvme_tcp_complete_timed_out(rq);
2311                 return BLK_EH_DONE;
2312         }
2313
2314         /*
2315          * LIVE state should trigger the normal error recovery which will
2316          * handle completing this request.
2317          */
2318         nvme_tcp_error_recovery(ctrl);
2319         return BLK_EH_RESET_TIMER;
2320 }
2321
2322 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2323                         struct request *rq)
2324 {
2325         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2326         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2327         struct nvme_command *c = &pdu->cmd;
2328
2329         c->common.flags |= NVME_CMD_SGL_METABUF;
2330
2331         if (!blk_rq_nr_phys_segments(rq))
2332                 nvme_tcp_set_sg_null(c);
2333         else if (rq_data_dir(rq) == WRITE &&
2334             req->data_len <= nvme_tcp_inline_data_size(req))
2335                 nvme_tcp_set_sg_inline(queue, c, req->data_len);
2336         else
2337                 nvme_tcp_set_sg_host_data(c, req->data_len);
2338
2339         return 0;
2340 }
2341
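/*
 * Prepare the command PDU for a request: set up the NVMe command via
 * nvme_setup_cmd(), reset the per-request send state, and size the PDU
 * header (hlen/pdo/plen) to cover the header/data digests and any write
 * data small enough to be sent inline with the capsule.
 */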
2342 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2343                 struct request *rq)
2344 {
2345         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2346         struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2347         struct nvme_tcp_queue *queue = req->queue;
2348         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2349         blk_status_t ret;
2350
2351         ret = nvme_setup_cmd(ns, rq);
2352         if (ret)
2353                 return ret;
2354
2355         req->state = NVME_TCP_SEND_CMD_PDU;
2356         req->status = cpu_to_le16(NVME_SC_SUCCESS);
2357         req->offset = 0;
2358         req->data_sent = 0;
2359         req->pdu_len = 0;
2360         req->pdu_sent = 0;
2361         req->h2cdata_left = 0;
2362         req->data_len = blk_rq_nr_phys_segments(rq) ?
2363                                 blk_rq_payload_bytes(rq) : 0;
2364         req->curr_bio = rq->bio;
2365         if (req->curr_bio && req->data_len)
2366                 nvme_tcp_init_iter(req, rq_data_dir(rq));
2367
2368         if (rq_data_dir(rq) == WRITE &&
2369             req->data_len <= nvme_tcp_inline_data_size(req))
2370                 req->pdu_len = req->data_len;
2371
2372         pdu->hdr.type = nvme_tcp_cmd;
2373         pdu->hdr.flags = 0;
2374         if (queue->hdr_digest)
2375                 pdu->hdr.flags |= NVME_TCP_F_HDGST;
2376         if (queue->data_digest && req->pdu_len) {
2377                 pdu->hdr.flags |= NVME_TCP_F_DDGST;
2378                 ddgst = nvme_tcp_ddgst_len(queue);
2379         }
2380         pdu->hdr.hlen = sizeof(*pdu);
2381         pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2382         pdu->hdr.plen =
2383                 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2384
2385         ret = nvme_tcp_map_data(queue, rq);
2386         if (unlikely(ret)) {
2387                 nvme_cleanup_cmd(rq);
2388                 dev_err(queue->ctrl->ctrl.device,
2389                         "Failed to map data (%d)\n", ret);
2390                 return ret;
2391         }
2392
2393         return 0;
2394 }
2395
2396 static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2397 {
2398         struct nvme_tcp_queue *queue = hctx->driver_data;
2399
2400         if (!llist_empty(&queue->req_list))
2401                 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2402 }
2403
2404 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2405                 const struct blk_mq_queue_data *bd)
2406 {
2407         struct nvme_ns *ns = hctx->queue->queuedata;
2408         struct nvme_tcp_queue *queue = hctx->driver_data;
2409         struct request *rq = bd->rq;
2410         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2411         bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2412         blk_status_t ret;
2413
2414         if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2415                 return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2416
2417         ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2418         if (unlikely(ret))
2419                 return ret;
2420
2421         nvme_start_request(rq);
2422
2423         nvme_tcp_queue_request(req, true, bd->last);
2424
2425         return BLK_STS_OK;
2426 }
2427
2428 static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2429 {
2430         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
2431         struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2432
2433         if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2434                 /* separate read/write queues */
2435                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2436                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2437                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2438                 set->map[HCTX_TYPE_READ].nr_queues =
2439                         ctrl->io_queues[HCTX_TYPE_READ];
2440                 set->map[HCTX_TYPE_READ].queue_offset =
2441                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2442         } else {
2443                 /* shared read/write queues */
2444                 set->map[HCTX_TYPE_DEFAULT].nr_queues =
2445                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2446                 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2447                 set->map[HCTX_TYPE_READ].nr_queues =
2448                         ctrl->io_queues[HCTX_TYPE_DEFAULT];
2449                 set->map[HCTX_TYPE_READ].queue_offset = 0;
2450         }
2451         blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2452         blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2453
2454         if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2455                 /* map dedicated poll queues only if we have queues left */
2456                 set->map[HCTX_TYPE_POLL].nr_queues =
2457                                 ctrl->io_queues[HCTX_TYPE_POLL];
2458                 set->map[HCTX_TYPE_POLL].queue_offset =
2459                         ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2460                         ctrl->io_queues[HCTX_TYPE_READ];
2461                 blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2462         }
2463
2464         dev_info(ctrl->ctrl.device,
2465                 "mapped %d/%d/%d default/read/poll queues.\n",
2466                 ctrl->io_queues[HCTX_TYPE_DEFAULT],
2467                 ctrl->io_queues[HCTX_TYPE_READ],
2468                 ctrl->io_queues[HCTX_TYPE_POLL]);
2469 }
2470
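/*
 * blk-mq poll callback for poll queues: busy-poll the socket while its
 * receive queue is empty, then reap whatever PDUs have arrived and
 * return the number of completions processed.
 */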
2471 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
2472 {
2473         struct nvme_tcp_queue *queue = hctx->driver_data;
2474         struct sock *sk = queue->sock->sk;
2475
2476         if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2477                 return 0;
2478
2479         set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2480         if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2481                 sk_busy_loop(sk, true);
2482         nvme_tcp_try_recv(queue);
2483         clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2484         return queue->nr_cqe;
2485 }
2486
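/*
 * sysfs "address" helper: report the fabrics address and, if the admin
 * queue is live, append the local source address of its socket.
 */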
2487 static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
2488 {
2489         struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
2490         struct sockaddr_storage src_addr;
2491         int ret, len;
2492
2493         len = nvmf_get_address(ctrl, buf, size);
2494
2495         mutex_lock(&queue->queue_lock);
2496
2497         if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2498                 goto done;
2499         ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
2500         if (ret > 0) {
2501                 if (len > 0)
2502                         len--; /* strip trailing newline */
2503                 len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
2504                                 (len) ? "," : "", &src_addr);
2505         }
2506 done:
2507         mutex_unlock(&queue->queue_lock);
2508
2509         return len;
2510 }
2511
2512 static const struct blk_mq_ops nvme_tcp_mq_ops = {
2513         .queue_rq       = nvme_tcp_queue_rq,
2514         .commit_rqs     = nvme_tcp_commit_rqs,
2515         .complete       = nvme_complete_rq,
2516         .init_request   = nvme_tcp_init_request,
2517         .exit_request   = nvme_tcp_exit_request,
2518         .init_hctx      = nvme_tcp_init_hctx,
2519         .timeout        = nvme_tcp_timeout,
2520         .map_queues     = nvme_tcp_map_queues,
2521         .poll           = nvme_tcp_poll,
2522 };
2523
2524 static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2525         .queue_rq       = nvme_tcp_queue_rq,
2526         .complete       = nvme_complete_rq,
2527         .init_request   = nvme_tcp_init_request,
2528         .exit_request   = nvme_tcp_exit_request,
2529         .init_hctx      = nvme_tcp_init_admin_hctx,
2530         .timeout        = nvme_tcp_timeout,
2531 };
2532
2533 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2534         .name                   = "tcp",
2535         .module                 = THIS_MODULE,
2536         .flags                  = NVME_F_FABRICS | NVME_F_BLOCKING,
2537         .reg_read32             = nvmf_reg_read32,
2538         .reg_read64             = nvmf_reg_read64,
2539         .reg_write32            = nvmf_reg_write32,
2540         .free_ctrl              = nvme_tcp_free_ctrl,
2541         .submit_async_event     = nvme_tcp_submit_async_event,
2542         .delete_ctrl            = nvme_tcp_delete_ctrl,
2543         .get_address            = nvme_tcp_get_address,
2544         .stop_ctrl              = nvme_tcp_stop_ctrl,
2545 };
2546
2547 static bool
2548 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2549 {
2550         struct nvme_tcp_ctrl *ctrl;
2551         bool found = false;
2552
2553         mutex_lock(&nvme_tcp_ctrl_mutex);
2554         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2555                 found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2556                 if (found)
2557                         break;
2558         }
2559         mutex_unlock(&nvme_tcp_ctrl_mutex);
2560
2561         return found;
2562 }
2563
2564 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2565                 struct nvmf_ctrl_options *opts)
2566 {
2567         struct nvme_tcp_ctrl *ctrl;
2568         int ret;
2569
2570         ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2571         if (!ctrl)
2572                 return ERR_PTR(-ENOMEM);
2573
2574         INIT_LIST_HEAD(&ctrl->list);
2575         ctrl->ctrl.opts = opts;
2576         ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2577                                 opts->nr_poll_queues + 1;
2578         ctrl->ctrl.sqsize = opts->queue_size - 1;
2579         ctrl->ctrl.kato = opts->kato;
2580
2581         INIT_DELAYED_WORK(&ctrl->connect_work,
2582                         nvme_tcp_reconnect_ctrl_work);
2583         INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2584         INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2585
2586         if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2587                 opts->trsvcid =
2588                         kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2589                 if (!opts->trsvcid) {
2590                         ret = -ENOMEM;
2591                         goto out_free_ctrl;
2592                 }
2593                 opts->mask |= NVMF_OPT_TRSVCID;
2594         }
2595
2596         ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2597                         opts->traddr, opts->trsvcid, &ctrl->addr);
2598         if (ret) {
2599                 pr_err("malformed address passed: %s:%s\n",
2600                         opts->traddr, opts->trsvcid);
2601                 goto out_free_ctrl;
2602         }
2603
2604         if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2605                 ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2606                         opts->host_traddr, NULL, &ctrl->src_addr);
2607                 if (ret) {
2608                         pr_err("malformed src address passed: %s\n",
2609                                opts->host_traddr);
2610                         goto out_free_ctrl;
2611                 }
2612         }
2613
2614         if (opts->mask & NVMF_OPT_HOST_IFACE) {
2615                 if (!__dev_get_by_name(&init_net, opts->host_iface)) {
2616                         pr_err("invalid interface passed: %s\n",
2617                                opts->host_iface);
2618                         ret = -ENODEV;
2619                         goto out_free_ctrl;
2620                 }
2621         }
2622
2623         if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2624                 ret = -EALREADY;
2625                 goto out_free_ctrl;
2626         }
2627
2628         ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2629                                 GFP_KERNEL);
2630         if (!ctrl->queues) {
2631                 ret = -ENOMEM;
2632                 goto out_free_ctrl;
2633         }
2634
2635         ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2636         if (ret)
2637                 goto out_kfree_queues;
2638
2639         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2640                 WARN_ON_ONCE(1);
2641                 ret = -EINTR;
2642                 goto out_uninit_ctrl;
2643         }
2644
2645         ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2646         if (ret)
2647                 goto out_uninit_ctrl;
2648
2649         dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2650                 nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
2651
2652         mutex_lock(&nvme_tcp_ctrl_mutex);
2653         list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2654         mutex_unlock(&nvme_tcp_ctrl_mutex);
2655
2656         return &ctrl->ctrl;
2657
2658 out_uninit_ctrl:
2659         nvme_uninit_ctrl(&ctrl->ctrl);
2660         nvme_put_ctrl(&ctrl->ctrl);
2661         if (ret > 0)
2662                 ret = -EIO;
2663         return ERR_PTR(ret);
2664 out_kfree_queues:
2665         kfree(ctrl->queues);
2666 out_free_ctrl:
2667         kfree(ctrl);
2668         return ERR_PTR(ret);
2669 }
2670
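/*
 * With this transport registered, a controller is typically created
 * from user space via nvme-cli, e.g. (hypothetical addresses):
 *
 *   nvme connect -t tcp -a 192.168.0.10 -s 4420 \
 *           -n nqn.2014-08.org.example:subsys1
 *
 * nvme-cli writes the corresponding options to /dev/nvme-fabrics,
 * which ends up calling nvme_tcp_create_ctrl() above.
 */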
2671 static struct nvmf_transport_ops nvme_tcp_transport = {
2672         .name           = "tcp",
2673         .module         = THIS_MODULE,
2674         .required_opts  = NVMF_OPT_TRADDR,
2675         .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2676                           NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2677                           NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2678                           NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2679                           NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
2680         .create_ctrl    = nvme_tcp_create_ctrl,
2681 };
2682
2683 static int __init nvme_tcp_init_module(void)
2684 {
2685         nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2686                         WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2687         if (!nvme_tcp_wq)
2688                 return -ENOMEM;
2689
2690         nvmf_register_transport(&nvme_tcp_transport);
2691         return 0;
2692 }
2693
2694 static void __exit nvme_tcp_cleanup_module(void)
2695 {
2696         struct nvme_tcp_ctrl *ctrl;
2697
2698         nvmf_unregister_transport(&nvme_tcp_transport);
2699
2700         mutex_lock(&nvme_tcp_ctrl_mutex);
2701         list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2702                 nvme_delete_ctrl(&ctrl->ctrl);
2703         mutex_unlock(&nvme_tcp_ctrl_mutex);
2704         flush_workqueue(nvme_delete_wq);
2705
2706         destroy_workqueue(nvme_tcp_wq);
2707 }
2708
2709 module_init(nvme_tcp_init_module);
2710 module_exit(nvme_tcp_cleanup_module);
2711
2712 MODULE_LICENSE("GPL v2");