// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/llist.h>
#include <asm/barrier.h>
#include <net/tcp.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
	[SIW_QP_STATE_IDLE] = "IDLE",
	[SIW_QP_STATE_RTR] = "RTR",
	[SIW_QP_STATE_RTS] = "RTS",
	[SIW_QP_STATE_CLOSING] = "CLOSING",
	[SIW_QP_STATE_TERMINATE] = "TERMINATE",
	[SIW_QP_STATE_ERROR] = "ERROR"
};

/*
 * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
 * per-RDMAP message basis. Please keep the order of the initializers. Each
 * MPA length is initialized to the minimum packet size.
 */
struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
	{ /* RDMAP_RDMA_WRITE */
	  .hdr_len = sizeof(struct iwarp_rdma_write),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
				 cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_RDMA_WRITE),
	  .rx_data = siw_proc_write },
	{ /* RDMAP_RDMA_READ_REQ */
	  .hdr_len = sizeof(struct iwarp_rdma_rreq),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_RDMA_READ_REQ),
	  .rx_data = siw_proc_rreq },
	{ /* RDMAP_RDMA_READ_RESP */
	  .hdr_len = sizeof(struct iwarp_rdma_rresp),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
				 cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_RDMA_READ_RESP),
	  .rx_data = siw_proc_rresp },
	{ /* RDMAP_SEND */
	  .hdr_len = sizeof(struct iwarp_send),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_SEND),
	  .rx_data = siw_proc_send },
	{ /* RDMAP_SEND_INVAL */
	  .hdr_len = sizeof(struct iwarp_send_inv),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_SEND_INVAL),
	  .rx_data = siw_proc_send },
	{ /* RDMAP_SEND_SE */
	  .hdr_len = sizeof(struct iwarp_send),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_SEND_SE),
	  .rx_data = siw_proc_send },
	{ /* RDMAP_SEND_SE_INVAL */
	  .hdr_len = sizeof(struct iwarp_send_inv),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_SEND_SE_INVAL),
	  .rx_data = siw_proc_send },
	{ /* RDMAP_TERMINATE */
	  .hdr_len = sizeof(struct iwarp_terminate),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_TERMINATE),
	  .rx_data = siw_proc_terminate }
};

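/*
 * Note on the initializers above: each preassembled control word carries
 * DDP_VERSION, RDMAP_VERSION, the RDMAP opcode and the LAST flag (plus
 * TAGGED for tagged messages) in network byte order. ctrl.mpa_len is
 * preset to the header size minus the 2-byte MPA length field itself,
 * i.e. the MPA length of a frame with empty payload (this note assumes
 * MPA_HDR_SIZE == 2).
 */
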
void siw_qp_llp_data_ready(struct sock *sk)
{
	struct siw_qp *qp;

	read_lock(&sk->sk_callback_lock);

	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
		goto done;

	qp = sk_to_qp(sk);

	if (likely(!qp->rx_stream.rx_suspend &&
		   down_read_trylock(&qp->state_lock))) {
		read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };

		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
			/*
			 * Implements data receive operation during
			 * socket callback. TCP gracefully catches
			 * the case where there is nothing to receive
			 * (not calling siw_tcp_rx_data() then).
			 */
			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);

		up_read(&qp->state_lock);
	} else {
		siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
			   qp->rx_stream.rx_suspend);
	}
done:
	read_unlock(&sk->sk_callback_lock);
}

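/*
 * Called when the lower layer (TCP) connection goes away: suspends RX
 * and TX processing, moves the QP to an appropriate final state, flushes
 * both work queues and drops the reference to the closing connection
 * endpoint.
 */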
void siw_qp_llp_close(struct siw_qp *qp)
{
	siw_dbg_qp(qp, "enter llp close, state = %s\n",
		   siw_qp_state_to_string[qp->attrs.state]);

	down_write(&qp->state_lock);

	qp->rx_stream.rx_suspend = 1;
	qp->tx_ctx.tx_suspend = 1;
	qp->attrs.sk = NULL;

	switch (qp->attrs.state) {
	case SIW_QP_STATE_RTS:
	case SIW_QP_STATE_RTR:
	case SIW_QP_STATE_IDLE:
	case SIW_QP_STATE_TERMINATE:
		qp->attrs.state = SIW_QP_STATE_ERROR;
		break;
	/*
	 * SIW_QP_STATE_CLOSING:
	 *
	 * This is a forced close. Shall the QP be moved to
	 * ERROR or IDLE?
	 */
	case SIW_QP_STATE_CLOSING:
		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
			qp->attrs.state = SIW_QP_STATE_ERROR;
		else
			qp->attrs.state = SIW_QP_STATE_IDLE;
		break;

	default:
		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
			   siw_qp_state_to_string[qp->attrs.state]);
		break;
	}
	siw_sq_flush(qp);
	siw_rq_flush(qp);

	/*
	 * Dereference closing CEP
	 */
	if (qp->cep) {
		siw_cep_put(qp->cep);
		qp->cep = NULL;
	}

	up_write(&qp->state_lock);

	siw_dbg_qp(qp, "llp close exit: state %s\n",
		   siw_qp_state_to_string[qp->attrs.state]);
}

/*
 * Socket callback routine informing about newly available send space.
 * Function schedules SQ work for processing SQ items.
 */
void siw_qp_llp_write_space(struct sock *sk)
{
	struct siw_cep *cep = sk_to_cep(sk);

	cep->sk_write_space(sk);

	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
		(void)siw_sq_start(cep->qp);
}

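/*
 * IRQ and ORQ sizes get rounded up to a power of two; ring slots are
 * then addressed as 'counter % size' on free-running get/put counters.
 */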
static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
{
	irq_size = roundup_pow_of_two(irq_size);
	orq_size = roundup_pow_of_two(orq_size);

	qp->attrs.irq_size = irq_size;
	qp->attrs.orq_size = orq_size;

	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
	if (!qp->irq) {
		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
		qp->attrs.irq_size = 0;
		return -ENOMEM;
	}
	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
	if (!qp->orq) {
		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
		qp->attrs.orq_size = 0;
		qp->attrs.irq_size = 0;
		vfree(qp->irq);
		return -ENOMEM;
	}
	siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
	return 0;
}

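/*
 * Allocate one MPA CRC shash descriptor each for the TX and the RX path,
 * both backed by the module-global siw_crypto_shash transform.
 */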
static int siw_qp_enable_crc(struct siw_qp *qp)
{
	struct siw_rx_stream *c_rx = &qp->rx_stream;
	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
	int size;

	if (siw_crypto_shash == NULL)
		return -ENOENT;

	size = crypto_shash_descsize(siw_crypto_shash) +
	       sizeof(struct shash_desc);

	c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
	c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
		kfree(c_tx->mpa_crc_hd);
		kfree(c_rx->mpa_crc_hd);
		c_tx->mpa_crc_hd = NULL;
		c_rx->mpa_crc_hd = NULL;
		return -ENOMEM;
	}
	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;

	return 0;
}

/*
 * Send a non-signalled READ or WRITE to peer side as negotiated
 * with MPAv2 P2P setup protocol. The work request is only created
 * as a current active WR and does not consume Send Queue space.
 *
 * Caller must hold QP state lock.
 */
int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
{
	struct siw_wqe *wqe = tx_wqe(qp);
	unsigned long flags;
	int rv = 0;

	spin_lock_irqsave(&qp->sq_lock, flags);

	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
		spin_unlock_irqrestore(&qp->sq_lock, flags);
		return -EIO;
	}
	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);

	wqe->wr_status = SIW_WR_QUEUED;
	wqe->sqe.flags = 0;
	wqe->sqe.num_sge = 1;
	wqe->sqe.sge[0].length = 0;
	wqe->sqe.sge[0].laddr = 0;
	wqe->sqe.sge[0].lkey = 0;
	/*
	 * While it must not be checked for inbound zero length
	 * READ/WRITE, some HW may treat STag 0 special.
	 */
	wqe->sqe.rkey = 1;
	wqe->sqe.raddr = 0;
	wqe->processed = 0;

	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
		wqe->sqe.opcode = SIW_OP_WRITE;
	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
		struct siw_sqe *rreq;

		wqe->sqe.opcode = SIW_OP_READ;

		spin_lock(&qp->orq_lock);

		rreq = orq_get_free(qp);
		if (rreq) {
			siw_read_to_orq(rreq, &wqe->sqe);
			qp->orq_put++;
		} else
			rv = -EIO;

		spin_unlock(&qp->orq_lock);
	} else
		rv = -EINVAL;

	if (rv)
		wqe->wr_status = SIW_WR_IDLE;

	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (!rv)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * Map memory access error to DDP tagged error
 */
enum ddp_ecode siw_tagged_error(enum siw_access_state state)
{
	switch (state) {
	case E_STAG_INVALID:
		return DDP_ECODE_T_INVALID_STAG;
	case E_BASE_BOUNDS:
		return DDP_ECODE_T_BASE_BOUNDS;
	case E_PD_MISMATCH:
		return DDP_ECODE_T_STAG_NOT_ASSOC;
	case E_ACCESS_PERM:
		/*
		 * RFC 5041 (DDP) lacks an ecode for insufficient access
		 * permissions. 'Invalid STag' seems to be the closest
		 * match though.
		 */
		return DDP_ECODE_T_INVALID_STAG;
	default:
		WARN_ON(1);
		return DDP_ECODE_T_INVALID_STAG;
	}
}

/*
 * Map memory access error to RDMAP protection error
 */
enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
{
	switch (state) {
	case E_STAG_INVALID:
		return RDMAP_ECODE_INVALID_STAG;
	case E_BASE_BOUNDS:
		return RDMAP_ECODE_BASE_BOUNDS;
	case E_PD_MISMATCH:
		return RDMAP_ECODE_STAG_NOT_ASSOC;
	case E_ACCESS_PERM:
		return RDMAP_ECODE_ACCESS_RIGHTS;
	default:
		return RDMAP_ECODE_UNSPECIFIED;
	}
}

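/*
 * Record the cause of a connection termination. Only the first recorded
 * error is kept (term_info.valid guards against overwriting), to be
 * reported in a TERMINATE message sent by siw_send_terminate().
 */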
void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
			u8 ecode, int in_tx)
{
	if (!qp->term_info.valid) {
		memset(&qp->term_info, 0, sizeof(qp->term_info));
		qp->term_info.layer = layer;
		qp->term_info.etype = etype;
		qp->term_info.ecode = ecode;
		qp->term_info.in_tx = in_tx;
		qp->term_info.valid = 1;
	}
	siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
		   layer, etype, ecode, in_tx ? "yes" : "no");
}

/*
 * Send a TERMINATE message, as defined in RFC's 5040/5041/5044/6581.
 * Sending TERMINATE messages is best effort - such messages
 * can only be sent if the QP is still connected and it does
 * not have another outbound message in-progress, i.e. the
 * TERMINATE message must not interfere with an incomplete current
 * transmit operation.
 */
void siw_send_terminate(struct siw_qp *qp)
{
	struct kvec iov[3];
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
	struct iwarp_terminate *term = NULL;
	union iwarp_hdr *err_hdr = NULL;
	struct socket *s = qp->attrs.sk;
	struct siw_rx_stream *srx = &qp->rx_stream;
	union iwarp_hdr *rx_hdr = &srx->hdr;
	u32 crc = 0;
	int num_frags, len_terminate, rv;

	if (!qp->term_info.valid)
		return;

	qp->term_info.valid = 0;

	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
			   tx_type(tx_wqe(qp)));
		return;
	}
	if (!s && qp->cep)
		/* QP not yet in RTS. Take socket from connection end point */
		s = qp->cep->sock;

	if (!s) {
		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
		return;
	}

	term = kzalloc(sizeof(*term), GFP_KERNEL);
	if (!term)
		return;

	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
	term->ddp_mo = 0;
	term->ddp_msn = cpu_to_be32(1);

	iov[0].iov_base = term;
	iov[0].iov_len = sizeof(*term);

	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
		if (!err_hdr) {
			kfree(term);
			return;
		}
	}
	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
	       sizeof(struct iwarp_ctrl));

	__rdmap_term_set_layer(term, qp->term_info.layer);
	__rdmap_term_set_etype(term, qp->term_info.etype);
	__rdmap_term_set_ecode(term, qp->term_info.ecode);

	switch (qp->term_info.layer) {
	case TERM_ERROR_LAYER_RDMAP:
		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
			/* No additional DDP/RDMAP header to be included */
			break;

		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
			/*
			 * Complete RDMAP frame will get attached, and
			 * DDP segment length is valid
			 */
			term->flag_m = 1;
			term->flag_d = 1;
			term->flag_r = 1;

			if (qp->term_info.in_tx) {
				struct iwarp_rdma_rreq *rreq;
				struct siw_wqe *wqe = tx_wqe(qp);

				/* Inbound RREQ error, detected during
				 * RRESP creation. Take state from
				 * current TX work queue element to
				 * reconstruct peer's RREQ.
				 */
				rreq = (struct iwarp_rdma_rreq *)err_hdr;

				memcpy(&rreq->ctrl,
				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
				       sizeof(struct iwarp_ctrl));

				rreq->rsvd = 0;
				rreq->ddp_qn =
					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);

				/* Provide RREQ's MSN as kept aside */
				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);

				rreq->ddp_mo = htonl(wqe->processed);
				rreq->sink_stag = htonl(wqe->sqe.rkey);
				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
				rreq->read_size = htonl(wqe->sqe.sge[0].length);
				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
				rreq->source_to =
					cpu_to_be64(wqe->sqe.sge[0].laddr);

				iov[1].iov_base = rreq;
				iov[1].iov_len = sizeof(*rreq);

				rx_hdr = (union iwarp_hdr *)rreq;
			} else {
				/* Take RDMAP/DDP information from
				 * current (failed) inbound frame.
				 */
				iov[1].iov_base = rx_hdr;

				if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
				    RDMAP_RDMA_READ_REQ)
					iov[1].iov_len =
						sizeof(struct iwarp_rdma_rreq);
				else
					iov[1].iov_len =
						sizeof(struct iwarp_send);
			}
		} else {
			/* Do not report DDP hdr information if packet
			 * layout is unknown
			 */
			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
				break;

			iov[1].iov_base = rx_hdr;

			/* Only DDP frame will get attached */
			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
				iov[1].iov_len =
					sizeof(struct iwarp_rdma_write);
			else
				iov[1].iov_len = sizeof(struct iwarp_send);

			term->flag_m = 1;
			term->flag_d = 1;
		}
		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
		break;

	case TERM_ERROR_LAYER_DDP:
		/* Report error encountered while DDP processing.
		 * This can only happen as a result of inbound
		 * DDP processing
		 */

		/* Do not report DDP hdr information if packet
		 * layout is unknown
		 */
		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
			break;

		iov[1].iov_base = rx_hdr;

		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
		else
			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);

		term->flag_m = 1;
		term->flag_d = 1;

		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
		break;
	}

	if (term->flag_m || term->flag_d || term->flag_r) {
		iov[2].iov_base = &crc;
		iov[2].iov_len = sizeof(crc);
		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
		num_frags = 3;
	} else {
		iov[1].iov_base = &crc;
		iov[1].iov_len = sizeof(crc);
		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
		num_frags = 2;
	}

	/* Adjust DDP Segment Length parameter, if valid */
	if (term->flag_m) {
		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
		enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);

		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
	}

	term->ctrl.mpa_len =
		cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
	if (qp->tx_ctx.mpa_crc_hd) {
		crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
		if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
					(u8 *)iov[0].iov_base,
					iov[0].iov_len))
			goto out;

		if (num_frags == 3) {
			if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
						(u8 *)iov[1].iov_base,
						iov[1].iov_len))
				goto out;
		}
		crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
	}
	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
	siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
		   rv == len_terminate ? "success" : "failure",
		   __rdmap_term_layer(term), __rdmap_term_etype(term),
		   __rdmap_term_ecode(term), rv);
out:
	kfree(term);
	kfree(err_hdr);
}

/*
 * Handle all attrs other than state
 */
static void siw_qp_modify_nonstate(struct siw_qp *qp,
				   struct siw_qp_attrs *attrs,
				   enum siw_qp_attr_mask mask)
{
	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
		else
			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;

		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
		else
			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;

		if (attrs->flags & SIW_RDMA_READ_ENABLED)
			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
		else
			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
	}
}

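/*
 * Transition a QP out of IDLE or RTR state. Entering RTS requires a
 * connected socket and MPA parameters (and optionally enabled CRC),
 * fresh TX/RX DDP MSN state and allocated IRQ/ORQ rings.
 */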
static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
				      struct siw_qp_attrs *attrs,
				      enum siw_qp_attr_mask mask)
{
	int rv = 0;

	switch (attrs->state) {
	case SIW_QP_STATE_RTS:
		if (attrs->flags & SIW_MPA_CRC) {
			rv = siw_qp_enable_crc(qp);
			if (rv)
				break;
		}
		if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
			siw_dbg_qp(qp, "no socket\n");
			rv = -EINVAL;
			break;
		}
		if (!(mask & SIW_QP_ATTR_MPA)) {
			siw_dbg_qp(qp, "no MPA\n");
			rv = -EINVAL;
			break;
		}
		/*
		 * Initialize iWARP TX state
		 */
		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;

		/*
		 * Initialize iWARP RX state
		 */
		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;

		/*
		 * init IRD free queue, caller has already checked
		 * limits
		 */
		rv = siw_qp_readq_init(qp, attrs->irq_size,
				       attrs->orq_size);
		if (rv)
			break;

		qp->attrs.sk = attrs->sk;
		qp->attrs.state = SIW_QP_STATE_RTS;

		siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
			   attrs->flags & SIW_MPA_CRC ? "y" : "n",
			   qp->attrs.orq_size, qp->attrs.irq_size);
		break;

	case SIW_QP_STATE_ERROR:
		siw_rq_flush(qp);
		qp->attrs.state = SIW_QP_STATE_ERROR;
		if (qp->cep) {
			siw_cep_put(qp->cep);
			qp->cep = NULL;
		}
		break;

	default:
		break;
	}
	return rv;
}

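/*
 * Transition a QP out of RTS state. Returns non-zero if the caller must
 * also drop the (still established) connection.
 */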
static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
				     struct siw_qp_attrs *attrs)
{
	int drop_conn = 0;

	switch (attrs->state) {
	case SIW_QP_STATE_CLOSING:
		/*
		 * Verbs: move to IDLE if SQ and ORQ are empty.
		 * Move to ERROR otherwise. But first of all we must
		 * close the connection. So we keep CLOSING or ERROR
		 * as a transient state, schedule connection drop work
		 * and wait for the socket state change upcall to
		 * come back closed.
		 */
		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
			qp->attrs.state = SIW_QP_STATE_CLOSING;
		} else {
			qp->attrs.state = SIW_QP_STATE_ERROR;
			siw_sq_flush(qp);
		}
		siw_rq_flush(qp);

		drop_conn = 1;
		break;

	case SIW_QP_STATE_TERMINATE:
		qp->attrs.state = SIW_QP_STATE_TERMINATE;

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_CATASTROPHIC,
				   RDMAP_ECODE_UNSPECIFIED, 1);
		drop_conn = 1;
		break;

	case SIW_QP_STATE_ERROR:
		/*
		 * This is an emergency close.
		 *
		 * Any in progress transmit operation will get
		 * cancelled.
		 * This will likely result in a protocol failure,
		 * if a TX operation is in transit. The caller
		 * could unconditionally wait to give the current
		 * operation a chance to complete.
		 * Esp., how to handle the non-empty IRQ case?
		 * The peer was asking for data transfer at a valid
		 * point in time.
		 */
		siw_sq_flush(qp);
		siw_rq_flush(qp);
		qp->attrs.state = SIW_QP_STATE_ERROR;
		drop_conn = 1;
		break;

	default:
		break;
	}
	return drop_conn;
}

static void siw_qp_nextstate_from_term(struct siw_qp *qp,
				       struct siw_qp_attrs *attrs)
{
	switch (attrs->state) {
	case SIW_QP_STATE_ERROR:
		siw_rq_flush(qp);
		qp->attrs.state = SIW_QP_STATE_ERROR;

		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
			siw_sq_flush(qp);
		break;

	default:
		break;
	}
}

static int siw_qp_nextstate_from_close(struct siw_qp *qp,
				       struct siw_qp_attrs *attrs)
{
	int rv = 0;

	switch (attrs->state) {
	case SIW_QP_STATE_IDLE:
		WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
		qp->attrs.state = SIW_QP_STATE_IDLE;
		break;

	case SIW_QP_STATE_CLOSING:
		/*
		 * The LLP may already have moved the QP to closing
		 * due to graceful peer close init
		 */
		break;

	case SIW_QP_STATE_ERROR:
		/*
		 * QP was moved to CLOSING by LLP event
		 * not yet seen by user.
		 */
		qp->attrs.state = SIW_QP_STATE_ERROR;

		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
			siw_sq_flush(qp);
		siw_rq_flush(qp);
		break;

	default:
		siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
			   siw_qp_state_to_string[qp->attrs.state],
			   siw_qp_state_to_string[attrs->state]);

		rv = -ECONNABORTED;
	}
	return rv;
}

/*
 * Caller must hold qp->state_lock
 */
int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
		  enum siw_qp_attr_mask mask)
{
	int drop_conn = 0, rv = 0;

	if (!mask)
		return 0;

	siw_dbg_qp(qp, "state: %s => %s\n",
		   siw_qp_state_to_string[qp->attrs.state],
		   siw_qp_state_to_string[attrs->state]);

	if (mask != SIW_QP_ATTR_STATE)
		siw_qp_modify_nonstate(qp, attrs, mask);

	if (!(mask & SIW_QP_ATTR_STATE))
		return 0;

	switch (qp->attrs.state) {
	case SIW_QP_STATE_IDLE:
	case SIW_QP_STATE_RTR:
		rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
		break;

	case SIW_QP_STATE_RTS:
		drop_conn = siw_qp_nextstate_from_rts(qp, attrs);
		break;

	case SIW_QP_STATE_TERMINATE:
		siw_qp_nextstate_from_term(qp, attrs);
		break;

	case SIW_QP_STATE_CLOSING:
		rv = siw_qp_nextstate_from_close(qp, attrs);
		break;

	default:
		break;
	}
	if (drop_conn)
		siw_qp_cm_drop(qp, 0);

	return rv;
}

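/*
 * Keep a snapshot of an outbound READ request in the ORQ, so a later
 * inbound READ response can be matched against it.
 */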
void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
{
	rreq->id = sqe->id;
	rreq->opcode = sqe->opcode;
	rreq->sge[0].laddr = sqe->sge[0].laddr;
	rreq->sge[0].length = sqe->sge[0].length;
	rreq->sge[0].lkey = sqe->sge[0].lkey;
	rreq->sge[1].lkey = sqe->sge[1].lkey;
	rreq->flags = sqe->flags | SIW_WQE_VALID;
	rreq->num_sge = 1;
}

/*
 * Must be called with SQ locked.
 * To avoid complete SQ starvation by constant inbound READ requests,
 * the active IRQ will not be served after qp->irq_burst, if the
 * SQ has pending work.
 */
int siw_activate_tx(struct siw_qp *qp)
{
	struct siw_sqe *irqe, *sqe;
	struct siw_wqe *wqe = tx_wqe(qp);
	int rv = 1;

	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];

	if (irqe->flags & SIW_WQE_VALID) {
		sqe = sq_get_next(qp);

		/*
		 * Avoid local WQE processing starvation in case
		 * of constant inbound READ request stream
		 */
		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
			qp->irq_burst = 0;
			goto skip_irq;
		}
		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
		wqe->wr_status = SIW_WR_QUEUED;

		/* start READ RESPONSE */
		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
		wqe->sqe.flags = 0;
		if (irqe->num_sge) {
			wqe->sqe.num_sge = 1;
			wqe->sqe.sge[0].length = irqe->sge[0].length;
			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
		} else {
			wqe->sqe.num_sge = 0;
		}

		/* Retain original RREQ's message sequence number for
		 * potential error reporting cases.
		 */
		wqe->sqe.sge[1].length = irqe->sge[1].length;

		wqe->sqe.rkey = irqe->rkey;
		wqe->sqe.raddr = irqe->raddr;

		wqe->processed = 0;
		qp->irq_get++;

		/* mark current IRQ entry free */
		smp_store_mb(irqe->flags, 0);

		goto done;
	}
	sqe = sq_get_next(qp);
	if (sqe) {
skip_irq:
		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
		wqe->wr_status = SIW_WR_QUEUED;

		/* First copy SQE to kernel private memory */
		memcpy(&wqe->sqe, sqe, sizeof(*sqe));

		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
			rv = -EINVAL;
			goto done;
		}
		if (wqe->sqe.flags & SIW_WQE_INLINE) {
			if (wqe->sqe.opcode != SIW_OP_SEND &&
			    wqe->sqe.opcode != SIW_OP_WRITE) {
				rv = -EINVAL;
				goto done;
			}
			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
				rv = -EINVAL;
				goto done;
			}
			wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
			wqe->sqe.sge[0].lkey = 0;
			wqe->sqe.num_sge = 1;
		}
		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
			/* A READ cannot be fenced */
			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
				     wqe->sqe.opcode ==
					     SIW_OP_READ_LOCAL_INV)) {
				siw_dbg_qp(qp, "cannot fence read\n");
				rv = -EINVAL;
				goto done;
			}
			spin_lock(&qp->orq_lock);

			if (!siw_orq_empty(qp)) {
				qp->tx_ctx.orq_fence = 1;
				rv = 0;
			}
			spin_unlock(&qp->orq_lock);

		} else if (wqe->sqe.opcode == SIW_OP_READ ||
			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
			struct siw_sqe *rreq;

			wqe->sqe.num_sge = 1;

			spin_lock(&qp->orq_lock);

			rreq = orq_get_free(qp);
			if (rreq) {
				/*
				 * Make an immediate copy in ORQ to be ready
				 * to process loopback READ reply
				 */
				siw_read_to_orq(rreq, &wqe->sqe);
				qp->orq_put++;
			} else {
				qp->tx_ctx.orq_fence = 1;
				rv = 0;
			}
			spin_unlock(&qp->orq_lock);
		}

		/* Clear SQE, can be re-used by application */
		smp_store_mb(sqe->flags, 0);
		qp->sq_get++;
	} else {
		rv = 0;
	}
done:
	if (unlikely(rv < 0)) {
		siw_dbg_qp(qp, "error %d\n", rv);
		wqe->wr_status = SIW_WR_IDLE;
	}
	return rv;
}

/*
 * Check if current CQ state qualifies for calling CQ completion
 * handler. Must be called with CQ lock held.
 */
static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
{
	u32 cq_notify;

	if (!cq->base_cq.comp_handler)
		return false;

	cq_notify = READ_ONCE(*cq->notify);

	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
	     (flags & SIW_WQE_SOLICITED))) {
		smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);

		return true;
	}
	return false;
}

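/*
 * Post a work completion for an SQ element to the attached send CQ.
 * Without an attached CQ, the SQE is simply recycled. Returns -ENOMEM
 * if the CQ is full.
 */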
int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
		     enum siw_wc_status status)
{
	struct siw_cq *cq = qp->scq;
	int rv = 0;

	if (cq) {
		u32 sqe_flags = sqe->flags;
		struct siw_cqe *cqe;
		u32 idx;
		unsigned long flags;

		spin_lock_irqsave(&cq->lock, flags);

		idx = cq->cq_put % cq->num_cqe;
		cqe = &cq->queue[idx];

		if (!READ_ONCE(cqe->flags)) {
			bool notify;

			cqe->id = sqe->id;
			cqe->opcode = sqe->opcode;
			cqe->status = status;
			cqe->imm_data = 0;
			cqe->bytes = bytes;

			if (cq->kernel_verbs)
				cqe->base_qp = qp->ib_qp;
			else
				cqe->qp_id = qp_id(qp);

			/* mark CQE valid for application */
			WRITE_ONCE(cqe->flags, SIW_WQE_VALID);

			smp_store_mb(sqe->flags, 0);
			cq->cq_put++;

			notify = siw_cq_notify_now(cq, sqe_flags);

			spin_unlock_irqrestore(&cq->lock, flags);

			if (notify) {
				siw_dbg_cq(cq, "Call completion handler\n");
				cq->base_cq.comp_handler(&cq->base_cq,
						cq->base_cq.cq_context);
			}
		} else {
			spin_unlock_irqrestore(&cq->lock, flags);
			rv = -ENOMEM;
			siw_cq_event(cq, IB_EVENT_CQ_ERR);
		}
	} else {
		/* recycle SQE */
		smp_store_mb(sqe->flags, 0);
	}
	return rv;
}

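/*
 * Post a work completion for an RQ element to the attached receive CQ,
 * optionally flagging a remotely invalidated STag for kernel consumers.
 */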
int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
		     u32 inval_stag, enum siw_wc_status status)
{
	struct siw_cq *cq = qp->rcq;
	int rv = 0;

	if (cq) {
		struct siw_cqe *cqe;
		u32 idx;
		unsigned long flags;

		spin_lock_irqsave(&cq->lock, flags);

		idx = cq->cq_put % cq->num_cqe;
		cqe = &cq->queue[idx];

		if (!READ_ONCE(cqe->flags)) {
			bool notify;
			u8 cqe_flags = SIW_WQE_VALID;

			cqe->id = rqe->id;
			cqe->opcode = SIW_OP_RECEIVE;
			cqe->status = status;
			cqe->imm_data = 0;
			cqe->bytes = bytes;

			if (cq->kernel_verbs) {
				cqe->base_qp = qp->ib_qp;
				if (inval_stag) {
					cqe_flags |= SIW_WQE_REM_INVAL;
					cqe->inval_stag = inval_stag;
				}
			} else {
				cqe->qp_id = qp_id(qp);
			}
			/* mark CQE valid for application */
			WRITE_ONCE(cqe->flags, cqe_flags);

			smp_store_mb(rqe->flags, 0);
			cq->cq_put++;

			notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);

			spin_unlock_irqrestore(&cq->lock, flags);

			if (notify) {
				siw_dbg_cq(cq, "Call completion handler\n");
				cq->base_cq.comp_handler(&cq->base_cq,
						cq->base_cq.cq_context);
			}
		} else {
			spin_unlock_irqrestore(&cq->lock, flags);
			rv = -ENOMEM;
			siw_cq_event(cq, IB_EVENT_CQ_ERR);
		}
	} else {
		/* recycle RQE */
		smp_store_mb(rqe->flags, 0);
	}
	return rv;
}

/*
 * siw_sq_flush()
 *
 * Flush SQ and ORQ entries to CQ.
 *
 * Must be called with QP state write lock held.
 * Therefore, SQ and ORQ lock must not be taken.
 */
void siw_sq_flush(struct siw_qp *qp)
{
	struct siw_sqe *sqe;
	struct siw_wqe *wqe = tx_wqe(qp);
	int async_event = 0;

	/*
	 * Start with completing any work currently on the ORQ
	 */
	while (qp->attrs.orq_size) {
		sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
		if (!READ_ONCE(sqe->flags))
			break;

		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
			break;

		WRITE_ONCE(sqe->flags, 0);
		qp->orq_get++;
	}
	/*
	 * Flush an in-progress WQE if present
	 */
	if (wqe->wr_status != SIW_WR_IDLE) {
		siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
			   tx_type(wqe), wqe->wr_status);

		siw_wqe_put_mem(wqe, tx_type(wqe));

		if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
		    ((tx_type(wqe) != SIW_OP_READ &&
		      tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
		     wqe->wr_status == SIW_WR_QUEUED))
			/*
			 * An in-progress Read Request is already in
			 * the ORQ
			 */
			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
					 SIW_WC_WR_FLUSH_ERR);

		wqe->wr_status = SIW_WR_IDLE;
	}
	/*
	 * Flush the Send Queue
	 */
	while (qp->attrs.sq_size) {
		sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
		if (!READ_ONCE(sqe->flags))
			break;

		async_event = 1;
		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
			/*
			 * Shall IB_EVENT_SQ_DRAINED be suppressed if work
			 * completion fails?
			 */
			break;

		WRITE_ONCE(sqe->flags, 0);
		qp->sq_get++;
	}
	if (async_event)
		siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
}

/*
 * siw_rq_flush()
 *
 * Flush recv queue entries to CQ. Also
 * takes care of pending active tagged and untagged
 * inbound transfers, which have target memory
 * referenced.
 *
 * Must be called with QP state write lock held.
 * Therefore, RQ lock must not be taken.
 */
void siw_rq_flush(struct siw_qp *qp)
{
	struct siw_wqe *wqe = &qp->rx_untagged.wqe_active;

	/*
	 * Flush an in-progress untagged operation if present
	 */
	if (wqe->wr_status != SIW_WR_IDLE) {
		siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
			   rx_type(wqe), wqe->wr_status);

		siw_wqe_put_mem(wqe, rx_type(wqe));

		if (rx_type(wqe) == SIW_OP_RECEIVE) {
			siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
					 0, SIW_WC_WR_FLUSH_ERR);
		} else if (rx_type(wqe) != SIW_OP_READ &&
			   rx_type(wqe) != SIW_OP_READ_RESPONSE &&
			   rx_type(wqe) != SIW_OP_WRITE) {
			siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
		}
		wqe->wr_status = SIW_WR_IDLE;
	}
	wqe = &qp->rx_tagged.wqe_active;

	if (wqe->wr_status != SIW_WR_IDLE) {
		siw_wqe_put_mem(wqe, rx_type(wqe));
		wqe->wr_status = SIW_WR_IDLE;
	}
	/*
	 * Flush the Receive Queue
	 */
	while (qp->attrs.rq_size) {
		struct siw_rqe *rqe =
			&qp->recvq[qp->rq_get % qp->attrs.rq_size];

		if (!READ_ONCE(rqe->flags))
			break;

		if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
			break;

		WRITE_ONCE(rqe->flags, 0);
		qp->rq_get++;
	}
}

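/*
 * Register the QP with its device: allocate a 32-bit QP number from the
 * device's XArray and take the initial QP reference.
 */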
int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
{
	int rv = xa_alloc(&sdev->qp_xa, &qp->ib_qp->qp_num, qp, xa_limit_32b,
			  GFP_KERNEL);

	if (!rv) {
		kref_init(&qp->ref);
		qp->sdev = sdev;
		qp->qp_num = qp->ib_qp->qp_num;
		siw_dbg_qp(qp, "new QP\n");
	}
	return rv;
}

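/*
 * kref release function performing final QP cleanup once the last
 * reference is dropped: the QP is removed from the device table and all
 * attached work queue memory is freed.
 */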
void siw_free_qp(struct kref *ref)
{
	struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
	struct siw_device *sdev = qp->sdev;
	unsigned long flags;

	if (qp->cep)
		siw_cep_put(qp->cep);

	found = xa_erase(&sdev->qp_xa, qp_id(qp));
	WARN_ON(found != qp);
	spin_lock_irqsave(&sdev->lock, flags);
	list_del(&qp->devq);
	spin_unlock_irqrestore(&sdev->lock, flags);

	vfree(qp->sendq);
	vfree(qp->recvq);
	vfree(qp->irq);
	vfree(qp->orq);

	siw_put_tx_cpu(qp->tx_cpu);

	atomic_dec(&sdev->num_qp);
	siw_dbg_qp(qp, "free QP\n");
	kfree(qp);
}