drivers/infiniband/hw/qib/qib_rc.c
1 /*
2  * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
3  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <linux/io.h>
35
36 #include "qib.h"
37
38 /* cut down ridiculously long IB macro names */
39 #define OP(x) IB_OPCODE_RC_##x
40
41
42 static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
43                        u32 psn, u32 pmtu)
44 {
45         u32 len;
46
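        /*
         * Each PSN past wqe->psn accounts for one pmtu-sized packet, so the
         * byte offset to restart from is that PSN difference times pmtu.
         */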
47         len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
48         return rvt_restart_sge(ss, wqe, len);
49 }
50
51 /**
52  * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
53  * @dev: the device for this QP
54  * @qp: a pointer to the QP
55  * @ohdr: a pointer to the IB header being constructed
56  * @pmtu: the path MTU
57  *
58  * Return 1 if constructed; otherwise, return 0.
59  * Note that we are on the responder's side of the QP context.
60  * Note the QP s_lock must be held.
61  */
62 static int qib_make_rc_ack(struct qib_ibdev *dev, struct rvt_qp *qp,
63                            struct ib_other_headers *ohdr, u32 pmtu)
64 {
65         struct rvt_ack_entry *e;
66         u32 hwords;
67         u32 len;
68         u32 bth0;
69         u32 bth2;
70
71         /* Don't send an ACK if we aren't supposed to. */
72         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
73                 goto bail;
74
75         /* header size in 32-bit words LRH+BTH = (8+12)/4. */
76         hwords = 5;
77
78         switch (qp->s_ack_state) {
79         case OP(RDMA_READ_RESPONSE_LAST):
80         case OP(RDMA_READ_RESPONSE_ONLY):
81                 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
82                 if (e->rdma_sge.mr) {
83                         rvt_put_mr(e->rdma_sge.mr);
84                         e->rdma_sge.mr = NULL;
85                 }
86                 fallthrough;
87         case OP(ATOMIC_ACKNOWLEDGE):
88                 /*
89                  * We can increment the tail pointer now that the last
90                  * response has been sent instead of only being
91                  * constructed.
92                  */
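                /*
                 * The s_ack_queue has QIB_MAX_RDMA_ATOMIC + 1 entries, so
                 * wrap the index once it moves past the last slot.
                 */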
93                 if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
94                         qp->s_tail_ack_queue = 0;
95                 fallthrough;
96         case OP(SEND_ONLY):
97         case OP(ACKNOWLEDGE):
98                 /* Check for no next entry in the queue. */
99                 if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
100                         if (qp->s_flags & RVT_S_ACK_PENDING)
101                                 goto normal;
102                         goto bail;
103                 }
104
105                 e = &qp->s_ack_queue[qp->s_tail_ack_queue];
106                 if (e->opcode == OP(RDMA_READ_REQUEST)) {
107                         /*
108                          * If a RDMA read response is being resent and
109                          * we haven't seen the duplicate request yet,
110                          * then stop sending the remaining responses the
111                          * responder has seen until the requester resends it.
112                          */
113                         len = e->rdma_sge.sge_length;
114                         if (len && !e->rdma_sge.mr) {
115                                 qp->s_tail_ack_queue = qp->r_head_ack_queue;
116                                 goto bail;
117                         }
118                         /* Copy SGE state in case we need to resend */
119                         qp->s_rdma_mr = e->rdma_sge.mr;
120                         if (qp->s_rdma_mr)
121                                 rvt_get_mr(qp->s_rdma_mr);
122                         qp->s_ack_rdma_sge.sge = e->rdma_sge;
123                         qp->s_ack_rdma_sge.num_sge = 1;
124                         qp->s_cur_sge = &qp->s_ack_rdma_sge;
125                         if (len > pmtu) {
126                                 len = pmtu;
127                                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
128                         } else {
129                                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
130                                 e->sent = 1;
131                         }
132                         ohdr->u.aeth = rvt_compute_aeth(qp);
133                         hwords++;
134                         qp->s_ack_rdma_psn = e->psn;
135                         bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
136                 } else {
137                         /* COMPARE_SWAP or FETCH_ADD */
138                         qp->s_cur_sge = NULL;
139                         len = 0;
140                         qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
141                         ohdr->u.at.aeth = rvt_compute_aeth(qp);
142                         ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
143                         hwords += sizeof(ohdr->u.at) / sizeof(u32);
144                         bth2 = e->psn & QIB_PSN_MASK;
145                         e->sent = 1;
146                 }
147                 bth0 = qp->s_ack_state << 24;
148                 break;
149
150         case OP(RDMA_READ_RESPONSE_FIRST):
151                 qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
152                 fallthrough;
153         case OP(RDMA_READ_RESPONSE_MIDDLE):
154                 qp->s_cur_sge = &qp->s_ack_rdma_sge;
155                 qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
156                 if (qp->s_rdma_mr)
157                         rvt_get_mr(qp->s_rdma_mr);
158                 len = qp->s_ack_rdma_sge.sge.sge_length;
159                 if (len > pmtu)
160                         len = pmtu;
161                 else {
162                         ohdr->u.aeth = rvt_compute_aeth(qp);
163                         hwords++;
164                         qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
165                         e = &qp->s_ack_queue[qp->s_tail_ack_queue];
166                         e->sent = 1;
167                 }
168                 bth0 = qp->s_ack_state << 24;
169                 bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
170                 break;
171
172         default:
173 normal:
174                 /*
175                  * Send a regular ACK.
176                  * Set the s_ack_state so we wait until after sending
177                  * the ACK before setting s_ack_state to ACKNOWLEDGE
178                  * (see above).
179                  */
180                 qp->s_ack_state = OP(SEND_ONLY);
181                 qp->s_flags &= ~RVT_S_ACK_PENDING;
182                 qp->s_cur_sge = NULL;
183                 if (qp->s_nak_state)
184                         ohdr->u.aeth =
185                                 cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
186                                             (qp->s_nak_state <<
187                                              IB_AETH_CREDIT_SHIFT));
188                 else
189                         ohdr->u.aeth = rvt_compute_aeth(qp);
190                 hwords++;
191                 len = 0;
192                 bth0 = OP(ACKNOWLEDGE) << 24;
193                 bth2 = qp->s_ack_psn & QIB_PSN_MASK;
194         }
195         qp->s_rdma_ack_cnt++;
196         qp->s_hdrwords = hwords;
197         qp->s_cur_size = len;
198         qib_make_ruc_header(qp, ohdr, bth0, bth2);
199         return 1;
200
201 bail:
202         qp->s_ack_state = OP(ACKNOWLEDGE);
203         qp->s_flags &= ~(RVT_S_RESP_PENDING | RVT_S_ACK_PENDING);
204         return 0;
205 }
206
207 /**
208  * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
209  * @qp: a pointer to the QP
210  * @flags: unused
211  *
212  * Assumes the s_lock is held.
213  *
214  * Return 1 if constructed; otherwise, return 0.
215  */
216 int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags)
217 {
218         struct qib_qp_priv *priv = qp->priv;
219         struct qib_ibdev *dev = to_idev(qp->ibqp.device);
220         struct ib_other_headers *ohdr;
221         struct rvt_sge_state *ss;
222         struct rvt_swqe *wqe;
223         u32 hwords;
224         u32 len;
225         u32 bth0;
226         u32 bth2;
227         u32 pmtu = qp->pmtu;
228         char newreq;
229         int ret = 0;
230         int delta;
231
232         ohdr = &priv->s_hdr->u.oth;
233         if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
234                 ohdr = &priv->s_hdr->u.l.oth;
235
236         /* Sending responses has higher priority than sending requests. */
237         if ((qp->s_flags & RVT_S_RESP_PENDING) &&
238             qib_make_rc_ack(dev, qp, ohdr, pmtu))
239                 goto done;
240
241         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
242                 if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
243                         goto bail;
244                 /* We are in the error state, flush the work request. */
245                 if (qp->s_last == READ_ONCE(qp->s_head))
246                         goto bail;
247                 /* If DMAs are in progress, we can't flush immediately. */
248                 if (atomic_read(&priv->s_dma_busy)) {
249                         qp->s_flags |= RVT_S_WAIT_DMA;
250                         goto bail;
251                 }
252                 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
253                 rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
254                         IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
255                 /* will get called again */
256                 goto done;
257         }
258
259         if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
260                 goto bail;
261
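        /*
         * If the next PSN to send may still be within the range of PSNs
         * being transmitted, wait for those sends to complete before
         * building more packets; once they have finished, reset the
         * in-flight window.
         */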
262         if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
263                 if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
264                         qp->s_flags |= RVT_S_WAIT_PSN;
265                         goto bail;
266                 }
267                 qp->s_sending_psn = qp->s_psn;
268                 qp->s_sending_hpsn = qp->s_psn - 1;
269         }
270
271         /* header size in 32-bit words LRH+BTH = (8+12)/4. */
272         hwords = 5;
273         bth0 = 0;
274
275         /* Send a request. */
276         wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
277         switch (qp->s_state) {
278         default:
279                 if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
280                         goto bail;
281                 /*
282                  * Resend an old request or start a new one.
283                  *
284                  * We keep track of the current SWQE so that
285                  * we don't reset the "furthest progress" state
286                  * if we need to back up.
287                  */
288                 newreq = 0;
289                 if (qp->s_cur == qp->s_tail) {
290                         /* Check if send work queue is empty. */
291                         if (qp->s_tail == READ_ONCE(qp->s_head))
292                                 goto bail;
293                         /*
294                          * If a fence is requested, wait for previous
295                          * RDMA read and atomic operations to finish.
296                          */
297                         if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
298                             qp->s_num_rd_atomic) {
299                                 qp->s_flags |= RVT_S_WAIT_FENCE;
300                                 goto bail;
301                         }
302                         newreq = 1;
303                         qp->s_psn = wqe->psn;
304                 }
305                 /*
306                  * Note that we have to be careful not to modify the
307                  * original work request since we may need to resend
308                  * it.
309                  */
310                 len = wqe->length;
311                 ss = &qp->s_sge;
312                 bth2 = qp->s_psn & QIB_PSN_MASK;
313                 switch (wqe->wr.opcode) {
314                 case IB_WR_SEND:
315                 case IB_WR_SEND_WITH_IMM:
316                         /* If no credit, return. */
317                         if (!rvt_rc_credit_avail(qp, wqe))
318                                 goto bail;
319                         if (len > pmtu) {
320                                 qp->s_state = OP(SEND_FIRST);
321                                 len = pmtu;
322                                 break;
323                         }
324                         if (wqe->wr.opcode == IB_WR_SEND)
325                                 qp->s_state = OP(SEND_ONLY);
326                         else {
327                                 qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
328                                 /* Immediate data comes after the BTH */
329                                 ohdr->u.imm_data = wqe->wr.ex.imm_data;
330                                 hwords += 1;
331                         }
332                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
333                                 bth0 |= IB_BTH_SOLICITED;
334                         bth2 |= IB_BTH_REQ_ACK;
335                         if (++qp->s_cur == qp->s_size)
336                                 qp->s_cur = 0;
337                         break;
338
339                 case IB_WR_RDMA_WRITE:
340                         if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
341                                 qp->s_lsn++;
342                         goto no_flow_control;
343                 case IB_WR_RDMA_WRITE_WITH_IMM:
344                         /* If no credit, return. */
345                         if (!rvt_rc_credit_avail(qp, wqe))
346                                 goto bail;
347 no_flow_control:
348                         ohdr->u.rc.reth.vaddr =
349                                 cpu_to_be64(wqe->rdma_wr.remote_addr);
350                         ohdr->u.rc.reth.rkey =
351                                 cpu_to_be32(wqe->rdma_wr.rkey);
352                         ohdr->u.rc.reth.length = cpu_to_be32(len);
353                         hwords += sizeof(struct ib_reth) / sizeof(u32);
354                         if (len > pmtu) {
355                                 qp->s_state = OP(RDMA_WRITE_FIRST);
356                                 len = pmtu;
357                                 break;
358                         }
359                         if (wqe->rdma_wr.wr.opcode == IB_WR_RDMA_WRITE)
360                                 qp->s_state = OP(RDMA_WRITE_ONLY);
361                         else {
362                                 qp->s_state = OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
363                                 /* Immediate data comes after RETH */
364                                 ohdr->u.rc.imm_data =
365                                         wqe->rdma_wr.wr.ex.imm_data;
366                                 hwords += 1;
367                                 if (wqe->rdma_wr.wr.send_flags & IB_SEND_SOLICITED)
368                                         bth0 |= IB_BTH_SOLICITED;
369                         }
370                         bth2 |= IB_BTH_REQ_ACK;
371                         if (++qp->s_cur == qp->s_size)
372                                 qp->s_cur = 0;
373                         break;
374
375                 case IB_WR_RDMA_READ:
376                         /*
377                          * Don't allow more operations to be started
378                          * than the QP limits allow.
379                          */
380                         if (newreq) {
381                                 if (qp->s_num_rd_atomic >=
382                                     qp->s_max_rd_atomic) {
383                                         qp->s_flags |= RVT_S_WAIT_RDMAR;
384                                         goto bail;
385                                 }
386                                 qp->s_num_rd_atomic++;
387                                 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
388                                         qp->s_lsn++;
389                         }
390
391                         ohdr->u.rc.reth.vaddr =
392                                 cpu_to_be64(wqe->rdma_wr.remote_addr);
393                         ohdr->u.rc.reth.rkey =
394                                 cpu_to_be32(wqe->rdma_wr.rkey);
395                         ohdr->u.rc.reth.length = cpu_to_be32(len);
396                         qp->s_state = OP(RDMA_READ_REQUEST);
397                         hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
398                         ss = NULL;
399                         len = 0;
400                         bth2 |= IB_BTH_REQ_ACK;
401                         if (++qp->s_cur == qp->s_size)
402                                 qp->s_cur = 0;
403                         break;
404
405                 case IB_WR_ATOMIC_CMP_AND_SWP:
406                 case IB_WR_ATOMIC_FETCH_AND_ADD:
407                         /*
408                          * Don't allow more operations to be started
409                          * than the QP limits allow.
410                          */
411                         if (newreq) {
412                                 if (qp->s_num_rd_atomic >=
413                                     qp->s_max_rd_atomic) {
414                                         qp->s_flags |= RVT_S_WAIT_RDMAR;
415                                         goto bail;
416                                 }
417                                 qp->s_num_rd_atomic++;
418                                 if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
419                                         qp->s_lsn++;
420                         }
421                         if (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
422                                 qp->s_state = OP(COMPARE_SWAP);
423                                 put_ib_ateth_swap(wqe->atomic_wr.swap,
424                                                   &ohdr->u.atomic_eth);
425                                 put_ib_ateth_compare(wqe->atomic_wr.compare_add,
426                                                      &ohdr->u.atomic_eth);
427                         } else {
428                                 qp->s_state = OP(FETCH_ADD);
429                                 put_ib_ateth_swap(wqe->atomic_wr.compare_add,
430                                                   &ohdr->u.atomic_eth);
431                                 put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
432                         }
433                         put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
434                                            &ohdr->u.atomic_eth);
435                         ohdr->u.atomic_eth.rkey = cpu_to_be32(
436                                 wqe->atomic_wr.rkey);
437                         hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
438                         ss = NULL;
439                         len = 0;
440                         bth2 |= IB_BTH_REQ_ACK;
441                         if (++qp->s_cur == qp->s_size)
442                                 qp->s_cur = 0;
443                         break;
444
445                 default:
446                         goto bail;
447                 }
448                 qp->s_sge.sge = wqe->sg_list[0];
449                 qp->s_sge.sg_list = wqe->sg_list + 1;
450                 qp->s_sge.num_sge = wqe->wr.num_sge;
451                 qp->s_sge.total_len = wqe->length;
452                 qp->s_len = wqe->length;
453                 if (newreq) {
454                         qp->s_tail++;
455                         if (qp->s_tail >= qp->s_size)
456                                 qp->s_tail = 0;
457                 }
458                 if (wqe->wr.opcode == IB_WR_RDMA_READ)
459                         qp->s_psn = wqe->lpsn + 1;
460                 else
461                         qp->s_psn++;
462                 break;
463
464         case OP(RDMA_READ_RESPONSE_FIRST):
465                 /*
466                  * qp->s_state is normally set to the opcode of the
467                  * last packet constructed for new requests and therefore
468                  * is never set to RDMA read response.
469                  * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
470                  * thread to indicate a SEND needs to be restarted from an
471                  * earlier PSN without interfering with the sending thread.
472                  * See qib_restart_rc().
473                  */
474                 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
475                 fallthrough;
476         case OP(SEND_FIRST):
477                 qp->s_state = OP(SEND_MIDDLE);
478                 fallthrough;
479         case OP(SEND_MIDDLE):
480                 bth2 = qp->s_psn++ & QIB_PSN_MASK;
481                 ss = &qp->s_sge;
482                 len = qp->s_len;
483                 if (len > pmtu) {
484                         len = pmtu;
485                         break;
486                 }
487                 if (wqe->wr.opcode == IB_WR_SEND)
488                         qp->s_state = OP(SEND_LAST);
489                 else {
490                         qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
491                         /* Immediate data comes after the BTH */
492                         ohdr->u.imm_data = wqe->wr.ex.imm_data;
493                         hwords += 1;
494                 }
495                 if (wqe->wr.send_flags & IB_SEND_SOLICITED)
496                         bth0 |= IB_BTH_SOLICITED;
497                 bth2 |= IB_BTH_REQ_ACK;
498                 qp->s_cur++;
499                 if (qp->s_cur >= qp->s_size)
500                         qp->s_cur = 0;
501                 break;
502
503         case OP(RDMA_READ_RESPONSE_LAST):
504                 /*
505                  * qp->s_state is normally set to the opcode of the
506                  * last packet constructed for new requests and therefore
507                  * is never set to RDMA read response.
508                  * RDMA_READ_RESPONSE_LAST is used by the ACK processing
509                  * thread to indicate a RDMA write needs to be restarted from
510                  * an earlier PSN without interfering with the sending thread.
511                  * See qib_restart_rc().
512                  */
513                 qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
514                 fallthrough;
515         case OP(RDMA_WRITE_FIRST):
516                 qp->s_state = OP(RDMA_WRITE_MIDDLE);
517                 fallthrough;
518         case OP(RDMA_WRITE_MIDDLE):
519                 bth2 = qp->s_psn++ & QIB_PSN_MASK;
520                 ss = &qp->s_sge;
521                 len = qp->s_len;
522                 if (len > pmtu) {
523                         len = pmtu;
524                         break;
525                 }
526                 if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
527                         qp->s_state = OP(RDMA_WRITE_LAST);
528                 else {
529                         qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
530                         /* Immediate data comes after the BTH */
531                         ohdr->u.imm_data = wqe->wr.ex.imm_data;
532                         hwords += 1;
533                         if (wqe->wr.send_flags & IB_SEND_SOLICITED)
534                                 bth0 |= IB_BTH_SOLICITED;
535                 }
536                 bth2 |= IB_BTH_REQ_ACK;
537                 qp->s_cur++;
538                 if (qp->s_cur >= qp->s_size)
539                         qp->s_cur = 0;
540                 break;
541
542         case OP(RDMA_READ_RESPONSE_MIDDLE):
543                 /*
544                  * qp->s_state is normally set to the opcode of the
545                  * last packet constructed for new requests and therefore
546                  * is never set to RDMA read response.
547                  * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
548                  * thread to indicate a RDMA read needs to be restarted from
549                  * an earlier PSN without interfering with the sending thread.
550                  * See qib_restart_rc().
551                  */
552                 len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
553                 ohdr->u.rc.reth.vaddr =
554                         cpu_to_be64(wqe->rdma_wr.remote_addr + len);
555                 ohdr->u.rc.reth.rkey =
556                         cpu_to_be32(wqe->rdma_wr.rkey);
557                 ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
558                 qp->s_state = OP(RDMA_READ_REQUEST);
559                 hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
560                 bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
561                 qp->s_psn = wqe->lpsn + 1;
562                 ss = NULL;
563                 len = 0;
564                 qp->s_cur++;
565                 if (qp->s_cur == qp->s_size)
566                         qp->s_cur = 0;
567                 break;
568         }
569         qp->s_sending_hpsn = bth2;
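        /*
         * Sign-extend the 24-bit PSN difference so the periodic ACK
         * request below also works across PSN wrap.
         */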
570         delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
571         if (delta && delta % QIB_PSN_CREDIT == 0)
572                 bth2 |= IB_BTH_REQ_ACK;
573         if (qp->s_flags & RVT_S_SEND_ONE) {
574                 qp->s_flags &= ~RVT_S_SEND_ONE;
575                 qp->s_flags |= RVT_S_WAIT_ACK;
576                 bth2 |= IB_BTH_REQ_ACK;
577         }
578         qp->s_len -= len;
579         qp->s_hdrwords = hwords;
580         qp->s_cur_sge = ss;
581         qp->s_cur_size = len;
582         qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
583 done:
584         return 1;
585 bail:
586         qp->s_flags &= ~RVT_S_BUSY;
587         return ret;
588 }
589
590 /**
591  * qib_send_rc_ack - Construct an ACK packet and send it
592  * @qp: a pointer to the QP
593  *
594  * This is called from qib_rc_rcv() and qib_kreceive().
595  * Note that RDMA reads and atomics are handled in the
596  * send side QP state and tasklet.
597  */
598 void qib_send_rc_ack(struct rvt_qp *qp)
599 {
600         struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
601         struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
602         struct qib_pportdata *ppd = ppd_from_ibp(ibp);
603         u64 pbc;
604         u16 lrh0;
605         u32 bth0;
606         u32 hwords;
607         u32 pbufn;
608         u32 __iomem *piobuf;
609         struct ib_header hdr;
610         struct ib_other_headers *ohdr;
611         u32 control;
612         unsigned long flags;
613
614         spin_lock_irqsave(&qp->s_lock, flags);
615
616         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
617                 goto unlock;
618
619         /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
620         if ((qp->s_flags & RVT_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
621                 goto queue_ack;
622
623         /* Construct the header with s_lock held so APM doesn't change it. */
624         ohdr = &hdr.u.oth;
625         lrh0 = QIB_LRH_BTH;
626         /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
627         hwords = 6;
628         if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) &
629                      IB_AH_GRH)) {
630                 hwords += qib_make_grh(ibp, &hdr.u.l.grh,
631                                        rdma_ah_read_grh(&qp->remote_ah_attr),
632                                        hwords, 0);
633                 ohdr = &hdr.u.l.oth;
634                 lrh0 = QIB_LRH_GRH;
635         }
636         /* read pkey_index w/o lock (it's atomic) */
637         bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
638         if (qp->s_mig_state == IB_MIG_MIGRATED)
639                 bth0 |= IB_BTH_MIG_REQ;
640         if (qp->r_nak_state)
641                 ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
642                                             (qp->r_nak_state <<
643                                              IB_AETH_CREDIT_SHIFT));
644         else
645                 ohdr->u.aeth = rvt_compute_aeth(qp);
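        /* LRH word 0 carries the VL in bits 15:12 and the SL in bits 7:4. */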
646         lrh0 |= ibp->sl_to_vl[rdma_ah_get_sl(&qp->remote_ah_attr)] << 12 |
647                 rdma_ah_get_sl(&qp->remote_ah_attr) << 4;
648         hdr.lrh[0] = cpu_to_be16(lrh0);
649         hdr.lrh[1] = cpu_to_be16(rdma_ah_get_dlid(&qp->remote_ah_attr));
650         hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
651         hdr.lrh[3] = cpu_to_be16(ppd->lid |
652                                  rdma_ah_get_path_bits(&qp->remote_ah_attr));
653         ohdr->bth[0] = cpu_to_be32(bth0);
654         ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
655         ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);
656
657         spin_unlock_irqrestore(&qp->s_lock, flags);
658
659         /* Don't try to send ACKs if the link isn't ACTIVE */
660         if (!(ppd->lflags & QIBL_LINKACTIVE))
661                 goto done;
662
663         control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
664                                        qp->s_srate, lrh0 >> 12);
665         /* length is + 1 for the control dword */
666         pbc = ((u64) control << 32) | (hwords + 1);
667
668         piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
669         if (!piobuf) {
670                 /*
671                  * We are out of PIO buffers at the moment.
672                  * Pass responsibility for sending the ACK to the
673                  * send tasklet so that when a PIO buffer becomes
674                  * available, the ACK is sent ahead of other outgoing
675                  * packets.
676                  */
677                 spin_lock_irqsave(&qp->s_lock, flags);
678                 goto queue_ack;
679         }
680
681         /*
682          * Write the pbc.
683          * We have to flush after the PBC for correctness
684          * on some CPUs, or the WC buffer can be written out of order.
685          */
686         writeq(pbc, piobuf);
687
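        /*
         * When explicit write-combining flushes are needed, copy all but
         * the last header word, flush, and only then write the final word
         * so it cannot be reordered ahead of the rest of the header.
         */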
688         if (dd->flags & QIB_PIO_FLUSH_WC) {
689                 u32 *hdrp = (u32 *) &hdr;
690
691                 qib_flush_wc();
692                 qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
693                 qib_flush_wc();
694                 __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
695         } else
696                 qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);
697
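        /*
         * Chips flagged with QIB_USE_SPCL_TRIG need a trigger word written
         * at a special offset in the PIO buffer to kick off the send.
         */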
698         if (dd->flags & QIB_USE_SPCL_TRIG) {
699                 u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
700
701                 qib_flush_wc();
702                 __raw_writel(0xaebecede, piobuf + spcl_off);
703         }
704
705         qib_flush_wc();
706         qib_sendbuf_done(dd, pbufn);
707
708         this_cpu_inc(ibp->pmastats->n_unicast_xmit);
709         goto done;
710
711 queue_ack:
712         if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
713                 this_cpu_inc(*ibp->rvp.rc_qacks);
714                 qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
715                 qp->s_nak_state = qp->r_nak_state;
716                 qp->s_ack_psn = qp->r_ack_psn;
717
718                 /* Schedule the send tasklet. */
719                 qib_schedule_send(qp);
720         }
721 unlock:
722         spin_unlock_irqrestore(&qp->s_lock, flags);
723 done:
724         return;
725 }
726
727 /**
728  * reset_psn - reset the QP state to send starting from PSN
729  * @qp: the QP
730  * @psn: the packet sequence number to restart at
731  *
732  * This is called from do_rc_ack() and qib_restart_rc() to reset the
733  * send state of the given QP to the given PSN.
734  * Called at interrupt level with the QP s_lock held.
735  */
736 static void reset_psn(struct rvt_qp *qp, u32 psn)
737 {
738         u32 n = qp->s_acked;
739         struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
740         u32 opcode;
741
742         qp->s_cur = n;
743
744         /*
745          * If we are starting the request from the beginning,
746          * let the normal send code handle initialization.
747          */
748         if (qib_cmp24(psn, wqe->psn) <= 0) {
749                 qp->s_state = OP(SEND_LAST);
750                 goto done;
751         }
752
753         /* Find the work request opcode corresponding to the given PSN. */
754         opcode = wqe->wr.opcode;
755         for (;;) {
756                 int diff;
757
758                 if (++n == qp->s_size)
759                         n = 0;
760                 if (n == qp->s_tail)
761                         break;
762                 wqe = rvt_get_swqe_ptr(qp, n);
763                 diff = qib_cmp24(psn, wqe->psn);
764                 if (diff < 0)
765                         break;
766                 qp->s_cur = n;
767                 /*
768                  * If we are starting the request from the beginning,
769                  * let the normal send code handle initialization.
770                  */
771                 if (diff == 0) {
772                         qp->s_state = OP(SEND_LAST);
773                         goto done;
774                 }
775                 opcode = wqe->wr.opcode;
776         }
777
778         /*
779          * Set the state to restart in the middle of a request.
780          * Don't change the s_sge, s_cur_sge, or s_cur_size.
781          * See qib_make_rc_req().
782          */
783         switch (opcode) {
784         case IB_WR_SEND:
785         case IB_WR_SEND_WITH_IMM:
786                 qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
787                 break;
788
789         case IB_WR_RDMA_WRITE:
790         case IB_WR_RDMA_WRITE_WITH_IMM:
791                 qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
792                 break;
793
794         case IB_WR_RDMA_READ:
795                 qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
796                 break;
797
798         default:
799                 /*
800                  * This case shouldn't happen since it's only
801                  * one PSN per req.
802                  */
803                 qp->s_state = OP(SEND_LAST);
804         }
805 done:
806         qp->s_psn = psn;
807         /*
808          * Set RVT_S_WAIT_PSN as qib_rc_send_complete() may start the timer
809          * asynchronously before the send tasklet can get scheduled.
810          * Doing it in qib_make_rc_req() is too late.
811          */
812         if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
813             (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
814                 qp->s_flags |= RVT_S_WAIT_PSN;
815 }
816
817 /*
818  * Back up requester to resend the last un-ACKed request.
819  * The QP r_lock and s_lock should be held and interrupts disabled.
820  */
821 void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
822 {
823         struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
824         struct qib_ibport *ibp;
825
826         if (qp->s_retry == 0) {
827                 if (qp->s_mig_state == IB_MIG_ARMED) {
828                         qib_migrate_qp(qp);
829                         qp->s_retry = qp->s_retry_cnt;
830                 } else if (qp->s_last == qp->s_acked) {
831                         rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
832                         rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
833                         return;
834                 } else /* XXX need to handle delayed completion */
835                         return;
836         } else
837                 qp->s_retry--;
838
839         ibp = to_iport(qp->ibqp.device, qp->port_num);
840         if (wqe->wr.opcode == IB_WR_RDMA_READ)
841                 ibp->rvp.n_rc_resends++;
842         else
843                 ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
844
845         qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
846                          RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
847                          RVT_S_WAIT_ACK);
848         if (wait)
849                 qp->s_flags |= RVT_S_SEND_ONE;
850         reset_psn(qp, psn);
851 }
852
853 /*
854  * Set qp->s_sending_psn to the next PSN after the given one.
855  * This would be psn+1 except when RDMA reads are present.
856  */
857 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
858 {
859         struct rvt_swqe *wqe;
860         u32 n = qp->s_last;
861
862         /* Find the work request corresponding to the given PSN. */
863         for (;;) {
864                 wqe = rvt_get_swqe_ptr(qp, n);
865                 if (qib_cmp24(psn, wqe->lpsn) <= 0) {
866                         if (wqe->wr.opcode == IB_WR_RDMA_READ)
867                                 qp->s_sending_psn = wqe->lpsn + 1;
868                         else
869                                 qp->s_sending_psn = psn + 1;
870                         break;
871                 }
872                 if (++n == qp->s_size)
873                         n = 0;
874                 if (n == qp->s_tail)
875                         break;
876         }
877 }
878
879 /*
880  * This should be called with the QP s_lock held and interrupts disabled.
881  */
882 void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
883 {
884         struct ib_other_headers *ohdr;
885         struct rvt_swqe *wqe;
886         u32 opcode;
887         u32 psn;
888
889         if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
890                 return;
891
892         /* Find out where the BTH is */
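        /* The low two bits of lrh[0] (LNH) tell whether a GRH precedes the BTH. */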
893         if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
894                 ohdr = &hdr->u.oth;
895         else
896                 ohdr = &hdr->u.l.oth;
897
898         opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
899         if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
900             opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
901                 WARN_ON(!qp->s_rdma_ack_cnt);
902                 qp->s_rdma_ack_cnt--;
903                 return;
904         }
905
906         psn = be32_to_cpu(ohdr->bth[2]);
907         reset_sending_psn(qp, psn);
908
909         /*
910          * Start timer after a packet requesting an ACK has been sent and
911          * there are still requests that haven't been acked.
912          */
913         if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
914             !(qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
915             (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
916                 rvt_add_retry_timer(qp);
917
918         while (qp->s_last != qp->s_acked) {
919                 wqe = rvt_get_swqe_ptr(qp, qp->s_last);
920                 if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
921                     qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
922                         break;
923                 rvt_qp_complete_swqe(qp,
924                                      wqe,
925                                      ib_qib_wc_opcode[wqe->wr.opcode],
926                                      IB_WC_SUCCESS);
927         }
928         /*
929          * If we were waiting for sends to complete before resending,
930          * and they are now complete, restart sending.
931          */
932         if (qp->s_flags & RVT_S_WAIT_PSN &&
933             qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
934                 qp->s_flags &= ~RVT_S_WAIT_PSN;
935                 qp->s_sending_psn = qp->s_psn;
936                 qp->s_sending_hpsn = qp->s_psn - 1;
937                 qib_schedule_send(qp);
938         }
939 }
940
941 static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
942 {
943         qp->s_last_psn = psn;
944 }
945
946 /*
947  * Generate a SWQE completion.
948  * This is similar to qib_send_complete but has to check to be sure
949  * that the SGEs are not being referenced if the SWQE is being resent.
950  */
951 static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
952                                          struct rvt_swqe *wqe,
953                                          struct qib_ibport *ibp)
954 {
955         /*
956          * Don't decrement refcount and don't generate a
957          * completion if the SWQE is being resent until the send
958          * is finished.
959          */
960         if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
961             qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0)
962                 rvt_qp_complete_swqe(qp,
963                                      wqe,
964                                      ib_qib_wc_opcode[wqe->wr.opcode],
965                                      IB_WC_SUCCESS);
966         else
967                 this_cpu_inc(*ibp->rvp.rc_delayed_comp);
968
969         qp->s_retry = qp->s_retry_cnt;
970         update_last_psn(qp, wqe->lpsn);
971
972         /*
973          * If we are completing a request which is in the process of
974          * being resent, we can stop resending it since we know the
975          * responder has already seen it.
976          */
977         if (qp->s_acked == qp->s_cur) {
978                 if (++qp->s_cur >= qp->s_size)
979                         qp->s_cur = 0;
980                 qp->s_acked = qp->s_cur;
981                 wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
982                 if (qp->s_acked != qp->s_tail) {
983                         qp->s_state = OP(SEND_LAST);
984                         qp->s_psn = wqe->psn;
985                 }
986         } else {
987                 if (++qp->s_acked >= qp->s_size)
988                         qp->s_acked = 0;
989                 if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
990                         qp->s_draining = 0;
991                 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
992         }
993         return wqe;
994 }
995
996 /*
997  * do_rc_ack - process an incoming RC ACK
998  * @qp: the QP the ACK came in on
999  * @psn: the packet sequence number of the ACK
1000  * @opcode: the opcode of the request that resulted in the ACK
1001  *
1002  * This is called from qib_rc_rcv_resp() to process an incoming RC ACK
1003  * for the given QP.
1004  * Called at interrupt level with the QP s_lock held.
1005  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1006  */
1007 static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1008                      u64 val, struct qib_ctxtdata *rcd)
1009 {
1010         struct qib_ibport *ibp;
1011         enum ib_wc_status status;
1012         struct rvt_swqe *wqe;
1013         int ret = 0;
1014         u32 ack_psn;
1015         int diff;
1016
1017         /*
1018          * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1019          * requests and implicitly NAK RDMA read and atomic requests issued
1020          * before the NAK'ed request.  The MSN won't include the NAK'ed
1021          * request but will include an ACK'ed request(s).
1022          * request but will include any ACK'ed requests.
1023         ack_psn = psn;
1024         if (aeth >> IB_AETH_NAK_SHIFT)
1025                 ack_psn--;
1026         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1027         ibp = to_iport(qp->ibqp.device, qp->port_num);
1028
1029         /*
1030          * The MSN might be for a later WQE than the PSN indicates so
1031          * only complete WQEs that the PSN finishes.
1032          */
1033         while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
1034                 /*
1035                  * RDMA_READ_RESPONSE_ONLY is a special case since
1036                  * we want to generate completion events for everything
1037                  * before the RDMA read, copy the data, then generate
1038                  * the completion for the read.
1039                  */
1040                 if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1041                     opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1042                     diff == 0) {
1043                         ret = 1;
1044                         goto bail;
1045                 }
1046                 /*
1047                  * If this request is a RDMA read or atomic, and the ACK is
1048                  * for a later operation, this ACK NAKs the RDMA read or
1049                  * atomic.  In other words, only a RDMA_READ_LAST or ONLY
1050                  * can ACK a RDMA read and likewise for atomic ops.  Note
1051                  * that the NAK case can only happen if relaxed ordering is
1052                  * used and requests are sent after an RDMA read or atomic
1053                  * is sent but before the response is received.
1054                  */
1055                 if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1056                      (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1057                     ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1058                       wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1059                      (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1060                         /* Retry this request. */
1061                         if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1062                                 qp->r_flags |= RVT_R_RDMAR_SEQ;
1063                                 qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1064                                 if (list_empty(&qp->rspwait)) {
1065                                         qp->r_flags |= RVT_R_RSP_SEND;
1066                                         rvt_get_qp(qp);
1067                                         list_add_tail(&qp->rspwait,
1068                                                       &rcd->qp_wait_list);
1069                                 }
1070                         }
1071                         /*
1072                          * No need to process the ACK/NAK since we are
1073                          * restarting an earlier request.
1074                          */
1075                         goto bail;
1076                 }
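                /*
                 * Atomic responses carry the original value; store it in
                 * the buffer described by the first SGE.
                 */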
1077                 if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1078                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1079                         u64 *vaddr = wqe->sg_list[0].vaddr;
1080                         *vaddr = val;
1081                 }
1082                 if (qp->s_num_rd_atomic &&
1083                     (wqe->wr.opcode == IB_WR_RDMA_READ ||
1084                      wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1085                      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1086                         qp->s_num_rd_atomic--;
1087                         /* Restart sending task if fence is complete */
1088                         if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
1089                             !qp->s_num_rd_atomic) {
1090                                 qp->s_flags &= ~(RVT_S_WAIT_FENCE |
1091                                                  RVT_S_WAIT_ACK);
1092                                 qib_schedule_send(qp);
1093                         } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
1094                                 qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
1095                                                  RVT_S_WAIT_ACK);
1096                                 qib_schedule_send(qp);
1097                         }
1098                 }
1099                 wqe = do_rc_completion(qp, wqe, ibp);
1100                 if (qp->s_acked == qp->s_tail)
1101                         break;
1102         }
1103
1104         switch (aeth >> IB_AETH_NAK_SHIFT) {
1105         case 0:         /* ACK */
1106                 this_cpu_inc(*ibp->rvp.rc_acks);
1107                 if (qp->s_acked != qp->s_tail) {
1108                         /*
1109                          * We are expecting more ACKs so
1110                          * reset the retransmit timer.
1111                          */
1112                         rvt_mod_retry_timer(qp);
1113                         /*
1114                          * We can stop resending the earlier packets and
1115                          * continue with the next packet the receiver wants.
1116                          */
1117                         if (qib_cmp24(qp->s_psn, psn) <= 0)
1118                                 reset_psn(qp, psn + 1);
1119                 } else {
1120                         /* No more acks - kill all timers */
1121                         rvt_stop_rc_timers(qp);
1122                         if (qib_cmp24(qp->s_psn, psn) <= 0) {
1123                                 qp->s_state = OP(SEND_LAST);
1124                                 qp->s_psn = psn + 1;
1125                         }
1126                 }
1127                 if (qp->s_flags & RVT_S_WAIT_ACK) {
1128                         qp->s_flags &= ~RVT_S_WAIT_ACK;
1129                         qib_schedule_send(qp);
1130                 }
1131                 rvt_get_credit(qp, aeth);
1132                 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1133                 qp->s_retry = qp->s_retry_cnt;
1134                 update_last_psn(qp, psn);
1135                 return 1;
1136
1137         case 1:         /* RNR NAK */
1138                 ibp->rvp.n_rnr_naks++;
1139                 if (qp->s_acked == qp->s_tail)
1140                         goto bail;
1141                 if (qp->s_flags & RVT_S_WAIT_RNR)
1142                         goto bail;
1143                 if (qp->s_rnr_retry == 0) {
1144                         status = IB_WC_RNR_RETRY_EXC_ERR;
1145                         goto class_b;
1146                 }
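                /* An RNR retry count of 7 means "retry forever". */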
1147                 if (qp->s_rnr_retry_cnt < 7)
1148                         qp->s_rnr_retry--;
1149
1150                 /* The last valid PSN is the previous PSN. */
1151                 update_last_psn(qp, psn - 1);
1152
1153                 ibp->rvp.n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
1154
1155                 reset_psn(qp, psn);
1156
1157                 qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
1158                 rvt_stop_rc_timers(qp);
1159                 rvt_add_rnr_timer(qp, aeth);
1160                 return 0;
1161
1162         case 3:         /* NAK */
1163                 if (qp->s_acked == qp->s_tail)
1164                         goto bail;
1165                 /* The last valid PSN is the previous PSN. */
1166                 update_last_psn(qp, psn - 1);
1167                 switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
1168                         IB_AETH_CREDIT_MASK) {
1169                 case 0: /* PSN sequence error */
1170                         ibp->rvp.n_seq_naks++;
1171                         /*
1172                          * Back up to the responder's expected PSN.
1173                          * Note that we might get a NAK in the middle of an
1174                          * RDMA READ response which terminates the RDMA
1175                          * READ.
1176                          */
1177                         qib_restart_rc(qp, psn, 0);
1178                         qib_schedule_send(qp);
1179                         break;
1180
1181                 case 1: /* Invalid Request */
1182                         status = IB_WC_REM_INV_REQ_ERR;
1183                         ibp->rvp.n_other_naks++;
1184                         goto class_b;
1185
1186                 case 2: /* Remote Access Error */
1187                         status = IB_WC_REM_ACCESS_ERR;
1188                         ibp->rvp.n_other_naks++;
1189                         goto class_b;
1190
1191                 case 3: /* Remote Operation Error */
1192                         status = IB_WC_REM_OP_ERR;
1193                         ibp->rvp.n_other_naks++;
1194 class_b:
1195                         if (qp->s_last == qp->s_acked) {
1196                                 rvt_send_complete(qp, wqe, status);
1197                                 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1198                         }
1199                         break;
1200
1201                 default:
1202                         /* Ignore other reserved NAK error codes */
1203                         goto reserved;
1204                 }
1205                 qp->s_retry = qp->s_retry_cnt;
1206                 qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1207                 goto bail;
1208
1209         default:                /* 2: reserved */
1210 reserved:
1211                 /* Ignore reserved NAK codes. */
1212                 goto bail;
1213         }
1214
1215 bail:
1216         rvt_stop_rc_timers(qp);
1217         return ret;
1218 }
1219
1220 /*
1221  * We have seen an out of sequence RDMA read middle or last packet.
1222  * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1223  */
1224 static void rdma_seq_err(struct rvt_qp *qp, struct qib_ibport *ibp, u32 psn,
1225                          struct qib_ctxtdata *rcd)
1226 {
1227         struct rvt_swqe *wqe;
1228
1229         /* Remove QP from retry timer */
1230         rvt_stop_rc_timers(qp);
1231
1232         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1233
1234         while (qib_cmp24(psn, wqe->lpsn) > 0) {
1235                 if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1236                     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1237                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1238                         break;
1239                 wqe = do_rc_completion(qp, wqe, ibp);
1240         }
1241
1242         ibp->rvp.n_rdma_seq++;
1243         qp->r_flags |= RVT_R_RDMAR_SEQ;
1244         qib_restart_rc(qp, qp->s_last_psn + 1, 0);
1245         if (list_empty(&qp->rspwait)) {
1246                 qp->r_flags |= RVT_R_RSP_SEND;
1247                 rvt_get_qp(qp);
1248                 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1249         }
1250 }
1251
1252 /**
1253  * qib_rc_rcv_resp - process an incoming RC response packet
1254  * @ibp: the port this packet came in on
1255  * @ohdr: the other headers for this packet
1256  * @data: the packet data
1257  * @tlen: the packet length
1258  * @qp: the QP for this packet
1259  * @opcode: the opcode for this packet
1260  * @psn: the packet sequence number for this packet
1261  * @hdrsize: the header length
1262  * @pmtu: the path MTU
1263  * @rcd: the context pointer
1264  *
1265  * This is called from qib_rc_rcv() to process an incoming RC response
1266  * packet for the given QP.
1267  * Called at interrupt level.
1268  */
1269 static void qib_rc_rcv_resp(struct qib_ibport *ibp,
1270                             struct ib_other_headers *ohdr,
1271                             void *data, u32 tlen,
1272                             struct rvt_qp *qp,
1273                             u32 opcode,
1274                             u32 psn, u32 hdrsize, u32 pmtu,
1275                             struct qib_ctxtdata *rcd)
1276 {
1277         struct rvt_swqe *wqe;
1278         struct qib_pportdata *ppd = ppd_from_ibp(ibp);
1279         enum ib_wc_status status;
1280         unsigned long flags;
1281         int diff;
1282         u32 pad;
1283         u32 aeth;
1284         u64 val;
1285
1286         if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
1287                 /*
1288                  * If the ACK'd PSN is on the SDMA busy list, try to make
1289                  * progress to reclaim SDMA credits.
1290                  */
1291                 if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
1292                     (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {
1293
1294                         /*
1295                          * If the send tasklet is not running, attempt to
1296                          * make progress on the SDMA queue.
1297                          */
1298                         if (!(qp->s_flags & RVT_S_BUSY)) {
1299                                 /* Acquire SDMA Lock */
1300                                 spin_lock_irqsave(&ppd->sdma_lock, flags);
1301                                 /* Invoke sdma make progress */
1302                                 qib_sdma_make_progress(ppd);
1303                                 /* Release SDMA Lock */
1304                                 spin_unlock_irqrestore(&ppd->sdma_lock, flags);
1305                         }
1306                 }
1307         }
1308
1309         spin_lock_irqsave(&qp->s_lock, flags);
1310         if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1311                 goto ack_done;
1312
1313         /* Ignore invalid responses. */
1314         if (qib_cmp24(psn, READ_ONCE(qp->s_next_psn)) >= 0)
1315                 goto ack_done;
1316
1317         /* Ignore duplicate responses. */
1318         diff = qib_cmp24(psn, qp->s_last_psn);
1319         if (unlikely(diff <= 0)) {
1320                 /* Update credits for "ghost" ACKs */
1321                 if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1322                         aeth = be32_to_cpu(ohdr->u.aeth);
1323                         if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
1324                                 rvt_get_credit(qp, aeth);
1325                 }
1326                 goto ack_done;
1327         }
1328
1329         /*
1330          * If we are waiting for a reply to a restarted RDMA read or atomic
1331          * op, skip everything other than the PSN we expect.
1332          */
1333         if (qp->r_flags & RVT_R_RDMAR_SEQ) {
1334                 if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
1335                         goto ack_done;
1336                 qp->r_flags &= ~RVT_R_RDMAR_SEQ;
1337         }
1338
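             /* Nothing is outstanding on the send queue, so there is nothing to ACK. */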
1339         if (unlikely(qp->s_acked == qp->s_tail))
1340                 goto ack_done;
1341         wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1342         status = IB_WC_SUCCESS;
1343
1344         switch (opcode) {
1345         case OP(ACKNOWLEDGE):
1346         case OP(ATOMIC_ACKNOWLEDGE):
1347         case OP(RDMA_READ_RESPONSE_FIRST):
1348                 aeth = be32_to_cpu(ohdr->u.aeth);
1349                 if (opcode == OP(ATOMIC_ACKNOWLEDGE))
1350                         val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
1351                 else
1352                         val = 0;
1353                 if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1354                     opcode != OP(RDMA_READ_RESPONSE_FIRST))
1355                         goto ack_done;
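                     /* The AETH adds 4 bytes of header before the read data. */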
1356                 hdrsize += 4;
1357                 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1358                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1359                         goto ack_op_err;
1360                 /*
1361                  * If this is a response to a resent RDMA read, we
1362                  * have to be careful to copy the data to the right
1363                  * location.
1364                  */
1365                 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1366                                                   wqe, psn, pmtu);
1367                 goto read_middle;
1368
1369         case OP(RDMA_READ_RESPONSE_MIDDLE):
1370                 /* no AETH, no ACK */
1371                 if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1372                         goto ack_seq_err;
1373                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1374                         goto ack_op_err;
1375 read_middle:
1376                 if (unlikely(tlen != (hdrsize + pmtu + 4)))
1377                         goto ack_len_err;
1378                 if (unlikely(pmtu >= qp->s_rdma_read_len))
1379                         goto ack_len_err;
1380
1381                 /*
1382                  * We got a response so update the timeout.
1383                  * 4.096 usec. * (1 << qp->timeout)
1384                  */
1385                 rvt_mod_retry_timer(qp);
1386                 if (qp->s_flags & RVT_S_WAIT_ACK) {
1387                         qp->s_flags &= ~RVT_S_WAIT_ACK;
1388                         qib_schedule_send(qp);
1389                 }
1390
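                     /* A middle response is forward progress, so reset the retry count. */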
1391                 if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1392                         qp->s_retry = qp->s_retry_cnt;
1393
1394                 /*
1395                  * Update the RDMA receive state, but do the copy without
1396                  * holding the lock or blocking interrupts.
1397                  */
1398                 qp->s_rdma_read_len -= pmtu;
1399                 update_last_psn(qp, psn);
1400                 spin_unlock_irqrestore(&qp->s_lock, flags);
1401                 rvt_copy_sge(qp, &qp->s_rdma_read_sge,
1402                              data, pmtu, false, false);
1403                 goto bail;
1404
1405         case OP(RDMA_READ_RESPONSE_ONLY):
1406                 aeth = be32_to_cpu(ohdr->u.aeth);
1407                 if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1408                         goto ack_done;
1409                 /* Get the number of bytes the message was padded by. */
1410                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1411                 /*
1412                  * Check that the data size is >= 0 && <= pmtu.
1413                  * Remember to account for the AETH header (4) and
1414                  * ICRC (4).
1415                  */
1416                 if (unlikely(tlen < (hdrsize + pad + 8)))
1417                         goto ack_len_err;
1418                 /*
1419                  * If this is a response to a resent RDMA read, we
1420                  * have to be careful to copy the data to the right
1421                  * location.
1422                  */
1423                 wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1424                 qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1425                                                   wqe, psn, pmtu);
1426                 goto read_last;
1427
1428         case OP(RDMA_READ_RESPONSE_LAST):
1429                 /* This ACKs the RDMA read request. */
1430                 if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
1431                         goto ack_seq_err;
1432                 if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1433                         goto ack_op_err;
1434                 /* Get the number of bytes the message was padded by. */
1435                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1436                 /*
1437                  * Check that the data size is >= 1 && <= pmtu.
1438                  * Remember to account for the AETH header (4) and
1439                  * ICRC (4).
1440                  */
1441                 if (unlikely(tlen <= (hdrsize + pad + 8)))
1442                         goto ack_len_err;
1443 read_last:
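                     /*
                      * Drop the header, pad, AETH and ICRC bytes; what remains
                      * must match the read length still expected.
                      */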
1444                 tlen -= hdrsize + pad + 8;
1445                 if (unlikely(tlen != qp->s_rdma_read_len))
1446                         goto ack_len_err;
1447                 aeth = be32_to_cpu(ohdr->u.aeth);
1448                 rvt_copy_sge(qp, &qp->s_rdma_read_sge,
1449                              data, tlen, false, false);
1450                 WARN_ON(qp->s_rdma_read_sge.num_sge);
1451                 (void) do_rc_ack(qp, aeth, psn,
1452                                  OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1453                 goto ack_done;
1454         }
1455
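             /*
              * Error exits: opcode and length errors flush the request in error
              * (if it is the oldest outstanding one) and move the QP to the
              * error state; a sequence error restarts the RDMA read instead.
              */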
1456 ack_op_err:
1457         status = IB_WC_LOC_QP_OP_ERR;
1458         goto ack_err;
1459
1460 ack_seq_err:
1461         rdma_seq_err(qp, ibp, psn, rcd);
1462         goto ack_done;
1463
1464 ack_len_err:
1465         status = IB_WC_LOC_LEN_ERR;
1466 ack_err:
1467         if (qp->s_last == qp->s_acked) {
1468                 rvt_send_complete(qp, wqe, status);
1469                 rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1470         }
1471 ack_done:
1472         spin_unlock_irqrestore(&qp->s_lock, flags);
1473 bail:
1474         return;
1475 }
1476
1477 /**
1478  * qib_rc_rcv_error - process an incoming duplicate or error RC packet
1479  * @ohdr: the other headers for this packet
1480  * @data: the packet data
1481  * @qp: the QP for this packet
1482  * @opcode: the opcode for this packet
1483  * @psn: the packet sequence number for this packet
1484  * @diff: the difference between the PSN and the expected PSN
1485  * @rcd: the context pointer
1486  *
1487  * This is called from qib_rc_rcv() to process an unexpected
1488  * incoming RC packet for the given QP.
1489  * Called at interrupt level.
1490  * Return 1 if no more processing is needed; otherwise return 0 to
1491  * schedule a response to be sent.
1492  */
1493 static int qib_rc_rcv_error(struct ib_other_headers *ohdr,
1494                             void *data,
1495                             struct rvt_qp *qp,
1496                             u32 opcode,
1497                             u32 psn,
1498                             int diff,
1499                             struct qib_ctxtdata *rcd)
1500 {
1501         struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1502         struct rvt_ack_entry *e;
1503         unsigned long flags;
1504         u8 i, prev;
1505         int old_req;
1506
1507         if (diff > 0) {
1508                 /*
1509                  * Packet sequence error.
1510                  * A NAK will ACK earlier sends and RDMA writes.
1511                  * Don't queue the NAK if we already sent one.
1512                  */
1513                 if (!qp->r_nak_state) {
1514                         ibp->rvp.n_rc_seqnak++;
1515                         qp->r_nak_state = IB_NAK_PSN_ERROR;
1516                         /* Use the expected PSN. */
1517                         qp->r_ack_psn = qp->r_psn;
1518                         /*
1519                          * Wait to send the sequence NAK until all packets
1520                          * in the receive queue have been processed.
1521                          * Otherwise, we end up propagating congestion.
1522                          */
1523                         if (list_empty(&qp->rspwait)) {
1524                                 qp->r_flags |= RVT_R_RSP_NAK;
1525                                 rvt_get_qp(qp);
1526                                 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1527                         }
1528                 }
1529                 goto done;
1530         }
1531
1532         /*
1533          * Handle a duplicate request.  Don't re-execute SEND, RDMA
1534          * write or atomic op.  Don't NAK errors, just silently drop
1535          * the duplicate request.  Note that r_sge, r_len, and
1536          * r_rcv_len may be in use so don't modify them.
1537          *
1538          * We are supposed to ACK the earliest duplicate PSN but we
1539          * can coalesce an outstanding duplicate ACK.  We have to
1540          * send the earliest so that RDMA reads can be restarted at
1541          * the requester's expected PSN.
1542          *
1543          * First, find where this duplicate PSN falls within the
1544          * ACKs previously sent.
1545          * old_req is true if there is an older response that is scheduled
1546          * to be sent before sending this one.
1547          */
1548         e = NULL;
1549         old_req = 1;
1550         ibp->rvp.n_rc_dupreq++;
1551
1552         spin_lock_irqsave(&qp->s_lock, flags);
1553
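             /*
              * Walk the ACK queue backwards from the newest entry, wrapping at
              * QIB_MAX_RDMA_ATOMIC, to find the entry that covers this PSN.
              */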
1554         for (i = qp->r_head_ack_queue; ; i = prev) {
1555                 if (i == qp->s_tail_ack_queue)
1556                         old_req = 0;
1557                 if (i)
1558                         prev = i - 1;
1559                 else
1560                         prev = QIB_MAX_RDMA_ATOMIC;
1561                 if (prev == qp->r_head_ack_queue) {
1562                         e = NULL;
1563                         break;
1564                 }
1565                 e = &qp->s_ack_queue[prev];
1566                 if (!e->opcode) {
1567                         e = NULL;
1568                         break;
1569                 }
1570                 if (qib_cmp24(psn, e->psn) >= 0) {
1571                         if (prev == qp->s_tail_ack_queue &&
1572                             qib_cmp24(psn, e->lpsn) <= 0)
1573                                 old_req = 0;
1574                         break;
1575                 }
1576         }
1577         switch (opcode) {
1578         case OP(RDMA_READ_REQUEST): {
1579                 struct ib_reth *reth;
1580                 u32 offset;
1581                 u32 len;
1582
1583                 /*
1584                  * If we didn't find the RDMA read request in the ack queue,
1585                  * we can ignore this request.
1586                  */
1587                 if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1588                         goto unlock_done;
1589                 /* RETH comes after BTH */
1590                 reth = &ohdr->u.rc.reth;
1591                 /*
1592                  * Address range must be a subset of the original
1593                  * request and start on pmtu boundaries.
1594                  * We reuse the old ack_queue slot since the requester
1595                  * should not back up and request an earlier PSN for the
1596                  * same request.
1597                  */
1598                 offset = ((psn - e->psn) & QIB_PSN_MASK) *
1599                         qp->pmtu;
1600                 len = be32_to_cpu(reth->length);
1601                 if (unlikely(offset + len != e->rdma_sge.sge_length))
1602                         goto unlock_done;
1603                 if (e->rdma_sge.mr) {
1604                         rvt_put_mr(e->rdma_sge.mr);
1605                         e->rdma_sge.mr = NULL;
1606                 }
1607                 if (len != 0) {
1608                         u32 rkey = be32_to_cpu(reth->rkey);
1609                         u64 vaddr = be64_to_cpu(reth->vaddr);
1610                         int ok;
1611
1612                         ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1613                                          IB_ACCESS_REMOTE_READ);
1614                         if (unlikely(!ok))
1615                                 goto unlock_done;
1616                 } else {
1617                         e->rdma_sge.vaddr = NULL;
1618                         e->rdma_sge.length = 0;
1619                         e->rdma_sge.sge_length = 0;
1620                 }
1621                 e->psn = psn;
1622                 if (old_req)
1623                         goto unlock_done;
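                     /* Respond again starting from the reused queue entry. */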
1624                 qp->s_tail_ack_queue = prev;
1625                 break;
1626         }
1627
1628         case OP(COMPARE_SWAP):
1629         case OP(FETCH_ADD): {
1630                 /*
1631                  * If we didn't find the atomic request in the ack queue
1632                  * or the send tasklet is already backed up to send an
1633                  * earlier entry, we can ignore this request.
1634                  */
1635                 if (!e || e->opcode != (u8) opcode || old_req)
1636                         goto unlock_done;
1637                 qp->s_tail_ack_queue = prev;
1638                 break;
1639         }
1640
1641         default:
1642                 /*
1643                  * Ignore this operation if it doesn't request an ACK
1644                  * or an earlier RDMA read or atomic is going to be resent.
1645                  */
1646                 if (!(psn & IB_BTH_REQ_ACK) || old_req)
1647                         goto unlock_done;
1648                 /*
1649                  * Resend the most recent ACK if this request is
1650                  * after all the previous RDMA reads and atomics.
1651                  */
1652                 if (i == qp->r_head_ack_queue) {
1653                         spin_unlock_irqrestore(&qp->s_lock, flags);
1654                         qp->r_nak_state = 0;
1655                         qp->r_ack_psn = qp->r_psn - 1;
1656                         goto send_ack;
1657                 }
1658                 /*
1659                  * Try to send a simple ACK to work around a Mellanox bug
1660                  * where an RDMA read response or atomic response is not
1661                  * accepted as an ACK for earlier SENDs or RDMA writes.
1662                  */
1663                 if (!(qp->s_flags & RVT_S_RESP_PENDING)) {
1664                         spin_unlock_irqrestore(&qp->s_lock, flags);
1665                         qp->r_nak_state = 0;
1666                         qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
1667                         goto send_ack;
1668                 }
1669                 /*
1670                  * Resend the RDMA read or atomic op which
1671                  * ACKs this duplicate request.
1672                  */
1673                 qp->s_tail_ack_queue = i;
1674                 break;
1675         }
1676         qp->s_ack_state = OP(ACKNOWLEDGE);
1677         qp->s_flags |= RVT_S_RESP_PENDING;
1678         qp->r_nak_state = 0;
1679         qib_schedule_send(qp);
1680
1681 unlock_done:
1682         spin_unlock_irqrestore(&qp->s_lock, flags);
1683 done:
1684         return 1;
1685
1686 send_ack:
1687         return 0;
1688 }
1689
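     /*
      * Advance s_tail_ack_queue past entry n, wrapping at
      * QIB_MAX_RDMA_ATOMIC, and reset the ACK state machine.
      */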
1690 static inline void qib_update_ack_queue(struct rvt_qp *qp, unsigned n)
1691 {
1692         unsigned next;
1693
1694         next = n + 1;
1695         if (next > QIB_MAX_RDMA_ATOMIC)
1696                 next = 0;
1697         qp->s_tail_ack_queue = next;
1698         qp->s_ack_state = OP(ACKNOWLEDGE);
1699 }
1700
1701 /**
1702  * qib_rc_rcv - process an incoming RC packet
1703  * @rcd: the context pointer
1704  * @hdr: the header of this packet
1705  * @has_grh: true if the header has a GRH
1706  * @data: the packet data
1707  * @tlen: the packet length
1708  * @qp: the QP for this packet
1709  *
1710  * This is called from qib_qp_rcv() to process an incoming RC packet
1711  * for the given QP.
1712  * Called at interrupt level.
1713  */
1714 void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr,
1715                 int has_grh, void *data, u32 tlen, struct rvt_qp *qp)
1716 {
1717         struct qib_ibport *ibp = &rcd->ppd->ibport_data;
1718         struct ib_other_headers *ohdr;
1719         u32 opcode;
1720         u32 hdrsize;
1721         u32 psn;
1722         u32 pad;
1723         struct ib_wc wc;
1724         u32 pmtu = qp->pmtu;
1725         int diff;
1726         struct ib_reth *reth;
1727         unsigned long flags;
1728         int ret;
1729
1730         /* Check for GRH */
1731         if (!has_grh) {
1732                 ohdr = &hdr->u.oth;
1733                 hdrsize = 8 + 12;       /* LRH + BTH */
1734         } else {
1735                 ohdr = &hdr->u.l.oth;
1736                 hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
1737         }
1738
1739         opcode = be32_to_cpu(ohdr->bth[0]);
1740         if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
1741                 return;
1742
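             /* bth[2] carries the 24-bit PSN (and the AckReq bit); the opcode is the high byte of bth[0]. */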
1743         psn = be32_to_cpu(ohdr->bth[2]);
1744         opcode >>= 24;
1745
1746         /*
1747          * Process responses (ACKs) before anything else.  Note that the
1748          * packet sequence number will be for something in the send work
1749          * queue rather than the expected receive packet sequence number.
1750          * In other words, this QP is the requester.
1751          */
1752         if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1753             opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1754                 qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
1755                                 hdrsize, pmtu, rcd);
1756                 return;
1757         }
1758
1759         /* Compute 24 bits worth of difference. */
1760         diff = qib_cmp24(psn, qp->r_psn);
1761         if (unlikely(diff)) {
1762                 if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
1763                         return;
1764                 goto send_ack;
1765         }
1766
1767         /* Check for opcode sequence errors. */
1768         switch (qp->r_state) {
1769         case OP(SEND_FIRST):
1770         case OP(SEND_MIDDLE):
1771                 if (opcode == OP(SEND_MIDDLE) ||
1772                     opcode == OP(SEND_LAST) ||
1773                     opcode == OP(SEND_LAST_WITH_IMMEDIATE))
1774                         break;
1775                 goto nack_inv;
1776
1777         case OP(RDMA_WRITE_FIRST):
1778         case OP(RDMA_WRITE_MIDDLE):
1779                 if (opcode == OP(RDMA_WRITE_MIDDLE) ||
1780                     opcode == OP(RDMA_WRITE_LAST) ||
1781                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1782                         break;
1783                 goto nack_inv;
1784
1785         default:
1786                 if (opcode == OP(SEND_MIDDLE) ||
1787                     opcode == OP(SEND_LAST) ||
1788                     opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
1789                     opcode == OP(RDMA_WRITE_MIDDLE) ||
1790                     opcode == OP(RDMA_WRITE_LAST) ||
1791                     opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
1792                         goto nack_inv;
1793                 /*
1794                  * Note that it is up to the requester to not send a new
1795                  * RDMA read or atomic operation before receiving an ACK
1796                  * for the previous operation.
1797                  */
1798                 break;
1799         }
1800
1801         if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
1802                 rvt_comm_est(qp);
1803
1804         /* OK, process the packet. */
1805         switch (opcode) {
1806         case OP(SEND_FIRST):
1807                 ret = rvt_get_rwqe(qp, false);
1808                 if (ret < 0)
1809                         goto nack_op_err;
1810                 if (!ret)
1811                         goto rnr_nak;
1812                 qp->r_rcv_len = 0;
1813                 fallthrough;
1814         case OP(SEND_MIDDLE):
1815         case OP(RDMA_WRITE_MIDDLE):
1816 send_middle:
1817                 /* Check for invalid length: payload must be one PMTU and fit the posted RWQE. */
1818                 if (unlikely(tlen != (hdrsize + pmtu + 4)))
1819                         goto nack_inv;
1820                 qp->r_rcv_len += pmtu;
1821                 if (unlikely(qp->r_rcv_len > qp->r_len))
1822                         goto nack_inv;
1823                 rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
1824                 break;
1825
1826         case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
1827                 /* consume RWQE */
1828                 ret = rvt_get_rwqe(qp, true);
1829                 if (ret < 0)
1830                         goto nack_op_err;
1831                 if (!ret)
1832                         goto rnr_nak;
1833                 goto send_last_imm;
1834
1835         case OP(SEND_ONLY):
1836         case OP(SEND_ONLY_WITH_IMMEDIATE):
1837                 ret = rvt_get_rwqe(qp, false);
1838                 if (ret < 0)
1839                         goto nack_op_err;
1840                 if (!ret)
1841                         goto rnr_nak;
1842                 qp->r_rcv_len = 0;
1843                 if (opcode == OP(SEND_ONLY))
1844                         goto no_immediate_data;
1845                 fallthrough;    /* for SEND_ONLY_WITH_IMMEDIATE */
1846         case OP(SEND_LAST_WITH_IMMEDIATE):
1847 send_last_imm:
1848                 wc.ex.imm_data = ohdr->u.imm_data;
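                     /* Immediate data adds one more 32-bit word to the header. */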
1849                 hdrsize += 4;
1850                 wc.wc_flags = IB_WC_WITH_IMM;
1851                 goto send_last;
1852         case OP(SEND_LAST):
1853         case OP(RDMA_WRITE_LAST):
1854 no_immediate_data:
1855                 wc.wc_flags = 0;
1856                 wc.ex.imm_data = 0;
1857 send_last:
1858                 /* Get the number of bytes the message was padded by. */
1859                 pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1860                 /* Check for invalid length. */
1861                 /* XXX LAST len should be >= 1 */
1862                 if (unlikely(tlen < (hdrsize + pad + 4)))
1863                         goto nack_inv;
1864                 /* Don't count the CRC. */
1865                 tlen -= (hdrsize + pad + 4);
1866                 wc.byte_len = tlen + qp->r_rcv_len;
1867                 if (unlikely(wc.byte_len > qp->r_len))
1868                         goto nack_inv;
1869                 rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
1870                 rvt_put_ss(&qp->r_sge);
1871                 qp->r_msn++;
1872                 if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
1873                         break;
1874                 wc.wr_id = qp->r_wr_id;
1875                 wc.status = IB_WC_SUCCESS;
1876                 if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
1877                     opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
1878                         wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
1879                 else
1880                         wc.opcode = IB_WC_RECV;
1881                 wc.qp = &qp->ibqp;
1882                 wc.src_qp = qp->remote_qpn;
1883                 wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1884                 wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
1885                 /* zero fields that are N/A */
1886                 wc.vendor_err = 0;
1887                 wc.pkey_index = 0;
1888                 wc.dlid_path_bits = 0;
1889                 wc.port_num = 0;
1890                 /* Signal completion event if the solicited bit is set. */
1891                 rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
1892                 break;
1893
1894         case OP(RDMA_WRITE_FIRST):
1895         case OP(RDMA_WRITE_ONLY):
1896         case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
1897                 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
1898                         goto nack_inv;
1899                 /* consume RWQE */
1900                 reth = &ohdr->u.rc.reth;
1901                 hdrsize += sizeof(*reth);
1902                 qp->r_len = be32_to_cpu(reth->length);
1903                 qp->r_rcv_len = 0;
1904                 qp->r_sge.sg_list = NULL;
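                     /* Describe the write target with a single SGE (or an empty one for zero-length writes). */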
1905                 if (qp->r_len != 0) {
1906                         u32 rkey = be32_to_cpu(reth->rkey);
1907                         u64 vaddr = be64_to_cpu(reth->vaddr);
1908                         int ok;
1909
1910                         /* Check rkey & NAK */
1911                         ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
1912                                          rkey, IB_ACCESS_REMOTE_WRITE);
1913                         if (unlikely(!ok))
1914                                 goto nack_acc;
1915                         qp->r_sge.num_sge = 1;
1916                 } else {
1917                         qp->r_sge.num_sge = 0;
1918                         qp->r_sge.sge.mr = NULL;
1919                         qp->r_sge.sge.vaddr = NULL;
1920                         qp->r_sge.sge.length = 0;
1921                         qp->r_sge.sge.sge_length = 0;
1922                 }
1923                 if (opcode == OP(RDMA_WRITE_FIRST))
1924                         goto send_middle;
1925                 else if (opcode == OP(RDMA_WRITE_ONLY))
1926                         goto no_immediate_data;
1927                 ret = rvt_get_rwqe(qp, true);
1928                 if (ret < 0)
1929                         goto nack_op_err;
1930                 if (!ret) {
1931                         rvt_put_ss(&qp->r_sge);
1932                         goto rnr_nak;
1933                 }
1934                 wc.ex.imm_data = ohdr->u.rc.imm_data;
1935                 hdrsize += 4;
1936                 wc.wc_flags = IB_WC_WITH_IMM;
1937                 goto send_last;
1938
1939         case OP(RDMA_READ_REQUEST): {
1940                 struct rvt_ack_entry *e;
1941                 u32 len;
1942                 u8 next;
1943
1944                 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
1945                         goto nack_inv;
1946                 next = qp->r_head_ack_queue + 1;
1947                 /* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
1948                 if (next > QIB_MAX_RDMA_ATOMIC)
1949                         next = 0;
1950                 spin_lock_irqsave(&qp->s_lock, flags);
1951                 if (unlikely(next == qp->s_tail_ack_queue)) {
1952                         if (!qp->s_ack_queue[next].sent)
1953                                 goto nack_inv_unlck;
1954                         qib_update_ack_queue(qp, next);
1955                 }
1956                 e = &qp->s_ack_queue[qp->r_head_ack_queue];
1957                 if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
1958                         rvt_put_mr(e->rdma_sge.mr);
1959                         e->rdma_sge.mr = NULL;
1960                 }
1961                 reth = &ohdr->u.rc.reth;
1962                 len = be32_to_cpu(reth->length);
1963                 if (len) {
1964                         u32 rkey = be32_to_cpu(reth->rkey);
1965                         u64 vaddr = be64_to_cpu(reth->vaddr);
1966                         int ok;
1967
1968                         /* Check rkey & NAK */
1969                         ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
1970                                          rkey, IB_ACCESS_REMOTE_READ);
1971                         if (unlikely(!ok))
1972                                 goto nack_acc_unlck;
1973                         /*
1974                          * Update the next expected PSN.  We add 1 later
1975                          * below, so only add the remainder here.
1976                          */
1977                         qp->r_psn += rvt_div_mtu(qp, len - 1);
1978                 } else {
1979                         e->rdma_sge.mr = NULL;
1980                         e->rdma_sge.vaddr = NULL;
1981                         e->rdma_sge.length = 0;
1982                         e->rdma_sge.sge_length = 0;
1983                 }
1984                 e->opcode = opcode;
1985                 e->sent = 0;
1986                 e->psn = psn;
1987                 e->lpsn = qp->r_psn;
1988                 /*
1989                  * We need to increment the MSN here instead of when we
1990                  * finish sending the result since a duplicate request would
1991                  * increment it more than once.
1992                  */
1993                 qp->r_msn++;
1994                 qp->r_psn++;
1995                 qp->r_state = opcode;
1996                 qp->r_nak_state = 0;
1997                 qp->r_head_ack_queue = next;
1998
1999                 /* Schedule the send tasklet. */
2000                 qp->s_flags |= RVT_S_RESP_PENDING;
2001                 qib_schedule_send(qp);
2002
2003                 goto sunlock;
2004         }
2005
2006         case OP(COMPARE_SWAP):
2007         case OP(FETCH_ADD): {
2008                 struct ib_atomic_eth *ateth;
2009                 struct rvt_ack_entry *e;
2010                 u64 vaddr;
2011                 atomic64_t *maddr;
2012                 u64 sdata;
2013                 u32 rkey;
2014                 u8 next;
2015
2016                 if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2017                         goto nack_inv;
2018                 next = qp->r_head_ack_queue + 1;
2019                 if (next > QIB_MAX_RDMA_ATOMIC)
2020                         next = 0;
2021                 spin_lock_irqsave(&qp->s_lock, flags);
2022                 if (unlikely(next == qp->s_tail_ack_queue)) {
2023                         if (!qp->s_ack_queue[next].sent)
2024                                 goto nack_inv_unlck;
2025                         qib_update_ack_queue(qp, next);
2026                 }
2027                 e = &qp->s_ack_queue[qp->r_head_ack_queue];
2028                 if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2029                         rvt_put_mr(e->rdma_sge.mr);
2030                         e->rdma_sge.mr = NULL;
2031                 }
2032                 ateth = &ohdr->u.atomic_eth;
2033                 vaddr = get_ib_ateth_vaddr(ateth);
2034                 if (unlikely(vaddr & (sizeof(u64) - 1)))
2035                         goto nack_inv_unlck;
2036                 rkey = be32_to_cpu(ateth->rkey);
2037                 /* Check rkey & NAK */
2038                 if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2039                                           vaddr, rkey,
2040                                           IB_ACCESS_REMOTE_ATOMIC)))
2041                         goto nack_acc_unlck;
2042                 /* Perform atomic OP and save result. */
2043                 maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
2044                 sdata = get_ib_ateth_swap(ateth);
2045                 e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2046                         (u64) atomic64_add_return(sdata, maddr) - sdata :
2047                         (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
2048                                       get_ib_ateth_compare(ateth),
2049                                       sdata);
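                     /*
                      * Either way, e->atomic_data holds the value that was at
                      * vaddr before the operation; it is returned in the ACK.
                      */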
2050                 rvt_put_mr(qp->r_sge.sge.mr);
2051                 qp->r_sge.num_sge = 0;
2052                 e->opcode = opcode;
2053                 e->sent = 0;
2054                 e->psn = psn;
2055                 e->lpsn = psn;
2056                 qp->r_msn++;
2057                 qp->r_psn++;
2058                 qp->r_state = opcode;
2059                 qp->r_nak_state = 0;
2060                 qp->r_head_ack_queue = next;
2061
2062                 /* Schedule the send tasklet. */
2063                 qp->s_flags |= RVT_S_RESP_PENDING;
2064                 qib_schedule_send(qp);
2065
2066                 goto sunlock;
2067         }
2068
2069         default:
2070                 /* NAK unknown opcodes. */
2071                 goto nack_inv;
2072         }
2073         qp->r_psn++;
2074         qp->r_state = opcode;
2075         qp->r_ack_psn = psn;
2076         qp->r_nak_state = 0;
2077         /* Send an ACK if requested or required. */
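             /* The AckReq bit is the top bit of the BTH word holding the PSN. */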
2078         if (psn & (1 << 31))
2079                 goto send_ack;
2080         return;
2081
2082 rnr_nak:
2083         qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
2084         qp->r_ack_psn = qp->r_psn;
2085         /* Queue RNR NAK for later */
2086         if (list_empty(&qp->rspwait)) {
2087                 qp->r_flags |= RVT_R_RSP_NAK;
2088                 rvt_get_qp(qp);
2089                 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2090         }
2091         return;
2092
2093 nack_op_err:
2094         rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2095         qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2096         qp->r_ack_psn = qp->r_psn;
2097         /* Queue NAK for later */
2098         if (list_empty(&qp->rspwait)) {
2099                 qp->r_flags |= RVT_R_RSP_NAK;
2100                 rvt_get_qp(qp);
2101                 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2102         }
2103         return;
2104
2105 nack_inv_unlck:
2106         spin_unlock_irqrestore(&qp->s_lock, flags);
2107 nack_inv:
2108         rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2109         qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2110         qp->r_ack_psn = qp->r_psn;
2111         /* Queue NAK for later */
2112         if (list_empty(&qp->rspwait)) {
2113                 qp->r_flags |= RVT_R_RSP_NAK;
2114                 rvt_get_qp(qp);
2115                 list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2116         }
2117         return;
2118
2119 nack_acc_unlck:
2120         spin_unlock_irqrestore(&qp->s_lock, flags);
2121 nack_acc:
2122         rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2123         qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2124         qp->r_ack_psn = qp->r_psn;
2125 send_ack:
2126         qib_send_rc_ack(qp);
2127         return;
2128
2129 sunlock:
2130         spin_unlock_irqrestore(&qp->s_lock, flags);
2131 }