drivers/infiniband/sw/siw/siw_verbs.c
1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/uaccess.h>
9 #include <linux/vmalloc.h>
10 #include <linux/xarray.h>
11
12 #include <rdma/iw_cm.h>
13 #include <rdma/ib_verbs.h>
14 #include <rdma/ib_user_verbs.h>
15 #include <rdma/uverbs_ioctl.h>
16
17 #include "siw.h"
18 #include "siw_verbs.h"
19 #include "siw_mem.h"
20
21 static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
22         [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
23         [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
24         [IB_QPS_RTR] = SIW_QP_STATE_RTR,
25         [IB_QPS_RTS] = SIW_QP_STATE_RTS,
26         [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
27         [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
28         [IB_QPS_ERR] = SIW_QP_STATE_ERROR
29 };
30
31 static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
32         [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
33         [IB_QPS_RTS] = "RTS",     [IB_QPS_SQD] = "SQD",   [IB_QPS_SQE] = "SQE",
34         [IB_QPS_ERR] = "ERR"
35 };
36
37 void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
38 {
39         struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry);
40
41         kfree(entry);
42 }
43
44 int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
45 {
46         struct siw_ucontext *uctx = to_siw_ctx(ctx);
47         size_t size = vma->vm_end - vma->vm_start;
48         struct rdma_user_mmap_entry *rdma_entry;
49         struct siw_user_mmap_entry *entry;
50         int rv = -EINVAL;
51
52         /*
53          * Must be page aligned
54          */
55         if (vma->vm_start & (PAGE_SIZE - 1)) {
56                 pr_warn("siw: mmap not page aligned\n");
57                 return -EINVAL;
58         }
59         rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma);
60         if (!rdma_entry) {
61                 siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n",
62                         vma->vm_pgoff, size);
63                 return -EINVAL;
64         }
65         entry = to_siw_mmap_entry(rdma_entry);
66
67         rv = remap_vmalloc_range(vma, entry->address, 0);
68         if (rv) {
69                 pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff,
70                         size);
71                 goto out;
72         }
73 out:
74         rdma_user_mmap_entry_put(rdma_entry);
75
76         return rv;
77 }
78
79 int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
80 {
81         struct siw_device *sdev = to_siw_dev(base_ctx->device);
82         struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
83         struct siw_uresp_alloc_ctx uresp = {};
84         int rv;
85
86         if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
87                 rv = -ENOMEM;
88                 goto err_out;
89         }
90         ctx->sdev = sdev;
91
92         uresp.dev_id = sdev->vendor_part_id;
93
94         if (udata->outlen < sizeof(uresp)) {
95                 rv = -EINVAL;
96                 goto err_out;
97         }
98         rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
99         if (rv)
100                 goto err_out;
101
102         siw_dbg(base_ctx->device, "success. now %d context(s)\n",
103                 atomic_read(&sdev->num_ctx));
104
105         return 0;
106
107 err_out:
108         atomic_dec(&sdev->num_ctx);
109         siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
110                 atomic_read(&sdev->num_ctx));
111
112         return rv;
113 }
114
115 void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
116 {
117         struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
118
119         atomic_dec(&uctx->sdev->num_ctx);
120 }
121
122 int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
123                      struct ib_udata *udata)
124 {
125         struct siw_device *sdev = to_siw_dev(base_dev);
126
127         if (udata->inlen || udata->outlen)
128                 return -EINVAL;
129
130         memset(attr, 0, sizeof(*attr));
131
132         /* Revisit atomic caps if RFC 7306 gets supported */
133         attr->atomic_cap = 0;
134         attr->device_cap_flags =
135                 IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
136         attr->max_cq = sdev->attrs.max_cq;
137         attr->max_cqe = sdev->attrs.max_cqe;
138         attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
139         attr->max_mr = sdev->attrs.max_mr;
140         attr->max_mw = sdev->attrs.max_mw;
141         attr->max_mr_size = ~0ull;
142         attr->max_pd = sdev->attrs.max_pd;
143         attr->max_qp = sdev->attrs.max_qp;
144         attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
145         attr->max_qp_rd_atom = sdev->attrs.max_ord;
146         attr->max_qp_wr = sdev->attrs.max_qp_wr;
147         attr->max_recv_sge = sdev->attrs.max_sge;
148         attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
149         attr->max_send_sge = sdev->attrs.max_sge;
150         attr->max_sge_rd = sdev->attrs.max_sge_rd;
151         attr->max_srq = sdev->attrs.max_srq;
152         attr->max_srq_sge = sdev->attrs.max_srq_sge;
153         attr->max_srq_wr = sdev->attrs.max_srq_wr;
154         attr->page_size_cap = PAGE_SIZE;
155         attr->vendor_id = SIW_VENDOR_ID;
156         attr->vendor_part_id = sdev->vendor_part_id;
157
158         memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
159
160         return 0;
161 }
162
163 int siw_query_port(struct ib_device *base_dev, u32 port,
164                    struct ib_port_attr *attr)
165 {
166         struct siw_device *sdev = to_siw_dev(base_dev);
167         int rv;
168
169         memset(attr, 0, sizeof(*attr));
170
171         rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
172                          &attr->active_width);
173         attr->gid_tbl_len = 1;
174         attr->max_msg_sz = -1;
175         attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
176         attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
177         attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
178                 IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
179         attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
180         attr->state = sdev->state;
181         /*
182          * All zero
183          *
184          * attr->lid = 0;
185          * attr->bad_pkey_cntr = 0;
186          * attr->qkey_viol_cntr = 0;
187          * attr->sm_lid = 0;
188          * attr->lmc = 0;
189          * attr->max_vl_num = 0;
190          * attr->sm_sl = 0;
191          * attr->subnet_timeout = 0;
192          * attr->init_type_reply = 0;
193          */
194         return rv;
195 }
196
197 int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
198                            struct ib_port_immutable *port_immutable)
199 {
200         struct ib_port_attr attr;
201         int rv = siw_query_port(base_dev, port, &attr);
202
203         if (rv)
204                 return rv;
205
206         port_immutable->gid_tbl_len = attr.gid_tbl_len;
207         port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
208
209         return 0;
210 }
211
212 int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
213                   union ib_gid *gid)
214 {
215         struct siw_device *sdev = to_siw_dev(base_dev);
216
217         /* subnet_prefix == interface_id == 0; */
218         memset(gid, 0, sizeof(*gid));
219         memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);
220
221         return 0;
222 }
223
224 int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
225 {
226         struct siw_device *sdev = to_siw_dev(pd->device);
227
228         if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
229                 atomic_dec(&sdev->num_pd);
230                 return -ENOMEM;
231         }
232         siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd));
233
234         return 0;
235 }
236
237 int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
238 {
239         struct siw_device *sdev = to_siw_dev(pd->device);
240
241         siw_dbg_pd(pd, "free PD\n");
242         atomic_dec(&sdev->num_pd);
243         return 0;
244 }
245
246 void siw_qp_get_ref(struct ib_qp *base_qp)
247 {
248         siw_qp_get(to_siw_qp(base_qp));
249 }
250
251 void siw_qp_put_ref(struct ib_qp *base_qp)
252 {
253         siw_qp_put(to_siw_qp(base_qp));
254 }
255
256 static struct rdma_user_mmap_entry *
257 siw_mmap_entry_insert(struct siw_ucontext *uctx,
258                       void *address, size_t length,
259                       u64 *offset)
260 {
261         struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
262         int rv;
263
264         *offset = SIW_INVAL_UOBJ_KEY;
265         if (!entry)
266                 return NULL;
267
268         entry->address = address;
269
270         rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext,
271                                          &entry->rdma_entry,
272                                          length);
273         if (rv) {
274                 kfree(entry);
275                 return NULL;
276         }
277
278         *offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
279
280         return &entry->rdma_entry;
281 }
282
283 /*
284  * siw_create_qp()
285  *
286  * Create QP of requested size on given device.
287  *
288  * @pd:         Protection Domain
289  * @attrs:      Initial QP attributes.
290  * @udata:      used to provide QP ID, SQ and RQ size back to user.
291  */
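/*
 * Editor's note: a minimal kernel-side caller sketch (not part of this
 * driver), assuming an established rdma_cm_id 'id' plus existing PD 'pd'
 * and CQs 'scq'/'rcq'.  Requested sizes must stay within SIW_MAX_QP_WR
 * and SIW_MAX_SGE, or QP creation fails with -EINVAL:
 *
 *	struct ib_qp_init_attr qpia = {
 *		.qp_type     = IB_QPT_RC,
 *		.send_cq     = scq,
 *		.recv_cq     = rcq,
 *		.sq_sig_type = IB_SIGNAL_REQ_WR,
 *		.cap = { .max_send_wr = 64, .max_recv_wr = 64,
 *			 .max_send_sge = 2, .max_recv_sge = 2 },
 *	};
 *	int ret = rdma_create_qp(id, pd, &qpia);
 */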
292
293 struct ib_qp *siw_create_qp(struct ib_pd *pd,
294                             struct ib_qp_init_attr *attrs,
295                             struct ib_udata *udata)
296 {
297         struct siw_qp *qp = NULL;
298         struct ib_device *base_dev = pd->device;
299         struct siw_device *sdev = to_siw_dev(base_dev);
300         struct siw_ucontext *uctx =
301                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
302                                           base_ucontext);
303         unsigned long flags;
304         int num_sqe, num_rqe, rv = 0;
305         size_t length;
306
307         siw_dbg(base_dev, "create new QP\n");
308
309         if (attrs->create_flags)
310                 return ERR_PTR(-EOPNOTSUPP);
311
312         if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
313                 siw_dbg(base_dev, "too many QP's\n");
314                 rv = -ENOMEM;
315                 goto err_out;
316         }
317         if (attrs->qp_type != IB_QPT_RC) {
318                 siw_dbg(base_dev, "only RC QP's supported\n");
319                 rv = -EOPNOTSUPP;
320                 goto err_out;
321         }
322         if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
323             (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
324             (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
325             (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
326                 siw_dbg(base_dev, "QP size error\n");
327                 rv = -EINVAL;
328                 goto err_out;
329         }
330         if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
331                 siw_dbg(base_dev, "max inline send: %d > %d\n",
332                         attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
333                 rv = -EINVAL;
334                 goto err_out;
335         }
336         /*
337          * NOTE: we allow zero-element SGLs in SQ and RQ WQEs,
338          * but not a QP unable to hold any WQE (SQ + RQ)
339          */
340         if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
341                 siw_dbg(base_dev, "QP must have send or receive queue\n");
342                 rv = -EINVAL;
343                 goto err_out;
344         }
345
346         if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) {
347                 siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
348                 rv = -EINVAL;
349                 goto err_out;
350         }
351         qp = kzalloc(sizeof(*qp), GFP_KERNEL);
352         if (!qp) {
353                 rv = -ENOMEM;
354                 goto err_out;
355         }
356         init_rwsem(&qp->state_lock);
357         spin_lock_init(&qp->sq_lock);
358         spin_lock_init(&qp->rq_lock);
359         spin_lock_init(&qp->orq_lock);
360
361         rv = siw_qp_add(sdev, qp);
362         if (rv)
363                 goto err_out;
364
365         num_sqe = attrs->cap.max_send_wr;
366         num_rqe = attrs->cap.max_recv_wr;
367
368         /* All queue indices are derived from modulo operations
369          * on a free running 'get' (consumer) and 'put' (producer)
370          * unsigned counter. Having queue sizes at power of two
371          * avoids handling counter wrap around.
372          */
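        /*
         * Editor's note: with a power-of-two queue size, 'put % size'
         * equals 'put & (size - 1)', and a free-running u32 counter
         * stays consistent across its 2^32 wrap because size divides 2^32.
         */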
373         if (num_sqe)
374                 num_sqe = roundup_pow_of_two(num_sqe);
375         else {
376                 /* Zero sized SQ is not supported */
377                 rv = -EINVAL;
378                 goto err_out_xa;
379         }
380         if (num_rqe)
381                 num_rqe = roundup_pow_of_two(num_rqe);
382
383         if (udata)
384                 qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
385         else
386                 qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
387
388         if (qp->sendq == NULL) {
389                 rv = -ENOMEM;
390                 goto err_out_xa;
391         }
392         if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
393                 if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
394                         qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
395                 else {
396                         rv = -EINVAL;
397                         goto err_out_xa;
398                 }
399         }
400         qp->pd = pd;
401         qp->scq = to_siw_cq(attrs->send_cq);
402         qp->rcq = to_siw_cq(attrs->recv_cq);
403
404         if (attrs->srq) {
405                 /*
406                  * SRQ support.
407                  * Verbs 6.3.7: ignore RQ size, if SRQ present
408                  * Verbs 6.3.5: do not check PD of SRQ against PD of QP
409                  */
410                 qp->srq = to_siw_srq(attrs->srq);
411                 qp->attrs.rq_size = 0;
412                 siw_dbg(base_dev, "QP [%u]: SRQ attached\n",
413                         qp->base_qp.qp_num);
414         } else if (num_rqe) {
415                 if (udata)
416                         qp->recvq =
417                                 vmalloc_user(num_rqe * sizeof(struct siw_rqe));
418                 else
419                         qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
420
421                 if (qp->recvq == NULL) {
422                         rv = -ENOMEM;
423                         goto err_out_xa;
424                 }
425                 qp->attrs.rq_size = num_rqe;
426         }
427         qp->attrs.sq_size = num_sqe;
428         qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
429         qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
430
431         /* Make those two tunables fixed for now. */
432         qp->tx_ctx.gso_seg_limit = 1;
433         qp->tx_ctx.zcopy_tx = zcopy_tx;
434
435         qp->attrs.state = SIW_QP_STATE_IDLE;
436
437         if (udata) {
438                 struct siw_uresp_create_qp uresp = {};
439
440                 uresp.num_sqe = num_sqe;
441                 uresp.num_rqe = num_rqe;
442                 uresp.qp_id = qp_id(qp);
443
444                 if (qp->sendq) {
445                         length = num_sqe * sizeof(struct siw_sqe);
446                         qp->sq_entry =
447                                 siw_mmap_entry_insert(uctx, qp->sendq,
448                                                       length, &uresp.sq_key);
449                         if (!qp->sq_entry) {
450                                 rv = -ENOMEM;
451                                 goto err_out_xa;
452                         }
453                 }
454
455                 if (qp->recvq) {
456                         length = num_rqe * sizeof(struct siw_rqe);
457                         qp->rq_entry =
458                                 siw_mmap_entry_insert(uctx, qp->recvq,
459                                                       length, &uresp.rq_key);
460                         if (!qp->rq_entry) {
461                                 uresp.sq_key = SIW_INVAL_UOBJ_KEY;
462                                 rv = -ENOMEM;
463                                 goto err_out_xa;
464                         }
465                 }
466
467                 if (udata->outlen < sizeof(uresp)) {
468                         rv = -EINVAL;
469                         goto err_out_xa;
470                 }
471                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
472                 if (rv)
473                         goto err_out_xa;
474         }
475         qp->tx_cpu = siw_get_tx_cpu(sdev);
476         if (qp->tx_cpu < 0) {
477                 rv = -EINVAL;
478                 goto err_out_xa;
479         }
480         INIT_LIST_HEAD(&qp->devq);
481         spin_lock_irqsave(&sdev->lock, flags);
482         list_add_tail(&qp->devq, &sdev->qp_list);
483         spin_unlock_irqrestore(&sdev->lock, flags);
484
485         return &qp->base_qp;
486
487 err_out_xa:
488         xa_erase(&sdev->qp_xa, qp_id(qp));
489 err_out:
490         if (qp) {
491                 if (uctx) {
492                         rdma_user_mmap_entry_remove(qp->sq_entry);
493                         rdma_user_mmap_entry_remove(qp->rq_entry);
494                 }
495                 vfree(qp->sendq);
496                 vfree(qp->recvq);
497                 kfree(qp);
498         }
499         atomic_dec(&sdev->num_qp);
500
501         return ERR_PTR(rv);
502 }
503
504 /*
505  * Minimum siw_query_qp() verb interface.
506  *
507  * @qp_attr_mask is not used but all available information is provided
508  */
509 int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
510                  int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
511 {
512         struct siw_qp *qp;
513         struct siw_device *sdev;
514
515         if (base_qp && qp_attr && qp_init_attr) {
516                 qp = to_siw_qp(base_qp);
517                 sdev = to_siw_dev(base_qp->device);
518         } else {
519                 return -EINVAL;
520         }
521         qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
522         qp_attr->cap.max_send_wr = qp->attrs.sq_size;
523         qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
524         qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
525         qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
526         qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
527         qp_attr->max_rd_atomic = qp->attrs.irq_size;
528         qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
529
530         qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
531                                    IB_ACCESS_REMOTE_WRITE |
532                                    IB_ACCESS_REMOTE_READ;
533
534         qp_init_attr->qp_type = base_qp->qp_type;
535         qp_init_attr->send_cq = base_qp->send_cq;
536         qp_init_attr->recv_cq = base_qp->recv_cq;
537         qp_init_attr->srq = base_qp->srq;
538
539         qp_init_attr->cap = qp_attr->cap;
540
541         return 0;
542 }
543
544 int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
545                         int attr_mask, struct ib_udata *udata)
546 {
547         struct siw_qp_attrs new_attrs;
548         enum siw_qp_attr_mask siw_attr_mask = 0;
549         struct siw_qp *qp = to_siw_qp(base_qp);
550         int rv = 0;
551
552         if (!attr_mask)
553                 return 0;
554
555         if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
556                 return -EOPNOTSUPP;
557
558         memset(&new_attrs, 0, sizeof(new_attrs));
559
560         if (attr_mask & IB_QP_ACCESS_FLAGS) {
561                 siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
562
563                 if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
564                         new_attrs.flags |= SIW_RDMA_READ_ENABLED;
565                 if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
566                         new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
567                 if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
568                         new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
569         }
570         if (attr_mask & IB_QP_STATE) {
571                 siw_dbg_qp(qp, "desired IB QP state: %s\n",
572                            ib_qp_state_to_string[attr->qp_state]);
573
574                 new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
575
576                 if (new_attrs.state > SIW_QP_STATE_RTS)
577                         qp->tx_ctx.tx_suspend = 1;
578
579                 siw_attr_mask |= SIW_QP_ATTR_STATE;
580         }
581         if (!siw_attr_mask)
582                 goto out;
583
584         down_write(&qp->state_lock);
585
586         rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
587
588         up_write(&qp->state_lock);
589 out:
590         return rv;
591 }
592
593 int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
594 {
595         struct siw_qp *qp = to_siw_qp(base_qp);
596         struct siw_ucontext *uctx =
597                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
598                                           base_ucontext);
599         struct siw_qp_attrs qp_attrs;
600
601         siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
602
603         /*
604          * Mark QP as in process of destruction to prevent from
605          * any async callbacks to RDMA core
606          */
607         qp->attrs.flags |= SIW_QP_IN_DESTROY;
608         qp->rx_stream.rx_suspend = 1;
609
610         if (uctx) {
611                 rdma_user_mmap_entry_remove(qp->sq_entry);
612                 rdma_user_mmap_entry_remove(qp->rq_entry);
613         }
614
615         down_write(&qp->state_lock);
616
617         qp_attrs.state = SIW_QP_STATE_ERROR;
618         siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
619
620         if (qp->cep) {
621                 siw_cep_put(qp->cep);
622                 qp->cep = NULL;
623         }
624         up_write(&qp->state_lock);
625
626         kfree(qp->tx_ctx.mpa_crc_hd);
627         kfree(qp->rx_stream.mpa_crc_hd);
628
629         qp->scq = qp->rcq = NULL;
630
631         siw_qp_put(qp);
632
633         return 0;
634 }
635
636 /*
637  * siw_copy_inline_sgl()
638  *
639  * Prepare sgl of inlined data for sending. For userland callers,
640  * the function checks if the given buffer addresses and lengths are
641  * within process context bounds.
642  * Data from all provided sge's is copied together into the wqe,
643  * referenced by a single sge.
644  */
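/*
 * Editor's note: as coded below, the inline payload is stored within the
 * SQE itself starting at &sqe->sge[1]; sge[0] (with lkey 0) then simply
 * describes that in-SQE buffer.
 */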
645 static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
646                                struct siw_sqe *sqe)
647 {
648         struct ib_sge *core_sge = core_wr->sg_list;
649         void *kbuf = &sqe->sge[1];
650         int num_sge = core_wr->num_sge, bytes = 0;
651
652         sqe->sge[0].laddr = (uintptr_t)kbuf;
653         sqe->sge[0].lkey = 0;
654
655         while (num_sge--) {
656                 if (!core_sge->length) {
657                         core_sge++;
658                         continue;
659                 }
660                 bytes += core_sge->length;
661                 if (bytes > SIW_MAX_INLINE) {
662                         bytes = -EINVAL;
663                         break;
664                 }
665                 memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
666                        core_sge->length);
667
668                 kbuf += core_sge->length;
669                 core_sge++;
670         }
671         sqe->sge[0].length = bytes > 0 ? bytes : 0;
672         sqe->num_sge = bytes > 0 ? 1 : 0;
673
674         return bytes;
675 }
676
677 /* Complete SQ WR's without processing */
678 static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
679                            const struct ib_send_wr **bad_wr)
680 {
681         struct siw_sqe sqe = {};
682         int rv = 0;
683
684         while (wr) {
685                 sqe.id = wr->wr_id;
686                 sqe.opcode = wr->opcode;
687                 rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR);
688                 if (rv) {
689                         if (bad_wr)
690                                 *bad_wr = wr;
691                         break;
692                 }
693                 wr = wr->next;
694         }
695         return rv;
696 }
697
698 /* Complete RQ WR's without processing */
699 static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
700                            const struct ib_recv_wr **bad_wr)
701 {
702         struct siw_rqe rqe = {};
703         int rv = 0;
704
705         while (wr) {
706                 rqe.id = wr->wr_id;
707                 rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
708                 if (rv) {
709                         if (bad_wr)
710                                 *bad_wr = wr;
711                         break;
712                 }
713                 wr = wr->next;
714         }
715         return rv;
716 }
717
718 /*
719  * siw_post_send()
720  *
721  * Post a list of S-WR's to a SQ.
722  *
723  * @base_qp:    Base QP contained in siw QP
724  * @wr:         Null terminated list of user WR's
725  * @bad_wr:     Points to failing WR in case of synchronous failure.
726  */
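/*
 * Editor's note: a minimal kernel caller sketch (not part of this driver),
 * posting one signaled SEND with a single SGE; 'qp', 'dma_addr', 'len' and
 * 'lkey' are assumed to come from earlier QP/MR setup:
 *
 *	struct ib_sge sge = { .addr = dma_addr, .length = len, .lkey = lkey };
 *	struct ib_send_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1,
 *				 .opcode = IB_WR_SEND,
 *				 .send_flags = IB_SEND_SIGNALED };
 *	const struct ib_send_wr *bad_wr;
 *	int ret = ib_post_send(qp, &wr, &bad_wr);
 */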
727 int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
728                   const struct ib_send_wr **bad_wr)
729 {
730         struct siw_qp *qp = to_siw_qp(base_qp);
731         struct siw_wqe *wqe = tx_wqe(qp);
732
733         unsigned long flags;
734         int rv = 0;
735
736         if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) {
737                 siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
738                 *bad_wr = wr;
739                 return -EINVAL;
740         }
741
742         /*
743          * Try to acquire QP state lock. Must be non-blocking
744          * to accommodate kernel clients' needs.
745          */
746         if (!down_read_trylock(&qp->state_lock)) {
747                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
748                         /*
749                          * ERROR state is final, so we can be sure
750                          * this state will not change as long as the QP
751                          * exists.
752                          *
753                          * This handles an ib_drain_sq() call with
754                          * a concurrent request to set the QP state
755                          * to ERROR.
756                          */
757                         rv = siw_sq_flush_wr(qp, wr, bad_wr);
758                 } else {
759                         siw_dbg_qp(qp, "QP locked, state %d\n",
760                                    qp->attrs.state);
761                         *bad_wr = wr;
762                         rv = -ENOTCONN;
763                 }
764                 return rv;
765         }
766         if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
767                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
768                         /*
769                          * Immediately flush this WR to CQ, if QP
770                          * is in ERROR state. SQ is guaranteed to
771                          * be empty, so WR completes in-order.
772                          *
773                          * Typically triggered by ib_drain_sq().
774                          */
775                         rv = siw_sq_flush_wr(qp, wr, bad_wr);
776                 } else {
777                         siw_dbg_qp(qp, "QP out of state %d\n",
778                                    qp->attrs.state);
779                         *bad_wr = wr;
780                         rv = -ENOTCONN;
781                 }
782                 up_read(&qp->state_lock);
783                 return rv;
784         }
785         spin_lock_irqsave(&qp->sq_lock, flags);
786
787         while (wr) {
788                 u32 idx = qp->sq_put % qp->attrs.sq_size;
789                 struct siw_sqe *sqe = &qp->sendq[idx];
790
791                 if (sqe->flags) {
792                         siw_dbg_qp(qp, "sq full\n");
793                         rv = -ENOMEM;
794                         break;
795                 }
796                 if (wr->num_sge > qp->attrs.sq_max_sges) {
797                         siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
798                         rv = -EINVAL;
799                         break;
800                 }
801                 sqe->id = wr->wr_id;
802
803                 if ((wr->send_flags & IB_SEND_SIGNALED) ||
804                     (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
805                         sqe->flags |= SIW_WQE_SIGNALLED;
806
807                 if (wr->send_flags & IB_SEND_FENCE)
808                         sqe->flags |= SIW_WQE_READ_FENCE;
809
810                 switch (wr->opcode) {
811                 case IB_WR_SEND:
812                 case IB_WR_SEND_WITH_INV:
813                         if (wr->send_flags & IB_SEND_SOLICITED)
814                                 sqe->flags |= SIW_WQE_SOLICITED;
815
816                         if (!(wr->send_flags & IB_SEND_INLINE)) {
817                                 siw_copy_sgl(wr->sg_list, sqe->sge,
818                                              wr->num_sge);
819                                 sqe->num_sge = wr->num_sge;
820                         } else {
821                                 rv = siw_copy_inline_sgl(wr, sqe);
822                                 if (rv <= 0) {
823                                         rv = -EINVAL;
824                                         break;
825                                 }
826                                 sqe->flags |= SIW_WQE_INLINE;
827                                 sqe->num_sge = 1;
828                         }
829                         if (wr->opcode == IB_WR_SEND)
830                                 sqe->opcode = SIW_OP_SEND;
831                         else {
832                                 sqe->opcode = SIW_OP_SEND_REMOTE_INV;
833                                 sqe->rkey = wr->ex.invalidate_rkey;
834                         }
835                         break;
836
837                 case IB_WR_RDMA_READ_WITH_INV:
838                 case IB_WR_RDMA_READ:
839                         /*
840                          * iWarp restricts the RREAD sink to an SGL containing
841                          * 1 SGE only. We could relax this to an SGL with multiple
842                          * elements referring to the SAME ltag, or even send
843                          * a private per-rreq tag referring to a checked
844                          * local sgl with MULTIPLE ltag's.
845                          */
846                         if (unlikely(wr->num_sge != 1)) {
847                                 rv = -EINVAL;
848                                 break;
849                         }
850                         siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
851                         /*
852                          * NOTE: zero length RREAD is allowed!
853                          */
854                         sqe->raddr = rdma_wr(wr)->remote_addr;
855                         sqe->rkey = rdma_wr(wr)->rkey;
856                         sqe->num_sge = 1;
857
858                         if (wr->opcode == IB_WR_RDMA_READ)
859                                 sqe->opcode = SIW_OP_READ;
860                         else
861                                 sqe->opcode = SIW_OP_READ_LOCAL_INV;
862                         break;
863
864                 case IB_WR_RDMA_WRITE:
865                         if (!(wr->send_flags & IB_SEND_INLINE)) {
866                                 siw_copy_sgl(wr->sg_list, &sqe->sge[0],
867                                              wr->num_sge);
868                                 sqe->num_sge = wr->num_sge;
869                         } else {
870                                 rv = siw_copy_inline_sgl(wr, sqe);
871                                 if (unlikely(rv < 0)) {
872                                         rv = -EINVAL;
873                                         break;
874                                 }
875                                 sqe->flags |= SIW_WQE_INLINE;
876                                 sqe->num_sge = 1;
877                         }
878                         sqe->raddr = rdma_wr(wr)->remote_addr;
879                         sqe->rkey = rdma_wr(wr)->rkey;
880                         sqe->opcode = SIW_OP_WRITE;
881                         break;
882
883                 case IB_WR_REG_MR:
884                         sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
885                         sqe->rkey = reg_wr(wr)->key;
886                         sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
887                         sqe->opcode = SIW_OP_REG_MR;
888                         break;
889
890                 case IB_WR_LOCAL_INV:
891                         sqe->rkey = wr->ex.invalidate_rkey;
892                         sqe->opcode = SIW_OP_INVAL_STAG;
893                         break;
894
895                 default:
896                         siw_dbg_qp(qp, "ib wr type %d unsupported\n",
897                                    wr->opcode);
898                         rv = -EINVAL;
899                         break;
900                 }
901                 siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
902                            sqe->opcode, sqe->flags,
903                            (void *)(uintptr_t)sqe->id);
904
905                 if (unlikely(rv < 0))
906                         break;
907
908                 /* make SQE only valid after completely written */
909                 smp_wmb();
910                 sqe->flags |= SIW_WQE_VALID;
911
912                 qp->sq_put++;
913                 wr = wr->next;
914         }
915
916         /*
917          * Send directly if SQ processing is not in progress.
918          * Any immediate errors (rv < 0) do not affect the involved
919          * RI resources (Verbs, 8.3.1) and thus do not prevent SQ
920          * processing if new work is already pending. But rv must be
921          * passed back to the caller.
922          */
923         if (wqe->wr_status != SIW_WR_IDLE) {
924                 spin_unlock_irqrestore(&qp->sq_lock, flags);
925                 goto skip_direct_sending;
926         }
927         rv = siw_activate_tx(qp);
928         spin_unlock_irqrestore(&qp->sq_lock, flags);
929
930         if (rv <= 0)
931                 goto skip_direct_sending;
932
933         if (rdma_is_kernel_res(&qp->base_qp.res)) {
934                 rv = siw_sq_start(qp);
935         } else {
936                 qp->tx_ctx.in_syscall = 1;
937
938                 if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
939                         siw_qp_cm_drop(qp, 0);
940
941                 qp->tx_ctx.in_syscall = 0;
942         }
943 skip_direct_sending:
944
945         up_read(&qp->state_lock);
946
947         if (rv >= 0)
948                 return 0;
949         /*
950          * Immediate error
951          */
952         siw_dbg_qp(qp, "error %d\n", rv);
953
954         *bad_wr = wr;
955         return rv;
956 }
957
958 /*
959  * siw_post_receive()
960  *
961  * Post a list of R-WR's to a RQ.
962  *
963  * @base_qp:    Base QP contained in siw QP
964  * @wr:         Null terminated list of user WR's
965  * @bad_wr:     Points to failing WR in case of synchronous failure.
966  */
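/*
 * Editor's note: the matching kernel-side receive post, under the same
 * assumptions as the send sketch above:
 *
 *	struct ib_sge sge = { .addr = dma_addr, .length = len, .lkey = lkey };
 *	struct ib_recv_wr wr = { .wr_id = 2, .sg_list = &sge, .num_sge = 1 };
 *	const struct ib_recv_wr *bad_wr;
 *	int ret = ib_post_recv(qp, &wr, &bad_wr);
 */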
967 int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
968                      const struct ib_recv_wr **bad_wr)
969 {
970         struct siw_qp *qp = to_siw_qp(base_qp);
971         unsigned long flags;
972         int rv = 0;
973
974         if (qp->srq || qp->attrs.rq_size == 0) {
975                 *bad_wr = wr;
976                 return -EINVAL;
977         }
978         if (!rdma_is_kernel_res(&qp->base_qp.res)) {
979                 siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
980                 *bad_wr = wr;
981                 return -EINVAL;
982         }
983
984         /*
985          * Try to acquire QP state lock. Must be non-blocking
986          * to accommodate kernel clients' needs.
987          */
988         if (!down_read_trylock(&qp->state_lock)) {
989                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
990                         /*
991                          * ERROR state is final, so we can be sure
992                          * this state will not change as long as the QP
993                          * exists.
994                          *
995                          * This handles an ib_drain_rq() call with
996                          * a concurrent request to set the QP state
997                          * to ERROR.
998                          */
999                         rv = siw_rq_flush_wr(qp, wr, bad_wr);
1000                 } else {
1001                         siw_dbg_qp(qp, "QP locked, state %d\n",
1002                                    qp->attrs.state);
1003                         *bad_wr = wr;
1004                         rv = -ENOTCONN;
1005                 }
1006                 return rv;
1007         }
1008         if (qp->attrs.state > SIW_QP_STATE_RTS) {
1009                 if (qp->attrs.state == SIW_QP_STATE_ERROR) {
1010                         /*
1011                          * Immediately flush this WR to CQ, if QP
1012                          * is in ERROR state. RQ is guaranteed to
1013                          * be empty, so WR completes in-order.
1014                          *
1015                          * Typically triggered by ib_drain_rq().
1016                          */
1017                         rv = siw_rq_flush_wr(qp, wr, bad_wr);
1018                 } else {
1019                         siw_dbg_qp(qp, "QP out of state %d\n",
1020                                    qp->attrs.state);
1021                         *bad_wr = wr;
1022                         rv = -ENOTCONN;
1023                 }
1024                 up_read(&qp->state_lock);
1025                 return rv;
1026         }
1027         /*
1028          * Serialize potentially multiple producers.
1029          * Not needed for single threaded consumer side.
1030          */
1031         spin_lock_irqsave(&qp->rq_lock, flags);
1032
1033         while (wr) {
1034                 u32 idx = qp->rq_put % qp->attrs.rq_size;
1035                 struct siw_rqe *rqe = &qp->recvq[idx];
1036
1037                 if (rqe->flags) {
1038                         siw_dbg_qp(qp, "RQ full\n");
1039                         rv = -ENOMEM;
1040                         break;
1041                 }
1042                 if (wr->num_sge > qp->attrs.rq_max_sges) {
1043                         siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
1044                         rv = -EINVAL;
1045                         break;
1046                 }
1047                 rqe->id = wr->wr_id;
1048                 rqe->num_sge = wr->num_sge;
1049                 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1050
1051                 /* make sure RQE is completely written before valid */
1052                 smp_wmb();
1053
1054                 rqe->flags = SIW_WQE_VALID;
1055
1056                 qp->rq_put++;
1057                 wr = wr->next;
1058         }
1059         spin_unlock_irqrestore(&qp->rq_lock, flags);
1060
1061         up_read(&qp->state_lock);
1062
1063         if (rv < 0) {
1064                 siw_dbg_qp(qp, "error %d\n", rv);
1065                 *bad_wr = wr;
1066         }
1067         return rv > 0 ? 0 : rv;
1068 }
1069
1070 int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
1071 {
1072         struct siw_cq *cq = to_siw_cq(base_cq);
1073         struct siw_device *sdev = to_siw_dev(base_cq->device);
1074         struct siw_ucontext *ctx =
1075                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1076                                           base_ucontext);
1077
1078         siw_dbg_cq(cq, "free CQ resources\n");
1079
1080         siw_cq_flush(cq);
1081
1082         if (ctx)
1083                 rdma_user_mmap_entry_remove(cq->cq_entry);
1084
1085         atomic_dec(&sdev->num_cq);
1086
1087         vfree(cq->queue);
1088         return 0;
1089 }
1090
1091 /*
1092  * siw_create_cq()
1093  *
1094  * Populate CQ of requested size
1095  *
1096  * @base_cq: CQ as allocated by RDMA midlayer
1097  * @attr: Initial CQ attributes
1098  * @udata: relates to user context
1099  */
1100
1101 int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
1102                   struct ib_udata *udata)
1103 {
1104         struct siw_device *sdev = to_siw_dev(base_cq->device);
1105         struct siw_cq *cq = to_siw_cq(base_cq);
1106         int rv, size = attr->cqe;
1107
1108         if (attr->flags)
1109                 return -EOPNOTSUPP;
1110
1111         if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
1112                 siw_dbg(base_cq->device, "too many CQ's\n");
1113                 rv = -ENOMEM;
1114                 goto err_out;
1115         }
1116         if (size < 1 || size > sdev->attrs.max_cqe) {
1117                 siw_dbg(base_cq->device, "CQ size error: %d\n", size);
1118                 rv = -EINVAL;
1119                 goto err_out;
1120         }
1121         size = roundup_pow_of_two(size);
1122         cq->base_cq.cqe = size;
1123         cq->num_cqe = size;
1124
1125         if (udata)
1126                 cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
1127                                          sizeof(struct siw_cq_ctrl));
1128         else
1129                 cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
1130                                     sizeof(struct siw_cq_ctrl));
1131
1132         if (cq->queue == NULL) {
1133                 rv = -ENOMEM;
1134                 goto err_out;
1135         }
1136         get_random_bytes(&cq->id, 4);
1137         siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
1138
1139         spin_lock_init(&cq->lock);
1140
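        /*
         * Editor's note: the notify control block sits directly behind the
         * CQE array in the same allocation (hence the added
         * sizeof(struct siw_cq_ctrl) above); for user CQs it gets mmapped
         * to user space together with the CQEs.
         */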
1141         cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
1142
1143         if (udata) {
1144                 struct siw_uresp_create_cq uresp = {};
1145                 struct siw_ucontext *ctx =
1146                         rdma_udata_to_drv_context(udata, struct siw_ucontext,
1147                                                   base_ucontext);
1148                 size_t length = size * sizeof(struct siw_cqe) +
1149                         sizeof(struct siw_cq_ctrl);
1150
1151                 cq->cq_entry =
1152                         siw_mmap_entry_insert(ctx, cq->queue,
1153                                               length, &uresp.cq_key);
1154                 if (!cq->cq_entry) {
1155                         rv = -ENOMEM;
1156                         goto err_out;
1157                 }
1158
1159                 uresp.cq_id = cq->id;
1160                 uresp.num_cqe = size;
1161
1162                 if (udata->outlen < sizeof(uresp)) {
1163                         rv = -EINVAL;
1164                         goto err_out;
1165                 }
1166                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1167                 if (rv)
1168                         goto err_out;
1169         }
1170         return 0;
1171
1172 err_out:
1173         siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);
1174
1175         if (cq && cq->queue) {
1176                 struct siw_ucontext *ctx =
1177                         rdma_udata_to_drv_context(udata, struct siw_ucontext,
1178                                                   base_ucontext);
1179                 if (ctx)
1180                         rdma_user_mmap_entry_remove(cq->cq_entry);
1181                 vfree(cq->queue);
1182         }
1183         atomic_dec(&sdev->num_cq);
1184
1185         return rv;
1186 }
1187
1188 /*
1189  * siw_poll_cq()
1190  *
1191  * Reap CQ entries if available and copy work completion status into
1192  * array of WC's provided by caller. Returns number of reaped CQE's.
1193  *
1194  * @base_cq:    Base CQ contained in siw CQ.
1195  * @num_cqe:    Maximum number of CQE's to reap.
1196  * @wc:         Array of work completions to be filled by siw.
1197  */
1198 int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
1199 {
1200         struct siw_cq *cq = to_siw_cq(base_cq);
1201         int i;
1202
1203         for (i = 0; i < num_cqe; i++) {
1204                 if (!siw_reap_cqe(cq, wc))
1205                         break;
1206                 wc++;
1207         }
1208         return i;
1209 }
1210
1211 /*
1212  * siw_req_notify_cq()
1213  *
1214  * Request notification for new CQE's added to that CQ.
1215  * Defined flags:
1216  * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
1217  *   event if a WQE with notification flag set enters the CQ
1218  * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
1219  *   event if a WQE enters the CQ.
1220  * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
1221  *   number of not reaped CQE's regardless of its notification
1222  *   type and current or new CQ notification settings.
1223  *
1224  * @base_cq:    Base CQ contained in siw CQ.
1225  * @flags:      Requested notification flags.
1226  */
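/*
 * Editor's note: a typical kernel consumer re-arms the CQ and drains any
 * completions reported as missed (sketch; 'cq' and 'wc' assumed from the
 * caller's CQ setup and polling loop):
 *
 *	if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
 *			     IB_CQ_REPORT_MISSED_EVENTS) > 0)
 *		ib_poll_cq(cq, 1, &wc);
 */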
1227 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
1228 {
1229         struct siw_cq *cq = to_siw_cq(base_cq);
1230
1231         siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
1232
1233         if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
1234                 /*
1235                  * Enable CQ event for next solicited completion
1236                  * and make it visible to all associated producers.
1237                  */
1238                 smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
1239         else
1240                 /*
1241                  * Enable CQ event for any signalled completion
1242                  * and make it visible to all associated producers.
1243                  */
1244                 smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
1245
1246         if (flags & IB_CQ_REPORT_MISSED_EVENTS)
1247                 return cq->cq_put - cq->cq_get;
1248
1249         return 0;
1250 }
1251
1252 /*
1253  * siw_dereg_mr()
1254  *
1255  * Release Memory Region.
1256  *
1257  * @base_mr: Base MR contained in siw MR.
1258  * @udata: points to user context, unused.
1259  */
1260 int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
1261 {
1262         struct siw_mr *mr = to_siw_mr(base_mr);
1263         struct siw_device *sdev = to_siw_dev(base_mr->device);
1264
1265         siw_dbg_mem(mr->mem, "deregister MR\n");
1266
1267         atomic_dec(&sdev->num_mr);
1268
1269         siw_mr_drop_mem(mr);
1270         kfree_rcu(mr, rcu);
1271
1272         return 0;
1273 }
1274
1275 /*
1276  * siw_reg_user_mr()
1277  *
1278  * Register Memory Region.
1279  *
1280  * @pd:         Protection Domain
1281  * @start:      starting address of MR (virtual address)
1282  * @len:        len of MR
1283  * @rnic_va:    not used by siw
1284  * @rights:     MR access rights
1285  * @udata:      user buffer to communicate STag and Key.
1286  */
1287 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
1288                               u64 rnic_va, int rights, struct ib_udata *udata)
1289 {
1290         struct siw_mr *mr = NULL;
1291         struct siw_umem *umem = NULL;
1292         struct siw_ureq_reg_mr ureq;
1293         struct siw_device *sdev = to_siw_dev(pd->device);
1294
1295         unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
1296         int rv;
1297
1298         siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
1299                    (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
1300                    (unsigned long long)len);
1301
1302         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1303                 siw_dbg_pd(pd, "too many mr's\n");
1304                 rv = -ENOMEM;
1305                 goto err_out;
1306         }
1307         if (!len) {
1308                 rv = -EINVAL;
1309                 goto err_out;
1310         }
1311         if (mem_limit != RLIM_INFINITY) {
1312                 unsigned long num_pages =
1313                         (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
1314                 mem_limit >>= PAGE_SHIFT;
1315
1316                 if (num_pages > mem_limit - current->mm->locked_vm) {
1317                         siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
1318                                    num_pages, mem_limit,
1319                                    current->mm->locked_vm);
1320                         rv = -ENOMEM;
1321                         goto err_out;
1322                 }
1323         }
1324         umem = siw_umem_get(start, len, ib_access_writable(rights));
1325         if (IS_ERR(umem)) {
1326                 rv = PTR_ERR(umem);
1327                 siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
1328                 umem = NULL;
1329                 goto err_out;
1330         }
1331         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1332         if (!mr) {
1333                 rv = -ENOMEM;
1334                 goto err_out;
1335         }
1336         rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
1337         if (rv)
1338                 goto err_out;
1339
1340         if (udata) {
1341                 struct siw_uresp_reg_mr uresp = {};
1342                 struct siw_mem *mem = mr->mem;
1343
1344                 if (udata->inlen < sizeof(ureq)) {
1345                         rv = -EINVAL;
1346                         goto err_out;
1347                 }
1348                 rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
1349                 if (rv)
1350                         goto err_out;
1351
1352                 mr->base_mr.lkey |= ureq.stag_key;
1353                 mr->base_mr.rkey |= ureq.stag_key;
1354                 mem->stag |= ureq.stag_key;
1355                 uresp.stag = mem->stag;
1356
1357                 if (udata->outlen < sizeof(uresp)) {
1358                         rv = -EINVAL;
1359                         goto err_out;
1360                 }
1361                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1362                 if (rv)
1363                         goto err_out;
1364         }
1365         mr->mem->stag_valid = 1;
1366
1367         return &mr->base_mr;
1368
1369 err_out:
1370         atomic_dec(&sdev->num_mr);
1371         if (mr) {
1372                 if (mr->mem)
1373                         siw_mr_drop_mem(mr);
1374                 kfree_rcu(mr, rcu);
1375         } else {
1376                 if (umem)
1377                         siw_umem_release(umem, false);
1378         }
1379         return ERR_PTR(rv);
1380 }
1381
1382 struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
1383                            u32 max_sge)
1384 {
1385         struct siw_device *sdev = to_siw_dev(pd->device);
1386         struct siw_mr *mr = NULL;
1387         struct siw_pbl *pbl = NULL;
1388         int rv;
1389
1390         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1391                 siw_dbg_pd(pd, "too many mr's\n");
1392                 rv = -ENOMEM;
1393                 goto err_out;
1394         }
1395         if (mr_type != IB_MR_TYPE_MEM_REG) {
1396                 siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
1397                 rv = -EOPNOTSUPP;
1398                 goto err_out;
1399         }
1400         if (max_sge > SIW_MAX_SGE_PBL) {
1401                 siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
1402                 rv = -ENOMEM;
1403                 goto err_out;
1404         }
1405         pbl = siw_pbl_alloc(max_sge);
1406         if (IS_ERR(pbl)) {
1407                 rv = PTR_ERR(pbl);
1408                 siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
1409                 pbl = NULL;
1410                 goto err_out;
1411         }
1412         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1413         if (!mr) {
1414                 rv = -ENOMEM;
1415                 goto err_out;
1416         }
1417         rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
1418         if (rv)
1419                 goto err_out;
1420
1421         mr->mem->is_pbl = 1;
1422
1423         siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1424
1425         return &mr->base_mr;
1426
1427 err_out:
1428         atomic_dec(&sdev->num_mr);
1429
1430         if (!mr) {
1431                 kfree(pbl);
1432         } else {
1433                 if (mr->mem)
1434                         siw_mr_drop_mem(mr);
1435                 kfree_rcu(mr, rcu);
1436         }
1437         siw_dbg_pd(pd, "failed: %d\n", rv);
1438
1439         return ERR_PTR(rv);
1440 }
1441
1442 /* Just used to count number of pages being mapped */
1443 static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
1444 {
1445         return 0;
1446 }
1447
1448 int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
1449                   unsigned int *sg_off)
1450 {
1451         struct scatterlist *slp;
1452         struct siw_mr *mr = to_siw_mr(base_mr);
1453         struct siw_mem *mem = mr->mem;
1454         struct siw_pbl *pbl = mem->pbl;
1455         struct siw_pble *pble;
1456         unsigned long pbl_size;
1457         int i, rv;
1458
1459         if (!pbl) {
1460                 siw_dbg_mem(mem, "no PBL allocated\n");
1461                 return -EINVAL;
1462         }
1463         pble = pbl->pbe;
1464
1465         if (pbl->max_buf < num_sle) {
1466                 siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
1467                             num_sle, mem->pbl->max_buf);
1468                 return -ENOMEM;
1469         }
1470         for_each_sg(sl, slp, num_sle, i) {
1471                 if (sg_dma_len(slp) == 0) {
1472                         siw_dbg_mem(mem, "empty SGE\n");
1473                         return -EINVAL;
1474                 }
1475                 if (i == 0) {
1476                         pble->addr = sg_dma_address(slp);
1477                         pble->size = sg_dma_len(slp);
1478                         pble->pbl_off = 0;
1479                         pbl_size = pble->size;
1480                         pbl->num_buf = 1;
1481                 } else {
1482                         /* Merge PBL entries if adjacent */
1483                         if (pble->addr + pble->size == sg_dma_address(slp)) {
1484                                 pble->size += sg_dma_len(slp);
1485                         } else {
1486                                 pble++;
1487                                 pbl->num_buf++;
1488                                 pble->addr = sg_dma_address(slp);
1489                                 pble->size = sg_dma_len(slp);
1490                                 pble->pbl_off = pbl_size;
1491                         }
1492                         pbl_size += sg_dma_len(slp);
1493                 }
1494                 siw_dbg_mem(mem,
1495                         "sge[%d], size %u, addr 0x%p, total %lu\n",
1496                         i, pble->size, (void *)(uintptr_t)pble->addr,
1497                         pbl_size);
1498         }
1499         rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
1500         if (rv > 0) {
1501                 mem->len = base_mr->length;
1502                 mem->va = base_mr->iova;
1503                 siw_dbg_mem(mem,
1504                         "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
1505                         mem->len, (void *)(uintptr_t)mem->va, num_sle,
1506                         pbl->num_buf);
1507         }
1508         return rv;
1509 }
1510
1511 /*
1512  * siw_get_dma_mr()
1513  *
1514  * Create an (empty) DMA memory region, where no umem is attached.
1515  */
1516 struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
1517 {
1518         struct siw_device *sdev = to_siw_dev(pd->device);
1519         struct siw_mr *mr = NULL;
1520         int rv;
1521
1522         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
1523                 siw_dbg_pd(pd, "too many mr's\n");
1524                 rv = -ENOMEM;
1525                 goto err_out;
1526         }
1527         mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1528         if (!mr) {
1529                 rv = -ENOMEM;
1530                 goto err_out;
1531         }
1532         rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
1533         if (rv)
1534                 goto err_out;
1535
1536         mr->mem->stag_valid = 1;
1537
1538         siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
1539
1540         return &mr->base_mr;
1541
1542 err_out:
1543         if (rv)
1544                 kfree(mr);
1545
1546         atomic_dec(&sdev->num_mr);
1547
1548         return ERR_PTR(rv);
1549 }
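
/*
 * Usage sketch (hypothetical, not part of this driver): kernel ULPs do not
 * call siw_get_dma_mr() directly. The RDMA core invokes it through the
 * device's get_dma_mr op when a PD is allocated with the
 * IB_PD_UNSAFE_GLOBAL_RKEY flag, roughly like this:
 */
static inline struct ib_pd *siw_example_alloc_dma_pd(struct ib_device *ibdev)
{
	/* Triggers ->get_dma_mr(), i.e. siw_get_dma_mr(), for the new PD */
	struct ib_pd *pd = ib_alloc_pd(ibdev, IB_PD_UNSAFE_GLOBAL_RKEY);

	if (!IS_ERR(pd))
		pr_info("siw example: DMA MR rkey 0x%x\n",
			pd->unsafe_global_rkey);
	return pd;
}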
1550
1551 /*
1552  * siw_create_srq()
1553  *
1554  * Create a Shared Receive Queue with attributes @init_attrs
1555  * within the protection domain given by @pd.
1556  *
1557  * @base_srq:   Base SRQ contained in siw SRQ.
1558  * @init_attrs: SRQ init attributes.
1559  * @udata:      points to user context
1560  */
1561 int siw_create_srq(struct ib_srq *base_srq,
1562                    struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
1563 {
1564         struct siw_srq *srq = to_siw_srq(base_srq);
1565         struct ib_srq_attr *attrs = &init_attrs->attr;
1566         struct siw_device *sdev = to_siw_dev(base_srq->device);
1567         struct siw_ucontext *ctx =
1568                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1569                                           base_ucontext);
1570         int rv;
1571
1572         if (init_attrs->srq_type != IB_SRQT_BASIC)
1573                 return -EOPNOTSUPP;
1574
1575         if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
1576                 siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
1577                 rv = -ENOMEM;
1578                 goto err_out;
1579         }
1580         if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
1581             attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
1582                 rv = -EINVAL;
1583                 goto err_out;
1584         }
1585         srq->max_sge = attrs->max_sge;
1586         srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
1587         srq->limit = attrs->srq_limit;
1588         if (srq->limit)
1589                 srq->armed = true;
1590
1591         srq->is_kernel_res = !udata;
1592
1593         if (udata)
1594                 srq->recvq =
1595                         vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
1596         else
1597                 srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
1598
1599         if (srq->recvq == NULL) {
1600                 rv = -ENOMEM;
1601                 goto err_out;
1602         }
1603         if (udata) {
1604                 struct siw_uresp_create_srq uresp = {};
1605                 size_t length = srq->num_rqe * sizeof(struct siw_rqe);
1606
1607                 srq->srq_entry =
1608                         siw_mmap_entry_insert(ctx, srq->recvq,
1609                                               length, &uresp.srq_key);
1610                 if (!srq->srq_entry) {
1611                         rv = -ENOMEM;
1612                         goto err_out;
1613                 }
1614
1615                 uresp.num_rqe = srq->num_rqe;
1616
1617                 if (udata->outlen < sizeof(uresp)) {
1618                         rv = -EINVAL;
1619                         goto err_out;
1620                 }
1621                 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
1622                 if (rv)
1623                         goto err_out;
1624         }
1625         spin_lock_init(&srq->lock);
1626
1627         siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
1628
1629         return 0;
1630
1631 err_out:
1632         if (srq->recvq) {
1633                 if (ctx)
1634                         rdma_user_mmap_entry_remove(srq->srq_entry);
1635                 vfree(srq->recvq);
1636         }
1637         atomic_dec(&sdev->num_srq);
1638
1639         return rv;
1640 }
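
/*
 * Usage sketch (hypothetical): a kernel consumer reaches siw_create_srq()
 * via ib_create_srq(). The attribute checks above bound max_wr and max_sge;
 * a non-zero srq_limit arms the IB_EVENT_SRQ_LIMIT_REACHED notification.
 */
static inline struct ib_srq *siw_example_create_srq(struct ib_pd *pd)
{
	struct ib_srq_init_attr init_attr = {
		.srq_type = IB_SRQT_BASIC,
		.attr = {
			.max_wr = 128,	/* rounded up to a power of two */
			.max_sge = 2,
			.srq_limit = 16,
		},
	};

	return ib_create_srq(pd, &init_attr);
}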
1641
1642 /*
1643  * siw_modify_srq()
1644  *
1645  * Modify SRQ. The caller may resize the SRQ and/or set/reset the
1646  * notification limit and (re)arm the IB_EVENT_SRQ_LIMIT_REACHED event.
1647  *
1648  * NOTE: It is unclear whether the RDMA core allows changing the MAX_SGE
1649  * parameter; siw_modify_srq() does not check attrs->max_sge.
1650  */
1651 int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
1652                    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
1653 {
1654         struct siw_srq *srq = to_siw_srq(base_srq);
1655         unsigned long flags;
1656         int rv = 0;
1657
1658         spin_lock_irqsave(&srq->lock, flags);
1659
1660         if (attr_mask & IB_SRQ_MAX_WR) {
1661                 /* resize request not yet supported */
1662                 rv = -EOPNOTSUPP;
1663                 goto out;
1664         }
1665         if (attr_mask & IB_SRQ_LIMIT) {
1666                 if (attrs->srq_limit) {
1667                         if (unlikely(attrs->srq_limit > srq->num_rqe)) {
1668                                 rv = -EINVAL;
1669                                 goto out;
1670                         }
1671                         srq->armed = true;
1672                 } else {
1673                         srq->armed = false;
1674                 }
1675                 srq->limit = attrs->srq_limit;
1676         }
1677 out:
1678         spin_unlock_irqrestore(&srq->lock, flags);
1679
1680         return rv;
1681 }
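
/*
 * Usage sketch (hypothetical): re-arming the limit notification after an
 * IB_EVENT_SRQ_LIMIT_REACHED event fired. A resize request (IB_SRQ_MAX_WR)
 * would be rejected with -EOPNOTSUPP by the implementation above.
 */
static inline int siw_example_rearm_srq_limit(struct ib_srq *srq, u32 limit)
{
	struct ib_srq_attr attr = { .srq_limit = limit };

	return ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);
}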
1682
1683 /*
1684  * siw_query_srq()
1685  *
1686  * Query SRQ attributes.
1687  */
1688 int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
1689 {
1690         struct siw_srq *srq = to_siw_srq(base_srq);
1691         unsigned long flags;
1692
1693         spin_lock_irqsave(&srq->lock, flags);
1694
1695         attrs->max_wr = srq->num_rqe;
1696         attrs->max_sge = srq->max_sge;
1697         attrs->srq_limit = srq->limit;
1698
1699         spin_unlock_irqrestore(&srq->lock, flags);
1700
1701         return 0;
1702 }
1703
1704 /*
1705  * siw_destroy_srq()
1706  *
1707  * Destroy SRQ.
1708  * It is assumed that the SRQ is no longer referenced by any
1709  * QP - the code trusts the RDMA core environment to keep track
1710  * of QP references.
1711  */
1712 int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
1713 {
1714         struct siw_srq *srq = to_siw_srq(base_srq);
1715         struct siw_device *sdev = to_siw_dev(base_srq->device);
1716         struct siw_ucontext *ctx =
1717                 rdma_udata_to_drv_context(udata, struct siw_ucontext,
1718                                           base_ucontext);
1719
1720         if (ctx)
1721                 rdma_user_mmap_entry_remove(srq->srq_entry);
1722         vfree(srq->recvq);
1723         atomic_dec(&sdev->num_srq);
1724         return 0;
1725 }
1726
1727 /*
1728  * siw_post_srq_recv()
1729  *
1730  * Post a list of receive queue elements to the SRQ.
1731  * NOTE: The function does not check or lock the SRQ state
1732  *       during the post operation; it simply trusts the
1733  *       RDMA core environment.
1734  *
1735  * @base_srq:   Base SRQ contained in siw SRQ
1736  * @wr:         List of receive WRs
1737  * @bad_wr:     Updated to failing WR if posting fails.
1738  */
1739 int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
1740                       const struct ib_recv_wr **bad_wr)
1741 {
1742         struct siw_srq *srq = to_siw_srq(base_srq);
1743         unsigned long flags;
1744         int rv = 0;
1745
1746         if (unlikely(!srq->is_kernel_res)) {
1747                 siw_dbg_pd(base_srq->pd,
1748                            "[SRQ]: no kernel post_recv for mapped srq\n");
1749                 rv = -EINVAL;
1750                 goto out;
1751         }
1752         /*
1753          * Serialize potentially multiple producers.
1754          * The lock is also needed to serialize against
1755          * potentially multiple consumers.
1756          */
1757         spin_lock_irqsave(&srq->lock, flags);
1758
1759         while (wr) {
1760                 u32 idx = srq->rq_put % srq->num_rqe;
1761                 struct siw_rqe *rqe = &srq->recvq[idx];
1762
1763                 if (rqe->flags) {
1764                         siw_dbg_pd(base_srq->pd, "SRQ full\n");
1765                         rv = -ENOMEM;
1766                         break;
1767                 }
1768                 if (unlikely(wr->num_sge > srq->max_sge)) {
1769                         siw_dbg_pd(base_srq->pd,
1770                                    "[SRQ]: too many sge's: %d\n", wr->num_sge);
1771                         rv = -EINVAL;
1772                         break;
1773                 }
1774                 rqe->id = wr->wr_id;
1775                 rqe->num_sge = wr->num_sge;
1776                 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
1777
1778                 /* Make sure S-RQE is completely written before valid */
1779                 smp_wmb();
1780
1781                 rqe->flags = SIW_WQE_VALID;
1782
1783                 srq->rq_put++;
1784                 wr = wr->next;
1785         }
1786         spin_unlock_irqrestore(&srq->lock, flags);
1787 out:
1788         if (unlikely(rv < 0)) {
1789                 siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
1790                 *bad_wr = wr;
1791         }
1792         return rv;
1793 }
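
/*
 * Usage sketch (hypothetical): a kernel ULP posting a single receive buffer
 * to the SRQ. The DMA address and lkey are assumed to come from a prior
 * DMA mapping and MR registration (or the PD's local DMA lkey).
 */
static inline int siw_example_post_one_srq_recv(struct ib_srq *srq, u64 wr_id,
						u64 dma_addr, u32 length,
						u32 lkey)
{
	struct ib_sge sge = {
		.addr = dma_addr,
		.length = length,
		.lkey = lkey,
	};
	struct ib_recv_wr wr = {
		.wr_id = wr_id,
		.sg_list = &sge,
		.num_sge = 1,
	};
	const struct ib_recv_wr *bad_wr;

	return ib_post_srq_recv(srq, &wr, &bad_wr);
}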
1794
1795 void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
1796 {
1797         struct ib_event event;
1798         struct ib_qp *base_qp = &qp->base_qp;
1799
1800         /*
1801          * Do not report asynchronous errors on a QP which is being
1802          * destroyed via the verbs interface (siw_destroy_qp()).
1803          */
1804         if (qp->attrs.flags & SIW_QP_IN_DESTROY)
1805                 return;
1806
1807         event.event = etype;
1808         event.device = base_qp->device;
1809         event.element.qp = base_qp;
1810
1811         if (base_qp->event_handler) {
1812                 siw_dbg_qp(qp, "reporting event %d\n", etype);
1813                 base_qp->event_handler(&event, base_qp->qp_context);
1814         }
1815 }
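
/*
 * Usage sketch (hypothetical): base_qp->event_handler used above is the
 * callback a kernel consumer supplies in ib_qp_init_attr at QP creation
 * time, e.g.:
 *
 *	struct ib_qp_init_attr init_attr = {
 *		.event_handler = siw_example_qp_event_cb,
 *		.qp_context = my_ctx,
 *		...
 *	};
 */
static void siw_example_qp_event_cb(struct ib_event *event, void *qp_context)
{
	pr_info("siw example: QP event %d, context %p\n",
		event->event, qp_context);
}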
1816
1817 void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
1818 {
1819         struct ib_event event;
1820         struct ib_cq *base_cq = &cq->base_cq;
1821
1822         event.event = etype;
1823         event.device = base_cq->device;
1824         event.element.cq = base_cq;
1825
1826         if (base_cq->event_handler) {
1827                 siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
1828                 base_cq->event_handler(&event, base_cq->cq_context);
1829         }
1830 }
1831
1832 void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
1833 {
1834         struct ib_event event;
1835         struct ib_srq *base_srq = &srq->base_srq;
1836
1837         event.event = etype;
1838         event.device = base_srq->device;
1839         event.element.srq = base_srq;
1840
1841         if (base_srq->event_handler) {
1842                 siw_dbg_pd(srq->base_srq.pd,
1843                            "reporting SRQ event %d\n", etype);
1844                 base_srq->event_handler(&event, base_srq->srq_context);
1845         }
1846 }
1847
1848 void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype)
1849 {
1850         struct ib_event event;
1851
1852         event.event = etype;
1853         event.device = &sdev->base_dev;
1854         event.element.port_num = port;
1855
1856         siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
1857
1858         ib_dispatch_event(&event);
1859 }