RDMA/mlx5: Add capability for max sge to get optimized performance
drivers/infiniband/hw/mlx5/main.c
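The subject line refers to a small addition in mlx5_ib_query_device() (listing line 1014 below): the driver now fills props->max_sgl_rd from the HCA capability max_sgl_for_optimized_performance, letting consumers keep scatter/gather lists, for RDMA READs in particular, within the length the device handles at full speed. The rest of the listing is the surrounding driver code, shown for context.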
1 /*
2  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/module.h>
36 #include <linux/init.h>
37 #include <linux/errno.h>
38 #include <linux/pci.h>
39 #include <linux/dma-mapping.h>
40 #include <linux/slab.h>
41 #include <linux/bitmap.h>
42 #if defined(CONFIG_X86)
43 #include <asm/pat.h>
44 #endif
45 #include <linux/sched.h>
46 #include <linux/sched/mm.h>
47 #include <linux/sched/task.h>
48 #include <linux/delay.h>
49 #include <rdma/ib_user_verbs.h>
50 #include <rdma/ib_addr.h>
51 #include <rdma/ib_cache.h>
52 #include <linux/mlx5/port.h>
53 #include <linux/mlx5/vport.h>
54 #include <linux/mlx5/fs.h>
55 #include <linux/mlx5/eswitch.h>
56 #include <linux/list.h>
57 #include <rdma/ib_smi.h>
58 #include <rdma/ib_umem.h>
59 #include <linux/in.h>
60 #include <linux/etherdevice.h>
61 #include "mlx5_ib.h"
62 #include "ib_rep.h"
63 #include "cmd.h"
64 #include "srq.h"
65 #include <linux/mlx5/fs_helpers.h>
66 #include <linux/mlx5/accel.h>
67 #include <rdma/uverbs_std_types.h>
68 #include <rdma/mlx5_user_ioctl_verbs.h>
69 #include <rdma/mlx5_user_ioctl_cmds.h>
70
71 #define UVERBS_MODULE_NAME mlx5_ib
72 #include <rdma/uverbs_named_ioctl.h>
73
74 #define DRIVER_NAME "mlx5_ib"
75 #define DRIVER_VERSION "5.0-0"
76
77 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
78 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
79 MODULE_LICENSE("Dual BSD/GPL");
80
81 static char mlx5_version[] =
82         DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
83         DRIVER_VERSION "\n";
84
85 struct mlx5_ib_event_work {
86         struct work_struct      work;
87         union {
88                 struct mlx5_ib_dev            *dev;
89                 struct mlx5_ib_multiport_info *mpi;
90         };
91         bool                    is_slave;
92         unsigned int            event;
93         void                    *param;
94 };
95
96 enum {
97         MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
98 };
99
100 static struct workqueue_struct *mlx5_ib_event_wq;
101 static LIST_HEAD(mlx5_ib_unaffiliated_port_list);
102 static LIST_HEAD(mlx5_ib_dev_list);
103 /*
104  * This mutex should be held when accessing either of the above lists
105  */
106 static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
107
108 /* We can't use an array for xlt_emergency_page because dma_map_single
109  * doesn't work on kernel module memory
110  */
111 static unsigned long xlt_emergency_page;
112 static struct mutex xlt_emergency_page_mutex;
113
114 struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
115 {
116         struct mlx5_ib_dev *dev;
117
118         mutex_lock(&mlx5_ib_multiport_mutex);
119         dev = mpi->ibdev;
120         mutex_unlock(&mlx5_ib_multiport_mutex);
121         return dev;
122 }
123
124 static enum rdma_link_layer
125 mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
126 {
127         switch (port_type_cap) {
128         case MLX5_CAP_PORT_TYPE_IB:
129                 return IB_LINK_LAYER_INFINIBAND;
130         case MLX5_CAP_PORT_TYPE_ETH:
131                 return IB_LINK_LAYER_ETHERNET;
132         default:
133                 return IB_LINK_LAYER_UNSPECIFIED;
134         }
135 }
136
137 static enum rdma_link_layer
138 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
139 {
140         struct mlx5_ib_dev *dev = to_mdev(device);
141         int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
142
143         return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
144 }
145
146 static int get_port_state(struct ib_device *ibdev,
147                           u8 port_num,
148                           enum ib_port_state *state)
149 {
150         struct ib_port_attr attr;
151         int ret;
152
153         memset(&attr, 0, sizeof(attr));
154         ret = ibdev->ops.query_port(ibdev, port_num, &attr);
155         if (!ret)
156                 *state = attr.state;
157         return ret;
158 }
159
160 static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
161                                            struct net_device *ndev,
162                                            u8 *port_num)
163 {
164         struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
165         struct net_device *rep_ndev;
166         struct mlx5_ib_port *port;
167         int i;
168
169         for (i = 0; i < dev->num_ports; i++) {
170                 port  = &dev->port[i];
171                 if (!port->rep)
172                         continue;
173
174                 read_lock(&port->roce.netdev_lock);
175                 rep_ndev = mlx5_ib_get_rep_netdev(esw,
176                                                   port->rep->vport);
177                 if (rep_ndev == ndev) {
178                         read_unlock(&port->roce.netdev_lock);
179                         *port_num = i + 1;
180                         return &port->roce;
181                 }
182                 read_unlock(&port->roce.netdev_lock);
183         }
184
185         return NULL;
186 }
187
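/*
 * Netdevice notifier for RoCE ports: cache the netdev backing each port on
 * NETDEV_REGISTER/UNREGISTER, and on NETDEV_CHANGE/UP/DOWN translate the
 * netdev carrier state into IB_EVENT_PORT_ACTIVE / IB_EVENT_PORT_ERR events
 * dispatched to registered IB clients.
 */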
188 static int mlx5_netdev_event(struct notifier_block *this,
189                              unsigned long event, void *ptr)
190 {
191         struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
192         struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
193         u8 port_num = roce->native_port_num;
194         struct mlx5_core_dev *mdev;
195         struct mlx5_ib_dev *ibdev;
196
197         ibdev = roce->dev;
198         mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
199         if (!mdev)
200                 return NOTIFY_DONE;
201
202         switch (event) {
203         case NETDEV_REGISTER:
204                 /* Should already be registered during the load */
205                 if (ibdev->is_rep)
206                         break;
207                 write_lock(&roce->netdev_lock);
208                 if (ndev->dev.parent == mdev->device)
209                         roce->netdev = ndev;
210                 write_unlock(&roce->netdev_lock);
211                 break;
212
213         case NETDEV_UNREGISTER:
214                 /* In case of reps, ib device goes away before the netdevs */
215                 write_lock(&roce->netdev_lock);
216                 if (roce->netdev == ndev)
217                         roce->netdev = NULL;
218                 write_unlock(&roce->netdev_lock);
219                 break;
220
221         case NETDEV_CHANGE:
222         case NETDEV_UP:
223         case NETDEV_DOWN: {
224                 struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev);
225                 struct net_device *upper = NULL;
226
227                 if (lag_ndev) {
228                         upper = netdev_master_upper_dev_get(lag_ndev);
229                         dev_put(lag_ndev);
230                 }
231
232                 if (ibdev->is_rep)
233                         roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
234                 if (!roce)
235                         return NOTIFY_DONE;
236                 if ((upper == ndev || (!upper && ndev == roce->netdev))
237                     && ibdev->ib_active) {
238                         struct ib_event ibev = { };
239                         enum ib_port_state port_state;
240
241                         if (get_port_state(&ibdev->ib_dev, port_num,
242                                            &port_state))
243                                 goto done;
244
245                         if (roce->last_port_state == port_state)
246                                 goto done;
247
248                         roce->last_port_state = port_state;
249                         ibev.device = &ibdev->ib_dev;
250                         if (port_state == IB_PORT_DOWN)
251                                 ibev.event = IB_EVENT_PORT_ERR;
252                         else if (port_state == IB_PORT_ACTIVE)
253                                 ibev.event = IB_EVENT_PORT_ACTIVE;
254                         else
255                                 goto done;
256
257                         ibev.element.port_num = port_num;
258                         ib_dispatch_event(&ibev);
259                 }
260                 break;
261         }
262
263         default:
264                 break;
265         }
266 done:
267         mlx5_ib_put_native_port_mdev(ibdev, port_num);
268         return NOTIFY_DONE;
269 }
270
271 static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
272                                              u8 port_num)
273 {
274         struct mlx5_ib_dev *ibdev = to_mdev(device);
275         struct net_device *ndev;
276         struct mlx5_core_dev *mdev;
277
278         mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
279         if (!mdev)
280                 return NULL;
281
282         ndev = mlx5_lag_get_roce_netdev(mdev);
283         if (ndev)
284                 goto out;
285
286         /* Ensure ndev does not disappear before we invoke dev_hold()
287          */
288         read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
289         ndev = ibdev->port[port_num - 1].roce.netdev;
290         if (ndev)
291                 dev_hold(ndev);
292         read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
293
294 out:
295         mlx5_ib_put_native_port_mdev(ibdev, port_num);
296         return ndev;
297 }
298
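/*
 * Resolve the mlx5_core_dev that currently backs an IB port. With multiport
 * (dual-port RoCE) the port may be served by an affiliated slave device; a
 * reference is then held on it until mlx5_ib_put_native_port_mdev() is
 * called. Returns NULL if the port is not affiliated yet.
 */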
299 struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
300                                                    u8 ib_port_num,
301                                                    u8 *native_port_num)
302 {
303         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
304                                                           ib_port_num);
305         struct mlx5_core_dev *mdev = NULL;
306         struct mlx5_ib_multiport_info *mpi;
307         struct mlx5_ib_port *port;
308
309         if (!mlx5_core_mp_enabled(ibdev->mdev) ||
310             ll != IB_LINK_LAYER_ETHERNET) {
311                 if (native_port_num)
312                         *native_port_num = ib_port_num;
313                 return ibdev->mdev;
314         }
315
316         if (native_port_num)
317                 *native_port_num = 1;
318
319         port = &ibdev->port[ib_port_num - 1];
320         if (!port)
321                 return NULL;
322
323         spin_lock(&port->mp.mpi_lock);
324         mpi = ibdev->port[ib_port_num - 1].mp.mpi;
325         if (mpi && !mpi->unaffiliate) {
326                 mdev = mpi->mdev;
327                 /* If it's the master, there is no need to refcount; it will
328                  * exist as long as the ib_dev does.
329                  */
330                 if (!mpi->is_master)
331                         mpi->mdev_refcnt++;
332         }
333         spin_unlock(&port->mp.mpi_lock);
334
335         return mdev;
336 }
337
338 void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num)
339 {
340         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
341                                                           port_num);
342         struct mlx5_ib_multiport_info *mpi;
343         struct mlx5_ib_port *port;
344
345         if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
346                 return;
347
348         port = &ibdev->port[port_num - 1];
349
350         spin_lock(&port->mp.mpi_lock);
351         mpi = ibdev->port[port_num - 1].mp.mpi;
352         if (mpi->is_master)
353                 goto out;
354
355         mpi->mdev_refcnt--;
356         if (mpi->unaffiliate)
357                 complete(&mpi->unref_comp);
358 out:
359         spin_unlock(&port->mp.mpi_lock);
360 }
361
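/*
 * Map the operational Ethernet protocol (PTYS eth_proto_oper) reported by
 * the device to the closest IB (width, speed) pair, e.g. 100GBASE_CR4 is
 * reported as 4X EDR. The legacy and extended translations below cover the
 * two PTYS field layouts.
 */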
362 static int translate_eth_legacy_proto_oper(u32 eth_proto_oper, u8 *active_speed,
363                                            u8 *active_width)
364 {
365         switch (eth_proto_oper) {
366         case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
367         case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
368         case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
369         case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
370                 *active_width = IB_WIDTH_1X;
371                 *active_speed = IB_SPEED_SDR;
372                 break;
373         case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
374         case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
375         case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
376         case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
377         case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
378         case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
379         case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
380                 *active_width = IB_WIDTH_1X;
381                 *active_speed = IB_SPEED_QDR;
382                 break;
383         case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
384         case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
385         case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
386                 *active_width = IB_WIDTH_1X;
387                 *active_speed = IB_SPEED_EDR;
388                 break;
389         case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
390         case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
391         case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
392         case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
393                 *active_width = IB_WIDTH_4X;
394                 *active_speed = IB_SPEED_QDR;
395                 break;
396         case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
397         case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
398         case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
399                 *active_width = IB_WIDTH_1X;
400                 *active_speed = IB_SPEED_HDR;
401                 break;
402         case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
403                 *active_width = IB_WIDTH_4X;
404                 *active_speed = IB_SPEED_FDR;
405                 break;
406         case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
407         case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
408         case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
409         case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
410                 *active_width = IB_WIDTH_4X;
411                 *active_speed = IB_SPEED_EDR;
412                 break;
413         default:
414                 return -EINVAL;
415         }
416
417         return 0;
418 }
419
420 static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed,
421                                         u8 *active_width)
422 {
423         switch (eth_proto_oper) {
424         case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
425         case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
426                 *active_width = IB_WIDTH_1X;
427                 *active_speed = IB_SPEED_SDR;
428                 break;
429         case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
430                 *active_width = IB_WIDTH_1X;
431                 *active_speed = IB_SPEED_DDR;
432                 break;
433         case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
434                 *active_width = IB_WIDTH_1X;
435                 *active_speed = IB_SPEED_QDR;
436                 break;
437         case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
438                 *active_width = IB_WIDTH_4X;
439                 *active_speed = IB_SPEED_QDR;
440                 break;
441         case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
442                 *active_width = IB_WIDTH_1X;
443                 *active_speed = IB_SPEED_EDR;
444                 break;
445         case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
446                 *active_width = IB_WIDTH_2X;
447                 *active_speed = IB_SPEED_EDR;
448                 break;
449         case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
450                 *active_width = IB_WIDTH_1X;
451                 *active_speed = IB_SPEED_HDR;
452                 break;
453         case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
454                 *active_width = IB_WIDTH_4X;
455                 *active_speed = IB_SPEED_EDR;
456                 break;
457         case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
458                 *active_width = IB_WIDTH_2X;
459                 *active_speed = IB_SPEED_HDR;
460                 break;
461         case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
462                 *active_width = IB_WIDTH_4X;
463                 *active_speed = IB_SPEED_HDR;
464                 break;
465         default:
466                 return -EINVAL;
467         }
468
469         return 0;
470 }
471
472 static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
473                                     u8 *active_width, bool ext)
474 {
475         return ext ?
476                 translate_eth_ext_proto_oper(eth_proto_oper, active_speed,
477                                              active_width) :
478                 translate_eth_legacy_proto_oper(eth_proto_oper, active_speed,
479                                                 active_width);
480 }
481
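/*
 * Build ib_port_attr for a RoCE port: active speed and width come from the
 * PTYS register, while port state, physical state and MTU are derived from
 * the backing netdev (or its LAG master when LAG is active).
 */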
482 static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
483                                 struct ib_port_attr *props)
484 {
485         struct mlx5_ib_dev *dev = to_mdev(device);
486         u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
487         struct mlx5_core_dev *mdev;
488         struct net_device *ndev, *upper;
489         enum ib_mtu ndev_ib_mtu;
490         bool put_mdev = true;
491         u16 qkey_viol_cntr;
492         u32 eth_prot_oper;
493         u8 mdev_port_num;
494         bool ext;
495         int err;
496
497         mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
498         if (!mdev) {
499                 /* This means the port isn't affiliated yet. Get the
500                  * info for the master port instead.
501                  */
502                 put_mdev = false;
503                 mdev = dev->mdev;
504                 mdev_port_num = 1;
505                 port_num = 1;
506         }
507
508         /* Possible bad flows are checked before filling out props, so in case
509          * of an error it will still be zeroed out.
510          * Use the native port in the case of reps.
511          */
512         if (dev->is_rep)
513                 err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
514                                            1);
515         else
516                 err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
517                                            mdev_port_num);
518         if (err)
519                 goto out;
520         ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet);
521         eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
522
523         props->active_width     = IB_WIDTH_4X;
524         props->active_speed     = IB_SPEED_QDR;
525
526         translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
527                                  &props->active_width, ext);
528
529         props->port_cap_flags |= IB_PORT_CM_SUP;
530         props->ip_gids = true;
531
532         props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
533                                                 roce_address_table_size);
534         props->max_mtu          = IB_MTU_4096;
535         props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
536         props->pkey_tbl_len     = 1;
537         props->state            = IB_PORT_DOWN;
538         props->phys_state       = IB_PORT_PHYS_STATE_DISABLED;
539
540         mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
541         props->qkey_viol_cntr = qkey_viol_cntr;
542
543         /* If this is a stub query for an unaffiliated port, stop here */
544         if (!put_mdev)
545                 goto out;
546
547         ndev = mlx5_ib_get_netdev(device, port_num);
548         if (!ndev)
549                 goto out;
550
551         if (dev->lag_active) {
552                 rcu_read_lock();
553                 upper = netdev_master_upper_dev_get_rcu(ndev);
554                 if (upper) {
555                         dev_put(ndev);
556                         ndev = upper;
557                         dev_hold(ndev);
558                 }
559                 rcu_read_unlock();
560         }
561
562         if (netif_running(ndev) && netif_carrier_ok(ndev)) {
563                 props->state      = IB_PORT_ACTIVE;
564                 props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
565         }
566
567         ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
568
569         dev_put(ndev);
570
571         props->active_mtu       = min(props->max_mtu, ndev_ib_mtu);
572 out:
573         if (put_mdev)
574                 mlx5_ib_put_native_port_mdev(dev, port_num);
575         return err;
576 }
577
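/*
 * Program one entry of the device RoCE address table. The RoCE version and
 * L3 type are derived from the GID type (RoCE v1 vs. v2, IPv4-mapped vs.
 * IPv6). Used by the add/del GID callbacks right below.
 */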
578 static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
579                          unsigned int index, const union ib_gid *gid,
580                          const struct ib_gid_attr *attr)
581 {
582         enum ib_gid_type gid_type = IB_GID_TYPE_IB;
583         u16 vlan_id = 0xffff;
584         u8 roce_version = 0;
585         u8 roce_l3_type = 0;
586         u8 mac[ETH_ALEN];
587         int ret;
588
589         if (gid) {
590                 gid_type = attr->gid_type;
591                 ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
592                 if (ret)
593                         return ret;
594         }
595
596         switch (gid_type) {
597         case IB_GID_TYPE_IB:
598                 roce_version = MLX5_ROCE_VERSION_1;
599                 break;
600         case IB_GID_TYPE_ROCE_UDP_ENCAP:
601                 roce_version = MLX5_ROCE_VERSION_2;
602                 if (ipv6_addr_v4mapped((void *)gid))
603                         roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4;
604                 else
605                         roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6;
606                 break;
607
608         default:
609                 mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type);
610         }
611
612         return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
613                                       roce_l3_type, gid->raw, mac,
614                                       vlan_id < VLAN_CFI_MASK, vlan_id,
615                                       port_num);
616 }
617
618 static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
619                            __always_unused void **context)
620 {
621         return set_roce_addr(to_mdev(attr->device), attr->port_num,
622                              attr->index, &attr->gid, attr);
623 }
624
625 static int mlx5_ib_del_gid(const struct ib_gid_attr *attr,
626                            __always_unused void **context)
627 {
628         return set_roce_addr(to_mdev(attr->device), attr->port_num,
629                              attr->index, NULL, NULL);
630 }
631
632 __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev,
633                                const struct ib_gid_attr *attr)
634 {
635         if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
636                 return 0;
637
638         return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
639 }
640
641 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
642 {
643         if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
644                 return !MLX5_CAP_GEN(dev->mdev, ib_virt);
645         return 0;
646 }
647
648 enum {
649         MLX5_VPORT_ACCESS_METHOD_MAD,
650         MLX5_VPORT_ACCESS_METHOD_HCA,
651         MLX5_VPORT_ACCESS_METHOD_NIC,
652 };
653
654 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
655 {
656         if (mlx5_use_mad_ifc(to_mdev(ibdev)))
657                 return MLX5_VPORT_ACCESS_METHOD_MAD;
658
659         if (mlx5_ib_port_link_layer(ibdev, 1) ==
660             IB_LINK_LAYER_ETHERNET)
661                 return MLX5_VPORT_ACCESS_METHOD_NIC;
662
663         return MLX5_VPORT_ACCESS_METHOD_HCA;
664 }
665
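/*
 * Report IB_ATOMIC_HCA only if the device supports both 8-byte
 * compare-and-swap and fetch-and-add at the given atomic size and can
 * respond in host endianness; otherwise report IB_ATOMIC_NONE.
 */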
666 static void get_atomic_caps(struct mlx5_ib_dev *dev,
667                             u8 atomic_size_qp,
668                             struct ib_device_attr *props)
669 {
670         u8 tmp;
671         u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
672         u8 atomic_req_8B_endianness_mode =
673                 MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode);
674
675         /* Check if HW supports 8-byte standard atomic operations and is
676          * capable of responding in host endianness.
677          */
678         tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
679         if (((atomic_operations & tmp) == tmp) &&
680             (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
681             (atomic_req_8B_endianness_mode)) {
682                 props->atomic_cap = IB_ATOMIC_HCA;
683         } else {
684                 props->atomic_cap = IB_ATOMIC_NONE;
685         }
686 }
687
688 static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
689                                struct ib_device_attr *props)
690 {
691         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
692
693         get_atomic_caps(dev, atomic_size_qp, props);
694 }
695
696 static void get_atomic_caps_dc(struct mlx5_ib_dev *dev,
697                                struct ib_device_attr *props)
698 {
699         u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
700
701         get_atomic_caps(dev, atomic_size_qp, props);
702 }
703
704 bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev)
705 {
706         struct ib_device_attr props = {};
707
708         get_atomic_caps_dc(dev, &props);
709         return (props.atomic_cap == IB_ATOMIC_HCA) ? true : false;
710 }

711 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
712                                         __be64 *sys_image_guid)
713 {
714         struct mlx5_ib_dev *dev = to_mdev(ibdev);
715         struct mlx5_core_dev *mdev = dev->mdev;
716         u64 tmp;
717         int err;
718
719         switch (mlx5_get_vport_access_method(ibdev)) {
720         case MLX5_VPORT_ACCESS_METHOD_MAD:
721                 return mlx5_query_mad_ifc_system_image_guid(ibdev,
722                                                             sys_image_guid);
723
724         case MLX5_VPORT_ACCESS_METHOD_HCA:
725                 err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
726                 break;
727
728         case MLX5_VPORT_ACCESS_METHOD_NIC:
729                 err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
730                 break;
731
732         default:
733                 return -EINVAL;
734         }
735
736         if (!err)
737                 *sys_image_guid = cpu_to_be64(tmp);
738
739         return err;
740
741 }
742
743 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
744                                 u16 *max_pkeys)
745 {
746         struct mlx5_ib_dev *dev = to_mdev(ibdev);
747         struct mlx5_core_dev *mdev = dev->mdev;
748
749         switch (mlx5_get_vport_access_method(ibdev)) {
750         case MLX5_VPORT_ACCESS_METHOD_MAD:
751                 return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
752
753         case MLX5_VPORT_ACCESS_METHOD_HCA:
754         case MLX5_VPORT_ACCESS_METHOD_NIC:
755                 *max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
756                                                 pkey_table_size));
757                 return 0;
758
759         default:
760                 return -EINVAL;
761         }
762 }
763
764 static int mlx5_query_vendor_id(struct ib_device *ibdev,
765                                 u32 *vendor_id)
766 {
767         struct mlx5_ib_dev *dev = to_mdev(ibdev);
768
769         switch (mlx5_get_vport_access_method(ibdev)) {
770         case MLX5_VPORT_ACCESS_METHOD_MAD:
771                 return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
772
773         case MLX5_VPORT_ACCESS_METHOD_HCA:
774         case MLX5_VPORT_ACCESS_METHOD_NIC:
775                 return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
776
777         default:
778                 return -EINVAL;
779         }
780 }
781
782 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
783                                 __be64 *node_guid)
784 {
785         u64 tmp;
786         int err;
787
788         switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
789         case MLX5_VPORT_ACCESS_METHOD_MAD:
790                 return mlx5_query_mad_ifc_node_guid(dev, node_guid);
791
792         case MLX5_VPORT_ACCESS_METHOD_HCA:
793                 err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
794                 break;
795
796         case MLX5_VPORT_ACCESS_METHOD_NIC:
797                 err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
798                 break;
799
800         default:
801                 return -EINVAL;
802         }
803
804         if (!err)
805                 *node_guid = cpu_to_be64(tmp);
806
807         return err;
808 }
809
810 struct mlx5_reg_node_desc {
811         u8      desc[IB_DEVICE_NODE_DESC_MAX];
812 };
813
814 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
815 {
816         struct mlx5_reg_node_desc in;
817
818         if (mlx5_use_mad_ifc(dev))
819                 return mlx5_query_mad_ifc_node_desc(dev, node_desc);
820
821         memset(&in, 0, sizeof(in));
822
823         return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
824                                     sizeof(struct mlx5_reg_node_desc),
825                                     MLX5_REG_NODE_DESC, 0, 0);
826 }
827
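/*
 * Fill struct ib_device_attr from the HCA capability bits. When called via
 * the extended uverbs path (uhw->outlen != 0), mlx5-specific capabilities
 * (TSO, RSS, CQE compression, packet pacing, striding RQ, ...) are also
 * returned in mlx5_ib_query_device_resp; resp.response_length only grows
 * for fields the caller's output buffer can actually hold.
 */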
828 static int mlx5_ib_query_device(struct ib_device *ibdev,
829                                 struct ib_device_attr *props,
830                                 struct ib_udata *uhw)
831 {
832         struct mlx5_ib_dev *dev = to_mdev(ibdev);
833         struct mlx5_core_dev *mdev = dev->mdev;
834         int err = -ENOMEM;
835         int max_sq_desc;
836         int max_rq_sg;
837         int max_sq_sg;
838         u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
839         bool raw_support = !mlx5_core_mp_enabled(mdev);
840         struct mlx5_ib_query_device_resp resp = {};
841         size_t resp_len;
842         u64 max_tso;
843
844         resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
845         if (uhw->outlen && uhw->outlen < resp_len)
846                 return -EINVAL;
847         else
848                 resp.response_length = resp_len;
849
850         if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
851                 return -EINVAL;
852
853         memset(props, 0, sizeof(*props));
854         err = mlx5_query_system_image_guid(ibdev,
855                                            &props->sys_image_guid);
856         if (err)
857                 return err;
858
859         err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
860         if (err)
861                 return err;
862
863         err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
864         if (err)
865                 return err;
866
867         props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
868                 (fw_rev_min(dev->mdev) << 16) |
869                 fw_rev_sub(dev->mdev);
870         props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
871                 IB_DEVICE_PORT_ACTIVE_EVENT             |
872                 IB_DEVICE_SYS_IMAGE_GUID                |
873                 IB_DEVICE_RC_RNR_NAK_GEN;
874
875         if (MLX5_CAP_GEN(mdev, pkv))
876                 props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
877         if (MLX5_CAP_GEN(mdev, qkv))
878                 props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
879         if (MLX5_CAP_GEN(mdev, apm))
880                 props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
881         if (MLX5_CAP_GEN(mdev, xrc))
882                 props->device_cap_flags |= IB_DEVICE_XRC;
883         if (MLX5_CAP_GEN(mdev, imaicl)) {
884                 props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
885                                            IB_DEVICE_MEM_WINDOW_TYPE_2B;
886                 props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
887                 /* We support 'Gappy' memory registration too */
888                 props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
889         }
890         props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
891         if (MLX5_CAP_GEN(mdev, sho)) {
892                 props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER;
893                 /* At this stage no support for signature handover */
894                 props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
895                                       IB_PROT_T10DIF_TYPE_2 |
896                                       IB_PROT_T10DIF_TYPE_3;
897                 props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
898                                        IB_GUARD_T10DIF_CSUM;
899         }
900         if (MLX5_CAP_GEN(mdev, block_lb_mc))
901                 props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
902
903         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) {
904                 if (MLX5_CAP_ETH(mdev, csum_cap)) {
905                         /* Legacy bit to support old userspace libraries */
906                         props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
907                         props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
908                 }
909
910                 if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
911                         props->raw_packet_caps |=
912                                 IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
913
914                 if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
915                         max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
916                         if (max_tso) {
917                                 resp.tso_caps.max_tso = 1 << max_tso;
918                                 resp.tso_caps.supported_qpts |=
919                                         1 << IB_QPT_RAW_PACKET;
920                                 resp.response_length += sizeof(resp.tso_caps);
921                         }
922                 }
923
924                 if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
925                         resp.rss_caps.rx_hash_function =
926                                                 MLX5_RX_HASH_FUNC_TOEPLITZ;
927                         resp.rss_caps.rx_hash_fields_mask =
928                                                 MLX5_RX_HASH_SRC_IPV4 |
929                                                 MLX5_RX_HASH_DST_IPV4 |
930                                                 MLX5_RX_HASH_SRC_IPV6 |
931                                                 MLX5_RX_HASH_DST_IPV6 |
932                                                 MLX5_RX_HASH_SRC_PORT_TCP |
933                                                 MLX5_RX_HASH_DST_PORT_TCP |
934                                                 MLX5_RX_HASH_SRC_PORT_UDP |
935                                                 MLX5_RX_HASH_DST_PORT_UDP |
936                                                 MLX5_RX_HASH_INNER;
937                         if (mlx5_accel_ipsec_device_caps(dev->mdev) &
938                             MLX5_ACCEL_IPSEC_CAP_DEVICE)
939                                 resp.rss_caps.rx_hash_fields_mask |=
940                                         MLX5_RX_HASH_IPSEC_SPI;
941                         resp.response_length += sizeof(resp.rss_caps);
942                 }
943         } else {
944                 if (field_avail(typeof(resp), tso_caps, uhw->outlen))
945                         resp.response_length += sizeof(resp.tso_caps);
946                 if (field_avail(typeof(resp), rss_caps, uhw->outlen))
947                         resp.response_length += sizeof(resp.rss_caps);
948         }
949
950         if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
951                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
952                 props->device_cap_flags |= IB_DEVICE_UD_TSO;
953         }
954
955         if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
956             MLX5_CAP_GEN(dev->mdev, general_notification_event) &&
957             raw_support)
958                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
959
960         if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
961             MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
962                 props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
963
964         if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
965             MLX5_CAP_ETH(dev->mdev, scatter_fcs) &&
966             raw_support) {
967                 /* Legacy bit to support old userspace libraries */
968                 props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
969                 props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
970         }
971
972         if (MLX5_CAP_DEV_MEM(mdev, memic)) {
973                 props->max_dm_size =
974                         MLX5_CAP_DEV_MEM(mdev, max_memic_size);
975         }
976
977         if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
978                 props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
979
980         if (MLX5_CAP_GEN(mdev, end_pad))
981                 props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING;
982
983         props->vendor_part_id      = mdev->pdev->device;
984         props->hw_ver              = mdev->pdev->revision;
985
986         props->max_mr_size         = ~0ull;
987         props->page_size_cap       = ~(min_page_size - 1);
988         props->max_qp              = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
989         props->max_qp_wr           = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
990         max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
991                      sizeof(struct mlx5_wqe_data_seg);
992         max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
993         max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
994                      sizeof(struct mlx5_wqe_raddr_seg)) /
995                 sizeof(struct mlx5_wqe_data_seg);
996         props->max_send_sge = max_sq_sg;
997         props->max_recv_sge = max_rq_sg;
998         props->max_sge_rd          = MLX5_MAX_SGE_RD;
999         props->max_cq              = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
1000         props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
1001         props->max_mr              = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
1002         props->max_pd              = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
1003         props->max_qp_rd_atom      = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
1004         props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
1005         props->max_srq             = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
1006         props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
1007         props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
1008         props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
1009         props->max_srq_sge         = max_rq_sg - 1;
1010         props->max_fast_reg_page_list_len =
1011                 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
1012         props->max_pi_fast_reg_page_list_len =
1013                 props->max_fast_reg_page_list_len / 2;
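             /*
              * The addition this patch is about: expose the device's preferred
              * scatter/gather list length so that callers (RDMA READ work
              * requests in particular, hence "sgl_rd") can stay within the
              * SGE count the HCA handles at full performance.
              */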
1014         props->max_sgl_rd =
1015                 MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance);
1016         get_atomic_caps_qp(dev, props);
1017         props->masked_atomic_cap   = IB_ATOMIC_NONE;
1018         props->max_mcast_grp       = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
1019         props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
1020         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
1021                                            props->max_mcast_grp;
1022         props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
1023         props->max_ah = INT_MAX;
1024         props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
1025         props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
1026
1027         if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
1028                 if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
1029                         props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
1030                 props->odp_caps = dev->odp_caps;
1031         }
1032
1033         if (MLX5_CAP_GEN(mdev, cd))
1034                 props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
1035
1036         if (!mlx5_core_is_pf(mdev))
1037                 props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
1038
1039         if (mlx5_ib_port_link_layer(ibdev, 1) ==
1040             IB_LINK_LAYER_ETHERNET && raw_support) {
1041                 props->rss_caps.max_rwq_indirection_tables =
1042                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
1043                 props->rss_caps.max_rwq_indirection_table_size =
1044                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
1045                 props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
1046                 props->max_wq_type_rq =
1047                         1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
1048         }
1049
1050         if (MLX5_CAP_GEN(mdev, tag_matching)) {
1051                 props->tm_caps.max_num_tags =
1052                         (1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
1053                 props->tm_caps.max_ops =
1054                         1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
1055                 props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
1056         }
1057
1058         if (MLX5_CAP_GEN(mdev, tag_matching) &&
1059             MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
1060                 props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
1061                 props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
1062         }
1063
1064         if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
1065                 props->cq_caps.max_cq_moderation_count =
1066                                                 MLX5_MAX_CQ_COUNT;
1067                 props->cq_caps.max_cq_moderation_period =
1068                                                 MLX5_MAX_CQ_PERIOD;
1069         }
1070
1071         if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
1072                 resp.response_length += sizeof(resp.cqe_comp_caps);
1073
1074                 if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
1075                         resp.cqe_comp_caps.max_num =
1076                                 MLX5_CAP_GEN(dev->mdev,
1077                                              cqe_compression_max_num);
1078
1079                         resp.cqe_comp_caps.supported_format =
1080                                 MLX5_IB_CQE_RES_FORMAT_HASH |
1081                                 MLX5_IB_CQE_RES_FORMAT_CSUM;
1082
1083                         if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index))
1084                                 resp.cqe_comp_caps.supported_format |=
1085                                         MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX;
1086                 }
1087         }
1088
1089         if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) &&
1090             raw_support) {
1091                 if (MLX5_CAP_QOS(mdev, packet_pacing) &&
1092                     MLX5_CAP_GEN(mdev, qos)) {
1093                         resp.packet_pacing_caps.qp_rate_limit_max =
1094                                 MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
1095                         resp.packet_pacing_caps.qp_rate_limit_min =
1096                                 MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
1097                         resp.packet_pacing_caps.supported_qpts |=
1098                                 1 << IB_QPT_RAW_PACKET;
1099                         if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) &&
1100                             MLX5_CAP_QOS(mdev, packet_pacing_typical_size))
1101                                 resp.packet_pacing_caps.cap_flags |=
1102                                         MLX5_IB_PP_SUPPORT_BURST;
1103                 }
1104                 resp.response_length += sizeof(resp.packet_pacing_caps);
1105         }
1106
1107         if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
1108                         uhw->outlen)) {
1109                 if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
1110                         resp.mlx5_ib_support_multi_pkt_send_wqes =
1111                                 MLX5_IB_ALLOW_MPW;
1112
1113                 if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe))
1114                         resp.mlx5_ib_support_multi_pkt_send_wqes |=
1115                                 MLX5_IB_SUPPORT_EMPW;
1116
1117                 resp.response_length +=
1118                         sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
1119         }
1120
1121         if (field_avail(typeof(resp), flags, uhw->outlen)) {
1122                 resp.response_length += sizeof(resp.flags);
1123
1124                 if (MLX5_CAP_GEN(mdev, cqe_compression_128))
1125                         resp.flags |=
1126                                 MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP;
1127
1128                 if (MLX5_CAP_GEN(mdev, cqe_128_always))
1129                         resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
1130                 if (MLX5_CAP_GEN(mdev, qp_packet_based))
1131                         resp.flags |=
1132                                 MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
1133
1134                 resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
1135         }
1136
1137         if (field_avail(typeof(resp), sw_parsing_caps,
1138                         uhw->outlen)) {
1139                 resp.response_length += sizeof(resp.sw_parsing_caps);
1140                 if (MLX5_CAP_ETH(mdev, swp)) {
1141                         resp.sw_parsing_caps.sw_parsing_offloads |=
1142                                 MLX5_IB_SW_PARSING;
1143
1144                         if (MLX5_CAP_ETH(mdev, swp_csum))
1145                                 resp.sw_parsing_caps.sw_parsing_offloads |=
1146                                         MLX5_IB_SW_PARSING_CSUM;
1147
1148                         if (MLX5_CAP_ETH(mdev, swp_lso))
1149                                 resp.sw_parsing_caps.sw_parsing_offloads |=
1150                                         MLX5_IB_SW_PARSING_LSO;
1151
1152                         if (resp.sw_parsing_caps.sw_parsing_offloads)
1153                                 resp.sw_parsing_caps.supported_qpts =
1154                                         BIT(IB_QPT_RAW_PACKET);
1155                 }
1156         }
1157
1158         if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) &&
1159             raw_support) {
1160                 resp.response_length += sizeof(resp.striding_rq_caps);
1161                 if (MLX5_CAP_GEN(mdev, striding_rq)) {
1162                         resp.striding_rq_caps.min_single_stride_log_num_of_bytes =
1163                                 MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
1164                         resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
1165                                 MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
1166                         resp.striding_rq_caps.min_single_wqe_log_num_of_strides =
1167                                 MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1168                         resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
1169                                 MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
1170                         resp.striding_rq_caps.supported_qpts =
1171                                 BIT(IB_QPT_RAW_PACKET);
1172                 }
1173         }
1174
1175         if (field_avail(typeof(resp), tunnel_offloads_caps,
1176                         uhw->outlen)) {
1177                 resp.response_length += sizeof(resp.tunnel_offloads_caps);
1178                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
1179                         resp.tunnel_offloads_caps |=
1180                                 MLX5_IB_TUNNELED_OFFLOADS_VXLAN;
1181                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx))
1182                         resp.tunnel_offloads_caps |=
1183                                 MLX5_IB_TUNNELED_OFFLOADS_GENEVE;
1184                 if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
1185                         resp.tunnel_offloads_caps |=
1186                                 MLX5_IB_TUNNELED_OFFLOADS_GRE;
1187                 if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
1188                     MLX5_FLEX_PROTO_CW_MPLS_GRE)
1189                         resp.tunnel_offloads_caps |=
1190                                 MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
1191                 if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
1192                     MLX5_FLEX_PROTO_CW_MPLS_UDP)
1193                         resp.tunnel_offloads_caps |=
1194                                 MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
1195         }
1196
1197         if (uhw->outlen) {
1198                 err = ib_copy_to_udata(uhw, &resp, resp.response_length);
1199
1200                 if (err)
1201                         return err;
1202         }
1203
1204         return 0;
1205 }
1206
1207 enum mlx5_ib_width {
1208         MLX5_IB_WIDTH_1X        = 1 << 0,
1209         MLX5_IB_WIDTH_2X        = 1 << 1,
1210         MLX5_IB_WIDTH_4X        = 1 << 2,
1211         MLX5_IB_WIDTH_8X        = 1 << 3,
1212         MLX5_IB_WIDTH_12X       = 1 << 4
1213 };
1214
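/*
 * Convert the firmware's active-width bitmask into the ib_port_attr width
 * enum; unknown values fall back to 4X with a debug message.
 */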
1215 static void translate_active_width(struct ib_device *ibdev, u8 active_width,
1216                                   u8 *ib_width)
1217 {
1218         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1219
1220         if (active_width & MLX5_IB_WIDTH_1X)
1221                 *ib_width = IB_WIDTH_1X;
1222         else if (active_width & MLX5_IB_WIDTH_2X)
1223                 *ib_width = IB_WIDTH_2X;
1224         else if (active_width & MLX5_IB_WIDTH_4X)
1225                 *ib_width = IB_WIDTH_4X;
1226         else if (active_width & MLX5_IB_WIDTH_8X)
1227                 *ib_width = IB_WIDTH_8X;
1228         else if (active_width & MLX5_IB_WIDTH_12X)
1229                 *ib_width = IB_WIDTH_12X;
1230         else {
1231                 mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n",
1232                             (int)active_width);
1233                 *ib_width = IB_WIDTH_4X;
1234         }
1235
1236         return;
1237 }
1238
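/* Map an MTU in bytes to the IB MTU enum (IB_MTU_256 == 1 ... IB_MTU_4096 == 5). */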
1239 static int mlx5_mtu_to_ib_mtu(int mtu)
1240 {
1241         switch (mtu) {
1242         case 256: return 1;
1243         case 512: return 2;
1244         case 1024: return 3;
1245         case 2048: return 4;
1246         case 4096: return 5;
1247         default:
1248                 pr_warn("invalid mtu\n");
1249                 return -1;
1250         }
1251 }
1252
1253 enum ib_max_vl_num {
1254         __IB_MAX_VL_0           = 1,
1255         __IB_MAX_VL_0_1         = 2,
1256         __IB_MAX_VL_0_3         = 3,
1257         __IB_MAX_VL_0_7         = 4,
1258         __IB_MAX_VL_0_14        = 5,
1259 };
1260
1261 enum mlx5_vl_hw_cap {
1262         MLX5_VL_HW_0    = 1,
1263         MLX5_VL_HW_0_1  = 2,
1264         MLX5_VL_HW_0_2  = 3,
1265         MLX5_VL_HW_0_3  = 4,
1266         MLX5_VL_HW_0_4  = 5,
1267         MLX5_VL_HW_0_5  = 6,
1268         MLX5_VL_HW_0_6  = 7,
1269         MLX5_VL_HW_0_7  = 8,
1270         MLX5_VL_HW_0_14 = 15
1271 };
1272
1273 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
1274                                 u8 *max_vl_num)
1275 {
1276         switch (vl_hw_cap) {
1277         case MLX5_VL_HW_0:
1278                 *max_vl_num = __IB_MAX_VL_0;
1279                 break;
1280         case MLX5_VL_HW_0_1:
1281                 *max_vl_num = __IB_MAX_VL_0_1;
1282                 break;
1283         case MLX5_VL_HW_0_3:
1284                 *max_vl_num = __IB_MAX_VL_0_3;
1285                 break;
1286         case MLX5_VL_HW_0_7:
1287                 *max_vl_num = __IB_MAX_VL_0_7;
1288                 break;
1289         case MLX5_VL_HW_0_14:
1290                 *max_vl_num = __IB_MAX_VL_0_14;
1291                 break;
1292
1293         default:
1294                 return -EINVAL;
1295         }
1296
1297         return 0;
1298 }
1299
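/*
 * Query port attributes for an InfiniBand link layer port: most fields come
 * from the HCA vport context, with active width/speed, MTU and VL
 * capabilities obtained through the mlx5 port query helpers.
 */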
1300 static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
1301                                struct ib_port_attr *props)
1302 {
1303         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1304         struct mlx5_core_dev *mdev = dev->mdev;
1305         struct mlx5_hca_vport_context *rep;
1306         u16 max_mtu;
1307         u16 oper_mtu;
1308         int err;
1309         u8 ib_link_width_oper;
1310         u8 vl_hw_cap;
1311
1312         rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1313         if (!rep) {
1314                 err = -ENOMEM;
1315                 goto out;
1316         }
1317
1318         /* props is zeroed by the caller; avoid zeroing it here */
1319
1320         err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
1321         if (err)
1322                 goto out;
1323
1324         props->lid              = rep->lid;
1325         props->lmc              = rep->lmc;
1326         props->sm_lid           = rep->sm_lid;
1327         props->sm_sl            = rep->sm_sl;
1328         props->state            = rep->vport_state;
1329         props->phys_state       = rep->port_physical_state;
1330         props->port_cap_flags   = rep->cap_mask1;
1331         props->gid_tbl_len      = mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
1332         props->max_msg_sz       = 1 << MLX5_CAP_GEN(mdev, log_max_msg);
1333         props->pkey_tbl_len     = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
1334         props->bad_pkey_cntr    = rep->pkey_violation_counter;
1335         props->qkey_viol_cntr   = rep->qkey_violation_counter;
1336         props->subnet_timeout   = rep->subnet_timeout;
1337         props->init_type_reply  = rep->init_type_reply;
1338
1339         if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
1340                 props->port_cap_flags2 = rep->cap_mask2;
1341
1342         err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
1343         if (err)
1344                 goto out;
1345
1346         translate_active_width(ibdev, ib_link_width_oper, &props->active_width);
1347
1348         err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
1349         if (err)
1350                 goto out;
1351
1352         mlx5_query_port_max_mtu(mdev, &max_mtu, port);
1353
1354         props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
1355
1356         mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
1357
1358         props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
1359
1360         err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
1361         if (err)
1362                 goto out;
1363
1364         err = translate_max_vl_num(ibdev, vl_hw_cap,
1365                                    &props->max_vl_num);
1366 out:
1367         kfree(rep);
1368         return err;
1369 }
1370
1371 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
1372                        struct ib_port_attr *props)
1373 {
1374         unsigned int count;
1375         int ret;
1376
1377         switch (mlx5_get_vport_access_method(ibdev)) {
1378         case MLX5_VPORT_ACCESS_METHOD_MAD:
1379                 ret = mlx5_query_mad_ifc_port(ibdev, port, props);
1380                 break;
1381
1382         case MLX5_VPORT_ACCESS_METHOD_HCA:
1383                 ret = mlx5_query_hca_port(ibdev, port, props);
1384                 break;
1385
1386         case MLX5_VPORT_ACCESS_METHOD_NIC:
1387                 ret = mlx5_query_port_roce(ibdev, port, props);
1388                 break;
1389
1390         default:
1391                 ret = -EINVAL;
1392         }
1393
1394         if (!ret && props) {
1395                 struct mlx5_ib_dev *dev = to_mdev(ibdev);
1396                 struct mlx5_core_dev *mdev;
1397                 bool put_mdev = true;
1398
1399                 mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
1400                 if (!mdev) {
1401                         /* If the port isn't affiliated yet, query the master.
1402                          * The master and slave will have the same values.
1403                          */
1404                         mdev = dev->mdev;
1405                         port = 1;
1406                         put_mdev = false;
1407                 }
1408                 count = mlx5_core_reserved_gids_count(mdev);
1409                 if (put_mdev)
1410                         mlx5_ib_put_native_port_mdev(dev, port);
1411                 props->gid_tbl_len -= count;
1412         }
1413         return ret;
1414 }
1415
1416 static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port,
1417                                   struct ib_port_attr *props)
1418 {
1419         int ret;
1420
1421         /* Only link layer == Ethernet is valid for representors,
1422          * and we always use port 1.
1423          */
1424         ret = mlx5_query_port_roce(ibdev, port, props);
1425         if (ret || !props)
1426                 return ret;
1427
1428         /* We don't support GIDs */
1429         props->gid_tbl_len = 0;
1430
1431         return ret;
1432 }
1433
1434 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
1435                              union ib_gid *gid)
1436 {
1437         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1438         struct mlx5_core_dev *mdev = dev->mdev;
1439
1440         switch (mlx5_get_vport_access_method(ibdev)) {
1441         case MLX5_VPORT_ACCESS_METHOD_MAD:
1442                 return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
1443
1444         case MLX5_VPORT_ACCESS_METHOD_HCA:
1445                 return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
1446
1447         default:
1448                 return -EINVAL;
1449         }
1450
1451 }
1452
1453 static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port,
1454                                    u16 index, u16 *pkey)
1455 {
1456         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1457         struct mlx5_core_dev *mdev;
1458         bool put_mdev = true;
1459         u8 mdev_port_num;
1460         int err;
1461
1462         mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
1463         if (!mdev) {
1464                 /* The port isn't affiliated yet; get the PKey from the master
1465                  * port. For RoCE the PKey tables will be the same.
1466                  */
1467                 put_mdev = false;
1468                 mdev = dev->mdev;
1469                 mdev_port_num = 1;
1470         }
1471
1472         err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0,
1473                                         index, pkey);
1474         if (put_mdev)
1475                 mlx5_ib_put_native_port_mdev(dev, port);
1476
1477         return err;
1478 }
1479
1480 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
1481                               u16 *pkey)
1482 {
1483         switch (mlx5_get_vport_access_method(ibdev)) {
1484         case MLX5_VPORT_ACCESS_METHOD_MAD:
1485                 return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
1486
1487         case MLX5_VPORT_ACCESS_METHOD_HCA:
1488         case MLX5_VPORT_ACCESS_METHOD_NIC:
1489                 return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey);
1490         default:
1491                 return -EINVAL;
1492         }
1493 }
1494
1495 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
1496                                  struct ib_device_modify *props)
1497 {
1498         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1499         struct mlx5_reg_node_desc in;
1500         struct mlx5_reg_node_desc out;
1501         int err;
1502
1503         if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
1504                 return -EOPNOTSUPP;
1505
1506         if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
1507                 return 0;
1508
1509         /*
1510          * If possible, pass the node desc to FW so it can generate
1511          * a 144 trap. If the command fails, just ignore the error.
1512          */
1513         memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1514         err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
1515                                    sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
1516         if (err)
1517                 return err;
1518
1519         memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1520
1521         return err;
1522 }
1523
1524 static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask,
1525                                 u32 value)
1526 {
1527         struct mlx5_hca_vport_context ctx = {};
1528         struct mlx5_core_dev *mdev;
1529         u8 mdev_port_num;
1530         int err;
1531
1532         mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
1533         if (!mdev)
1534                 return -ENODEV;
1535
1536         err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx);
1537         if (err)
1538                 goto out;
1539
1540         if (~ctx.cap_mask1_perm & mask) {
1541                 mlx5_ib_warn(dev, "trying to change bitmask 0x%X, but only 0x%X is supported\n",
1542                              mask, ctx.cap_mask1_perm);
1543                 err = -EINVAL;
1544                 goto out;
1545         }
1546
1547         ctx.cap_mask1 = value;
1548         ctx.cap_mask1_perm = mask;
1549         err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num,
1550                                                  0, &ctx);
1551
1552 out:
1553         mlx5_ib_put_native_port_mdev(dev, port_num);
1554
1555         return err;
1556 }
1557
1558 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1559                                struct ib_port_modify *props)
1560 {
1561         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1562         struct ib_port_attr attr;
1563         u32 tmp;
1564         int err;
1565         u32 change_mask;
1566         u32 value;
1567         bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
1568                       IB_LINK_LAYER_INFINIBAND);
1569
1570         /* The CM layer calls ib_modify_port() regardless of the link layer. For
1571          * Ethernet ports, qkey violations and port capabilities are meaningless.
1572          */
1573         if (!is_ib)
1574                 return 0;
1575
1576         if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
1577                 change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
1578                 value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
1579                 return set_port_caps_atomic(dev, port, change_mask, value);
1580         }
1581
1582         mutex_lock(&dev->cap_mask_mutex);
1583
1584         err = ib_query_port(ibdev, port, &attr);
1585         if (err)
1586                 goto out;
1587
1588         tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1589                 ~props->clr_port_cap_mask;
1590
1591         err = mlx5_set_port_caps(dev->mdev, port, tmp);
1592
1593 out:
1594         mutex_unlock(&dev->cap_mask_mutex);
1595         return err;
1596 }
1597
1598 static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1599 {
1600         mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1601                     caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1602 }
1603
1604 static u16 calc_dynamic_bfregs(int uars_per_sys_page)
1605 {
1606         /* A large system page without 4K UAR support might limit the dynamic size */
1607         if (uars_per_sys_page == 1  && PAGE_SIZE > 4096)
1608                 return MLX5_MIN_DYN_BFREGS;
1609
1610         return MLX5_MAX_DYN_BFREGS;
1611 }
1612
1613 static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1614                              struct mlx5_ib_alloc_ucontext_req_v2 *req,
1615                              struct mlx5_bfreg_info *bfregi)
1616 {
1617         int uars_per_sys_page;
1618         int bfregs_per_sys_page;
1619         int ref_bfregs = req->total_num_bfregs;
1620
1621         if (req->total_num_bfregs == 0)
1622                 return -EINVAL;
1623
1624         BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1625         BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1626
1627         if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1628                 return -ENOMEM;
1629
1630         uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1631         bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1632         /* This holds the static allocation requested by the user */
1633         req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1634         if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1635                 return -EINVAL;
1636
1637         bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1638         bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
1639         bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
1640         bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
1641
1642         mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
1643                     MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1644                     lib_uar_4k ? "yes" : "no", ref_bfregs,
1645                     req->total_num_bfregs, bfregi->total_num_bfregs,
1646                     bfregi->num_sys_pages);
1647
1648         return 0;
1649 }
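/*
 * Editor's note: a worked example of the accounting above, assuming
 * MLX5_NON_FP_BFREGS_PER_UAR is 2 and get_uars_per_sys_page() returns 4
 * for this configuration (both values are assumptions here, not restated
 * from this file):
 *
 *	bfregs_per_sys_page   = 4 * 2 = 8
 *	req->total_num_bfregs = ALIGN(10, 8) = 16
 *	num_static_sys_pages  = 16 / 8 = 2
 *
 * The dynamic bfregs from calc_dynamic_bfregs() are aligned to the same
 * granularity and appended, so num_sys_pages covers both the statically
 * allocated pages and the ones reserved for later dynamic allocation.
 */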
1650
1651 static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1652 {
1653         struct mlx5_bfreg_info *bfregi;
1654         int err;
1655         int i;
1656
1657         bfregi = &context->bfregi;
1658         for (i = 0; i < bfregi->num_static_sys_pages; i++) {
1659                 err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]);
1660                 if (err)
1661                         goto error;
1662
1663                 mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1664         }
1665
1666         for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
1667                 bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
1668
1669         return 0;
1670
1671 error:
1672         for (--i; i >= 0; i--)
1673                 if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]))
1674                         mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1675
1676         return err;
1677 }
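/*
 * Editor's note: only the static system pages receive a UAR up front;
 * the remaining entries are left as MLX5_IB_INVALID_UAR_INDEX and are
 * filled in lazily by uar_mmap() when userspace maps a dynamic UAR via
 * MLX5_IB_MMAP_ALLOC_WC (see further below).
 */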
1678
1679 static void deallocate_uars(struct mlx5_ib_dev *dev,
1680                             struct mlx5_ib_ucontext *context)
1681 {
1682         struct mlx5_bfreg_info *bfregi;
1683         int i;
1684
1685         bfregi = &context->bfregi;
1686         for (i = 0; i < bfregi->num_sys_pages; i++)
1687                 if (i < bfregi->num_static_sys_pages ||
1688                     bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX)
1689                         mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]);
1690 }
1691
1692 int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1693 {
1694         int err = 0;
1695
1696         mutex_lock(&dev->lb.mutex);
1697         if (td)
1698                 dev->lb.user_td++;
1699         if (qp)
1700                 dev->lb.qps++;
1701
1702         if (dev->lb.user_td == 2 ||
1703             dev->lb.qps == 1) {
1704                 if (!dev->lb.enabled) {
1705                         err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
1706                         dev->lb.enabled = true;
1707                 }
1708         }
1709
1710         mutex_unlock(&dev->lb.mutex);
1711
1712         return err;
1713 }
1714
1715 void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1716 {
1717         mutex_lock(&dev->lb.mutex);
1718         if (td)
1719                 dev->lb.user_td--;
1720         if (qp)
1721                 dev->lb.qps--;
1722
1723         if (dev->lb.user_td == 1 &&
1724             dev->lb.qps == 0) {
1725                 if (dev->lb.enabled) {
1726                         mlx5_nic_vport_update_local_lb(dev->mdev, false);
1727                         dev->lb.enabled = false;
1728                 }
1729         }
1730
1731         mutex_unlock(&dev->lb.mutex);
1732 }
1733
1734 static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
1735                                           u16 uid)
1736 {
1737         int err;
1738
1739         if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1740                 return 0;
1741
1742         err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid);
1743         if (err)
1744                 return err;
1745
1746         if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1747             (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1748              !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1749                 return err;
1750
1751         return mlx5_ib_enable_lb(dev, true, false);
1752 }
1753
1754 static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
1755                                              u16 uid)
1756 {
1757         if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1758                 return;
1759
1760         mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid);
1761
1762         if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1763             (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1764              !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1765                 return;
1766
1767         mlx5_ib_disable_lb(dev, true, false);
1768 }
1769
1770 static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
1771                                   struct ib_udata *udata)
1772 {
1773         struct ib_device *ibdev = uctx->device;
1774         struct mlx5_ib_dev *dev = to_mdev(ibdev);
1775         struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1776         struct mlx5_ib_alloc_ucontext_resp resp = {};
1777         struct mlx5_core_dev *mdev = dev->mdev;
1778         struct mlx5_ib_ucontext *context = to_mucontext(uctx);
1779         struct mlx5_bfreg_info *bfregi;
1780         int ver;
1781         int err;
1782         size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1783                                      max_cqe_version);
1784         u32 dump_fill_mkey;
1785         bool lib_uar_4k;
1786
1787         if (!dev->ib_active)
1788                 return -EAGAIN;
1789
1790         if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1791                 ver = 0;
1792         else if (udata->inlen >= min_req_v2)
1793                 ver = 2;
1794         else
1795                 return -EINVAL;
1796
1797         err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1798         if (err)
1799                 return err;
1800
1801         if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
1802                 return -EOPNOTSUPP;
1803
1804         if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1805                 return -EOPNOTSUPP;
1806
1807         req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1808                                     MLX5_NON_FP_BFREGS_PER_UAR);
1809         if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1810                 return -EINVAL;
1811
1812         resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1813         if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1814                 resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1815         resp.cache_line_size = cache_line_size();
1816         resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1817         resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1818         resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1819         resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1820         resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1821         resp.cqe_version = min_t(__u8,
1822                                  (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1823                                  req.max_cqe_version);
1824         resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1825                                 MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1826         resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1827                                         MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
1828         resp.response_length = min(offsetof(typeof(resp), response_length) +
1829                                    sizeof(resp.response_length), udata->outlen);
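        /*
         * Editor's note: resp.response_length only grows up to what
         * udata->outlen can hold (see the field_avail() checks below),
         * so older userspace passing a smaller response buffer simply
         * gets a shorter, still-valid response from ib_copy_to_udata().
         */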
1830
1831         if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) {
1832                 if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS))
1833                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
1834                 if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
1835                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
1836                 if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
1837                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
1838                 if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
1839                         resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
1840                 /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
1841         }
1842
1843         lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
1844         bfregi = &context->bfregi;
1845
1846         /* updates req->total_num_bfregs */
1847         err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
1848         if (err)
1849                 goto out_ctx;
1850
1851         mutex_init(&bfregi->lock);
1852         bfregi->lib_uar_4k = lib_uar_4k;
1853         bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
1854                                 GFP_KERNEL);
1855         if (!bfregi->count) {
1856                 err = -ENOMEM;
1857                 goto out_ctx;
1858         }
1859
1860         bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
1861                                     sizeof(*bfregi->sys_pages),
1862                                     GFP_KERNEL);
1863         if (!bfregi->sys_pages) {
1864                 err = -ENOMEM;
1865                 goto out_count;
1866         }
1867
1868         err = allocate_uars(dev, context);
1869         if (err)
1870                 goto out_sys_pages;
1871
1872         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
1873                 err = mlx5_ib_devx_create(dev, true);
1874                 if (err < 0)
1875                         goto out_uars;
1876                 context->devx_uid = err;
1877         }
1878
1879         err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
1880                                              context->devx_uid);
1881         if (err)
1882                 goto out_devx;
1883
1884         if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1885                 err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey);
1886                 if (err)
1887                         goto out_mdev;
1888         }
1889
1890         INIT_LIST_HEAD(&context->db_page_list);
1891         mutex_init(&context->db_page_mutex);
1892
1893         resp.tot_bfregs = req.total_num_bfregs;
1894         resp.num_ports = dev->num_ports;
1895
1896         if (field_avail(typeof(resp), cqe_version, udata->outlen))
1897                 resp.response_length += sizeof(resp.cqe_version);
1898
1899         if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1900                 resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1901                                       MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1902                 resp.response_length += sizeof(resp.cmds_supp_uhw);
1903         }
1904
1905         if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
1906                 if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
1907                         mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
1908                         resp.eth_min_inline++;
1909                 }
1910                 resp.response_length += sizeof(resp.eth_min_inline);
1911         }
1912
1913         if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) {
1914                 if (mdev->clock_info)
1915                         resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
1916                 resp.response_length += sizeof(resp.clock_info_versions);
1917         }
1918
1919         /*
1920          * We don't want to expose information from the PCI BAR that is located
1921          * after 4096 bytes, so if the arch only supports larger pages, let's
1922          * pretend we don't support reading the HCA's core clock. This is also
1923          * enforced by the mmap function.
1924          */
1925         if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
1926                 if (PAGE_SIZE <= 4096) {
1927                         resp.comp_mask |=
1928                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1929                         resp.hca_core_clock_offset =
1930                                 offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
1931                 }
1932                 resp.response_length += sizeof(resp.hca_core_clock_offset);
1933         }
1934
1935         if (field_avail(typeof(resp), log_uar_size, udata->outlen))
1936                 resp.response_length += sizeof(resp.log_uar_size);
1937
1938         if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
1939                 resp.response_length += sizeof(resp.num_uars_per_page);
1940
1941         if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) {
1942                 resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
1943                 resp.response_length += sizeof(resp.num_dyn_bfregs);
1944         }
1945
1946         if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) {
1947                 if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1948                         resp.dump_fill_mkey = dump_fill_mkey;
1949                         resp.comp_mask |=
1950                                 MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
1951                 }
1952                 resp.response_length += sizeof(resp.dump_fill_mkey);
1953         }
1954
1955         err = ib_copy_to_udata(udata, &resp, resp.response_length);
1956         if (err)
1957                 goto out_mdev;
1958
1959         bfregi->ver = ver;
1960         bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
1961         context->cqe_version = resp.cqe_version;
1962         context->lib_caps = req.lib_caps;
1963         print_lib_caps(dev, context->lib_caps);
1964
1965         if (dev->lag_active) {
1966                 u8 port = mlx5_core_native_port_num(dev->mdev) - 1;
1967
1968                 atomic_set(&context->tx_port_affinity,
1969                            atomic_add_return(
1970                                    1, &dev->port[port].roce.tx_port_affinity));
1971         }
1972
1973         return 0;
1974
1975 out_mdev:
1976         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
1977 out_devx:
1978         if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
1979                 mlx5_ib_devx_destroy(dev, context->devx_uid);
1980
1981 out_uars:
1982         deallocate_uars(dev, context);
1983
1984 out_sys_pages:
1985         kfree(bfregi->sys_pages);
1986
1987 out_count:
1988         kfree(bfregi->count);
1989
1990 out_ctx:
1991         return err;
1992 }
1993
1994 static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1995 {
1996         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1997         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1998         struct mlx5_bfreg_info *bfregi;
1999
2000         bfregi = &context->bfregi;
2001         mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
2002
2003         if (context->devx_uid)
2004                 mlx5_ib_devx_destroy(dev, context->devx_uid);
2005
2006         deallocate_uars(dev, context);
2007         kfree(bfregi->sys_pages);
2008         kfree(bfregi->count);
2009 }
2010
2011 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
2012                                  int uar_idx)
2013 {
2014         int fw_uars_per_page;
2015
2016         fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
2017
2018         return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
2019 }
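/*
 * Editor's note: assuming MLX5_UARS_IN_PAGE is PAGE_SIZE divided by the
 * 4K MLX5_ADAPTER_PAGE_SIZE (an assumption about the core driver
 * headers), a 64K-page system with uar_4k set has fw_uars_per_page of
 * 16, so UAR indices 0..15 all resolve to the first system page of the
 * BAR, 16..31 to the second, and so on.
 */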
2020
2021 static int get_command(unsigned long offset)
2022 {
2023         return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
2024 }
2025
2026 static int get_arg(unsigned long offset)
2027 {
2028         return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
2029 }
2030
2031 static int get_index(unsigned long offset)
2032 {
2033         return get_arg(offset);
2034 }
2035
2036 /* The index resides in an extra byte to allow values larger than 255 */
2037 static int get_extended_index(unsigned long offset)
2038 {
2039         return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
2040 }
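/*
 * Editor's note: a minimal sketch of how userspace is expected to pack
 * vma->vm_pgoff for the helpers above, assuming MLX5_IB_MMAP_CMD_SHIFT
 * is 8 and MLX5_IB_MMAP_CMD_MASK is 0xff (their values in mlx5_ib.h are
 * an assumption here, not restated in this file):
 *
 *	pgoff = (index & 0xff) |
 *		(command << MLX5_IB_MMAP_CMD_SHIFT) |
 *		(((index >> 8) & 0xff) << 16);
 *
 * and the byte offset handed to mmap() is then pgoff << PAGE_SHIFT.
 * get_command(), get_index() and get_extended_index() simply undo this
 * packing: bits 0..7 and 16..23 carry the index, bits 8..15 the command.
 */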
2041
2042
2043 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
2044 {
2045 }
2046
2047 static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
2048 {
2049         switch (cmd) {
2050         case MLX5_IB_MMAP_WC_PAGE:
2051                 return "WC";
2052         case MLX5_IB_MMAP_REGULAR_PAGE:
2053                 return "best effort WC";
2054         case MLX5_IB_MMAP_NC_PAGE:
2055                 return "NC";
2056         case MLX5_IB_MMAP_DEVICE_MEM:
2057                 return "Device Memory";
2058         default:
2059                 return NULL;
2060         }
2061 }
2062
2063 static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
2064                                         struct vm_area_struct *vma,
2065                                         struct mlx5_ib_ucontext *context)
2066 {
2067         if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
2068             !(vma->vm_flags & VM_SHARED))
2069                 return -EINVAL;
2070
2071         if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
2072                 return -EOPNOTSUPP;
2073
2074         if (vma->vm_flags & (VM_WRITE | VM_EXEC))
2075                 return -EPERM;
2076         vma->vm_flags &= ~VM_MAYWRITE;
2077
2078         if (!dev->mdev->clock_info)
2079                 return -EOPNOTSUPP;
2080
2081         return vm_insert_page(vma, vma->vm_start,
2082                               virt_to_page(dev->mdev->clock_info));
2083 }
2084
2085 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
2086                     struct vm_area_struct *vma,
2087                     struct mlx5_ib_ucontext *context)
2088 {
2089         struct mlx5_bfreg_info *bfregi = &context->bfregi;
2090         int err;
2091         unsigned long idx;
2092         phys_addr_t pfn;
2093         pgprot_t prot;
2094         u32 bfreg_dyn_idx = 0;
2095         u32 uar_index;
2096         int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
2097         int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
2098                                 bfregi->num_static_sys_pages;
2099
2100         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2101                 return -EINVAL;
2102
2103         if (dyn_uar)
2104                 idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
2105         else
2106                 idx = get_index(vma->vm_pgoff);
2107
2108         if (idx >= max_valid_idx) {
2109                 mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
2110                              idx, max_valid_idx);
2111                 return -EINVAL;
2112         }
2113
2114         switch (cmd) {
2115         case MLX5_IB_MMAP_WC_PAGE:
2116         case MLX5_IB_MMAP_ALLOC_WC:
2117 /* Some architectures don't support WC memory */
2118 #if defined(CONFIG_X86)
2119                 if (!pat_enabled())
2120                         return -EPERM;
2121 #elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
2122                         return -EPERM;
2123 #endif
2124         /* fall through */
2125         case MLX5_IB_MMAP_REGULAR_PAGE:
2126                 /* For MLX5_IB_MMAP_REGULAR_PAGE make a best effort to get WC */
2127                 prot = pgprot_writecombine(vma->vm_page_prot);
2128                 break;
2129         case MLX5_IB_MMAP_NC_PAGE:
2130                 prot = pgprot_noncached(vma->vm_page_prot);
2131                 break;
2132         default:
2133                 return -EINVAL;
2134         }
2135
2136         if (dyn_uar) {
2137                 int uars_per_page;
2138
2139                 uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
2140                 bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
2141                 if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
2142                         mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
2143                                      bfreg_dyn_idx, bfregi->total_num_bfregs);
2144                         return -EINVAL;
2145                 }
2146
2147                 mutex_lock(&bfregi->lock);
2148                 /* Fail if the UAR is already allocated; the first bfreg index
2149                  * of each page holds its count.
2150                  */
2151                 if (bfregi->count[bfreg_dyn_idx]) {
2152                         mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
2153                         mutex_unlock(&bfregi->lock);
2154                         return -EINVAL;
2155                 }
2156
2157                 bfregi->count[bfreg_dyn_idx]++;
2158                 mutex_unlock(&bfregi->lock);
2159
2160                 err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
2161                 if (err) {
2162                         mlx5_ib_warn(dev, "UAR alloc failed\n");
2163                         goto free_bfreg;
2164                 }
2165         } else {
2166                 uar_index = bfregi->sys_pages[idx];
2167         }
2168
2169         pfn = uar_index2pfn(dev, uar_index);
2170         mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
2171
2172         err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
2173                                 prot);
2174         if (err) {
2175                 mlx5_ib_err(dev,
2176                             "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
2177                             err, mmap_cmd2str(cmd));
2178                 goto err;
2179         }
2180
2181         if (dyn_uar)
2182                 bfregi->sys_pages[idx] = uar_index;
2183         return 0;
2184
2185 err:
2186         if (!dyn_uar)
2187                 return err;
2188
2189         mlx5_cmd_free_uar(dev->mdev, uar_index);
2190
2191 free_bfreg:
2192         mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
2193
2194         return err;
2195 }
2196
2197 static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
2198 {
2199         struct mlx5_ib_ucontext *mctx = to_mucontext(context);
2200         struct mlx5_ib_dev *dev = to_mdev(context->device);
2201         u16 page_idx = get_extended_index(vma->vm_pgoff);
2202         size_t map_size = vma->vm_end - vma->vm_start;
2203         u32 npages = map_size >> PAGE_SHIFT;
2204         phys_addr_t pfn;
2205
2206         if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) !=
2207             page_idx + npages)
2208                 return -EINVAL;
2209
2210         pfn = ((dev->mdev->bar_addr +
2211               MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >>
2212               PAGE_SHIFT) +
2213               page_idx;
2214         return rdma_user_mmap_io(context, vma, pfn, map_size,
2215                                  pgprot_writecombine(vma->vm_page_prot));
2216 }
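/*
 * Editor's note: find_next_zero_bit() in dm_mmap() returns
 * page_idx + npages only when every bit in [page_idx, page_idx + npages)
 * is set, i.e. the whole range being mapped was allocated earlier by
 * handle_alloc_dm_memic(), which sets these bits (they are cleared again
 * on dealloc).
 */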
2217
2218 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
2219 {
2220         struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2221         struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2222         unsigned long command;
2223         phys_addr_t pfn;
2224
2225         command = get_command(vma->vm_pgoff);
2226         switch (command) {
2227         case MLX5_IB_MMAP_WC_PAGE:
2228         case MLX5_IB_MMAP_NC_PAGE:
2229         case MLX5_IB_MMAP_REGULAR_PAGE:
2230         case MLX5_IB_MMAP_ALLOC_WC:
2231                 return uar_mmap(dev, command, vma, context);
2232
2233         case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
2234                 return -ENOSYS;
2235
2236         case MLX5_IB_MMAP_CORE_CLOCK:
2237                 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2238                         return -EINVAL;
2239
2240                 if (vma->vm_flags & VM_WRITE)
2241                         return -EPERM;
2242                 vma->vm_flags &= ~VM_MAYWRITE;
2243
2244                 /* Don't expose information to user space that it shouldn't have */
2245                 if (PAGE_SIZE > 4096)
2246                         return -EOPNOTSUPP;
2247
2248                 pfn = (dev->mdev->iseg_base +
2249                        offsetof(struct mlx5_init_seg, internal_timer_h)) >>
2250                         PAGE_SHIFT;
2251                 return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
2252                                          PAGE_SIZE,
2253                                          pgprot_noncached(vma->vm_page_prot));
2254         case MLX5_IB_MMAP_CLOCK_INFO:
2255                 return mlx5_ib_mmap_clock_info_page(dev, vma, context);
2256
2257         case MLX5_IB_MMAP_DEVICE_MEM:
2258                 return dm_mmap(ibcontext, vma);
2259
2260         default:
2261                 return -EINVAL;
2262         }
2263
2264         return 0;
2265 }
2266
2267 static inline int check_dm_type_support(struct mlx5_ib_dev *dev,
2268                                         u32 type)
2269 {
2270         switch (type) {
2271         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2272                 if (!MLX5_CAP_DEV_MEM(dev->mdev, memic))
2273                         return -EOPNOTSUPP;
2274                 break;
2275         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2276         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2277                 if (!capable(CAP_SYS_RAWIO) ||
2278                     !capable(CAP_NET_RAW))
2279                         return -EPERM;
2280
2281                 if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
2282                       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner)))
2283                         return -EOPNOTSUPP;
2284                 break;
2285         }
2286
2287         return 0;
2288 }
2289
2290 static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
2291                                  struct mlx5_ib_dm *dm,
2292                                  struct ib_dm_alloc_attr *attr,
2293                                  struct uverbs_attr_bundle *attrs)
2294 {
2295         struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
2296         u64 start_offset;
2297         u32 page_idx;
2298         int err;
2299
2300         dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
2301
2302         err = mlx5_cmd_alloc_memic(dm_db, &dm->dev_addr,
2303                                    dm->size, attr->alignment);
2304         if (err)
2305                 return err;
2306
2307         page_idx = (dm->dev_addr - pci_resource_start(dm_db->dev->pdev, 0) -
2308                     MLX5_CAP64_DEV_MEM(dm_db->dev, memic_bar_start_addr)) >>
2309                     PAGE_SHIFT;
2310
2311         err = uverbs_copy_to(attrs,
2312                              MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
2313                              &page_idx, sizeof(page_idx));
2314         if (err)
2315                 goto err_dealloc;
2316
2317         start_offset = dm->dev_addr & ~PAGE_MASK;
2318         err = uverbs_copy_to(attrs,
2319                              MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2320                              &start_offset, sizeof(start_offset));
2321         if (err)
2322                 goto err_dealloc;
2323
2324         bitmap_set(to_mucontext(ctx)->dm_pages, page_idx,
2325                    DIV_ROUND_UP(dm->size, PAGE_SIZE));
2326
2327         return 0;
2328
2329 err_dealloc:
2330         mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
2331
2332         return err;
2333 }
2334
2335 static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
2336                                   struct mlx5_ib_dm *dm,
2337                                   struct ib_dm_alloc_attr *attr,
2338                                   struct uverbs_attr_bundle *attrs,
2339                                   int type)
2340 {
2341         struct mlx5_core_dev *dev = to_mdev(ctx->device)->mdev;
2342         u64 act_size;
2343         int err;
2344
2345         /* Allocation size must be a multiple of the basic block size
2346          * and a power of 2.
2347          */
2348         act_size = round_up(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dev));
2349         act_size = roundup_pow_of_two(act_size);
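        /*
         * Editor's note: e.g. with a hypothetical block size of 4096
         * bytes and attr->length of 9000, round_up() yields 12288 and
         * roundup_pow_of_two() then raises that to 16384.
         */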
2350
2351         dm->size = act_size;
2352         err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
2353                                    to_mucontext(ctx)->devx_uid, &dm->dev_addr,
2354                                    &dm->icm_dm.obj_id);
2355         if (err)
2356                 return err;
2357
2358         err = uverbs_copy_to(attrs,
2359                              MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
2360                              &dm->dev_addr, sizeof(dm->dev_addr));
2361         if (err)
2362                 mlx5_dm_sw_icm_dealloc(dev, type, dm->size,
2363                                        to_mucontext(ctx)->devx_uid, dm->dev_addr,
2364                                        dm->icm_dm.obj_id);
2365
2366         return err;
2367 }
2368
2369 struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
2370                                struct ib_ucontext *context,
2371                                struct ib_dm_alloc_attr *attr,
2372                                struct uverbs_attr_bundle *attrs)
2373 {
2374         struct mlx5_ib_dm *dm;
2375         enum mlx5_ib_uapi_dm_type type;
2376         int err;
2377
2378         err = uverbs_get_const_default(&type, attrs,
2379                                        MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
2380                                        MLX5_IB_UAPI_DM_TYPE_MEMIC);
2381         if (err)
2382                 return ERR_PTR(err);
2383
2384         mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n",
2385                     type, attr->length, attr->alignment);
2386
2387         err = check_dm_type_support(to_mdev(ibdev), type);
2388         if (err)
2389                 return ERR_PTR(err);
2390
2391         dm = kzalloc(sizeof(*dm), GFP_KERNEL);
2392         if (!dm)
2393                 return ERR_PTR(-ENOMEM);
2394
2395         dm->type = type;
2396
2397         switch (type) {
2398         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2399                 err = handle_alloc_dm_memic(context, dm,
2400                                             attr,
2401                                             attrs);
2402                 break;
2403         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2404                 err = handle_alloc_dm_sw_icm(context, dm,
2405                                              attr, attrs,
2406                                              MLX5_SW_ICM_TYPE_STEERING);
2407                 break;
2408         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2409                 err = handle_alloc_dm_sw_icm(context, dm,
2410                                              attr, attrs,
2411                                              MLX5_SW_ICM_TYPE_HEADER_MODIFY);
2412                 break;
2413         default:
2414                 err = -EOPNOTSUPP;
2415         }
2416
2417         if (err)
2418                 goto err_free;
2419
2420         return &dm->ibdm;
2421
2422 err_free:
2423         kfree(dm);
2424         return ERR_PTR(err);
2425 }
2426
2427 int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
2428 {
2429         struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
2430                 &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
2431         struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
2432         struct mlx5_dm *dm_db = &to_mdev(ibdm->device)->dm;
2433         struct mlx5_ib_dm *dm = to_mdm(ibdm);
2434         u32 page_idx;
2435         int ret;
2436
2437         switch (dm->type) {
2438         case MLX5_IB_UAPI_DM_TYPE_MEMIC:
2439                 ret = mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
2440                 if (ret)
2441                         return ret;
2442
2443                 page_idx = (dm->dev_addr - pci_resource_start(dev->pdev, 0) -
2444                             MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr)) >>
2445                             PAGE_SHIFT;
2446                 bitmap_clear(ctx->dm_pages, page_idx,
2447                              DIV_ROUND_UP(dm->size, PAGE_SIZE));
2448                 break;
2449         case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
2450                 ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
2451                                              dm->size, ctx->devx_uid, dm->dev_addr,
2452                                              dm->icm_dm.obj_id);
2453                 if (ret)
2454                         return ret;
2455                 break;
2456         case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
2457                 ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_HEADER_MODIFY,
2458                                              dm->size, ctx->devx_uid, dm->dev_addr,
2459                                              dm->icm_dm.obj_id);
2460                 if (ret)
2461                         return ret;
2462                 break;
2463         default:
2464                 return -EOPNOTSUPP;
2465         }
2466
2467         kfree(dm);
2468
2469         return 0;
2470 }
2471
2472 static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
2473 {
2474         struct mlx5_ib_pd *pd = to_mpd(ibpd);
2475         struct ib_device *ibdev = ibpd->device;
2476         struct mlx5_ib_alloc_pd_resp resp;
2477         int err;
2478         u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
2479         u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
2480         u16 uid = 0;
2481         struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
2482                 udata, struct mlx5_ib_ucontext, ibucontext);
2483
2484         uid = context ? context->devx_uid : 0;
2485         MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
2486         MLX5_SET(alloc_pd_in, in, uid, uid);
2487         err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
2488                             out, sizeof(out));
2489         if (err)
2490                 return err;
2491
2492         pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
2493         pd->uid = uid;
2494         if (udata) {
2495                 resp.pdn = pd->pdn;
2496                 if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
2497                         mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
2498                         return -EFAULT;
2499                 }
2500         }
2501
2502         return 0;
2503 }
2504
2505 static void mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
2506 {
2507         struct mlx5_ib_dev *mdev = to_mdev(pd->device);
2508         struct mlx5_ib_pd *mpd = to_mpd(pd);
2509
2510         mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
2511 }
2512
2513 enum {
2514         MATCH_CRITERIA_ENABLE_OUTER_BIT,
2515         MATCH_CRITERIA_ENABLE_MISC_BIT,
2516         MATCH_CRITERIA_ENABLE_INNER_BIT,
2517         MATCH_CRITERIA_ENABLE_MISC2_BIT
2518 };
2519
2520 #define HEADER_IS_ZERO(match_criteria, headers)                            \
2521         !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
2522                     0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
2523
2524 static u8 get_match_criteria_enable(u32 *match_criteria)
2525 {
2526         u8 match_criteria_enable;
2527
2528         match_criteria_enable =
2529                 (!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
2530                 MATCH_CRITERIA_ENABLE_OUTER_BIT;
2531         match_criteria_enable |=
2532                 (!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
2533                 MATCH_CRITERIA_ENABLE_MISC_BIT;
2534         match_criteria_enable |=
2535                 (!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
2536                 MATCH_CRITERIA_ENABLE_INNER_BIT;
2537         match_criteria_enable |=
2538                 (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
2539                 MATCH_CRITERIA_ENABLE_MISC2_BIT;
2540
2541         return match_criteria_enable;
2542 }
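/*
 * Editor's note: a small example of the bitmask built above: if only the
 * outer headers and the first misc parameters carry a non-zero mask, the
 * result is
 *
 *	BIT(MATCH_CRITERIA_ENABLE_OUTER_BIT) |
 *	BIT(MATCH_CRITERIA_ENABLE_MISC_BIT) = 0x3
 *
 * telling the device which sections of the match criteria are relevant.
 */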
2543
2544 static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
2545 {
2546         u8 entry_mask;
2547         u8 entry_val;
2548         int err = 0;
2549
2550         if (!mask)
2551                 goto out;
2552
2553         entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
2554                               ip_protocol);
2555         entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
2556                              ip_protocol);
2557         if (!entry_mask) {
2558                 MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
2559                 MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
2560                 goto out;
2561         }
2562         /* Don't override existing ip protocol */
2563         if (mask != entry_mask || val != entry_val)
2564                 err = -EINVAL;
2565 out:
2566         return err;
2567 }
2568
2569 static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
2570                            bool inner)
2571 {
2572         if (inner) {
2573                 MLX5_SET(fte_match_set_misc,
2574                          misc_c, inner_ipv6_flow_label, mask);
2575                 MLX5_SET(fte_match_set_misc,
2576                          misc_v, inner_ipv6_flow_label, val);
2577         } else {
2578                 MLX5_SET(fte_match_set_misc,
2579                          misc_c, outer_ipv6_flow_label, mask);
2580                 MLX5_SET(fte_match_set_misc,
2581                          misc_v, outer_ipv6_flow_label, val);
2582         }
2583 }
2584
2585 static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
2586 {
2587         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
2588         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
2589         MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
2590         MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
2591 }
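/*
 * Editor's note: the IPv4 TOS byte carries ECN in its two low bits and
 * DSCP in the upper six, which is why set_tos() shifts mask and value by
 * 2 for the ip_dscp fields; e.g. tos 0xb8 splits into dscp 0x2e (EF) and
 * ecn 0.
 */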
2592
2593 static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask)
2594 {
2595         if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) &&
2596             !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL))
2597                 return -EOPNOTSUPP;
2598
2599         if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) &&
2600             !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP))
2601                 return -EOPNOTSUPP;
2602
2603         if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) &&
2604             !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS))
2605                 return -EOPNOTSUPP;
2606
2607         if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) &&
2608             !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL))
2609                 return -EOPNOTSUPP;
2610
2611         return 0;
2612 }
2613
2614 #define LAST_ETH_FIELD vlan_tag
2615 #define LAST_IB_FIELD sl
2616 #define LAST_IPV4_FIELD tos
2617 #define LAST_IPV6_FIELD traffic_class
2618 #define LAST_TCP_UDP_FIELD src_port
2619 #define LAST_TUNNEL_FIELD tunnel_id
2620 #define LAST_FLOW_TAG_FIELD tag_id
2621 #define LAST_DROP_FIELD size
2622 #define LAST_COUNTERS_FIELD counters
2623
2624 /* 'field' is the last supported field in the filter */
2625 #define FIELDS_NOT_SUPPORTED(filter, field)\
2626         memchr_inv((void *)&filter.field  +\
2627                    sizeof(filter.field), 0,\
2628                    sizeof(filter) -\
2629                    offsetof(typeof(filter), field) -\
2630                    sizeof(filter.field))
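/*
 * Editor's note: e.g. FIELDS_NOT_SUPPORTED(ib_spec->eth.mask,
 * LAST_ETH_FIELD) is non-zero whenever any mask byte beyond vlan_tag is
 * set, i.e. userspace asked to match on an ETH field this parser does
 * not handle, in which case the caller below returns -EOPNOTSUPP.
 */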
2631
2632 int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
2633                            bool is_egress,
2634                            struct mlx5_flow_act *action)
2635 {
2636
2637         switch (maction->ib_action.type) {
2638         case IB_FLOW_ACTION_ESP:
2639                 if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
2640                                       MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
2641                         return -EINVAL;
2642                 /* Currently only AES_GCM keymat is supported by the driver */
2643                 action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
2644                 action->action |= is_egress ?
2645                         MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
2646                         MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
2647                 return 0;
2648         case IB_FLOW_ACTION_UNSPECIFIED:
2649                 if (maction->flow_action_raw.sub_type ==
2650                     MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
2651                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
2652                                 return -EINVAL;
2653                         action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
2654                         action->modify_hdr =
2655                                 maction->flow_action_raw.modify_hdr;
2656                         return 0;
2657                 }
2658                 if (maction->flow_action_raw.sub_type ==
2659                     MLX5_IB_FLOW_ACTION_DECAP) {
2660                         if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
2661                                 return -EINVAL;
2662                         action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
2663                         return 0;
2664                 }
2665                 if (maction->flow_action_raw.sub_type ==
2666                     MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
2667                         if (action->action &
2668                             MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
2669                                 return -EINVAL;
2670                         action->action |=
2671                                 MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
2672                         action->pkt_reformat =
2673                                 maction->flow_action_raw.pkt_reformat;
2674                         return 0;
2675                 }
2676                 /* fall through */
2677         default:
2678                 return -EOPNOTSUPP;
2679         }
2680 }
2681
2682 static int parse_flow_attr(struct mlx5_core_dev *mdev,
2683                            struct mlx5_flow_spec *spec,
2684                            const union ib_flow_spec *ib_spec,
2685                            const struct ib_flow_attr *flow_attr,
2686                            struct mlx5_flow_act *action, u32 prev_type)
2687 {
2688         struct mlx5_flow_context *flow_context = &spec->flow_context;
2689         u32 *match_c = spec->match_criteria;
2690         u32 *match_v = spec->match_value;
2691         void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
2692                                            misc_parameters);
2693         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
2694                                            misc_parameters);
2695         void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c,
2696                                             misc_parameters_2);
2697         void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v,
2698                                             misc_parameters_2);
2699         void *headers_c;
2700         void *headers_v;
2701         int match_ipv;
2702         int ret;
2703
2704         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
2705                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
2706                                          inner_headers);
2707                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
2708                                          inner_headers);
2709                 match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2710                                         ft_field_support.inner_ip_version);
2711         } else {
2712                 headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
2713                                          outer_headers);
2714                 headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
2715                                          outer_headers);
2716                 match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2717                                         ft_field_support.outer_ip_version);
2718         }
2719
2720         switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
2721         case IB_FLOW_SPEC_ETH:
2722                 if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
2723                         return -EOPNOTSUPP;
2724
2725                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2726                                              dmac_47_16),
2727                                 ib_spec->eth.mask.dst_mac);
2728                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2729                                              dmac_47_16),
2730                                 ib_spec->eth.val.dst_mac);
2731
2732                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2733                                              smac_47_16),
2734                                 ib_spec->eth.mask.src_mac);
2735                 ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2736                                              smac_47_16),
2737                                 ib_spec->eth.val.src_mac);
2738
2739                 if (ib_spec->eth.mask.vlan_tag) {
2740                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2741                                  cvlan_tag, 1);
2742                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2743                                  cvlan_tag, 1);
2744
2745                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2746                                  first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
2747                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2748                                  first_vid, ntohs(ib_spec->eth.val.vlan_tag));
2749
2750                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2751                                  first_cfi,
2752                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
2753                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2754                                  first_cfi,
2755                                  ntohs(ib_spec->eth.val.vlan_tag) >> 12);
2756
2757                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2758                                  first_prio,
2759                                  ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
2760                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2761                                  first_prio,
2762                                  ntohs(ib_spec->eth.val.vlan_tag) >> 13);
2763                 }
2764                 MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2765                          ethertype, ntohs(ib_spec->eth.mask.ether_type));
2766                 MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2767                          ethertype, ntohs(ib_spec->eth.val.ether_type));
2768                 break;
2769         case IB_FLOW_SPEC_IPV4:
2770                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
2771                         return -EOPNOTSUPP;
2772
2773                 if (match_ipv) {
2774                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2775                                  ip_version, 0xf);
2776                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2777                                  ip_version, MLX5_FS_IPV4_VERSION);
2778                 } else {
2779                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2780                                  ethertype, 0xffff);
2781                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2782                                  ethertype, ETH_P_IP);
2783                 }
2784
2785                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2786                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
2787                        &ib_spec->ipv4.mask.src_ip,
2788                        sizeof(ib_spec->ipv4.mask.src_ip));
2789                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2790                                     src_ipv4_src_ipv6.ipv4_layout.ipv4),
2791                        &ib_spec->ipv4.val.src_ip,
2792                        sizeof(ib_spec->ipv4.val.src_ip));
2793                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2794                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
2795                        &ib_spec->ipv4.mask.dst_ip,
2796                        sizeof(ib_spec->ipv4.mask.dst_ip));
2797                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2798                                     dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
2799                        &ib_spec->ipv4.val.dst_ip,
2800                        sizeof(ib_spec->ipv4.val.dst_ip));
2801
2802                 set_tos(headers_c, headers_v,
2803                         ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
2804
2805                 if (set_proto(headers_c, headers_v,
2806                               ib_spec->ipv4.mask.proto,
2807                               ib_spec->ipv4.val.proto))
2808                         return -EINVAL;
2809                 break;
2810         case IB_FLOW_SPEC_IPV6:
2811                 if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
2812                         return -EOPNOTSUPP;
2813
2814                 if (match_ipv) {
2815                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2816                                  ip_version, 0xf);
2817                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2818                                  ip_version, MLX5_FS_IPV6_VERSION);
2819                 } else {
2820                         MLX5_SET(fte_match_set_lyr_2_4, headers_c,
2821                                  ethertype, 0xffff);
2822                         MLX5_SET(fte_match_set_lyr_2_4, headers_v,
2823                                  ethertype, ETH_P_IPV6);
2824                 }
2825
2826                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2827                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
2828                        &ib_spec->ipv6.mask.src_ip,
2829                        sizeof(ib_spec->ipv6.mask.src_ip));
2830                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2831                                     src_ipv4_src_ipv6.ipv6_layout.ipv6),
2832                        &ib_spec->ipv6.val.src_ip,
2833                        sizeof(ib_spec->ipv6.val.src_ip));
2834                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
2835                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
2836                        &ib_spec->ipv6.mask.dst_ip,
2837                        sizeof(ib_spec->ipv6.mask.dst_ip));
2838                 memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
2839                                     dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
2840                        &ib_spec->ipv6.val.dst_ip,
2841                        sizeof(ib_spec->ipv6.val.dst_ip));
2842
2843                 set_tos(headers_c, headers_v,
2844                         ib_spec->ipv6.mask.traffic_class,
2845                         ib_spec->ipv6.val.traffic_class);
2846
2847                 if (set_proto(headers_c, headers_v,
2848                               ib_spec->ipv6.mask.next_hdr,
2849                               ib_spec->ipv6.val.next_hdr))
2850                         return -EINVAL;
2851
2852                 set_flow_label(misc_params_c, misc_params_v,
2853                                ntohl(ib_spec->ipv6.mask.flow_label),
2854                                ntohl(ib_spec->ipv6.val.flow_label),
2855                                ib_spec->type & IB_FLOW_SPEC_INNER);
2856                 break;
2857         case IB_FLOW_SPEC_ESP:
2858                 if (ib_spec->esp.mask.seq)
2859                         return -EOPNOTSUPP;
2860
2861                 MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi,
2862                          ntohl(ib_spec->esp.mask.spi));
2863                 MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
2864                          ntohl(ib_spec->esp.val.spi));
2865                 break;
2866         case IB_FLOW_SPEC_TCP:
2867                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
2868                                          LAST_TCP_UDP_FIELD))
2869                         return -EOPNOTSUPP;
2870
2871                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
2872                         return -EINVAL;
2873
2874                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
2875                          ntohs(ib_spec->tcp_udp.mask.src_port));
2876                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
2877                          ntohs(ib_spec->tcp_udp.val.src_port));
2878
2879                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
2880                          ntohs(ib_spec->tcp_udp.mask.dst_port));
2881                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
2882                          ntohs(ib_spec->tcp_udp.val.dst_port));
2883                 break;
2884         case IB_FLOW_SPEC_UDP:
2885                 if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
2886                                          LAST_TCP_UDP_FIELD))
2887                         return -EOPNOTSUPP;
2888
2889                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
2890                         return -EINVAL;
2891
2892                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
2893                          ntohs(ib_spec->tcp_udp.mask.src_port));
2894                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
2895                          ntohs(ib_spec->tcp_udp.val.src_port));
2896
2897                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
2898                          ntohs(ib_spec->tcp_udp.mask.dst_port));
2899                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
2900                          ntohs(ib_spec->tcp_udp.val.dst_port));
2901                 break;
2902         case IB_FLOW_SPEC_GRE:
2903                 if (ib_spec->gre.mask.c_ks_res0_ver)
2904                         return -EOPNOTSUPP;
2905
2906                 if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
2907                         return -EINVAL;
2908
2909                 MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
2910                          0xff);
2911                 MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
2912                          IPPROTO_GRE);
2913
2914                 MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol,
2915                          ntohs(ib_spec->gre.mask.protocol));
2916                 MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol,
2917                          ntohs(ib_spec->gre.val.protocol));
2918
2919                 memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
2920                                     gre_key.nvgre.hi),
2921                        &ib_spec->gre.mask.key,
2922                        sizeof(ib_spec->gre.mask.key));
2923                 memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
2924                                     gre_key.nvgre.hi),
2925                        &ib_spec->gre.val.key,
2926                        sizeof(ib_spec->gre.val.key));
2927                 break;
2928         case IB_FLOW_SPEC_MPLS:
2929                 switch (prev_type) {
2930                 case IB_FLOW_SPEC_UDP:
2931                         if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2932                                                    ft_field_support.outer_first_mpls_over_udp),
2933                                                    &ib_spec->mpls.mask.tag))
2934                                 return -EOPNOTSUPP;
2935
2936                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
2937                                             outer_first_mpls_over_udp),
2938                                &ib_spec->mpls.val.tag,
2939                                sizeof(ib_spec->mpls.val.tag));
2940                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
2941                                             outer_first_mpls_over_udp),
2942                                &ib_spec->mpls.mask.tag,
2943                                sizeof(ib_spec->mpls.mask.tag));
2944                         break;
2945                 case IB_FLOW_SPEC_GRE:
2946                         if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2947                                                    ft_field_support.outer_first_mpls_over_gre),
2948                                                    &ib_spec->mpls.mask.tag))
2949                                 return -EOPNOTSUPP;
2950
2951                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
2952                                             outer_first_mpls_over_gre),
2953                                &ib_spec->mpls.val.tag,
2954                                sizeof(ib_spec->mpls.val.tag));
2955                         memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
2956                                             outer_first_mpls_over_gre),
2957                                &ib_spec->mpls.mask.tag,
2958                                sizeof(ib_spec->mpls.mask.tag));
2959                         break;
2960                 default:
2961                         if (ib_spec->type & IB_FLOW_SPEC_INNER) {
2962                                 if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2963                                                            ft_field_support.inner_first_mpls),
2964                                                            &ib_spec->mpls.mask.tag))
2965                                         return -EOPNOTSUPP;
2966
2967                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
2968                                                     inner_first_mpls),
2969                                        &ib_spec->mpls.val.tag,
2970                                        sizeof(ib_spec->mpls.val.tag));
2971                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
2972                                                     inner_first_mpls),
2973                                        &ib_spec->mpls.mask.tag,
2974                                        sizeof(ib_spec->mpls.mask.tag));
2975                         } else {
2976                                 if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
2977                                                            ft_field_support.outer_first_mpls),
2978                                                            &ib_spec->mpls.mask.tag))
2979                                         return -EOPNOTSUPP;
2980
2981                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
2982                                                     outer_first_mpls),
2983                                        &ib_spec->mpls.val.tag,
2984                                        sizeof(ib_spec->mpls.val.tag));
2985                                 memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
2986                                                     outer_first_mpls),
2987                                        &ib_spec->mpls.mask.tag,
2988                                        sizeof(ib_spec->mpls.mask.tag));
2989                         }
2990                 }
2991                 break;
2992         case IB_FLOW_SPEC_VXLAN_TUNNEL:
2993                 if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
2994                                          LAST_TUNNEL_FIELD))
2995                         return -EOPNOTSUPP;
2996
2997                 MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
2998                          ntohl(ib_spec->tunnel.mask.tunnel_id));
2999                 MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
3000                          ntohl(ib_spec->tunnel.val.tunnel_id));
3001                 break;
3002         case IB_FLOW_SPEC_ACTION_TAG:
3003                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
3004                                          LAST_FLOW_TAG_FIELD))
3005                         return -EOPNOTSUPP;
3006                 if (ib_spec->flow_tag.tag_id >= BIT(24))
3007                         return -EINVAL;
3008
3009                 flow_context->flow_tag = ib_spec->flow_tag.tag_id;
3010                 flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
3011                 break;
3012         case IB_FLOW_SPEC_ACTION_DROP:
3013                 if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
3014                                          LAST_DROP_FIELD))
3015                         return -EOPNOTSUPP;
3016                 action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
3017                 break;
3018         case IB_FLOW_SPEC_ACTION_HANDLE:
3019                 ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
3020                         flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
3021                 if (ret)
3022                         return ret;
3023                 break;
3024         case IB_FLOW_SPEC_ACTION_COUNT:
3025                 if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
3026                                          LAST_COUNTERS_FIELD))
3027                         return -EOPNOTSUPP;
3028
3029                 /* for now, only one counters spec per flow is supported */
3030                 if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
3031                         return -EINVAL;
3032
3033                 action->counters = ib_spec->flow_count.counters;
3034                 action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
3035                 break;
3036         default:
3037                 return -EINVAL;
3038         }
3039
3040         return 0;
3041 }
3042
3043 /* A flow that could match both multicast and unicast packets must not be
3044  * placed in the multicast flow steering table, since such a rule could
3045  * steal other multicast packets.
3046  */
3047 static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr)
3048 {
3049         union ib_flow_spec *flow_spec;
3050
3051         if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
3052             ib_attr->num_of_specs < 1)
3053                 return false;
3054
3055         flow_spec = (union ib_flow_spec *)(ib_attr + 1);
3056         if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
3057                 struct ib_flow_spec_ipv4 *ipv4_spec;
3058
3059                 ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
3060                 if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
3061                         return true;
3062
3063                 return false;
3064         }
3065
3066         if (flow_spec->type == IB_FLOW_SPEC_ETH) {
3067                 struct ib_flow_spec_eth *eth_spec;
3068
3069                 eth_spec = (struct ib_flow_spec_eth *)flow_spec;
3070                 return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
3071                        is_multicast_ether_addr(eth_spec->val.dst_mac);
3072         }
3073
3074         return false;
3075 }
3076
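/*
 * Result of the ESP/AES-GCM spec check below: VALID_SPEC_NA means the check
 * does not apply to this rule (it is not a crypto rule), so the spec is not
 * treated as invalid on that account.
 */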
3077 enum valid_spec {
3078         VALID_SPEC_INVALID,
3079         VALID_SPEC_VALID,
3080         VALID_SPEC_NA,
3081 };
3082
3083 static enum valid_spec
3084 is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
3085                      const struct mlx5_flow_spec *spec,
3086                      const struct mlx5_flow_act *flow_act,
3087                      bool egress)
3088 {
3089         const u32 *match_c = spec->match_criteria;
3090         bool is_crypto =
3091                 (flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
3092                                      MLX5_FLOW_CONTEXT_ACTION_DECRYPT));
3093         bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c);
3094         bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP;
3095
3096         /*
3097          * Currently only crypto is supported in egress; until regular egress
3098          * rules are supported, treat non-crypto specs as VALID_SPEC_NA.
3099          */
3100         if (!is_crypto)
3101                 return VALID_SPEC_NA;
3102
3103         return is_crypto && is_ipsec &&
3104                 (!egress || (!is_drop &&
3105                              !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
3106                 VALID_SPEC_VALID : VALID_SPEC_INVALID;
3107 }
3108
3109 static bool is_valid_spec(struct mlx5_core_dev *mdev,
3110                           const struct mlx5_flow_spec *spec,
3111                           const struct mlx5_flow_act *flow_act,
3112                           bool egress)
3113 {
3114         /* We currently only support IPsec egress flows */
3115         return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID;
3116 }
3117
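/*
 * Check that an explicit L2 ethertype match is consistent with any IPv4/IPv6
 * spec in the same flow; MPLS ethertypes are accepted only when the device
 * can match on ip_version directly.
 */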
3118 static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
3119                                const struct ib_flow_attr *flow_attr,
3120                                bool check_inner)
3121 {
3122         union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
3123         int match_ipv = check_inner ?
3124                         MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3125                                         ft_field_support.inner_ip_version) :
3126                         MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
3127                                         ft_field_support.outer_ip_version);
3128         int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
3129         bool ipv4_spec_valid, ipv6_spec_valid;
3130         unsigned int ip_spec_type = 0;
3131         bool has_ethertype = false;
3132         unsigned int spec_index;
3133         bool mask_valid = true;
3134         u16 eth_type = 0;
3135         bool type_valid;
3136
3137         /* Validate that ethertype is correct */
3138         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
3139                 if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
3140                     ib_spec->eth.mask.ether_type) {
3141                         mask_valid = (ib_spec->eth.mask.ether_type ==
3142                                       htons(0xffff));
3143                         has_ethertype = true;
3144                         eth_type = ntohs(ib_spec->eth.val.ether_type);
3145                 } else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
3146                            (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
3147                         ip_spec_type = ib_spec->type;
3148                 }
3149                 ib_spec = (void *)ib_spec + ib_spec->size;
3150         }
3151
3152         type_valid = (!has_ethertype) || (!ip_spec_type);
3153         if (!type_valid && mask_valid) {
3154                 ipv4_spec_valid = (eth_type == ETH_P_IP) &&
3155                         (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
3156                 ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
3157                         (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
3158
3159                 type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
3160                              (((eth_type == ETH_P_MPLS_UC) ||
3161                                (eth_type == ETH_P_MPLS_MC)) && match_ipv);
3162         }
3163
3164         return type_valid;
3165 }
3166
3167 static bool is_valid_attr(struct mlx5_core_dev *mdev,
3168                           const struct ib_flow_attr *flow_attr)
3169 {
3170         return is_valid_ethertype(mdev, flow_attr, false) &&
3171                is_valid_ethertype(mdev, flow_attr, true);
3172 }
3173
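/*
 * Drop the reference a rule took on its flow-table priority (if it took one)
 * and destroy the flow table once the last rule is gone. Callers hold
 * dev->flow_db->lock.
 */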
3174 static void put_flow_table(struct mlx5_ib_dev *dev,
3175                            struct mlx5_ib_flow_prio *prio, bool ft_added)
3176 {
3177         prio->refcount -= !!ft_added;
3178         if (!prio->refcount) {
3179                 mlx5_destroy_flow_table(prio->flow_table);
3180                 prio->flow_table = NULL;
3181         }
3182 }
3183
3184 static void counters_clear_description(struct ib_counters *counters)
3185 {
3186         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
3187
3188         mutex_lock(&mcounters->mcntrs_mutex);
3189         kfree(mcounters->counters_data);
3190         mcounters->counters_data = NULL;
3191         mcounters->cntrs_max_index = 0;
3192         mutex_unlock(&mcounters->mcntrs_mutex);
3193 }
3194
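/*
 * Destroy a flow and any handlers chained on its list (don't-trap, leftovers
 * and sniffer flows install more than one rule), dropping the flow-table
 * references taken at creation time.
 */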
3195 static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
3196 {
3197         struct mlx5_ib_flow_handler *handler = container_of(flow_id,
3198                                                           struct mlx5_ib_flow_handler,
3199                                                           ibflow);
3200         struct mlx5_ib_flow_handler *iter, *tmp;
3201         struct mlx5_ib_dev *dev = handler->dev;
3202
3203         mutex_lock(&dev->flow_db->lock);
3204
3205         list_for_each_entry_safe(iter, tmp, &handler->list, list) {
3206                 mlx5_del_flow_rules(iter->rule);
3207                 put_flow_table(dev, iter->prio, true);
3208                 list_del(&iter->list);
3209                 kfree(iter);
3210         }
3211
3212         mlx5_del_flow_rules(handler->rule);
3213         put_flow_table(dev, handler->prio, true);
3214         if (handler->ibcounters &&
3215             atomic_read(&handler->ibcounters->usecnt) == 1)
3216                 counters_clear_description(handler->ibcounters);
3217
3218         mutex_unlock(&dev->flow_db->lock);
3219         if (handler->flow_matcher)
3220                 atomic_dec(&handler->flow_matcher->usecnt);
3221         kfree(handler);
3222
3223         return 0;
3224 }
3225
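/*
 * Each user flow priority maps to a pair of core flow-table priorities;
 * don't-trap rules use the first (lower-numbered) slot of the pair.
 */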
3226 static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
3227 {
3228         priority *= 2;
3229         if (!dont_trap)
3230                 priority++;
3231         return priority;
3232 }
3233
3234 enum flow_table_type {
3235         MLX5_IB_FT_RX,
3236         MLX5_IB_FT_TX
3237 };
3238
3239 #define MLX5_FS_MAX_TYPES        6
3240 #define MLX5_FS_MAX_ENTRIES      BIT(16)
3241
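/*
 * Create the auto-grouped flow table that backs a priority on first use and
 * record it in the prio structure.
 */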
3242 static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
3243                                            struct mlx5_ib_flow_prio *prio,
3244                                            int priority,
3245                                            int num_entries, int num_groups,
3246                                            u32 flags)
3247 {
3248         struct mlx5_flow_table *ft;
3249
3250         ft = mlx5_create_auto_grouped_flow_table(ns, priority,
3251                                                  num_entries,
3252                                                  num_groups,
3253                                                  0, flags);
3254         if (IS_ERR(ft))
3255                 return ERR_CAST(ft);
3256
3257         prio->flow_table = ft;
3258         prio->refcount = 0;
3259         return prio;
3260 }
3261
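/*
 * Map an ib_flow_attr to the steering namespace and priority it belongs to
 * (bypass, egress, leftovers or sniffer) and lazily create the auto-grouped
 * flow table for that priority on first use.
 */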
3262 static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
3263                                                 struct ib_flow_attr *flow_attr,
3264                                                 enum flow_table_type ft_type)
3265 {
3266         bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
3267         struct mlx5_flow_namespace *ns = NULL;
3268         struct mlx5_ib_flow_prio *prio;
3269         struct mlx5_flow_table *ft;
3270         int max_table_size;
3271         int num_entries;
3272         int num_groups;
3273         bool esw_encap;
3274         u32 flags = 0;
3275         int priority;
3276
3277         max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3278                                                        log_max_ft_size));
3279         esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
3280                 DEVLINK_ESWITCH_ENCAP_MODE_NONE;
3281         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
3282                 enum mlx5_flow_namespace_type fn_type;
3283
3284                 if (flow_is_multicast_only(flow_attr) &&
3285                     !dont_trap)
3286                         priority = MLX5_IB_FLOW_MCAST_PRIO;
3287                 else
3288                         priority = ib_prio_to_core_prio(flow_attr->priority,
3289                                                         dont_trap);
3290                 if (ft_type == MLX5_IB_FT_RX) {
3291                         fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
3292                         prio = &dev->flow_db->prios[priority];
3293                         if (!dev->is_rep && !esw_encap &&
3294                             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
3295                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
3296                         if (!dev->is_rep && !esw_encap &&
3297                             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3298                                         reformat_l3_tunnel_to_l2))
3299                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3300                 } else {
3301                         max_table_size =
3302                                 BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
3303                                                               log_max_ft_size));
3304                         fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
3305                         prio = &dev->flow_db->egress_prios[priority];
3306                         if (!dev->is_rep && !esw_encap &&
3307                             MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
3308                                 flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3309                 }
3310                 ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
3311                 num_entries = MLX5_FS_MAX_ENTRIES;
3312                 num_groups = MLX5_FS_MAX_TYPES;
3313         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3314                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
3315                 ns = mlx5_get_flow_namespace(dev->mdev,
3316                                              MLX5_FLOW_NAMESPACE_LEFTOVERS);
3317                 build_leftovers_ft_param(&priority,
3318                                          &num_entries,
3319                                          &num_groups);
3320                 prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
3321         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3322                 if (!MLX5_CAP_FLOWTABLE(dev->mdev,
3323                                         allow_sniffer_and_nic_rx_shared_tir))
3324                         return ERR_PTR(-ENOTSUPP);
3325
3326                 ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
3327                                              MLX5_FLOW_NAMESPACE_SNIFFER_RX :
3328                                              MLX5_FLOW_NAMESPACE_SNIFFER_TX);
3329
3330                 prio = &dev->flow_db->sniffer[ft_type];
3331                 priority = 0;
3332                 num_entries = 1;
3333                 num_groups = 1;
3334         }
3335
3336         if (!ns)
3337                 return ERR_PTR(-ENOTSUPP);
3338
3339         max_table_size = min_t(int, num_entries, max_table_size);
3340
3341         ft = prio->flow_table;
3342         if (!ft)
3343                 return _get_prio(ns, prio, priority, max_table_size, num_groups,
3344                                  flags);
3345
3346         return prio;
3347 }
3348
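/*
 * When the flow is attached through an underlay QP (e.g. an IPoIB underlay),
 * also match on the BTH destination QP, provided the device supports the
 * bth_dst_qp match field.
 */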
3349 static void set_underlay_qp(struct mlx5_ib_dev *dev,
3350                             struct mlx5_flow_spec *spec,
3351                             u32 underlay_qpn)
3352 {
3353         void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
3354                                            spec->match_criteria,
3355                                            misc_parameters);
3356         void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3357                                            misc_parameters);
3358
3359         if (underlay_qpn &&
3360             MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3361                                       ft_field_support.bth_dst_qp)) {
3362                 MLX5_SET(fte_match_set_misc,
3363                          misc_params_v, bth_dst_qp, underlay_qpn);
3364                 MLX5_SET(fte_match_set_misc,
3365                          misc_params_c, bth_dst_qp, 0xffffff);
3366         }
3367 }
3368
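/* read_counters callback: query the HW flow counter for packets and bytes */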
3369 static int read_flow_counters(struct ib_device *ibdev,
3370                               struct mlx5_read_counters_attr *read_attr)
3371 {
3372         struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
3373         struct mlx5_ib_dev *dev = to_mdev(ibdev);
3374
3375         return mlx5_fc_query(dev->mdev, fc,
3376                              &read_attr->out[IB_COUNTER_PACKETS],
3377                              &read_attr->out[IB_COUNTER_BYTES]);
3378 }
3379
3380 /* flow counters currently expose two counters: packets and bytes */
3381 #define FLOW_COUNTERS_NUM 2
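/*
 * Bind the user supplied (description, index) pairs to the counters object
 * and record the highest index in use (cntrs_max_index).
 */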
3382 static int counters_set_description(struct ib_counters *counters,
3383                                     enum mlx5_ib_counters_type counters_type,
3384                                     struct mlx5_ib_flow_counters_desc *desc_data,
3385                                     u32 ncounters)
3386 {
3387         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
3388         u32 cntrs_max_index = 0;
3389         int i;
3390
3391         if (counters_type != MLX5_IB_COUNTERS_FLOW)
3392                 return -EINVAL;
3393
3394         /* init the fields for the object */
3395         mcounters->type = counters_type;
3396         mcounters->read_counters = read_flow_counters;
3397         mcounters->counters_num = FLOW_COUNTERS_NUM;
3398         mcounters->ncounters = ncounters;
3399         /* each counter entry has both a description and an index */
3400         for (i = 0; i < ncounters; i++) {
3401                 if (desc_data[i].description > IB_COUNTER_BYTES)
3402                         return -EINVAL;
3403
3404                 if (cntrs_max_index <= desc_data[i].index)
3405                         cntrs_max_index = desc_data[i].index + 1;
3406         }
3407
3408         mutex_lock(&mcounters->mcntrs_mutex);
3409         mcounters->counters_data = desc_data;
3410         mcounters->cntrs_max_index = cntrs_max_index;
3411         mutex_unlock(&mcounters->mcntrs_mutex);
3412
3413         return 0;
3414 }
3415
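/*
 * Attach counters data from the create_flow ucmd: allocate the HW counter
 * handle on first use and, if description data was passed, bind it to the
 * counters object via counters_set_description().
 */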
3416 #define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
3417 static int flow_counters_set_data(struct ib_counters *ibcounters,
3418                                   struct mlx5_ib_create_flow *ucmd)
3419 {
3420         struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
3421         struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
3422         struct mlx5_ib_flow_counters_desc *desc_data = NULL;
3423         bool hw_hndl = false;
3424         int ret = 0;
3425
3426         if (ucmd && ucmd->ncounters_data != 0) {
3427                 cntrs_data = ucmd->data;
3428                 if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
3429                         return -EINVAL;
3430
3431                 desc_data = kcalloc(cntrs_data->ncounters,
3432                                     sizeof(*desc_data),
3433                                     GFP_KERNEL);
3434                 if (!desc_data)
3435                         return  -ENOMEM;
3436
3437                 if (copy_from_user(desc_data,
3438                                    u64_to_user_ptr(cntrs_data->counters_data),
3439                                    sizeof(*desc_data) * cntrs_data->ncounters)) {
3440                         ret = -EFAULT;
3441                         goto free;
3442                 }
3443         }
3444
3445         if (!mcounters->hw_cntrs_hndl) {
3446                 mcounters->hw_cntrs_hndl = mlx5_fc_create(
3447                         to_mdev(ibcounters->device)->mdev, false);
3448                 if (IS_ERR(mcounters->hw_cntrs_hndl)) {
3449                         ret = PTR_ERR(mcounters->hw_cntrs_hndl);
3450                         goto free;
3451                 }
3452                 hw_hndl = true;
3453         }
3454
3455         if (desc_data) {
3456                 /* counters already bound to at least one flow */
3457                 if (mcounters->cntrs_max_index) {
3458                         ret = -EINVAL;
3459                         goto free_hndl;
3460                 }
3461
3462                 ret = counters_set_description(ibcounters,
3463                                                MLX5_IB_COUNTERS_FLOW,
3464                                                desc_data,
3465                                                cntrs_data->ncounters);
3466                 if (ret)
3467                         goto free_hndl;
3468
3469         } else if (!mcounters->cntrs_max_index) {
3470                 /* counters not bound yet, must have udata passed */
3471                 ret = -EINVAL;
3472                 goto free_hndl;
3473         }
3474
3475         return 0;
3476
3477 free_hndl:
3478         if (hw_hndl) {
3479                 mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
3480                                 mcounters->hw_cntrs_hndl);
3481                 mcounters->hw_cntrs_hndl = NULL;
3482         }
3483 free:
3484         kfree(desc_data);
3485         return ret;
3486 }
3487
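/*
 * For eswitch representors, restrict the rule to traffic coming from the
 * rep's vport: match on the vport metadata register when enabled, otherwise
 * on the misc source_port field.
 */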
3488 static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev,
3489                                          struct mlx5_flow_spec *spec,
3490                                          struct mlx5_eswitch_rep *rep)
3491 {
3492         struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
3493         void *misc;
3494
3495         if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
3496                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3497                                     misc_parameters_2);
3498
3499                 MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
3500                          mlx5_eswitch_get_vport_metadata_for_match(esw,
3501                                                                    rep->vport));
3502                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
3503                                     misc_parameters_2);
3504
3505                 MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0);
3506         } else {
3507                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
3508                                     misc_parameters);
3509
3510                 MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport);
3511
3512                 misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
3513                                     misc_parameters);
3514
3515                 MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
3516         }
3517 }
3518
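/*
 * Translate an ib_flow_attr (plus optional ucmd counters data) into an mlx5
 * flow rule: parse each spec into match criteria/values and actions, append
 * a counter destination if requested, pick FWD/DROP/ALLOW and install the
 * rule in the priority's flow table.
 */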
3519 static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
3520                                                       struct mlx5_ib_flow_prio *ft_prio,
3521                                                       const struct ib_flow_attr *flow_attr,
3522                                                       struct mlx5_flow_destination *dst,
3523                                                       u32 underlay_qpn,
3524                                                       struct mlx5_ib_create_flow *ucmd)
3525 {
3526         struct mlx5_flow_table  *ft = ft_prio->flow_table;
3527         struct mlx5_ib_flow_handler *handler;
3528         struct mlx5_flow_act flow_act = {};
3529         struct mlx5_flow_spec *spec;
3530         struct mlx5_flow_destination dest_arr[2] = {};
3531         struct mlx5_flow_destination *rule_dst = dest_arr;
3532         const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
3533         unsigned int spec_index;
3534         u32 prev_type = 0;
3535         int err = 0;
3536         int dest_num = 0;
3537         bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
3538
3539         if (!is_valid_attr(dev->mdev, flow_attr))
3540                 return ERR_PTR(-EINVAL);
3541
3542         if (dev->is_rep && is_egress)
3543                 return ERR_PTR(-EINVAL);
3544
3545         spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
3546         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
3547         if (!handler || !spec) {
3548                 err = -ENOMEM;
3549                 goto free;
3550         }
3551
3552         INIT_LIST_HEAD(&handler->list);
3553         if (dst) {
3554                 memcpy(&dest_arr[0], dst, sizeof(*dst));
3555                 dest_num++;
3556         }
3557
3558         for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
3559                 err = parse_flow_attr(dev->mdev, spec,
3560                                       ib_flow, flow_attr, &flow_act,
3561                                       prev_type);
3562                 if (err < 0)
3563                         goto free;
3564
3565                 prev_type = ((union ib_flow_spec *)ib_flow)->type;
3566                 ib_flow += ((union ib_flow_spec *)ib_flow)->size;
3567         }
3568
3569         if (!flow_is_multicast_only(flow_attr))
3570                 set_underlay_qp(dev, spec, underlay_qpn);
3571
3572         if (dev->is_rep) {
3573                 struct mlx5_eswitch_rep *rep;
3574
3575                 rep = dev->port[flow_attr->port - 1].rep;
3576                 if (!rep) {
3577                         err = -EINVAL;
3578                         goto free;
3579                 }
3580
3581                 mlx5_ib_set_rule_source_port(dev, spec, rep);
3582         }
3583
3584         spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
3585
3586         if (is_egress &&
3587             !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) {
3588                 err = -EINVAL;
3589                 goto free;
3590         }
3591
3592         if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
3593                 struct mlx5_ib_mcounters *mcounters;
3594
3595                 err = flow_counters_set_data(flow_act.counters, ucmd);
3596                 if (err)
3597                         goto free;
3598
3599                 mcounters = to_mcounters(flow_act.counters);
3600                 handler->ibcounters = flow_act.counters;
3601                 dest_arr[dest_num].type =
3602                         MLX5_FLOW_DESTINATION_TYPE_COUNTER;
3603                 dest_arr[dest_num].counter_id =
3604                         mlx5_fc_id(mcounters->hw_cntrs_hndl);
3605                 dest_num++;
3606         }
3607
3608         if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
3609                 if (!(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT)) {
3610                         rule_dst = NULL;
3611                         dest_num = 0;
3612                 }
3613         } else {
3614                 if (is_egress)
3615                         flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
3616                 else
3617                         flow_act.action |=
3618                                 dest_num ?  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
3619                                         MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
3620         }
3621
3622         if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
3623             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3624              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
3625                 mlx5_ib_warn(dev, "Flow tag %u and attribute type %x aren't allowed in leftovers\n",
3626                              spec->flow_context.flow_tag, flow_attr->type);
3627                 err = -EINVAL;
3628                 goto free;
3629         }
3630         handler->rule = mlx5_add_flow_rules(ft, spec,
3631                                             &flow_act,
3632                                             rule_dst, dest_num);
3633
3634         if (IS_ERR(handler->rule)) {
3635                 err = PTR_ERR(handler->rule);
3636                 goto free;
3637         }
3638
3639         ft_prio->refcount++;
3640         handler->prio = ft_prio;
3641         handler->dev = dev;
3642
3643         ft_prio->flow_table = ft;
3644 free:
3645         if (err && handler) {
3646                 if (handler->ibcounters &&
3647                     atomic_read(&handler->ibcounters->usecnt) == 1)
3648                         counters_clear_description(handler->ibcounters);
3649                 kfree(handler);
3650         }
3651         kvfree(spec);
3652         return err ? ERR_PTR(err) : handler;
3653 }
3654
3655 static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
3656                                                      struct mlx5_ib_flow_prio *ft_prio,
3657                                                      const struct ib_flow_attr *flow_attr,
3658                                                      struct mlx5_flow_destination *dst)
3659 {
3660         return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
3661 }
3662
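/*
 * A don't-trap flow is built from two rules with the same match: one with no
 * destination (forward to next priority, so the packet keeps flowing) and
 * one forwarding to the given destination; the second rule is chained on the
 * first handler's list.
 */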
3663 static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
3664                                                           struct mlx5_ib_flow_prio *ft_prio,
3665                                                           struct ib_flow_attr *flow_attr,
3666                                                           struct mlx5_flow_destination *dst)
3667 {
3668         struct mlx5_ib_flow_handler *handler_dst = NULL;
3669         struct mlx5_ib_flow_handler *handler = NULL;
3670
3671         handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
3672         if (!IS_ERR(handler)) {
3673                 handler_dst = create_flow_rule(dev, ft_prio,
3674                                                flow_attr, dst);
3675                 if (IS_ERR(handler_dst)) {
3676                         mlx5_del_flow_rules(handler->rule);
3677                         ft_prio->refcount--;
3678                         kfree(handler);
3679                         handler = handler_dst;
3680                 } else {
3681                         list_add(&handler_dst->list, &handler->list);
3682                 }
3683         }
3684
3685         return handler;
3686 }
3687 enum {
3688         LEFTOVERS_MC,
3689         LEFTOVERS_UC,
3690 };
3691
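/*
 * Catch-all rules installed in the LEFTOVERS namespace: a multicast dst_mac
 * catch-all is always created and, for IB_FLOW_ATTR_ALL_DEFAULT, a unicast
 * catch-all is chained to it as well.
 */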
3692 static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
3693                                                           struct mlx5_ib_flow_prio *ft_prio,
3694                                                           struct ib_flow_attr *flow_attr,
3695                                                           struct mlx5_flow_destination *dst)
3696 {
3697         struct mlx5_ib_flow_handler *handler_ucast = NULL;
3698         struct mlx5_ib_flow_handler *handler = NULL;
3699
3700         static struct {
3701                 struct ib_flow_attr     flow_attr;
3702                 struct ib_flow_spec_eth eth_flow;
3703         } leftovers_specs[] = {
3704                 [LEFTOVERS_MC] = {
3705                         .flow_attr = {
3706                                 .num_of_specs = 1,
3707                                 .size = sizeof(leftovers_specs[0])
3708                         },
3709                         .eth_flow = {
3710                                 .type = IB_FLOW_SPEC_ETH,
3711                                 .size = sizeof(struct ib_flow_spec_eth),
3712                                 .mask = {.dst_mac = {0x1} },
3713                                 .val =  {.dst_mac = {0x1} }
3714                         }
3715                 },
3716                 [LEFTOVERS_UC] = {
3717                         .flow_attr = {
3718                                 .num_of_specs = 1,
3719                                 .size = sizeof(leftovers_specs[0])
3720                         },
3721                         .eth_flow = {
3722                                 .type = IB_FLOW_SPEC_ETH,
3723                                 .size = sizeof(struct ib_flow_spec_eth),
3724                                 .mask = {.dst_mac = {0x1} },
3725                                 .val = {.dst_mac = {} }
3726                         }
3727                 }
3728         };
3729
3730         handler = create_flow_rule(dev, ft_prio,
3731                                    &leftovers_specs[LEFTOVERS_MC].flow_attr,
3732                                    dst);
3733         if (!IS_ERR(handler) &&
3734             flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
3735                 handler_ucast = create_flow_rule(dev, ft_prio,
3736                                                  &leftovers_specs[LEFTOVERS_UC].flow_attr,
3737                                                  dst);
3738                 if (IS_ERR(handler_ucast)) {
3739                         mlx5_del_flow_rules(handler->rule);
3740                         ft_prio->refcount--;
3741                         kfree(handler);
3742                         handler = handler_ucast;
3743                 } else {
3744                         list_add(&handler_ucast->list, &handler->list);
3745                 }
3746         }
3747
3748         return handler;
3749 }
3750
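/*
 * Sniffer flows: one match-all rule in the RX sniffer table and one in the
 * TX sniffer table, both forwarding to the given destination and chained
 * together on the RX handler's list.
 */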
3751 static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
3752                                                         struct mlx5_ib_flow_prio *ft_rx,
3753                                                         struct mlx5_ib_flow_prio *ft_tx,
3754                                                         struct mlx5_flow_destination *dst)
3755 {
3756         struct mlx5_ib_flow_handler *handler_rx;
3757         struct mlx5_ib_flow_handler *handler_tx;
3758         int err;
3759         static const struct ib_flow_attr flow_attr  = {
3760                 .num_of_specs = 0,
3761                 .size = sizeof(flow_attr)
3762         };
3763
3764         handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
3765         if (IS_ERR(handler_rx)) {
3766                 err = PTR_ERR(handler_rx);
3767                 goto err;
3768         }
3769
3770         handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
3771         if (IS_ERR(handler_tx)) {
3772                 err = PTR_ERR(handler_tx);
3773                 goto err_tx;
3774         }
3775
3776         list_add(&handler_tx->list, &handler_rx->list);
3777
3778         return handler_rx;
3779
3780 err_tx:
3781         mlx5_del_flow_rules(handler_rx->rule);
3782         ft_rx->refcount--;
3783         kfree(handler_rx);
3784 err:
3785         return ERR_PTR(err);
3786 }
3787
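/*
 * ib_create_flow entry point: validate the attributes and optional user
 * counters data, pick (or create) the flow table for the requested type and
 * direction, set up the destination (TIR for ingress, port for egress) and
 * install the rule(s).
 */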
3788 static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
3789                                            struct ib_flow_attr *flow_attr,
3790                                            int domain,
3791                                            struct ib_udata *udata)
3792 {
3793         struct mlx5_ib_dev *dev = to_mdev(qp->device);
3794         struct mlx5_ib_qp *mqp = to_mqp(qp);
3795         struct mlx5_ib_flow_handler *handler = NULL;
3796         struct mlx5_flow_destination *dst = NULL;
3797         struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
3798         struct mlx5_ib_flow_prio *ft_prio;
3799         bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
3800         struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
3801         size_t min_ucmd_sz, required_ucmd_sz;
3802         int err;
3803         int underlay_qpn;
3804
3805         if (udata && udata->inlen) {
3806                 min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) +
3807                                 sizeof(ucmd_hdr.reserved);
3808                 if (udata->inlen < min_ucmd_sz)
3809                         return ERR_PTR(-EOPNOTSUPP);
3810
3811                 err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
3812                 if (err)
3813                         return ERR_PTR(err);
3814
3815                 /* currently only one counters data block is supported */
3816                 if (ucmd_hdr.ncounters_data > 1)
3817                         return ERR_PTR(-EINVAL);
3818
3819                 required_ucmd_sz = min_ucmd_sz +
3820                         sizeof(struct mlx5_ib_flow_counters_data) *
3821                         ucmd_hdr.ncounters_data;
3822                 if (udata->inlen > required_ucmd_sz &&
3823                     !ib_is_udata_cleared(udata, required_ucmd_sz,
3824                                          udata->inlen - required_ucmd_sz))
3825                         return ERR_PTR(-EOPNOTSUPP);
3826
3827                 ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
3828                 if (!ucmd)
3829                         return ERR_PTR(-ENOMEM);
3830
3831                 err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
3832                 if (err)
3833                         goto free_ucmd;
3834         }
3835
3836         if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
3837                 err = -ENOMEM;
3838                 goto free_ucmd;
3839         }
3840
3841         if (domain != IB_FLOW_DOMAIN_USER ||
3842             flow_attr->port > dev->num_ports ||
3843             (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP |
3844                                   IB_FLOW_ATTR_FLAGS_EGRESS))) {
3845                 err = -EINVAL;
3846                 goto free_ucmd;
3847         }
3848
3849         if (is_egress &&
3850             (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3851              flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
3852                 err = -EINVAL;
3853                 goto free_ucmd;
3854         }
3855
3856         dst = kzalloc(sizeof(*dst), GFP_KERNEL);
3857         if (!dst) {
3858                 err = -ENOMEM;
3859                 goto free_ucmd;
3860         }
3861
3862         mutex_lock(&dev->flow_db->lock);
3863
3864         ft_prio = get_flow_table(dev, flow_attr,
3865                                  is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX);
3866         if (IS_ERR(ft_prio)) {
3867                 err = PTR_ERR(ft_prio);
3868                 goto unlock;
3869         }
3870         if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3871                 ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
3872                 if (IS_ERR(ft_prio_tx)) {
3873                         err = PTR_ERR(ft_prio_tx);
3874                         ft_prio_tx = NULL;
3875                         goto destroy_ft;
3876                 }
3877         }
3878
3879         if (is_egress) {
3880                 dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
3881         } else {
3882                 dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
3883                 if (mqp->flags & MLX5_IB_QP_RSS)
3884                         dst->tir_num = mqp->rss_qp.tirn;
3885                 else
3886                         dst->tir_num = mqp->raw_packet_qp.rq.tirn;
3887         }
3888
3889         if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
3890                 if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
3891                         handler = create_dont_trap_rule(dev, ft_prio,
3892                                                         flow_attr, dst);
3893                 } else {
3894                         underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ?
3895                                         mqp->underlay_qpn : 0;
3896                         handler = _create_flow_rule(dev, ft_prio, flow_attr,
3897                                                     dst, underlay_qpn, ucmd);
3898                 }
3899         } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
3900                    flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
3901                 handler = create_leftovers_rule(dev, ft_prio, flow_attr,
3902                                                 dst);
3903         } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
3904                 handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
3905         } else {
3906                 err = -EINVAL;
3907                 goto destroy_ft;
3908         }
3909
3910         if (IS_ERR(handler)) {
3911                 err = PTR_ERR(handler);
3912                 handler = NULL;
3913                 goto destroy_ft;
3914         }
3915
3916         mutex_unlock(&dev->flow_db->lock);
3917         kfree(dst);
3918         kfree(ucmd);
3919
3920         return &handler->ibflow;
3921
3922 destroy_ft:
3923         put_flow_table(dev, ft_prio, false);
3924         if (ft_prio_tx)
3925                 put_flow_table(dev, ft_prio_tx, false);
3926 unlock:
3927         mutex_unlock(&dev->flow_db->lock);
3928         kfree(dst);
3929 free_ucmd:
3930         kfree(ucmd);
3931         return ERR_PTR(err);
3932 }
3933
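/*
 * Select the flow-table priority for raw (flow matcher) rules according to
 * the matcher's namespace (bypass, egress, FDB or RDMA RX) and lazily create
 * the table if needed.
 */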
3934 static struct mlx5_ib_flow_prio *
3935 _get_flow_table(struct mlx5_ib_dev *dev,
3936                 struct mlx5_ib_flow_matcher *fs_matcher,
3937                 bool mcast)
3938 {
3939         struct mlx5_flow_namespace *ns = NULL;
3940         struct mlx5_ib_flow_prio *prio = NULL;
3941         int max_table_size = 0;
3942         bool esw_encap;
3943         u32 flags = 0;
3944         int priority;
3945
3946         if (mcast)
3947                 priority = MLX5_IB_FLOW_MCAST_PRIO;
3948         else
3949                 priority = ib_prio_to_core_prio(fs_matcher->priority, false);
3950
3951         esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
3952                 DEVLINK_ESWITCH_ENCAP_MODE_NONE;
3953         if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
3954                 max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3955                                         log_max_ft_size));
3956                 if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
3957                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
3958                 if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
3959                                               reformat_l3_tunnel_to_l2) &&
3960                     !esw_encap)
3961                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3962         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
3963                 max_table_size = BIT(
3964                         MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
3965                 if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap)
3966                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3967         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
3968                 max_table_size = BIT(
3969                         MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
3970                 if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
3971                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
3972                 if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) &&
3973                     esw_encap)
3974                         flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
3975                 priority = FDB_BYPASS_PATH;
3976         } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) {
3977                 max_table_size =
3978                         BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
3979                                                        log_max_ft_size));
3980                 priority = fs_matcher->priority;
3981         }
3982
3983         max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
3984
3985         ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
3986         if (!ns)
3987                 return ERR_PTR(-ENOTSUPP);
3988
3989         if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
3990                 prio = &dev->flow_db->prios[priority];
3991         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
3992                 prio = &dev->flow_db->egress_prios[priority];
3993         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
3994                 prio = &dev->flow_db->fdb;
3995         else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
3996                 prio = &dev->flow_db->rdma_rx[priority];
3997
3998         if (!prio)
3999                 return ERR_PTR(-EINVAL);
4000
4001         if (prio->flow_table)
4002                 return prio;
4003
4004         return _get_prio(ns, prio, priority, max_table_size,
4005                          MLX5_FS_MAX_TYPES, flags);
4006 }
4007
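/*
 * Build a flow spec from the matcher's match criteria mask and the
 * caller-supplied match values (cmd_in), then add the rule to the flow
 * table of the given priority. On success the priority refcount is
 * taken and a handler wrapping the new rule is returned.
 */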
4008 static struct mlx5_ib_flow_handler *
4009 _create_raw_flow_rule(struct mlx5_ib_dev *dev,
4010                       struct mlx5_ib_flow_prio *ft_prio,
4011                       struct mlx5_flow_destination *dst,
4012                       struct mlx5_ib_flow_matcher  *fs_matcher,
4013                       struct mlx5_flow_context *flow_context,
4014                       struct mlx5_flow_act *flow_act,
4015                       void *cmd_in, int inlen,
4016                       int dst_num)
4017 {
4018         struct mlx5_ib_flow_handler *handler;
4019         struct mlx5_flow_spec *spec;
4020         struct mlx5_flow_table *ft = ft_prio->flow_table;
4021         int err = 0;
4022
4023         spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
4024         handler = kzalloc(sizeof(*handler), GFP_KERNEL);
4025         if (!handler || !spec) {
4026                 err = -ENOMEM;
4027                 goto free;
4028         }
4029
4030         INIT_LIST_HEAD(&handler->list);
4031
4032         memcpy(spec->match_value, cmd_in, inlen);
4033         memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
4034                fs_matcher->mask_len);
4035         spec->match_criteria_enable = fs_matcher->match_criteria_enable;
4036         spec->flow_context = *flow_context;
4037
4038         handler->rule = mlx5_add_flow_rules(ft, spec,
4039                                             flow_act, dst, dst_num);
4040
4041         if (IS_ERR(handler->rule)) {
4042                 err = PTR_ERR(handler->rule);
4043                 goto free;
4044         }
4045
4046         ft_prio->refcount++;
4047         handler->prio = ft_prio;
4048         handler->dev = dev;
4049         ft_prio->flow_table = ft;
4050
4051 free:
4052         if (err)
4053                 kfree(handler);
4054         kvfree(spec);
4055         return err ? ERR_PTR(err) : handler;
4056 }
4057
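/*
 * Return true if the raw flow matcher targets multicast traffic, i.e.
 * the outer headers are matched and either the destination MAC or the
 * destination IPv4 address is multicast in both mask and value.
 */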
4058 static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
4059                                 void *match_v)
4060 {
4061         void *match_c;
4062         void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4;
4063         void *dmac, *dmac_mask;
4064         void *ipv4, *ipv4_mask;
4065
4066         if (!(fs_matcher->match_criteria_enable &
4067               (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT)))
4068                 return false;
4069
4070         match_c = fs_matcher->matcher_mask.match_params;
4071         match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v,
4072                                            outer_headers);
4073         match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c,
4074                                            outer_headers);
4075
4076         dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
4077                             dmac_47_16);
4078         dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
4079                                  dmac_47_16);
4080
4081         if (is_multicast_ether_addr(dmac) &&
4082             is_multicast_ether_addr(dmac_mask))
4083                 return true;
4084
4085         ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
4086                             dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
4087
4088         ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
4089                                  dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
4090
4091         if (ipv4_is_multicast(*(__be32 *)(ipv4)) &&
4092             ipv4_is_multicast(*(__be32 *)(ipv4_mask)))
4093                 return true;
4094
4095         return false;
4096 }
4097
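/*
 * Add a matcher-based (raw) flow rule. The destination is a TIR, a flow
 * table or the wire port (allow action), optionally followed by a flow
 * counter. The flow table priority is looked up (and created on first
 * use) under the flow_db lock before the rule is installed.
 */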
4098 struct mlx5_ib_flow_handler *
4099 mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
4100                         struct mlx5_ib_flow_matcher *fs_matcher,
4101                         struct mlx5_flow_context *flow_context,
4102                         struct mlx5_flow_act *flow_act,
4103                         u32 counter_id,
4104                         void *cmd_in, int inlen, int dest_id,
4105                         int dest_type)
4106 {
4107         struct mlx5_flow_destination *dst;
4108         struct mlx5_ib_flow_prio *ft_prio;
4109         struct mlx5_ib_flow_handler *handler;
4110         int dst_num = 0;
4111         bool mcast;
4112         int err;
4113
4114         if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL)
4115                 return ERR_PTR(-EOPNOTSUPP);
4116
4117         if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
4118                 return ERR_PTR(-ENOMEM);
4119
4120         dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
4121         if (!dst)
4122                 return ERR_PTR(-ENOMEM);
4123
4124         mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
4125         mutex_lock(&dev->flow_db->lock);
4126
4127         ft_prio = _get_flow_table(dev, fs_matcher, mcast);
4128         if (IS_ERR(ft_prio)) {
4129                 err = PTR_ERR(ft_prio);
4130                 goto unlock;
4131         }
4132
4133         if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
4134                 dst[dst_num].type = dest_type;
4135                 dst[dst_num].tir_num = dest_id;
4136                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
4137         } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
4138                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
4139                 dst[dst_num].ft_num = dest_id;
4140                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
4141         } else {
4142                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
4143                 flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
4144         }
4145
4146         dst_num++;
4147
4148         if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
4149                 dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
4150                 dst[dst_num].counter_id = counter_id;
4151                 dst_num++;
4152         }
4153
4154         handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
4155                                         flow_context, flow_act,
4156                                         cmd_in, inlen, dst_num);
4157
4158         if (IS_ERR(handler)) {
4159                 err = PTR_ERR(handler);
4160                 goto destroy_ft;
4161         }
4162
4163         mutex_unlock(&dev->flow_db->lock);
4164         atomic_inc(&fs_matcher->usecnt);
4165         handler->flow_matcher = fs_matcher;
4166
4167         kfree(dst);
4168
4169         return handler;
4170
4171 destroy_ft:
4172         put_flow_table(dev, ft_prio, false);
4173 unlock:
4174         mutex_unlock(&dev->flow_db->lock);
4175         kfree(dst);
4176
4177         return ERR_PTR(err);
4178 }
4179
4180 static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags)
4181 {
4182         u32 flags = 0;
4183
4184         if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)
4185                 flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA;
4186
4187         return flags;
4188 }
4189
4190 #define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED      MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
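/*
 * Create an IPsec ESP flow action: validate the requested attributes
 * against the supported subset, translate them into mlx5 accel xfrm
 * attributes and create the xfrm context in the accel layer.
 */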
4191 static struct ib_flow_action *
4192 mlx5_ib_create_flow_action_esp(struct ib_device *device,
4193                                const struct ib_flow_action_attrs_esp *attr,
4194                                struct uverbs_attr_bundle *attrs)
4195 {
4196         struct mlx5_ib_dev *mdev = to_mdev(device);
4197         struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm;
4198         struct mlx5_accel_esp_xfrm_attrs accel_attrs = {};
4199         struct mlx5_ib_flow_action *action;
4200         u64 action_flags;
4201         u64 flags;
4202         int err = 0;
4203
4204         err = uverbs_get_flags64(
4205                 &action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
4206                 ((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1));
4207         if (err)
4208                 return ERR_PTR(err);
4209
4210         flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags);
4211
4212         /* We currently support only a subset of the standard features: a
4213          * keymat of type AES_GCM with icv_len == 16, iv_algo == SEQ and ESN
4214          * (with overlap). Full offload mode isn't supported.
4215          */
4216         if (!attr->keymat || attr->replay || attr->encap ||
4217             attr->spi || attr->seq || attr->tfc_pad ||
4218             attr->hard_limit_pkts ||
4219             (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4220                              IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)))
4221                 return ERR_PTR(-EOPNOTSUPP);
4222
4223         if (attr->keymat->protocol !=
4224             IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM)
4225                 return ERR_PTR(-EOPNOTSUPP);
4226
4227         aes_gcm = &attr->keymat->keymat.aes_gcm;
4228
4229         if (aes_gcm->icv_len != 16 ||
4230             aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
4231                 return ERR_PTR(-EOPNOTSUPP);
4232
4233         action = kmalloc(sizeof(*action), GFP_KERNEL);
4234         if (!action)
4235                 return ERR_PTR(-ENOMEM);
4236
4237         action->esp_aes_gcm.ib_flags = attr->flags;
4238         memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key,
4239                sizeof(accel_attrs.keymat.aes_gcm.aes_key));
4240         accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8;
4241         memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt,
4242                sizeof(accel_attrs.keymat.aes_gcm.salt));
4243         memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv,
4244                sizeof(accel_attrs.keymat.aes_gcm.seq_iv));
4245         accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8;
4246         accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ;
4247         accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
4248
4249         accel_attrs.esn = attr->esn;
4250         if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED)
4251                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
4252         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
4253                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4254
4255         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)
4256                 accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT;
4257
4258         action->esp_aes_gcm.ctx =
4259                 mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags);
4260         if (IS_ERR(action->esp_aes_gcm.ctx)) {
4261                 err = PTR_ERR(action->esp_aes_gcm.ctx);
4262                 goto err_parse;
4263         }
4264
4265         action->esp_aes_gcm.ib_flags = attr->flags;
4266
4267         return &action->ib_action;
4268
4269 err_parse:
4270         kfree(action);
4271         return ERR_PTR(err);
4272 }
4273
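/*
 * Modify an existing ESP flow action. Only the ESN value and the ESN
 * window-overlap state may change; any other attribute is rejected.
 */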
4274 static int
4275 mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action,
4276                                const struct ib_flow_action_attrs_esp *attr,
4277                                struct uverbs_attr_bundle *attrs)
4278 {
4279         struct mlx5_ib_flow_action *maction = to_mflow_act(action);
4280         struct mlx5_accel_esp_xfrm_attrs accel_attrs;
4281         int err = 0;
4282
4283         if (attr->keymat || attr->replay || attr->encap ||
4284             attr->spi || attr->seq || attr->tfc_pad ||
4285             attr->hard_limit_pkts ||
4286             (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4287                              IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS |
4288                              IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)))
4289                 return -EOPNOTSUPP;
4290
4291         /* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can
4292          * be modified.
4293          */
4294         if (!(maction->esp_aes_gcm.ib_flags &
4295               IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) &&
4296             attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
4297                            IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))
4298                 return -EINVAL;
4299
4300         memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs,
4301                sizeof(accel_attrs));
4302
4303         accel_attrs.esn = attr->esn;
4304         if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
4305                 accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4306         else
4307                 accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
4308
4309         err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx,
4310                                          &accel_attrs);
4311         if (err)
4312                 return err;
4313
4314         maction->esp_aes_gcm.ib_flags &=
4315                 ~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
4316         maction->esp_aes_gcm.ib_flags |=
4317                 attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
4318
4319         return 0;
4320 }
4321
4322 static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action)
4323 {
4324         struct mlx5_ib_flow_action *maction = to_mflow_act(action);
4325
4326         switch (action->type) {
4327         case IB_FLOW_ACTION_ESP:
4328                 /*
4329                  * We only support aes_gcm for now, so we implicitly know this is
4330                  * the underlying crypto.
4331                  */
4332                 mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
4333                 break;
4334         case IB_FLOW_ACTION_UNSPECIFIED:
4335                 mlx5_ib_destroy_flow_action_raw(maction);
4336                 break;
4337         default:
4338                 WARN_ON(true);
4339                 break;
4340         }
4341
4342         kfree(maction);
4343         return 0;
4344 }
4345
4346 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
4347 {
4348         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
4349         struct mlx5_ib_qp *mqp = to_mqp(ibqp);
4350         int err;
4351         u16 uid;
4352
4353         uid = ibqp->pd ?
4354                 to_mpd(ibqp->pd)->uid : 0;
4355
4356         if (mqp->flags & MLX5_IB_QP_UNDERLAY) {
4357                 mlx5_ib_dbg(dev, "Attaching a multicast group to an underlay QP is not supported\n");
4358                 return -EOPNOTSUPP;
4359         }
4360
4361         err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
4362         if (err)
4363                 mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
4364                              ibqp->qp_num, gid->raw);
4365
4366         return err;
4367 }
4368
4369 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
4370 {
4371         struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
4372         int err;
4373         u16 uid;
4374
4375         uid = ibqp->pd ?
4376                 to_mpd(ibqp->pd)->uid : 0;
4377         err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
4378         if (err)
4379                 mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
4380                              ibqp->qp_num, gid->raw);
4381
4382         return err;
4383 }
4384
4385 static int init_node_data(struct mlx5_ib_dev *dev)
4386 {
4387         int err;
4388
4389         err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
4390         if (err)
4391                 return err;
4392
4393         dev->mdev->rev_id = dev->mdev->pdev->revision;
4394
4395         return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
4396 }
4397
4398 static ssize_t fw_pages_show(struct device *device,
4399                              struct device_attribute *attr, char *buf)
4400 {
4401         struct mlx5_ib_dev *dev =
4402                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4403
4404         return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
4405 }
4406 static DEVICE_ATTR_RO(fw_pages);
4407
4408 static ssize_t reg_pages_show(struct device *device,
4409                               struct device_attribute *attr, char *buf)
4410 {
4411         struct mlx5_ib_dev *dev =
4412                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4413
4414         return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
4415 }
4416 static DEVICE_ATTR_RO(reg_pages);
4417
4418 static ssize_t hca_type_show(struct device *device,
4419                              struct device_attribute *attr, char *buf)
4420 {
4421         struct mlx5_ib_dev *dev =
4422                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4423
4424         return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
4425 }
4426 static DEVICE_ATTR_RO(hca_type);
4427
4428 static ssize_t hw_rev_show(struct device *device,
4429                            struct device_attribute *attr, char *buf)
4430 {
4431         struct mlx5_ib_dev *dev =
4432                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4433
4434         return sprintf(buf, "%x\n", dev->mdev->rev_id);
4435 }
4436 static DEVICE_ATTR_RO(hw_rev);
4437
4438 static ssize_t board_id_show(struct device *device,
4439                              struct device_attribute *attr, char *buf)
4440 {
4441         struct mlx5_ib_dev *dev =
4442                 rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
4443
4444         return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
4445                        dev->mdev->board_id);
4446 }
4447 static DEVICE_ATTR_RO(board_id);
4448
4449 static struct attribute *mlx5_class_attributes[] = {
4450         &dev_attr_hw_rev.attr,
4451         &dev_attr_hca_type.attr,
4452         &dev_attr_board_id.attr,
4453         &dev_attr_fw_pages.attr,
4454         &dev_attr_reg_pages.attr,
4455         NULL,
4456 };
4457
4458 static const struct attribute_group mlx5_attr_group = {
4459         .attrs = mlx5_class_attributes,
4460 };
4461
4462 static void pkey_change_handler(struct work_struct *work)
4463 {
4464         struct mlx5_ib_port_resources *ports =
4465                 container_of(work, struct mlx5_ib_port_resources,
4466                              pkey_change_work);
4467
4468         mutex_lock(&ports->devr->mutex);
4469         mlx5_ib_gsi_pkey_change(ports->gsi);
4470         mutex_unlock(&ports->devr->mutex);
4471 }
4472
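/*
 * On a fatal device error, walk every QP on this ibdev, collect the
 * completion queues that still have outstanding work, and invoke the
 * completion handler of each collected CQ.
 */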
4473 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
4474 {
4475         struct mlx5_ib_qp *mqp;
4476         struct mlx5_ib_cq *send_mcq, *recv_mcq;
4477         struct mlx5_core_cq *mcq;
4478         struct list_head cq_armed_list;
4479         unsigned long flags_qp;
4480         unsigned long flags_cq;
4481         unsigned long flags;
4482
4483         INIT_LIST_HEAD(&cq_armed_list);
4484
4485         /* Go over the QP list residing on this ibdev, synced with QP create/destroy. */
4486         spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
4487         list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
4488                 spin_lock_irqsave(&mqp->sq.lock, flags_qp);
4489                 if (mqp->sq.tail != mqp->sq.head) {
4490                         send_mcq = to_mcq(mqp->ibqp.send_cq);
4491                         spin_lock_irqsave(&send_mcq->lock, flags_cq);
4492                         if (send_mcq->mcq.comp &&
4493                             mqp->ibqp.send_cq->comp_handler) {
4494                                 if (!send_mcq->mcq.reset_notify_added) {
4495                                         send_mcq->mcq.reset_notify_added = 1;
4496                                         list_add_tail(&send_mcq->mcq.reset_notify,
4497                                                       &cq_armed_list);
4498                                 }
4499                         }
4500                         spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
4501                 }
4502                 spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
4503                 spin_lock_irqsave(&mqp->rq.lock, flags_qp);
4504                 /* no handling is needed for SRQ */
4505                 if (!mqp->ibqp.srq) {
4506                         if (mqp->rq.tail != mqp->rq.head) {
4507                                 recv_mcq = to_mcq(mqp->ibqp.recv_cq);
4508                                 spin_lock_irqsave(&recv_mcq->lock, flags_cq);
4509                                 if (recv_mcq->mcq.comp &&
4510                                     mqp->ibqp.recv_cq->comp_handler) {
4511                                         if (!recv_mcq->mcq.reset_notify_added) {
4512                                                 recv_mcq->mcq.reset_notify_added = 1;
4513                                                 list_add_tail(&recv_mcq->mcq.reset_notify,
4514                                                               &cq_armed_list);
4515                                         }
4516                                 }
4517                                 spin_unlock_irqrestore(&recv_mcq->lock,
4518                                                        flags_cq);
4519                         }
4520                 }
4521                 spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
4522         }
4523         /* At this point all inflight post sends were put to be executed, since we
4524          * locked/unlocked the locks above. Now we need to arm all involved CQs.
4525          */
4526         list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
4527                 mcq->comp(mcq, NULL);
4528         }
4529         spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
4530 }
4531
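/*
 * Deferred work for a delay-drop timeout event: re-program the
 * configured delay-drop timeout via mlx5_core_set_delay_drop().
 */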
4532 static void delay_drop_handler(struct work_struct *work)
4533 {
4534         int err;
4535         struct mlx5_ib_delay_drop *delay_drop =
4536                 container_of(work, struct mlx5_ib_delay_drop,
4537                              delay_drop_work);
4538
4539         atomic_inc(&delay_drop->events_cnt);
4540
4541         mutex_lock(&delay_drop->lock);
4542         err = mlx5_core_set_delay_drop(delay_drop->dev->mdev,
4543                                        delay_drop->timeout);
4544         if (err) {
4545                 mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n",
4546                              delay_drop->timeout);
4547                 delay_drop->activate = false;
4548         }
4549         mutex_unlock(&delay_drop->lock);
4550 }
4551
4552 static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
4553                                  struct ib_event *ibev)
4554 {
4555         u8 port = (eqe->data.port.port >> 4) & 0xf;
4556
4557         switch (eqe->sub_type) {
4558         case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
4559                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
4560                                             IB_LINK_LAYER_ETHERNET)
4561                         schedule_work(&ibdev->delay_drop.delay_drop_work);
4562                 break;
4563         default: /* do nothing */
4564                 return;
4565         }
4566 }
4567
4568 static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
4569                               struct ib_event *ibev)
4570 {
4571         u8 port = (eqe->data.port.port >> 4) & 0xf;
4572
4573         ibev->element.port_num = port;
4574
4575         switch (eqe->sub_type) {
4576         case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
4577         case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
4578         case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
4579                 /* In RoCE, port up/down events are handled in
4580                  * mlx5_netdev_event().
4581                  */
4582                 if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
4583                                             IB_LINK_LAYER_ETHERNET)
4584                         return -EINVAL;
4585
4586                 ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
4587                                 IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
4588                 break;
4589
4590         case MLX5_PORT_CHANGE_SUBTYPE_LID:
4591                 ibev->event = IB_EVENT_LID_CHANGE;
4592                 break;
4593
4594         case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
4595                 ibev->event = IB_EVENT_PKEY_CHANGE;
4596                 schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
4597                 break;
4598
4599         case MLX5_PORT_CHANGE_SUBTYPE_GUID:
4600                 ibev->event = IB_EVENT_GID_CHANGE;
4601                 break;
4602
4603         case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
4604                 ibev->event = IB_EVENT_CLIENT_REREGISTER;
4605                 break;
4606         default:
4607                 return -EINVAL;
4608         }
4609
4610         return 0;
4611 }
4612
4613 static void mlx5_ib_handle_event(struct work_struct *_work)
4614 {
4615         struct mlx5_ib_event_work *work =
4616                 container_of(_work, struct mlx5_ib_event_work, work);
4617         struct mlx5_ib_dev *ibdev;
4618         struct ib_event ibev;
4619         bool fatal = false;
4620
4621         if (work->is_slave) {
4622                 ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
4623                 if (!ibdev)
4624                         goto out;
4625         } else {
4626                 ibdev = work->dev;
4627         }
4628
4629         switch (work->event) {
4630         case MLX5_DEV_EVENT_SYS_ERROR:
4631                 ibev.event = IB_EVENT_DEVICE_FATAL;
4632                 mlx5_ib_handle_internal_error(ibdev);
4633                 ibev.element.port_num  = (u8)(unsigned long)work->param;
4634                 fatal = true;
4635                 break;
4636         case MLX5_EVENT_TYPE_PORT_CHANGE:
4637                 if (handle_port_change(ibdev, work->param, &ibev))
4638                         goto out;
4639                 break;
4640         case MLX5_EVENT_TYPE_GENERAL_EVENT:
4641                 handle_general_event(ibdev, work->param, &ibev);
4642                 /* fall through */
4643         default:
4644                 goto out;
4645         }
4646
4647         ibev.device = &ibdev->ib_dev;
4648
4649         if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
4650                 mlx5_ib_warn(ibdev, "warning: event on port %d\n",  ibev.element.port_num);
4651                 goto out;
4652         }
4653
4654         if (ibdev->ib_active)
4655                 ib_dispatch_event(&ibev);
4656
4657         if (fatal)
4658                 ibdev->ib_active = false;
4659 out:
4660         kfree(work);
4661 }
4662
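/*
 * Notifier callback for mlx5 core device events. The event is queued to
 * mlx5_ib_event_wq and processed later in mlx5_ib_handle_event().
 */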
4663 static int mlx5_ib_event(struct notifier_block *nb,
4664                          unsigned long event, void *param)
4665 {
4666         struct mlx5_ib_event_work *work;
4667
4668         work = kmalloc(sizeof(*work), GFP_ATOMIC);
4669         if (!work)
4670                 return NOTIFY_DONE;
4671
4672         INIT_WORK(&work->work, mlx5_ib_handle_event);
4673         work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
4674         work->is_slave = false;
4675         work->param = param;
4676         work->event = event;
4677
4678         queue_work(mlx5_ib_event_wq, &work->work);
4679
4680         return NOTIFY_OK;
4681 }
4682
4683 static int mlx5_ib_event_slave_port(struct notifier_block *nb,
4684                                     unsigned long event, void *param)
4685 {
4686         struct mlx5_ib_event_work *work;
4687
4688         work = kmalloc(sizeof(*work), GFP_ATOMIC);
4689         if (!work)
4690                 return NOTIFY_DONE;
4691
4692         INIT_WORK(&work->work, mlx5_ib_handle_event);
4693         work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
4694         work->is_slave = true;
4695         work->param = param;
4696         work->event = event;
4697         queue_work(mlx5_ib_event_wq, &work->work);
4698
4699         return NOTIFY_OK;
4700 }
4701
4702 static int set_has_smi_cap(struct mlx5_ib_dev *dev)
4703 {
4704         struct mlx5_hca_vport_context vport_ctx;
4705         int err;
4706         int port;
4707
4708         for (port = 1; port <= ARRAY_SIZE(dev->mdev->port_caps); port++) {
4709                 dev->mdev->port_caps[port - 1].has_smi = false;
4710                 if (MLX5_CAP_GEN(dev->mdev, port_type) ==
4711                     MLX5_CAP_PORT_TYPE_IB) {
4712                         if (MLX5_CAP_GEN(dev->mdev, ib_virt)) {
4713                                 err = mlx5_query_hca_vport_context(dev->mdev, 0,
4714                                                                    port, 0,
4715                                                                    &vport_ctx);
4716                                 if (err) {
4717                                         mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
4718                                                     port, err);
4719                                         return err;
4720                                 }
4721                                 dev->mdev->port_caps[port - 1].has_smi =
4722                                         vport_ctx.has_smi;
4723                         } else {
4724                                 dev->mdev->port_caps[port - 1].has_smi = true;
4725                         }
4726                 }
4727         }
4728         return 0;
4729 }
4730
4731 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
4732 {
4733         int port;
4734
4735         for (port = 1; port <= dev->num_ports; port++)
4736                 mlx5_query_ext_port_caps(dev, port);
4737 }
4738
4739 static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
4740 {
4741         struct ib_device_attr *dprops = NULL;
4742         struct ib_port_attr *pprops = NULL;
4743         int err = -ENOMEM;
4744         struct ib_udata uhw = {.inlen = 0, .outlen = 0};
4745
4746         pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
4747         if (!pprops)
4748                 goto out;
4749
4750         dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
4751         if (!dprops)
4752                 goto out;
4753
4754         err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
4755         if (err) {
4756                 mlx5_ib_warn(dev, "query_device failed %d\n", err);
4757                 goto out;
4758         }
4759
4760         err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
4761         if (err) {
4762                 mlx5_ib_warn(dev, "query_port %d failed %d\n",
4763                              port, err);
4764                 goto out;
4765         }
4766
4767         dev->mdev->port_caps[port - 1].pkey_table_len =
4768                                         dprops->max_pkeys;
4769         dev->mdev->port_caps[port - 1].gid_table_len =
4770                                         pprops->gid_tbl_len;
4771         mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n",
4772                     port, dprops->max_pkeys, pprops->gid_tbl_len);
4773
4774 out:
4775         kfree(pprops);
4776         kfree(dprops);
4777
4778         return err;
4779 }
4780
4781 static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
4782 {
4783         /* For representors use port 1, as this is the only native
4784          * port.
4785          */
4786         if (dev->is_rep)
4787                 return __get_port_caps(dev, 1);
4788         return __get_port_caps(dev, port);
4789 }
4790
4791 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
4792 {
4793         int err;
4794
4795         err = mlx5_mr_cache_cleanup(dev);
4796         if (err)
4797                 mlx5_ib_warn(dev, "mr cache cleanup failed\n");
4798
4799         if (dev->umrc.qp)
4800                 mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
4801         if (dev->umrc.cq)
4802                 ib_free_cq(dev->umrc.cq);
4803         if (dev->umrc.pd)
4804                 ib_dealloc_pd(dev->umrc.pd);
4805 }
4806
4807 enum {
4808         MAX_UMR_WR = 128,
4809 };
4810
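/*
 * Create the resources used for UMR work requests: a dedicated PD, CQ
 * and a MLX5_IB_QPT_REG_UMR QP which is moved to RTS here, plus the MR
 * cache.
 */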
4811 static int create_umr_res(struct mlx5_ib_dev *dev)
4812 {
4813         struct ib_qp_init_attr *init_attr = NULL;
4814         struct ib_qp_attr *attr = NULL;
4815         struct ib_pd *pd;
4816         struct ib_cq *cq;
4817         struct ib_qp *qp;
4818         int ret;
4819
4820         attr = kzalloc(sizeof(*attr), GFP_KERNEL);
4821         init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
4822         if (!attr || !init_attr) {
4823                 ret = -ENOMEM;
4824                 goto error_0;
4825         }
4826
4827         pd = ib_alloc_pd(&dev->ib_dev, 0);
4828         if (IS_ERR(pd)) {
4829                 mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
4830                 ret = PTR_ERR(pd);
4831                 goto error_0;
4832         }
4833
4834         cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
4835         if (IS_ERR(cq)) {
4836                 mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
4837                 ret = PTR_ERR(cq);
4838                 goto error_2;
4839         }
4840
4841         init_attr->send_cq = cq;
4842         init_attr->recv_cq = cq;
4843         init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
4844         init_attr->cap.max_send_wr = MAX_UMR_WR;
4845         init_attr->cap.max_send_sge = 1;
4846         init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
4847         init_attr->port_num = 1;
4848         qp = mlx5_ib_create_qp(pd, init_attr, NULL);
4849         if (IS_ERR(qp)) {
4850                 mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
4851                 ret = PTR_ERR(qp);
4852                 goto error_3;
4853         }
4854         qp->device     = &dev->ib_dev;
4855         qp->real_qp    = qp;
4856         qp->uobject    = NULL;
4857         qp->qp_type    = MLX5_IB_QPT_REG_UMR;
4858         qp->send_cq    = init_attr->send_cq;
4859         qp->recv_cq    = init_attr->recv_cq;
4860
4861         attr->qp_state = IB_QPS_INIT;
4862         attr->port_num = 1;
4863         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
4864                                 IB_QP_PORT, NULL);
4865         if (ret) {
4866                 mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
4867                 goto error_4;
4868         }
4869
4870         memset(attr, 0, sizeof(*attr));
4871         attr->qp_state = IB_QPS_RTR;
4872         attr->path_mtu = IB_MTU_256;
4873
4874         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4875         if (ret) {
4876                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
4877                 goto error_4;
4878         }
4879
4880         memset(attr, 0, sizeof(*attr));
4881         attr->qp_state = IB_QPS_RTS;
4882         ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
4883         if (ret) {
4884                 mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
4885                 goto error_4;
4886         }
4887
4888         dev->umrc.qp = qp;
4889         dev->umrc.cq = cq;
4890         dev->umrc.pd = pd;
4891
4892         sema_init(&dev->umrc.sem, MAX_UMR_WR);
4893         ret = mlx5_mr_cache_init(dev);
4894         if (ret) {
4895                 mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
4896                 goto error_4;
4897         }
4898
4899         kfree(attr);
4900         kfree(init_attr);
4901
4902         return 0;
4903
4904 error_4:
4905         mlx5_ib_destroy_qp(qp, NULL);
4906         dev->umrc.qp = NULL;
4907
4908 error_3:
4909         ib_free_cq(cq);
4910         dev->umrc.cq = NULL;
4911
4912 error_2:
4913         ib_dealloc_pd(pd);
4914         dev->umrc.pd = NULL;
4915
4916 error_0:
4917         kfree(attr);
4918         kfree(init_attr);
4919         return ret;
4920 }
4921
4922 static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
4923 {
4924         switch (umr_fence_cap) {
4925         case MLX5_CAP_UMR_FENCE_NONE:
4926                 return MLX5_FENCE_MODE_NONE;
4927         case MLX5_CAP_UMR_FENCE_SMALL:
4928                 return MLX5_FENCE_MODE_INITIATOR_SMALL;
4929         default:
4930                 return MLX5_FENCE_MODE_STRONG_ORDERING;
4931         }
4932 }
4933
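/*
 * Allocate the device-wide resources used internally by the driver:
 * PD p0, CQ c0, XRC domains x0/x1, an XRC SRQ s0 and a basic SRQ s1,
 * and set up the per-port P_Key change work.
 */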
4934 static int create_dev_resources(struct mlx5_ib_resources *devr)
4935 {
4936         struct ib_srq_init_attr attr;
4937         struct mlx5_ib_dev *dev;
4938         struct ib_device *ibdev;
4939         struct ib_cq_init_attr cq_attr = {.cqe = 1};
4940         int port;
4941         int ret = 0;
4942
4943         dev = container_of(devr, struct mlx5_ib_dev, devr);
4944         ibdev = &dev->ib_dev;
4945
4946         mutex_init(&devr->mutex);
4947
4948         devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
4949         if (!devr->p0)
4950                 return -ENOMEM;
4951
4952         devr->p0->device  = ibdev;
4953         devr->p0->uobject = NULL;
4954         atomic_set(&devr->p0->usecnt, 0);
4955
4956         ret = mlx5_ib_alloc_pd(devr->p0, NULL);
4957         if (ret)
4958                 goto error0;
4959
4960         devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq);
4961         if (!devr->c0) {
4962                 ret = -ENOMEM;
4963                 goto error1;
4964         }
4965
4966         devr->c0->device = &dev->ib_dev;
4967         atomic_set(&devr->c0->usecnt, 0);
4968
4969         ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL);
4970         if (ret)
4971                 goto err_create_cq;
4972
4973         devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
4974         if (IS_ERR(devr->x0)) {
4975                 ret = PTR_ERR(devr->x0);
4976                 goto error2;
4977         }
4978         devr->x0->device = &dev->ib_dev;
4979         devr->x0->inode = NULL;
4980         atomic_set(&devr->x0->usecnt, 0);
4981         mutex_init(&devr->x0->tgt_qp_mutex);
4982         INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
4983
4984         devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
4985         if (IS_ERR(devr->x1)) {
4986                 ret = PTR_ERR(devr->x1);
4987                 goto error3;
4988         }
4989         devr->x1->device = &dev->ib_dev;
4990         devr->x1->inode = NULL;
4991         atomic_set(&devr->x1->usecnt, 0);
4992         mutex_init(&devr->x1->tgt_qp_mutex);
4993         INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
4994
4995         memset(&attr, 0, sizeof(attr));
4996         attr.attr.max_sge = 1;
4997         attr.attr.max_wr = 1;
4998         attr.srq_type = IB_SRQT_XRC;
4999         attr.ext.cq = devr->c0;
5000         attr.ext.xrc.xrcd = devr->x0;
5001
5002         devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
5003         if (!devr->s0) {
5004                 ret = -ENOMEM;
5005                 goto error4;
5006         }
5007
5008         devr->s0->device        = &dev->ib_dev;
5009         devr->s0->pd            = devr->p0;
5010         devr->s0->srq_type      = IB_SRQT_XRC;
5011         devr->s0->ext.xrc.xrcd  = devr->x0;
5012         devr->s0->ext.cq        = devr->c0;
5013         ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
5014         if (ret)
5015                 goto err_create;
5016
5017         atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
5018         atomic_inc(&devr->s0->ext.cq->usecnt);
5019         atomic_inc(&devr->p0->usecnt);
5020         atomic_set(&devr->s0->usecnt, 0);
5021
5022         memset(&attr, 0, sizeof(attr));
5023         attr.attr.max_sge = 1;
5024         attr.attr.max_wr = 1;
5025         attr.srq_type = IB_SRQT_BASIC;
5026         devr->s1 = rdma_zalloc_drv_obj(ibdev, ib_srq);
5027         if (!devr->s1) {
5028                 ret = -ENOMEM;
5029                 goto error5;
5030         }
5031
5032         devr->s1->device        = &dev->ib_dev;
5033         devr->s1->pd            = devr->p0;
5034         devr->s1->srq_type      = IB_SRQT_BASIC;
5035         devr->s1->ext.cq        = devr->c0;
5036
5037         ret = mlx5_ib_create_srq(devr->s1, &attr, NULL);
5038         if (ret)
5039                 goto error6;
5040
5041         atomic_inc(&devr->p0->usecnt);
5042         atomic_set(&devr->s1->usecnt, 0);
5043
5044         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
5045                 INIT_WORK(&devr->ports[port].pkey_change_work,
5046                           pkey_change_handler);
5047                 devr->ports[port].devr = devr;
5048         }
5049
5050         return 0;
5051
5052 error6:
5053         kfree(devr->s1);
5054 error5:
5055         mlx5_ib_destroy_srq(devr->s0, NULL);
5056 err_create:
5057         kfree(devr->s0);
5058 error4:
5059         mlx5_ib_dealloc_xrcd(devr->x1, NULL);
5060 error3:
5061         mlx5_ib_dealloc_xrcd(devr->x0, NULL);
5062 error2:
5063         mlx5_ib_destroy_cq(devr->c0, NULL);
5064 err_create_cq:
5065         kfree(devr->c0);
5066 error1:
5067         mlx5_ib_dealloc_pd(devr->p0, NULL);
5068 error0:
5069         kfree(devr->p0);
5070         return ret;
5071 }
5072
5073 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
5074 {
5075         int port;
5076
5077         mlx5_ib_destroy_srq(devr->s1, NULL);
5078         kfree(devr->s1);
5079         mlx5_ib_destroy_srq(devr->s0, NULL);
5080         kfree(devr->s0);
5081         mlx5_ib_dealloc_xrcd(devr->x0, NULL);
5082         mlx5_ib_dealloc_xrcd(devr->x1, NULL);
5083         mlx5_ib_destroy_cq(devr->c0, NULL);
5084         kfree(devr->c0);
5085         mlx5_ib_dealloc_pd(devr->p0, NULL);
5086         kfree(devr->p0);
5087
5088         /* Make sure no P_Key change work items are still executing */
5089         for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
5090                 cancel_work_sync(&devr->ports[port].pkey_change_work);
5091 }
5092
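/*
 * Compute the RDMA core port capability flags from the port link layer
 * and the device's RoCE and raw packet capabilities.
 */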
5093 static u32 get_core_cap_flags(struct ib_device *ibdev,
5094                               struct mlx5_hca_vport_context *rep)
5095 {
5096         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5097         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
5098         u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
5099         u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
5100         bool raw_support = !mlx5_core_mp_enabled(dev->mdev);
5101         u32 ret = 0;
5102
5103         if (rep->grh_required)
5104                 ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED;
5105
5106         if (ll == IB_LINK_LAYER_INFINIBAND)
5107                 return ret | RDMA_CORE_PORT_IBA_IB;
5108
5109         if (raw_support)
5110                 ret |= RDMA_CORE_PORT_RAW_PACKET;
5111
5112         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
5113                 return ret;
5114
5115         if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
5116                 return ret;
5117
5118         if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
5119                 ret |= RDMA_CORE_PORT_IBA_ROCE;
5120
5121         if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
5122                 ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
5123
5124         return ret;
5125 }
5126
5127 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
5128                                struct ib_port_immutable *immutable)
5129 {
5130         struct ib_port_attr attr;
5131         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5132         enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
5133         struct mlx5_hca_vport_context rep = {0};
5134         int err;
5135
5136         err = ib_query_port(ibdev, port_num, &attr);
5137         if (err)
5138                 return err;
5139
5140         if (ll == IB_LINK_LAYER_INFINIBAND) {
5141                 err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
5142                                                    &rep);
5143                 if (err)
5144                         return err;
5145         }
5146
5147         immutable->pkey_tbl_len = attr.pkey_tbl_len;
5148         immutable->gid_tbl_len = attr.gid_tbl_len;
5149         immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
5150         if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
5151                 immutable->max_mad_size = IB_MGMT_MAD_SIZE;
5152
5153         return 0;
5154 }
5155
5156 static int mlx5_port_rep_immutable(struct ib_device *ibdev, u8 port_num,
5157                                    struct ib_port_immutable *immutable)
5158 {
5159         struct ib_port_attr attr;
5160         int err;
5161
5162         immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
5163
5164         err = ib_query_port(ibdev, port_num, &attr);
5165         if (err)
5166                 return err;
5167
5168         immutable->pkey_tbl_len = attr.pkey_tbl_len;
5169         immutable->gid_tbl_len = attr.gid_tbl_len;
5170         immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
5171
5172         return 0;
5173 }
5174
5175 static void get_dev_fw_str(struct ib_device *ibdev, char *str)
5176 {
5177         struct mlx5_ib_dev *dev =
5178                 container_of(ibdev, struct mlx5_ib_dev, ib_dev);
5179         snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d",
5180                  fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev),
5181                  fw_rev_sub(dev->mdev));
5182 }
5183
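/*
 * When RoCE LAG is active, create the vport LAG and its demux flow
 * table; lag_active is set so cleanup knows to undo this.
 */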
5184 static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
5185 {
5186         struct mlx5_core_dev *mdev = dev->mdev;
5187         struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
5188                                                                  MLX5_FLOW_NAMESPACE_LAG);
5189         struct mlx5_flow_table *ft;
5190         int err;
5191
5192         if (!ns || !mlx5_lag_is_roce(mdev))
5193                 return 0;
5194
5195         err = mlx5_cmd_create_vport_lag(mdev);
5196         if (err)
5197                 return err;
5198
5199         ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
5200         if (IS_ERR(ft)) {
5201                 err = PTR_ERR(ft);
5202                 goto err_destroy_vport_lag;
5203         }
5204
5205         dev->flow_db->lag_demux_ft = ft;
5206         dev->lag_active = true;
5207         return 0;
5208
5209 err_destroy_vport_lag:
5210         mlx5_cmd_destroy_vport_lag(mdev);
5211         return err;
5212 }
5213
5214 static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
5215 {
5216         struct mlx5_core_dev *mdev = dev->mdev;
5217
5218         if (dev->lag_active) {
5219                 dev->lag_active = false;
5220
5221                 mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
5222                 dev->flow_db->lag_demux_ft = NULL;
5223
5224                 mlx5_cmd_destroy_vport_lag(mdev);
5225         }
5226 }
5227
5228 static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
5229 {
5230         int err;
5231
5232         dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event;
5233         err = register_netdevice_notifier(&dev->port[port_num].roce.nb);
5234         if (err) {
5235                 dev->port[port_num].roce.nb.notifier_call = NULL;
5236                 return err;
5237         }
5238
5239         return 0;
5240 }
5241
5242 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
5243 {
5244         if (dev->port[port_num].roce.nb.notifier_call) {
5245                 unregister_netdevice_notifier(&dev->port[port_num].roce.nb);
5246                 dev->port[port_num].roce.nb.notifier_call = NULL;
5247         }
5248 }
5249
5250 static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
5251 {
5252         int err;
5253
5254         if (MLX5_CAP_GEN(dev->mdev, roce)) {
5255                 err = mlx5_nic_vport_enable_roce(dev->mdev);
5256                 if (err)
5257                         return err;
5258         }
5259
5260         err = mlx5_eth_lag_init(dev);
5261         if (err)
5262                 goto err_disable_roce;
5263
5264         return 0;
5265
5266 err_disable_roce:
5267         if (MLX5_CAP_GEN(dev->mdev, roce))
5268                 mlx5_nic_vport_disable_roce(dev->mdev);
5269
5270         return err;
5271 }
5272
5273 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
5274 {
5275         mlx5_eth_lag_cleanup(dev);
5276         if (MLX5_CAP_GEN(dev->mdev, roce))
5277                 mlx5_nic_vport_disable_roce(dev->mdev);
5278 }
5279
5280 struct mlx5_ib_counter {
5281         const char *name;
5282         size_t offset;
5283 };
5284
5285 #define INIT_Q_COUNTER(_name)           \
5286         { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
5287
5288 static const struct mlx5_ib_counter basic_q_cnts[] = {
5289         INIT_Q_COUNTER(rx_write_requests),
5290         INIT_Q_COUNTER(rx_read_requests),
5291         INIT_Q_COUNTER(rx_atomic_requests),
5292         INIT_Q_COUNTER(out_of_buffer),
5293 };
5294
5295 static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
5296         INIT_Q_COUNTER(out_of_sequence),
5297 };
5298
5299 static const struct mlx5_ib_counter retrans_q_cnts[] = {
5300         INIT_Q_COUNTER(duplicate_request),
5301         INIT_Q_COUNTER(rnr_nak_retry_err),
5302         INIT_Q_COUNTER(packet_seq_err),
5303         INIT_Q_COUNTER(implied_nak_seq_err),
5304         INIT_Q_COUNTER(local_ack_timeout_err),
5305 };
5306
5307 #define INIT_CONG_COUNTER(_name)                \
5308         { .name = #_name, .offset =     \
5309                 MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
5310
5311 static const struct mlx5_ib_counter cong_cnts[] = {
5312         INIT_CONG_COUNTER(rp_cnp_ignored),
5313         INIT_CONG_COUNTER(rp_cnp_handled),
5314         INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
5315         INIT_CONG_COUNTER(np_cnp_sent),
5316 };
5317
5318 static const struct mlx5_ib_counter extended_err_cnts[] = {
5319         INIT_Q_COUNTER(resp_local_length_error),
5320         INIT_Q_COUNTER(resp_cqe_error),
5321         INIT_Q_COUNTER(req_cqe_error),
5322         INIT_Q_COUNTER(req_remote_invalid_request),
5323         INIT_Q_COUNTER(req_remote_access_errors),
5324         INIT_Q_COUNTER(resp_remote_access_errors),
5325         INIT_Q_COUNTER(resp_cqe_flush_error),
5326         INIT_Q_COUNTER(req_cqe_flush_error),
5327 };
5328
5329 #define INIT_EXT_PPCNT_COUNTER(_name)           \
5330         { .name = #_name, .offset =     \
5331         MLX5_BYTE_OFF(ppcnt_reg, \
5332                       counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
5333
5334 static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
5335         INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
5336 };
5337
5338 static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
5339 {
5340         return MLX5_ESWITCH_MANAGER(mdev) &&
5341                mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
5342                        MLX5_ESWITCH_OFFLOADS;
5343 }
5344
5345 static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
5346 {
5347         int num_cnt_ports;
5348         int i;
5349
5350         num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
5351
5352         for (i = 0; i < num_cnt_ports; i++) {
5353                 if (dev->port[i].cnts.set_id_valid)
5354                         mlx5_core_dealloc_q_counter(dev->mdev,
5355                                                     dev->port[i].cnts.set_id);
5356                 kfree(dev->port[i].cnts.names);
5357                 kfree(dev->port[i].cnts.offsets);
5358         }
5359 }
5360
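/*
 * Size the counter set according to the device capabilities (out of
 * sequence, retransmission, enhanced error, congestion and extended
 * PPCNT counters) and allocate the names/offsets arrays.
 */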
5361 static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
5362                                     struct mlx5_ib_counters *cnts)
5363 {
5364         u32 num_counters;
5365
5366         num_counters = ARRAY_SIZE(basic_q_cnts);
5367
5368         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
5369                 num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
5370
5371         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
5372                 num_counters += ARRAY_SIZE(retrans_q_cnts);
5373
5374         if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
5375                 num_counters += ARRAY_SIZE(extended_err_cnts);
5376
5377         cnts->num_q_counters = num_counters;
5378
5379         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5380                 cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
5381                 num_counters += ARRAY_SIZE(cong_cnts);
5382         }
5383         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5384                 cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
5385                 num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
5386         }
5387         cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL);
5388         if (!cnts->names)
5389                 return -ENOMEM;
5390
5391         cnts->offsets = kcalloc(num_counters,
5392                                 sizeof(cnts->offsets), GFP_KERNEL);
5393         if (!cnts->offsets)
5394                 goto err_names;
5395
5396         return 0;
5397
5398 err_names:
5399         kfree(cnts->names);
5400         cnts->names = NULL;
5401         return -ENOMEM;
5402 }
5403
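/*
 * Fill the names/offsets arrays in the same capability-dependent order
 * used by __mlx5_ib_alloc_counters() when sizing them.
 */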
5404 static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
5405                                   const char **names,
5406                                   size_t *offsets)
5407 {
5408         int i;
5409         int j = 0;
5410
5411         for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
5412                 names[j] = basic_q_cnts[i].name;
5413                 offsets[j] = basic_q_cnts[i].offset;
5414         }
5415
5416         if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
5417                 for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
5418                         names[j] = out_of_seq_q_cnts[i].name;
5419                         offsets[j] = out_of_seq_q_cnts[i].offset;
5420                 }
5421         }
5422
5423         if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
5424                 for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
5425                         names[j] = retrans_q_cnts[i].name;
5426                         offsets[j] = retrans_q_cnts[i].offset;
5427                 }
5428         }
5429
5430         if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
5431                 for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
5432                         names[j] = extended_err_cnts[i].name;
5433                         offsets[j] = extended_err_cnts[i].offset;
5434                 }
5435         }
5436
5437         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5438                 for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
5439                         names[j] = cong_cnts[i].name;
5440                         offsets[j] = cong_cnts[i].offset;
5441                 }
5442         }
5443
5444         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5445                 for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
5446                         names[j] = ext_ppcnt_cnts[i].name;
5447                         offsets[j] = ext_ppcnt_cnts[i].offset;
5448                 }
5449         }
5450 }
5451
5452 static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
5453 {
5454         int num_cnt_ports;
5455         int err = 0;
5456         int i;
5457         bool is_shared;
5458
5459         is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
5460         num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
5461
5462         for (i = 0; i < num_cnt_ports; i++) {
5463                 err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
5464                 if (err)
5465                         goto err_alloc;
5466
5467                 mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
5468                                       dev->port[i].cnts.offsets);
5469
5470                 err = mlx5_cmd_alloc_q_counter(dev->mdev,
5471                                                &dev->port[i].cnts.set_id,
5472                                                is_shared ?
5473                                                MLX5_SHARED_RESOURCE_UID : 0);
5474                 if (err) {
5475                         mlx5_ib_warn(dev,
5476                                      "couldn't allocate queue counter for port %d, err %d\n",
5477                                      i + 1, err);
5478                         goto err_alloc;
5479                 }
5480                 dev->port[i].cnts.set_id_valid = true;
5481         }
5482         return 0;
5483
5484 err_alloc:
5485         mlx5_ib_dealloc_counters(dev);
5486         return err;
5487 }
5488
5489 static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
5490                                                    u8 port_num)
5491 {
5492         return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
5493                                                    &dev->port[port_num].cnts;
5494 }
5495
5496 /**
5497  * mlx5_ib_get_counters_id - Returns counters id to use for device+port
5498  * @dev:        Pointer to mlx5 IB device
5499  * @port_num:   Zero based port number
5500  *
5501  * mlx5_ib_get_counters_id() returns the counter set id to use for the
5502  * given device/port combination, in both switchdev and non-switchdev
5503  * modes of the parent device.
5504  */
5505 u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
5506 {
5507         const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
5508
5509         return cnts->set_id;
5510 }
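/*
 * Usage sketch (hypothetical caller, for illustration only; the real callers
 * are elsewhere in the driver): the returned set id is what gets programmed
 * into the hardware context so that the device accounts the object against
 * the per-port counter set, e.g.:
 *
 *	u16 set_id = mlx5_ib_get_counters_id(dev, port_num - 1);
 *
 *	MLX5_SET(qpc, qpc, counter_set_id, set_id);
 */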
5511
5512 static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
5513                                                     u8 port_num)
5514 {
5515         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5516         const struct mlx5_ib_counters *cnts;
5517         bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
5518
5519         if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
5520                 return NULL;
5521
5522         cnts = get_counters(dev, port_num - 1);
5523
5524         return rdma_alloc_hw_stats_struct(cnts->names,
5525                                           cnts->num_q_counters +
5526                                           cnts->num_cong_counters +
5527                                           cnts->num_ext_ppcnt_counters,
5528                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
5529 }
5530
5531 static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
5532                                     const struct mlx5_ib_counters *cnts,
5533                                     struct rdma_hw_stats *stats,
5534                                     u16 set_id)
5535 {
5536         int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
5537         void *out;
5538         __be32 val;
5539         int ret, i;
5540
5541         out = kvzalloc(outlen, GFP_KERNEL);
5542         if (!out)
5543                 return -ENOMEM;
5544
5545         ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen);
5546         if (ret)
5547                 goto free;
5548
5549         for (i = 0; i < cnts->num_q_counters; i++) {
5550                 val = *(__be32 *)(out + cnts->offsets[i]);
5551                 stats->value[i] = (u64)be32_to_cpu(val);
5552         }
5553
5554 free:
5555         kvfree(out);
5556         return ret;
5557 }
5558
5559 static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
5560                                             const struct mlx5_ib_counters *cnts,
5561                                             struct rdma_hw_stats *stats)
5562 {
5563         int offset = cnts->num_q_counters + cnts->num_cong_counters;
5564         int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
5565         int ret, i;
5566         void *out;
5567
5568         out = kvzalloc(sz, GFP_KERNEL);
5569         if (!out)
5570                 return -ENOMEM;
5571
5572         ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out);
5573         if (ret)
5574                 goto free;
5575
5576         for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
5577                 stats->value[i + offset] =
5578                         be64_to_cpup((__be64 *)(out +
5579                                     cnts->offsets[i + offset]));
5580 free:
5581         kvfree(out);
5582         return ret;
5583 }
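/*
 * Both query helpers above share the same offsets table: the first
 * num_q_counters entries are byte offsets into the query_q_counter command
 * output (32-bit big-endian fields), while the extended PPCNT entries,
 * starting at num_q_counters + num_cong_counters, are byte offsets into the
 * PPCNT register output (64-bit big-endian fields).
 */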
5584
5585 static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
5586                                 struct rdma_hw_stats *stats,
5587                                 u8 port_num, int index)
5588 {
5589         struct mlx5_ib_dev *dev = to_mdev(ibdev);
5590         const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
5591         struct mlx5_core_dev *mdev;
5592         int ret, num_counters;
5593         u8 mdev_port_num;
5594
5595         if (!stats)
5596                 return -EINVAL;
5597
5598         num_counters = cnts->num_q_counters +
5599                        cnts->num_cong_counters +
5600                        cnts->num_ext_ppcnt_counters;
5601
5602         /* q_counters are per IB device, query the master mdev */
5603         ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
5604         if (ret)
5605                 return ret;
5606
5607         if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
5608                 ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
5609                 if (ret)
5610                         return ret;
5611         }
5612
5613         if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
5614                 mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
5615                                                     &mdev_port_num);
5616                 if (!mdev) {
5617                         /* If the port is not affiliated yet, it is in the
5618                          * down state and has no counters yet, so the values
5619                          * would read as zero; no need to query the HCA.
5620                          */
5621                         goto done;
5622                 }
5623                 ret = mlx5_lag_query_cong_counters(dev->mdev,
5624                                                    stats->value +
5625                                                    cnts->num_q_counters,
5626                                                    cnts->num_cong_counters,
5627                                                    cnts->offsets +
5628                                                    cnts->num_q_counters);
5629
5630                 mlx5_ib_put_native_port_mdev(dev, port_num);
5631                 if (ret)
5632                         return ret;
5633         }
5634
5635 done:
5636         return num_counters;
5637 }
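/*
 * Per the get_hw_stats convention in ib_verbs, a positive return value is the
 * number of entries filled into stats->value and a negative value is an
 * error, which is why the success paths above return num_counters rather
 * than zero.
 */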
5638
5639 static struct rdma_hw_stats *
5640 mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
5641 {
5642         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5643         const struct mlx5_ib_counters *cnts =
5644                 get_counters(dev, counter->port - 1);
5645
5646         /* Q counters sit at the beginning of the counter set */
5647         return rdma_alloc_hw_stats_struct(cnts->names,
5648                                           cnts->num_q_counters,
5649                                           RDMA_HW_STATS_DEFAULT_LIFESPAN);
5650 }
5651
5652 static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
5653 {
5654         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5655         const struct mlx5_ib_counters *cnts =
5656                 get_counters(dev, counter->port - 1);
5657
5658         return mlx5_ib_query_q_counters(dev->mdev, cnts,
5659                                         counter->stats, counter->id);
5660 }
5661
5662 static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
5663                                    struct ib_qp *qp)
5664 {
5665         struct mlx5_ib_dev *dev = to_mdev(qp->device);
5666         u16 cnt_set_id = 0;
5667         int err;
5668
5669         if (!counter->id) {
5670                 err = mlx5_cmd_alloc_q_counter(dev->mdev,
5671                                                &cnt_set_id,
5672                                                MLX5_SHARED_RESOURCE_UID);
5673                 if (err)
5674                         return err;
5675                 counter->id = cnt_set_id;
5676         }
5677
5678         err = mlx5_ib_qp_set_counter(qp, counter);
5679         if (err)
5680                 goto fail_set_counter;
5681
5682         return 0;
5683
5684 fail_set_counter:
5685         mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id);
5686         counter->id = 0;
5687
5688         return err;
5689 }
5690
5691 static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
5692 {
5693         return mlx5_ib_qp_set_counter(qp, NULL);
5694 }
5695
5696 static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
5697 {
5698         struct mlx5_ib_dev *dev = to_mdev(counter->device);
5699
5700         return mlx5_core_dealloc_q_counter(dev->mdev, counter->id);
5701 }
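/*
 * Rough lifecycle of the counter ops above, as driven by the rdma counter
 * core (illustrative ordering only, the exact sequence is decided by the
 * core):
 *
 *	counter_alloc_stats()   -> mlx5_ib_counter_alloc_stats()
 *	counter_bind_qp()       -> mlx5_ib_counter_bind_qp() (allocates a
 *	                           shared q counter on the first bind)
 *	counter_update_stats()  -> mlx5_ib_counter_update_stats()
 *	counter_unbind_qp()     -> mlx5_ib_counter_unbind_qp()
 *	counter_dealloc()       -> mlx5_ib_counter_dealloc()
 */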
5702
5703 static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
5704                                  enum rdma_netdev_t type,
5705                                  struct rdma_netdev_alloc_params *params)
5706 {
5707         if (type != RDMA_NETDEV_IPOIB)
5708                 return -EOPNOTSUPP;
5709
5710         return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
5711 }
5712
5713 static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
5714 {
5715         if (!dev->delay_drop.dbg)
5716                 return;
5717         debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs);
5718         kfree(dev->delay_drop.dbg);
5719         dev->delay_drop.dbg = NULL;
5720 }
5721
5722 static void cancel_delay_drop(struct mlx5_ib_dev *dev)
5723 {
5724         if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
5725                 return;
5726
5727         cancel_work_sync(&dev->delay_drop.delay_drop_work);
5728         delay_drop_debugfs_cleanup(dev);
5729 }
5730
5731 static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
5732                                        size_t count, loff_t *pos)
5733 {
5734         struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
5735         char lbuf[20];
5736         int len;
5737
5738         len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout);
5739         return simple_read_from_buffer(buf, count, pos, lbuf, len);
5740 }
5741
5742 static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf,
5743                                         size_t count, loff_t *pos)
5744 {
5745         struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
5746         u32 timeout;
5747         u32 var;
5748
5749         if (kstrtouint_from_user(buf, count, 0, &var))
5750                 return -EFAULT;
5751
5752         timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS *
5753                         1000);
5754         if (timeout != var)
5755                 mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n",
5756                             timeout);
5757
5758         delay_drop->timeout = timeout;
5759
5760         return count;
5761 }
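/*
 * The "timeout" debugfs file created below takes the delay-drop timeout in
 * microseconds; the write handler above rounds the value up to a multiple of
 * 100 and clamps it to MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000. It typically
 * appears under the device's mlx5 core debugfs directory (for example
 * /sys/kernel/debug/mlx5/<device>/delay_drop/timeout, assuming debugfs is
 * mounted there).
 */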
5762
5763 static const struct file_operations fops_delay_drop_timeout = {
5764         .owner  = THIS_MODULE,
5765         .open   = simple_open,
5766         .write  = delay_drop_timeout_write,
5767         .read   = delay_drop_timeout_read,
5768 };
5769
5770 static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
5771 {
5772         struct mlx5_ib_dbg_delay_drop *dbg;
5773
5774         if (!mlx5_debugfs_root)
5775                 return 0;
5776
5777         dbg = kzalloc(sizeof(*dbg), GFP_KERNEL);
5778         if (!dbg)
5779                 return -ENOMEM;
5780
5781         dev->delay_drop.dbg = dbg;
5782
5783         dbg->dir_debugfs =
5784                 debugfs_create_dir("delay_drop",
5785                                    dev->mdev->priv.dbg_root);
5786         if (!dbg->dir_debugfs)
5787                 goto out_debugfs;
5788
5789         dbg->events_cnt_debugfs =
5790                 debugfs_create_atomic_t("num_timeout_events", 0400,
5791                                         dbg->dir_debugfs,
5792                                         &dev->delay_drop.events_cnt);
5793         if (!dbg->events_cnt_debugfs)
5794                 goto out_debugfs;
5795
5796         dbg->rqs_cnt_debugfs =
5797                 debugfs_create_atomic_t("num_rqs", 0400,
5798                                         dbg->dir_debugfs,
5799                                         &dev->delay_drop.rqs_cnt);
5800         if (!dbg->rqs_cnt_debugfs)
5801                 goto out_debugfs;
5802
5803         dbg->timeout_debugfs =
5804                 debugfs_create_file("timeout", 0600,
5805                                     dbg->dir_debugfs,
5806                                     &dev->delay_drop,
5807                                     &fops_delay_drop_timeout);
5808         if (!dbg->timeout_debugfs)
5809                 goto out_debugfs;
5810
5811         return 0;
5812
5813 out_debugfs:
5814         delay_drop_debugfs_cleanup(dev);
5815         return -ENOMEM;
5816 }
5817
5818 static void init_delay_drop(struct mlx5_ib_dev *dev)
5819 {
5820         if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
5821                 return;
5822
5823         mutex_init(&dev->delay_drop.lock);
5824         dev->delay_drop.dev = dev;
5825         dev->delay_drop.activate = false;
5826         dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
5827         INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
5828         atomic_set(&dev->delay_drop.rqs_cnt, 0);
5829         atomic_set(&dev->delay_drop.events_cnt, 0);
5830
5831         if (delay_drop_debugfs_init(dev))
5832                 mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n");
5833 }
5834
5835 static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
5836                                       struct mlx5_ib_multiport_info *mpi)
5837 {
5838         u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
5839         struct mlx5_ib_port *port = &ibdev->port[port_num];
5840         int comps;
5841         int err;
5842         int i;
5843
5844         lockdep_assert_held(&mlx5_ib_multiport_mutex);
5845
5846         mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
5847
5848         spin_lock(&port->mp.mpi_lock);
5849         if (!mpi->ibdev) {
5850                 spin_unlock(&port->mp.mpi_lock);
5851                 return;
5852         }
5853
5854         mpi->ibdev = NULL;
5855
5856         spin_unlock(&port->mp.mpi_lock);
5857         if (mpi->mdev_events.notifier_call)
5858                 mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
5859         mpi->mdev_events.notifier_call = NULL;
5860         mlx5_remove_netdev_notifier(ibdev, port_num);
5861         spin_lock(&port->mp.mpi_lock);
5862
5863         comps = mpi->mdev_refcnt;
5864         if (comps) {
5865                 mpi->unaffiliate = true;
5866                 init_completion(&mpi->unref_comp);
5867                 spin_unlock(&port->mp.mpi_lock);
5868
5869                 for (i = 0; i < comps; i++)
5870                         wait_for_completion(&mpi->unref_comp);
5871
5872                 spin_lock(&port->mp.mpi_lock);
5873                 mpi->unaffiliate = false;
5874         }
5875
5876         port->mp.mpi = NULL;
5877
5878         list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
5879
5880         spin_unlock(&port->mp.mpi_lock);
5881
5882         err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
5883
5884         mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1);
5885         /* Log the error; the pointers still need to be cleaned up and the
5886          * mpi added back to the unaffiliated list.
5887          */
5888         if (err)
5889                 mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
5890                             port_num + 1);
5891
5892         ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
5893 }
5894
5895 static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
5896                                     struct mlx5_ib_multiport_info *mpi)
5897 {
5898         u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
5899         int err;
5900
5901         lockdep_assert_held(&mlx5_ib_multiport_mutex);
5902
5903         spin_lock(&ibdev->port[port_num].mp.mpi_lock);
5904         if (ibdev->port[port_num].mp.mpi) {
5905                 mlx5_ib_dbg(ibdev, "port %d already affiliated.\n",
5906                             port_num + 1);
5907                 spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
5908                 return false;
5909         }
5910
5911         ibdev->port[port_num].mp.mpi = mpi;
5912         mpi->ibdev = ibdev;
5913         mpi->mdev_events.notifier_call = NULL;
5914         spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
5915
5916         err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
5917         if (err)
5918                 goto unbind;
5919
5920         err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev));
5921         if (err)
5922                 goto unbind;
5923
5924         err = mlx5_add_netdev_notifier(ibdev, port_num);
5925         if (err) {
5926                 mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n",
5927                             port_num + 1);
5928                 goto unbind;
5929         }
5930
5931         mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
5932         mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
5933
5934         mlx5_ib_init_cong_debugfs(ibdev, port_num);
5935
5936         return true;
5937
5938 unbind:
5939         mlx5_ib_unbind_slave_port(ibdev, mpi);
5940         return false;
5941 }
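/*
 * Bind and unbind are symmetric and both run under mlx5_ib_multiport_mutex:
 * mlx5_ib_bind_slave_port() affiliates the slave vport, queries its port
 * caps and registers the netdev and mdev event notifiers, while
 * mlx5_ib_unbind_slave_port() tears the same state down and waits for any
 * outstanding users of mpi->mdev (tracked in mdev_refcnt) to complete before
 * unaffiliating the vport.
 */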
5942
5943 static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
5944 {
5945         int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
5946         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
5947                                                           port_num + 1);
5948         struct mlx5_ib_multiport_info *mpi;
5949         int err;
5950         int i;
5951
5952         if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
5953                 return 0;
5954
5955         err = mlx5_query_nic_vport_system_image_guid(dev->mdev,
5956                                                      &dev->sys_image_guid);
5957         if (err)
5958                 return err;
5959
5960         err = mlx5_nic_vport_enable_roce(dev->mdev);
5961         if (err)
5962                 return err;
5963
5964         mutex_lock(&mlx5_ib_multiport_mutex);
5965         for (i = 0; i < dev->num_ports; i++) {
5966                 bool bound = false;
5967
5968                 /* build a stub multiport info struct for the native port. */
5969                 if (i == port_num) {
5970                         mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
5971                         if (!mpi) {
5972                                 mutex_unlock(&mlx5_ib_multiport_mutex);
5973                                 mlx5_nic_vport_disable_roce(dev->mdev);
5974                                 return -ENOMEM;
5975                         }
5976
5977                         mpi->is_master = true;
5978                         mpi->mdev = dev->mdev;
5979                         mpi->sys_image_guid = dev->sys_image_guid;
5980                         dev->port[i].mp.mpi = mpi;
5981                         mpi->ibdev = dev;
5982                         mpi = NULL;
5983                         continue;
5984                 }
5985
5986                 list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
5987                                     list) {
5988                         if (dev->sys_image_guid == mpi->sys_image_guid &&
5989                             (mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
5990                                 bound = mlx5_ib_bind_slave_port(dev, mpi);
5991                         }
5992
5993                         if (bound) {
5994                                 dev_dbg(mpi->mdev->device,
5995                                         "removing port from unaffiliated list.\n");
5996                                 mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
5997                                 list_del(&mpi->list);
5998                                 break;
5999                         }
6000                 }
6001                 if (!bound) {
6002                         get_port_caps(dev, i + 1);
6003                         mlx5_ib_dbg(dev, "no free port found for port %d\n",
6004                                     i + 1);
6005                 }
6006         }
6007
6008         list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list);
6009         mutex_unlock(&mlx5_ib_multiport_mutex);
6010         return err;
6011 }
6012
6013 static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
6014 {
6015         int port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6016         enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
6017                                                           port_num + 1);
6018         int i;
6019
6020         if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
6021                 return;
6022
6023         mutex_lock(&mlx5_ib_multiport_mutex);
6024         for (i = 0; i < dev->num_ports; i++) {
6025                 if (dev->port[i].mp.mpi) {
6026                         /* Destroy the native port stub */
6027                         if (i == port_num) {
6028                                 kfree(dev->port[i].mp.mpi);
6029                                 dev->port[i].mp.mpi = NULL;
6030                         } else {
6031                                 mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1);
6032                                 mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi);
6033                         }
6034                 }
6035         }
6036
6037         mlx5_ib_dbg(dev, "removing from devlist\n");
6038         list_del(&dev->ib_dev_list);
6039         mutex_unlock(&mlx5_ib_multiport_mutex);
6040
6041         mlx5_nic_vport_disable_roce(dev->mdev);
6042 }
6043
6044 ADD_UVERBS_ATTRIBUTES_SIMPLE(
6045         mlx5_ib_dm,
6046         UVERBS_OBJECT_DM,
6047         UVERBS_METHOD_DM_ALLOC,
6048         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
6049                             UVERBS_ATTR_TYPE(u64),
6050                             UA_MANDATORY),
6051         UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
6052                             UVERBS_ATTR_TYPE(u16),
6053                             UA_OPTIONAL),
6054         UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
6055                              enum mlx5_ib_uapi_dm_type,
6056                              UA_OPTIONAL));
6057
6058 ADD_UVERBS_ATTRIBUTES_SIMPLE(
6059         mlx5_ib_flow_action,
6060         UVERBS_OBJECT_FLOW_ACTION,
6061         UVERBS_METHOD_FLOW_ACTION_ESP_CREATE,
6062         UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
6063                              enum mlx5_ib_uapi_flow_action_flags));
6064
6065 static const struct uapi_definition mlx5_ib_defs[] = {
6066 #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
6067         UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
6068         UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
6069 #endif
6070
6071         UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
6072                                 &mlx5_ib_flow_action),
6073         UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
6074         {}
6075 };
6076
6077 static int mlx5_ib_read_counters(struct ib_counters *counters,
6078                                  struct ib_counters_read_attr *read_attr,
6079                                  struct uverbs_attr_bundle *attrs)
6080 {
6081         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
6082         struct mlx5_read_counters_attr mread_attr = {};
6083         struct mlx5_ib_flow_counters_desc *desc;
6084         int ret, i;
6085
6086         mutex_lock(&mcounters->mcntrs_mutex);
6087         if (mcounters->cntrs_max_index > read_attr->ncounters) {
6088                 ret = -EINVAL;
6089                 goto err_bound;
6090         }
6091
6092         mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
6093                                  GFP_KERNEL);
6094         if (!mread_attr.out) {
6095                 ret = -ENOMEM;
6096                 goto err_bound;
6097         }
6098
6099         mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
6100         mread_attr.flags = read_attr->flags;
6101         ret = mcounters->read_counters(counters->device, &mread_attr);
6102         if (ret)
6103                 goto err_read;
6104
6105         /* Walk the counters data array and accumulate each hardware value
6106          * into the user buffer slot given by its description/index pair.
6107          */
6108         desc = mcounters->counters_data;
6109         for (i = 0; i < mcounters->ncounters; i++)
6110                 read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
6111
6112 err_read:
6113         kfree(mread_attr.out);
6114 err_bound:
6115         mutex_unlock(&mcounters->mcntrs_mutex);
6116         return ret;
6117 }
6118
6119 static int mlx5_ib_destroy_counters(struct ib_counters *counters)
6120 {
6121         struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
6122
6123         counters_clear_description(counters);
6124         if (mcounters->hw_cntrs_hndl)
6125                 mlx5_fc_destroy(to_mdev(counters->device)->mdev,
6126                                 mcounters->hw_cntrs_hndl);
6127
6128         kfree(mcounters);
6129
6130         return 0;
6131 }
6132
6133 static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
6134                                                    struct uverbs_attr_bundle *attrs)
6135 {
6136         struct mlx5_ib_mcounters *mcounters;
6137
6138         mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL);
6139         if (!mcounters)
6140                 return ERR_PTR(-ENOMEM);
6141
6142         mutex_init(&mcounters->mcntrs_mutex);
6143
6144         return &mcounters->ibcntrs;
6145 }
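/*
 * Note that create_counters() above only allocates and initializes the
 * container; the counter descriptions, the hardware flow-counter handle and
 * the read callback used by mlx5_ib_read_counters() are expected to be
 * attached later, when the ib_counters object is referenced by a flow.
 */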
6146
6147 static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
6148 {
6149         mlx5_ib_cleanup_multiport_master(dev);
6150         if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
6151                 srcu_barrier(&dev->mr_srcu);
6152                 cleanup_srcu_struct(&dev->mr_srcu);
6153         }
6154
6155         WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
6156 }
6157
6158 static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
6159 {
6160         struct mlx5_core_dev *mdev = dev->mdev;
6161         int err;
6162         int i;
6163
6164         for (i = 0; i < dev->num_ports; i++) {
6165                 spin_lock_init(&dev->port[i].mp.mpi_lock);
6166                 rwlock_init(&dev->port[i].roce.netdev_lock);
6167                 dev->port[i].roce.dev = dev;
6168                 dev->port[i].roce.native_port_num = i + 1;
6169                 dev->port[i].roce.last_port_state = IB_PORT_DOWN;
6170         }
6171
6172         mlx5_ib_internal_fill_odp_caps(dev);
6173
6174         err = mlx5_ib_init_multiport_master(dev);
6175         if (err)
6176                 return err;
6177
6178         err = set_has_smi_cap(dev);
6179         if (err)
6180                 return err;
6181
6182         if (!mlx5_core_mp_enabled(mdev)) {
6183                 for (i = 1; i <= dev->num_ports; i++) {
6184                         err = get_port_caps(dev, i);
6185                         if (err)
6186                                 break;
6187                 }
6188         } else {
6189                 err = get_port_caps(dev, mlx5_core_native_port_num(mdev));
6190         }
6191         if (err)
6192                 goto err_mp;
6193
6194         if (mlx5_use_mad_ifc(dev))
6195                 get_ext_port_caps(dev);
6196
6197         dev->ib_dev.node_type           = RDMA_NODE_IB_CA;
6198         dev->ib_dev.local_dma_lkey      = 0 /* not supported for now */;
6199         dev->ib_dev.phys_port_cnt       = dev->num_ports;
6200         dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_count(mdev);
6201         dev->ib_dev.dev.parent          = mdev->device;
6202
6203         mutex_init(&dev->cap_mask_mutex);
6204         INIT_LIST_HEAD(&dev->qp_list);
6205         spin_lock_init(&dev->reset_flow_resource_lock);
6206
6207         spin_lock_init(&dev->dm.lock);
6208         dev->dm.dev = mdev;
6209
6210         if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
6211                 err = init_srcu_struct(&dev->mr_srcu);
6212                 if (err)
6213                         goto err_mp;
6214         }
6215
6216         return 0;
6217
6218 err_mp:
6219         mlx5_ib_cleanup_multiport_master(dev);
6220
6221         return err;
6222 }
6223
6224 static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev)
6225 {
6226         dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
6227
6228         if (!dev->flow_db)
6229                 return -ENOMEM;
6230
6231         mutex_init(&dev->flow_db->lock);
6232
6233         return 0;
6234 }
6235
6236 static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
6237 {
6238         kfree(dev->flow_db);
6239 }
6240
6241 static const struct ib_device_ops mlx5_ib_dev_ops = {
6242         .owner = THIS_MODULE,
6243         .driver_id = RDMA_DRIVER_MLX5,
6244         .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION,
6245
6246         .add_gid = mlx5_ib_add_gid,
6247         .alloc_mr = mlx5_ib_alloc_mr,
6248         .alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
6249         .alloc_pd = mlx5_ib_alloc_pd,
6250         .alloc_ucontext = mlx5_ib_alloc_ucontext,
6251         .attach_mcast = mlx5_ib_mcg_attach,
6252         .check_mr_status = mlx5_ib_check_mr_status,
6253         .create_ah = mlx5_ib_create_ah,
6254         .create_counters = mlx5_ib_create_counters,
6255         .create_cq = mlx5_ib_create_cq,
6256         .create_flow = mlx5_ib_create_flow,
6257         .create_qp = mlx5_ib_create_qp,
6258         .create_srq = mlx5_ib_create_srq,
6259         .dealloc_pd = mlx5_ib_dealloc_pd,
6260         .dealloc_ucontext = mlx5_ib_dealloc_ucontext,
6261         .del_gid = mlx5_ib_del_gid,
6262         .dereg_mr = mlx5_ib_dereg_mr,
6263         .destroy_ah = mlx5_ib_destroy_ah,
6264         .destroy_counters = mlx5_ib_destroy_counters,
6265         .destroy_cq = mlx5_ib_destroy_cq,
6266         .destroy_flow = mlx5_ib_destroy_flow,
6267         .destroy_flow_action = mlx5_ib_destroy_flow_action,
6268         .destroy_qp = mlx5_ib_destroy_qp,
6269         .destroy_srq = mlx5_ib_destroy_srq,
6270         .detach_mcast = mlx5_ib_mcg_detach,
6271         .disassociate_ucontext = mlx5_ib_disassociate_ucontext,
6272         .drain_rq = mlx5_ib_drain_rq,
6273         .drain_sq = mlx5_ib_drain_sq,
6274         .get_dev_fw_str = get_dev_fw_str,
6275         .get_dma_mr = mlx5_ib_get_dma_mr,
6276         .get_link_layer = mlx5_ib_port_link_layer,
6277         .map_mr_sg = mlx5_ib_map_mr_sg,
6278         .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
6279         .mmap = mlx5_ib_mmap,
6280         .modify_cq = mlx5_ib_modify_cq,
6281         .modify_device = mlx5_ib_modify_device,
6282         .modify_port = mlx5_ib_modify_port,
6283         .modify_qp = mlx5_ib_modify_qp,
6284         .modify_srq = mlx5_ib_modify_srq,
6285         .poll_cq = mlx5_ib_poll_cq,
6286         .post_recv = mlx5_ib_post_recv,
6287         .post_send = mlx5_ib_post_send,
6288         .post_srq_recv = mlx5_ib_post_srq_recv,
6289         .process_mad = mlx5_ib_process_mad,
6290         .query_ah = mlx5_ib_query_ah,
6291         .query_device = mlx5_ib_query_device,
6292         .query_gid = mlx5_ib_query_gid,
6293         .query_pkey = mlx5_ib_query_pkey,
6294         .query_qp = mlx5_ib_query_qp,
6295         .query_srq = mlx5_ib_query_srq,
6296         .read_counters = mlx5_ib_read_counters,
6297         .reg_user_mr = mlx5_ib_reg_user_mr,
6298         .req_notify_cq = mlx5_ib_arm_cq,
6299         .rereg_user_mr = mlx5_ib_rereg_user_mr,
6300         .resize_cq = mlx5_ib_resize_cq,
6301
6302         INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
6303         INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
6304         INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
6305         INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
6306         INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
6307 };
6308
6309 static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
6310         .create_flow_action_esp = mlx5_ib_create_flow_action_esp,
6311         .modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
6312 };
6313
6314 static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
6315         .rdma_netdev_get_params = mlx5_ib_rn_get_params,
6316 };
6317
6318 static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
6319         .get_vf_config = mlx5_ib_get_vf_config,
6320         .get_vf_stats = mlx5_ib_get_vf_stats,
6321         .set_vf_guid = mlx5_ib_set_vf_guid,
6322         .set_vf_link_state = mlx5_ib_set_vf_link_state,
6323 };
6324
6325 static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
6326         .alloc_mw = mlx5_ib_alloc_mw,
6327         .dealloc_mw = mlx5_ib_dealloc_mw,
6328 };
6329
6330 static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
6331         .alloc_xrcd = mlx5_ib_alloc_xrcd,
6332         .dealloc_xrcd = mlx5_ib_dealloc_xrcd,
6333 };
6334
6335 static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
6336         .alloc_dm = mlx5_ib_alloc_dm,
6337         .dealloc_dm = mlx5_ib_dealloc_dm,
6338         .reg_dm_mr = mlx5_ib_reg_dm_mr,
6339 };
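/*
 * The optional op tables above (IPsec flow actions, enhanced IPoIB, SR-IOV,
 * memory windows, XRC and device memory) are merged into dev->ib_dev via
 * ib_set_device_ops() from mlx5_ib_stage_caps_init() below, and only when
 * the matching device capability or function role is present.
 */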
6340
6341 static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
6342 {
6343         struct mlx5_core_dev *mdev = dev->mdev;
6344         int err;
6345
6346         dev->ib_dev.uverbs_cmd_mask     =
6347                 (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
6348                 (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
6349                 (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
6350                 (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
6351                 (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
6352                 (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
6353                 (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
6354                 (1ull << IB_USER_VERBS_CMD_REG_MR)              |
6355                 (1ull << IB_USER_VERBS_CMD_REREG_MR)            |
6356                 (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
6357                 (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
6358                 (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
6359                 (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
6360                 (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
6361                 (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
6362                 (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
6363                 (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
6364                 (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
6365                 (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
6366                 (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
6367                 (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
6368                 (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
6369                 (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
6370                 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
6371                 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)         |
6372                 (1ull << IB_USER_VERBS_CMD_OPEN_QP);
6373         dev->ib_dev.uverbs_ex_cmd_mask =
6374                 (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)     |
6375                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)        |
6376                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP)        |
6377                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_QP)        |
6378                 (1ull << IB_USER_VERBS_EX_CMD_MODIFY_CQ)        |
6379                 (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW)      |
6380                 (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
6381
6382         if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
6383             IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
6384                 ib_set_device_ops(&dev->ib_dev,
6385                                   &mlx5_ib_dev_ipoib_enhanced_ops);
6386
6387         if (mlx5_core_is_pf(mdev))
6388                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
6389
6390         dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
6391
6392         if (MLX5_CAP_GEN(mdev, imaicl)) {
6393                 dev->ib_dev.uverbs_cmd_mask |=
6394                         (1ull << IB_USER_VERBS_CMD_ALLOC_MW)    |
6395                         (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
6396                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
6397         }
6398
6399         if (MLX5_CAP_GEN(mdev, xrc)) {
6400                 dev->ib_dev.uverbs_cmd_mask |=
6401                         (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
6402                         (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
6403                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
6404         }
6405
6406         if (MLX5_CAP_DEV_MEM(mdev, memic) ||
6407             MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
6408             MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
6409                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
6410
6411         if (mlx5_accel_ipsec_device_caps(dev->mdev) &
6412             MLX5_ACCEL_IPSEC_CAP_DEVICE)
6413                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
6414         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
6415
6416         if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
6417                 dev->ib_dev.driver_def = mlx5_ib_defs;
6418
6419         err = init_node_data(dev);
6420         if (err)
6421                 return err;
6422
6423         if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
6424             (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
6425              MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
6426                 mutex_init(&dev->lb.mutex);
6427
6428         dev->ib_dev.use_cq_dim = true;
6429
6430         return 0;
6431 }
6432
6433 static const struct ib_device_ops mlx5_ib_dev_port_ops = {
6434         .get_port_immutable = mlx5_port_immutable,
6435         .query_port = mlx5_ib_query_port,
6436 };
6437
6438 static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
6439 {
6440         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
6441         return 0;
6442 }
6443
6444 static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
6445         .get_port_immutable = mlx5_port_rep_immutable,
6446         .query_port = mlx5_ib_rep_query_port,
6447 };
6448
6449 static int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
6450 {
6451         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
6452         return 0;
6453 }
6454
6455 static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
6456         .create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
6457         .create_wq = mlx5_ib_create_wq,
6458         .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
6459         .destroy_wq = mlx5_ib_destroy_wq,
6460         .get_netdev = mlx5_ib_get_netdev,
6461         .modify_wq = mlx5_ib_modify_wq,
6462 };
6463
6464 static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
6465 {
6466         u8 port_num;
6467
6468         dev->ib_dev.uverbs_ex_cmd_mask |=
6469                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
6470                         (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
6471                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
6472                         (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
6473                         (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
6474         ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
6475
6476         port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6477
6478         /* Register only for native ports */
6479         return mlx5_add_netdev_notifier(dev, port_num);
6480 }
6481
6482 static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
6483 {
6484         u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
6485
6486         mlx5_remove_netdev_notifier(dev, port_num);
6487 }
6488
6489 static int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
6490 {
6491         struct mlx5_core_dev *mdev = dev->mdev;
6492         enum rdma_link_layer ll;
6493         int port_type_cap;
6494         int err = 0;
6495
6496         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6497         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6498
6499         if (ll == IB_LINK_LAYER_ETHERNET)
6500                 err = mlx5_ib_stage_common_roce_init(dev);
6501
6502         return err;
6503 }
6504
6505 static void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev)
6506 {
6507         mlx5_ib_stage_common_roce_cleanup(dev);
6508 }
6509
6510 static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev)
6511 {
6512         struct mlx5_core_dev *mdev = dev->mdev;
6513         enum rdma_link_layer ll;
6514         int port_type_cap;
6515         int err;
6516
6517         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6518         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6519
6520         if (ll == IB_LINK_LAYER_ETHERNET) {
6521                 err = mlx5_ib_stage_common_roce_init(dev);
6522                 if (err)
6523                         return err;
6524
6525                 err = mlx5_enable_eth(dev);
6526                 if (err)
6527                         goto cleanup;
6528         }
6529
6530         return 0;
6531 cleanup:
6532         mlx5_ib_stage_common_roce_cleanup(dev);
6533
6534         return err;
6535 }
6536
6537 static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev)
6538 {
6539         struct mlx5_core_dev *mdev = dev->mdev;
6540         enum rdma_link_layer ll;
6541         int port_type_cap;
6542
6543         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6544         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6545
6546         if (ll == IB_LINK_LAYER_ETHERNET) {
6547                 mlx5_disable_eth(dev);
6548                 mlx5_ib_stage_common_roce_cleanup(dev);
6549         }
6550 }
6551
6552 static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
6553 {
6554         return create_dev_resources(&dev->devr);
6555 }
6556
6557 static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
6558 {
6559         destroy_dev_resources(&dev->devr);
6560 }
6561
6562 static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
6563 {
6564         return mlx5_ib_odp_init_one(dev);
6565 }
6566
6567 static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
6568 {
6569         mlx5_ib_odp_cleanup_one(dev);
6570 }
6571
6572 static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
6573         .alloc_hw_stats = mlx5_ib_alloc_hw_stats,
6574         .get_hw_stats = mlx5_ib_get_hw_stats,
6575         .counter_bind_qp = mlx5_ib_counter_bind_qp,
6576         .counter_unbind_qp = mlx5_ib_counter_unbind_qp,
6577         .counter_dealloc = mlx5_ib_counter_dealloc,
6578         .counter_alloc_stats = mlx5_ib_counter_alloc_stats,
6579         .counter_update_stats = mlx5_ib_counter_update_stats,
6580 };
6581
6582 static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
6583 {
6584         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
6585                 ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
6586
6587                 return mlx5_ib_alloc_counters(dev);
6588         }
6589
6590         return 0;
6591 }
6592
6593 static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
6594 {
6595         if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
6596                 mlx5_ib_dealloc_counters(dev);
6597 }
6598
6599 static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
6600 {
6601         mlx5_ib_init_cong_debugfs(dev,
6602                                   mlx5_core_native_port_num(dev->mdev) - 1);
6603         return 0;
6604 }
6605
6606 static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
6607 {
6608         mlx5_ib_cleanup_cong_debugfs(dev,
6609                                      mlx5_core_native_port_num(dev->mdev) - 1);
6610 }
6611
6612 static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
6613 {
6614         dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
6615         return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
6616 }
6617
6618 static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
6619 {
6620         mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
6621 }
6622
6623 static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
6624 {
6625         int err;
6626
6627         err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
6628         if (err)
6629                 return err;
6630
6631         err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
6632         if (err)
6633                 mlx5_free_bfreg(dev->mdev, &dev->bfreg);
6634
6635         return err;
6636 }
6637
6638 static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
6639 {
6640         mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
6641         mlx5_free_bfreg(dev->mdev, &dev->bfreg);
6642 }
6643
6644 static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
6645 {
6646         const char *name;
6647
6648         rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group);
6649         if (!mlx5_lag_is_roce(dev->mdev))
6650                 name = "mlx5_%d";
6651         else
6652                 name = "mlx5_bond_%d";
6653         return ib_register_device(&dev->ib_dev, name);
6654 }
6655
6656 static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
6657 {
6658         destroy_umrc_res(dev);
6659 }
6660
6661 static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
6662 {
6663         ib_unregister_device(&dev->ib_dev);
6664 }
6665
6666 static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
6667 {
6668         return create_umr_res(dev);
6669 }
6670
6671 static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
6672 {
6673         init_delay_drop(dev);
6674
6675         return 0;
6676 }
6677
6678 static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
6679 {
6680         cancel_delay_drop(dev);
6681 }
6682
6683 static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
6684 {
6685         dev->mdev_events.notifier_call = mlx5_ib_event;
6686         mlx5_notifier_register(dev->mdev, &dev->mdev_events);
6687         return 0;
6688 }
6689
6690 static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
6691 {
6692         mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
6693 }
6694
6695 static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
6696 {
6697         int uid;
6698
6699         uid = mlx5_ib_devx_create(dev, false);
6700         if (uid > 0) {
6701                 dev->devx_whitelist_uid = uid;
6702                 mlx5_ib_devx_init_event_table(dev);
6703         }
6704
6705         return 0;
6706 }

6707 static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
6708 {
6709         if (dev->devx_whitelist_uid) {
6710                 mlx5_ib_devx_cleanup_event_table(dev);
6711                 mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
6712         }
6713 }
6714
6715 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
6716                       const struct mlx5_ib_profile *profile,
6717                       int stage)
6718 {
6719         /* Clean up the completed stages, in reverse order */
6720         while (stage) {
6721                 stage--;
6722                 if (profile->stage[stage].cleanup)
6723                         profile->stage[stage].cleanup(dev);
6724         }
6725
6726         kfree(dev->port);
6727         ib_dealloc_device(&dev->ib_dev);
6728 }
6729
6730 void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
6731                     const struct mlx5_ib_profile *profile)
6732 {
6733         int err;
6734         int i;
6735
6736         for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
6737                 if (profile->stage[i].init) {
6738                         err = profile->stage[i].init(dev);
6739                         if (err)
6740                                 goto err_out;
6741                 }
6742         }
6743
6744         dev->profile = profile;
6745         dev->ib_active = true;
6746
6747         return dev;
6748
6749 err_out:
6750         __mlx5_ib_remove(dev, profile, i);
6751
6752         return NULL;
6753 }
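/*
 * __mlx5_ib_add() and __mlx5_ib_remove() implement a staged-init ladder:
 * profile stages run in order and, on the first failing init, the stages
 * that already completed are cleaned up in reverse by calling
 * __mlx5_ib_remove(dev, profile, i). A full teardown passes
 * MLX5_IB_STAGE_MAX so that every populated stage's cleanup runs before the
 * port array and the ib_device itself are freed.
 */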
6754
6755 static const struct mlx5_ib_profile pf_profile = {
6756         STAGE_CREATE(MLX5_IB_STAGE_INIT,
6757                      mlx5_ib_stage_init_init,
6758                      mlx5_ib_stage_init_cleanup),
6759         STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
6760                      mlx5_ib_stage_flow_db_init,
6761                      mlx5_ib_stage_flow_db_cleanup),
6762         STAGE_CREATE(MLX5_IB_STAGE_CAPS,
6763                      mlx5_ib_stage_caps_init,
6764                      NULL),
6765         STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
6766                      mlx5_ib_stage_non_default_cb,
6767                      NULL),
6768         STAGE_CREATE(MLX5_IB_STAGE_ROCE,
6769                      mlx5_ib_stage_roce_init,
6770                      mlx5_ib_stage_roce_cleanup),
6771         STAGE_CREATE(MLX5_IB_STAGE_SRQ,
6772                      mlx5_init_srq_table,
6773                      mlx5_cleanup_srq_table),
6774         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
6775                      mlx5_ib_stage_dev_res_init,
6776                      mlx5_ib_stage_dev_res_cleanup),
6777         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
6778                      mlx5_ib_stage_dev_notifier_init,
6779                      mlx5_ib_stage_dev_notifier_cleanup),
6780         STAGE_CREATE(MLX5_IB_STAGE_ODP,
6781                      mlx5_ib_stage_odp_init,
6782                      mlx5_ib_stage_odp_cleanup),
6783         STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
6784                      mlx5_ib_stage_counters_init,
6785                      mlx5_ib_stage_counters_cleanup),
6786         STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
6787                      mlx5_ib_stage_cong_debugfs_init,
6788                      mlx5_ib_stage_cong_debugfs_cleanup),
6789         STAGE_CREATE(MLX5_IB_STAGE_UAR,
6790                      mlx5_ib_stage_uar_init,
6791                      mlx5_ib_stage_uar_cleanup),
6792         STAGE_CREATE(MLX5_IB_STAGE_BFREG,
6793                      mlx5_ib_stage_bfrag_init,
6794                      mlx5_ib_stage_bfrag_cleanup),
6795         STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
6796                      NULL,
6797                      mlx5_ib_stage_pre_ib_reg_umr_cleanup),
6798         STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
6799                      mlx5_ib_stage_devx_init,
6800                      mlx5_ib_stage_devx_cleanup),
6801         STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
6802                      mlx5_ib_stage_ib_reg_init,
6803                      mlx5_ib_stage_ib_reg_cleanup),
6804         STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
6805                      mlx5_ib_stage_post_ib_reg_umr_init,
6806                      NULL),
6807         STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
6808                      mlx5_ib_stage_delay_drop_init,
6809                      mlx5_ib_stage_delay_drop_cleanup),
6810 };
6811
6812 const struct mlx5_ib_profile uplink_rep_profile = {
6813         STAGE_CREATE(MLX5_IB_STAGE_INIT,
6814                      mlx5_ib_stage_init_init,
6815                      mlx5_ib_stage_init_cleanup),
6816         STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
6817                      mlx5_ib_stage_flow_db_init,
6818                      mlx5_ib_stage_flow_db_cleanup),
6819         STAGE_CREATE(MLX5_IB_STAGE_CAPS,
6820                      mlx5_ib_stage_caps_init,
6821                      NULL),
6822         STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
6823                      mlx5_ib_stage_rep_non_default_cb,
6824                      NULL),
6825         STAGE_CREATE(MLX5_IB_STAGE_ROCE,
6826                      mlx5_ib_stage_rep_roce_init,
6827                      mlx5_ib_stage_rep_roce_cleanup),
6828         STAGE_CREATE(MLX5_IB_STAGE_SRQ,
6829                      mlx5_init_srq_table,
6830                      mlx5_cleanup_srq_table),
6831         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
6832                      mlx5_ib_stage_dev_res_init,
6833                      mlx5_ib_stage_dev_res_cleanup),
6834         STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
6835                      mlx5_ib_stage_dev_notifier_init,
6836                      mlx5_ib_stage_dev_notifier_cleanup),
6837         STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
6838                      mlx5_ib_stage_counters_init,
6839                      mlx5_ib_stage_counters_cleanup),
6840         STAGE_CREATE(MLX5_IB_STAGE_UAR,
6841                      mlx5_ib_stage_uar_init,
6842                      mlx5_ib_stage_uar_cleanup),
6843         STAGE_CREATE(MLX5_IB_STAGE_BFREG,
6844                      mlx5_ib_stage_bfrag_init,
6845                      mlx5_ib_stage_bfrag_cleanup),
6846         STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
6847                      NULL,
6848                      mlx5_ib_stage_pre_ib_reg_umr_cleanup),
6849         STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
6850                      mlx5_ib_stage_devx_init,
6851                      mlx5_ib_stage_devx_cleanup),
6852         STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
6853                      mlx5_ib_stage_ib_reg_init,
6854                      mlx5_ib_stage_ib_reg_cleanup),
6855         STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
6856                      mlx5_ib_stage_post_ib_reg_umr_init,
6857                      NULL),
6858 };
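/*
 * The uplink representor profile above is a trimmed-down variant of
 * pf_profile: it plugs in the representor callbacks for port queries and
 * RoCE setup, and omits the stages that do not apply to the eswitch uplink
 * (ODP, congestion debugfs and delay drop).
 */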
6859
6860 static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
6861 {
6862         struct mlx5_ib_multiport_info *mpi;
6863         struct mlx5_ib_dev *dev;
6864         bool bound = false;
6865         int err;
6866
6867         mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
6868         if (!mpi)
6869                 return NULL;
6870
6871         mpi->mdev = mdev;
6872
6873         err = mlx5_query_nic_vport_system_image_guid(mdev,
6874                                                      &mpi->sys_image_guid);
6875         if (err) {
6876                 kfree(mpi);
6877                 return NULL;
6878         }
6879
6880         mutex_lock(&mlx5_ib_multiport_mutex);
6881         list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
6882                 if (dev->sys_image_guid == mpi->sys_image_guid)
6883                         bound = mlx5_ib_bind_slave_port(dev, mpi);
6884
6885                 if (bound) {
6886                         rdma_roce_rescan_device(&dev->ib_dev);
6887                         break;
6888                 }
6889         }
6890
6891         if (!bound) {
6892                 list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
6893                 dev_dbg(mdev->device,
6894                         "no suitable IB device found to bind to, added to unaffiliated list.\n");
6895         }
6896         mutex_unlock(&mlx5_ib_multiport_mutex);
6897
6898         return mpi;
6899 }
6900
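/*
 * mlx5_core "add" callback. Three cases:
 *  - e-switch manager in offloads mode: only register the vport
 *    representors (unless multi-port is enabled) and return the core dev
 *    itself as the context;
 *  - multi-port slave with an Ethernet link layer: affiliate the port via
 *    mlx5_ib_add_slave_port();
 *  - otherwise: allocate a full mlx5_ib_dev sized for
 *    max(num_ports, num_vhca_ports) ports and bring it up through
 *    pf_profile.
 */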
6901 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
6902 {
6903         enum rdma_link_layer ll;
6904         struct mlx5_ib_dev *dev;
6905         int port_type_cap;
6906         int num_ports;
6907
6908         printk_once(KERN_INFO "%s", mlx5_version);
6909
6910         if (MLX5_ESWITCH_MANAGER(mdev) &&
6911             mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
6912                 if (!mlx5_core_mp_enabled(mdev))
6913                         mlx5_ib_register_vport_reps(mdev);
6914                 return mdev;
6915         }
6916
6917         port_type_cap = MLX5_CAP_GEN(mdev, port_type);
6918         ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
6919
6920         if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
6921                 return mlx5_ib_add_slave_port(mdev);
6922
6923         num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
6924                         MLX5_CAP_GEN(mdev, num_vhca_ports));
6925         dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
6926         if (!dev)
6927                 return NULL;
6928         dev->port = kcalloc(num_ports, sizeof(*dev->port),
6929                              GFP_KERNEL);
6930         if (!dev->port) {
6931                 ib_dealloc_device(&dev->ib_dev);
6932                 return NULL;
6933         }
6934
6935         dev->mdev = mdev;
6936         dev->num_ports = num_ports;
6937
6938         return __mlx5_ib_add(dev, &pf_profile);
6939 }
6940
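/*
 * mlx5_core "remove" callback, mirroring the three cases in mlx5_ib_add():
 * unregister the vport representors, unbind and free a slave port's mpi, or
 * tear down a full device by running its profile's cleanup callbacks from
 * MLX5_IB_STAGE_MAX downwards via __mlx5_ib_remove().
 */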
6941 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
6942 {
6943         struct mlx5_ib_multiport_info *mpi;
6944         struct mlx5_ib_dev *dev;
6945
6946         if (MLX5_ESWITCH_MANAGER(mdev) && context == mdev) {
6947                 mlx5_ib_unregister_vport_reps(mdev);
6948                 return;
6949         }
6950
6951         if (mlx5_core_is_mp_slave(mdev)) {
6952                 mpi = context;
6953                 mutex_lock(&mlx5_ib_multiport_mutex);
6954                 if (mpi->ibdev)
6955                         mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
6956                 list_del(&mpi->list);
6957                 mutex_unlock(&mlx5_ib_multiport_mutex);
6958                 kfree(mpi);
6959                 return;
6960         }
6961
6962         dev = context;
6963         __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
6964 }
6965
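/* Hooks the mlx5 core driver invokes for every IB-capable core device. */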
6966 static struct mlx5_interface mlx5_ib_interface = {
6967         .add            = mlx5_ib_add,
6968         .remove         = mlx5_ib_remove,
6969         .protocol       = MLX5_INTERFACE_PROTOCOL_IB,
6970 };
6971
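/*
 * The XLT emergency page is a single pre-allocated page used as a last-resort
 * buffer when a larger allocation for XLT/MTT updates cannot be obtained (an
 * assumption based on its use in the MR code, which is not shown here). The
 * mutex is taken in get() and only released in put(), so at most one caller
 * owns the page at a time.
 */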
6972 unsigned long mlx5_ib_get_xlt_emergency_page(void)
6973 {
6974         mutex_lock(&xlt_emergency_page_mutex);
6975         return xlt_emergency_page;
6976 }
6977
6978 void mlx5_ib_put_xlt_emergency_page(void)
6979 {
6980         mutex_unlock(&xlt_emergency_page_mutex);
6981 }
6982
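/*
 * Module init: allocate the XLT emergency page and its mutex, create the
 * ordered workqueue used to process device events, initialize ODP state and
 * finally register with the mlx5 core so that mlx5_ib_add() is called for
 * each existing and future IB-capable device.
 */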
6983 static int __init mlx5_ib_init(void)
6984 {
6985         int err;
6986
6987         xlt_emergency_page = __get_free_page(GFP_KERNEL);
6988         if (!xlt_emergency_page)
6989                 return -ENOMEM;
6990
6991         mutex_init(&xlt_emergency_page_mutex);
6992
6993         mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
6994         if (!mlx5_ib_event_wq) {
6995                 free_page(xlt_emergency_page);
6996                 return -ENOMEM;
6997         }
6998
6999         mlx5_ib_odp_init();
7000
7001         err = mlx5_register_interface(&mlx5_ib_interface);
             if (err) {
                     /* unwind so a failed registration does not leak the
                      * event workqueue or the emergency page
                      */
                     destroy_workqueue(mlx5_ib_event_wq);
                     mutex_destroy(&xlt_emergency_page_mutex);
                     free_page(xlt_emergency_page);
             }
7002
7003         return err;
7004 }
7005
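/*
 * Module exit: unregister from the mlx5 core first (which removes all IB
 * devices through mlx5_ib_remove()), then release the event workqueue and
 * the XLT emergency page.
 */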
7006 static void __exit mlx5_ib_cleanup(void)
7007 {
7008         mlx5_unregister_interface(&mlx5_ib_interface);
7009         destroy_workqueue(mlx5_ib_event_wq);
7010         mutex_destroy(&xlt_emergency_page_mutex);
7011         free_page(xlt_emergency_page);
7012 }
7013
7014 module_init(mlx5_ib_init);
7015 module_exit(mlx5_ib_cleanup);