drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c

   1 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
   2 /* Copyright (c) 2018 Mellanox Technologies. */
   3
   4 #include <net/vxlan.h>
   5 #include <net/gre.h>
   6 #include <net/geneve.h>
   7 #include "en/tc_tun.h"
   8 #include "en_tc.h"
   9
  10 struct mlx5e_tc_tunnel *mlx5e_get_tc_tun(struct net_device *tunnel_dev)
  11 {
  12         if (netif_is_vxlan(tunnel_dev))
  13                 return &vxlan_tunnel;
  14         else if (netif_is_geneve(tunnel_dev))
  15                 return &geneve_tunnel;
  16         else if (netif_is_gretap(tunnel_dev) ||
  17                  netif_is_ip6gretap(tunnel_dev))
  18                 return &gre_tunnel;
  19         else
  20                 return NULL;
  21 }
  22
  23 static int get_route_and_out_devs(struct mlx5e_priv *priv,
  24                                   struct net_device *dev,
  25                                   struct net_device **route_dev,
  26                                   struct net_device **out_dev)
  27 {
  28         struct net_device *uplink_dev, *uplink_upper, *real_dev;
  29         struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
  30         bool dst_is_lag_dev;
  31
  32         real_dev = is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : dev;
  33         uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH);
  34
  35         rcu_read_lock();
  36         uplink_upper = netdev_master_upper_dev_get_rcu(uplink_dev);
  37         /* mlx5_lag_is_sriov() is a blocking function which can't be called
  38          * while holding rcu read lock. Take the net_device for correctness
  39          * sake.
  40          */
  41         if (uplink_upper)
  42                 dev_hold(uplink_upper);
  43         rcu_read_unlock();
  44
  45         dst_is_lag_dev = (uplink_upper &&
  46                           netif_is_lag_master(uplink_upper) &&
  47                           real_dev == uplink_upper &&
  48                           mlx5_lag_is_sriov(priv->mdev));
  49         if (uplink_upper)
  50                 dev_put(uplink_upper);
  51
  52         /* if the egress device isn't on the same HW e-switch or
  53          * it's a LAG device, use the uplink
  54          */
  55         *route_dev = dev;
  56         if (!netdev_port_same_parent_id(priv->netdev, real_dev) ||
  57             dst_is_lag_dev || is_vlan_dev(*route_dev))
  58                 *out_dev = uplink_dev;
  59         else if (mlx5e_eswitch_rep(dev) &&
  60                  mlx5e_is_valid_eswitch_fwd_dev(priv, dev))
  61                 *out_dev = *route_dev;
  62         else
  63                 return -EOPNOTSUPP;
  64
  65         if (!(mlx5e_eswitch_rep(*out_dev) &&
  66               mlx5e_is_uplink_rep(netdev_priv(*out_dev))))
  67                 return -EOPNOTSUPP;
  68
  69         return 0;
  70 }
  71
  72 static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
  73                                    struct net_device *mirred_dev,
  74                                    struct net_device **out_dev,
  75                                    struct net_device **route_dev,
  76                                    struct flowi4 *fl4,
  77                                    struct neighbour **out_n,
  78                                    u8 *out_ttl)
  79 {
  80         struct rtable *rt;
  81         struct neighbour *n = NULL;
  82
  83 #if IS_ENABLED(CONFIG_INET)
  84         struct mlx5_core_dev *mdev = priv->mdev;
  85         struct net_device *uplink_dev;
  86         int ret;
  87
  88         if (mlx5_lag_is_multipath(mdev)) {
  89                 struct mlx5_eswitch *esw = mdev->priv.eswitch;
  90
  91                 uplink_dev = mlx5_eswitch_uplink_get_proto_dev(esw, REP_ETH);
  92                 fl4->flowi4_oif = uplink_dev->ifindex;
  93         }
  94
  95         rt = ip_route_output_key(dev_net(mirred_dev), fl4);
  96         ret = PTR_ERR_OR_ZERO(rt);
  97         if (ret)
  98                 return ret;
  99
 100         if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) {
 101                 ip_rt_put(rt);
 102                 return -ENETUNREACH;
 103         }
 104 #else
 105         return -EOPNOTSUPP;
 106 #endif
 107
 108         ret = get_route_and_out_devs(priv, rt->dst.dev, route_dev, out_dev);
 109         if (ret < 0) {
 110                 ip_rt_put(rt);
 111                 return ret;
 112         }
 113
 114         if (!(*out_ttl))
 115                 *out_ttl = ip4_dst_hoplimit(&rt->dst);
 116         n = dst_neigh_lookup(&rt->dst, &fl4->daddr);
 117         ip_rt_put(rt);
 118         if (!n)
 119                 return -ENOMEM;
 120
 121         *out_n = n;
 122         return 0;
 123 }
 124
 125 static const char *mlx5e_netdev_kind(struct net_device *dev)
 126 {
 127         if (dev->rtnl_link_ops)
 128                 return dev->rtnl_link_ops->kind;
 129         else
 130                 return "unknown";
 131 }
 132
 133 static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
 134                                    struct net_device *mirred_dev,
 135                                    struct net_device **out_dev,
 136                                    struct net_device **route_dev,
 137                                    struct flowi6 *fl6,
 138                                    struct neighbour **out_n,
 139                                    u8 *out_ttl)
 140 {
 141         struct neighbour *n = NULL;
 142         struct dst_entry *dst;
 143
 144 #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
 145         int ret;
 146
 147         ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst,
 148                                          fl6);
 149         if (ret < 0)
 150                 return ret;
 151
 152         if (!(*out_ttl))
 153                 *out_ttl = ip6_dst_hoplimit(dst);
 154
 155         ret = get_route_and_out_devs(priv, dst->dev, route_dev, out_dev);
 156         if (ret < 0) {
 157                 dst_release(dst);
 158                 return ret;
 159         }
 160 #else
 161         return -EOPNOTSUPP;
 162 #endif
 163
 164         n = dst_neigh_lookup(dst, &fl6->daddr);
 165         dst_release(dst);
 166         if (!n)
 167                 return -ENOMEM;
 168
 169         *out_n = n;
 170         return 0;
 171 }
 172
 173 static int mlx5e_gen_ip_tunnel_header(char buf[], __u8 *ip_proto,
 174                                       struct mlx5e_encap_entry *e)
 175 {
 176         if (!e->tunnel) {
 177                 pr_warn("mlx5: Cannot generate tunnel header for this tunnel\n");
 178                 return -EOPNOTSUPP;
 179         }
 180
 181         return e->tunnel->generate_ip_tun_hdr(buf, ip_proto, e);
 182 }
 183
 184 static char *gen_eth_tnl_hdr(char *buf, struct net_device *dev,
 185                              struct mlx5e_encap_entry *e,
 186                              u16 proto)
 187 {
 188         struct ethhdr *eth = (struct ethhdr *)buf;
 189         char *ip;
 190
 191         ether_addr_copy(eth->h_dest, e->h_dest);
 192         ether_addr_copy(eth->h_source, dev->dev_addr);
 193         if (is_vlan_dev(dev)) {
 194                 struct vlan_hdr *vlan = (struct vlan_hdr *)
 195                                         ((char *)eth + ETH_HLEN);
 196                 ip = (char *)vlan + VLAN_HLEN;
 197                 eth->h_proto = vlan_dev_vlan_proto(dev);
 198                 vlan->h_vlan_TCI = htons(vlan_dev_vlan_id(dev));
 199                 vlan->h_vlan_encapsulated_proto = htons(proto);
 200         } else {
 201                 eth->h_proto = htons(proto);
 202                 ip = (char *)eth + ETH_HLEN;
 203         }
 204
 205         return ip;
 206 }
 207
 208 int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv,
 209                                     struct net_device *mirred_dev,
 210                                     struct mlx5e_encap_entry *e)
 211 {
 212         int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
 213         const struct ip_tunnel_key *tun_key = &e->tun_info->key;
 214         struct net_device *out_dev, *route_dev;
 215         struct neighbour *n = NULL;
 216         struct flowi4 fl4 = {};
 217         int ipv4_encap_size;
 218         char *encap_header;
 219         u8 nud_state, ttl;
 220         struct iphdr *ip;
 221         int err;
 222
 223         /* add the IP fields */
 224         fl4.flowi4_tos = tun_key->tos;
 225         fl4.daddr = tun_key->u.ipv4.dst;
 226         fl4.saddr = tun_key->u.ipv4.src;
 227         ttl = tun_key->ttl;
 228
 229         err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev, &route_dev,
 230                                       &fl4, &n, &ttl);
 231         if (err)
 232                 return err;
 233
 234         ipv4_encap_size =
 235                 (is_vlan_dev(route_dev) ? VLAN_ETH_HLEN : ETH_HLEN) +
 236                 sizeof(struct iphdr) +
 237                 e->tunnel->calc_hlen(e);
 238
 239         if (max_encap_size < ipv4_encap_size) {
 240                 mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
 241                                ipv4_encap_size, max_encap_size);
 242                 return -EOPNOTSUPP;
 243         }
 244
 245         encap_header = kzalloc(ipv4_encap_size, GFP_KERNEL);
 246         if (!encap_header)
 247                 return -ENOMEM;
 248
 249         /* used by mlx5e_detach_encap to lookup a neigh hash table
 250          * entry in the neigh hash table when a user deletes a rule
 251          */
 252         e->m_neigh.dev = n->dev;
 253         e->m_neigh.family = n->ops->family;
 254         memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
 255         e->out_dev = out_dev;
 256         e->route_dev = route_dev;
 257
 258         /* It's important to add the neigh to the hash table before checking
 259          * the neigh validity state. So if we'll get a notification, in case the
 260          * neigh changes it's validity state, we would find the relevant neigh
 261          * in the hash.
 262          */
 263         err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
 264         if (err)
 265                 goto free_encap;
 266
 267         read_lock_bh(&n->lock);
 268         nud_state = n->nud_state;
 269         ether_addr_copy(e->h_dest, n->ha);
 270         read_unlock_bh(&n->lock);
 271
 272         /* add ethernet header */
 273         ip = (struct iphdr *)gen_eth_tnl_hdr(encap_header, route_dev, e,
 274                                              ETH_P_IP);
 275
 276         /* add ip header */
 277         ip->tos = tun_key->tos;
 278         ip->version = 0x4;
 279         ip->ihl = 0x5;
 280         ip->ttl = ttl;
 281         ip->daddr = fl4.daddr;
 282         ip->saddr = fl4.saddr;
 283
 284         /* add tunneling protocol header */
 285         err = mlx5e_gen_ip_tunnel_header((char *)ip + sizeof(struct iphdr),
 286                                          &ip->protocol, e);
 287         if (err)
 288                 goto destroy_neigh_entry;
 289
 290         e->encap_size = ipv4_encap_size;
 291         e->encap_header = encap_header;
 292
 293         if (!(nud_state & NUD_VALID)) {
 294                 neigh_event_send(n, NULL);
 295                 /* the encap entry will be made valid on neigh update event
 296                  * and not used before that.
 297                  */
 298                 goto out;
 299         }
 300         e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
 301                                                      e->reformat_type,
 302                                                      ipv4_encap_size, encap_header,
 303                                                      MLX5_FLOW_NAMESPACE_FDB);
 304         if (IS_ERR(e->pkt_reformat)) {
 305                 err = PTR_ERR(e->pkt_reformat);
 306                 goto destroy_neigh_entry;
 307         }
 308
 309         e->flags |= MLX5_ENCAP_ENTRY_VALID;
 310         mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
 311         neigh_release(n);
 312         return err;
 313
 314 destroy_neigh_entry:
 315         mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
 316 free_encap:
 317         kfree(encap_header);
 318 out:
 319         if (n)
 320                 neigh_release(n);
 321         return err;
 322 }
 323
 324 int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv,
 325                                     struct net_device *mirred_dev,
 326                                     struct mlx5e_encap_entry *e)
 327 {
 328         int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
 329         const struct ip_tunnel_key *tun_key = &e->tun_info->key;
 330         struct net_device *out_dev, *route_dev;
 331         struct neighbour *n = NULL;
 332         struct flowi6 fl6 = {};
 333         struct ipv6hdr *ip6h;
 334         int ipv6_encap_size;
 335         char *encap_header;
 336         u8 nud_state, ttl;
 337         int err;
 338
 339         ttl = tun_key->ttl;
 340
 341         fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label);
 342         fl6.daddr = tun_key->u.ipv6.dst;
 343         fl6.saddr = tun_key->u.ipv6.src;
 344
 345         err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev, &route_dev,
 346                                       &fl6, &n, &ttl);
 347         if (err)
 348                 return err;
 349
 350         ipv6_encap_size =
 351                 (is_vlan_dev(route_dev) ? VLAN_ETH_HLEN : ETH_HLEN) +
 352                 sizeof(struct ipv6hdr) +
 353                 e->tunnel->calc_hlen(e);
 354
 355         if (max_encap_size < ipv6_encap_size) {
 356                 mlx5_core_warn(priv->mdev, "encap size %d too big, max supported is %d\n",
 357                                ipv6_encap_size, max_encap_size);
 358                 return -EOPNOTSUPP;
 359         }
 360
 361         encap_header = kzalloc(ipv6_encap_size, GFP_KERNEL);
 362         if (!encap_header)
 363                 return -ENOMEM;
 364
 365         /* used by mlx5e_detach_encap to lookup a neigh hash table
 366          * entry in the neigh hash table when a user deletes a rule
 367          */
 368         e->m_neigh.dev = n->dev;
 369         e->m_neigh.family = n->ops->family;
 370         memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len);
 371         e->out_dev = out_dev;
 372         e->route_dev = route_dev;
 373
 374         /* It's importent to add the neigh to the hash table before checking
 375          * the neigh validity state. So if we'll get a notification, in case the
 376          * neigh changes it's validity state, we would find the relevant neigh
 377          * in the hash.
 378          */
 379         err = mlx5e_rep_encap_entry_attach(netdev_priv(out_dev), e);
 380         if (err)
 381                 goto free_encap;
 382
 383         read_lock_bh(&n->lock);
 384         nud_state = n->nud_state;
 385         ether_addr_copy(e->h_dest, n->ha);
 386         read_unlock_bh(&n->lock);
 387
 388         /* add ethernet header */
 389         ip6h = (struct ipv6hdr *)gen_eth_tnl_hdr(encap_header, route_dev, e,
 390                                                  ETH_P_IPV6);
 391
 392         /* add ip header */
 393         ip6_flow_hdr(ip6h, tun_key->tos, 0);
 394         /* the HW fills up ipv6 payload len */
 395         ip6h->hop_limit   = ttl;
 396         ip6h->daddr       = fl6.daddr;
 397         ip6h->saddr       = fl6.saddr;
 398
 399         /* add tunneling protocol header */
 400         err = mlx5e_gen_ip_tunnel_header((char *)ip6h + sizeof(struct ipv6hdr),
 401                                          &ip6h->nexthdr, e);
 402         if (err)
 403                 goto destroy_neigh_entry;
 404
 405         e->encap_size = ipv6_encap_size;
 406         e->encap_header = encap_header;
 407
 408         if (!(nud_state & NUD_VALID)) {
 409                 neigh_event_send(n, NULL);
 410                 /* the encap entry will be made valid on neigh update event
 411                  * and not used before that.
 412                  */
 413                 goto out;
 414         }
 415
 416         e->pkt_reformat = mlx5_packet_reformat_alloc(priv->mdev,
 417                                                      e->reformat_type,
 418                                                      ipv6_encap_size, encap_header,
 419                                                      MLX5_FLOW_NAMESPACE_FDB);
 420         if (IS_ERR(e->pkt_reformat)) {
 421                 err = PTR_ERR(e->pkt_reformat);
 422                 goto destroy_neigh_entry;
 423         }
 424
 425         e->flags |= MLX5_ENCAP_ENTRY_VALID;
 426         mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev));
 427         neigh_release(n);
 428         return err;
 429
 430 destroy_neigh_entry:
 431         mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
 432 free_encap:
 433         kfree(encap_header);
 434 out:
 435         if (n)
 436                 neigh_release(n);
 437         return err;
 438 }
 439
 440 bool mlx5e_tc_tun_device_to_offload(struct mlx5e_priv *priv,
 441                                     struct net_device *netdev)
 442 {
 443         struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(netdev);
 444
 445         if (tunnel && tunnel->can_offload(priv))
 446                 return true;
 447         else
 448                 return false;
 449 }
 450
 451 int mlx5e_tc_tun_init_encap_attr(struct net_device *tunnel_dev,
 452                                  struct mlx5e_priv *priv,
 453                                  struct mlx5e_encap_entry *e,
 454                                  struct netlink_ext_ack *extack)
 455 {
 456         struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(tunnel_dev);
 457
 458         if (!tunnel) {
 459                 e->reformat_type = -1;
 460                 return -EOPNOTSUPP;
 461         }
 462
 463         return tunnel->init_encap_attr(tunnel_dev, priv, e, extack);
 464 }
 465
 466 int mlx5e_tc_tun_parse(struct net_device *filter_dev,
 467                        struct mlx5e_priv *priv,
 468                        struct mlx5_flow_spec *spec,
 469                        struct flow_cls_offload *f,
 470                        void *headers_c,
 471                        void *headers_v, u8 *match_level)
 472 {
 473         struct mlx5e_tc_tunnel *tunnel = mlx5e_get_tc_tun(filter_dev);
 474         int err = 0;
 475
 476         if (!tunnel) {
 477                 netdev_warn(priv->netdev,
 478                             "decapsulation offload is not supported for %s net device\n",
 479                             mlx5e_netdev_kind(filter_dev));
 480                 err = -EOPNOTSUPP;
 481                 goto out;
 482         }
 483
 484         *match_level = tunnel->match_level;
 485
 486         if (tunnel->parse_udp_ports) {
 487                 err = tunnel->parse_udp_ports(priv, spec, f,
 488                                               headers_c, headers_v);
 489                 if (err)
 490                         goto out;
 491         }
 492
 493         if (tunnel->parse_tunnel) {
 494                 err = tunnel->parse_tunnel(priv, spec, f,
 495                                            headers_c, headers_v);
 496                 if (err)
 497                         goto out;
 498         }
 499
 500 out:
 501         return err;
 502 }
 503
 504 int mlx5e_tc_tun_parse_udp_ports(struct mlx5e_priv *priv,
 505                                  struct mlx5_flow_spec *spec,
 506                                  struct flow_cls_offload *f,
 507                                  void *headers_c,
 508                                  void *headers_v)
 509 {
 510         struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 511         struct netlink_ext_ack *extack = f->common.extack;
 512         struct flow_match_ports enc_ports;
 513
 514         /* Full udp dst port must be given */
 515
 516         if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
 517                 NL_SET_ERR_MSG_MOD(extack,
 518                                    "UDP tunnel decap filter must include enc_dst_port condition");
 519                 netdev_warn(priv->netdev,
 520                             "UDP tunnel decap filter must include enc_dst_port condition\n");
 521                 return -EOPNOTSUPP;
 522         }
 523
 524         flow_rule_match_enc_ports(rule, &enc_ports);
 525
 526         if (memchr_inv(&enc_ports.mask->dst, 0xff,
 527                        sizeof(enc_ports.mask->dst))) {
 528                 NL_SET_ERR_MSG_MOD(extack,
 529                                    "UDP tunnel decap filter must match enc_dst_port fully");
 530                 netdev_warn(priv->netdev,
 531                             "UDP tunnel decap filter must match enc_dst_port fully\n");
 532                 return -EOPNOTSUPP;
 533         }
 534
 535         /* match on UDP protocol and dst port number */
 536
 537         MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol);
 538         MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, IPPROTO_UDP);
 539
 540         MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
 541                  ntohs(enc_ports.mask->dst));
 542         MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
 543                  ntohs(enc_ports.key->dst));
 544
 545         /* UDP src port on outer header is generated by HW,
 546          * so it is probably a bad idea to request matching it.
 547          * Nonetheless, it is allowed.
 548          */
 549
 550         MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
 551                  ntohs(enc_ports.mask->src));
 552         MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
 553                  ntohs(enc_ports.key->src));
 554
 555         return 0;
 556 }