ixgbe: add XDP support for pass and drop actions
authorJohn Fastabend <john.r.fastabend@intel.com>
Mon, 24 Apr 2017 10:30:17 +0000 (03:30 -0700)
committerJeff Kirsher <jeffrey.t.kirsher@intel.com>
Sun, 30 Apr 2017 02:55:08 +0000 (19:55 -0700)
Basic XDP drop support for ixgbe. Uses READ_ONCE/xchg semantics on XDP
programs instead of RCU primitives as suggested by Daniel Borkmann and
Alex Duyck.

v2: fix the build issues seen w/ XDP when page sizes are larger than 4K
    and made minor fixes based on feedback from Jakub Kicinski

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
drivers/net/ethernet/intel/ixgbe/ixgbe.h
drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

index 656ca8f697680bd70cf0e3d62a03d3cc1496dfb8..cb14813b00803d6806cbda55ddc67c1399856c40 100644 (file)
@@ -318,6 +318,7 @@ struct ixgbe_ring {
        struct ixgbe_ring *next;        /* pointer to next ring in q_vector */
        struct ixgbe_q_vector *q_vector; /* backpointer to host q_vector */
        struct net_device *netdev;      /* netdev ring belongs to */
+       struct bpf_prog *xdp_prog;
        struct device *dev;             /* device for DMA mapping */
        struct ixgbe_fwd_adapter *l2_accel_priv;
        void *desc;                     /* descriptor ring memory */
@@ -555,6 +556,7 @@ struct ixgbe_adapter {
        unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
        /* OS defined structs */
        struct net_device *netdev;
+       struct bpf_prog *xdp_prog;
        struct pci_dev *pdev;
 
        unsigned long state;
@@ -835,7 +837,7 @@ void ixgbe_down(struct ixgbe_adapter *adapter);
 void ixgbe_reinit_locked(struct ixgbe_adapter *adapter);
 void ixgbe_reset(struct ixgbe_adapter *adapter);
 void ixgbe_set_ethtool_ops(struct net_device *netdev);
-int ixgbe_setup_rx_resources(struct ixgbe_ring *);
+int ixgbe_setup_rx_resources(struct ixgbe_adapter *, struct ixgbe_ring *);
 int ixgbe_setup_tx_resources(struct ixgbe_ring *);
 void ixgbe_free_rx_resources(struct ixgbe_ring *);
 void ixgbe_free_tx_resources(struct ixgbe_ring *);
index 59730ede4746002bf587dffe28e9d98b735c5959..79a126d9e091c9a10b1acc5064cd41a50741112d 100644 (file)
@@ -1128,7 +1128,7 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
                               sizeof(struct ixgbe_ring));
 
                        temp_ring[i].count = new_rx_count;
-                       err = ixgbe_setup_rx_resources(&temp_ring[i]);
+                       err = ixgbe_setup_rx_resources(adapter, &temp_ring[i]);
                        if (err) {
                                while (i) {
                                        i--;
@@ -1761,7 +1761,7 @@ static int ixgbe_setup_desc_rings(struct ixgbe_adapter *adapter)
        rx_ring->netdev = adapter->netdev;
        rx_ring->reg_idx = adapter->rx_ring[0]->reg_idx;
 
-       err = ixgbe_setup_rx_resources(rx_ring);
+       err = ixgbe_setup_rx_resources(adapter, rx_ring);
        if (err) {
                ret_val = 4;
                goto err_nomem;
index afff2ca7f8c0d784d51f8232b0665aaa746151b3..99b5357c3e0068aef130beab99129e10eed3dbda 100644 (file)
@@ -49,6 +49,9 @@
 #include <linux/if_macvlan.h>
 #include <linux/if_bridge.h>
 #include <linux/prefetch.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/atomic.h>
 #include <scsi/fc/fc_fcoe.h>
 #include <net/udp_tunnel.h>
 #include <net/pkt_cls.h>
@@ -1855,6 +1858,10 @@ static void ixgbe_dma_sync_frag(struct ixgbe_ring *rx_ring,
  * @rx_desc: pointer to the EOP Rx descriptor
  * @skb: pointer to current skb being fixed
  *
+ * Check if the skb is valid; in the XDP case it will be an error pointer.
+ * Return true in this case to abort processing and advance to next
+ * descriptor.
+ *
  * Check for corrupted packet headers caused by senders on the local L2
  * embedded NIC switch not setting up their Tx Descriptors right.  These
  * should be very rare.
@@ -1873,6 +1880,10 @@ static bool ixgbe_cleanup_headers(struct ixgbe_ring *rx_ring,
 {
        struct net_device *netdev = rx_ring->netdev;
 
+       /* XDP packets use error pointer so abort at this point */
+       if (IS_ERR(skb))
+               return true;
+
        /* verify that the packet does not have any known errors */
        if (unlikely(ixgbe_test_staterr(rx_desc,
                                        IXGBE_RXDADV_ERR_FRAME_ERR_MASK) &&
@@ -2048,7 +2059,7 @@ static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring,
                /* hand second half of page back to the ring */
                ixgbe_reuse_rx_page(rx_ring, rx_buffer);
        } else {
-               if (IXGBE_CB(skb)->dma == rx_buffer->dma) {
+               if (!IS_ERR(skb) && IXGBE_CB(skb)->dma == rx_buffer->dma) {
                        /* the page has been released from the ring */
                        IXGBE_CB(skb)->page_released = true;
                } else {
@@ -2069,21 +2080,22 @@ static void ixgbe_put_rx_buffer(struct ixgbe_ring *rx_ring,
 
 static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
                                           struct ixgbe_rx_buffer *rx_buffer,
-                                          union ixgbe_adv_rx_desc *rx_desc,
-                                          unsigned int size)
+                                          struct xdp_buff *xdp,
+                                          union ixgbe_adv_rx_desc *rx_desc)
 {
-       void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
+       unsigned int size = xdp->data_end - xdp->data;
 #if (PAGE_SIZE < 8192)
        unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
-       unsigned int truesize = SKB_DATA_ALIGN(size);
+       unsigned int truesize = SKB_DATA_ALIGN(xdp->data_end -
+                                              xdp->data_hard_start);
 #endif
        struct sk_buff *skb;
 
        /* prefetch first cache line of first page */
-       prefetch(va);
+       prefetch(xdp->data);
 #if L1_CACHE_BYTES < 128
-       prefetch(va + L1_CACHE_BYTES);
+       prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
 
        /* allocate a skb to store the frags */
@@ -2096,7 +2108,7 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
                        IXGBE_CB(skb)->dma = rx_buffer->dma;
 
                skb_add_rx_frag(skb, 0, rx_buffer->page,
-                               rx_buffer->page_offset,
+                               xdp->data - page_address(rx_buffer->page),
                                size, truesize);
 #if (PAGE_SIZE < 8192)
                rx_buffer->page_offset ^= truesize;
@@ -2104,7 +2116,8 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
                rx_buffer->page_offset += truesize;
 #endif
        } else {
-               memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
+               memcpy(__skb_put(skb, size),
+                      xdp->data, ALIGN(size, sizeof(long)));
                rx_buffer->pagecnt_bias++;
        }
 
@@ -2113,32 +2126,32 @@ static struct sk_buff *ixgbe_construct_skb(struct ixgbe_ring *rx_ring,
 
 static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
                                       struct ixgbe_rx_buffer *rx_buffer,
-                                      union ixgbe_adv_rx_desc *rx_desc,
-                                      unsigned int size)
+                                      struct xdp_buff *xdp,
+                                      union ixgbe_adv_rx_desc *rx_desc)
 {
-       void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
 #if (PAGE_SIZE < 8192)
        unsigned int truesize = ixgbe_rx_pg_size(rx_ring) / 2;
 #else
        unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
-                               SKB_DATA_ALIGN(IXGBE_SKB_PAD + size);
+                               SKB_DATA_ALIGN(xdp->data_end -
+                                              xdp->data_hard_start);
 #endif
        struct sk_buff *skb;
 
        /* prefetch first cache line of first page */
-       prefetch(va);
+       prefetch(xdp->data);
 #if L1_CACHE_BYTES < 128
-       prefetch(va + L1_CACHE_BYTES);
+       prefetch(xdp->data + L1_CACHE_BYTES);
 #endif
 
-       /* build an skb around the page buffer */
-       skb = build_skb(va - IXGBE_SKB_PAD, truesize);
+       /* build an skb around the page buffer */
+       skb = build_skb(xdp->data_hard_start, truesize);
        if (unlikely(!skb))
                return NULL;
 
        /* update pointers within the skb to store the data */
-       skb_reserve(skb, IXGBE_SKB_PAD);
-       __skb_put(skb, size);
+       skb_reserve(skb, xdp->data - xdp->data_hard_start);
+       __skb_put(skb, xdp->data_end - xdp->data);
 
        /* record DMA address if this is the start of a chain of buffers */
        if (!ixgbe_test_staterr(rx_desc, IXGBE_RXD_STAT_EOP))
@@ -2154,6 +2167,41 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
        return skb;
 }
 
+#define IXGBE_XDP_PASS 0
+#define IXGBE_XDP_CONSUMED 1
+
+static struct sk_buff *ixgbe_run_xdp(struct ixgbe_ring  *rx_ring,
+                                    struct xdp_buff *xdp)
+{
+       int result = IXGBE_XDP_PASS;
+       struct bpf_prog *xdp_prog;
+       u32 act;
+
+       rcu_read_lock();
+       xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+
+       if (!xdp_prog)
+               goto xdp_out;
+
+       act = bpf_prog_run_xdp(xdp_prog, xdp);
+       switch (act) {
+       case XDP_PASS:
+               break;
+       default:
+               bpf_warn_invalid_xdp_action(act);
+       case XDP_TX:
+       case XDP_ABORTED:
+               trace_xdp_exception(rx_ring->netdev, xdp_prog, act);
+               /* fallthrough -- handle aborts by dropping packet */
+       case XDP_DROP:
+               result = IXGBE_XDP_CONSUMED;
+               break;
+       }
+xdp_out:
+       rcu_read_unlock();
+       return ERR_PTR(-result);
+}
+
 /**
  * ixgbe_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
  * @q_vector: structure containing interrupt and ring information
@@ -2183,6 +2231,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
                union ixgbe_adv_rx_desc *rx_desc;
                struct ixgbe_rx_buffer *rx_buffer;
                struct sk_buff *skb;
+               struct xdp_buff xdp;
                unsigned int size;
 
                /* return some buffers to hardware, one at a time is too slow */
@@ -2205,14 +2254,29 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
                rx_buffer = ixgbe_get_rx_buffer(rx_ring, rx_desc, &skb, size);
 
                /* retrieve a buffer from the ring */
-               if (skb)
+               if (!skb) {
+                       xdp.data = page_address(rx_buffer->page) +
+                                  rx_buffer->page_offset;
+                       xdp.data_hard_start = xdp.data -
+                                             ixgbe_rx_offset(rx_ring);
+                       xdp.data_end = xdp.data + size;
+
+                       skb = ixgbe_run_xdp(rx_ring, &xdp);
+               }
+
+               if (IS_ERR(skb)) {
+                       total_rx_packets++;
+                       total_rx_bytes += size;
+                       rx_buffer->pagecnt_bias++;
+               } else if (skb) {
                        ixgbe_add_rx_frag(rx_ring, rx_buffer, skb, size);
-               else if (ring_uses_build_skb(rx_ring))
+               } else if (ring_uses_build_skb(rx_ring)) {
                        skb = ixgbe_build_skb(rx_ring, rx_buffer,
-                                             rx_desc, size);
-               else
+                                             &xdp, rx_desc);
+               } else {
                        skb = ixgbe_construct_skb(rx_ring, rx_buffer,
-                                                 rx_desc, size);
+                                                 &xdp, rx_desc);
+               }
 
                /* exit if we failed to retrieve a buffer */
                if (!skb) {
@@ -6073,7 +6137,8 @@ err_setup_tx:
  *
  * Returns 0 on success, negative on failure
  **/
-int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring)
+int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
+                            struct ixgbe_ring *rx_ring)
 {
        struct device *dev = rx_ring->dev;
        int orig_node = dev_to_node(dev);
@@ -6112,6 +6177,8 @@ int ixgbe_setup_rx_resources(struct ixgbe_ring *rx_ring)
        rx_ring->next_to_clean = 0;
        rx_ring->next_to_use = 0;
 
+       rx_ring->xdp_prog = adapter->xdp_prog;
+
        return 0;
 err:
        vfree(rx_ring->rx_buffer_info);
@@ -6135,7 +6202,7 @@ static int ixgbe_setup_all_rx_resources(struct ixgbe_adapter *adapter)
        int i, err = 0;
 
        for (i = 0; i < adapter->num_rx_queues; i++) {
-               err = ixgbe_setup_rx_resources(adapter->rx_ring[i]);
+               err = ixgbe_setup_rx_resources(adapter, adapter->rx_ring[i]);
                if (!err)
                        continue;
 
@@ -6203,6 +6270,7 @@ void ixgbe_free_rx_resources(struct ixgbe_ring *rx_ring)
 {
        ixgbe_clean_rx_ring(rx_ring);
 
+       rx_ring->xdp_prog = NULL;
        vfree(rx_ring->rx_buffer_info);
        rx_ring->rx_buffer_info = NULL;
 
@@ -9468,6 +9536,54 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
        return features;
 }
 
+static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
+{
+       int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+       struct ixgbe_adapter *adapter = netdev_priv(dev);
+       struct bpf_prog *old_prog;
+
+       if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)
+               return -EINVAL;
+
+       if (adapter->flags & IXGBE_FLAG_DCB_ENABLED)
+               return -EINVAL;
+
+       /* verify ixgbe ring attributes are sufficient for XDP */
+       for (i = 0; i < adapter->num_rx_queues; i++) {
+               struct ixgbe_ring *ring = adapter->rx_ring[i];
+
+               if (ring_is_rsc_enabled(ring))
+                       return -EINVAL;
+
+               if (frame_size > ixgbe_rx_bufsz(ring))
+                       return -EINVAL;
+       }
+
+       old_prog = xchg(&adapter->xdp_prog, prog);
+       for (i = 0; i < adapter->num_rx_queues; i++)
+               xchg(&adapter->rx_ring[i]->xdp_prog, adapter->xdp_prog);
+
+       if (old_prog)
+               bpf_prog_put(old_prog);
+
+       return 0;
+}
+
+static int ixgbe_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+       struct ixgbe_adapter *adapter = netdev_priv(dev);
+
+       switch (xdp->command) {
+       case XDP_SETUP_PROG:
+               return ixgbe_xdp_setup(dev, xdp->prog);
+       case XDP_QUERY_PROG:
+               xdp->prog_attached = !!(adapter->xdp_prog);
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
+
 static const struct net_device_ops ixgbe_netdev_ops = {
        .ndo_open               = ixgbe_open,
        .ndo_stop               = ixgbe_close,
@@ -9513,6 +9629,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
        .ndo_udp_tunnel_add     = ixgbe_add_udp_tunnel_port,
        .ndo_udp_tunnel_del     = ixgbe_del_udp_tunnel_port,
        .ndo_features_check     = ixgbe_features_check,
+       .ndo_xdp                = ixgbe_xdp,
 };
 
 /**