Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
author    David S. Miller <davem@davemloft.net>
Fri, 6 Sep 2019 14:49:17 +0000 (16:49 +0200)
committer David S. Miller <davem@davemloft.net>
Fri, 6 Sep 2019 14:49:17 +0000 (16:49 +0200)
Daniel Borkmann says:

====================
The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add the ability to use unaligned chunks in the AF_XDP umem. By
   relaxing where the chunks can be placed, an arbitrary buffer size
   can be used and chunks can be placed wherever there is a free
   address in the umem, which helps more seamless DPDK AF_XDP driver
   integration (see the umem setup sketch after the changelog).
   Support for i40e, ixgbe and mlx5e, from Kevin and Maxim.

2) Addition of a wakeup flag for AF_XDP tx and fill rings so the
   application can wake up the kernel for rx/tx processing, which
   avoids busy-spinning by the latter; useful when app and driver
   are located on the same core (see the wakeup sketch after the
   changelog). Support for i40e, ixgbe and mlx5e, from Magnus and
   Maxim.

3) bpftool fixes for printf()-like functions so compiler can actually
   enforce checks, bpftool build system improvements for custom output
   directories, and addition of 'bpftool map freeze' command, from Quentin.

4) Support attaching/detaching XDP programs from 'bpftool net' command,
   from Daniel.

5) Automatic xskmap cleanup when AF_XDP socket is released, and several
   barrier/{read,write}_once fixes in AF_XDP code, from Björn.

6) Relicense of bpf_helpers.h/bpf_endian.h for future libbpf
   inclusion as well as libbpf versioning improvements, from Andrii.

7) Several new BPF kselftests for verifier precision tracking, from Alexei.

8) Several BPF kselftest fixes wrt endianness to run on s390x, from Ilya.

9) And more BPF kselftest improvements all over the place, from Stanislav.

10) Add simple BPF map op cache for nfp driver to batch dumps, from Jakub.

11) AF_XDP socket umem mapping improvements for 32bit archs, from Ivan.

12) Add BPF-to-BPF call and BTF line info support for s390x JIT, from Yauheni.

13) Small optimization in arm64 JIT to spare one insn for BPF_MOD, from Jerin.

14) Fix an error check in bpf_tcp_gen_syncookie() helper, from Petar.

15) Various minor fixes and cleanups, from Nathan, Masahiro, Masanari,
    Peter, Wei, Yue.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
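
A minimal application-side sketch of the two features from points 1) and
2) above, assuming the xsk.h API from tools/lib/bpf in this series (the
include path and buffer management are illustrative only, not part of
the series):

    #include <poll.h>
    #include <sys/socket.h>
    #include "xsk.h"    /* tools/lib/bpf/xsk.h; path is an assumption */

    static struct xsk_umem *create_unaligned_umem(void *bufs, __u64 size,
                                                  struct xsk_ring_prod *fill,
                                                  struct xsk_ring_cons *comp)
    {
            /* XDP_UMEM_UNALIGNED_CHUNK_FLAG lifts the power-of-two
             * restriction, so an arbitrary frame size such as 3000
             * becomes valid and chunks can sit at any free address.
             */
            struct xsk_umem_config cfg = {
                    .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
                    .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
                    .frame_size = 3000,
                    .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
                    .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG,
            };
            struct xsk_umem *umem;

            if (xsk_umem__create(&umem, bufs, size, fill, comp, &cfg))
                    return NULL;
            return umem;
    }

    static void kick_tx(struct xsk_socket *xsk, struct xsk_ring_prod *tx)
    {
            /* With the socket bound using XDP_USE_NEED_WAKEUP, a
             * syscall is needed only when the kernel has flagged the
             * ring; otherwise the driver keeps processing on its own.
             */
            if (xsk_ring_prod__needs_wakeup(tx))
                    sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT,
                           NULL, 0);
    }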
124 files changed:
Documentation/networking/af_xdp.rst
arch/arm64/net/bpf_jit.h
arch/arm64/net/bpf_jit_comp.c
arch/s390/net/bpf_jit_comp.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_xsk.c
drivers/net/ethernet/intel/i40e/i40e_xsk.h
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h
drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
drivers/net/ethernet/mellanox/mlx5/core/en/params.c
drivers/net/ethernet/mellanox/mlx5/core/en/params.h
drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
drivers/net/ethernet/netronome/nfp/bpf/fw.h
drivers/net/ethernet/netronome/nfp/bpf/main.c
drivers/net/ethernet/netronome/nfp/bpf/main.h
drivers/net/ethernet/netronome/nfp/bpf/offload.c
drivers/net/ethernet/netronome/nfp/nfp_net.h
drivers/net/ethernet/netronome/nfp/nfp_net_common.c
include/linux/bpf.h
include/linux/bpf_verifier.h
include/linux/netdevice.h
include/linux/tnum.h
include/net/bpf_sk_storage.h
include/net/xdp_sock.h
include/uapi/linux/bpf.h
include/uapi/linux/if_xdp.h
kernel/bpf/btf.c
kernel/bpf/syscall.c
kernel/bpf/sysfs_btf.c
kernel/bpf/verifier.c
kernel/bpf/xskmap.c
kernel/trace/Kconfig
lib/test_bpf.c
net/core/bpf_sk_storage.c
net/core/dev.c
net/core/filter.c
net/core/sock.c
net/xdp/xdp_umem.c
net/xdp/xsk.c
net/xdp/xsk.h
net/xdp/xsk_diag.c
net/xdp/xsk_queue.h
samples/bpf/syscall_nrs.c
samples/bpf/tracex5_kern.c
samples/bpf/xdpsock_user.c
scripts/link-vmlinux.sh
tools/bpf/.gitignore
tools/bpf/Makefile
tools/bpf/bpftool/.gitignore
tools/bpf/bpftool/Documentation/bpftool-btf.rst
tools/bpf/bpftool/Documentation/bpftool-map.rst
tools/bpf/bpftool/Documentation/bpftool-net.rst
tools/bpf/bpftool/Makefile
tools/bpf/bpftool/bash-completion/bpftool
tools/bpf/bpftool/btf.c
tools/bpf/bpftool/btf_dumper.c
tools/bpf/bpftool/cgroup.c
tools/bpf/bpftool/common.c
tools/bpf/bpftool/json_writer.c
tools/bpf/bpftool/json_writer.h
tools/bpf/bpftool/main.c
tools/bpf/bpftool/main.h
tools/bpf/bpftool/map.c
tools/bpf/bpftool/map_perf_ring.c
tools/bpf/bpftool/net.c
tools/bpf/bpftool/perf.c
tools/include/linux/compiler-gcc.h
tools/include/uapi/linux/bpf.h
tools/include/uapi/linux/if_xdp.h
tools/lib/bpf/Makefile
tools/lib/bpf/bpf.c
tools/lib/bpf/bpf.h
tools/lib/bpf/libbpf.map
tools/lib/bpf/xsk.c
tools/lib/bpf/xsk.h
tools/testing/selftests/bpf/.gitignore
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/bpf_endian.h
tools/testing/selftests/bpf/bpf_helpers.h
tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
tools/testing/selftests/bpf/prog_tests/flow_dissector.c
tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
tools/testing/selftests/bpf/prog_tests/global_data.c
tools/testing/selftests/bpf/prog_tests/l4lb_all.c
tools/testing/selftests/bpf/prog_tests/map_lock.c
tools/testing/selftests/bpf/prog_tests/pkt_access.c
tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
tools/testing/selftests/bpf/prog_tests/reference_tracking.c
tools/testing/selftests/bpf/prog_tests/send_signal.c
tools/testing/selftests/bpf/prog_tests/spinlock.c
tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c
tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
tools/testing/selftests/bpf/prog_tests/tcp_estats.c
tools/testing/selftests/bpf/prog_tests/xdp.c
tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
tools/testing/selftests/bpf/progs/sockopt_inherit.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
tools/testing/selftests/bpf/progs/test_seg6_loop.c
tools/testing/selftests/bpf/test_bpftool_build.sh [new file with mode: 0755]
tools/testing/selftests/bpf/test_offload.py
tools/testing/selftests/bpf/test_progs.c
tools/testing/selftests/bpf/test_progs.h
tools/testing/selftests/bpf/test_sockopt_inherit.c [new file with mode: 0644]
tools/testing/selftests/bpf/test_sysctl.c
tools/testing/selftests/bpf/test_tcp_rtt.c
tools/testing/selftests/bpf/test_verifier.c
tools/testing/selftests/bpf/verifier/precise.c [new file with mode: 0644]

index eeedc2e826aa066c89b2073d8fd26100a66fdf8d..83f7ae5fc045e22ad7e35dfdb65dd3e7e342e0b3 100644 (file)
@@ -153,10 +153,12 @@ an example, if the UMEM is 64k and each chunk is 4k, then the UMEM has
 
 Frames passed to the kernel are used for the ingress path (RX rings).
 
-The user application produces UMEM addrs to this ring. Note that the
-kernel will mask the incoming addr. E.g. for a chunk size of 2k, the
-log2(2048) LSB of the addr will be masked off, meaning that 2048, 2050
-and 3000 refers to the same chunk.
+The user application produces UMEM addrs to this ring. Note that, if
+running the application with aligned chunk mode, the kernel will mask
+the incoming addr.  E.g. for a chunk size of 2k, the log2(2048) LSB of
+the addr will be masked off, meaning that 2048, 2050 and 3000 refer
+to the same chunk. If the user application is run in the unaligned
+chunks mode, then the incoming addr will be left untouched.
 
 
 UMEM Completion Ring
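
An illustrative helper (not kernel code) for the addressing rules this
hunk documents — aligned mode masks off the log2(chunk_size) low bits,
unaligned mode passes the addr through:

    static __u64 chunk_addr(__u64 addr, __u64 chunk_size, bool unaligned)
    {
            /* aligned, 2k chunks: 2048, 2050 and 3000 all collapse to
             * 2048; unaligned: the addr is used as-is
             */
            return unaligned ? addr : addr & ~(chunk_size - 1);
    }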
index cb7ab50b76579067d29fbb575396fa47b0a36127..eb73f9f72c467a1bbacecb7ea270b75066b8d2ab 100644 (file)
 /* Rd = Ra + Rn * Rm */
 #define A64_MADD(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
        A64_VARIANT(sf), AARCH64_INSN_DATA3_MADD)
+/* Rd = Ra - Rn * Rm */
+#define A64_MSUB(sf, Rd, Ra, Rn, Rm) aarch64_insn_gen_data3(Rd, Ra, Rn, Rm, \
+       A64_VARIANT(sf), AARCH64_INSN_DATA3_MSUB)
 /* Rd = Rn * Rm */
 #define A64_MUL(sf, Rd, Rn, Rm) A64_MADD(sf, Rd, A64_ZR, Rn, Rm)
 
index f5b437f8a22b4042f29d4e8c55b181efd3350210..cdc79de0c794af4e46176c6924b14fd127426f54 100644 (file)
@@ -409,8 +409,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
                        break;
                case BPF_MOD:
                        emit(A64_UDIV(is64, tmp, dst, src), ctx);
-                       emit(A64_MUL(is64, tmp, tmp, src), ctx);
-                       emit(A64_SUB(is64, dst, dst, tmp), ctx);
+                       emit(A64_MSUB(is64, dst, dst, tmp, src), ctx);
                        break;
                }
                break;
@@ -516,8 +515,7 @@ emit_bswap_uxt:
        case BPF_ALU64 | BPF_MOD | BPF_K:
                emit_a64_mov_i(is64, tmp2, imm, ctx);
                emit(A64_UDIV(is64, tmp, dst, tmp2), ctx);
-               emit(A64_MUL(is64, tmp, tmp, tmp2), ctx);
-               emit(A64_SUB(is64, dst, dst, tmp), ctx);
+               emit(A64_MSUB(is64, dst, dst, tmp, tmp2), ctx);
                break;
        case BPF_ALU | BPF_LSH | BPF_K:
        case BPF_ALU64 | BPF_LSH | BPF_K:
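
The BPF_MOD rewrite above uses the identity dst % src == dst - (dst /
src) * src; because MSUB folds the multiply and subtract into one
instruction, the three-insn sequence becomes two (operand order below
follows the A64_MSUB() macro from bpf_jit.h, i.e. Rd = Ra - Rn * Rm):

    before: udiv tmp, dst, src        /* tmp = dst / src       */
            mul  tmp, tmp, src        /* tmp = tmp * src       */
            sub  dst, dst, tmp        /* dst = dst - tmp       */

    after:  udiv tmp, dst, src        /* tmp = dst / src       */
            msub dst, dst, tmp, src   /* dst = dst - tmp * src */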
index 955eb355c2fdea049412ecbcf81b84427e4cf406..ce88211b9c6cdda55164f82ef00327a98ba11b6d 100644 (file)
@@ -502,7 +502,8 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
  * NOTE: Use noinline because for gcov (-fprofile-arcs) gcc allocates a lot of
  * stack space for the large switch statement.
  */
-static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
+static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
+                                int i, bool extra_pass)
 {
        struct bpf_insn *insn = &fp->insnsi[i];
        int jmp_off, last, insn_count = 1;
@@ -1011,10 +1012,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
         */
        case BPF_JMP | BPF_CALL:
        {
-               /*
-                * b0 = (__bpf_call_base + imm)(b1, b2, b3, b4, b5)
-                */
-               const u64 func = (u64)__bpf_call_base + imm;
+               u64 func;
+               bool func_addr_fixed;
+               int ret;
+
+               ret = bpf_jit_get_func_addr(fp, insn, extra_pass,
+                                           &func, &func_addr_fixed);
+               if (ret < 0)
+                       return -1;
 
                REG_SET_SEEN(BPF_REG_5);
                jit->seen |= SEEN_FUNC;
@@ -1283,7 +1288,8 @@ branch_oc:
 /*
  * Compile eBPF program into s390x code
  */
-static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
+static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
+                       bool extra_pass)
 {
        int i, insn_count;
 
@@ -1292,7 +1298,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp)
 
        bpf_jit_prologue(jit, fp->aux->stack_depth);
        for (i = 0; i < fp->len; i += insn_count) {
-               insn_count = bpf_jit_insn(jit, fp, i);
+               insn_count = bpf_jit_insn(jit, fp, i, extra_pass);
                if (insn_count < 0)
                        return -1;
                /* Next instruction address */
@@ -1311,6 +1317,12 @@ bool bpf_jit_needs_zext(void)
        return true;
 }
 
+struct s390_jit_data {
+       struct bpf_binary_header *header;
+       struct bpf_jit ctx;
+       int pass;
+};
+
 /*
  * Compile eBPF program "fp"
  */
@@ -1318,7 +1330,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
        struct bpf_prog *tmp, *orig_fp = fp;
        struct bpf_binary_header *header;
+       struct s390_jit_data *jit_data;
        bool tmp_blinded = false;
+       bool extra_pass = false;
        struct bpf_jit jit;
        int pass;
 
@@ -1337,6 +1351,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
                fp = tmp;
        }
 
+       jit_data = fp->aux->jit_data;
+       if (!jit_data) {
+               jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
+               if (!jit_data) {
+                       fp = orig_fp;
+                       goto out;
+               }
+               fp->aux->jit_data = jit_data;
+       }
+       if (jit_data->ctx.addrs) {
+               jit = jit_data->ctx;
+               header = jit_data->header;
+               extra_pass = true;
+               pass = jit_data->pass + 1;
+               goto skip_init_ctx;
+       }
+
        memset(&jit, 0, sizeof(jit));
        jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
        if (jit.addrs == NULL) {
@@ -1349,7 +1380,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
         *   - 3:   Calculate program size and addrs array
         */
        for (pass = 1; pass <= 3; pass++) {
-               if (bpf_jit_prog(&jit, fp)) {
+               if (bpf_jit_prog(&jit, fp, extra_pass)) {
                        fp = orig_fp;
                        goto free_addrs;
                }
@@ -1361,12 +1392,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
                fp = orig_fp;
                goto free_addrs;
        }
+
        header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 2, jit_fill_hole);
        if (!header) {
                fp = orig_fp;
                goto free_addrs;
        }
-       if (bpf_jit_prog(&jit, fp)) {
+skip_init_ctx:
+       if (bpf_jit_prog(&jit, fp, extra_pass)) {
                bpf_jit_binary_free(header);
                fp = orig_fp;
                goto free_addrs;
@@ -1375,12 +1408,24 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
                bpf_jit_dump(fp->len, jit.size, pass, jit.prg_buf);
                print_fn_code(jit.prg_buf, jit.size_prg);
        }
-       bpf_jit_binary_lock_ro(header);
+       if (!fp->is_func || extra_pass) {
+               bpf_jit_binary_lock_ro(header);
+       } else {
+               jit_data->header = header;
+               jit_data->ctx = jit;
+               jit_data->pass = pass;
+       }
        fp->bpf_func = (void *) jit.prg_buf;
        fp->jited = 1;
        fp->jited_len = jit.size;
+
+       if (!fp->is_func || extra_pass) {
+               bpf_prog_fill_jited_linfo(fp, jit.addrs + 1);
 free_addrs:
-       kfree(jit.addrs);
+               kfree(jit.addrs);
+               kfree(jit_data);
+               fp->aux->jit_data = NULL;
+       }
 out:
        if (tmp_blinded)
                bpf_jit_prog_release_other(fp, fp == orig_fp ?
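
In short, the s390 JIT now follows the scheme other JITs use for
BPF-to-BPF calls; a sketch of the control flow implied by the hunks
above (descriptive comments, not verbatim driver code):

    /* First invocation: subprog images don't exist yet, so calls are
     * emitted against provisional addresses and the JIT context is
     * parked in fp->aux->jit_data instead of being locked read-only.
     * Once all subprogs are jited, the kernel re-invokes
     * bpf_int_jit_compile(); jit_data->ctx.addrs is then non-NULL, so
     * extra_pass becomes true, the sizing passes are skipped via
     * skip_init_ctx, and bpf_jit_get_func_addr() hands back the final
     * callee address to emit for each BPF_JMP | BPF_CALL insn.
     */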
index fdf43d87e983fe5b6ca478bb4baeece34c3e37d9..3c8a2f55c43af77d9b260bee9604918f0e5692b1 100644 (file)
@@ -12530,7 +12530,8 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi,
        if (need_reset && prog)
                for (i = 0; i < vsi->num_queue_pairs; i++)
                        if (vsi->xdp_rings[i]->xsk_umem)
-                               (void)i40e_xsk_async_xmit(vsi->netdev, i);
+                               (void)i40e_xsk_wakeup(vsi->netdev, i,
+                                                     XDP_WAKEUP_RX);
 
        return 0;
 }
@@ -12852,7 +12853,7 @@ static const struct net_device_ops i40e_netdev_ops = {
        .ndo_bridge_setlink     = i40e_ndo_bridge_setlink,
        .ndo_bpf                = i40e_xdp,
        .ndo_xdp_xmit           = i40e_xdp_xmit,
-       .ndo_xsk_async_xmit     = i40e_xsk_async_xmit,
+       .ndo_xsk_wakeup         = i40e_xsk_wakeup,
        .ndo_dfwd_add_station   = i40e_fwd_add,
        .ndo_dfwd_del_station   = i40e_fwd_del,
 };
index 32bad014d76cbaadbc90d117b703898e7f3e067f..0373bc6c7e61ce00b0abcaa3f9189e809edaeb7e 100644 (file)
@@ -116,7 +116,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
                        return err;
 
                /* Kick start the NAPI context so that receiving will start */
-               err = i40e_xsk_async_xmit(vsi->netdev, qid);
+               err = i40e_xsk_wakeup(vsi->netdev, qid, XDP_WAKEUP_RX);
                if (err)
                        return err;
        }
@@ -190,7 +190,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
  **/
 static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
 {
+       struct xdp_umem *umem = rx_ring->xsk_umem;
        int err, result = I40E_XDP_PASS;
+       u64 offset = umem->headroom;
        struct i40e_ring *xdp_ring;
        struct bpf_prog *xdp_prog;
        u32 act;
@@ -201,7 +203,10 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
         */
        xdp_prog = READ_ONCE(rx_ring->xdp_prog);
        act = bpf_prog_run_xdp(xdp_prog, xdp);
-       xdp->handle += xdp->data - xdp->data_hard_start;
+       offset += xdp->data - xdp->data_hard_start;
+
+       xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
+
        switch (act) {
        case XDP_PASS:
                break;
@@ -262,7 +267,7 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
        bi->addr = xdp_umem_get_data(umem, handle);
        bi->addr += hr;
 
-       bi->handle = handle + umem->headroom;
+       bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
 
        xsk_umem_discard_addr(umem);
        return true;
@@ -299,7 +304,7 @@ static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
        bi->addr = xdp_umem_get_data(umem, handle);
        bi->addr += hr;
 
-       bi->handle = handle + umem->headroom;
+       bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
 
        xsk_umem_discard_addr_rq(umem);
        return true;
@@ -420,8 +425,6 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
                                    struct i40e_rx_buffer *old_bi)
 {
        struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
-       unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
-       u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
        u16 nta = rx_ring->next_to_alloc;
 
        /* update, and store next to alloc */
@@ -429,14 +432,9 @@ static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
        rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
 
        /* transfer page from old buffer to new buffer */
-       new_bi->dma = old_bi->dma & mask;
-       new_bi->dma += hr;
-
-       new_bi->addr = (void *)((unsigned long)old_bi->addr & mask);
-       new_bi->addr += hr;
-
-       new_bi->handle = old_bi->handle & mask;
-       new_bi->handle += rx_ring->xsk_umem->headroom;
+       new_bi->dma = old_bi->dma;
+       new_bi->addr = old_bi->addr;
+       new_bi->handle = old_bi->handle;
 
        old_bi->addr = NULL;
 }
@@ -471,7 +469,8 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
        bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
        bi->addr += hr;
 
-       bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
+       bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
+                                           rx_ring->xsk_umem->headroom);
 }
 
 /**
@@ -626,6 +625,15 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 
        i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
        i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
+
+       if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
+               if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+                       xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
+               else
+                       xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
+
+               return (int)total_rx_packets;
+       }
        return failure ? budget : (int)total_rx_packets;
 }
 
@@ -681,6 +689,8 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
                i40e_xdp_ring_update_tail(xdp_ring);
 
                xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
+               if (xsk_umem_uses_need_wakeup(xdp_ring->xsk_umem))
+                       xsk_clear_tx_need_wakeup(xdp_ring->xsk_umem);
        }
 
        return !!budget && work_done;
@@ -759,19 +769,27 @@ bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
        i40e_update_tx_stats(tx_ring, completed_frames, total_bytes);
 
 out_xmit:
+       if (xsk_umem_uses_need_wakeup(tx_ring->xsk_umem)) {
+               if (tx_ring->next_to_clean == tx_ring->next_to_use)
+                       xsk_set_tx_need_wakeup(tx_ring->xsk_umem);
+               else
+                       xsk_clear_tx_need_wakeup(tx_ring->xsk_umem);
+       }
+
        xmit_done = i40e_xmit_zc(tx_ring, budget);
 
        return work_done && xmit_done;
 }
 
 /**
- * i40e_xsk_async_xmit - Implements the ndo_xsk_async_xmit
+ * i40e_xsk_wakeup - Implements the ndo_xsk_wakeup
  * @dev: the netdevice
  * @queue_id: queue id to wake up
+ * @flags: ignored in our case since we have Rx and Tx in the same NAPI.
  *
  * Returns <0 for errors, 0 otherwise.
  **/
-int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id)
+int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
 {
        struct i40e_netdev_priv *np = netdev_priv(dev);
        struct i40e_vsi *vsi = np->vsi;
index 8cc0a2e7d9a2fa3253ceaec272099ace792d9e44..9ed59c14eb55f931c05bca8f01a772bff390fa71 100644 (file)
@@ -18,6 +18,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
 
 bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
                           struct i40e_ring *tx_ring, int napi_budget);
-int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id);
+int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
 
 #endif /* _I40E_XSK_H_ */
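
The open-coded "handle + headroom" arithmetic that the i40e hunks drop
is centralized in xsk_umem_adjust_offset(); paraphrasing the helper this
series adds to include/net/xdp_sock.h, it keeps the buffer's base
address recoverable in unaligned chunk mode by carrying the offset in
the upper bits of the handle:

    static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem,
                                             u64 address, u64 offset)
    {
            /* unaligned chunks: the offset travels in the upper 16
             * bits (XSK_UNALIGNED_BUF_OFFSET_SHIFT), so the original
             * base address stays intact for the completion path
             */
            if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
                    return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);

            return address + offset;
    }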
index 17b7ae9f46ecdc704cde918339536b08c1ba0357..9bcae44e98835926789cf366183d3d3eb5c649f8 100644 (file)
@@ -10260,7 +10260,8 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
        if (need_reset && prog)
                for (i = 0; i < adapter->num_rx_queues; i++)
                        if (adapter->xdp_ring[i]->xsk_umem)
-                               (void)ixgbe_xsk_async_xmit(adapter->netdev, i);
+                               (void)ixgbe_xsk_wakeup(adapter->netdev, i,
+                                                      XDP_WAKEUP_RX);
 
        return 0;
 }
@@ -10379,7 +10380,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
        .ndo_features_check     = ixgbe_features_check,
        .ndo_bpf                = ixgbe_xdp,
        .ndo_xdp_xmit           = ixgbe_xdp_xmit,
-       .ndo_xsk_async_xmit     = ixgbe_xsk_async_xmit,
+       .ndo_xsk_wakeup         = ixgbe_xsk_wakeup,
 };
 
 static void ixgbe_disable_txr_hw(struct ixgbe_adapter *adapter,
index d93a690aff74f404549d067947e1faf0a4d08a02..6d01700b46bc3d8b42886f98931117456a65a783 100644 (file)
@@ -42,7 +42,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
 void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring);
 bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
                            struct ixgbe_ring *tx_ring, int napi_budget);
-int ixgbe_xsk_async_xmit(struct net_device *dev, u32 queue_id);
+int ixgbe_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
 void ixgbe_xsk_clean_tx_ring(struct ixgbe_ring *tx_ring);
 
 #endif /* #define _IXGBE_TXRX_COMMON_H_ */
index 6b609553329fa98647b78f4e1e5d3e9223a8aa7a..ad802a8909e0d4423785592ba3e87cc92191b466 100644 (file)
@@ -100,7 +100,7 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
                ixgbe_txrx_ring_enable(adapter, qid);
 
                /* Kick start the NAPI context so that receiving will start */
-               err = ixgbe_xsk_async_xmit(adapter->netdev, qid);
+               err = ixgbe_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX);
                if (err)
                        return err;
        }
@@ -143,7 +143,9 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
                            struct ixgbe_ring *rx_ring,
                            struct xdp_buff *xdp)
 {
+       struct xdp_umem *umem = rx_ring->xsk_umem;
        int err, result = IXGBE_XDP_PASS;
+       u64 offset = umem->headroom;
        struct bpf_prog *xdp_prog;
        struct xdp_frame *xdpf;
        u32 act;
@@ -151,7 +153,10 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
        rcu_read_lock();
        xdp_prog = READ_ONCE(rx_ring->xdp_prog);
        act = bpf_prog_run_xdp(xdp_prog, xdp);
-       xdp->handle += xdp->data - xdp->data_hard_start;
+       offset += xdp->data - xdp->data_hard_start;
+
+       xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
+
        switch (act) {
        case XDP_PASS:
                break;
@@ -201,8 +206,6 @@ ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
 static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
                                     struct ixgbe_rx_buffer *obi)
 {
-       unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
-       u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
        u16 nta = rx_ring->next_to_alloc;
        struct ixgbe_rx_buffer *nbi;
 
@@ -212,14 +215,9 @@ static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
        rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
 
        /* transfer page from old buffer to new buffer */
-       nbi->dma = obi->dma & mask;
-       nbi->dma += hr;
-
-       nbi->addr = (void *)((unsigned long)obi->addr & mask);
-       nbi->addr += hr;
-
-       nbi->handle = obi->handle & mask;
-       nbi->handle += rx_ring->xsk_umem->headroom;
+       nbi->dma = obi->dma;
+       nbi->addr = obi->addr;
+       nbi->handle = obi->handle;
 
        obi->addr = NULL;
        obi->skb = NULL;
@@ -250,7 +248,8 @@ void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
        bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
        bi->addr += hr;
 
-       bi->handle = (u64)handle + rx_ring->xsk_umem->headroom;
+       bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
+                                           rx_ring->xsk_umem->headroom);
 }
 
 static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
@@ -276,7 +275,7 @@ static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
        bi->addr = xdp_umem_get_data(umem, handle);
        bi->addr += hr;
 
-       bi->handle = handle + umem->headroom;
+       bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
 
        xsk_umem_discard_addr(umem);
        return true;
@@ -303,7 +302,7 @@ static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
        bi->addr = xdp_umem_get_data(umem, handle);
        bi->addr += hr;
 
-       bi->handle = handle + umem->headroom;
+       bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
 
        xsk_umem_discard_addr_rq(umem);
        return true;
@@ -547,6 +546,14 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
        q_vector->rx.total_packets += total_rx_packets;
        q_vector->rx.total_bytes += total_rx_bytes;
 
+       if (xsk_umem_uses_need_wakeup(rx_ring->xsk_umem)) {
+               if (failure || rx_ring->next_to_clean == rx_ring->next_to_use)
+                       xsk_set_rx_need_wakeup(rx_ring->xsk_umem);
+               else
+                       xsk_clear_rx_need_wakeup(rx_ring->xsk_umem);
+
+               return (int)total_rx_packets;
+       }
        return failure ? budget : (int)total_rx_packets;
 }
 
@@ -615,6 +622,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
        if (tx_desc) {
                ixgbe_xdp_ring_update_tail(xdp_ring);
                xsk_umem_consume_tx_done(xdp_ring->xsk_umem);
+               if (xsk_umem_uses_need_wakeup(xdp_ring->xsk_umem))
+                       xsk_clear_tx_need_wakeup(xdp_ring->xsk_umem);
        }
 
        return !!budget && work_done;
@@ -688,11 +697,19 @@ bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
        if (xsk_frames)
                xsk_umem_complete_tx(umem, xsk_frames);
 
+       if (xsk_umem_uses_need_wakeup(tx_ring->xsk_umem)) {
+               if (tx_ring->next_to_clean == tx_ring->next_to_use)
+                       xsk_set_tx_need_wakeup(tx_ring->xsk_umem);
+               else
+                       xsk_clear_tx_need_wakeup(tx_ring->xsk_umem);
+       }
+
        xmit_done = ixgbe_xmit_zc(tx_ring, q_vector->tx.work_limit);
+
        return budget > 0 && xmit_done;
 }
 
-int ixgbe_xsk_async_xmit(struct net_device *dev, u32 qid)
+int ixgbe_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 {
        struct ixgbe_adapter *adapter = netdev_priv(dev);
        struct ixgbe_ring *ring;
index 79301d11666762bdbc27cf052e149acb93d6a8eb..eb2e1f2138e458eab9edb29b88932b2b84e9dad5 100644 (file)
@@ -25,18 +25,33 @@ u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
        return headroom;
 }
 
-u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
-                               struct mlx5e_xsk_param *xsk)
+u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params,
+                            struct mlx5e_xsk_param *xsk)
 {
        u32 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
        u16 linear_rq_headroom = mlx5e_get_linear_rq_headroom(params, xsk);
-       u32 frag_sz = linear_rq_headroom + hw_mtu;
+
+       return linear_rq_headroom + hw_mtu;
+}
+
+u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
+                               struct mlx5e_xsk_param *xsk)
+{
+       u32 frag_sz = mlx5e_rx_get_min_frag_sz(params, xsk);
 
        /* AF_XDP doesn't build SKBs in place. */
        if (!xsk)
                frag_sz = MLX5_SKB_FRAG_SZ(frag_sz);
 
-       /* XDP in mlx5e doesn't support multiple packets per page. */
+       /* XDP in mlx5e doesn't support multiple packets per page. AF_XDP is a
+        * special case. It can run with frames smaller than a page, as it
+        * doesn't allocate pages dynamically. However, here we pretend that
+        * fragments are page-sized: it allows treating XSK frames like pages
+        * by redirecting alloc and free operations to XSK rings and by using
+        * the fact there are no multiple packets per "page" (which is a frame).
+        * The latter is important, because frames may come in a random order,
+        * and we will have trouble assembling a real page of multiple frames.
+        */
        if (mlx5e_rx_is_xdp(params, xsk))
                frag_sz = max_t(u32, frag_sz, PAGE_SIZE);
 
index 3a615d663d84ec51c2142514bdbfde897cb18f2d..989d8f4294388c1d21facd0a21b1df3e42d86020 100644 (file)
@@ -76,6 +76,8 @@ static inline bool mlx5e_qid_validate(const struct mlx5e_profile *profile,
 
 u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
                                 struct mlx5e_xsk_param *xsk);
+u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params,
+                            struct mlx5e_xsk_param *xsk);
 u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
                                struct mlx5e_xsk_param *xsk);
 u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params,
index 1ed5c33e022f5d8fbdeff14ba598fe8a1cef9c05..f049e0ac308a00c587a3260aa110e48f4accb445 100644 (file)
@@ -122,6 +122,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
                      void *va, u16 *rx_headroom, u32 *len, bool xsk)
 {
        struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
+       struct xdp_umem *umem = rq->umem;
        struct xdp_buff xdp;
        u32 act;
        int err;
@@ -138,8 +139,11 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
        xdp.rxq = &rq->xdp_rxq;
 
        act = bpf_prog_run_xdp(prog, &xdp);
-       if (xsk)
-               xdp.handle += xdp.data - xdp.data_hard_start;
+       if (xsk) {
+               u64 off = xdp.data - xdp.data_hard_start;
+
+               xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
+       }
        switch (act) {
        case XDP_PASS:
                *rx_headroom = xdp.data - xdp.data_hard_start;
index 6a55573ec8f2964c226e766c7adba007d7c0d717..475b6bd5d29be7683cc6bbe058eb04c48d4ea357 100644 (file)
@@ -24,7 +24,8 @@ int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
        if (!xsk_umem_peek_addr_rq(umem, &handle))
                return -ENOMEM;
 
-       dma_info->xsk.handle = handle + rq->buff.umem_headroom;
+       dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
+                                                     rq->buff.umem_headroom);
        dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
 
        /* No need to add headroom to the DMA address. In striding RQ case, we
@@ -104,7 +105,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 
        /* head_offset is not used in this function, because di->xsk.data and
         * di->addr point directly to the necessary place. Furthermore, in the
-        * current implementation, one page = one packet = one frame, so
+        * current implementation, UMR pages are mapped to XSK frames, so
         * head_offset should always be 0.
         */
        WARN_ON_ONCE(head_offset);
index 307b923a136139facb90fc44fb433fcb2fa1077a..cab0e93497ae6d2cf53197e86f5ec3ee3df8607d 100644 (file)
@@ -5,6 +5,7 @@
 #define __MLX5_EN_XSK_RX_H__
 
 #include "en.h"
+#include <net/xdp_sock.h>
 
 /* RX data path */
 
@@ -24,4 +25,17 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
                                              struct mlx5e_wqe_frag_info *wi,
                                              u32 cqe_bcnt);
 
+static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
+{
+       if (!xsk_umem_uses_need_wakeup(rq->umem))
+               return alloc_err;
+
+       if (unlikely(alloc_err))
+               xsk_set_rx_need_wakeup(rq->umem);
+       else
+               xsk_clear_rx_need_wakeup(rq->umem);
+
+       return false;
+}
+
 #endif /* __MLX5_EN_XSK_RX_H__ */
index d360750b25b7922f7308413257ec4b3d1408f942..631af8dee5171d67447e5eefac4809a115bb4e3e 100644 (file)
@@ -4,18 +4,23 @@
 #include "setup.h"
 #include "en/params.h"
 
+/* It matches XDP_UMEM_MIN_CHUNK_SIZE, but as this constant is private and may
+ * change unexpectedly, and mlx5e has a minimum valid stride size for striding
+ * RQ, keep this check in the driver.
+ */
+#define MLX5E_MIN_XSK_CHUNK_SIZE 2048
+
 bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
                              struct mlx5e_xsk_param *xsk,
                              struct mlx5_core_dev *mdev)
 {
-       /* AF_XDP doesn't support frames larger than PAGE_SIZE, and the current
-        * mlx5e XDP implementation doesn't support multiple packets per page.
-        */
-       if (xsk->chunk_size != PAGE_SIZE)
+       /* AF_XDP doesn't support frames larger than PAGE_SIZE. */
+       if (xsk->chunk_size > PAGE_SIZE ||
+                       xsk->chunk_size < MLX5E_MIN_XSK_CHUNK_SIZE)
                return false;
 
        /* Current MTU and XSK headroom don't allow packets to fit the frames. */
-       if (mlx5e_rx_get_linear_frag_sz(params, xsk) > xsk->chunk_size)
+       if (mlx5e_rx_get_min_frag_sz(params, xsk) > xsk->chunk_size)
                return false;
 
        /* frag_sz is different for regular and XSK RQs, so ensure that linear
index fd2c75b4b519e71fda8a2af052f1aa0afd7639ce..87827477d38c48dc71d795a1a142634a1c3a5933 100644 (file)
@@ -7,7 +7,7 @@
 #include "en/params.h"
 #include <net/xdp_sock.h>
 
-int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid)
+int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 {
        struct mlx5e_priv *priv = netdev_priv(dev);
        struct mlx5e_params *params = &priv->channels.params;
index 7add18bf78d8f0688769945e5075f7f0a5f352a6..79b487d897570d1c7194d3b4e9b02de29078d246 100644 (file)
@@ -5,11 +5,23 @@
 #define __MLX5_EN_XSK_TX_H__
 
 #include "en.h"
+#include <net/xdp_sock.h>
 
 /* TX data path */
 
-int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid);
+int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
 
 bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
 
+static inline void mlx5e_xsk_update_tx_wakeup(struct mlx5e_xdpsq *sq)
+{
+       if (!xsk_umem_uses_need_wakeup(sq->umem))
+               return;
+
+       if (sq->pc != sq->cc)
+               xsk_clear_tx_need_wakeup(sq->umem);
+       else
+               xsk_set_tx_need_wakeup(sq->umem);
+}
+
 #endif /* __MLX5_EN_XSK_TX_H__ */
index dadadf22108723a3c8deece3fd488d6b197a4344..1cacda1bc1b410f1d470eb452b1839d92bbf93b4 100644 (file)
@@ -4580,7 +4580,7 @@ const struct net_device_ops mlx5e_netdev_ops = {
        .ndo_tx_timeout          = mlx5e_tx_timeout,
        .ndo_bpf                 = mlx5e_xdp,
        .ndo_xdp_xmit            = mlx5e_xdp_xmit,
-       .ndo_xsk_async_xmit      = mlx5e_xsk_async_xmit,
+       .ndo_xsk_wakeup          = mlx5e_xsk_wakeup,
 #ifdef CONFIG_MLX5_EN_ARFS
        .ndo_rx_flow_steer       = mlx5e_rx_flow_steer,
 #endif
index 2fd2760d0bb7c0b1000b8687f4b0af1c0c056ee7..d6a547238de03fd9778d7f786346c00647963c68 100644 (file)
@@ -695,8 +695,11 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
        rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk;
        rq->mpwqe.actual_wq_head   = head;
 
-       /* If XSK Fill Ring doesn't have enough frames, busy poll by
-        * rescheduling the NAPI poll.
+       /* If XSK Fill Ring doesn't have enough frames, report the error, so
+        * that one of the actions can be performed:
+        * 1. If need_wakeup is used, signal that the application has to kick
+        * the driver when it refills the Fill Ring.
+        * 2. Otherwise, busy poll by rescheduling the NAPI poll.
         */
        if (unlikely(alloc_err == -ENOMEM && rq->umem))
                return true;
index 49b06b256c92955d8b0054ade614b2d33ee41c1e..257a7c9f7a14d4cef416ec5992ed4e4c21832f43 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/irq.h>
 #include "en.h"
 #include "en/xdp.h"
+#include "en/xsk/rx.h"
 #include "en/xsk/tx.h"
 
 static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
@@ -81,6 +82,29 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
        mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl);
 }
 
+static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq)
+{
+       bool busy_xsk = false, xsk_rx_alloc_err;
+
+       /* Handle the race between the application querying need_wakeup and the
+        * driver setting it:
+        * 1. Update need_wakeup both before and after the TX. If it goes to
+        * "yes", it can only happen with the first update.
+        * 2. If the application queried need_wakeup before we set it, the
+        * packets will be transmitted anyway, even w/o a wakeup.
+        * 3. Give a chance to clear need_wakeup after new packets were queued
+        * for TX.
+        */
+       mlx5e_xsk_update_tx_wakeup(xsksq);
+       busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
+       mlx5e_xsk_update_tx_wakeup(xsksq);
+
+       xsk_rx_alloc_err = xskrq->post_wqes(xskrq);
+       busy_xsk |= mlx5e_xsk_update_rx_wakeup(xskrq, xsk_rx_alloc_err);
+
+       return busy_xsk;
+}
+
 int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 {
        struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
@@ -122,8 +146,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
        if (xsk_open) {
                mlx5e_poll_ico_cq(&c->xskicosq.cq);
                busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq);
-               busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
-               busy_xsk |= xskrq->post_wqes(xskrq);
+               busy_xsk |= mlx5e_napi_xsk_post(xsksq, xskrq);
        }
 
        busy |= busy_xsk;
index bc9850e4ec5e646e7e21511a995c5315b1b26062..0e2db6ea79e96f7e9daca0fdcd84ce7b4483b0ed 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/bug.h>
 #include <linux/jiffies.h>
 #include <linux/skbuff.h>
+#include <linux/timekeeping.h>
 
 #include "../ccm.h"
 #include "../nfp_app.h"
@@ -175,29 +176,151 @@ nfp_bpf_ctrl_reply_val(struct nfp_app_bpf *bpf, struct cmsg_reply_map_op *reply,
        return &reply->data[bpf->cmsg_key_sz * (n + 1) + bpf->cmsg_val_sz * n];
 }
 
+static bool nfp_bpf_ctrl_op_cache_invalidate(enum nfp_ccm_type op)
+{
+       return op == NFP_CCM_TYPE_BPF_MAP_UPDATE ||
+              op == NFP_CCM_TYPE_BPF_MAP_DELETE;
+}
+
+static bool nfp_bpf_ctrl_op_cache_capable(enum nfp_ccm_type op)
+{
+       return op == NFP_CCM_TYPE_BPF_MAP_LOOKUP ||
+              op == NFP_CCM_TYPE_BPF_MAP_GETNEXT;
+}
+
+static bool nfp_bpf_ctrl_op_cache_fill(enum nfp_ccm_type op)
+{
+       return op == NFP_CCM_TYPE_BPF_MAP_GETFIRST ||
+              op == NFP_CCM_TYPE_BPF_MAP_GETNEXT;
+}
+
+static unsigned int
+nfp_bpf_ctrl_op_cache_get(struct nfp_bpf_map *nfp_map, enum nfp_ccm_type op,
+                         const u8 *key, u8 *out_key, u8 *out_value,
+                         u32 *cache_gen)
+{
+       struct bpf_map *map = &nfp_map->offmap->map;
+       struct nfp_app_bpf *bpf = nfp_map->bpf;
+       unsigned int i, count, n_entries;
+       struct cmsg_reply_map_op *reply;
+
+       n_entries = nfp_bpf_ctrl_op_cache_fill(op) ? bpf->cmsg_cache_cnt : 1;
+
+       spin_lock(&nfp_map->cache_lock);
+       *cache_gen = nfp_map->cache_gen;
+       if (nfp_map->cache_blockers)
+               n_entries = 1;
+
+       if (nfp_bpf_ctrl_op_cache_invalidate(op))
+               goto exit_block;
+       if (!nfp_bpf_ctrl_op_cache_capable(op))
+               goto exit_unlock;
+
+       if (!nfp_map->cache)
+               goto exit_unlock;
+       if (nfp_map->cache_to < ktime_get_ns())
+               goto exit_invalidate;
+
+       reply = (void *)nfp_map->cache->data;
+       count = be32_to_cpu(reply->count);
+
+       for (i = 0; i < count; i++) {
+               void *cached_key;
+
+               cached_key = nfp_bpf_ctrl_reply_key(bpf, reply, i);
+               if (memcmp(cached_key, key, map->key_size))
+                       continue;
+
+               if (op == NFP_CCM_TYPE_BPF_MAP_LOOKUP)
+                       memcpy(out_value, nfp_bpf_ctrl_reply_val(bpf, reply, i),
+                              map->value_size);
+               if (op == NFP_CCM_TYPE_BPF_MAP_GETNEXT) {
+                       if (i + 1 == count)
+                               break;
+
+                       memcpy(out_key,
+                              nfp_bpf_ctrl_reply_key(bpf, reply, i + 1),
+                              map->key_size);
+               }
+
+               n_entries = 0;
+               goto exit_unlock;
+       }
+       goto exit_unlock;
+
+exit_block:
+       nfp_map->cache_blockers++;
+exit_invalidate:
+       dev_consume_skb_any(nfp_map->cache);
+       nfp_map->cache = NULL;
+exit_unlock:
+       spin_unlock(&nfp_map->cache_lock);
+       return n_entries;
+}
+
+static void
+nfp_bpf_ctrl_op_cache_put(struct nfp_bpf_map *nfp_map, enum nfp_ccm_type op,
+                         struct sk_buff *skb, u32 cache_gen)
+{
+       bool blocker, filler;
+
+       blocker = nfp_bpf_ctrl_op_cache_invalidate(op);
+       filler = nfp_bpf_ctrl_op_cache_fill(op);
+       if (blocker || filler) {
+               u64 to = 0;
+
+               if (filler)
+                       to = ktime_get_ns() + NFP_BPF_MAP_CACHE_TIME_NS;
+
+               spin_lock(&nfp_map->cache_lock);
+               if (blocker) {
+                       nfp_map->cache_blockers--;
+                       nfp_map->cache_gen++;
+               }
+               if (filler && !nfp_map->cache_blockers &&
+                   nfp_map->cache_gen == cache_gen) {
+                       nfp_map->cache_to = to;
+                       swap(nfp_map->cache, skb);
+               }
+               spin_unlock(&nfp_map->cache_lock);
+       }
+
+       dev_consume_skb_any(skb);
+}
+
 static int
 nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
                      u8 *key, u8 *value, u64 flags, u8 *out_key, u8 *out_value)
 {
        struct nfp_bpf_map *nfp_map = offmap->dev_priv;
+       unsigned int n_entries, reply_entries, count;
        struct nfp_app_bpf *bpf = nfp_map->bpf;
        struct bpf_map *map = &offmap->map;
        struct cmsg_reply_map_op *reply;
        struct cmsg_req_map_op *req;
        struct sk_buff *skb;
+       u32 cache_gen;
        int err;
 
        /* FW messages have no space for more than 32 bits of flags */
        if (flags >> 32)
                return -EOPNOTSUPP;
 
+       /* Handle op cache */
+       n_entries = nfp_bpf_ctrl_op_cache_get(nfp_map, op, key, out_key,
+                                             out_value, &cache_gen);
+       if (!n_entries)
+               return 0;
+
        skb = nfp_bpf_cmsg_map_req_alloc(bpf, 1);
-       if (!skb)
-               return -ENOMEM;
+       if (!skb) {
+               err = -ENOMEM;
+               goto err_cache_put;
+       }
 
        req = (void *)skb->data;
        req->tid = cpu_to_be32(nfp_map->tid);
-       req->count = cpu_to_be32(1);
+       req->count = cpu_to_be32(n_entries);
        req->flags = cpu_to_be32(flags);
 
        /* Copy inputs */
@@ -207,16 +330,38 @@ nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
                memcpy(nfp_bpf_ctrl_req_val(bpf, req, 0), value,
                       map->value_size);
 
-       skb = nfp_ccm_communicate(&bpf->ccm, skb, op,
-                                 nfp_bpf_cmsg_map_reply_size(bpf, 1));
-       if (IS_ERR(skb))
-               return PTR_ERR(skb);
+       skb = nfp_ccm_communicate(&bpf->ccm, skb, op, 0);
+       if (IS_ERR(skb)) {
+               err = PTR_ERR(skb);
+               goto err_cache_put;
+       }
+
+       if (skb->len < sizeof(*reply)) {
+               cmsg_warn(bpf, "cmsg drop - type 0x%02x too short %d!\n",
+                         op, skb->len);
+               err = -EIO;
+               goto err_free;
+       }
 
        reply = (void *)skb->data;
+       count = be32_to_cpu(reply->count);
        err = nfp_bpf_ctrl_rc_to_errno(bpf, &reply->reply_hdr);
+       /* FW responds with message sized to hold the good entries,
+        * plus one extra entry if there was an error.
+        */
+       reply_entries = count + !!err;
+       if (n_entries > 1 && count)
+               err = 0;
        if (err)
                goto err_free;
 
+       if (skb->len != nfp_bpf_cmsg_map_reply_size(bpf, reply_entries)) {
+               cmsg_warn(bpf, "cmsg drop - type 0x%02x too short %d for %d entries!\n",
+                         op, skb->len, reply_entries);
+               err = -EIO;
+               goto err_free;
+       }
+
        /* Copy outputs */
        if (out_key)
                memcpy(out_key, nfp_bpf_ctrl_reply_key(bpf, reply, 0),
@@ -225,11 +370,13 @@ nfp_bpf_ctrl_entry_op(struct bpf_offloaded_map *offmap, enum nfp_ccm_type op,
                memcpy(out_value, nfp_bpf_ctrl_reply_val(bpf, reply, 0),
                       map->value_size);
 
-       dev_consume_skb_any(skb);
+       nfp_bpf_ctrl_op_cache_put(nfp_map, op, skb, cache_gen);
 
        return 0;
 err_free:
        dev_kfree_skb_any(skb);
+err_cache_put:
+       nfp_bpf_ctrl_op_cache_put(nfp_map, op, NULL, cache_gen);
        return err;
 }
 
@@ -267,11 +414,29 @@ int nfp_bpf_ctrl_getnext_entry(struct bpf_offloaded_map *offmap,
                                     key, NULL, 0, next_key, NULL);
 }
 
+unsigned int nfp_bpf_ctrl_cmsg_min_mtu(struct nfp_app_bpf *bpf)
+{
+       return max(nfp_bpf_cmsg_map_req_size(bpf, 1),
+                  nfp_bpf_cmsg_map_reply_size(bpf, 1));
+}
+
 unsigned int nfp_bpf_ctrl_cmsg_mtu(struct nfp_app_bpf *bpf)
 {
-       return max3((unsigned int)NFP_NET_DEFAULT_MTU,
-                   nfp_bpf_cmsg_map_req_size(bpf, 1),
-                   nfp_bpf_cmsg_map_reply_size(bpf, 1));
+       return max3(NFP_NET_DEFAULT_MTU,
+                   nfp_bpf_cmsg_map_req_size(bpf, NFP_BPF_MAP_CACHE_CNT),
+                   nfp_bpf_cmsg_map_reply_size(bpf, NFP_BPF_MAP_CACHE_CNT));
+}
+
+unsigned int nfp_bpf_ctrl_cmsg_cache_cnt(struct nfp_app_bpf *bpf)
+{
+       unsigned int mtu, req_max, reply_max, entry_sz;
+
+       mtu = bpf->app->ctrl->dp.mtu;
+       entry_sz = bpf->cmsg_key_sz + bpf->cmsg_val_sz;
+       req_max = (mtu - sizeof(struct cmsg_req_map_op)) / entry_sz;
+       reply_max = (mtu - sizeof(struct cmsg_reply_map_op)) / entry_sz;
+
+       return min3(req_max, reply_max, NFP_BPF_MAP_CACHE_CNT);
 }
 
 void nfp_bpf_ctrl_msg_rx(struct nfp_app *app, struct sk_buff *skb)
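
Stripped of the nfp specifics, cache_gen/cache_blockers implement a
generation-count protocol; an illustrative reduction (not driver code):

    /* reader: snapshot the generation before the slow FW round trip */
    spin_lock(&m->cache_lock);
    gen = m->cache_gen;
    spin_unlock(&m->cache_lock);

    /* ... nfp_ccm_communicate() runs without the lock held ... */

    /* installer: cache the multi-entry reply only if no update/delete
     * completed meanwhile (each bumps cache_gen on exit) and none is
     * still in flight (cache_blockers)
     */
    spin_lock(&m->cache_lock);
    if (!m->cache_blockers && m->cache_gen == gen)
            swap(m->cache, skb);
    spin_unlock(&m->cache_lock);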
index 06c4286bd79e0a0b3578da221783a20d54d8a7f9..a83a0ad5e27de0c61f9748299e78f040b3ca7289 100644 (file)
@@ -24,6 +24,7 @@ enum bpf_cap_tlv_type {
        NFP_BPF_CAP_TYPE_QUEUE_SELECT   = 5,
        NFP_BPF_CAP_TYPE_ADJUST_TAIL    = 6,
        NFP_BPF_CAP_TYPE_ABI_VERSION    = 7,
+       NFP_BPF_CAP_TYPE_CMSG_MULTI_ENT = 8,
 };
 
 struct nfp_bpf_cap_tlv_func {
index 1c9fb11470df7b3d4af3f8ee749c6cc2a10c2d94..8f732771d3fad8965318dca81a5e292a5310d8e6 100644 (file)
@@ -299,6 +299,14 @@ nfp_bpf_parse_cap_adjust_tail(struct nfp_app_bpf *bpf, void __iomem *value,
        return 0;
 }
 
+static int
+nfp_bpf_parse_cap_cmsg_multi_ent(struct nfp_app_bpf *bpf, void __iomem *value,
+                                u32 length)
+{
+       bpf->cmsg_multi_ent = true;
+       return 0;
+}
+
 static int
 nfp_bpf_parse_cap_abi_version(struct nfp_app_bpf *bpf, void __iomem *value,
                              u32 length)
@@ -375,6 +383,11 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
                                                          length))
                                goto err_release_free;
                        break;
+               case NFP_BPF_CAP_TYPE_CMSG_MULTI_ENT:
+                       if (nfp_bpf_parse_cap_cmsg_multi_ent(app->priv, value,
+                                                            length))
+                               goto err_release_free;
+                       break;
                default:
                        nfp_dbg(cpp, "unknown BPF capability: %d\n", type);
                        break;
@@ -415,6 +428,25 @@ static void nfp_bpf_ndo_uninit(struct nfp_app *app, struct net_device *netdev)
        bpf_offload_dev_netdev_unregister(bpf->bpf_dev, netdev);
 }
 
+static int nfp_bpf_start(struct nfp_app *app)
+{
+       struct nfp_app_bpf *bpf = app->priv;
+
+       if (app->ctrl->dp.mtu < nfp_bpf_ctrl_cmsg_min_mtu(bpf)) {
+               nfp_err(bpf->app->cpp,
+                       "ctrl channel MTU below min required %u < %u\n",
+                       app->ctrl->dp.mtu, nfp_bpf_ctrl_cmsg_min_mtu(bpf));
+               return -EINVAL;
+       }
+
+       if (bpf->cmsg_multi_ent)
+               bpf->cmsg_cache_cnt = nfp_bpf_ctrl_cmsg_cache_cnt(bpf);
+       else
+               bpf->cmsg_cache_cnt = 1;
+
+       return 0;
+}
+
 static int nfp_bpf_init(struct nfp_app *app)
 {
        struct nfp_app_bpf *bpf;
@@ -488,6 +520,7 @@ const struct nfp_app_type app_bpf = {
 
        .init           = nfp_bpf_init,
        .clean          = nfp_bpf_clean,
+       .start          = nfp_bpf_start,
 
        .check_mtu      = nfp_bpf_check_mtu,
 
index 57d6ff51e980c00236886d457c65245c2006bfe3..fac9c6f9e197b44882309b08522187ee0f5eba74 100644 (file)
@@ -99,6 +99,7 @@ enum pkt_vec {
  * @maps_neutral:      hash table of offload-neutral maps (on pointer)
  *
  * @abi_version:       global BPF ABI version
+ * @cmsg_cache_cnt:    number of entries to read for caching
  *
  * @adjust_head:       adjust head capability
  * @adjust_head.flags:         extra flags for adjust head
@@ -124,6 +125,7 @@ enum pkt_vec {
  * @pseudo_random:     FW initialized the pseudo-random machinery (CSRs)
  * @queue_select:      BPF can set the RX queue ID in packet vector
  * @adjust_tail:       BPF can simply trunc packet size for adjust tail
+ * @cmsg_multi_ent:    FW can pack multiple map entries in a single cmsg
  */
 struct nfp_app_bpf {
        struct nfp_app *app;
@@ -134,6 +136,8 @@ struct nfp_app_bpf {
        unsigned int cmsg_key_sz;
        unsigned int cmsg_val_sz;
 
+       unsigned int cmsg_cache_cnt;
+
        struct list_head map_list;
        unsigned int maps_in_use;
        unsigned int map_elems_in_use;
@@ -169,6 +173,7 @@ struct nfp_app_bpf {
        bool pseudo_random;
        bool queue_select;
        bool adjust_tail;
+       bool cmsg_multi_ent;
 };
 
 enum nfp_bpf_map_use {
@@ -183,11 +188,21 @@ struct nfp_bpf_map_word {
        unsigned char non_zero_update   :1;
 };
 
+#define NFP_BPF_MAP_CACHE_CNT          4U
+#define NFP_BPF_MAP_CACHE_TIME_NS      (250 * 1000)
+
 /**
  * struct nfp_bpf_map - private per-map data attached to BPF maps for offload
  * @offmap:    pointer to the offloaded BPF map
  * @bpf:       back pointer to bpf app private structure
  * @tid:       table id identifying map on datapath
+ *
+ * @cache_lock:        protects @cache_blockers, @cache_to, @cache
+ * @cache_blockers:    number of ops in flight which block caching
+ * @cache_gen: counter incremented by every blocker on exit
+ * @cache_to:  time when cache will no longer be valid (ns)
+ * @cache:     skb with cached response
+ *
  * @l:         link on the nfp_app_bpf->map_list list
  * @use_map:   map of how the value is used (in 4B chunks)
  */
@@ -195,6 +210,13 @@ struct nfp_bpf_map {
        struct bpf_offloaded_map *offmap;
        struct nfp_app_bpf *bpf;
        u32 tid;
+
+       spinlock_t cache_lock;
+       u32 cache_blockers;
+       u32 cache_gen;
+       u64 cache_to;
+       struct sk_buff *cache;
+
        struct list_head l;
        struct nfp_bpf_map_word use_map[];
 };
@@ -564,7 +586,9 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 
 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv);
 
+unsigned int nfp_bpf_ctrl_cmsg_min_mtu(struct nfp_app_bpf *bpf);
 unsigned int nfp_bpf_ctrl_cmsg_mtu(struct nfp_app_bpf *bpf);
+unsigned int nfp_bpf_ctrl_cmsg_cache_cnt(struct nfp_app_bpf *bpf);
 long long int
 nfp_bpf_ctrl_alloc_map(struct nfp_app_bpf *bpf, struct bpf_map *map);
 void
index 39c9fec222b45823eca3772a77b239938188fcd2..88fab6a82acff88716dc70b20ca3b295b257e6c9 100644 (file)
@@ -385,6 +385,7 @@ nfp_bpf_map_alloc(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
        offmap->dev_priv = nfp_map;
        nfp_map->offmap = offmap;
        nfp_map->bpf = bpf;
+       spin_lock_init(&nfp_map->cache_lock);
 
        res = nfp_bpf_ctrl_alloc_map(bpf, &offmap->map);
        if (res < 0) {
@@ -407,6 +408,8 @@ nfp_bpf_map_free(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
        struct nfp_bpf_map *nfp_map = offmap->dev_priv;
 
        nfp_bpf_ctrl_free_map(bpf, nfp_map);
+       dev_consume_skb_any(nfp_map->cache);
+       WARN_ON_ONCE(nfp_map->cache_blockers);
        list_del_init(&nfp_map->l);
        bpf->map_elems_in_use -= offmap->map.max_entries;
        bpf->maps_in_use--;
index 5d6c3738b4946f5046ebb0b7762f69ea1aaf93a9..250f510b1d212f65c4e8c2d7f9c99f32891c09e3 100644 (file)
@@ -66,7 +66,7 @@
 #define NFP_NET_MAX_DMA_BITS   40
 
 /* Default size for MTU and freelist buffer sizes */
-#define NFP_NET_DEFAULT_MTU            1500
+#define NFP_NET_DEFAULT_MTU            1500U
 
 /* Maximum number of bytes prepended to a packet */
 #define NFP_NET_MAX_PREPEND            64
index 6f97b554f7da7ab49782a8f5cd4b77b7aa48d56f..61aabffc8888d50c4f0cc8dcd14a40594fce61d8 100644 (file)
@@ -4116,14 +4116,7 @@ int nfp_net_init(struct nfp_net *nn)
 
        /* Set default MTU and Freelist buffer size */
        if (!nfp_net_is_data_vnic(nn) && nn->app->ctrl_mtu) {
-               if (nn->app->ctrl_mtu <= nn->max_mtu) {
-                       nn->dp.mtu = nn->app->ctrl_mtu;
-               } else {
-                       if (nn->app->ctrl_mtu != NFP_APP_CTRL_MTU_MAX)
-                               nn_warn(nn, "app requested MTU above max supported %u > %u\n",
-                                       nn->app->ctrl_mtu, nn->max_mtu);
-                       nn->dp.mtu = nn->max_mtu;
-               }
+               nn->dp.mtu = min(nn->app->ctrl_mtu, nn->max_mtu);
        } else if (nn->max_mtu < NFP_NET_DEFAULT_MTU) {
                nn->dp.mtu = nn->max_mtu;
        } else {
index f9a506147c8a4410b122bb6db3525e1672262af3..5b9d2233860654512da6ff5ec8bf56f2343ed722 100644 (file)
@@ -24,6 +24,9 @@ struct seq_file;
 struct btf;
 struct btf_type;
 
+extern struct idr btf_idr;
+extern spinlock_t btf_idr_lock;
+
 /* map is generic key/value storage optionally accessible by eBPF programs */
 struct bpf_map_ops {
        /* funcs callable from userspace (via syscall) */
@@ -647,6 +650,8 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
+struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map,
+                                                  bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
index 5fe99f322b1c5e7d6333ba651a58e1d07995e9a1..26a6d58ca78ccb2a60b91e03471e8dd1eb96df9c 100644 (file)
@@ -355,6 +355,7 @@ struct bpf_verifier_env {
        struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
        int stack_size;                 /* number of states to be processed */
        bool strict_alignment;          /* perform strict pointer alignment checks */
+       bool test_state_freq;           /* test verifier with different pruning frequency */
        struct bpf_verifier_state *cur_state; /* current verifier state */
        struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
        struct bpf_verifier_state_list *free_list;
index b5d28dadf9645d0a694be2c6543712bdeaca52a2..d7d5626002e970e34101436de038a9c6ad4024d5 100644 (file)
@@ -901,6 +901,10 @@ struct netdev_bpf {
        };
 };
 
+/* Flags for ndo_xsk_wakeup. */
+#define XDP_WAKEUP_RX (1 << 0)
+#define XDP_WAKEUP_TX (1 << 1)
+
 #ifdef CONFIG_XFRM_OFFLOAD
 struct xfrmdev_ops {
        int     (*xdo_dev_state_add) (struct xfrm_state *x);
@@ -1227,6 +1231,12 @@ struct tlsdev_ops;
  *     that got dropped are freed/returned via xdp_return_frame().
  *     Returns a negative number on a general error invoking the ndo,
  *     meaning no frames were transmitted and the core caller will free
  *     all frames.
+ * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
+ *     This function is used to wake up the softirq, ksoftirqd or kthread
+ *     responsible for sending and/or receiving packets on a specific
+ *     queue id bound to an AF_XDP socket. The flags field specifies
+ *     whether only Rx, only Tx, or both should be woken up, using the
+ *     flags XDP_WAKEUP_RX and XDP_WAKEUP_TX.
  * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
  *     Get devlink port instance associated with a given netdev.
  *     Called with a reference on the netdevice and devlink locks only,
@@ -1426,8 +1436,8 @@ struct net_device_ops {
        int                     (*ndo_xdp_xmit)(struct net_device *dev, int n,
                                                struct xdp_frame **xdp,
                                                u32 flags);
-       int                     (*ndo_xsk_async_xmit)(struct net_device *dev,
-                                                     u32 queue_id);
+       int                     (*ndo_xsk_wakeup)(struct net_device *dev,
+                                                 u32 queue_id, u32 flags);
        struct devlink_port *   (*ndo_get_devlink_port)(struct net_device *dev);
 };
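
As an aside for driver authors, a minimal sketch of wiring up the new ndo is shown below. The mydrv_* names, ring layout and single-NAPI design are invented for illustration; the real implementations in this series (i40e, ixgbe, mlx5e) differ in how they rearm their queues.

/* Hypothetical driver sketch, not from this series: wake the NAPI
 * instance that services an AF_XDP-bound queue.
 */
static int mydrv_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
{
	struct mydrv_priv *priv = netdev_priv(dev);
	struct mydrv_ring *ring;

	if (queue_id >= priv->num_queues)
		return -EINVAL;

	ring = &priv->rings[queue_id];
	if (!ring->umem)
		return -ENXIO;

	/* One NAPI context handles both directions here, so a single
	 * reschedule covers XDP_WAKEUP_RX, XDP_WAKEUP_TX, or both.
	 */
	if (!napi_if_scheduled_mark_missed(&ring->napi))
		napi_schedule(&ring->napi);

	return 0;
}

static const struct net_device_ops mydrv_netdev_ops = {
	/* ... */
	.ndo_xsk_wakeup		= mydrv_xsk_wakeup,
};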
 
index c7dc2b5902c057ee0475f786f576fd921b3a8014..c17af77f3fae7f98814b521b127b2e10945d1e5e 100644 (file)
@@ -5,6 +5,10 @@
  * propagate the unknown bits such that the tnum result represents all the
  * possible results for possible values of the operands.
  */
+
+#ifndef _LINUX_TNUM_H
+#define _LINUX_TNUM_H
+
 #include <linux/types.h>
 
 struct tnum {
@@ -81,3 +85,5 @@ bool tnum_in(struct tnum a, struct tnum b);
 int tnum_strn(char *str, size_t size, struct tnum a);
 /* Format a tnum as tristate binary expansion */
 int tnum_sbin(char *str, size_t size, struct tnum a);
+
+#endif /* _LINUX_TNUM_H */
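
To make the comment at the top of this header concrete, here is a small worked example of the (value, mask) representation; the constants are chosen purely for illustration.

/* A tnum is a (value, mask) pair in which set mask bits are unknown. */
struct tnum four         = { .value = 4, .mask = 0 };  /* exactly 4 */
struct tnum four_or_five = { .value = 4, .mask = 1 };  /* bit 0 unknown */

/* tnum_add(four_or_five, four) == { .value = 8, .mask = 1 }, i.e.
 * "8 or 9": the unknown low bit propagates through the addition.
 */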
index b9dcb02e756b29d080ba9343f7a9e24178cfad67..8e4f831d2e52e59a12aa3acab43c0de85b39d6c7 100644 (file)
@@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
 extern const struct bpf_func_proto bpf_sk_storage_get_proto;
 extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
 
+#ifdef CONFIG_BPF_SYSCALL
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
+#else
+static inline int bpf_sk_storage_clone(const struct sock *sk,
+                                      struct sock *newsk)
+{
+       return 0;
+}
+#endif
+
 #endif /* _BPF_SK_STORAGE_H */
index 69796d264f0638457526345f5cc423bf2c27bf17..c9398ce7960f9e909db8f231a8da08aa38d42025 100644 (file)
 struct net_device;
 struct xsk_queue;
 
+/* Masks for xdp_umem_page flags.
+ * The low 12 bits of the addr will be 0 since this is the page address, so we
+ * can use them for flags.
+ */
+#define XSK_NEXT_PG_CONTIG_SHIFT 0
+#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
+
 struct xdp_umem_page {
        void *addr;
        dma_addr_t dma;
@@ -27,6 +34,13 @@ struct xdp_umem_fq_reuse {
        u64 handles[];
 };
 
+/* Flags for the umem flags field.
+ *
+ * The NEED_WAKEUP flag is (1 << 1) because bit 0 of the flags field is
+ * reused for the public XDP_UMEM_UNALIGNED_CHUNK_FLAG. See
+ * include/uapi/linux/if_xdp.h.
+ */
+#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
+
 struct xdp_umem {
        struct xsk_queue *fq;
        struct xsk_queue *cq;
@@ -41,15 +55,27 @@ struct xdp_umem {
        struct work_struct work;
        struct page **pgs;
        u32 npgs;
+       u16 queue_id;
+       u8 need_wakeup;
+       u8 flags;
        int id;
        struct net_device *dev;
        struct xdp_umem_fq_reuse *fq_reuse;
-       u16 queue_id;
        bool zc;
        spinlock_t xsk_list_lock;
        struct list_head xsk_list;
 };
 
+/* Nodes are linked in the struct xdp_sock map_list field, and used to
+ * track which maps a certain socket resides in.
+ */
+struct xsk_map;
+struct xsk_map_node {
+       struct list_head node;
+       struct xsk_map *map;
+       struct xdp_sock **map_entry;
+};
+
 struct xdp_sock {
        /* struct sock must be the first member of struct xdp_sock */
        struct sock sk;
@@ -75,6 +101,9 @@ struct xdp_sock {
        /* Protects generic receive. */
        spinlock_t rx_lock;
        u64 rx_dropped;
+       struct list_head map_list;
+       /* Protects map_list */
+       spinlock_t map_list_lock;
 };
 
 struct xdp_buff;
@@ -95,15 +124,47 @@ struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
                                          struct xdp_umem_fq_reuse *newq);
 void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
 struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
+
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+                            struct xdp_sock **map_entry);
+int xsk_map_inc(struct xsk_map *map);
+void xsk_map_put(struct xsk_map *map);
+
+static inline u64 xsk_umem_extract_addr(u64 addr)
+{
+       return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline u64 xsk_umem_extract_offset(u64 addr)
+{
+       return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
+{
+       return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
+}
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
-       return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
+       unsigned long page_addr;
+
+       addr = xsk_umem_add_offset_to_addr(addr);
+       page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
+
+       return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
 }
 
 static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
 {
-       return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
+       addr = xsk_umem_add_offset_to_addr(addr);
+
+       return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
 }
 
 /* Reuse-queue aware version of FILL queue helpers */
@@ -144,6 +205,19 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
 
        rq->handles[rq->length++] = addr;
 }
+
+/* Handle the offset appropriately depending on aligned or unaligned mode.
+ * For unaligned mode, we store the offset in the upper 16 bits of the address.
+ * For aligned mode, we simply add the offset to the address.
+ */
+static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
+                                        u64 offset)
+{
+       if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
+               return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
+       else
+               return address + offset;
+}
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
@@ -213,6 +287,21 @@ static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
        return NULL;
 }
 
+static inline u64 xsk_umem_extract_addr(u64 addr)
+{
+       return 0;
+}
+
+static inline u64 xsk_umem_extract_offset(u64 addr)
+{
+       return 0;
+}
+
+static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
+{
+       return 0;
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
        return NULL;
@@ -241,6 +330,33 @@ static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
 {
 }
 
+static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+}
+
+static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+       return false;
+}
+
+static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
+                                        u64 offset)
+{
+       return 0;
+}
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
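
To make the unaligned-chunk encoding handled by the helpers above concrete, a short worked example follows; the buffer address is invented for illustration.

/* In unaligned mode, the low 48 bits of a descriptor address carry the
 * base address and the high 16 bits carry the offset into the buffer
 * (XSK_UNALIGNED_BUF_OFFSET_SHIFT == 48). A sketch:
 */
u64 base = 0x12000;  /* invented buffer start within the umem */
u64 addr = base | (0x100ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT);

/* xsk_umem_extract_addr(addr)        == 0x12000
 * xsk_umem_extract_offset(addr)      == 0x100
 * xsk_umem_add_offset_to_addr(addr)  == 0x12100
 */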
index 0e66371bea13fdb93411c87aeb08a88615b5f8bb..77c6be96d676222e446d41d2668b40cafb0ef1fe 100644 (file)
@@ -106,6 +106,7 @@ enum bpf_cmd {
        BPF_TASK_FD_QUERY,
        BPF_MAP_LOOKUP_AND_DELETE_ELEM,
        BPF_MAP_FREEZE,
+       BPF_BTF_GET_NEXT_ID,
 };
 
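For completeness, a userspace sketch of iterating BTF ids with the new command via the raw bpf(2) syscall; it mirrors how prog and map id iteration already works.

/* Sketch: list all BTF objects loaded in the kernel. Assumes a kernel
 * with BPF_BTF_GET_NEXT_ID; errors other than "no more ids" elided.
 */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	union bpf_attr attr;
	__u32 id = 0;

	for (;;) {
		memset(&attr, 0, sizeof(attr));
		attr.start_id = id;
		if (syscall(__NR_bpf, BPF_BTF_GET_NEXT_ID, &attr,
			    sizeof(attr)))
			break;		/* -1/ENOENT: no more BTF ids */
		id = attr.next_id;
		printf("btf id %u\n", id);
	}

	return 0;
}
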
 enum bpf_map_type {
@@ -284,6 +285,9 @@ enum bpf_attach_type {
  */
 #define BPF_F_TEST_RND_HI32    (1U << 2)
 
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_STATE_FREQ  (1U << 3)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * two extensions:
  *
@@ -337,6 +341,9 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY_PROG      (1U << 7)
 #define BPF_F_WRONLY_PROG      (1U << 8)
 
+/* Clone map from listener for newly accepted socket */
+#define BPF_F_CLONE            (1U << 9)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE  (1U << 0)
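
A minimal BPF-side sketch of a map opting into the new BPF_F_CLONE flag, using libbpf's BTF-defined map conventions from bpf_helpers.h; the __u64 value layout is invented for illustration. Values stored on a listening socket are copied to sockets cloned from it.

/* Sketch: sk_storage map whose values follow accept()ed children.
 * BPF_F_NO_PREALLOC is required, as the map's alloc_check enforces.
 */
struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
	__type(key, int);
	__type(value, __u64);	/* invented: a per-socket counter */
} sk_stg SEC(".maps");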
 
@@ -576,6 +583,8 @@ union bpf_attr {
  *             limited to five).
  *
  *             Each time the helper is called, it appends a line to the trace.
+ *             Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ *             open; use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
  *             The format of the trace is customizable, and the exact output
  *             one will get depends on the options set in
  *             *\/sys/kernel/debug/tracing/trace_options* (see also the
@@ -1014,7 +1023,7 @@ union bpf_attr {
  *             The realm of the route for the packet associated to *skb*, or 0
  *             if none was found.
  *
- * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
  *     Description
  *             Write raw *data* blob into a special BPF perf event held by
  *             *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -1076,7 +1085,7 @@ union bpf_attr {
  *     Return
  *             0 on success, or a negative error in case of failure.
  *
- * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
  *     Description
  *             Walk a user or a kernel stack and return its id. To achieve
  *             this, the helper needs *ctx*, which is a pointer to the context
@@ -1725,7 +1734,7 @@ union bpf_attr {
  *     Return
  *             0 on success, or a negative error in case of failure.
  *
- * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * int bpf_override_return(struct pt_regs *regs, u64 rc)
  *     Description
  *             Used for error injection, this helper uses kprobes to override
  *             the return value of the probed function, and to set it to *rc*.
index faaa5ca2a11767a3cfd967661f21645799b95f9c..be328c59389d56861f95aeb488860ed81ef19e0c 100644 (file)
 #define XDP_SHARED_UMEM        (1 << 0)
 #define XDP_COPY       (1 << 1) /* Force copy-mode */
 #define XDP_ZEROCOPY   (1 << 2) /* Force zero-copy mode */
+/* If this option is set, the driver might go to sleep, and in that case
+ * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
+ * set. If it is set, the application needs to explicitly wake up the
+ * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are
+ * running the driver and the application on the same core, you should
+ * use this option so that the kernel will yield to the user space
+ * application.
+ */
+#define XDP_USE_NEED_WAKEUP (1 << 3)
+
+/* Flags for xsk_umem_config flags */
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
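
From the application side, the intended need_wakeup pattern looks roughly like the sketch below; it assumes the xsk_ring_prod__needs_wakeup() helper that libbpf gains in this series, and elides error handling.

/* Userspace sketch: only kick the kernel when the ring's
 * XDP_RING_NEED_WAKEUP flag (defined below) is set.
 */
#include <bpf/xsk.h>
#include <poll.h>
#include <sys/socket.h>

static void kick_tx(int xsk_fd, struct xsk_ring_prod *tx)
{
	if (xsk_ring_prod__needs_wakeup(tx))
		sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
}

static void wait_for_fill(int xsk_fd, struct xsk_ring_prod *fill)
{
	struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN };

	if (xsk_ring_prod__needs_wakeup(fill))
		poll(&pfd, 1, -1);
}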
 
 struct sockaddr_xdp {
        __u16 sxdp_family;
@@ -25,10 +37,14 @@ struct sockaddr_xdp {
        __u32 sxdp_shared_umem_fd;
 };
 
+/* XDP_RING flags */
+#define XDP_RING_NEED_WAKEUP (1 << 0)
+
 struct xdp_ring_offset {
        __u64 producer;
        __u64 consumer;
        __u64 desc;
+       __u64 flags;
 };
 
 struct xdp_mmap_offsets {
@@ -53,6 +69,7 @@ struct xdp_umem_reg {
        __u64 len; /* Length of packet data area */
        __u32 chunk_size;
        __u32 headroom;
+       __u32 flags;
 };
 
 struct xdp_statistics {
@@ -74,6 +91,11 @@ struct xdp_options {
 #define XDP_UMEM_PGOFF_FILL_RING       0x100000000ULL
 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
 
+/* Masks for unaligned chunks mode */
+#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
+#define XSK_UNALIGNED_BUF_ADDR_MASK \
+       ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+
 /* Rx/Tx descriptor */
 struct xdp_desc {
        __u64 addr;
index 5fcc7a17eb5a4d538ffef61335481a262bc23720..adb3adcebe3c5ac0707ed38c8a184504b67aa2f7 100644 (file)
             i < btf_type_vlen(struct_type);                                    \
             i++, member++)
 
-static DEFINE_IDR(btf_idr);
-static DEFINE_SPINLOCK(btf_idr_lock);
+DEFINE_IDR(btf_idr);
+DEFINE_SPINLOCK(btf_idr_lock);
 
 struct btf {
        void *data;
@@ -3376,6 +3376,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
        btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
 }
 
+#ifdef CONFIG_PROC_FS
+static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+       const struct btf *btf = filp->private_data;
+
+       seq_printf(m, "btf_id:\t%u\n", btf->id);
+}
+#endif
+
 static int btf_release(struct inode *inode, struct file *filp)
 {
        btf_put(filp->private_data);
@@ -3383,6 +3392,9 @@ static int btf_release(struct inode *inode, struct file *filp)
 }
 
 const struct file_operations btf_fops = {
+#ifdef CONFIG_PROC_FS
+       .show_fdinfo    = bpf_btf_show_fdinfo,
+#endif
        .release        = btf_release,
 };
 
index 272071e9112f3bc7ccfb111fc734238f9561ffde..82eabd4e38adda6e95f8113d9a7f9bb61b45ab00 100644 (file)
@@ -683,8 +683,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 }
 
 /* map_idr_lock must be held */
-static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
-                                           bool uref)
+static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map,
+                                             bool uref)
 {
        int refold;
 
@@ -704,6 +704,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
        return map;
 }
 
+struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+{
+       spin_lock_bh(&map_idr_lock);
+       map = __bpf_map_inc_not_zero(map, uref);
+       spin_unlock_bh(&map_idr_lock);
+
+       return map;
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
+
 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 {
        return -ENOTSUPP;
@@ -1619,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
        if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
                                 BPF_F_ANY_ALIGNMENT |
+                                BPF_F_TEST_STATE_FREQ |
                                 BPF_F_TEST_RND_HI32))
                return -EINVAL;
 
@@ -2183,7 +2194,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
        spin_lock_bh(&map_idr_lock);
        map = idr_find(&map_idr, id);
        if (map)
-               map = bpf_map_inc_not_zero(map, true);
+               map = __bpf_map_inc_not_zero(map, true);
        else
                map = ERR_PTR(-ENOENT);
        spin_unlock_bh(&map_idr_lock);
@@ -2880,6 +2891,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
                err = bpf_obj_get_next_id(&attr, uattr,
                                          &map_idr, &map_idr_lock);
                break;
+       case BPF_BTF_GET_NEXT_ID:
+               err = bpf_obj_get_next_id(&attr, uattr,
+                                         &btf_idr, &btf_idr_lock);
+               break;
        case BPF_PROG_GET_FD_BY_ID:
                err = bpf_prog_get_fd_by_id(&attr);
                break;
index 4659349fc7953c5481701e4740f1bf46321b9342..7ae5dddd1fe6a56cd335d659c9a74b6dacef9d1c 100644 (file)
@@ -30,17 +30,12 @@ static struct kobject *btf_kobj;
 
 static int __init btf_vmlinux_init(void)
 {
-       int err;
-
        if (!_binary__btf_vmlinux_bin_start)
                return 0;
 
        btf_kobj = kobject_create_and_add("btf", kernel_kobj);
-       if (IS_ERR(btf_kobj)) {
-               err = PTR_ERR(btf_kobj);
-               btf_kobj = NULL;
-               return err;
-       }
+       if (!btf_kobj)
+               return -ENOMEM;
 
        bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end -
                                    _binary__btf_vmlinux_bin_start;
index 16d66bd7af09fb71e63f8520489d05680b4e2772..3fb50757e8124fc7d45b5368223197e3fdf9d0bf 100644 (file)
@@ -7223,7 +7223,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
        struct bpf_verifier_state_list *sl, **pprev;
        struct bpf_verifier_state *cur = env->cur_state, *new;
        int i, j, err, states_cnt = 0;
-       bool add_new_state = false;
+       bool add_new_state = env->test_state_freq;
 
        cur->last_insn_idx = env->prev_insn_idx;
        if (!env->insn_aux_data[insn_idx].prune_point)
@@ -9263,6 +9263,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 
        env->allow_ptr_leaks = is_priv;
 
+       if (is_priv)
+               env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+
        ret = replace_map_fd_with_map_ptr(env);
        if (ret < 0)
                goto skip_full_check;
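
For test harnesses, a hedged sketch of requesting the new behavior at load time through the raw bpf(2) attributes; as the check above shows, the flag is honored only for privileged callers.

/* Sketch: opt into high-frequency verifier state checkpointing from a
 * test. Attribute names are those of union bpf_attr; the instruction
 * buffer is assumed to be prepared by the caller.
 */
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_test_prog(const struct bpf_insn *insns, __u32 insn_cnt)
{
	union bpf_attr attr = {};

	attr.prog_type  = BPF_PROG_TYPE_XDP;
	attr.insns      = (__u64)(unsigned long)insns;
	attr.insn_cnt   = insn_cnt;
	attr.license    = (__u64)(unsigned long)"GPL";
	attr.prog_flags = BPF_F_TEST_STATE_FREQ;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}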
index 9bb96ace9fa126762af4761cb556937cefe880bd..942c662e2eed77c4b78bd66c34057114e946fcde 100644 (file)
@@ -13,8 +13,71 @@ struct xsk_map {
        struct bpf_map map;
        struct xdp_sock **xsk_map;
        struct list_head __percpu *flush_list;
+       spinlock_t lock; /* Synchronize map updates */
 };
 
+int xsk_map_inc(struct xsk_map *map)
+{
+       struct bpf_map *m = &map->map;
+
+       m = bpf_map_inc(m, false);
+       return PTR_ERR_OR_ZERO(m);
+}
+
+void xsk_map_put(struct xsk_map *map)
+{
+       bpf_map_put(&map->map);
+}
+
+static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
+                                              struct xdp_sock **map_entry)
+{
+       struct xsk_map_node *node;
+       int err;
+
+       node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
+       if (!node)
+               return NULL;
+
+       err = xsk_map_inc(map);
+       if (err) {
+               kfree(node);
+               return ERR_PTR(err);
+       }
+
+       node->map = map;
+       node->map_entry = map_entry;
+       return node;
+}
+
+static void xsk_map_node_free(struct xsk_map_node *node)
+{
+       xsk_map_put(node->map);
+       kfree(node);
+}
+
+static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
+{
+       spin_lock_bh(&xs->map_list_lock);
+       list_add_tail(&node->node, &xs->map_list);
+       spin_unlock_bh(&xs->map_list_lock);
+}
+
+static void xsk_map_sock_delete(struct xdp_sock *xs,
+                               struct xdp_sock **map_entry)
+{
+       struct xsk_map_node *n, *tmp;
+
+       spin_lock_bh(&xs->map_list_lock);
+       list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
+               if (map_entry == n->map_entry) {
+                       list_del(&n->node);
+                       xsk_map_node_free(n);
+               }
+       }
+       spin_unlock_bh(&xs->map_list_lock);
+}
+
 static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 {
        struct xsk_map *m;
@@ -34,6 +97,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
                return ERR_PTR(-ENOMEM);
 
        bpf_map_init_from_attr(&m->map, attr);
+       spin_lock_init(&m->lock);
 
        cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
        cost += sizeof(struct list_head) * num_possible_cpus();
@@ -71,21 +135,9 @@ free_m:
 static void xsk_map_free(struct bpf_map *map)
 {
        struct xsk_map *m = container_of(map, struct xsk_map, map);
-       int i;
 
        bpf_clear_redirect_map(map);
        synchronize_net();
-
-       for (i = 0; i < map->max_entries; i++) {
-               struct xdp_sock *xs;
-
-               xs = m->xsk_map[i];
-               if (!xs)
-                       continue;
-
-               sock_put((struct sock *)xs);
-       }
-
        free_percpu(m->flush_list);
        bpf_map_area_free(m->xsk_map);
        kfree(m);
@@ -164,8 +216,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
                               u64 map_flags)
 {
        struct xsk_map *m = container_of(map, struct xsk_map, map);
+       struct xdp_sock *xs, *old_xs, **map_entry;
        u32 i = *(u32 *)key, fd = *(u32 *)value;
-       struct xdp_sock *xs, *old_xs;
+       struct xsk_map_node *node;
        struct socket *sock;
        int err;
 
@@ -173,8 +226,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
                return -EINVAL;
        if (unlikely(i >= m->map.max_entries))
                return -E2BIG;
-       if (unlikely(map_flags == BPF_NOEXIST))
-               return -EEXIST;
 
        sock = sockfd_lookup(fd, &err);
        if (!sock)
@@ -192,32 +243,70 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
                return -EOPNOTSUPP;
        }
 
-       sock_hold(sock->sk);
+       map_entry = &m->xsk_map[i];
+       node = xsk_map_node_alloc(m, map_entry);
+       if (IS_ERR(node)) {
+               sockfd_put(sock);
+               return PTR_ERR(node);
+       }
 
-       old_xs = xchg(&m->xsk_map[i], xs);
+       spin_lock_bh(&m->lock);
+       old_xs = READ_ONCE(*map_entry);
+       if (old_xs == xs) {
+               err = 0;
+               goto out;
+       } else if (old_xs && map_flags == BPF_NOEXIST) {
+               err = -EEXIST;
+               goto out;
+       } else if (!old_xs && map_flags == BPF_EXIST) {
+               err = -ENOENT;
+               goto out;
+       }
+       xsk_map_sock_add(xs, node);
+       WRITE_ONCE(*map_entry, xs);
        if (old_xs)
-               sock_put((struct sock *)old_xs);
-
+               xsk_map_sock_delete(old_xs, map_entry);
+       spin_unlock_bh(&m->lock);
        sockfd_put(sock);
        return 0;
+
+out:
+       spin_unlock_bh(&m->lock);
+       sockfd_put(sock);
+       xsk_map_node_free(node);
+       return err;
 }
 
 static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 {
        struct xsk_map *m = container_of(map, struct xsk_map, map);
-       struct xdp_sock *old_xs;
+       struct xdp_sock *old_xs, **map_entry;
        int k = *(u32 *)key;
 
        if (k >= map->max_entries)
                return -EINVAL;
 
-       old_xs = xchg(&m->xsk_map[k], NULL);
+       spin_lock_bh(&m->lock);
+       map_entry = &m->xsk_map[k];
+       old_xs = xchg(map_entry, NULL);
        if (old_xs)
-               sock_put((struct sock *)old_xs);
+               xsk_map_sock_delete(old_xs, map_entry);
+       spin_unlock_bh(&m->lock);
 
        return 0;
 }
 
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+                            struct xdp_sock **map_entry)
+{
+       spin_lock_bh(&map->lock);
+       if (READ_ONCE(*map_entry) == xs) {
+               WRITE_ONCE(*map_entry, NULL);
+               xsk_map_sock_delete(xs, map_entry);
+       }
+       spin_unlock_bh(&map->lock);
+}
+
 const struct bpf_map_ops xsk_map_ops = {
        .map_alloc = xsk_map_alloc,
        .map_free = xsk_map_free,
index 98da8998c25ce406a2b0c9620f7295231120ae27..b09d7b1ffffdbb518f255f8b4db8dbb0eb7f172d 100644 (file)
@@ -520,7 +520,8 @@ config BPF_EVENTS
        bool
        default y
        help
-         This allows the user to attach BPF programs to kprobe events.
+         This allows the user to attach BPF programs to kprobe, uprobe, and
+         tracepoint events.
 
 config DYNAMIC_EVENTS
        def_bool n
index c41705835cbabbbb3e12dd9fb56f83ae20f7c789..5ef3eccee27cbd9e3c8c17b9d92cc3bbc4b10f92 100644 (file)
@@ -867,7 +867,7 @@ static struct bpf_test tests[] = {
                },
                CLASSIC,
                { },
-               { { 4, 10 ^ 300 }, { 20, 10 ^ 300 } },
+               { { 4, 0xA ^ 300 }, { 20, 0xA ^ 300 } },
        },
        {
                "SPILL_FILL",
index 94c7f77ecb6b66b388dcc23c151d487e614196d1..da5639a5bd3b950ce0c65a8f0d43bf465b8819a2 100644 (file)
@@ -12,6 +12,9 @@
 
 static atomic_t cache_idx;
 
+#define SK_STORAGE_CREATE_FLAG_MASK                                    \
+       (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
 struct bucket {
        struct hlist_head list;
        raw_spinlock_t lock;
@@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
                kfree_rcu(sk_storage, rcu);
 }
 
-/* sk_storage->lock must be held and sk_storage->list cannot be empty */
 static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
                            struct bpf_sk_storage_elem *selem)
 {
@@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
        return 0;
 }
 
-/* Called by __sk_destruct() */
+/* Called by __sk_destruct() & bpf_sk_storage_clone() */
 void bpf_sk_storage_free(struct sock *sk)
 {
        struct bpf_sk_storage_elem *selem;
@@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 
        smap = (struct bpf_sk_storage_map *)map;
 
+       /* Note that this map might be concurrently cloned by
+        * bpf_sk_storage_clone(). Wait for any existing bpf_sk_storage_clone()
+        * RCU read section to finish before proceeding. New RCU
+        * read sections should be prevented via bpf_map_inc_not_zero.
+        */
        synchronize_rcu();
 
        /* bpf prog and the userspace can no longer access this map
@@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 
 static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
 {
-       if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
+       if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
+           !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+           attr->max_entries ||
            attr->key_size != sizeof(int) || !attr->value_size ||
            /* Enforce BTF for userspace sk dumping */
            !attr->btf_key_type_id || !attr->btf_value_type_id)
@@ -739,6 +748,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
        return err;
 }
 
+static struct bpf_sk_storage_elem *
+bpf_sk_storage_clone_elem(struct sock *newsk,
+                         struct bpf_sk_storage_map *smap,
+                         struct bpf_sk_storage_elem *selem)
+{
+       struct bpf_sk_storage_elem *copy_selem;
+
+       copy_selem = selem_alloc(smap, newsk, NULL, true);
+       if (!copy_selem)
+               return NULL;
+
+       if (map_value_has_spin_lock(&smap->map))
+               copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
+                                     SDATA(selem)->data, true);
+       else
+               copy_map_value(&smap->map, SDATA(copy_selem)->data,
+                              SDATA(selem)->data);
+
+       return copy_selem;
+}
+
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
+{
+       struct bpf_sk_storage *new_sk_storage = NULL;
+       struct bpf_sk_storage *sk_storage;
+       struct bpf_sk_storage_elem *selem;
+       int ret = 0;
+
+       RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+
+       rcu_read_lock();
+       sk_storage = rcu_dereference(sk->sk_bpf_storage);
+
+       if (!sk_storage || hlist_empty(&sk_storage->list))
+               goto out;
+
+       hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+               struct bpf_sk_storage_elem *copy_selem;
+               struct bpf_sk_storage_map *smap;
+               struct bpf_map *map;
+
+               smap = rcu_dereference(SDATA(selem)->smap);
+               if (!(smap->map.map_flags & BPF_F_CLONE))
+                       continue;
+
+               /* Note that for lockless listeners adding a new element
+                * here can race with cleanup in bpf_sk_storage_map_free.
+                * Try to grab map refcnt to make sure that it's still
+                * alive and prevent concurrent removal.
+                */
+               map = bpf_map_inc_not_zero(&smap->map, false);
+               if (IS_ERR(map))
+                       continue;
+
+               copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
+               if (!copy_selem) {
+                       ret = -ENOMEM;
+                       bpf_map_put(map);
+                       goto out;
+               }
+
+               if (new_sk_storage) {
+                       selem_link_map(smap, copy_selem);
+                       __selem_link_sk(new_sk_storage, copy_selem);
+               } else {
+                       ret = sk_storage_alloc(newsk, smap, copy_selem);
+                       if (ret) {
+                               kfree(copy_selem);
+                               atomic_sub(smap->elem_size,
+                                          &newsk->sk_omem_alloc);
+                               bpf_map_put(map);
+                               goto out;
+                       }
+
+                       new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+               }
+               bpf_map_put(map);
+       }
+
+out:
+       rcu_read_unlock();
+
+       /* In case of an error, don't free anything explicitly here; the
+        * caller is responsible for calling bpf_sk_storage_free.
+        */
+
+       return ret;
+}
+
 BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
           void *, value, u64, flags)
 {
index 49589ed2018df1d8552afeaa0c18b53f81a0be29..b1afafee3e2acb57f732032b087d4689c090e105 100644 (file)
@@ -8126,12 +8126,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                bpf_chk = generic_xdp_install;
 
        if (fd >= 0) {
+               u32 prog_id;
+
                if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
                        NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
                        return -EEXIST;
                }
-               if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
-                   __dev_xdp_query(dev, bpf_op, query)) {
+
+               prog_id = __dev_xdp_query(dev, bpf_op, query);
+               if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
                        NL_SET_ERR_MSG(extack, "XDP program already attached");
                        return -EBUSY;
                }
@@ -8146,6 +8149,14 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                        bpf_prog_put(prog);
                        return -EINVAL;
                }
+
+               if (prog->aux->id == prog_id) {
+                       bpf_prog_put(prog);
+                       return 0;
+               }
+       } else {
+               if (!__dev_xdp_query(dev, bpf_op, query))
+                       return 0;
        }
 
        err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
index b91988f8b94e580d6ed9bd5b61d180a475e785cb..ed6563622ce31dcced4e6ba622770e26f1f7756a 100644 (file)
@@ -5903,7 +5903,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
        default:
                return -EPROTONOSUPPORT;
        }
-       if (mss <= 0)
+       if (mss == 0)
                return -ENOENT;
 
        return cookie | ((u64)mss << 32);
index 545fac19a711f261fc6cdbdb54a3f08cb9705987..07863edbe6fc4842e47ebebf00bc21bc406d9264 100644 (file)
@@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                        goto out;
                }
                RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
-#ifdef CONFIG_BPF_SYSCALL
-               RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
-#endif
+
+               if (bpf_sk_storage_clone(sk, newsk)) {
+                       sk_free_unlock_clone(newsk);
+                       newsk = NULL;
+                       goto out;
+               }
 
                newsk->sk_err      = 0;
                newsk->sk_err_soft = 0;
index 0e0062127124762e3975ea72fee4d3ed8ed37720..947b8ff0227e64ad190116178a2e6c9c4a154102 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/idr.h>
-#include <linux/highmem.h>
+#include <linux/vmalloc.h>
 
 #include "xdp_umem.h"
 #include "xsk_queue.h"
@@ -106,14 +106,22 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
        umem->dev = dev;
        umem->queue_id = queue_id;
 
+       if (flags & XDP_USE_NEED_WAKEUP) {
+               umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
+               /* Tx needs to be explicitly woken up the first time.
+                * This also covers drivers that do not implement the
+                * feature; they will always have to call sendto().
+                */
+               xsk_set_tx_need_wakeup(umem);
+       }
+
        dev_hold(dev);
 
        if (force_copy)
                /* For copy-mode, we are done. */
                return 0;
 
-       if (!dev->netdev_ops->ndo_bpf ||
-           !dev->netdev_ops->ndo_xsk_async_xmit) {
+       if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
                err = -EOPNOTSUPP;
                goto err_unreg_umem;
        }
@@ -170,7 +178,30 @@ static void xdp_umem_unmap_pages(struct xdp_umem *umem)
        unsigned int i;
 
        for (i = 0; i < umem->npgs; i++)
-               kunmap(umem->pgs[i]);
+               if (PageHighMem(umem->pgs[i]))
+                       vunmap(umem->pages[i].addr);
+}
+
+static int xdp_umem_map_pages(struct xdp_umem *umem)
+{
+       unsigned int i;
+       void *addr;
+
+       for (i = 0; i < umem->npgs; i++) {
+               if (PageHighMem(umem->pgs[i]))
+                       addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
+               else
+                       addr = page_address(umem->pgs[i]);
+
+               if (!addr) {
+                       xdp_umem_unmap_pages(umem);
+                       return -ENOMEM;
+               }
+
+               umem->pages[i].addr = addr;
+       }
+
+       return 0;
 }
 
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
@@ -309,10 +340,11 @@ static int xdp_umem_account_pages(struct xdp_umem *umem)
 
 static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 {
+       bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
        u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
        unsigned int chunks, chunks_per_page;
        u64 addr = mr->addr, size = mr->len;
-       int size_chk, err, i;
+       int size_chk, err;
 
        if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
                /* Strictly speaking we could support this, if:
@@ -324,7 +356,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
                return -EINVAL;
        }
 
-       if (!is_power_of_2(chunk_size))
+       if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
+                       XDP_UMEM_USES_NEED_WAKEUP))
+               return -EINVAL;
+
+       if (!unaligned_chunks && !is_power_of_2(chunk_size))
                return -EINVAL;
 
        if (!PAGE_ALIGNED(addr)) {
@@ -341,9 +377,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
        if (chunks == 0)
                return -EINVAL;
 
-       chunks_per_page = PAGE_SIZE / chunk_size;
-       if (chunks < chunks_per_page || chunks % chunks_per_page)
-               return -EINVAL;
+       if (!unaligned_chunks) {
+               chunks_per_page = PAGE_SIZE / chunk_size;
+               if (chunks < chunks_per_page || chunks % chunks_per_page)
+                       return -EINVAL;
+       }
 
        headroom = ALIGN(headroom, 64);
 
@@ -352,13 +390,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
                return -EINVAL;
 
        umem->address = (unsigned long)addr;
-       umem->chunk_mask = ~((u64)chunk_size - 1);
+       umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
+                                           : ~((u64)chunk_size - 1);
        umem->size = size;
        umem->headroom = headroom;
        umem->chunk_size_nohr = chunk_size - headroom;
        umem->npgs = size / PAGE_SIZE;
        umem->pgs = NULL;
        umem->user = NULL;
+       umem->flags = mr->flags;
        INIT_LIST_HEAD(&umem->xsk_list);
        spin_lock_init(&umem->xsk_list_lock);
 
@@ -378,10 +418,11 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
                goto out_pin;
        }
 
-       for (i = 0; i < umem->npgs; i++)
-               umem->pages[i].addr = kmap(umem->pgs[i]);
+       err = xdp_umem_map_pages(umem);
+       if (!err)
+               return 0;
 
-       return 0;
+       kfree(umem->pages);
 
 out_pin:
        xdp_umem_unpin_pages(umem);
index 59b57d7086970b53b140bc500c881ab9cf1fb98b..c2f1af3b6a7c4ec2aed2beab304e0692fb462535 100644 (file)
@@ -45,7 +45,7 @@ EXPORT_SYMBOL(xsk_umem_has_addrs);
 
 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
 {
-       return xskq_peek_addr(umem->fq, addr);
+       return xskq_peek_addr(umem->fq, addr, umem);
 }
 EXPORT_SYMBOL(xsk_umem_peek_addr);
 
@@ -55,21 +55,103 @@ void xsk_umem_discard_addr(struct xdp_umem *umem)
 }
 EXPORT_SYMBOL(xsk_umem_discard_addr);
 
+void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
+{
+       if (umem->need_wakeup & XDP_WAKEUP_RX)
+               return;
+
+       umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
+       umem->need_wakeup |= XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
+
+void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
+{
+       struct xdp_sock *xs;
+
+       if (umem->need_wakeup & XDP_WAKEUP_TX)
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+               xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
+       }
+       rcu_read_unlock();
+
+       umem->need_wakeup |= XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
+
+void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
+{
+       if (!(umem->need_wakeup & XDP_WAKEUP_RX))
+               return;
+
+       umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+       umem->need_wakeup &= ~XDP_WAKEUP_RX;
+}
+EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
+
+void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
+{
+       struct xdp_sock *xs;
+
+       if (!(umem->need_wakeup & XDP_WAKEUP_TX))
+               return;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+               xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
+       }
+       rcu_read_unlock();
+
+       umem->need_wakeup &= ~XDP_WAKEUP_TX;
+}
+EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
+
+bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
+{
+       return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
+}
+EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
+
+/* If a buffer crosses a page boundary, we need to do two memcpy() calls,
+ * one for each page. This is only required in copy mode.
+ */
+static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
+                            u32 len, u32 metalen)
+{
+       void *to_buf = xdp_umem_get_data(umem, addr);
+
+       addr = xsk_umem_add_offset_to_addr(addr);
+       if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
+               void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
+               u64 page_start = addr & ~(PAGE_SIZE - 1);
+               u64 first_len = PAGE_SIZE - (addr - page_start);
+
+               memcpy(to_buf, from_buf, first_len + metalen);
+               memcpy(next_pg_addr, from_buf + first_len, len - first_len);
+
+               return;
+       }
+
+       memcpy(to_buf, from_buf, len + metalen);
+}
+
 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 {
-       void *to_buf, *from_buf;
+       u64 offset = xs->umem->headroom;
+       u64 addr, memcpy_addr;
+       void *from_buf;
        u32 metalen;
-       u64 addr;
        int err;
 
-       if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+       if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
            len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
                xs->rx_dropped++;
                return -ENOSPC;
        }
 
-       addr += xs->umem->headroom;
-
        if (unlikely(xdp_data_meta_unsupported(xdp))) {
                from_buf = xdp->data;
                metalen = 0;
@@ -78,9 +160,11 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
                metalen = xdp->data - xdp->data_meta;
        }
 
-       to_buf = xdp_umem_get_data(xs->umem, addr);
-       memcpy(to_buf, from_buf, len + metalen);
-       addr += metalen;
+       memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
+       __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
+
+       offset += metalen;
+       addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (!err) {
                xskq_discard_addr(xs->umem->fq);
@@ -102,10 +186,23 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
        return err;
 }
 
+static bool xsk_is_bound(struct xdp_sock *xs)
+{
+       if (READ_ONCE(xs->state) == XSK_BOUND) {
+               /* Matches smp_wmb() in bind(). */
+               smp_rmb();
+               return true;
+       }
+       return false;
+}
+
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
        u32 len;
 
+       if (!xsk_is_bound(xs))
+               return -EINVAL;
+
        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;
 
@@ -125,6 +222,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
        u32 metalen = xdp->data - xdp->data_meta;
        u32 len = xdp->data_end - xdp->data;
+       u64 offset = xs->umem->headroom;
        void *buffer;
        u64 addr;
        int err;
@@ -136,17 +234,17 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
                goto out_unlock;
        }
 
-       if (!xskq_peek_addr(xs->umem->fq, &addr) ||
+       if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
            len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
                err = -ENOSPC;
                goto out_drop;
        }
 
-       addr += xs->umem->headroom;
-
+       addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
        buffer = xdp_umem_get_data(xs->umem, addr);
        memcpy(buffer, xdp->data_meta, len + metalen);
-       addr += metalen;
+
+       addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (err)
                goto out_drop;
@@ -190,7 +288,7 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
 
        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
-               if (!xskq_peek_desc(xs->tx, desc))
+               if (!xskq_peek_desc(xs->tx, desc, umem))
                        continue;
 
                if (xskq_produce_addr_lazy(umem->cq, desc->addr))
@@ -212,7 +310,8 @@ static int xsk_zc_xmit(struct sock *sk)
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev = xs->dev;
 
-       return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
+       return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
+                                              XDP_WAKEUP_TX);
 }
 
 static void xsk_destruct_skb(struct sk_buff *skb)
@@ -243,7 +342,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
        if (xs->queue_id >= xs->dev->real_num_tx_queues)
                goto out;
 
-       while (xskq_peek_desc(xs->tx, &desc)) {
+       while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
                char *buffer;
                u64 addr;
                u32 len;
@@ -272,7 +371,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
                skb->dev = xs->dev;
                skb->priority = sk->sk_priority;
                skb->mark = sk->sk_mark;
-               skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+               skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
                skb->destructor = xsk_destruct_skb;
 
                err = dev_direct_xmit(skb, xs->queue_id);
@@ -301,7 +400,7 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
 
-       if (unlikely(!xs->dev))
+       if (unlikely(!xsk_is_bound(xs)))
                return -ENXIO;
        if (unlikely(!(xs->dev->flags & IFF_UP)))
                return -ENETDOWN;
@@ -317,8 +416,19 @@ static unsigned int xsk_poll(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait)
 {
        unsigned int mask = datagram_poll(file, sock, wait);
-       struct sock *sk = sock->sk;
-       struct xdp_sock *xs = xdp_sk(sk);
+       struct xdp_sock *xs = xdp_sk(sock->sk);
+       struct net_device *dev;
+       struct xdp_umem *umem;
+
+       if (unlikely(!xsk_is_bound(xs)))
+               return mask;
+
+       dev = xs->dev;
+       umem = xs->umem;
+
+       if (umem->need_wakeup)
+               dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
+                                               umem->need_wakeup);
 
        if (xs->rx && !xskq_empty_desc(xs->rx))
                mask |= POLLIN | POLLRDNORM;
@@ -342,7 +452,7 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
 
        /* Make sure queue is ready before it can be seen by others */
        smp_wmb();
-       *queue = q;
+       WRITE_ONCE(*queue, q);
        return 0;
 }
 
@@ -350,10 +460,9 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
 {
        struct net_device *dev = xs->dev;
 
-       if (!dev || xs->state != XSK_BOUND)
+       if (xs->state != XSK_BOUND)
                return;
-
-       xs->state = XSK_UNBOUND;
+       WRITE_ONCE(xs->state, XSK_UNBOUND);
 
        /* Wait for driver to stop using the xdp socket. */
        xdp_del_sk_umem(xs->umem, xs);
@@ -362,6 +471,52 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
        dev_put(dev);
 }
 
+static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
+                                             struct xdp_sock ***map_entry)
+{
+       struct xsk_map *map = NULL;
+       struct xsk_map_node *node;
+
+       *map_entry = NULL;
+
+       spin_lock_bh(&xs->map_list_lock);
+       node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
+                                       node);
+       if (node) {
+               WARN_ON(xsk_map_inc(node->map));
+               map = node->map;
+               *map_entry = node->map_entry;
+       }
+       spin_unlock_bh(&xs->map_list_lock);
+       return map;
+}
+
+static void xsk_delete_from_maps(struct xdp_sock *xs)
+{
+       /* This function removes the current XDP socket from all the
+        * maps it resides in. We need to take extra care here, due to
+        * the two locks involved. Each map has a lock synchronizing
+        * updates to the entries, and each socket has a lock that
+        * synchronizes access to the list of maps (map_list). For
+        * deadlock avoidance the locks need to be taken in the order
+        * "map lock"->"socket map list lock". We start off by
+        * accessing the socket map list, and take a reference to the
+        * map to guarantee existence between the
+        * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
+        * calls. Then we ask the map to remove the socket, which
+        * tries to remove the socket from the map. Note that there
+        * might be updates to the map between
+        * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
+        */
+       struct xdp_sock **map_entry = NULL;
+       struct xsk_map *map;
+
+       while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
+               xsk_map_try_sock_delete(map, xs, map_entry);
+               xsk_map_put(map);
+       }
+}
+
 static int xsk_release(struct socket *sock)
 {
        struct sock *sk = sock->sk;
@@ -381,7 +536,10 @@ static int xsk_release(struct socket *sock)
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        local_bh_enable();
 
+       xsk_delete_from_maps(xs);
+       mutex_lock(&xs->mutex);
        xsk_unbind_dev(xs);
+       mutex_unlock(&xs->mutex);
 
        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);
@@ -412,6 +570,24 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
        return sock;
 }
 
+/* Check if umem pages are contiguous.
+ * In zero-copy mode, use the DMA address to do the page contiguity check;
+ * for all other modes, use addr (the kernel virtual address).
+ * Store the result in the low bits of addr.
+ */
+static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
+{
+       struct xdp_umem_page *pgs = umem->pages;
+       int i, is_contig;
+
+       for (i = 0; i < umem->npgs - 1; i++) {
+               is_contig = (flags & XDP_ZEROCOPY) ?
+                       (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
+                       (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
+               pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
+       }
+}
+
 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 {
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
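
The contiguity bit stored by xsk_check_page_contiguity() above is consumed on the datapath; below is a sketch mirroring the check this series adds to net/xdp/xsk_queue.h as xskq_crosses_non_contig_pg().

/* Sketch of the consumer side: a buffer straddling a page boundary is
 * only usable if the current page's XSK_NEXT_PG_CONTIG_MASK bit says
 * the next page is contiguous.
 */
static bool buf_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr,
				      u32 len)
{
	bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
	bool next_pg_contig =
		(unsigned long)umem->pages[addr >> PAGE_SHIFT].addr &
		XSK_NEXT_PG_CONTIG_MASK;

	return cross_pg && !next_pg_contig;
}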
@@ -427,7 +603,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
                return -EINVAL;
 
        flags = sxdp->sxdp_flags;
-       if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
+       if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
+                     XDP_USE_NEED_WAKEUP))
                return -EINVAL;
 
        rtnl_lock();
@@ -454,7 +631,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
                struct xdp_sock *umem_xs;
                struct socket *sock;
 
-               if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
+               if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
+                   (flags & XDP_USE_NEED_WAKEUP)) {
                        /* Cannot specify flags for shared sockets. */
                        err = -EINVAL;
                        goto out_unlock;
@@ -473,19 +651,19 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
                }
 
                umem_xs = xdp_sk(sock->sk);
-               if (!umem_xs->umem) {
-                       /* No umem to inherit. */
+               if (!xsk_is_bound(umem_xs)) {
                        err = -EBADF;
                        sockfd_put(sock);
                        goto out_unlock;
-               } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
+               }
+               if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
                        err = -EINVAL;
                        sockfd_put(sock);
                        goto out_unlock;
                }
 
                xdp_get_umem(umem_xs->umem);
-               xs->umem = umem_xs->umem;
+               WRITE_ONCE(xs->umem, umem_xs->umem);
                sockfd_put(sock);
        } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
                err = -EINVAL;
@@ -500,6 +678,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
                err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
                if (err)
                        goto out_unlock;
+
+               xsk_check_page_contiguity(xs->umem, flags);
        }
 
        xs->dev = dev;
@@ -510,16 +690,28 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
        xdp_add_sk_umem(xs->umem, xs);
 
 out_unlock:
-       if (err)
+       if (err) {
                dev_put(dev);
-       else
-               xs->state = XSK_BOUND;
+       } else {
+               /* Matches smp_rmb() in bind() for shared umem
+                * sockets, and xsk_is_bound().
+                */
+               smp_wmb();
+               WRITE_ONCE(xs->state, XSK_BOUND);
+       }
 out_release:
        mutex_unlock(&xs->mutex);
        rtnl_unlock();
        return err;
 }
 
+struct xdp_umem_reg_v1 {
+       __u64 addr; /* Start of packet data area */
+       __u64 len; /* Length of packet data area */
+       __u32 chunk_size;
+       __u32 headroom;
+};
+
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
 {
@@ -549,15 +741,24 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
                }
                q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
                err = xsk_init_queue(entries, q, false);
+               if (!err && optname == XDP_TX_RING)
+                       /* Tx needs to be explicitly woken up the first time */
+                       xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_REG:
        {
-               struct xdp_umem_reg mr;
+               size_t mr_size = sizeof(struct xdp_umem_reg);
+               struct xdp_umem_reg mr = {};
                struct xdp_umem *umem;
 
-               if (copy_from_user(&mr, optval, sizeof(mr)))
+               if (optlen < sizeof(struct xdp_umem_reg_v1))
+                       return -EINVAL;
+               else if (optlen < sizeof(mr))
+                       mr_size = sizeof(struct xdp_umem_reg_v1);
+
+               if (copy_from_user(&mr, optval, mr_size))
                        return -EFAULT;
 
                mutex_lock(&xs->mutex);
@@ -574,7 +775,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 
                /* Make sure umem is ready before it can be seen by others */
                smp_wmb();
-               xs->umem = umem;
+               WRITE_ONCE(xs->umem, umem);
                mutex_unlock(&xs->mutex);
                return 0;
        }
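
The size-based dispatch keeps old binaries working: a program built against the v1 ABI passes the smaller optlen, only mr_size bytes are copied, and the zero-initialized mr leaves the new flags field cleared. A hedged user-space sketch of the call (register_umem() is a hypothetical helper; the chunk size and headroom values are arbitrary):

    #include <string.h>
    #include <sys/socket.h>
    #include <linux/if_xdp.h>

    static int register_umem(int fd, void *buf, __u64 len)
    {
            struct xdp_umem_reg mr;

            memset(&mr, 0, sizeof(mr));
            mr.addr = (__u64)(unsigned long)buf;
            mr.len = len;
            mr.chunk_size = 2048;
            mr.headroom = 0;
            mr.flags = 0;   /* or XDP_UMEM_UNALIGNED_CHUNK_FLAG */

            /* An old binary passes sizeof(struct xdp_umem_reg_v1)
             * here instead and never sets mr.flags.
             */
            return setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
    }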
@@ -610,6 +811,20 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
        return -ENOPROTOOPT;
 }
 
+static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
+{
+       ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
+       ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
+       ring->desc = offsetof(struct xdp_rxtx_ring, desc);
+}
+
+static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
+{
+       ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
+       ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
+       ring->desc = offsetof(struct xdp_umem_ring, desc);
+}
+
 static int xsk_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
 {
@@ -649,26 +864,49 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
        case XDP_MMAP_OFFSETS:
        {
                struct xdp_mmap_offsets off;
+               struct xdp_mmap_offsets_v1 off_v1;
+               bool flags_supported = true;
+               void *to_copy;
 
-               if (len < sizeof(off))
+               if (len < sizeof(off_v1))
                        return -EINVAL;
+               else if (len < sizeof(off))
+                       flags_supported = false;
+
+               if (flags_supported) {
+                       /* xdp_ring_offset is identical to xdp_ring_offset_v1
+                        * except for the flags field added to the end.
+                        */
+                       xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+                                              &off.rx);
+                       xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
+                                              &off.tx);
+                       xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+                                              &off.fr);
+                       xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
+                                              &off.cr);
+                       off.rx.flags = offsetof(struct xdp_rxtx_ring,
+                                               ptrs.flags);
+                       off.tx.flags = offsetof(struct xdp_rxtx_ring,
+                                               ptrs.flags);
+                       off.fr.flags = offsetof(struct xdp_umem_ring,
+                                               ptrs.flags);
+                       off.cr.flags = offsetof(struct xdp_umem_ring,
+                                               ptrs.flags);
+
+                       len = sizeof(off);
+                       to_copy = &off;
+               } else {
+                       xsk_enter_rxtx_offsets(&off_v1.rx);
+                       xsk_enter_rxtx_offsets(&off_v1.tx);
+                       xsk_enter_umem_offsets(&off_v1.fr);
+                       xsk_enter_umem_offsets(&off_v1.cr);
+
+                       len = sizeof(off_v1);
+                       to_copy = &off_v1;
+               }
 
-               off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
-               off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
-               off.rx.desc     = offsetof(struct xdp_rxtx_ring, desc);
-               off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
-               off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
-               off.tx.desc     = offsetof(struct xdp_rxtx_ring, desc);
-
-               off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
-               off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
-               off.fr.desc     = offsetof(struct xdp_umem_ring, desc);
-               off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
-               off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
-               off.cr.desc     = offsetof(struct xdp_umem_ring, desc);
-
-               len = sizeof(off);
-               if (copy_to_user(optval, &off, len))
+               if (copy_to_user(optval, to_copy, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;
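
On the user-space side, the returned length reveals whether the kernel filled in the flags offsets; this is the same dance libbpf performs. A sketch (query_offsets() is a hypothetical helper):

    #include <stdbool.h>
    #include <sys/socket.h>
    #include <linux/if_xdp.h>

    static int query_offsets(int fd, struct xdp_mmap_offsets *off,
                             bool *has_flags)
    {
            socklen_t optlen = sizeof(*off);

            if (getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen))
                    return -1;

            /* An older kernel reports the v1 size and leaves the
             * flags offsets untouched.
             */
            *has_flags = (optlen == sizeof(*off));
            return 0;
    }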
@@ -713,7 +951,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
        unsigned long pfn;
        struct page *qpg;
 
-       if (xs->state != XSK_READY)
+       if (READ_ONCE(xs->state) != XSK_READY)
                return -EBUSY;
 
        if (offset == XDP_PGOFF_RX_RING) {
@@ -855,6 +1093,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
        spin_lock_init(&xs->rx_lock);
        spin_lock_init(&xs->tx_completion_lock);
 
+       INIT_LIST_HEAD(&xs->map_list);
+       spin_lock_init(&xs->map_list_lock);
+
        mutex_lock(&net->xdp.lock);
        sk_add_node_rcu(sk, &net->xdp.list);
        mutex_unlock(&net->xdp.lock);
index ba81206104266c4f662db880a757f30c8ff97fb7..4cfd106bdb5335ce878f936dd61b26f9914ff0b7 100644 (file)
@@ -4,6 +4,19 @@
 #ifndef XSK_H_
 #define XSK_H_
 
+struct xdp_ring_offset_v1 {
+       __u64 producer;
+       __u64 consumer;
+       __u64 desc;
+};
+
+struct xdp_mmap_offsets_v1 {
+       struct xdp_ring_offset_v1 rx;
+       struct xdp_ring_offset_v1 tx;
+       struct xdp_ring_offset_v1 fr;
+       struct xdp_ring_offset_v1 cr;
+};
+
 static inline struct xdp_sock *xdp_sk(struct sock *sk)
 {
        return (struct xdp_sock *)sk;
index d5e06c8e0cbf9f30fdcdfb6a6ce65f7356c9d89e..f59791ba43a04c08e6d486edf418c63e267a0220 100644 (file)
@@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
        du.id = umem->id;
        du.size = umem->size;
        du.num_pages = umem->npgs;
-       du.chunk_size = (__u32)(~umem->chunk_mask + 1);
+       du.chunk_size = umem->chunk_size_nohr + umem->headroom;
        du.headroom = umem->headroom;
        du.ifindex = umem->dev ? umem->dev->ifindex : 0;
        du.queue_id = umem->queue_id;
@@ -97,6 +97,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
        msg->xdiag_ino = sk_ino;
        sock_diag_save_cookie(sk, msg->xdiag_cookie);
 
+       mutex_lock(&xs->mutex);
        if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb))
                goto out_nlmsg_trim;
 
@@ -117,10 +118,12 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
            sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO))
                goto out_nlmsg_trim;
 
+       mutex_unlock(&xs->mutex);
        nlmsg_end(nlskb, nlh);
        return 0;
 
 out_nlmsg_trim:
+       mutex_unlock(&xs->mutex);
        nlmsg_cancel(nlskb, nlh);
        return -EMSGSIZE;
 }
index 909c5168ed0f87c3d0e1d14359efb67fe0bebab1..eddae4688862906413940960469fba3baf6bfe0c 100644 (file)
@@ -16,6 +16,7 @@
 struct xdp_ring {
        u32 producer ____cacheline_aligned_in_smp;
        u32 consumer ____cacheline_aligned_in_smp;
+       u32 flags;
 };
 
 /* Used for the RX and TX queues for packets */
@@ -133,6 +134,17 @@ static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
 
 /* UMEM queue */
 
+static inline bool xskq_crosses_non_contig_pg(struct xdp_umem *umem, u64 addr,
+                                             u64 length)
+{
+       bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
+       bool next_pg_contig =
+               (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
+                       XSK_NEXT_PG_CONTIG_MASK;
+
+       return cross_pg && !next_pg_contig;
+}
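
The contiguity bit tested here is stashed into the low bits of each page's address at bind time by xsk_check_page_contiguity(), called from xsk_bind() above. Roughly, per this series (treat as a sketch):

    static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
    {
            struct xdp_umem_page *pgs = umem->pages;
            u32 i, is_contig;

            for (i = 0; i < umem->npgs - 1; i++) {
                    /* Zero-copy cares about DMA contiguity, copy mode
                     * about kernel virtual contiguity.
                     */
                    is_contig = (flags & XDP_ZEROCOPY) ?
                            pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma :
                            pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr;
                    pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
            }
    }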
+
 static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
 {
        if (addr >= q->size) {
@@ -143,23 +155,51 @@ static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
        return true;
 }
 
-static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
+static inline bool xskq_is_valid_addr_unaligned(struct xsk_queue *q, u64 addr,
+                                               u64 length,
+                                               struct xdp_umem *umem)
+{
+       u64 base_addr = xsk_umem_extract_addr(addr);
+
+       addr = xsk_umem_add_offset_to_addr(addr);
+       if (base_addr >= q->size || addr >= q->size ||
+           xskq_crosses_non_contig_pg(umem, addr, length)) {
+               q->invalid_descs++;
+               return false;
+       }
+
+       return true;
+}
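
In unaligned mode a descriptor address is split: the low 48 bits carry the chunk base address and the upper 16 bits an offset into the chunk. The helpers used above decompose it like this (definitions per this series, in include/net/xdp_sock.h):

    #define XSK_UNALIGNED_BUF_OFFSET_SHIFT  48
    #define XSK_UNALIGNED_BUF_ADDR_MASK \
            ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)

    static inline u64 xsk_umem_extract_addr(u64 addr)
    {
            return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
    }

    static inline u64 xsk_umem_extract_offset(u64 addr)
    {
            return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
    }

    static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
    {
            return xsk_umem_extract_addr(addr) +
                   xsk_umem_extract_offset(addr);
    }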
+
+static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr,
+                                     struct xdp_umem *umem)
 {
        while (q->cons_tail != q->cons_head) {
                struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
                unsigned int idx = q->cons_tail & q->ring_mask;
 
                *addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
+
+               if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
+                       if (xskq_is_valid_addr_unaligned(q, *addr,
+                                                        umem->chunk_size_nohr,
+                                                        umem))
+                               return addr;
+                       goto out;
+               }
+
                if (xskq_is_valid_addr(q, *addr))
                        return addr;
 
+out:
                q->cons_tail++;
        }
 
        return NULL;
 }
 
-static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
+static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr,
+                                 struct xdp_umem *umem)
 {
        if (q->cons_tail == q->cons_head) {
                smp_mb(); /* D, matches A */
@@ -170,7 +210,7 @@ static inline u64 *xskq_peek_addr(struct xsk_queue *q, u64 *addr)
                smp_rmb();
        }
 
-       return xskq_validate_addr(q, addr);
+       return xskq_validate_addr(q, addr, umem);
 }
 
 static inline void xskq_discard_addr(struct xsk_queue *q)
@@ -229,8 +269,21 @@ static inline int xskq_reserve_addr(struct xsk_queue *q)
 
 /* Rx/Tx queue */
 
-static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
+static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d,
+                                     struct xdp_umem *umem)
 {
+       if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
+               if (!xskq_is_valid_addr_unaligned(q, d->addr, d->len, umem))
+                       return false;
+
+               if (d->len > umem->chunk_size_nohr || d->options) {
+                       q->invalid_descs++;
+                       return false;
+               }
+
+               return true;
+       }
+
        if (!xskq_is_valid_addr(q, d->addr))
                return false;
 
@@ -244,14 +297,15 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
 }
 
 static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
-                                                 struct xdp_desc *desc)
+                                                 struct xdp_desc *desc,
+                                                 struct xdp_umem *umem)
 {
        while (q->cons_tail != q->cons_head) {
                struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring;
                unsigned int idx = q->cons_tail & q->ring_mask;
 
                *desc = READ_ONCE(ring->desc[idx]);
-               if (xskq_is_valid_desc(q, desc))
+               if (xskq_is_valid_desc(q, desc, umem))
                        return desc;
 
                q->cons_tail++;
@@ -261,7 +315,8 @@ static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q,
 }
 
 static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
-                                             struct xdp_desc *desc)
+                                             struct xdp_desc *desc,
+                                             struct xdp_umem *umem)
 {
        if (q->cons_tail == q->cons_head) {
                smp_mb(); /* D, matches A */
@@ -272,7 +327,7 @@ static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
                smp_rmb(); /* C, matches B */
        }
 
-       return xskq_validate_desc(q, desc);
+       return xskq_validate_desc(q, desc, umem);
 }
 
 static inline void xskq_discard_desc(struct xsk_queue *q)
index 516e255cbe8fe2f5ac67b34317bbbadfd69d1535..88f9400524509d85122967d694e142257abab315 100644 (file)
@@ -9,5 +9,11 @@ void syscall_defines(void)
        COMMENT("Linux system call numbers.");
        SYSNR(__NR_write);
        SYSNR(__NR_read);
+#ifdef __NR_mmap2
+       SYSNR(__NR_mmap2);
+#endif
+#ifdef __NR_mmap
        SYSNR(__NR_mmap);
+#endif
 }
index f57f4e1ea1ec3c97d4c9f405349e6c514b6bfaee..35cb0eed3be591c810235e1f6f623827f70097d1 100644 (file)
@@ -68,12 +68,25 @@ PROG(SYS__NR_read)(struct pt_regs *ctx)
        return 0;
 }
 
+#ifdef __NR_mmap2
+PROG(SYS__NR_mmap2)(struct pt_regs *ctx)
+{
+       char fmt[] = "mmap2\n";
+
+       bpf_trace_printk(fmt, sizeof(fmt));
+       return 0;
+}
+#endif
+
+#ifdef __NR_mmap
 PROG(SYS__NR_mmap)(struct pt_regs *ctx)
 {
        char fmt[] = "mmap\n";
+
        bpf_trace_printk(fmt, sizeof(fmt));
        return 0;
 }
+#endif
 
 char _license[] SEC("license") = "GPL";
 u32 _version SEC("version") = LINUX_VERSION_CODE;
index 93eaaf7239b293d366fedb57ac1e6e7df643385d..102eace229568e38e7ff1a64adebdb9152194a8e 100644 (file)
@@ -67,8 +67,14 @@ static int opt_ifindex;
 static int opt_queue;
 static int opt_poll;
 static int opt_interval = 1;
+static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
+static u32 opt_umem_flags;
+static int opt_unaligned_chunks;
+static int opt_mmap_flags;
-static u32 opt_xdp_bind_flags;
 static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+static int opt_timeout = 1000;
+static bool opt_need_wakeup = true;
 static __u32 prog_id;
 
 struct xsk_umem_info {
@@ -282,7 +288,9 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
                .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
                .frame_size = opt_xsk_frame_size,
                .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
+               .flags = opt_umem_flags
        };
+
        int ret;
 
        umem = calloc(1, sizeof(*umem));
@@ -291,6 +299,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
 
        ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
                               &cfg);
+
        if (ret)
                exit_with_error(-ret);
 
@@ -352,6 +361,8 @@ static struct option long_options[] = {
        {"zero-copy", no_argument, 0, 'z'},
        {"copy", no_argument, 0, 'c'},
        {"frame-size", required_argument, 0, 'f'},
+       {"no-need-wakeup", no_argument, 0, 'm'},
+       {"unaligned", no_argument, 0, 'u'},
        {0, 0, 0, 0}
 };
 
@@ -372,6 +383,9 @@ static void usage(const char *prog)
                "  -z, --zero-copy      Force zero-copy mode.\n"
                "  -c, --copy           Force copy mode.\n"
                "  -f, --frame-size=n   Set the frame size (must be a power of two, default is %d).\n"
+               "  -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
+               "  -f, --frame-size=n   Set the frame size (must be a power of two in aligned mode, default is %d).\n"
+               "  -u, --unaligned      Enable unaligned chunk placement\n"
                "\n";
        fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
        exit(EXIT_FAILURE);
@@ -384,8 +398,8 @@ static void parse_command_line(int argc, char **argv)
        opterr = 0;
 
        for (;;) {
-               c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
-                               &option_index);
+               c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:mu",
+                               long_options, &option_index);
                if (c == -1)
                        break;
 
@@ -424,12 +438,21 @@ static void parse_command_line(int argc, char **argv)
                case 'c':
                        opt_xdp_bind_flags |= XDP_COPY;
                        break;
+               case 'u':
+                       opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+                       opt_unaligned_chunks = 1;
+                       opt_mmap_flags = MAP_HUGETLB;
+                       break;
                case 'F':
                        opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
                        break;
                case 'f':
                        opt_xsk_frame_size = atoi(optarg);
+                       break;
+               case 'm':
+                       opt_need_wakeup = false;
+                       opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
                        break;
+
                default:
                        usage(basename(argv[0]));
                }
@@ -442,7 +465,8 @@ static void parse_command_line(int argc, char **argv)
                usage(basename(argv[0]));
        }
 
-       if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) {
+       if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) &&
+           !opt_unaligned_chunks) {
                fprintf(stderr, "--frame-size=%d is not a power of two\n",
                        opt_xsk_frame_size);
                usage(basename(argv[0]));
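
The power-of-two requirement exists because aligned mode recovers a chunk's base address by masking, which only works when the frame size is a power of two. A small stand-alone illustration:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t frame_size = 2048;               /* power of two */
            uint64_t chunk_mask = ~(frame_size - 1);
            uint64_t addr = 3 * frame_size + 100;     /* 100B into chunk 3 */

            /* Masking yields the chunk base (6144 here); with a
             * non-power-of-two size no such mask exists.
             */
            printf("base=%llu\n",
                   (unsigned long long)(addr & chunk_mask));
            return 0;
    }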
@@ -459,8 +483,10 @@ static void kick_tx(struct xsk_socket_info *xsk)
        exit_with_error(errno);
 }
 
-static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
+static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
+                                    struct pollfd *fds)
 {
+       struct xsk_umem_info *umem = xsk->umem;
        u32 idx_cq = 0, idx_fq = 0;
        unsigned int rcvd;
        size_t ndescs;
@@ -468,27 +494,30 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
        if (!xsk->outstanding_tx)
                return;
 
-       kick_tx(xsk);
+       if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+               kick_tx(xsk);
+
        ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
                xsk->outstanding_tx;
 
        /* re-add completed Tx buffers */
-       rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq);
+       rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq);
        if (rcvd > 0) {
                unsigned int i;
                int ret;
 
-               ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+               ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
                while (ret != rcvd) {
                        if (ret < 0)
                                exit_with_error(-ret);
-                       ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
-                                                    &idx_fq);
+                       if (xsk_ring_prod__needs_wakeup(&umem->fq))
+                               ret = poll(fds, num_socks, opt_timeout);
+                       ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
                }
+
                for (i = 0; i < rcvd; i++)
-                       *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
-                               *xsk_ring_cons__comp_addr(&xsk->umem->cq,
-                                                         idx_cq++);
+                       *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) =
+                               *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
 
                xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
                xsk_ring_cons__release(&xsk->umem->cq, rcvd);
@@ -505,7 +534,8 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
        if (!xsk->outstanding_tx)
                return;
 
-       kick_tx(xsk);
+       if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+               kick_tx(xsk);
 
        rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
        if (rcvd > 0) {
@@ -515,30 +545,38 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
        }
 }
 
-static void rx_drop(struct xsk_socket_info *xsk)
+static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
 {
        unsigned int rcvd, i;
        u32 idx_rx = 0, idx_fq = 0;
        int ret;
 
        rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
-       if (!rcvd)
+       if (!rcvd) {
+               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+                       ret = poll(fds, num_socks, opt_timeout);
                return;
+       }
 
        ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
        while (ret != rcvd) {
                if (ret < 0)
                        exit_with_error(-ret);
+               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+                       ret = poll(fds, num_socks, opt_timeout);
                ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
        }
 
        for (i = 0; i < rcvd; i++) {
                u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
                u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
+               u64 orig = xsk_umem__extract_addr(addr);
+
+               addr = xsk_umem__add_offset_to_addr(addr);
                char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
 
                hex_dump(pkt, len, addr);
-               *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr;
+               *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
        }
 
        xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
@@ -549,42 +587,65 @@ static void rx_drop(struct xsk_socket_info *xsk)
 static void rx_drop_all(void)
 {
        struct pollfd fds[MAX_SOCKS + 1];
-       int i, ret, timeout, nfds = 1;
+       int i, ret;
 
        memset(fds, 0, sizeof(fds));
 
        for (i = 0; i < num_socks; i++) {
                fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
                fds[i].events = POLLIN;
-               timeout = 1000; /* 1sn */
        }
 
        for (;;) {
                if (opt_poll) {
-                       ret = poll(fds, nfds, timeout);
+                       ret = poll(fds, num_socks, opt_timeout);
                        if (ret <= 0)
                                continue;
                }
 
                for (i = 0; i < num_socks; i++)
-                       rx_drop(xsks[i]);
+                       rx_drop(xsks[i], fds);
+       }
+}
+
+static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb)
+{
+       u32 idx;
+
+       if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) {
+               unsigned int i;
+
+               for (i = 0; i < BATCH_SIZE; i++) {
+                       xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr =
+                               (*frame_nb + i) * opt_xsk_frame_size;
+                       xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
+                               sizeof(pkt_data) - 1;
+               }
+
+               xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
+               xsk->outstanding_tx += BATCH_SIZE;
+               *frame_nb += BATCH_SIZE;
+               *frame_nb %= NUM_FRAMES;
        }
+
+       complete_tx_only(xsk);
 }
 
-static void tx_only(struct xsk_socket_info *xsk)
+static void tx_only_all(void)
 {
-       int timeout, ret, nfds = 1;
-       struct pollfd fds[nfds + 1];
-       u32 idx, frame_nb = 0;
+       struct pollfd fds[MAX_SOCKS];
+       u32 frame_nb[MAX_SOCKS] = {};
+       int i, ret;
 
        memset(fds, 0, sizeof(fds));
-       fds[0].fd = xsk_socket__fd(xsk->xsk);
-       fds[0].events = POLLOUT;
-       timeout = 1000; /* 1sn */
+       for (i = 0; i < num_socks; i++) {
+               fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
+               fds[i].events = POLLOUT;
+       }
 
        for (;;) {
                if (opt_poll) {
-                       ret = poll(fds, nfds, timeout);
+                       ret = poll(fds, num_socks, opt_timeout);
                        if (ret <= 0)
                                continue;
 
@@ -592,69 +653,78 @@ static void tx_only(struct xsk_socket_info *xsk)
                                continue;
                }
 
-               if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
-                   BATCH_SIZE) {
-                       unsigned int i;
-
-                       for (i = 0; i < BATCH_SIZE; i++) {
-                               xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
-                                       = (frame_nb + i) * opt_xsk_frame_size;
-                               xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
-                                       sizeof(pkt_data) - 1;
-                       }
-
-                       xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
-                       xsk->outstanding_tx += BATCH_SIZE;
-                       frame_nb += BATCH_SIZE;
-                       frame_nb %= NUM_FRAMES;
-               }
-
-               complete_tx_only(xsk);
+               for (i = 0; i < num_socks; i++)
+                       tx_only(xsks[i], &frame_nb[i]);
        }
 }
 
-static void l2fwd(struct xsk_socket_info *xsk)
+static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
 {
-       for (;;) {
-               unsigned int rcvd, i;
-               u32 idx_rx = 0, idx_tx = 0;
-               int ret;
+       unsigned int rcvd, i;
+       u32 idx_rx = 0, idx_tx = 0;
+       int ret;
 
-               for (;;) {
-                       complete_tx_l2fwd(xsk);
+       complete_tx_l2fwd(xsk, fds);
 
-                       rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
-                                                  &idx_rx);
-                       if (rcvd > 0)
-                               break;
-               }
+       rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
+       if (!rcvd) {
+               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+                       ret = poll(fds, num_socks, opt_timeout);
+               return;
+       }
 
+       ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
+       while (ret != rcvd) {
+               if (ret < 0)
+                       exit_with_error(-ret);
+               if (xsk_ring_prod__needs_wakeup(&xsk->tx))
+                       kick_tx(xsk);
                ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
-               while (ret != rcvd) {
-                       if (ret < 0)
-                               exit_with_error(-ret);
-                       ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
-               }
+       }
 
-               for (i = 0; i < rcvd; i++) {
-                       u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
-                                                         idx_rx)->addr;
-                       u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
-                                                        idx_rx++)->len;
-                       char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
+       for (i = 0; i < rcvd; i++) {
+               u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
+               u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
+               u64 orig = xsk_umem__extract_addr(addr);
 
-                       swap_mac_addresses(pkt);
+               addr = xsk_umem__add_offset_to_addr(addr);
+               char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
 
-                       hex_dump(pkt, len, addr);
-                       xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
-                       xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
-               }
+               swap_mac_addresses(pkt);
+
+               hex_dump(pkt, len, addr);
+               xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig;
+               xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
+       }
+
+       xsk_ring_prod__submit(&xsk->tx, rcvd);
+       xsk_ring_cons__release(&xsk->rx, rcvd);
+
+       xsk->rx_npkts += rcvd;
+       xsk->outstanding_tx += rcvd;
+}
 
-               xsk_ring_prod__submit(&xsk->tx, rcvd);
-               xsk_ring_cons__release(&xsk->rx, rcvd);
+static void l2fwd_all(void)
+{
+       struct pollfd fds[MAX_SOCKS];
+       int i, ret;
+
+       memset(fds, 0, sizeof(fds));
 
-               xsk->rx_npkts += rcvd;
-               xsk->outstanding_tx += rcvd;
+       for (i = 0; i < num_socks; i++) {
+               fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
+               fds[i].events = POLLOUT | POLLIN;
+       }
+
+       for (;;) {
+               if (opt_poll) {
+                       ret = poll(fds, num_socks, opt_timeout);
+                       if (ret <= 0)
+                               continue;
+               }
+
+               for (i = 0; i < num_socks; i++)
+                       l2fwd(xsks[i], fds);
        }
 }
 
@@ -674,11 +744,14 @@ int main(int argc, char **argv)
                exit(EXIT_FAILURE);
        }
 
-       ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
-                            NUM_FRAMES * opt_xsk_frame_size);
-       if (ret)
-               exit_with_error(ret);
-
+       /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */
+       bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size,
+                   PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0);
+       if (bufs == MAP_FAILED) {
+               printf("ERROR: mmap failed\n");
+               exit(EXIT_FAILURE);
+       }
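
One caveat on the MAP_HUGETLB path: the mapping fails unless hugepages have been reserved (for example via /proc/sys/vm/nr_hugepages), and lengths are typically rounded to the hugepage size. A defensive sketch, assuming 2 MiB hugepages (round_up_to_hugepage() is a hypothetical helper, not part of this sample):

    #include <stddef.h>

    /* Round len up to a multiple of the assumed 2 MiB hugepage size. */
    static size_t round_up_to_hugepage(size_t len)
    {
            const size_t huge_sz = 2UL * 1024 * 1024;

            return (len + huge_sz - 1) & ~(huge_sz - 1);
    }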
        /* Create sockets... */
        umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
        xsks[num_socks++] = xsk_configure_socket(umem);
@@ -705,9 +778,9 @@ int main(int argc, char **argv)
        if (opt_bench == BENCH_RXDROP)
                rx_drop_all();
        else if (opt_bench == BENCH_TXONLY)
-               tx_only(xsks[0]);
+               tx_only_all();
        else
-               l2fwd(xsks[0]);
+               l2fwd_all();
 
        return 0;
 }
index c311933401081fd48b5451605ff3a4a11f4893da..0d8f41db8cd6e04a3fce6c6cfe9fcb300e6a4500 100755 (executable)
@@ -115,10 +115,12 @@ gen_btf()
        LLVM_OBJCOPY=${OBJCOPY} ${PAHOLE} -J ${1}
 
        # dump .BTF section into raw binary file to link with final vmlinux
-       bin_arch=$(${OBJDUMP} -f ${1} | grep architecture | \
+       bin_arch=$(LANG=C ${OBJDUMP} -f ${1} | grep architecture | \
                cut -d, -f1 | cut -d' ' -f2)
+       bin_format=$(LANG=C ${OBJDUMP} -f ${1} | grep 'file format' | \
+               awk '{print $4}')
        ${OBJCOPY} --dump-section .BTF=.btf.vmlinux.bin ${1} 2>/dev/null
-       ${OBJCOPY} -I binary -O ${CONFIG_OUTPUT_FORMAT} -B ${bin_arch} \
+       ${OBJCOPY} -I binary -O ${bin_format} -B ${bin_arch} \
                --rename-section .data=.BTF .btf.vmlinux.bin ${2}
 }
 
index dfe2bd5a4b9538a87572d45a36c6ec8e4dcdfaf0..59024197e71dd185b393544d8a3b0714a41c7b91 100644 (file)
@@ -1,4 +1,5 @@
 FEATURE-DUMP.bpf
+feature
 bpf_asm
 bpf_dbg
 bpf_exp.yacc.*
index 53b60ad452f5d3ebd5d2edad0b80ef12558f2215..fbf5e4a0cb9c9cdb7c3c13c28e2d4cf3b5b90f3b 100644 (file)
@@ -81,10 +81,11 @@ $(OUTPUT)bpf_exp.lex.o: $(OUTPUT)bpf_exp.lex.c
 
 clean: bpftool_clean
        $(call QUIET_CLEAN, bpf-progs)
-       $(Q)rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
+       $(Q)$(RM) -r -- $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
               $(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.*
        $(call QUIET_CLEAN, core-gen)
-       $(Q)rm -f $(OUTPUT)FEATURE-DUMP.bpf
+       $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpf
+       $(Q)$(RM) -r -- $(OUTPUT)feature
 
 install: $(PROGS) bpftool_install
        $(call QUIET_INSTALL, bpf_jit_disasm)
index 8248b8dd89d4b9cd4b2d9622623d43f2140a42f2..b13926432b84c6fa7da8afd938bb06be8ffcb981 100644 (file)
@@ -3,3 +3,5 @@
 bpftool*.8
 bpf-helpers.*
 FEATURE-DUMP.bpftool
+feature
+libbpf
index 6694a0fc8f99d505a4d64d1a7e604d9c646896ef..39615f8e145b255fbbb13b0d5edc8adf35a7cdbb 100644 (file)
@@ -19,6 +19,7 @@ SYNOPSIS
 BTF COMMANDS
 =============
 
+|      **bpftool** **btf** { **show** | **list** } [**id** *BTF_ID*]
 |      **bpftool** **btf dump** *BTF_SRC* [**format** *FORMAT*]
 |      **bpftool** **btf help**
 |
@@ -29,6 +30,12 @@ BTF COMMANDS
 
 DESCRIPTION
 ===========
+       **bpftool btf { show | list }** [**id** *BTF_ID*]
+                 Show information about loaded BTF objects. If a BTF ID is
+                 specified, show information only about the given BTF object;
+                 otherwise, list all BTF objects currently loaded on the
+                 system.
+
        **bpftool btf dump** *BTF_SRC*
                  Dump BTF entries from a given *BTF_SRC*.
 
index 61d1d270eb5eb577a28e178710ea59cd5fec60c8..1c0f7146aab0a8fc749f6ec87aae642fd3552783 100644 (file)
@@ -36,6 +36,7 @@ MAP COMMANDS
 |      **bpftool** **map pop**        *MAP*
 |      **bpftool** **map enqueue**    *MAP* **value** *VALUE*
 |      **bpftool** **map dequeue**    *MAP*
+|      **bpftool** **map freeze**     *MAP*
 |      **bpftool** **map help**
 |
 |      *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@@ -127,6 +128,14 @@ DESCRIPTION
        **bpftool map dequeue**  *MAP*
                  Dequeue and print **value** from the queue.
 
+       **bpftool map freeze**  *MAP*
+                 Freeze the map as read-only from user space. Entries from a
+                 frozen map can no longer be updated or deleted with the
+                 **bpf\ ()** system call. This operation is not reversible,
+                 and the map remains immutable from user space until its
+                 destruction. However, read and write permissions for BPF
+                 programs to the map remain unchanged.
+
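
For programmatic use, libbpf wraps the underlying BPF_MAP_FREEZE syscall command as bpf_map_freeze(); a minimal sketch:

    #include <bpf/bpf.h>

    /* map_fd refers to an existing BPF map. After freezing, user-space
     * update/delete calls fail, while BPF program access is unchanged.
     */
    int freeze_map(int map_fd)
    {
            return bpf_map_freeze(map_fd);
    }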
        **bpftool map help**
                  Print short help message.
 
index d8e5237a2085966faa2b1ed312ab5b857e920bef..8651b00b81ea05ff6d9c7275167be87fce83d27c 100644 (file)
@@ -15,17 +15,22 @@ SYNOPSIS
        *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
 
        *COMMANDS* :=
-       { **show** | **list** } [ **dev** name ] | **help**
+       { **show** | **list** | **attach** | **detach** | **help** }
 
 NET COMMANDS
 ============
 
-|      **bpftool** **net { show | list } [ dev name ]**
+|      **bpftool** **net { show | list }** [ **dev** *NAME* ]
+|      **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
+|      **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
 |      **bpftool** **net help**
+|
+|      *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+|      *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** }
 
 DESCRIPTION
 ===========
-       **bpftool net { show | list } [ dev name ]**
+       **bpftool net { show | list }** [ **dev** *NAME* ]
                   List bpf program attachments in the kernel networking subsystem.
 
                   Currently, only device driver xdp attachments and tc filter
@@ -47,6 +52,24 @@ DESCRIPTION
                   all bpf programs attached to non clsact qdiscs, and finally all
                   bpf programs attached to root and clsact qdisc.
 
+       **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *NAME* [ **overwrite** ]
+                  Attach bpf program *PROG* to network interface *NAME* with
+                  type specified by *ATTACH_TYPE*. A previously attached bpf
+                  program can be replaced by rerunning the command with the
+                  **overwrite** option.
+                  Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
+
+                  *ATTACH_TYPE* can be one of:
+                  **xdp** - try native XDP and fall back to generic XDP if the NIC driver does not support it;
+                  **xdpgeneric** - generic XDP: runs at the generic XDP hook, after the packet has already entered the receive path as an skb;
+                  **xdpdrv** - native XDP: runs at the earliest point in the driver's receive path;
+                  **xdpoffload** - offloaded XDP: runs directly on the NIC for each received packet.
+
+       **bpftool** **net detach** *ATTACH_TYPE* **dev** *NAME*
+                  Detach the bpf program attached to network interface *NAME*
+                  with type specified by *ATTACH_TYPE*. The same *ATTACH_TYPE*
+                  that was used for attach must be specified for detach.
+                  Currently, only XDP-related modes are supported for *ATTACH_TYPE*.
+
        **bpftool net help**
                  Print short help message.
 
@@ -137,6 +160,34 @@ EXAMPLES
         }
     ]
 
+|
+| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
+| **# bpftool net**
+
+::
+
+      xdp:
+      enp6s0np0(4) driver id 16
+
+|
+| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
+| **# bpftool net attach xdpdrv id 20 dev enp6s0np0 overwrite**
+| **# bpftool net**
+
+::
+
+      xdp:
+      enp6s0np0(4) driver id 20
+
+|
+| **# bpftool net attach xdpdrv id 16 dev enp6s0np0**
+| **# bpftool net detach xdpdrv dev enp6s0np0**
+| **# bpftool net**
+
+::
+
+      xdp:
+
 
 SEE ALSO
 ========
index 4c9d1ffc3fc72194117df93b5a37a5dcd21ab9c5..39bc6f0f4f0bb839ade8b6962bff515729f318ea 100644 (file)
@@ -17,27 +17,30 @@ endif
 BPF_DIR = $(srctree)/tools/lib/bpf/
 
 ifneq ($(OUTPUT),)
-  BPF_PATH = $(OUTPUT)
+  LIBBPF_OUTPUT = $(OUTPUT)/libbpf/
+  LIBBPF_PATH = $(LIBBPF_OUTPUT)
 else
-  BPF_PATH = $(BPF_DIR)
+  LIBBPF_PATH = $(BPF_DIR)
 endif
 
-LIBBPF = $(BPF_PATH)libbpf.a
+LIBBPF = $(LIBBPF_PATH)libbpf.a
 
-BPFTOOL_VERSION := $(shell make --no-print-directory -sC ../../.. kernelversion)
+BPFTOOL_VERSION := $(shell make -rR --no-print-directory -sC ../../.. kernelversion)
 
 $(LIBBPF): FORCE
-       $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) $(OUTPUT)libbpf.a
+       $(if $(LIBBPF_OUTPUT),@mkdir -p $(LIBBPF_OUTPUT))
+       $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) $(LIBBPF_OUTPUT)libbpf.a
 
 $(LIBBPF)-clean:
        $(call QUIET_CLEAN, libbpf)
-       $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(OUTPUT) clean >/dev/null
+       $(Q)$(MAKE) -C $(BPF_DIR) OUTPUT=$(LIBBPF_OUTPUT) clean >/dev/null
 
 prefix ?= /usr/local
 bash_compdir ?= /usr/share/bash-completion/completions
 
 CFLAGS += -O2
-CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -Wno-missing-field-initializers
+CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers
+CFLAGS += $(filter-out -Wswitch-enum,$(EXTRA_WARNINGS))
 CFLAGS += -DPACKAGE='"bpftool"' -D__EXPORTED_HEADERS__ \
        -I$(srctree)/kernel/bpf/ \
        -I$(srctree)/tools/include \
@@ -52,7 +55,7 @@ ifneq ($(EXTRA_LDFLAGS),)
 LDFLAGS += $(EXTRA_LDFLAGS)
 endif
 
-LIBS = -lelf -lz $(LIBBPF)
+LIBS = $(LIBBPF) -lelf -lz
 
 INSTALL ?= install
 RM ?= rm -f
@@ -114,16 +117,18 @@ $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
 $(OUTPUT)feature.o: | zdep
 
 $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
-       $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
+       $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJS) $(LIBS)
 
 $(OUTPUT)%.o: %.c
        $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
 
 clean: $(LIBBPF)-clean
        $(call QUIET_CLEAN, bpftool)
-       $(Q)$(RM) $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+       $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+       $(Q)$(RM) -r -- $(OUTPUT)libbpf/
        $(call QUIET_CLEAN, core-gen)
-       $(Q)$(RM) $(OUTPUT)FEATURE-DUMP.bpftool
+       $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
+       $(Q)$(RM) -r -- $(OUTPUT)feature/
 
 install: $(OUTPUT)bpftool
        $(call QUIET_INSTALL, bpftool)
@@ -134,8 +139,8 @@ install: $(OUTPUT)bpftool
 
 uninstall:
        $(call QUIET_UNINST, bpftool)
-       $(Q)$(RM) $(DESTDIR)$(prefix)/sbin/bpftool
-       $(Q)$(RM) $(DESTDIR)$(bash_compdir)/bpftool
+       $(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool
+       $(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool
 
 doc:
        $(call descend,Documentation)
index df16c54154442e87830963755c557ff89f7410eb..70493a6da206728112149cbb7aa31f3b8448197a 100644 (file)
@@ -73,8 +73,8 @@ _bpftool_get_prog_tags()
 
 _bpftool_get_btf_ids()
 {
-    COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \
-        command sed -n 's/.*"btf_id": \(.*\),\?$/\1/p' )" -- "$cur" ) )
+    COMPREPLY+=( $( compgen -W "$( bpftool -jp btf 2>&1 | \
+        command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) )
 }
 
 _bpftool_get_obj_map_names()
@@ -201,6 +201,10 @@ _bpftool()
             _bpftool_get_prog_tags
             return 0
             ;;
+        dev)
+            _sysfs_get_netdevs
+            return 0
+            ;;
         file|pinned)
             _filedir
             return 0
@@ -399,10 +403,6 @@ _bpftool()
                             _filedir
                             return 0
                             ;;
-                        dev)
-                            _sysfs_get_netdevs
-                            return 0
-                            ;;
                         *)
                             COMPREPLY=( $( compgen -W "map" -- "$cur" ) )
                             _bpftool_once_attr 'type'
@@ -449,7 +449,7 @@ _bpftool()
         map)
             local MAP_TYPE='id pinned'
             case $command in
-                show|list|dump|peek|pop|dequeue)
+                show|list|dump|peek|pop|dequeue|freeze)
                     case $prev in
                         $command)
                             COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) )
@@ -498,10 +498,6 @@ _bpftool()
                         key|value|flags|name|entries)
                             return 0
                             ;;
-                        dev)
-                            _sysfs_get_netdevs
-                            return 0
-                            ;;
                         *)
                             _bpftool_once_attr 'type'
                             _bpftool_once_attr 'key'
@@ -642,7 +638,7 @@ _bpftool()
                     [[ $prev == $object ]] && \
                         COMPREPLY=( $( compgen -W 'delete dump getnext help \
                             lookup pin event_pipe show list update create \
-                            peek push enqueue pop dequeue' -- \
+                            peek push enqueue pop dequeue freeze' -- \
                             "$cur" ) )
                     ;;
             esac
@@ -674,7 +670,7 @@ _bpftool()
                                 map)
                                     _bpftool_get_map_ids
                                     ;;
-                                dump)
+                                $command)
                                     _bpftool_get_btf_ids
                                     ;;
                             esac
@@ -702,9 +698,21 @@ _bpftool()
                             ;;
                     esac
                     ;;
+                show|list)
+                    case $prev in
+                        $command)
+                            COMPREPLY+=( $( compgen -W "id" -- "$cur" ) )
+                            ;;
+                        id)
+                            _bpftool_get_btf_ids
+                            ;;
+                    esac
+                    return 0
+                    ;;
                 *)
                     [[ $prev == $object ]] && \
-                        COMPREPLY=( $( compgen -W 'dump help' -- "$cur" ) )
+                        COMPREPLY=( $( compgen -W 'dump help show list' \
+                            -- "$cur" ) )
                     ;;
             esac
             ;;
@@ -778,18 +786,67 @@ _bpftool()
             esac
             ;;
         net)
+            local PROG_TYPE='id pinned tag'
+            local ATTACH_TYPES='xdp xdpgeneric xdpdrv xdpoffload'
             case $command in
+                show|list)
+                    [[ $prev != "$command" ]] && return 0
+                    COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
+                    return 0
+                    ;;
+                attach)
+                    case $cword in
+                        3)
+                            COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- "$cur" ) )
+                            return 0
+                            ;;
+                        4)
+                            COMPREPLY=( $( compgen -W "$PROG_TYPE" -- "$cur" ) )
+                            return 0
+                            ;;
+                        5)
+                            case $prev in
+                                id)
+                                    _bpftool_get_prog_ids
+                                    ;;
+                                pinned)
+                                    _filedir
+                                    ;;
+                            esac
+                            return 0
+                            ;;
+                        6)
+                            COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
+                            return 0
+                            ;;
+                        8)
+                            _bpftool_once_attr 'overwrite'
+                            return 0
+                            ;;
+                    esac
+                    ;;
+                detach)
+                    case $cword in
+                        3)
+                            COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- "$cur" ) )
+                            return 0
+                            ;;
+                        4)
+                            COMPREPLY=( $( compgen -W 'dev' -- "$cur" ) )
+                            return 0
+                            ;;
+                    esac
+                    ;;
                 *)
                     [[ $prev == $object ]] && \
                         COMPREPLY=( $( compgen -W 'help \
-                            show list' -- "$cur" ) )
+                            show list attach detach' -- "$cur" ) )
                     ;;
             esac
             ;;
         feature)
             case $command in
                 probe)
-                    [[ $prev == "dev" ]] && _sysfs_get_netdevs && return 0
                     [[ $prev == "prefix" ]] && return 0
                     if _bpftool_search_list 'macros'; then
                         COMPREPLY+=( $( compgen -W 'prefix' -- "$cur" ) )
index 1b8ec91899e6aad9a069e658f4e08c9d761084f2..9a9376d1d3df2aab84609cc0a6e42ce5beb687a7 100644 (file)
@@ -11,6 +11,7 @@
 #include <bpf.h>
 #include <libbpf.h>
 #include <linux/btf.h>
+#include <linux/hashtable.h>
 
 #include "btf.h"
 #include "json_writer.h"
@@ -35,6 +36,16 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
        [BTF_KIND_DATASEC]      = "DATASEC",
 };
 
+struct btf_attach_table {
+       DECLARE_HASHTABLE(table, 16);
+};
+
+struct btf_attach_point {
+       __u32 obj_id;
+       __u32 btf_id;
+       struct hlist_node hash;
+};
+
 static const char *btf_int_enc_str(__u8 encoding)
 {
        switch (encoding) {
@@ -449,7 +460,7 @@ static int do_dump(int argc, char **argv)
 
                btf_id = strtoul(*argv, &endptr, 0);
                if (*endptr) {
-                       p_err("can't parse %s as ID", **argv);
+                       p_err("can't parse %s as ID", *argv);
                        return -1;
                }
                NEXT_ARG();
@@ -522,6 +533,330 @@ done:
        return err;
 }
 
+static int btf_parse_fd(int *argc, char ***argv)
+{
+       unsigned int id;
+       char *endptr;
+       int fd;
+
+       if (!is_prefix(*argv[0], "id")) {
+               p_err("expected 'id', got: '%s'?", **argv);
+               return -1;
+       }
+       NEXT_ARGP();
+
+       id = strtoul(**argv, &endptr, 0);
+       if (*endptr) {
+               p_err("can't parse %s as ID", **argv);
+               return -1;
+       }
+       NEXT_ARGP();
+
+       fd = bpf_btf_get_fd_by_id(id);
+       if (fd < 0)
+               p_err("can't get BTF object by id (%u): %s",
+                     id, strerror(errno));
+
+       return fd;
+}
+
+static void delete_btf_table(struct btf_attach_table *tab)
+{
+       struct btf_attach_point *obj;
+       struct hlist_node *tmp;
+       unsigned int bkt;
+
+       hash_for_each_safe(tab->table, bkt, tmp, obj, hash) {
+               hash_del(&obj->hash);
+               free(obj);
+       }
+}
+
+static int
+build_btf_type_table(struct btf_attach_table *tab, enum bpf_obj_type type,
+                    void *info, __u32 *len)
+{
+       static const char * const names[] = {
+               [BPF_OBJ_UNKNOWN]       = "unknown",
+               [BPF_OBJ_PROG]          = "prog",
+               [BPF_OBJ_MAP]           = "map",
+       };
+       struct btf_attach_point *obj_node;
+       __u32 btf_id, id = 0;
+       int err;
+       int fd;
+
+       while (true) {
+               switch (type) {
+               case BPF_OBJ_PROG:
+                       err = bpf_prog_get_next_id(id, &id);
+                       break;
+               case BPF_OBJ_MAP:
+                       err = bpf_map_get_next_id(id, &id);
+                       break;
+               default:
+                       err = -1;
+                       p_err("unexpected object type: %d", type);
+                       goto err_free;
+               }
+               if (err) {
+                       if (errno == ENOENT) {
+                               err = 0;
+                               break;
+                       }
+                       p_err("can't get next %s: %s%s", names[type],
+                             strerror(errno),
+                             errno == EINVAL ? " -- kernel too old?" : "");
+                       goto err_free;
+               }
+
+               switch (type) {
+               case BPF_OBJ_PROG:
+                       fd = bpf_prog_get_fd_by_id(id);
+                       break;
+               case BPF_OBJ_MAP:
+                       fd = bpf_map_get_fd_by_id(id);
+                       break;
+               default:
+                       err = -1;
+                       p_err("unexpected object type: %d", type);
+                       goto err_free;
+               }
+               if (fd < 0) {
+                       if (errno == ENOENT)
+                               continue;
+                       p_err("can't get %s by id (%u): %s", names[type], id,
+                             strerror(errno));
+                       err = -1;
+                       goto err_free;
+               }
+
+               memset(info, 0, *len);
+               err = bpf_obj_get_info_by_fd(fd, info, len);
+               close(fd);
+               if (err) {
+                       p_err("can't get %s info: %s", names[type],
+                             strerror(errno));
+                       goto err_free;
+               }
+
+               switch (type) {
+               case BPF_OBJ_PROG:
+                       btf_id = ((struct bpf_prog_info *)info)->btf_id;
+                       break;
+               case BPF_OBJ_MAP:
+                       btf_id = ((struct bpf_map_info *)info)->btf_id;
+                       break;
+               default:
+                       err = -1;
+                       p_err("unexpected object type: %d", type);
+                       goto err_free;
+               }
+               if (!btf_id)
+                       continue;
+
+               obj_node = calloc(1, sizeof(*obj_node));
+               if (!obj_node) {
+                       p_err("failed to allocate memory: %s", strerror(errno));
+                       goto err_free;
+               }
+
+               obj_node->obj_id = id;
+               obj_node->btf_id = btf_id;
+               hash_add(tab->table, &obj_node->hash, obj_node->btf_id);
+       }
+
+       return 0;
+
+err_free:
+       delete_btf_table(tab);
+       return err;
+}
+
+static int
+build_btf_tables(struct btf_attach_table *btf_prog_table,
+                struct btf_attach_table *btf_map_table)
+{
+       struct bpf_prog_info prog_info;
+       __u32 prog_len = sizeof(prog_info);
+       struct bpf_map_info map_info;
+       __u32 map_len = sizeof(map_info);
+       int err = 0;
+
+       err = build_btf_type_table(btf_prog_table, BPF_OBJ_PROG, &prog_info,
+                                  &prog_len);
+       if (err)
+               return err;
+
+       err = build_btf_type_table(btf_map_table, BPF_OBJ_MAP, &map_info,
+                                  &map_len);
+       if (err) {
+               delete_btf_table(btf_prog_table);
+               return err;
+       }
+
+       return 0;
+}
+
+static void
+show_btf_plain(struct bpf_btf_info *info, int fd,
+              struct btf_attach_table *btf_prog_table,
+              struct btf_attach_table *btf_map_table)
+{
+       struct btf_attach_point *obj;
+       int n;
+
+       printf("%u: ", info->id);
+       printf("size %uB", info->btf_size);
+
+       n = 0;
+       hash_for_each_possible(btf_prog_table->table, obj, hash, info->id) {
+               if (obj->btf_id == info->id)
+                       printf("%s%u", n++ == 0 ? "  prog_ids " : ",",
+                              obj->obj_id);
+       }
+
+       n = 0;
+       hash_for_each_possible(btf_map_table->table, obj, hash, info->id) {
+               if (obj->btf_id == info->id)
+                       printf("%s%u", n++ == 0 ? "  map_ids " : ",",
+                              obj->obj_id);
+       }
+
+       printf("\n");
+}
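
With this formatter, plain output lines look like the following (illustrative IDs and sizes):

    3: size 4356B  prog_ids 27  map_ids 13
    9: size 2734B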
+
+static void
+show_btf_json(struct bpf_btf_info *info, int fd,
+             struct btf_attach_table *btf_prog_table,
+             struct btf_attach_table *btf_map_table)
+{
+       struct btf_attach_point *obj;
+
+       jsonw_start_object(json_wtr);   /* btf object */
+       jsonw_uint_field(json_wtr, "id", info->id);
+       jsonw_uint_field(json_wtr, "size", info->btf_size);
+
+       jsonw_name(json_wtr, "prog_ids");
+       jsonw_start_array(json_wtr);    /* prog_ids */
+       hash_for_each_possible(btf_prog_table->table, obj, hash,
+                              info->id) {
+               if (obj->btf_id == info->id)
+                       jsonw_uint(json_wtr, obj->obj_id);
+       }
+       jsonw_end_array(json_wtr);      /* prog_ids */
+
+       jsonw_name(json_wtr, "map_ids");
+       jsonw_start_array(json_wtr);    /* map_ids */
+       hash_for_each_possible(btf_map_table->table, obj, hash,
+                              info->id) {
+               if (obj->btf_id == info->id)
+                       jsonw_uint(json_wtr, obj->obj_id);
+       }
+       jsonw_end_array(json_wtr);      /* map_ids */
+       jsonw_end_object(json_wtr);     /* btf object */
+}
+
+static int
+show_btf(int fd, struct btf_attach_table *btf_prog_table,
+        struct btf_attach_table *btf_map_table)
+{
+       struct bpf_btf_info info = {};
+       __u32 len = sizeof(info);
+       int err;
+
+       err = bpf_obj_get_info_by_fd(fd, &info, &len);
+       if (err) {
+               p_err("can't get BTF object info: %s", strerror(errno));
+               return -1;
+       }
+
+       if (json_output)
+               show_btf_json(&info, fd, btf_prog_table, btf_map_table);
+       else
+               show_btf_plain(&info, fd, btf_prog_table, btf_map_table);
+
+       return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+       struct btf_attach_table btf_prog_table;
+       struct btf_attach_table btf_map_table;
+       int err, fd = -1;
+       __u32 id = 0;
+
+       if (argc == 2) {
+               fd = btf_parse_fd(&argc, &argv);
+               if (fd < 0)
+                       return -1;
+       }
+
+       if (argc) {
+               if (fd >= 0)
+                       close(fd);
+               return BAD_ARG();
+       }
+
+       hash_init(btf_prog_table.table);
+       hash_init(btf_map_table.table);
+       err = build_btf_tables(&btf_prog_table, &btf_map_table);
+       if (err) {
+               if (fd >= 0)
+                       close(fd);
+               return err;
+       }
+
+       if (fd >= 0) {
+               err = show_btf(fd, &btf_prog_table, &btf_map_table);
+               close(fd);
+               goto exit_free;
+       }
+
+       if (json_output)
+               jsonw_start_array(json_wtr);    /* root array */
+
+       while (true) {
+               err = bpf_btf_get_next_id(id, &id);
+               if (err) {
+                       if (errno == ENOENT) {
+                               err = 0;
+                               break;
+                       }
+                       p_err("can't get next BTF object: %s%s",
+                             strerror(errno),
+                             errno == EINVAL ? " -- kernel too old?" : "");
+                       err = -1;
+                       break;
+               }
+
+               fd = bpf_btf_get_fd_by_id(id);
+               if (fd < 0) {
+                       if (errno == ENOENT)
+                               continue;
+                       p_err("can't get BTF object by id (%u): %s",
+                             id, strerror(errno));
+                       err = -1;
+                       break;
+               }
+
+               err = show_btf(fd, &btf_prog_table, &btf_map_table);
+               close(fd);
+               if (err)
+                       break;
+       }
+
+       if (json_output)
+               jsonw_end_array(json_wtr);      /* root array */
+
+exit_free:
+       delete_btf_table(&btf_prog_table);
+       delete_btf_table(&btf_map_table);
+
+       return err;
+}
+
 static int do_help(int argc, char **argv)
 {
        if (json_output) {
@@ -530,7 +865,8 @@ static int do_help(int argc, char **argv)
        }
 
        fprintf(stderr,
-               "Usage: %s btf dump BTF_SRC [format FORMAT]\n"
+               "Usage: %s btf { show | list } [id BTF_ID]\n"
+               "       %s btf dump BTF_SRC [format FORMAT]\n"
                "       %s btf help\n"
                "\n"
                "       BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n"
@@ -539,12 +875,14 @@ static int do_help(int argc, char **argv)
                "       " HELP_SPEC_PROGRAM "\n"
                "       " HELP_SPEC_OPTIONS "\n"
                "",
-               bin_name, bin_name);
+               bin_name, bin_name, bin_name);
 
        return 0;
 }
 
 static const struct cmd cmds[] = {
+       { "show",       do_show },
+       { "list",       do_show },
        { "help",       do_help },
        { "dump",       do_dump },
        { 0 }
index 8cafb9b314672fe238cb3d4e40b9dafcba1f2301..d66131f696892065240e225e2c86d44278ea4e6e 100644 (file)
@@ -26,9 +26,9 @@ static void btf_dumper_ptr(const void *data, json_writer_t *jw,
                           bool is_plain_text)
 {
        if (is_plain_text)
-               jsonw_printf(jw, "%p", *(unsigned long *)data);
+               jsonw_printf(jw, "%p", data);
        else
-               jsonw_printf(jw, "%u", *(unsigned long *)data);
+               jsonw_printf(jw, "%lu", *(unsigned long *)data);
 }
 
 static int btf_dumper_modifier(const struct btf_dumper *d, __u32 type_id,
@@ -216,7 +216,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset,
        switch (BTF_INT_ENCODING(*int_type)) {
        case 0:
                if (BTF_INT_BITS(*int_type) == 64)
-                       jsonw_printf(jw, "%lu", *(__u64 *)data);
+                       jsonw_printf(jw, "%llu", *(__u64 *)data);
                else if (BTF_INT_BITS(*int_type) == 32)
                        jsonw_printf(jw, "%u", *(__u32 *)data);
                else if (BTF_INT_BITS(*int_type) == 16)
@@ -229,7 +229,7 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset,
                break;
        case BTF_INT_SIGNED:
                if (BTF_INT_BITS(*int_type) == 64)
-                       jsonw_printf(jw, "%ld", *(long long *)data);
+                       jsonw_printf(jw, "%lld", *(long long *)data);
                else if (BTF_INT_BITS(*int_type) == 32)
                        jsonw_printf(jw, "%d", *(int *)data);
                else if (BTF_INT_BITS(*int_type) == 16)
index 44352b5aca8507467e42710c13c19c1083dfb08c..1ef45e55039e191da13e6138be3e62c7e24dcebf 100644 (file)
@@ -120,8 +120,8 @@ static int count_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type)
 static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
                                   int level)
 {
+       const char *attach_flags_str;
        __u32 prog_ids[1024] = {0};
-       char *attach_flags_str;
        __u32 prog_cnt, iter;
        __u32 attach_flags;
        char buf[32];
index 6a71324be6283eccb0d816a72dd61f66b4527de6..88264abaa738ad1dbab60b407ba584dac617a0b6 100644 (file)
@@ -29,7 +29,7 @@
 #define BPF_FS_MAGIC           0xcafe4a11
 #endif
 
-void __printf(1, 2) p_err(const char *fmt, ...)
+void p_err(const char *fmt, ...)
 {
        va_list ap;
 
@@ -47,7 +47,7 @@ void __printf(1, 2) p_err(const char *fmt, ...)
        va_end(ap);
 }
 
-void __printf(1, 2) p_info(const char *fmt, ...)
+void p_info(const char *fmt, ...)
 {
        va_list ap;
 
index 6046dcab51cc538b679428def22ff377a6fada7c..86501cd3c763e7fbaff97f1944da70f8b7d67183 100644 (file)
@@ -15,7 +15,6 @@
 #include <malloc.h>
 #include <inttypes.h>
 #include <stdint.h>
-#include <linux/compiler.h>
 
 #include "json_writer.h"
 
@@ -153,8 +152,7 @@ void jsonw_name(json_writer_t *self, const char *name)
                putc(' ', self->out);
 }
 
-void __printf(2, 0)
-jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
+void jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
 {
        jsonw_eor(self);
        putc('"', self->out);
@@ -162,7 +160,7 @@ jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap)
        putc('"', self->out);
 }
 
-void __printf(2, 3) jsonw_printf(json_writer_t *self, const char *fmt, ...)
+void jsonw_printf(json_writer_t *self, const char *fmt, ...)
 {
        va_list ap;
 
index cb9a1993681c6e3997e07343b3d5c9a4a68b9e6e..35cf1f00f96cadb463ad65c725df2d314bfd866c 100644 (file)
@@ -14,6 +14,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdarg.h>
+#include <linux/compiler.h>
 
 /* Opaque class structure */
 typedef struct json_writer json_writer_t;
@@ -30,8 +31,9 @@ void jsonw_pretty(json_writer_t *self, bool on);
 void jsonw_name(json_writer_t *self, const char *name);
 
 /* Add value  */
-void jsonw_vprintf_enquote(json_writer_t *self, const char *fmt, va_list ap);
-void jsonw_printf(json_writer_t *self, const char *fmt, ...);
+void __printf(2, 0) jsonw_vprintf_enquote(json_writer_t *self, const char *fmt,
+                                         va_list ap);
+void __printf(2, 3) jsonw_printf(json_writer_t *self, const char *fmt, ...);
 void jsonw_string(json_writer_t *self, const char *value);
 void jsonw_bool(json_writer_t *self, bool value);
 void jsonw_float(json_writer_t *self, double number);
index e916ff25697f1917e87d04814599c0b0bc01ece9..93d008687020cdac64f931ca22f911e6019d38ea 100644 (file)
@@ -139,7 +139,7 @@ int detect_common_prefix(const char *arg, ...)
        strncat(msg, "'", sizeof(msg) - strlen(msg) - 1);
 
        if (count >= 2) {
-               p_err(msg);
+               p_err("%s", msg);
                return -1;
        }
 
index 7031a4bf87a020716df5eb20fa0c641c423de9d7..af9ad56c303a1dff6359c6af2530fcdbf60e6fb4 100644 (file)
@@ -98,8 +98,8 @@ extern int bpf_flags;
 extern struct pinned_obj_table prog_table;
 extern struct pinned_obj_table map_table;
 
-void p_err(const char *fmt, ...);
-void p_info(const char *fmt, ...);
+void __printf(1, 2) p_err(const char *fmt, ...);
+void __printf(1, 2) p_info(const char *fmt, ...);
 
 bool is_prefix(const char *pfx, const char *str);
 int detect_common_prefix(const char *arg, ...);
index bfbbc6b4cb83c7c7db9029cebbd5f49118fb66a3..de61d73b9030b27b5d993040afb6738f416a6c72 100644 (file)
@@ -481,9 +481,11 @@ static int parse_elem(char **argv, struct bpf_map_info *info,
 
 static int show_map_close_json(int fd, struct bpf_map_info *info)
 {
-       char *memlock;
+       char *memlock, *frozen_str;
+       int frozen = 0;
 
        memlock = get_fdinfo(fd, "memlock");
+       frozen_str = get_fdinfo(fd, "frozen");
 
        jsonw_start_object(json_wtr);
 
@@ -533,6 +535,12 @@ static int show_map_close_json(int fd, struct bpf_map_info *info)
        }
        close(fd);
 
+       if (frozen_str) {
+               frozen = atoi(frozen_str);
+               free(frozen_str);
+       }
+       jsonw_int_field(json_wtr, "frozen", frozen);
+
        if (info->btf_id)
                jsonw_int_field(json_wtr, "btf_id", info->btf_id);
 
@@ -555,9 +563,11 @@ static int show_map_close_json(int fd, struct bpf_map_info *info)
 
 static int show_map_close_plain(int fd, struct bpf_map_info *info)
 {
-       char *memlock;
+       char *memlock, *frozen_str;
+       int frozen = 0;
 
        memlock = get_fdinfo(fd, "memlock");
+       frozen_str = get_fdinfo(fd, "frozen");
 
        printf("%u: ", info->id);
        if (info->type < ARRAY_SIZE(map_type_name))
@@ -610,9 +620,23 @@ static int show_map_close_plain(int fd, struct bpf_map_info *info)
                                printf("\n\tpinned %s", obj->path);
                }
        }
+       printf("\n");
+
+       if (frozen_str) {
+               frozen = atoi(frozen_str);
+               free(frozen_str);
+       }
+
+       if (!info->btf_id && !frozen)
+               return 0;
+
+       printf("\t");
 
        if (info->btf_id)
-               printf("\n\tbtf_id %d", info->btf_id);
+               printf("btf_id %d", info->btf_id);
+
+       if (frozen)
+               printf("%sfrozen", info->btf_id ? "  " : "");
 
        printf("\n");
        return 0;
@@ -1238,6 +1262,35 @@ exit_free:
        return err;
 }
 
+static int do_freeze(int argc, char **argv)
+{
+       int err, fd;
+
+       if (!REQ_ARGS(2))
+               return -1;
+
+       fd = map_parse_fd(&argc, &argv);
+       if (fd < 0)
+               return -1;
+
+       if (argc) {
+               close(fd);
+               return BAD_ARG();
+       }
+
+       err = bpf_map_freeze(fd);
+       close(fd);
+       if (err) {
+               p_err("failed to freeze map: %s", strerror(errno));
+               return err;
+       }
+
+       if (json_output)
+               jsonw_null(json_wtr);
+
+       return 0;
+}
+
 static int do_help(int argc, char **argv)
 {
        if (json_output) {
@@ -1262,6 +1315,7 @@ static int do_help(int argc, char **argv)
                "       %s %s pop        MAP\n"
                "       %s %s enqueue    MAP value VALUE\n"
                "       %s %s dequeue    MAP\n"
+               "       %s %s freeze     MAP\n"
                "       %s %s help\n"
                "\n"
                "       " HELP_SPEC_MAP "\n"
@@ -1280,7 +1334,8 @@ static int do_help(int argc, char **argv)
                bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
                bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
                bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
-               bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
+               bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+               bin_name, argv[-2]);
 
        return 0;
 }
@@ -1302,6 +1357,7 @@ static const struct cmd cmds[] = {
        { "enqueue",    do_update },
        { "pop",        do_pop_dequeue },
        { "dequeue",    do_pop_dequeue },
+       { "freeze",     do_freeze },
        { 0 }
 };
 
index 3f108ab177973761a03601e8d69da0719e3bd052..4c5531d1a45002e79f2e0b2de48317bb408d9988 100644 (file)
@@ -157,7 +157,7 @@ int do_event_pipe(int argc, char **argv)
                        NEXT_ARG();
                        ctx.cpu = strtoul(*argv, &endptr, 0);
                        if (*endptr) {
-                               p_err("can't parse %s as CPU ID", **argv);
+                               p_err("can't parse %s as CPU ID", *argv);
                                goto err_close_map;
                        }
 
@@ -168,7 +168,7 @@ int do_event_pipe(int argc, char **argv)
                        NEXT_ARG();
                        ctx.idx = strtoul(*argv, &endptr, 0);
                        if (*endptr) {
-                               p_err("can't parse %s as index", **argv);
+                               p_err("can't parse %s as index", *argv);
                                goto err_close_map;
                        }
 
index 67e99c56bc88c0e4684755f97a562020c0731bf0..4f52d31516166f0625af6baaaf8847c916ff0a93 100644 (file)
@@ -55,6 +55,35 @@ struct bpf_attach_info {
        __u32 flow_dissector_id;
 };
 
+enum net_attach_type {
+       NET_ATTACH_TYPE_XDP,
+       NET_ATTACH_TYPE_XDP_GENERIC,
+       NET_ATTACH_TYPE_XDP_DRIVER,
+       NET_ATTACH_TYPE_XDP_OFFLOAD,
+};
+
+static const char * const attach_type_strings[] = {
+       [NET_ATTACH_TYPE_XDP]           = "xdp",
+       [NET_ATTACH_TYPE_XDP_GENERIC]   = "xdpgeneric",
+       [NET_ATTACH_TYPE_XDP_DRIVER]    = "xdpdrv",
+       [NET_ATTACH_TYPE_XDP_OFFLOAD]   = "xdpoffload",
+};
+
+static const size_t net_attach_type_size = ARRAY_SIZE(attach_type_strings);
+
+static enum net_attach_type parse_attach_type(const char *str)
+{
+       enum net_attach_type type;
+
+       for (type = 0; type < net_attach_type_size; type++) {
+               if (attach_type_strings[type] &&
+                   is_prefix(str, attach_type_strings[type]))
+                       return type;
+       }
+
+       return net_attach_type_size;
+}
+
 static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb)
 {
        struct bpf_netdev_t *netinfo = cookie;
@@ -197,7 +226,7 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info)
 
        fd = open("/proc/self/ns/net", O_RDONLY);
        if (fd < 0) {
-               p_err("can't open /proc/self/ns/net: %d",
+               p_err("can't open /proc/self/ns/net: %s",
                      strerror(errno));
                return -1;
        }
@@ -223,6 +252,134 @@ static int query_flow_dissector(struct bpf_attach_info *attach_info)
        return 0;
 }
 
+static int net_parse_dev(int *argc, char ***argv)
+{
+       int ifindex;
+
+       if (is_prefix(**argv, "dev")) {
+               NEXT_ARGP();
+
+               ifindex = if_nametoindex(**argv);
+               if (!ifindex)
+                       p_err("invalid devname %s", **argv);
+
+               NEXT_ARGP();
+       } else {
+               p_err("expected 'dev', got: '%s'?", **argv);
+               return -1;
+       }
+
+       return ifindex;
+}
+
+static int do_attach_detach_xdp(int progfd, enum net_attach_type attach_type,
+                               int ifindex, bool overwrite)
+{
+       __u32 flags = 0;
+
+       if (!overwrite)
+               flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+       if (attach_type == NET_ATTACH_TYPE_XDP_GENERIC)
+               flags |= XDP_FLAGS_SKB_MODE;
+       if (attach_type == NET_ATTACH_TYPE_XDP_DRIVER)
+               flags |= XDP_FLAGS_DRV_MODE;
+       if (attach_type == NET_ATTACH_TYPE_XDP_OFFLOAD)
+               flags |= XDP_FLAGS_HW_MODE;
+
+       return bpf_set_link_xdp_fd(ifindex, progfd, flags);
+}
+
+static int do_attach(int argc, char **argv)
+{
+       enum net_attach_type attach_type;
+       int progfd, ifindex, err = 0;
+       bool overwrite = false;
+
+       /* parse attach args */
+       if (!REQ_ARGS(5))
+               return -EINVAL;
+
+       attach_type = parse_attach_type(*argv);
+       if (attach_type == net_attach_type_size) {
+               p_err("invalid net attach/detach type: %s", *argv);
+               return -EINVAL;
+       }
+       NEXT_ARG();
+
+       progfd = prog_parse_fd(&argc, &argv);
+       if (progfd < 0)
+               return -EINVAL;
+
+       ifindex = net_parse_dev(&argc, &argv);
+       if (ifindex < 1) {
+               close(progfd);
+               return -EINVAL;
+       }
+
+       if (argc) {
+               if (is_prefix(*argv, "overwrite")) {
+                       overwrite = true;
+               } else {
+                       p_err("expected 'overwrite', got: '%s'?", *argv);
+                       close(progfd);
+                       return -EINVAL;
+               }
+       }
+
+       /* attach xdp prog */
+       if (is_prefix("xdp", attach_type_strings[attach_type]))
+               err = do_attach_detach_xdp(progfd, attach_type, ifindex,
+                                          overwrite);
+
+       if (err < 0) {
+               p_err("interface %s attach failed: %s",
+                     attach_type_strings[attach_type], strerror(-err));
+               return err;
+       }
+
+       if (json_output)
+               jsonw_null(json_wtr);
+
+       return 0;
+}
+
+static int do_detach(int argc, char **argv)
+{
+       enum net_attach_type attach_type;
+       int progfd, ifindex, err = 0;
+
+       /* parse detach args */
+       if (!REQ_ARGS(3))
+               return -EINVAL;
+
+       attach_type = parse_attach_type(*argv);
+       if (attach_type == net_attach_type_size) {
+               p_err("invalid net attach/detach type: %s", *argv);
+               return -EINVAL;
+       }
+       NEXT_ARG();
+
+       ifindex = net_parse_dev(&argc, &argv);
+       if (ifindex < 1)
+               return -EINVAL;
+
+       /* detach xdp prog */
+       progfd = -1;
+       if (is_prefix("xdp", attach_type_strings[attach_type]))
+               err = do_attach_detach_xdp(progfd, attach_type, ifindex, false);
+
+       if (err < 0) {
+               p_err("interface %s detach failed: %s",
+                     attach_type_strings[attach_type], strerror(-err));
+               return err;
+       }
+
+       if (json_output)
+               jsonw_null(json_wtr);
+
+       return 0;
+}
+
 static int do_show(int argc, char **argv)
 {
        struct bpf_attach_info attach_info = {};
@@ -232,13 +389,9 @@ static int do_show(int argc, char **argv)
        char err_buf[256];
 
        if (argc == 2) {
-               if (strcmp(argv[0], "dev") != 0)
-                       usage();
-               filter_idx = if_nametoindex(argv[1]);
-               if (filter_idx == 0) {
-                       fprintf(stderr, "invalid dev name %s\n", argv[1]);
+               filter_idx = net_parse_dev(&argc, &argv);
+               if (filter_idx < 1)
                        return -1;
-               }
        } else if (argc != 0) {
                usage();
        }
@@ -305,13 +458,20 @@ static int do_help(int argc, char **argv)
 
        fprintf(stderr,
                "Usage: %s %s { show | list } [dev <devname>]\n"
+               "       %s %s attach ATTACH_TYPE PROG dev <devname> [ overwrite ]\n"
+               "       %s %s detach ATTACH_TYPE dev <devname>\n"
                "       %s %s help\n"
+               "\n"
+               "       " HELP_SPEC_PROGRAM "\n"
+               "       ATTACH_TYPE := { xdp | xdpgeneric | xdpdrv | xdpoffload }\n"
+               "\n"
                "Note: Only xdp and tc attachments are supported now.\n"
                "      For progs attached to cgroups, use \"bpftool cgroup\"\n"
                "      to dump program attachments. For program types\n"
                "      sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n"
                "      consult iproute2.\n",
-               bin_name, argv[-2], bin_name, argv[-2]);
+               bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+               bin_name, argv[-2]);
 
        return 0;
 }
@@ -319,6 +479,8 @@ static int do_help(int argc, char **argv)
 static const struct cmd cmds[] = {
        { "show",       do_show },
        { "list",       do_show },
+       { "attach",     do_attach },
+       { "detach",     do_detach },
        { "help",       do_help },
        { 0 }
 };
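
The attach path above boils down to a single libbpf call. A minimal sketch of driving it directly from C (object file and interface name are hypothetical):

#include <errno.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <linux/if_link.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	int ifindex, prog_fd, err;

	ifindex = if_nametoindex("eth0");	/* hypothetical device */
	if (!ifindex)
		return 1;

	/* Load the first program from a hypothetical object file. */
	err = bpf_prog_load("xdp_prog.o", BPF_PROG_TYPE_XDP, &obj, &prog_fd);
	if (err)
		return 1;

	/* Driver mode, failing instead of replacing an existing program,
	 * which mirrors bpftool's default (no "overwrite" keyword).
	 */
	err = bpf_set_link_xdp_fd(ifindex, prog_fd,
				  XDP_FLAGS_DRV_MODE |
				  XDP_FLAGS_UPDATE_IF_NOEXIST);
	if (err < 0)
		fprintf(stderr, "attach: %s\n", strerror(-err));
	return err ? 1 : 0;
}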
index f2a545e667c4e35a44eeca32c54faeaae17a6fbf..b2046f33e23f1b719a617f7a66f96d5bb13cb934 100644 (file)
@@ -104,6 +104,8 @@ static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type,
                jsonw_string_field(json_wtr, "filename", buf);
                jsonw_lluint_field(json_wtr, "offset", probe_offset);
                break;
+       default:
+               break;
        }
        jsonw_end_object(json_wtr);
 }
@@ -140,6 +142,8 @@ static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type,
                printf("uretprobe  filename %s  offset %llu\n", buf,
                       probe_offset);
                break;
+       default:
+               break;
        }
 }
 
index 0d35f18006a136b4578f2298243e3728a067c272..95c072b70d0e832f70a4ac07470bcd0d1ed46deb 100644 (file)
@@ -6,9 +6,11 @@
 /*
  * Common definitions for all gcc versions go here.
  */
+#ifndef GCC_VERSION
 #define GCC_VERSION (__GNUC__ * 10000          \
                     + __GNUC_MINOR__ * 100     \
                     + __GNUC_PATCHLEVEL__)
+#endif
 
 #if GCC_VERSION >= 70000 && !defined(__CHECKER__)
 # define __fallthrough __attribute__ ((fallthrough))
index 0e66371bea13fdb93411c87aeb08a88615b5f8bb..77c6be96d676222e446d41d2668b40cafb0ef1fe 100644 (file)
@@ -106,6 +106,7 @@ enum bpf_cmd {
        BPF_TASK_FD_QUERY,
        BPF_MAP_LOOKUP_AND_DELETE_ELEM,
        BPF_MAP_FREEZE,
+       BPF_BTF_GET_NEXT_ID,
 };
 
 enum bpf_map_type {
@@ -284,6 +285,9 @@ enum bpf_attach_type {
  */
 #define BPF_F_TEST_RND_HI32    (1U << 2)
 
+/* The verifier's internal test flag. Behavior is undefined. */
+#define BPF_F_TEST_STATE_FREQ  (1U << 3)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * two extensions:
  *
@@ -337,6 +341,9 @@ enum bpf_attach_type {
 #define BPF_F_RDONLY_PROG      (1U << 7)
 #define BPF_F_WRONLY_PROG      (1U << 8)
 
+/* Clone map from listener for newly accepted socket */
+#define BPF_F_CLONE            (1U << 9)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE  (1U << 0)
 
@@ -576,6 +583,8 @@ union bpf_attr {
  *             limited to five).
  *
  *             Each time the helper is called, it appends a line to the trace.
+ *             Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ *             open; use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
  *             The format of the trace is customizable, and the exact output
  *             one will get depends on the options set in
  *             *\/sys/kernel/debug/tracing/trace_options* (see also the
@@ -1014,7 +1023,7 @@ union bpf_attr {
  *             The realm of the route for the packet associated to *skb*, or 0
  *             if none was found.
  *
- * int bpf_perf_event_output(struct pt_reg *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
  *     Description
  *             Write raw *data* blob into a special BPF perf event held by
  *             *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
@@ -1076,7 +1085,7 @@ union bpf_attr {
  *     Return
  *             0 on success, or a negative error in case of failure.
  *
- * int bpf_get_stackid(struct pt_reg *ctx, struct bpf_map *map, u64 flags)
+ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags)
  *     Description
  *             Walk a user or a kernel stack and return its id. To achieve
  *             this, the helper needs *ctx*, which is a pointer to the context
@@ -1725,7 +1734,7 @@ union bpf_attr {
  *     Return
  *             0 on success, or a negative error in case of failure.
  *
- * int bpf_override_return(struct pt_reg *regs, u64 rc)
+ * int bpf_override_return(struct pt_regs *regs, u64 rc)
  *     Description
  *             Used for error injection, this helper uses kprobes to override
  *             the return value of the probed function, and to set it to *rc*.
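
Among the flag additions above, BPF_F_CLONE applies to BPF_MAP_TYPE_SK_STORAGE maps: with the flag set, storage attached to a listening socket is copied to child sockets created by accept(). A minimal BPF-side sketch of such a map definition (the map and value names are hypothetical; bpf_map_def and SEC come from the selftests' bpf_helpers.h):

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct sk_stg {
	__u64 accept_ts;	/* hypothetical per-socket value */
};

/* sk_storage maps require BPF_F_NO_PREALLOC and max_entries == 0;
 * BPF_F_CLONE additionally propagates the storage across accept().
 */
struct bpf_map_def SEC("maps") sk_stg_map = {
	.type = BPF_MAP_TYPE_SK_STORAGE,
	.key_size = sizeof(int),
	.value_size = sizeof(struct sk_stg),
	.max_entries = 0,
	.map_flags = BPF_F_NO_PREALLOC | BPF_F_CLONE,
};

char _license[] SEC("license") = "GPL";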
index faaa5ca2a11767a3cfd967661f21645799b95f9c..be328c59389d56861f95aeb488860ed81ef19e0c 100644 (file)
 #define XDP_SHARED_UMEM        (1 << 0)
 #define XDP_COPY       (1 << 1) /* Force copy-mode */
 #define XDP_ZEROCOPY   (1 << 2) /* Force zero-copy mode */
+/* If this option is set, the driver might go to sleep and in that case
+ * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be
+ * set. When that flag is set, the application needs to explicitly wake
+ * up the driver with a poll() (Rx and Tx) or sendto() (Tx only). If you
+ * are running the driver and the application on the same core, you
+ * should use this option so that the kernel will yield to the user
+ * space application.
+ */
+#define XDP_USE_NEED_WAKEUP (1 << 3)
+
+/* Flag bits for the xsk_umem_config flags field */
+#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
 
 struct sockaddr_xdp {
        __u16 sxdp_family;
@@ -25,10 +37,14 @@ struct sockaddr_xdp {
        __u32 sxdp_shared_umem_fd;
 };
 
+/* XDP_RING flags */
+#define XDP_RING_NEED_WAKEUP (1 << 0)
+
 struct xdp_ring_offset {
        __u64 producer;
        __u64 consumer;
        __u64 desc;
+       __u64 flags;
 };
 
 struct xdp_mmap_offsets {
@@ -53,6 +69,7 @@ struct xdp_umem_reg {
        __u64 len; /* Length of packet data area */
        __u32 chunk_size;
        __u32 headroom;
+       __u32 flags;
 };
 
 struct xdp_statistics {
@@ -74,6 +91,11 @@ struct xdp_options {
 #define XDP_UMEM_PGOFF_FILL_RING       0x100000000ULL
 #define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL
 
+/* Masks for unaligned chunks mode */
+#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48
+#define XSK_UNALIGNED_BUF_ADDR_MASK \
+       ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1)
+
 /* Rx/Tx descriptor */
 struct xdp_desc {
        __u64 addr;
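
In unaligned chunk mode, a descriptor's 64-bit addr carries two fields: the lower 48 bits hold the buffer's base address and the upper 16 bits an offset into it. A small worked sketch using the masks introduced above (the values are hypothetical):

#include <stdio.h>
#include <linux/if_xdp.h>

int main(void)
{
	__u64 base = 0x12340;	/* hypothetical chunk base address */
	__u64 off = 0x40;	/* hypothetical offset into the chunk */
	__u64 addr;

	/* Encode: offset in the upper 16 bits, base address in the
	 * lower 48 bits.
	 */
	addr = base | (off << XSK_UNALIGNED_BUF_OFFSET_SHIFT);

	/* Decode, mirroring what the kernel and libbpf helpers do. */
	printf("base=0x%llx off=0x%llx\n",
	       (unsigned long long)(addr & XSK_UNALIGNED_BUF_ADDR_MASK),
	       (unsigned long long)(addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT));
	return 0;
}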
index 9312066a1ae380b0ce9b9abb6c5923d86d047bd8..c6f94cffe06e106549b835e9970b6aeeb577c9a8 100644 (file)
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
 # Most of this file is copied from tools/lib/traceevent/Makefile
 
-BPF_VERSION = 0
-BPF_PATCHLEVEL = 0
-BPF_EXTRAVERSION = 4
+LIBBPF_VERSION := $(shell \
+       grep -oE '^LIBBPF_([0-9.]+)' libbpf.map | \
+       sort -rV | head -n1 | cut -d'_' -f2)
+LIBBPF_MAJOR_VERSION := $(firstword $(subst ., ,$(LIBBPF_VERSION)))
 
 MAKEFLAGS += --no-print-directory
 
@@ -79,15 +80,9 @@ export prefix libdir src obj
 libdir_SQ = $(subst ','\'',$(libdir))
 libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
 
-VERSION                = $(BPF_VERSION)
-PATCHLEVEL     = $(BPF_PATCHLEVEL)
-EXTRAVERSION   = $(BPF_EXTRAVERSION)
-
 OBJ            = $@
 N              =
 
-LIBBPF_VERSION = $(BPF_VERSION).$(BPF_PATCHLEVEL).$(BPF_EXTRAVERSION)
-
 LIB_TARGET     = libbpf.a libbpf.so.$(LIBBPF_VERSION)
 LIB_FILE       = libbpf.a libbpf.so*
 PC_FILE                = libbpf.pc
@@ -113,6 +108,7 @@ override CFLAGS += -Werror -Wall
 override CFLAGS += -fPIC
 override CFLAGS += $(INCLUDES)
 override CFLAGS += -fvisibility=hidden
+override CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
 
 ifeq ($(VERBOSE),1)
   Q =
@@ -138,7 +134,9 @@ LIB_FILE    := $(addprefix $(OUTPUT),$(LIB_FILE))
 PC_FILE                := $(addprefix $(OUTPUT),$(PC_FILE))
 
 GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \
-                          awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}')
+                          cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' | \
+                          awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}' | \
+                          sort -u | wc -l)
 VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \
                              grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l)
 
@@ -178,10 +176,10 @@ $(BPF_IN): force elfdep bpfdep
 $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
 
 $(OUTPUT)libbpf.so.$(LIBBPF_VERSION): $(BPF_IN)
-       $(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(VERSION) \
+       $(QUIET_LINK)$(CC) --shared -Wl,-soname,libbpf.so.$(LIBBPF_MAJOR_VERSION) \
                                    -Wl,--version-script=$(VERSION_SCRIPT) $^ -lelf -o $@
        @ln -sf $(@F) $(OUTPUT)libbpf.so
-       @ln -sf $(@F) $(OUTPUT)libbpf.so.$(VERSION)
+       @ln -sf $(@F) $(OUTPUT)libbpf.so.$(LIBBPF_MAJOR_VERSION)
 
 $(OUTPUT)libbpf.a: $(BPF_IN)
        $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
@@ -205,6 +203,7 @@ check_abi: $(OUTPUT)libbpf.so
                     "Please make sure all LIBBPF_API symbols are"       \
                     "versioned in $(VERSION_SCRIPT)." >&2;              \
                readelf -s --wide $(OUTPUT)libbpf-in.o |                 \
+                   cut -d "@" -f1 | sed 's/_v[0-9]_[0-9]_[0-9].*//' |   \
                    awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {print $$8}'|   \
                    sort -u > $(OUTPUT)libbpf_global_syms.tmp;           \
                readelf -s --wide $(OUTPUT)libbpf.so |                   \
@@ -257,7 +256,8 @@ config-clean:
 
 clean:
        $(call QUIET_CLEAN, libbpf) $(RM) $(TARGETS) $(CXX_TEST_TARGET) \
-               *.o *~ *.a *.so *.so.$(VERSION) .*.d .*.cmd *.pc LIBBPF-CFLAGS
+               *.o *~ *.a *.so *.so.$(LIBBPF_MAJOR_VERSION) .*.d .*.cmd \
+               *.pc LIBBPF-CFLAGS
        $(call QUIET_CLEAN, core-gen) $(RM) $(OUTPUT)FEATURE-DUMP.libbpf
 
 
index c7d7993c44bb0e1b6f7cdef1dc747050fdd0dbe9..cbb933532981f0923b2d37ac3acacf1d5e428b46 100644 (file)
@@ -568,7 +568,7 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr)
        return ret;
 }
 
-int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
+static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd)
 {
        union bpf_attr attr;
        int err;
@@ -576,26 +576,26 @@ int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
        memset(&attr, 0, sizeof(attr));
        attr.start_id = start_id;
 
-       err = sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
+       err = sys_bpf(cmd, &attr, sizeof(attr));
        if (!err)
                *next_id = attr.next_id;
 
        return err;
 }
 
-int bpf_map_get_next_id(__u32 start_id, __u32 *next_id)
+int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
 {
-       union bpf_attr attr;
-       int err;
-
-       memset(&attr, 0, sizeof(attr));
-       attr.start_id = start_id;
+       return bpf_obj_get_next_id(start_id, next_id, BPF_PROG_GET_NEXT_ID);
+}
 
-       err = sys_bpf(BPF_MAP_GET_NEXT_ID, &attr, sizeof(attr));
-       if (!err)
-               *next_id = attr.next_id;
+int bpf_map_get_next_id(__u32 start_id, __u32 *next_id)
+{
+       return bpf_obj_get_next_id(start_id, next_id, BPF_MAP_GET_NEXT_ID);
+}
 
-       return err;
+int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id)
+{
+       return bpf_obj_get_next_id(start_id, next_id, BPF_BTF_GET_NEXT_ID);
 }
 
 int bpf_prog_get_fd_by_id(__u32 id)
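
With the new wrapper in place, enumerating all BTF objects loaded in the kernel follows the same id-iteration pattern as programs and maps. A minimal sketch:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>

int main(void)
{
	__u32 id = 0;

	while (!bpf_btf_get_next_id(id, &id)) {
		int fd = bpf_btf_get_fd_by_id(id);

		if (fd < 0)
			continue;	/* object may have been released */
		printf("btf id %u\n", id);
		close(fd);
	}
	/* ENOENT is the normal end-of-list condition. */
	if (errno != ENOENT)
		perror("bpf_btf_get_next_id");
	return 0;
}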
index ff42ca043dc8fc1e0e94eaf1334367ec4585b543..0db01334740f8d1961a54bb28d8cb7f11d8ea9f6 100644 (file)
@@ -156,6 +156,7 @@ LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data,
                                 __u32 *retval, __u32 *duration);
 LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id);
 LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id);
+LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id);
 LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id);
 LIBBPF_API int bpf_map_get_fd_by_id(__u32 id);
 LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id);
index f9d316e873d8d2d7ff37d138fdef0cbbbd425bcd..d04c7cb623ed01494304f2cc9425f5a2561644ab 100644 (file)
@@ -183,4 +183,10 @@ LIBBPF_0.0.4 {
                perf_buffer__new;
                perf_buffer__new_raw;
                perf_buffer__poll;
+               xsk_umem__create;
 } LIBBPF_0.0.3;
+
+LIBBPF_0.0.5 {
+       global:
+               bpf_btf_get_next_id;
+} LIBBPF_0.0.4;
index 680e63066cf39c7f3bd06cdf645b05065060728e..842c4fd558592183a6ce0a007acfa68ebbcf7237 100644 (file)
@@ -74,23 +74,6 @@ struct xsk_nl_info {
        int fd;
 };
 
-/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit.
- * Unfortunately, it is not part of glibc.
- */
-static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags,
-                            int fd, __u64 offset)
-{
-#ifdef __NR_mmap2
-       unsigned int page_shift = __builtin_ffs(getpagesize()) - 1;
-       long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd,
-                          (off_t)(offset >> page_shift));
-
-       return (void *)ret;
-#else
-       return mmap(addr, length, prot, flags, fd, offset);
-#endif
-}
-
 int xsk_umem__fd(const struct xsk_umem *umem)
 {
        return umem ? umem->fd : -EINVAL;
@@ -116,6 +99,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
                cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
                cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
                cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
+               cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
                return;
        }
 
@@ -123,6 +107,7 @@ static void xsk_set_umem_config(struct xsk_umem_config *cfg,
        cfg->comp_size = usr_cfg->comp_size;
        cfg->frame_size = usr_cfg->frame_size;
        cfg->frame_headroom = usr_cfg->frame_headroom;
+       cfg->flags = usr_cfg->flags;
 }
 
 static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
@@ -149,9 +134,10 @@ static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
        return 0;
 }
 
-int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
-                    struct xsk_ring_prod *fill, struct xsk_ring_cons *comp,
-                    const struct xsk_umem_config *usr_config)
+int xsk_umem__create_v0_0_4(struct xsk_umem **umem_ptr, void *umem_area,
+                           __u64 size, struct xsk_ring_prod *fill,
+                           struct xsk_ring_cons *comp,
+                           const struct xsk_umem_config *usr_config)
 {
        struct xdp_mmap_offsets off;
        struct xdp_umem_reg mr;
@@ -182,6 +168,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
        mr.len = size;
        mr.chunk_size = umem->config.frame_size;
        mr.headroom = umem->config.frame_headroom;
+       mr.flags = umem->config.flags;
 
        err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
        if (err) {
@@ -210,10 +197,9 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
                goto out_socket;
        }
 
-       map = xsk_mmap(NULL, off.fr.desc +
-                      umem->config.fill_size * sizeof(__u64),
-                      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
-                      umem->fd, XDP_UMEM_PGOFF_FILL_RING);
+       map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
+                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd,
+                  XDP_UMEM_PGOFF_FILL_RING);
        if (map == MAP_FAILED) {
                err = -errno;
                goto out_socket;
@@ -224,13 +210,13 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
        fill->size = umem->config.fill_size;
        fill->producer = map + off.fr.producer;
        fill->consumer = map + off.fr.consumer;
+       fill->flags = map + off.fr.flags;
        fill->ring = map + off.fr.desc;
        fill->cached_cons = umem->config.fill_size;
 
-       map = xsk_mmap(NULL,
-                      off.cr.desc + umem->config.comp_size * sizeof(__u64),
-                      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
-                      umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING);
+       map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
+                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, umem->fd,
+                  XDP_UMEM_PGOFF_COMPLETION_RING);
        if (map == MAP_FAILED) {
                err = -errno;
                goto out_mmap;
@@ -241,6 +227,7 @@ int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size,
        comp->size = umem->config.comp_size;
        comp->producer = map + off.cr.producer;
        comp->consumer = map + off.cr.consumer;
+       comp->flags = map + off.cr.flags;
        comp->ring = map + off.cr.desc;
 
        *umem_ptr = umem;
@@ -255,6 +242,29 @@ out_umem_alloc:
        return err;
 }
 
+struct xsk_umem_config_v1 {
+       __u32 fill_size;
+       __u32 comp_size;
+       __u32 frame_size;
+       __u32 frame_headroom;
+};
+
+int xsk_umem__create_v0_0_2(struct xsk_umem **umem_ptr, void *umem_area,
+                           __u64 size, struct xsk_ring_prod *fill,
+                           struct xsk_ring_cons *comp,
+                           const struct xsk_umem_config *usr_config)
+{
+       struct xsk_umem_config config;
+
+       memcpy(&config, usr_config, sizeof(struct xsk_umem_config_v1));
+       config.flags = 0;
+
+       return xsk_umem__create_v0_0_4(umem_ptr, umem_area, size, fill, comp,
+                                       &config);
+}
+asm(".symver xsk_umem__create_v0_0_2, xsk_umem__create@LIBBPF_0.0.2");
+asm(".symver xsk_umem__create_v0_0_4, xsk_umem__create@@LIBBPF_0.0.4");
+
 static int xsk_load_xdp_prog(struct xsk_socket *xsk)
 {
        static const int log_buf_size = 16 * 1024;
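
The two .symver directives are what keep the ABI change compatible: binaries linked against the old four-field xsk_umem_config keep resolving xsk_umem__create to the LIBBPF_0.0.2 version, while new links bind to the default ("@@") LIBBPF_0.0.4 one. The same pattern in isolation, assuming a hypothetical library whose version script defines LIB_1.0 and LIB_2.0 nodes:

/* Old implementation, kept for binaries linked before the change. */
int foo_v1(int a)
{
	return a;
}
asm(".symver foo_v1, foo@LIB_1.0");

/* New default version; "@@" marks it as the one new links bind to. */
int foo_v2(int a, int b)
{
	return a + b;
}
asm(".symver foo_v2, foo@@LIB_2.0");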
@@ -550,11 +560,10 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
        }
 
        if (rx) {
-               rx_map = xsk_mmap(NULL, off.rx.desc +
-                                 xsk->config.rx_size * sizeof(struct xdp_desc),
-                                 PROT_READ | PROT_WRITE,
-                                 MAP_SHARED | MAP_POPULATE,
-                                 xsk->fd, XDP_PGOFF_RX_RING);
+               rx_map = mmap(NULL, off.rx.desc +
+                             xsk->config.rx_size * sizeof(struct xdp_desc),
+                             PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+                             xsk->fd, XDP_PGOFF_RX_RING);
                if (rx_map == MAP_FAILED) {
                        err = -errno;
                        goto out_socket;
@@ -564,16 +573,16 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
                rx->size = xsk->config.rx_size;
                rx->producer = rx_map + off.rx.producer;
                rx->consumer = rx_map + off.rx.consumer;
+               rx->flags = rx_map + off.rx.flags;
                rx->ring = rx_map + off.rx.desc;
        }
        xsk->rx = rx;
 
        if (tx) {
-               tx_map = xsk_mmap(NULL, off.tx.desc +
-                                 xsk->config.tx_size * sizeof(struct xdp_desc),
-                                 PROT_READ | PROT_WRITE,
-                                 MAP_SHARED | MAP_POPULATE,
-                                 xsk->fd, XDP_PGOFF_TX_RING);
+               tx_map = mmap(NULL, off.tx.desc +
+                             xsk->config.tx_size * sizeof(struct xdp_desc),
+                             PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+                             xsk->fd, XDP_PGOFF_TX_RING);
                if (tx_map == MAP_FAILED) {
                        err = -errno;
                        goto out_mmap_rx;
@@ -583,6 +592,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
                tx->size = xsk->config.tx_size;
                tx->producer = tx_map + off.tx.producer;
                tx->consumer = tx_map + off.tx.consumer;
+               tx->flags = tx_map + off.tx.flags;
                tx->ring = tx_map + off.tx.desc;
                tx->cached_cons = xsk->config.tx_size;
        }
index 833a6e60d065fc3128e052eccc693d454aba1a04..584f6820a6397aa6ce9da58751ff7952bedcb2f9 100644 (file)
@@ -32,6 +32,7 @@ struct name { \
        __u32 *producer; \
        __u32 *consumer; \
        void *ring; \
+       __u32 *flags; \
 }
 
 DEFINE_XSK_RING(xsk_ring_prod);
@@ -76,6 +77,11 @@ xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx)
        return &descs[idx & rx->mask];
 }
 
+static inline int xsk_ring_prod__needs_wakeup(const struct xsk_ring_prod *r)
+{
+       return *r->flags & XDP_RING_NEED_WAKEUP;
+}
+
 static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
 {
        __u32 free_entries = r->cached_cons - r->cached_prod;
@@ -162,6 +168,21 @@ static inline void *xsk_umem__get_data(void *umem_area, __u64 addr)
        return &((char *)umem_area)[addr];
 }
 
+static inline __u64 xsk_umem__extract_addr(__u64 addr)
+{
+       return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
+}
+
+static inline __u64 xsk_umem__extract_offset(__u64 addr)
+{
+       return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+}
+
+static inline __u64 xsk_umem__add_offset_to_addr(__u64 addr)
+{
+       return xsk_umem__extract_addr(addr) + xsk_umem__extract_offset(addr);
+}
+
 LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem);
 LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
 
@@ -170,12 +191,14 @@ LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
 #define XSK_UMEM__DEFAULT_FRAME_SHIFT    12 /* 4096 bytes */
 #define XSK_UMEM__DEFAULT_FRAME_SIZE     (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
 #define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
+#define XSK_UMEM__DEFAULT_FLAGS 0
 
 struct xsk_umem_config {
        __u32 fill_size;
        __u32 comp_size;
        __u32 frame_size;
        __u32 frame_headroom;
+       __u32 flags;
 };
 
 /* Flags for the libbpf_flags field. */
@@ -195,6 +218,16 @@ LIBBPF_API int xsk_umem__create(struct xsk_umem **umem,
                                struct xsk_ring_prod *fill,
                                struct xsk_ring_cons *comp,
                                const struct xsk_umem_config *config);
+LIBBPF_API int xsk_umem__create_v0_0_2(struct xsk_umem **umem,
+                                      void *umem_area, __u64 size,
+                                      struct xsk_ring_prod *fill,
+                                      struct xsk_ring_cons *comp,
+                                      const struct xsk_umem_config *config);
+LIBBPF_API int xsk_umem__create_v0_0_4(struct xsk_umem **umem,
+                                      void *umem_area, __u64 size,
+                                      struct xsk_ring_prod *fill,
+                                      struct xsk_ring_cons *comp,
+                                      const struct xsk_umem_config *config);
 LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk,
                                  const char *ifname, __u32 queue_id,
                                  struct xsk_umem *umem,
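
Combining the new ring flags field with the accessor above, a TX path only needs a syscall when the driver has actually gone to sleep. A minimal sketch of such a kick, assuming an already-created socket and TX ring:

#include <errno.h>
#include <sys/socket.h>
#include <bpf/xsk.h>

/* Kick the kernel side of an AF_XDP TX ring, but only when the driver
 * advertises XDP_RING_NEED_WAKEUP, i.e. it has stopped processing.
 */
static int kick_tx(struct xsk_socket *xsk, struct xsk_ring_prod *tx)
{
	if (!xsk_ring_prod__needs_wakeup(tx))
		return 0;	/* driver still running, no syscall needed */

	/* An empty sendto() on the AF_XDP fd wakes the driver for TX. */
	if (sendto(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT, NULL, 0) < 0 &&
	    errno != EAGAIN && errno != EBUSY)
		return -errno;
	return 0;
}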
index 90f70d2c7c22aa00b0874104fcc52c887bdbab64..60c9338cd9b411e589e3b5d1d13839bd4a1584b4 100644 (file)
@@ -42,4 +42,5 @@ xdping
 test_sockopt
 test_sockopt_sk
 test_sockopt_multi
+test_sockopt_inherit
 test_tcp_rtt
index d69c541e20390ce6b6678ab4b130ed62bc3a82d8..9eef5edf17be5ec8e30ffddb6ab7d1c4fb62831c 100644 (file)
@@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
        test_cgroup_storage test_select_reuseport test_section_names \
        test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
        test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
-       test_sockopt_multi test_tcp_rtt
+       test_sockopt_multi test_sockopt_inherit test_tcp_rtt
 
 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
 TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -66,7 +66,8 @@ TEST_PROGS := test_kmod.sh \
        test_tcp_check_syncookie.sh \
        test_tc_tunnel.sh \
        test_tc_edt.sh \
-       test_xdping.sh
+       test_xdping.sh \
+       test_bpftool_build.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
        with_tunnels.sh \
@@ -115,6 +116,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
 $(OUTPUT)/test_sockopt: cgroup_helpers.c
 $(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
 $(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
+$(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
 $(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
 
 .PHONY: force
index 05f036df8a4c51b7c32fd72f65b9766e13ac7689..fbe28008450fdffb37e8d6de0be94bd303f3efaf 100644 (file)
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
 #ifndef __BPF_ENDIAN__
 #define __BPF_ENDIAN__
 
 # define __bpf_htonl(x)                        __builtin_bswap32(x)
 # define __bpf_constant_ntohl(x)       ___constant_swab32(x)
 # define __bpf_constant_htonl(x)       ___constant_swab32(x)
+# define __bpf_be64_to_cpu(x)          __builtin_bswap64(x)
+# define __bpf_cpu_to_be64(x)          __builtin_bswap64(x)
+# define __bpf_constant_be64_to_cpu(x) ___constant_swab64(x)
+# define __bpf_constant_cpu_to_be64(x) ___constant_swab64(x)
 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 # define __bpf_ntohs(x)                        (x)
 # define __bpf_htons(x)                        (x)
 # define __bpf_htonl(x)                        (x)
 # define __bpf_constant_ntohl(x)       (x)
 # define __bpf_constant_htonl(x)       (x)
+# define __bpf_be64_to_cpu(x)          (x)
+# define __bpf_cpu_to_be64(x)          (x)
+# define __bpf_constant_be64_to_cpu(x)  (x)
+# define __bpf_constant_cpu_to_be64(x)  (x)
 #else
 # error "Fix your compiler's __BYTE_ORDER__?!"
 #endif
 #define bpf_ntohl(x)                           \
        (__builtin_constant_p(x) ?              \
         __bpf_constant_ntohl(x) : __bpf_ntohl(x))
+#define bpf_cpu_to_be64(x)                     \
+       (__builtin_constant_p(x) ?              \
+        __bpf_constant_cpu_to_be64(x) : __bpf_cpu_to_be64(x))
+#define bpf_be64_to_cpu(x)                     \
+       (__builtin_constant_p(x) ?              \
+        __bpf_constant_be64_to_cpu(x) : __bpf_be64_to_cpu(x))
 
 #endif /* __BPF_ENDIAN__ */
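
Like the existing 16/32-bit helpers, the 64-bit macros fold to a constant at compile time for literal arguments and emit a byte swap only for non-constant values on little-endian targets. A minimal BPF-side sketch (bpf_endian.h and bpf_helpers.h as shipped with the selftests):

#include <linux/bpf.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

SEC("socket")
int be64_demo(struct __sk_buff *skb)
{
	/* Literal argument: resolved at compile time, no bswap insn. */
	__u64 magic_be = bpf_cpu_to_be64(0x1122334455667788ULL);

	/* Non-constant argument: goes through __builtin_bswap64() on LE. */
	return bpf_be64_to_cpu(magic_be) == 0x1122334455667788ULL;
}

char _license[] SEC("license") = "GPL";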
index 8b503ea142f07aea2a395b636f4a3d32d4f40178..6c4930bc6e2ec9fc3263e916865e1fa5e8069eec 100644 (file)
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
 #ifndef __BPF_HELPERS_H
 #define __BPF_HELPERS_H
 
index fb5840a6254887eae6258ef6061c162c7df75a01..f10029821e1672f7038bc8f86ae43c393835b8f4 100644 (file)
@@ -48,16 +48,17 @@ void test_bpf_obj_id(void)
                /* test_obj_id.o is a dumb prog. It should never fail
                 * to load.
                 */
-               if (err)
-                       error_cnt++;
-               assert(!err);
+               if (CHECK_FAIL(err))
+                       continue;
 
                /* Insert a magic value to the map */
                map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id");
-               assert(map_fds[i] >= 0);
+               if (CHECK_FAIL(map_fds[i] < 0))
+                       goto done;
                err = bpf_map_update_elem(map_fds[i], &array_key,
                                          &array_magic_value, 0);
-               assert(!err);
+               if (CHECK_FAIL(err))
+                       goto done;
 
                /* Check getting map info */
                info_len = sizeof(struct bpf_map_info) * 2;
@@ -96,9 +97,11 @@ void test_bpf_obj_id(void)
                prog_infos[i].map_ids = ptr_to_u64(map_ids + i);
                prog_infos[i].nr_map_ids = 2;
                err = clock_gettime(CLOCK_REALTIME, &real_time_ts);
-               assert(!err);
+               if (CHECK_FAIL(err))
+                       goto done;
                err = clock_gettime(CLOCK_BOOTTIME, &boot_time_ts);
-               assert(!err);
+               if (CHECK_FAIL(err))
+                       goto done;
                err = bpf_obj_get_info_by_fd(prog_fds[i], &prog_infos[i],
                                             &info_len);
                load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec)
@@ -224,7 +227,8 @@ void test_bpf_obj_id(void)
                nr_id_found++;
 
                err = bpf_map_lookup_elem(map_fd, &array_key, &array_value);
-               assert(!err);
+               if (CHECK_FAIL(err))
+                       goto done;
 
                err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
                CHECK(err || info_len != sizeof(struct bpf_map_info) ||
index 1a1eae356f81b3f25f81d5cad76fd178c907025a..1c01ee2600a97ce02a0b622dc51f21c30b5c3e07 100644 (file)
@@ -28,8 +28,6 @@ static int check_load(const char *file, enum bpf_prog_type type)
        attr.prog_flags = BPF_F_TEST_RND_HI32;
        err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
        bpf_object__close(obj);
-       if (err)
-               error_cnt++;
        return err;
 }
 
@@ -105,12 +103,7 @@ void test_bpf_verif_scale(void)
                        continue;
 
                err = check_load(test->file, test->attach_type);
-               if (test->fails) { /* expected to fail */
-                       if (err)
-                               error_cnt--;
-                       else
-                               error_cnt++;
-               }
+               CHECK_FAIL(err && !test->fails);
        }
 
        if (env.verifier_stats)
index 6892b88ae0652404ad9d3fafce6a95f05251d239..92563898867cb690e8d8193756603f4a8683ee81 100644 (file)
@@ -344,7 +344,6 @@ struct test tests[] = {
                        .tcp.dest = 8080,
                },
                .keys = {
-                       .nhoff = 0,
                        .nhoff = ETH_HLEN,
                        .thoff = ETH_HLEN + sizeof(struct iphdr) +
                                sizeof(struct iphdr),
@@ -452,10 +451,8 @@ void test_flow_dissector(void)
 
        err = bpf_flow_load(&obj, "./bpf_flow.o", "flow_dissector",
                            "jmp_table", "last_dissection", &prog_fd, &keys_fd);
-       if (err) {
-               error_cnt++;
+       if (CHECK_FAIL(err))
                return;
-       }
 
        for (i = 0; i < ARRAY_SIZE(tests); i++) {
                struct bpf_flow_keys flow_keys;
index 3d59b3c841fee8def11939484b771dd5ad03b9e2..eba9a970703b6a0481eb749ed4e6dfa24f1f9d3e 100644 (file)
@@ -135,10 +135,7 @@ void test_get_stack_raw_tp(void)
                exp_cnt -= err;
        }
 
-       goto close_prog_noerr;
 close_prog:
-       error_cnt++;
-close_prog_noerr:
        if (!IS_ERR_OR_NULL(link))
                bpf_link__destroy(link);
        if (!IS_ERR_OR_NULL(pb))
index d011079fb0bfe6dbd6c963c342787fc25e97b819..c680926fce7384914aa34eb368b4c5ebc83c3c65 100644 (file)
@@ -7,10 +7,8 @@ static void test_global_data_number(struct bpf_object *obj, __u32 duration)
        uint64_t num;
 
        map_fd = bpf_find_map(__func__, obj, "result_number");
-       if (map_fd < 0) {
-               error_cnt++;
+       if (CHECK_FAIL(map_fd < 0))
                return;
-       }
 
        struct {
                char *name;
@@ -44,10 +42,8 @@ static void test_global_data_string(struct bpf_object *obj, __u32 duration)
        char str[32];
 
        map_fd = bpf_find_map(__func__, obj, "result_string");
-       if (map_fd < 0) {
-               error_cnt++;
+       if (CHECK_FAIL(map_fd < 0))
                return;
-       }
 
        struct {
                char *name;
@@ -81,10 +77,8 @@ static void test_global_data_struct(struct bpf_object *obj, __u32 duration)
        struct foo val;
 
        map_fd = bpf_find_map(__func__, obj, "result_struct");
-       if (map_fd < 0) {
-               error_cnt++;
+       if (CHECK_FAIL(map_fd < 0))
                return;
-       }
 
        struct {
                char *name;
@@ -112,16 +106,12 @@ static void test_global_data_rdonly(struct bpf_object *obj, __u32 duration)
        __u8 *buff;
 
        map = bpf_object__find_map_by_name(obj, "test_glo.rodata");
-       if (!map || !bpf_map__is_internal(map)) {
-               error_cnt++;
+       if (CHECK_FAIL(!map || !bpf_map__is_internal(map)))
                return;
-       }
 
        map_fd = bpf_map__fd(map);
-       if (map_fd < 0) {
-               error_cnt++;
+       if (CHECK_FAIL(map_fd < 0))
                return;
-       }
 
        buff = malloc(bpf_map__def(map)->value_size);
        if (buff)
index 20ddca830e6838284321523d087a0e83fb347962..eaf64595be8810aaaaf0a6fff5665ebf9a3f018e 100644 (file)
@@ -30,10 +30,8 @@ static void test_l4lb(const char *file)
        u32 *magic = (u32 *)buf;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
-       if (err) {
-               error_cnt++;
+       if (CHECK_FAIL(err))
                return;
-       }
 
        map_fd = bpf_find_map(__func__, obj, "vip_map");
        if (map_fd < 0)
@@ -72,10 +70,9 @@ static void test_l4lb(const char *file)
                bytes += stats[i].bytes;
                pkts += stats[i].pkts;
        }
-       if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) {
-               error_cnt++;
+       if (CHECK_FAIL(bytes != MAGIC_BYTES * NUM_ITER * 2 ||
+                      pkts != NUM_ITER * 2))
                printf("test_l4lb:FAIL:stats %lld %lld\n", bytes, pkts);
-       }
 out:
        bpf_object__close(obj);
 }
index ee99368c595ca0b0768ad7938212bc80977bacf1..8f91f1881d114b1e13cc239efb22180c28fcad88 100644 (file)
@@ -8,14 +8,12 @@ static void *parallel_map_access(void *arg)
 
        for (i = 0; i < 10000; i++) {
                err = bpf_map_lookup_elem_flags(map_fd, &key, vars, BPF_F_LOCK);
-               if (err) {
+               if (CHECK_FAIL(err)) {
                        printf("lookup failed\n");
-                       error_cnt++;
                        goto out;
                }
-               if (vars[0] != 0) {
+               if (CHECK_FAIL(vars[0] != 0)) {
                        printf("lookup #%d var[0]=%d\n", i, vars[0]);
-                       error_cnt++;
                        goto out;
                }
                rnd = vars[1];
@@ -24,7 +22,7 @@ static void *parallel_map_access(void *arg)
                                continue;
                        printf("lookup #%d var[1]=%d var[%d]=%d\n",
                               i, rnd, j, vars[j]);
-                       error_cnt++;
+                       CHECK_FAIL(vars[j] != rnd);
                        goto out;
                }
        }
@@ -42,34 +40,36 @@ void test_map_lock(void)
        void *ret;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
-       if (err) {
+       if (CHECK_FAIL(err)) {
                printf("test_map_lock:bpf_prog_load errno %d\n", errno);
                goto close_prog;
        }
        map_fd[0] = bpf_find_map(__func__, obj, "hash_map");
-       if (map_fd[0] < 0)
+       if (CHECK_FAIL(map_fd[0] < 0))
                goto close_prog;
        map_fd[1] = bpf_find_map(__func__, obj, "array_map");
-       if (map_fd[1] < 0)
+       if (CHECK_FAIL(map_fd[1] < 0))
                goto close_prog;
 
        bpf_map_update_elem(map_fd[0], &key, vars, BPF_F_LOCK);
 
        for (i = 0; i < 4; i++)
-               assert(pthread_create(&thread_id[i], NULL,
-                                     &spin_lock_thread, &prog_fd) == 0);
+               if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+                                             &spin_lock_thread, &prog_fd)))
+                       goto close_prog;
        for (i = 4; i < 6; i++)
-               assert(pthread_create(&thread_id[i], NULL,
-                                     &parallel_map_access, &map_fd[i - 4]) == 0);
+               if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+                                             &parallel_map_access,
+                                             &map_fd[i - 4])))
+                       goto close_prog;
        for (i = 0; i < 4; i++)
-               assert(pthread_join(thread_id[i], &ret) == 0 &&
-                      ret == (void *)&prog_fd);
+               if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+                              ret != (void *)&prog_fd))
+                       goto close_prog;
        for (i = 4; i < 6; i++)
-               assert(pthread_join(thread_id[i], &ret) == 0 &&
-                      ret == (void *)&map_fd[i - 4]);
-       goto close_prog_noerr;
+               if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+                              ret != (void *)&map_fd[i - 4]))
+                       goto close_prog;
 close_prog:
-       error_cnt++;
-close_prog_noerr:
        bpf_object__close(obj);
 }
index 4ecfd721a044bb6b53b616732219ffa21eb146b4..a2537dfa899c6707e6ac46d074de9621bc6d96f6 100644 (file)
@@ -9,10 +9,8 @@ void test_pkt_access(void)
        int err, prog_fd;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
-       if (err) {
-               error_cnt++;
+       if (CHECK_FAIL(err))
                return;
-       }
 
        err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4),
                                NULL, NULL, &retval, &duration);
index ac0d434358061a57eccd854fa1d1ad0c752d451c..5f7aea6050199bf8b2c0dffce6d0cbd424a005ab 100644 (file)
@@ -9,10 +9,8 @@ void test_pkt_md_access(void)
        int err, prog_fd;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
-       if (err) {
-               error_cnt++;
+       if (CHECK_FAIL(err))
                return;
-       }
 
        err = bpf_prog_test_run(prog_fd, 10, &pkt_v4, sizeof(pkt_v4),
                                NULL, NULL, &retval, &duration);
index e60cd5ff1f559543d05a2963967f4624f708d094..faccc66f4e3968af3c4efd0a03c33719f9d151dc 100644 (file)
@@ -27,10 +27,8 @@ static void test_queue_stack_map_by_type(int type)
                return;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
-       if (err) {
-               error_cnt++;
+       if (CHECK_FAIL(err))
                return;
-       }
 
        map_in_fd = bpf_find_map(__func__, obj, "map_in");
        if (map_in_fd < 0)
@@ -43,10 +41,8 @@ static void test_queue_stack_map_by_type(int type)
        /* Push 32 elements to the input map */
        for (i = 0; i < MAP_SIZE; i++) {
                err = bpf_map_update_elem(map_in_fd, NULL, &vals[i], 0);
-               if (err) {
-                       error_cnt++;
+               if (CHECK_FAIL(err))
                        goto out;
-               }
        }
 
        /* The eBPF program pushes iph.saddr in the output map,
index 4a4f428d1a78e1732c1e71384017b9b1379624df..5c78e2b5a917421d0f8c2d21a91584b3514379a7 100644 (file)
@@ -10,10 +10,8 @@ void test_reference_tracking(void)
        int err = 0;
 
        obj = bpf_object__open(file);
-       if (IS_ERR(obj)) {
-               error_cnt++;
+       if (CHECK_FAIL(IS_ERR(obj)))
                return;
-       }
 
        bpf_object__for_each_program(prog, obj) {
                const char *title;
index 1575f0a1f58659752e848848fe23e475daa16c9d..b607112c64e7ac90781b38b16c90921132f61dc6 100644 (file)
@@ -8,7 +8,7 @@ static void sigusr1_handler(int signum)
        sigusr1_received++;
 }
 
-static int test_send_signal_common(struct perf_event_attr *attr,
+static void test_send_signal_common(struct perf_event_attr *attr,
                                    int prog_type,
                                    const char *test_name)
 {
@@ -23,13 +23,13 @@ static int test_send_signal_common(struct perf_event_attr *attr,
 
        if (CHECK(pipe(pipe_c2p), test_name,
                  "pipe pipe_c2p error: %s\n", strerror(errno)))
-               goto no_fork_done;
+               return;
 
        if (CHECK(pipe(pipe_p2c), test_name,
                  "pipe pipe_p2c error: %s\n", strerror(errno))) {
                close(pipe_c2p[0]);
                close(pipe_c2p[1]);
-               goto no_fork_done;
+               return;
        }
 
        pid = fork();
@@ -38,7 +38,7 @@ static int test_send_signal_common(struct perf_event_attr *attr,
                close(pipe_c2p[1]);
                close(pipe_p2c[0]);
                close(pipe_p2c[1]);
-               goto no_fork_done;
+               return;
        }
 
        if (pid == 0) {
@@ -125,7 +125,7 @@ static int test_send_signal_common(struct perf_event_attr *attr,
                goto disable_pmu;
        }
 
-       err = CHECK(buf[0] != '2', test_name, "incorrect result\n");
+       CHECK(buf[0] != '2', test_name, "incorrect result\n");
 
        /* notify child safe to exit */
        write(pipe_p2c[1], buf, 1);
@@ -138,11 +138,9 @@ prog_load_failure:
        close(pipe_c2p[0]);
        close(pipe_p2c[1]);
        wait(NULL);
-no_fork_done:
-       return err;
 }
 
-static int test_send_signal_tracepoint(void)
+static void test_send_signal_tracepoint(void)
 {
        const char *id_path = "/sys/kernel/debug/tracing/events/syscalls/sys_enter_nanosleep/id";
        struct perf_event_attr attr = {
@@ -159,21 +157,21 @@ static int test_send_signal_tracepoint(void)
        if (CHECK(efd < 0, "tracepoint",
                  "open syscalls/sys_enter_nanosleep/id failure: %s\n",
                  strerror(errno)))
-               return -1;
+               return;
 
        bytes = read(efd, buf, sizeof(buf));
        close(efd);
        if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "tracepoint",
                  "read syscalls/sys_enter_nanosleep/id failure: %s\n",
                  strerror(errno)))
-               return -1;
+               return;
 
        attr.config = strtol(buf, NULL, 0);
 
-       return test_send_signal_common(&attr, BPF_PROG_TYPE_TRACEPOINT, "tracepoint");
+       test_send_signal_common(&attr, BPF_PROG_TYPE_TRACEPOINT, "tracepoint");
 }
 
-static int test_send_signal_perf(void)
+static void test_send_signal_perf(void)
 {
        struct perf_event_attr attr = {
                .sample_period = 1,
@@ -181,11 +179,11 @@ static int test_send_signal_perf(void)
                .config = PERF_COUNT_SW_CPU_CLOCK,
        };
 
-       return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT,
-                                      "perf_sw_event");
+       test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT,
+                               "perf_sw_event");
 }
 
-static int test_send_signal_nmi(void)
+static void test_send_signal_nmi(void)
 {
        struct perf_event_attr attr = {
                .sample_freq = 50,
@@ -204,25 +202,24 @@ static int test_send_signal_nmi(void)
                if (errno == ENOENT) {
                        printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n",
                               __func__);
-                       return 0;
+                       test__skip();
+                       return;
                }
                /* Let the test fail with a more informative message */
        } else {
                close(pmu_fd);
        }
 
-       return test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT,
-                                      "perf_hw_event");
+       test_send_signal_common(&attr, BPF_PROG_TYPE_PERF_EVENT,
+                               "perf_hw_event");
 }
 
 void test_send_signal(void)
 {
-       int ret = 0;
-
        if (test__start_subtest("send_signal_tracepoint"))
-               ret |= test_send_signal_tracepoint();
+               test_send_signal_tracepoint();
        if (test__start_subtest("send_signal_perf"))
-               ret |= test_send_signal_perf();
+               test_send_signal_perf();
        if (test__start_subtest("send_signal_nmi"))
-               ret |= test_send_signal_nmi();
+               test_send_signal_nmi();
 }
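
The int-to-void conversion above reflects the reworked harness contract: subtests no longer propagate return codes, they report failures through CHECK()/test__fail() and mark themselves skipped via the new test__skip(). A shape sketch under that assumption (probe_feature() and do_setup() are hypothetical stubs, not functions from the patch):

#include "test_progs.h"

static int probe_feature(void) { return 0; }	/* hypothetical probe */
static int do_setup(void) { return 0; }		/* hypothetical setup */

static void test_example_subtest(void)
{
	if (!probe_feature()) {
		test__skip();		/* counted in the SKIPPED summary */
		return;
	}
	if (CHECK_FAIL(do_setup()))
		return;			/* failure already recorded */
}

void test_example(void)
{
	if (test__start_subtest("example"))
		test_example_subtest();
}
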
index 114ebe6a438e562d864971a5a5d174b1e0936f8a..1ae00cd3174ef8723c98c88350b85ccad7466ac2 100644 (file)
@@ -11,19 +11,19 @@ void test_spinlock(void)
        void *ret;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
-       if (err) {
+       if (CHECK_FAIL(err)) {
                printf("test_spin_lock:bpf_prog_load errno %d\n", errno);
                goto close_prog;
        }
        for (i = 0; i < 4; i++)
-               assert(pthread_create(&thread_id[i], NULL,
-                                     &spin_lock_thread, &prog_fd) == 0);
+               if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+                                             &spin_lock_thread, &prog_fd)))
+                       goto close_prog;
+
        for (i = 0; i < 4; i++)
-               assert(pthread_join(thread_id[i], &ret) == 0 &&
-                      ret == (void *)&prog_fd);
-       goto close_prog_noerr;
+               if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+                              ret != (void *)&prog_fd))
+                       goto close_prog;
 close_prog:
-       error_cnt++;
-close_prog_noerr:
        bpf_object__close(obj);
 }
index ac44fda84833b4ca1f9a2b409cc8f7295de6ead8..d841dced971ff65458a4296573b2d56594a4291c 100644 (file)
@@ -51,9 +51,10 @@ retry:
                  "err %d errno %d\n", err, errno))
                goto disable_pmu;
 
-       assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
-              == 0);
-       assert(system("./urandom_read") == 0);
+       if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")))
+               goto disable_pmu;
+       if (CHECK_FAIL(system("./urandom_read")))
+               goto disable_pmu;
        /* disable stack trace collection */
        key = 0;
        val = 1;
index 9557b7dfb78270c46fdf38fb53980edfbc76d49b..f62aa0eb959bb0fc51bee1c2c3f66007818e311b 100644 (file)
@@ -82,9 +82,10 @@ retry:
                  "err %d errno %d\n", err, errno))
                goto disable_pmu;
 
-       assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
-              == 0);
-       assert(system("taskset 0x1 ./urandom_read 100000") == 0);
+       if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")))
+               goto disable_pmu;
+       if (CHECK_FAIL(system("taskset 0x1 ./urandom_read 100000")))
+               goto disable_pmu;
        /* disable stack trace collection */
        key = 0;
        val = 1;
index fc539335c5b3ecc14c953ffddcf16d92d56e2791..37269d23df93e6f5b26a20e190a0108db90e1633 100644 (file)
@@ -26,19 +26,19 @@ void test_stacktrace_map(void)
 
        /* find map fds */
        control_map_fd = bpf_find_map(__func__, obj, "control_map");
-       if (control_map_fd < 0)
+       if (CHECK_FAIL(control_map_fd < 0))
                goto disable_pmu;
 
        stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
-       if (stackid_hmap_fd < 0)
+       if (CHECK_FAIL(stackid_hmap_fd < 0))
                goto disable_pmu;
 
        stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
-       if (stackmap_fd < 0)
+       if (CHECK_FAIL(stackmap_fd < 0))
                goto disable_pmu;
 
        stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
-       if (stack_amap_fd < 0)
+       if (CHECK_FAIL(stack_amap_fd < 0))
                goto disable_pmu;
 
        /* give some time for bpf program run */
@@ -55,23 +55,20 @@ void test_stacktrace_map(void)
        err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
        if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
                  "err %d errno %d\n", err, errno))
-               goto disable_pmu_noerr;
+               goto disable_pmu;
 
        err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
        if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
                  "err %d errno %d\n", err, errno))
-               goto disable_pmu_noerr;
+               goto disable_pmu;
 
        stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
        err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
        if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
                  "err %d errno %d\n", err, errno))
-               goto disable_pmu_noerr;
+               goto disable_pmu;
 
-       goto disable_pmu_noerr;
 disable_pmu:
-       error_cnt++;
-disable_pmu_noerr:
        bpf_link__destroy(link);
 close_prog:
        bpf_object__close(obj);
index fbfa8e76cf631b36afea5d322fcf91e13c9cebe9..404a5498e1a35705ab3c47aa6cefe65d12b5bb83 100644 (file)
@@ -26,15 +26,15 @@ void test_stacktrace_map_raw_tp(void)
 
        /* find map fds */
        control_map_fd = bpf_find_map(__func__, obj, "control_map");
-       if (control_map_fd < 0)
+       if (CHECK_FAIL(control_map_fd < 0))
                goto close_prog;
 
        stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
-       if (stackid_hmap_fd < 0)
+       if (CHECK_FAIL(stackid_hmap_fd < 0))
                goto close_prog;
 
        stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
-       if (stackmap_fd < 0)
+       if (CHECK_FAIL(stackmap_fd < 0))
                goto close_prog;
 
        /* give some time for bpf program run */
@@ -58,10 +58,7 @@ void test_stacktrace_map_raw_tp(void)
                  "err %d errno %d\n", err, errno))
                goto close_prog;
 
-       goto close_prog_noerr;
 close_prog:
-       error_cnt++;
-close_prog_noerr:
        if (!IS_ERR_OR_NULL(link))
                bpf_link__destroy(link);
        bpf_object__close(obj);
index 958a3d88de9958f771f26b4f1b9bdb30234eb5b8..1bdc1d86a50c85906f771042dae01e1c3621f1c3 100644 (file)
@@ -70,9 +70,6 @@ void test_task_fd_query_rawtp(void)
        if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
                goto close_prog;
 
-       goto close_prog_noerr;
 close_prog:
-       error_cnt++;
-close_prog_noerr:
        bpf_object__close(obj);
 }
index f9b70e81682b7e8d57b82b0755c3c90f480300c8..3f131b8fe328a7e4e2240ec5f0164ef518b30801 100644 (file)
@@ -62,14 +62,9 @@ static void test_task_fd_query_tp_core(const char *probe_name,
                  fd_type, buf))
                goto close_pmu;
 
-       close(pmu_fd);
-       goto close_prog_noerr;
-
 close_pmu:
        close(pmu_fd);
 close_prog:
-       error_cnt++;
-close_prog_noerr:
        bpf_object__close(obj);
 }
 
index bb8759d69099c7a948877110bf2420f3858a5454..594307dffd13bc42b66cd3bae370a2ac70806ad6 100644 (file)
@@ -10,10 +10,8 @@ void test_tcp_estats(void)
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
        CHECK(err, "", "err %d errno %d\n", err, errno);
-       if (err) {
-               error_cnt++;
+       if (err)
                return;
-       }
 
        bpf_object__close(obj);
 }
index a74167289545b85c418b3e454335fb06e8065427..dcb5ecac778e8a2cf07d6103f25a5252fe6fd1cd 100644 (file)
@@ -16,10 +16,8 @@ void test_xdp(void)
        int err, prog_fd, map_fd;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
-       if (err) {
-               error_cnt++;
+       if (CHECK_FAIL(err))
                return;
-       }
 
        map_fd = bpf_find_map(__func__, obj, "vip2tnl");
        if (map_fd < 0)
index 922aa0a1976425993719ac46e8909debd0143c46..3744196d7cba923d249bfe541679210ab09a78eb 100644 (file)
@@ -10,10 +10,8 @@ void test_xdp_adjust_tail(void)
        int err, prog_fd;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
-       if (err) {
-               error_cnt++;
+       if (CHECK_FAIL(err))
                return;
-       }
 
        err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
                                buf, &size, &retval, &duration);
index 15f7c272edb0ab419075cc134927fd1189a38f2e..c9404e6b226ee7f0c652a55b3eda3e1e3f65cf7f 100644 (file)
@@ -31,10 +31,8 @@ void test_xdp_noinline(void)
        u32 *magic = (u32 *)buf;
 
        err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
-       if (err) {
-               error_cnt++;
+       if (CHECK_FAIL(err))
                return;
-       }
 
        map_fd = bpf_find_map(__func__, obj, "vip_map");
        if (map_fd < 0)
@@ -73,8 +71,8 @@ void test_xdp_noinline(void)
                bytes += stats[i].bytes;
                pkts += stats[i].pkts;
        }
-       if (bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2) {
-               error_cnt++;
+       if (CHECK_FAIL(bytes != MAGIC_BYTES * NUM_ITER * 2 ||
+                      pkts != NUM_ITER * 2)) {
                printf("test_xdp_noinline:FAIL:stats %lld %lld\n",
                       bytes, pkts);
        }
diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
new file mode 100644 (file)
index 0000000..dede0fc
--- /dev/null
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+#define SOL_CUSTOM                     0xdeadbeef
+#define CUSTOM_INHERIT1                        0
+#define CUSTOM_INHERIT2                        1
+#define CUSTOM_LISTENER                        2
+
+struct sockopt_inherit {
+       __u8 val;
+};
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+       __type(key, int);
+       __type(value, struct sockopt_inherit);
+} cloned1_map SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+       __type(key, int);
+       __type(value, struct sockopt_inherit);
+} cloned2_map SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, struct sockopt_inherit);
+} listener_only_map SEC(".maps");
+
+static __inline struct sockopt_inherit *get_storage(struct bpf_sockopt *ctx)
+{
+       if (ctx->optname == CUSTOM_INHERIT1)
+               return bpf_sk_storage_get(&cloned1_map, ctx->sk, 0,
+                                         BPF_SK_STORAGE_GET_F_CREATE);
+       else if (ctx->optname == CUSTOM_INHERIT2)
+               return bpf_sk_storage_get(&cloned2_map, ctx->sk, 0,
+                                         BPF_SK_STORAGE_GET_F_CREATE);
+       else
+               return bpf_sk_storage_get(&listener_only_map, ctx->sk, 0,
+                                         BPF_SK_STORAGE_GET_F_CREATE);
+}
+
+SEC("cgroup/getsockopt")
+int _getsockopt(struct bpf_sockopt *ctx)
+{
+       __u8 *optval_end = ctx->optval_end;
+       struct sockopt_inherit *storage;
+       __u8 *optval = ctx->optval;
+
+       if (ctx->level != SOL_CUSTOM)
+               return 1; /* only interested in SOL_CUSTOM */
+
+       if (optval + 1 > optval_end)
+               return 0; /* EPERM, bounds check */
+
+       storage = get_storage(ctx);
+       if (!storage)
+               return 0; /* EPERM, couldn't get sk storage */
+
+       ctx->retval = 0; /* Reset system call return value to zero */
+
+       optval[0] = storage->val;
+       ctx->optlen = 1;
+
+       return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx)
+{
+       __u8 *optval_end = ctx->optval_end;
+       struct sockopt_inherit *storage;
+       __u8 *optval = ctx->optval;
+
+       if (ctx->level != SOL_CUSTOM)
+               return 1; /* only interested in SOL_CUSTOM */
+
+       if (optval + 1 > optval_end)
+               return 0; /* EPERM, bounds check */
+
+       storage = get_storage(ctx);
+       if (!storage)
+               return 0; /* EPERM, couldn't get sk storage */
+
+       storage->val = optval[0];
+       ctx->optlen = -1;
+
+       return 1;
+}
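
One detail worth flagging in the program above is the optval bounds check: the verifier only permits loads and stores through ctx->optval once the program has proven the access ends at or before ctx->optval_end. A minimal sketch of just that idiom, using the same headers as the program above (the section body and function name are illustrative):

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include "bpf_helpers.h"

char _license[] SEC("license") = "GPL";

SEC("cgroup/getsockopt")
int _getsockopt_sketch(struct bpf_sockopt *ctx)
{
	__u8 *optval_end = ctx->optval_end;
	__u8 *optval = ctx->optval;

	if (optval + 1 > optval_end)
		return 0;	/* EPERM: access not provably in bounds */

	optval[0] = 0;		/* safe: one byte proven in range */
	ctx->optlen = 1;
	return 1;
}
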
index a334a0e882e46006eaad43ba09a3e75bf73be525..41a3ebcd593dc766cd81d81108a4b9888a3cf34b 100644 (file)
 
 #define SR6_FLAG_ALERT (1 << 4)
 
-#define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \
-                               0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32))
-#define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \
-                               0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32))
 #define BPF_PACKET_HEADER __attribute__((packed))
 
 struct ip6_t {
@@ -276,8 +272,8 @@ int has_egr_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh)
                        return 0;
 
                // check if egress TLV value is correct
-               if (ntohll(egr_addr.hi) == 0xfd00000000000000 &&
-                               ntohll(egr_addr.lo) == 0x4)
+               if (bpf_be64_to_cpu(egr_addr.hi) == 0xfd00000000000000 &&
+                   bpf_be64_to_cpu(egr_addr.lo) == 0x4)
                        return 1;
        }
 
@@ -308,8 +304,8 @@ int __encap_srh(struct __sk_buff *skb)
 
        #pragma clang loop unroll(full)
        for (unsigned long long lo = 0; lo < 4; lo++) {
-               seg->lo = htonll(4 - lo);
-               seg->hi = htonll(hi);
+               seg->lo = bpf_cpu_to_be64(4 - lo);
+               seg->hi = bpf_cpu_to_be64(hi);
                seg = (struct ip6_addr_t *)((char *)seg + sizeof(*seg));
        }
 
@@ -349,8 +345,8 @@ int __add_egr_x(struct __sk_buff *skb)
        if (err)
                return BPF_DROP;
 
-       addr.lo = htonll(lo);
-       addr.hi = htonll(hi);
+       addr.lo = bpf_cpu_to_be64(lo);
+       addr.hi = bpf_cpu_to_be64(hi);
        err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
                                  (void *)&addr, sizeof(addr));
        if (err)
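
The hunks above drop the open-coded htonll()/ntohll() macros in favor of bpf_cpu_to_be64()/bpf_be64_to_cpu() from the relicensed bpf_endian.h. Semantically this is the usual fixed-endian conversion; a userspace sketch of what the helper boils down to (illustrative, not the header's actual implementation):

#include <stdint.h>

/* Illustrative equivalent of bpf_cpu_to_be64(): byte-swap on
 * little-endian hosts, identity on big-endian ones. */
static inline uint64_t cpu_to_be64_sketch(uint64_t x)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	return __builtin_bswap64(x);
#else
	return x;
#endif
}
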
index 1dbe1d4d467e7cfc00506948baabfbf14d3cad5b..c4d104428643ea3b9b960c0ba1be98badc77f306 100644 (file)
 
 #define SR6_FLAG_ALERT (1 << 4)
 
-#define htonll(x) ((bpf_htonl(1)) == 1 ? (x) : ((uint64_t)bpf_htonl((x) & \
-                               0xFFFFFFFF) << 32) | bpf_htonl((x) >> 32))
-#define ntohll(x) ((bpf_ntohl(1)) == 1 ? (x) : ((uint64_t)bpf_ntohl((x) & \
-                               0xFFFFFFFF) << 32) | bpf_ntohl((x) >> 32))
 #define BPF_PACKET_HEADER __attribute__((packed))
 
 struct ip6_t {
@@ -251,8 +247,8 @@ int __add_egr_x(struct __sk_buff *skb)
        if (err)
                return BPF_DROP;
 
-       addr.lo = htonll(lo);
-       addr.hi = htonll(hi);
+       addr.lo = bpf_cpu_to_be64(lo);
+       addr.hi = bpf_cpu_to_be64(hi);
        err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
                                  (void *)&addr, sizeof(addr));
        if (err)
diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh
new file mode 100755 (executable)
index 0000000..4ba5a34
--- /dev/null
@@ -0,0 +1,143 @@
+#!/bin/bash
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+ERROR=0
+TMPDIR=
+
+# If one build fails, continue but return non-0 on exit.
+return_value() {
+       if [ -d "$TMPDIR" ] ; then
+               rm -rf -- $TMPDIR
+       fi
+       exit $ERROR
+}
+trap return_value EXIT
+
+case $1 in
+       -h|--help)
+               echo -e "$0 [-j <n>]"
+               echo -e "\tTest the different ways of building bpftool."
+               echo -e ""
+               echo -e "\tOptions:"
+               echo -e "\t\t-j <n>:\tPass -j flag to 'make'."
+               exit
+               ;;
+esac
+
+J=$*
+
+# Assume the script is located under tools/testing/selftests/bpf/. We want to
+# start build attempts from the top of the kernel repository.
+SCRIPT_REL_PATH=$(realpath --relative-to=$PWD $0)
+SCRIPT_REL_DIR=$(dirname $SCRIPT_REL_PATH)
+KDIR_ROOT_DIR=$(realpath $PWD/$SCRIPT_REL_DIR/../../../../)
+cd $KDIR_ROOT_DIR
+
+check() {
+       local dir=$(realpath $1)
+
+       echo -n "binary:  "
+       # Returns non-zero if the file is found (and "false" is run)
+       find $dir -type f -executable -name bpftool -print -exec false {} + && \
+               ERROR=1 && printf "FAILURE: Did not find bpftool\n"
+}
+
+make_and_clean() {
+       echo -e "\$PWD:    $PWD"
+       echo -e "command: make -s $* >/dev/null"
+       make $J -s $* >/dev/null
+       if [ $? -ne 0 ] ; then
+               ERROR=1
+       fi
+       if [ $# -ge 1 ] ; then
+               check ${@: -1}
+       else
+               check .
+       fi
+       (
+               if [ $# -ge 1 ] ; then
+                       cd ${@: -1}
+               fi
+               make -s clean
+       )
+       echo
+}
+
+make_with_tmpdir() {
+       local ARGS
+
+       TMPDIR=$(mktemp -d)
+       if [ $# -ge 2 ] ; then
+               ARGS=${@:1:(($# - 1))}
+       fi
+       echo -e "\$PWD:    $PWD"
+       echo -e "command: make -s $ARGS ${@: -1}=$TMPDIR/ >/dev/null"
+       make $J -s $ARGS ${@: -1}=$TMPDIR/ >/dev/null
+       if [ $? -ne 0 ] ; then
+               ERROR=1
+       fi
+       check $TMPDIR
+       rm -rf -- $TMPDIR
+       echo
+}
+
+echo "Trying to build bpftool"
+echo -e "... through kbuild\n"
+
+if [ -f ".config" ] ; then
+       make_and_clean tools/bpf
+
+       ## $OUTPUT is overwritten in kbuild Makefile, and thus cannot be passed
+       ## down from toplevel Makefile to bpftool's Makefile.
+
+       # make_with_tmpdir tools/bpf OUTPUT
+       echo -e "skip:    make tools/bpf OUTPUT=<dir> (not supported)\n"
+
+       make_with_tmpdir tools/bpf O
+else
+       echo -e "skip:    make tools/bpf (no .config found)\n"
+       echo -e "skip:    make tools/bpf OUTPUT=<dir> (not supported)\n"
+       echo -e "skip:    make tools/bpf O=<dir> (no .config found)\n"
+fi
+
+echo -e "... from kernel source tree\n"
+
+make_and_clean -C tools/bpf/bpftool
+
+make_with_tmpdir -C tools/bpf/bpftool OUTPUT
+
+make_with_tmpdir -C tools/bpf/bpftool O
+
+echo -e "... from tools/\n"
+cd tools/
+
+make_and_clean bpf
+
+## In tools/bpf/Makefile, function "descend" is called and passes $(O) and
+## $(OUTPUT). We would like $(OUTPUT) to have "bpf/bpftool/" appended before
+## calling bpftool's Makefile, but this is not the case as the "descend"
+## function focuses on $(O)/$(subdir). However, in the present case, updating
+## $(O) to have $(OUTPUT) recomputed from it in bpftool's Makefile does not
+## work, because $(O) is not defined from command line and $(OUTPUT) is not
+## updated in tools/scripts/Makefile.include.
+##
+## Workarounds would be to a) edit "descend" or use an alternative way to
+## call bpftool's Makefile, b) modify the conditions to update $(OUTPUT) and
+## other variables in tools/scripts/Makefile.include (at the risk of breaking
+## the build of other tools), or c) append manually the "bpf/bpftool" suffix to
+## $(OUTPUT) in bpf's Makefile, which may break if targets for other directories
+## use "descend" in the future.
+
+# make_with_tmpdir bpf OUTPUT
+echo -e "skip:    make bpf OUTPUT=<dir> (not supported)\n"
+
+make_with_tmpdir bpf O
+
+echo -e "... from bpftool's dir\n"
+cd bpf/bpftool
+
+make_and_clean
+
+make_with_tmpdir OUTPUT
+
+make_with_tmpdir O
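
As the help text above documents, the script forwards an optional -j <n> to make, so a typical run from tools/testing/selftests/bpf/ is ./test_bpftool_build.sh -j 8; the EXIT trap makes it return non-zero if any attempted configuration fails to build or to produce a bpftool binary.
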
index 425f9ed27c3b1b9de7e733358c3035b595dbef21..15a666329a34df435dad7021845c0cbb377fb615 100755 (executable)
@@ -1353,7 +1353,7 @@ try:
     bpftool_prog_list_wait(expected=1)
 
     ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"]
-    fail(ifnameB != simB1['ifname'], "program not bound to originial device")
+    fail(ifnameB != simB1['ifname'], "program not bound to original device")
     simB1.remove()
     bpftool_prog_list_wait(expected=1)
 
index 12895d03d58b0d5f7e0f0e40d95e85577dab2840..e8616e778cb5070cd6a1685f459499e09677b28f 100644 (file)
@@ -8,22 +8,20 @@
 
 /* defined in test_progs.h */
 struct test_env env;
-int error_cnt, pass_cnt;
 
 struct prog_test_def {
        const char *test_name;
        int test_num;
        void (*run_test)(void);
        bool force_log;
-       int pass_cnt;
        int error_cnt;
+       int skip_cnt;
        bool tested;
 
        const char *subtest_name;
        int subtest_num;
 
        /* store counts before subtest started */
-       int old_pass_cnt;
        int old_error_cnt;
 };
 
@@ -47,6 +45,7 @@ static void dump_test_log(const struct prog_test_def *test, bool failed)
 
        if (env.verbose || test->force_log || failed) {
                if (env.log_cnt) {
+                       env.log_buf[env.log_cnt] = '\0';
                        fprintf(env.stdout, "%s", env.log_buf);
                        if (env.log_buf[env.log_cnt - 1] != '\n')
                                fprintf(env.stdout, "\n");
@@ -56,15 +55,24 @@ static void dump_test_log(const struct prog_test_def *test, bool failed)
        fseeko(stdout, 0, SEEK_SET); /* rewind */
 }
 
+static void skip_account(void)
+{
+       if (env.test->skip_cnt) {
+               env.skip_cnt++;
+               env.test->skip_cnt = 0;
+       }
+}
+
 void test__end_subtest()
 {
        struct prog_test_def *test = env.test;
-       int sub_error_cnt = error_cnt - test->old_error_cnt;
+       int sub_error_cnt = test->error_cnt - test->old_error_cnt;
 
        if (sub_error_cnt)
                env.fail_cnt++;
        else
                env.sub_succ_cnt++;
+       skip_account();
 
        dump_test_log(test, sub_error_cnt);
 
@@ -95,8 +103,7 @@ bool test__start_subtest(const char *name)
                return false;
 
        test->subtest_name = name;
-       env.test->old_pass_cnt = pass_cnt;
-       env.test->old_error_cnt = error_cnt;
+       env.test->old_error_cnt = env.test->error_cnt;
 
        return true;
 }
@@ -105,6 +112,16 @@ void test__force_log() {
        env.test->force_log = true;
 }
 
+void test__skip(void)
+{
+       env.test->skip_cnt++;
+}
+
+void test__fail(void)
+{
+       env.test->error_cnt++;
+}
+
 struct ipv4_packet pkt_v4 = {
        .eth.h_proto = __bpf_constant_htons(ETH_P_IP),
        .iph.ihl = 5,
@@ -129,7 +146,7 @@ int bpf_find_map(const char *test, struct bpf_object *obj, const char *name)
        map = bpf_object__find_map_by_name(obj, name);
        if (!map) {
                printf("%s:FAIL:map '%s' not found\n", test, name);
-               error_cnt++;
+               test__fail();
                return -1;
        }
        return bpf_map__fd(map);
@@ -488,8 +505,6 @@ int main(int argc, char **argv)
        stdio_hijack();
        for (i = 0; i < prog_test_cnt; i++) {
                struct prog_test_def *test = &prog_test_defs[i];
-               int old_pass_cnt = pass_cnt;
-               int old_error_cnt = error_cnt;
 
                env.test = test;
                test->test_num = i + 1;
@@ -504,12 +519,11 @@ int main(int argc, char **argv)
                        test__end_subtest();
 
                test->tested = true;
-               test->pass_cnt = pass_cnt - old_pass_cnt;
-               test->error_cnt = error_cnt - old_error_cnt;
                if (test->error_cnt)
                        env.fail_cnt++;
                else
                        env.succ_cnt++;
+               skip_account();
 
                dump_test_log(test, test->error_cnt);
 
@@ -518,11 +532,11 @@ int main(int argc, char **argv)
                        test->error_cnt ? "FAIL" : "OK");
        }
        stdio_restore();
-       printf("Summary: %d/%d PASSED, %d FAILED\n",
-              env.succ_cnt, env.sub_succ_cnt, env.fail_cnt);
+       printf("Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n",
+              env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt);
 
        free(env.test_selector.num_set);
        free(env.subtest_selector.num_set);
 
-       return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
+       return env.fail_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
 }
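
The accounting change above is easy to miss: error counts now live on the per-test struct, and subtest results come from snapshotting the counter in test__start_subtest() and diffing it in test__end_subtest(). A standalone toy of the scheme (the struct and helpers are simplified stand-ins, not the harness's real types):

#include <stdio.h>

struct test_state { int error_cnt; int old_error_cnt; };

static void subtest_begin(struct test_state *t)
{
	t->old_error_cnt = t->error_cnt;	/* snapshot at start */
}

static int subtest_end(struct test_state *t)
{
	return t->error_cnt - t->old_error_cnt; /* failures in this subtest */
}

int main(void)
{
	struct test_state t = { 0 };

	subtest_begin(&t);
	t.error_cnt++;		/* one failure inside the subtest */
	printf("subtest failures: %d\n", subtest_end(&t));	/* prints 1 */
	return 0;
}
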
index 37d427f5a1e5c68a62e8d71f95b79905e96acace..c8edb9464ba637c03412ef2ceadc4bab55eab544 100644 (file)
@@ -38,8 +38,6 @@ typedef __u16 __sum16;
 #include "trace_helpers.h"
 #include "flow_dissector_load.h"
 
-struct prog_test_def;
-
 struct test_selector {
        const char *name;
        bool *num_set;
@@ -64,14 +62,15 @@ struct test_env {
        int succ_cnt; /* successful tests */
        int sub_succ_cnt; /* successful sub-tests */
        int fail_cnt; /* total failed tests + sub-tests */
+       int skip_cnt; /* skipped tests */
 };
 
-extern int error_cnt;
-extern int pass_cnt;
 extern struct test_env env;
 
 extern void test__force_log();
 extern bool test__start_subtest(const char *name);
+extern void test__skip(void);
+extern void test__fail(void);
 
 #define MAGIC_BYTES 123
 
@@ -94,17 +93,25 @@ extern struct ipv6_packet pkt_v6;
 #define _CHECK(condition, tag, duration, format...) ({                 \
        int __ret = !!(condition);                                      \
        if (__ret) {                                                    \
-               error_cnt++;                                            \
+               test__fail();                                           \
                printf("%s:FAIL:%s ", __func__, tag);                   \
                printf(format);                                         \
        } else {                                                        \
-               pass_cnt++;                                             \
                printf("%s:PASS:%s %d nsec\n",                          \
                       __func__, tag, duration);                        \
        }                                                               \
        __ret;                                                          \
 })
 
+#define CHECK_FAIL(condition) ({                                       \
+       int __ret = !!(condition);                                      \
+       if (__ret) {                                                    \
+               test__fail();                                           \
+               printf("%s:FAIL:%d\n", __func__, __LINE__);             \
+       }                                                               \
+       __ret;                                                          \
+})
+
 #define CHECK(condition, tag, format...) \
        _CHECK(condition, tag, duration, format)
 #define CHECK_ATTR(condition, tag, format...) \
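
Because CHECK_FAIL() above is a GNU statement expression evaluating to the normalized condition, a single line can both record a failure and steer control flow. A self-contained toy with test__fail() stubbed out (the harness's real definitions are the ones above):

#include <stdio.h>

static int error_cnt;
static void test__fail(void) { error_cnt++; }

#define CHECK_FAIL(condition) ({				\
	int __ret = !!(condition);				\
	if (__ret) {						\
		test__fail();					\
		printf("%s:FAIL:%d\n", __func__, __LINE__);	\
	}							\
	__ret;							\
})

int main(void)
{
	int fd = -1;		/* pretend an open() failed */

	if (CHECK_FAIL(fd < 0))
		goto cleanup;
	return 0;
cleanup:
	return error_cnt;	/* non-zero exit reports the failure */
}
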
diff --git a/tools/testing/selftests/bpf/test_sockopt_inherit.c b/tools/testing/selftests/bpf/test_sockopt_inherit.c
new file mode 100644 (file)
index 0000000..1bf6998
--- /dev/null
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pthread.h>
+
+#include <linux/filter.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH                                "/sockopt_inherit"
+#define SOL_CUSTOM                     0xdeadbeef
+#define CUSTOM_INHERIT1                        0
+#define CUSTOM_INHERIT2                        1
+#define CUSTOM_LISTENER                        2
+
+static int connect_to_server(int server_fd)
+{
+       struct sockaddr_storage addr;
+       socklen_t len = sizeof(addr);
+       int fd;
+
+       fd = socket(AF_INET, SOCK_STREAM, 0);
+       if (fd < 0) {
+               log_err("Failed to create client socket");
+               return -1;
+       }
+
+       if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
+               log_err("Failed to get server addr");
+               goto out;
+       }
+
+       if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
+               log_err("Failed to connect to server");
+               goto out;
+       }
+
+       return fd;
+
+out:
+       close(fd);
+       return -1;
+}
+
+static int verify_sockopt(int fd, int optname, const char *msg, char expected)
+{
+       socklen_t optlen = 1;
+       char buf = 0;
+       int err;
+
+       err = getsockopt(fd, SOL_CUSTOM, optname, &buf, &optlen);
+       if (err) {
+               log_err("%s: failed to call getsockopt", msg);
+               return 1;
+       }
+
+       printf("%s %d: got=0x%x ? expected=0x%x\n", msg, optname, buf, expected);
+
+       if (buf != expected) {
+               log_err("%s: unexpected getsockopt value %d != %d", msg,
+                       buf, expected);
+               return 1;
+       }
+
+       return 0;
+}
+
+static void *server_thread(void *arg)
+{
+       struct sockaddr_storage addr;
+       socklen_t len = sizeof(addr);
+       int fd = *(int *)arg;
+       int client_fd;
+       int err = 0;
+
+       if (listen(fd, 1) < 0)
+               error(1, errno, "Failed to listen on socket");
+
+       err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
+       err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
+       err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
+
+       client_fd = accept(fd, (struct sockaddr *)&addr, &len);
+       if (client_fd < 0)
+               error(1, errno, "Failed to accept client");
+
+       err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
+       err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
+       err += verify_sockopt(client_fd, CUSTOM_LISTENER, "accept", 0);
+
+       close(client_fd);
+
+       return (void *)(long)err;
+}
+
+static int start_server(void)
+{
+       struct sockaddr_in addr = {
+               .sin_family = AF_INET,
+               .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+       };
+       char buf;
+       int err;
+       int fd;
+       int i;
+
+       fd = socket(AF_INET, SOCK_STREAM, 0);
+       if (fd < 0) {
+               log_err("Failed to create server socket");
+               return -1;
+       }
+
+       for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) {
+               buf = 0x01;
+               err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1);
+               if (err) {
+                       log_err("Failed to call setsockopt(%d)", i);
+                       close(fd);
+                       return -1;
+               }
+       }
+
+       if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
+               log_err("Failed to bind socket");
+               close(fd);
+               return -1;
+       }
+
+       return fd;
+}
+
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+       enum bpf_attach_type attach_type;
+       enum bpf_prog_type prog_type;
+       struct bpf_program *prog;
+       int err;
+
+       err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
+       if (err) {
+               log_err("Failed to deduce types for %s BPF program", title);
+               return -1;
+       }
+
+       prog = bpf_object__find_program_by_title(obj, title);
+       if (!prog) {
+               log_err("Failed to find %s BPF program", title);
+               return -1;
+       }
+
+       err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
+                             attach_type, 0);
+       if (err) {
+               log_err("Failed to attach %s BPF program", title);
+               return -1;
+       }
+
+       return 0;
+}
+
+static int run_test(int cgroup_fd)
+{
+       struct bpf_prog_load_attr attr = {
+               .file = "./sockopt_inherit.o",
+       };
+       int server_fd = -1, client_fd;
+       struct bpf_object *obj;
+       void *server_err;
+       pthread_t tid;
+       int ignored;
+       int err;
+
+       err = bpf_prog_load_xattr(&attr, &obj, &ignored);
+       if (err) {
+               log_err("Failed to load BPF object");
+               return -1;
+       }
+
+       err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
+       if (err)
+               goto close_bpf_object;
+
+       err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
+       if (err)
+               goto close_bpf_object;
+
+       server_fd = start_server();
+       if (server_fd < 0) {
+               err = -1;
+               goto close_bpf_object;
+       }
+
+       pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
+
+       client_fd = connect_to_server(server_fd);
+       if (client_fd < 0) {
+               err = -1;
+               goto close_server_fd;
+       }
+
+       err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0);
+       err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0);
+       err += verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0);
+
+       pthread_join(tid, &server_err);
+
+       err += (int)(long)server_err;
+
+       close(client_fd);
+
+close_server_fd:
+       close(server_fd);
+close_bpf_object:
+       bpf_object__close(obj);
+       return err;
+}
+
+int main(int argc, char **argv)
+{
+       int cgroup_fd;
+       int err = EXIT_SUCCESS;
+
+       if (setup_cgroup_environment())
+               return err;
+
+       cgroup_fd = create_and_get_cgroup(CG_PATH);
+       if (cgroup_fd < 0)
+               goto cleanup_cgroup_env;
+
+       if (join_cgroup(CG_PATH))
+               goto cleanup_cgroup;
+
+       if (run_test(cgroup_fd))
+               err = EXIT_FAILURE;
+
+       printf("test_sockopt_inherit: %s\n",
+              err == EXIT_SUCCESS ? "PASSED" : "FAILED");
+
+cleanup_cgroup:
+       close(cgroup_fd);
+cleanup_cgroup_env:
+       cleanup_cgroup_environment();
+       return err;
+}
index a3bebd7c68ddc6262c5eaf6021bd8c8304b6a006..fc33ae36b760c5deec76176d66953c24d95852fe 100644 (file)
@@ -13,6 +13,7 @@
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
 
+#include "bpf_endian.h"
 #include "bpf_rlimit.h"
 #include "bpf_util.h"
 #include "cgroup_helpers.h"
@@ -100,7 +101,7 @@ static struct sysctl_test tests[] = {
                .descr = "ctx:write sysctl:write read ok",
                .insns = {
                        /* If (write) */
-                       BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_1,
+                       BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
                                    offsetof(struct bpf_sysctl, write)),
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 1, 2),
 
@@ -214,7 +215,8 @@ static struct sysctl_test tests[] = {
                        /* if (ret == expected && */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, sizeof("tcp_mem") - 1, 6),
                        /*     buf == "tcp_mem\0") */
-                       BPF_LD_IMM64(BPF_REG_8, 0x006d656d5f706374ULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x7463705f6d656d00ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
 
@@ -255,7 +257,8 @@ static struct sysctl_test tests[] = {
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
 
                        /*     buf[0:7] == "tcp_me\0") */
-                       BPF_LD_IMM64(BPF_REG_8, 0x00656d5f706374ULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x7463705f6d650000ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
 
@@ -298,12 +301,14 @@ static struct sysctl_test tests[] = {
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 16, 14),
 
                        /*     buf[0:8] == "net/ipv4" && */
-                       BPF_LD_IMM64(BPF_REG_8, 0x347670692f74656eULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x6e65742f69707634ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
 
                        /*     buf[8:16] == "/tcp_mem" && */
-                       BPF_LD_IMM64(BPF_REG_8, 0x6d656d5f7063742fULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x2f7463705f6d656dULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
 
@@ -350,12 +355,14 @@ static struct sysctl_test tests[] = {
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 10),
 
                        /*     buf[0:8] == "net/ipv4" && */
-                       BPF_LD_IMM64(BPF_REG_8, 0x347670692f74656eULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x6e65742f69707634ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
 
                        /*     buf[8:16] == "/tcp_me\0") */
-                       BPF_LD_IMM64(BPF_REG_8, 0x00656d5f7063742fULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x2f7463705f6d6500ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
 
@@ -396,7 +403,8 @@ static struct sysctl_test tests[] = {
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
 
                        /*     buf[0:8] == "net/ip\0") */
-                       BPF_LD_IMM64(BPF_REG_8, 0x000070692f74656eULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x6e65742f69700000ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
 
@@ -431,7 +439,8 @@ static struct sysctl_test tests[] = {
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
 
                        /*     buf[0:6] == "Linux\n\0") */
-                       BPF_LD_IMM64(BPF_REG_8, 0x000a78756e694cULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
 
@@ -469,7 +478,8 @@ static struct sysctl_test tests[] = {
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
 
                        /*     buf[0:6] == "Linux\n\0") */
-                       BPF_LD_IMM64(BPF_REG_8, 0x000a78756e694cULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
 
@@ -507,7 +517,8 @@ static struct sysctl_test tests[] = {
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
 
                        /*     buf[0:6] == "Linux\0") */
-                       BPF_LD_IMM64(BPF_REG_8, 0x000078756e694cULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x4c696e7578000000ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
 
@@ -650,7 +661,8 @@ static struct sysctl_test tests[] = {
 
                        /*     buf[0:4] == "606\0") */
                        BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
-                       BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0x00363036, 2),
+                       BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
+                                   bpf_ntohl(0x36303600), 2),
 
                        /* return DENY; */
                        BPF_MOV64_IMM(BPF_REG_0, 0),
@@ -685,17 +697,20 @@ static struct sysctl_test tests[] = {
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 23, 14),
 
                        /*     buf[0:8] == "3000000 " && */
-                       BPF_LD_IMM64(BPF_REG_8, 0x2030303030303033ULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x3330303030303020ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
 
                        /*     buf[8:16] == "4000000 " && */
-                       BPF_LD_IMM64(BPF_REG_8, 0x2030303030303034ULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x3430303030303020ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
 
                        /*     buf[16:24] == "6000000\0") */
-                       BPF_LD_IMM64(BPF_REG_8, 0x0030303030303036ULL),
+                       BPF_LD_IMM64(BPF_REG_8,
+                                    bpf_be64_to_cpu(0x3630303030303000ULL)),
                        BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 16),
                        BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
 
@@ -735,7 +750,8 @@ static struct sysctl_test tests[] = {
 
                        /*     buf[0:3] == "60\0") */
                        BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
-                       BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0x003036, 2),
+                       BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
+                                   bpf_ntohl(0x36300000), 2),
 
                        /* return DENY; */
                        BPF_MOV64_IMM(BPF_REG_0, 0),
@@ -757,7 +773,8 @@ static struct sysctl_test tests[] = {
                        /* sysctl_set_new_value arg2 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x00303036),
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x36303000)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
@@ -791,7 +808,7 @@ static struct sysctl_test tests[] = {
                        /* sysctl_set_new_value arg2 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, FIXUP_SYSCTL_VALUE),
+                       BPF_LD_IMM64(BPF_REG_0, FIXUP_SYSCTL_VALUE),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
@@ -825,8 +842,9 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x00303036),
-                       BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x36303000)),
+                       BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
 
@@ -869,7 +887,8 @@ static struct sysctl_test tests[] = {
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
                        /* "600 602\0" */
-                       BPF_LD_IMM64(BPF_REG_0, 0x0032303620303036ULL),
+                       BPF_LD_IMM64(BPF_REG_0,
+                                    bpf_be64_to_cpu(0x3630302036303200ULL)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
 
@@ -937,7 +956,8 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x00303036),
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x36303000)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -969,8 +989,9 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x00373730),
-                       BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x30373700)),
+                       BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
 
@@ -1012,7 +1033,8 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x00303036),
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x36303000)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1052,7 +1074,8 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x090a0c0d),
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x0d0c0a09)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1092,7 +1115,9 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x00362d0a), /* " -6\0" */
+                       /* " -6\0" */
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x0a2d3600)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1132,8 +1157,10 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x00362d0a), /* " -6\0" */
-                       BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+                       /* " -6\0" */
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x0a2d3600)),
+                       BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
 
@@ -1175,8 +1202,10 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
-                       BPF_MOV64_IMM(BPF_REG_0, 0x65667830), /* "0xfe" */
-                       BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+                       /* "0xfe" */
+                       BPF_MOV64_IMM(BPF_REG_0,
+                                     bpf_ntohl(0x30786665)),
+                       BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
 
@@ -1218,11 +1247,14 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) 9223372036854775807 */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
-                       BPF_LD_IMM64(BPF_REG_0, 0x3032373333323239ULL),
+                       BPF_LD_IMM64(BPF_REG_0,
+                                    bpf_be64_to_cpu(0x3932323333373230ULL)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-                       BPF_LD_IMM64(BPF_REG_0, 0x3537373435383633ULL),
+                       BPF_LD_IMM64(BPF_REG_0,
+                                    bpf_be64_to_cpu(0x3336383534373735ULL)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
-                       BPF_LD_IMM64(BPF_REG_0, 0x0000000000373038ULL),
+                       BPF_LD_IMM64(BPF_REG_0,
+                                    bpf_be64_to_cpu(0x3830370000000000ULL)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1266,11 +1298,14 @@ static struct sysctl_test tests[] = {
                        /* arg1 (buf) 9223372036854775808 */
                        BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
-                       BPF_LD_IMM64(BPF_REG_0, 0x3032373333323239ULL),
+                       BPF_LD_IMM64(BPF_REG_0,
+                                    bpf_be64_to_cpu(0x3932323333373230ULL)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
-                       BPF_LD_IMM64(BPF_REG_0, 0x3537373435383633ULL),
+                       BPF_LD_IMM64(BPF_REG_0,
+                                    bpf_be64_to_cpu(0x3336383534373735ULL)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
-                       BPF_LD_IMM64(BPF_REG_0, 0x0000000000383038ULL),
+                       BPF_LD_IMM64(BPF_REG_0,
+                                    bpf_be64_to_cpu(0x3830380000000000ULL)),
                        BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
 
                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
@@ -1344,20 +1379,24 @@ static size_t probe_prog_length(const struct bpf_insn *fp)
 static int fixup_sysctl_value(const char *buf, size_t buf_len,
                              struct bpf_insn *prog, size_t insn_num)
 {
-       uint32_t value_num = 0;
+       union {
+               uint8_t raw[sizeof(uint64_t)];
+               uint64_t num;
+       } value = {};
        uint8_t c, i;
 
-       if (buf_len > sizeof(value_num)) {
+       if (buf_len > sizeof(value)) {
                log_err("Value is too big (%zd) to use in fixup", buf_len);
                return -1;
        }
-
-       for (i = 0; i < buf_len; ++i) {
-               c = buf[i];
-               value_num |= (c << i * 8);
+       if (prog[insn_num].code != (BPF_LD | BPF_DW | BPF_IMM)) {
+               log_err("Can fixup only BPF_LD_IMM64 insns");
+               return -1;
        }
 
-       prog[insn_num].imm = value_num;
+       memcpy(value.raw, buf, buf_len);
+       prog[insn_num].imm = (uint32_t)value.num;
+       prog[insn_num + 1].imm = (uint32_t)(value.num >> 32);
 
        return 0;
 }
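
The hex constants above are just ASCII: on a little-endian host the old
0x3032373333323239ULL stored the bytes "92233720", and the new
bpf_be64_to_cpu(0x3932323333373230ULL) form yields that same byte sequence on
hosts of either endianness. A minimal stand-alone sketch (not part of the
patch; the "608" input and the printf calls are illustrative only) of the
BPF_LD_IMM64 split that fixup_sysctl_value() now performs:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* Same endian-agnostic trick as fixup_sysctl_value(): the
		 * bytes land in memory order, the u64 view splits them. */
		union {
			uint8_t raw[sizeof(uint64_t)];
			uint64_t num;
		} value = {};
		const char *buf = "608";	/* textual sysctl value */

		memcpy(value.raw, buf, strlen(buf));
		/* BPF_LD_IMM64 occupies two insn slots: low 32 bits first */
		printf("insn[0].imm = 0x%08x\n", (uint32_t)value.num);
		printf("insn[1].imm = 0x%08x\n", (uint32_t)(value.num >> 32));
		return 0;
	}
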
@@ -1499,6 +1538,7 @@ static int run_test_case(int cgfd, struct sysctl_test *test)
                        goto err;
        }
 
+       errno = 0;
        if (access_sysctl(sysctl_path, test) == -1) {
                if (test->result == OP_EPERM && errno == EPERM)
                        goto out;
@@ -1507,7 +1547,7 @@ static int run_test_case(int cgfd, struct sysctl_test *test)
        }
 
        if (test->result != SUCCESS) {
-               log_err("Unexpected failure");
+               log_err("Unexpected success");
                goto err;
        }
 
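Clearing errno before access_sysctl() matters because the OP_EPERM branch
reads errno after a -1 return; a stale EPERM left over from an earlier
syscall could otherwise make a failing test pass. A hedged stand-alone
illustration of the pattern (the path and the EACCES outcome are assumptions
for the example, not taken from the patch):

	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Reset errno so only *this* call can have set it. */
		errno = 0;
		if (access("/proc/sys/kernel/ostype", W_OK) == -1 &&
		    errno == EACCES)
			printf("write denied, as expected for non-root\n");
		return 0;
	}
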
index 90c3862f74a856cc10685130a3313dedcdb1dd34..93916a69823e51d5e0df0ce79c647d9fd03f168e 100644
@@ -6,6 +6,7 @@
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
+#include <netinet/tcp.h>
 #include <pthread.h>
 
 #include <linux/filter.h>
@@ -34,6 +35,30 @@ static void send_byte(int fd)
                error(1, errno, "Failed to send single byte");
 }
 
+static int wait_for_ack(int fd, int retries)
+{
+       struct tcp_info info;
+       socklen_t optlen;
+       int i, err;
+
+       for (i = 0; i < retries; i++) {
+               optlen = sizeof(info);
+               err = getsockopt(fd, SOL_TCP, TCP_INFO, &info, &optlen);
+               if (err < 0) {
+                       log_err("Failed to lookup TCP stats");
+                       return err;
+               }
+
+               if (info.tcpi_unacked == 0)
+                       return 0;
+
+               usleep(10);
+       }
+
+       log_err("Did not receive ACK");
+       return -1;
+}
+
 static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
                     __u32 dsack_dups, __u32 delivered, __u32 delivered_ce,
                     __u32 icsk_retransmits)
@@ -149,6 +174,10 @@ static int run_test(int cgroup_fd, int server_fd)
                         /*icsk_retransmits=*/0);
 
        send_byte(client_fd);
+       if (wait_for_ack(client_fd, 100) < 0) {
+               err = -1;
+               goto close_client_fd;
+       }
 
        err += verify_sk(map_fd, client_fd, "first payload byte",
                         /*invoked=*/2,
@@ -157,6 +187,7 @@ static int run_test(int cgroup_fd, int server_fd)
                         /*delivered_ce=*/0,
                         /*icsk_retransmits=*/0);
 
+close_client_fd:
        close(client_fd);
 
 close_bpf_object:
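
The new wait_for_ack() closes a race in the RTT test: the delivery counters
checked by verify_sk() only advance once the peer's ACK for the payload byte
has come back, so the second check must not run before tcpi_unacked drops to
zero. A hedged sketch of the same TCP_INFO query (it assumes fd is a
connected TCP socket; the counters printed are illustrative):

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <stdio.h>
	#include <sys/socket.h>

	/* tcpi_unacked counts segments sent but not yet ACKed. */
	static int print_tcp_progress(int fd)
	{
		struct tcp_info info;
		socklen_t optlen = sizeof(info);

		if (getsockopt(fd, SOL_TCP, TCP_INFO, &info, &optlen) < 0)
			return -1;
		printf("unacked=%u total_retrans=%u\n",
		       info.tcpi_unacked, info.tcpi_total_retrans);
		return 0;
	}
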
index 44e2d640b088ca10668d28e3a1c3b6acae8ec194..d27fd929abb9003ec4562e970ee321c17e730532 100644
@@ -61,6 +61,7 @@
 #define UNPRIV_SYSCTL "kernel/unprivileged_bpf_disabled"
 static bool unpriv_disabled = false;
 static int skips;
+static bool verbose = false;
 
 struct bpf_test {
        const char *descr;
@@ -92,7 +93,8 @@ struct bpf_test {
        enum {
                UNDEF,
                ACCEPT,
-               REJECT
+               REJECT,
+               VERBOSE_ACCEPT,
        } result, result_unpriv;
        enum bpf_prog_type prog_type;
        uint8_t flags;
@@ -859,6 +861,36 @@ static int do_prog_test_run(int fd_prog, bool unpriv, uint32_t expected_val,
        return 0;
 }
 
+static bool cmp_str_seq(const char *log, const char *exp)
+{
+       char needle[80];
+       const char *p, *q;
+       int len;
+
+       do {
+               p = strchr(exp, '\t');
+               if (!p)
+                       p = exp + strlen(exp);
+
+               len = p - exp;
+               if (len >= sizeof(needle) || !len) {
+                       printf("FAIL\nTestcase bug\n");
+                       return false;
+               }
+               strncpy(needle, exp, len);
+               needle[len] = 0;
+               q = strstr(log, needle);
+               if (!q) {
+                       printf("FAIL\nUnexpected verifier log in successful load!\n"
+                              "EXP: %s\nRES:\n", needle);
+                       return false;
+               }
+               log = q + len;
+               exp = p + 1;
+       } while (*p);
+       return true;
+}
+
 static void do_test_single(struct bpf_test *test, bool unpriv,
                           int *passes, int *errors)
 {
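
cmp_str_seq() turns an expected string into an ordered list of needles:
fragments are separated by literal tab characters, and each fragment must
appear in the verifier log somewhere after the previous match. The precise.c
tests below rely on a C quirk to keep that readable, sketched here with a
shortened excerpt (illustrative, not a complete expected log):

/* The backslash-newline continuation keeps the next source line's leading
 * tab inside the string literal, which is exactly the separator
 * cmp_str_seq() splits on (an empty fragment, e.g. from two consecutive
 * tabs, is rejected as a testcase bug). */
const char *exp =
	"26: (85) call bpf_probe_read#4\
	last_idx 26 first_idx 20\
	regs=4 stack=0 before 25";
/* matches logs containing the three fragments in this order, with
 * arbitrary verifier output allowed in between */
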
@@ -897,14 +929,20 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
                pflags |= BPF_F_STRICT_ALIGNMENT;
        if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS)
                pflags |= BPF_F_ANY_ALIGNMENT;
+       if (test->flags & ~3)
+               pflags |= test->flags;
 
+       expected_ret = unpriv && test->result_unpriv != UNDEF ?
+                      test->result_unpriv : test->result;
+       expected_err = unpriv && test->errstr_unpriv ?
+                      test->errstr_unpriv : test->errstr;
        memset(&attr, 0, sizeof(attr));
        attr.prog_type = prog_type;
        attr.expected_attach_type = test->expected_attach_type;
        attr.insns = prog;
        attr.insns_cnt = prog_len;
        attr.license = "GPL";
-       attr.log_level = 4;
+       attr.log_level = verbose || expected_ret == VERBOSE_ACCEPT ? 1 : 4;
        attr.prog_flags = pflags;
 
        fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog));
@@ -914,14 +952,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
                goto close_fds;
        }
 
-       expected_ret = unpriv && test->result_unpriv != UNDEF ?
-                      test->result_unpriv : test->result;
-       expected_err = unpriv && test->errstr_unpriv ?
-                      test->errstr_unpriv : test->errstr;
-
        alignment_prevented_execution = 0;
 
-       if (expected_ret == ACCEPT) {
+       if (expected_ret == ACCEPT || expected_ret == VERBOSE_ACCEPT) {
                if (fd_prog < 0) {
                        printf("FAIL\nFailed to load prog '%s'!\n",
                               strerror(errno));
@@ -932,6 +965,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
                    (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS))
                        alignment_prevented_execution = 1;
 #endif
+               if (expected_ret == VERBOSE_ACCEPT && !cmp_str_seq(bpf_vlog, expected_err)) {
+                       goto fail_log;
+               }
        } else {
                if (fd_prog >= 0) {
                        printf("FAIL\nUnexpected success to load!\n");
@@ -957,6 +993,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
                }
        }
 
+       if (verbose)
+               printf(", verifier log:\n%s", bpf_vlog);
+
        run_errs = 0;
        run_successes = 0;
        if (!alignment_prevented_execution && fd_prog >= 0) {
@@ -1097,17 +1136,24 @@ int main(int argc, char **argv)
 {
        unsigned int from = 0, to = ARRAY_SIZE(tests);
        bool unpriv = !is_admin();
+       int arg = 1;
+
+       if (argc > 1 && strcmp(argv[1], "-v") == 0) {
+               arg++;
+               verbose = true;
+               argc--;
+       }
 
        if (argc == 3) {
-               unsigned int l = atoi(argv[argc - 2]);
-               unsigned int u = atoi(argv[argc - 1]);
+               unsigned int l = atoi(argv[arg]);
+               unsigned int u = atoi(argv[arg + 1]);
 
                if (l < to && u < to) {
                        from = l;
                        to   = u + 1;
                }
        } else if (argc == 2) {
-               unsigned int t = atoi(argv[argc - 1]);
+               unsigned int t = atoi(argv[arg]);
 
                if (t < to) {
                        from = t;
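
With this change the runner accepts an optional leading -v flag ahead of the
existing selection arguments: "./test_verifier -v" runs everything and always
dumps the verifier log, "./test_verifier 5" runs test #5 only, and
"./test_verifier -v 5 10" runs tests 5 through 10 inclusive. VERBOSE_ACCEPT
results load with log_level 1 and have their errstr matched against the log
via cmp_str_seq(); the "test->flags & ~3" pass-through above is what lets a
test request extra prog_flags such as BPF_F_TEST_STATE_FREQ (the two low bits
correspond to the F_* alignment test flags checked just before it).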
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
new file mode 100644
index 0000000..02151f8
--- /dev/null
@@ -0,0 +1,194 @@
+{
+       "precise: test 1",
+       .insns = {
+       BPF_MOV64_IMM(BPF_REG_0, 1),
+       BPF_LD_MAP_FD(BPF_REG_6, 0),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+       BPF_EXIT_INSN(),
+
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+       BPF_EXIT_INSN(),
+
+       BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+
+       BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), /* map_value_ptr -= map_value_ptr */
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_9),
+       BPF_JMP_IMM(BPF_JLT, BPF_REG_2, 8, 1),
+       BPF_EXIT_INSN(),
+
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* R2=inv(umin=1, umax=8) */
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+       BPF_MOV64_IMM(BPF_REG_3, 0),
+       BPF_EMIT_CALL(BPF_FUNC_probe_read),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+       .fixup_map_array_48b = { 1 },
+       .result = VERBOSE_ACCEPT,
+       .errstr =
+       "26: (85) call bpf_probe_read#4\
+       last_idx 26 first_idx 20\
+       regs=4 stack=0 before 25\
+       regs=4 stack=0 before 24\
+       regs=4 stack=0 before 23\
+       regs=4 stack=0 before 22\
+       regs=4 stack=0 before 20\
+       parent didn't have regs=4 stack=0 marks\
+       last_idx 19 first_idx 10\
+       regs=4 stack=0 before 19\
+       regs=200 stack=0 before 18\
+       regs=300 stack=0 before 17\
+       regs=201 stack=0 before 15\
+       regs=201 stack=0 before 14\
+       regs=200 stack=0 before 13\
+       regs=200 stack=0 before 12\
+       regs=200 stack=0 before 11\
+       regs=200 stack=0 before 10\
+       parent already had regs=0 stack=0 marks",
+},
+{
+       "precise: test 2",
+       .insns = {
+       BPF_MOV64_IMM(BPF_REG_0, 1),
+       BPF_LD_MAP_FD(BPF_REG_6, 0),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+       BPF_EXIT_INSN(),
+
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+       BPF_EXIT_INSN(),
+
+       BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+
+       BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), /* map_value_ptr -= map_value_ptr */
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_9),
+       BPF_JMP_IMM(BPF_JLT, BPF_REG_2, 8, 1),
+       BPF_EXIT_INSN(),
+
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* R2=inv(umin=1, umax=8) */
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+       BPF_MOV64_IMM(BPF_REG_3, 0),
+       BPF_EMIT_CALL(BPF_FUNC_probe_read),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+       .fixup_map_array_48b = { 1 },
+       .result = VERBOSE_ACCEPT,
+       .flags = BPF_F_TEST_STATE_FREQ,
+       .errstr =
+       "26: (85) call bpf_probe_read#4\
+       last_idx 26 first_idx 22\
+       regs=4 stack=0 before 25\
+       regs=4 stack=0 before 24\
+       regs=4 stack=0 before 23\
+       regs=4 stack=0 before 22\
+       parent didn't have regs=4 stack=0 marks\
+       last_idx 20 first_idx 20\
+       regs=4 stack=0 before 20\
+       parent didn't have regs=4 stack=0 marks\
+       last_idx 19 first_idx 17\
+       regs=4 stack=0 before 19\
+       regs=200 stack=0 before 18\
+       regs=300 stack=0 before 17\
+       parent already had regs=0 stack=0 marks",
+},
+{
+       "precise: cross frame pruning",
+       .insns = {
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_IMM(BPF_REG_8, 0),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+       BPF_MOV64_IMM(BPF_REG_8, 1),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_IMM(BPF_REG_9, 0),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+       BPF_MOV64_IMM(BPF_REG_9, 1),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1),
+       BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_2, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_XDP,
+       .flags = BPF_F_TEST_STATE_FREQ,
+       .errstr = "!read_ok",
+       .result = REJECT,
+},
+{
+       "precise: ST insn causing spi > allocated_stack",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
+       BPF_ST_MEM(BPF_DW, BPF_REG_3, -8, 0),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+       BPF_MOV64_IMM(BPF_REG_0, -1),
+       BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_XDP,
+       .flags = BPF_F_TEST_STATE_FREQ,
+       .errstr = "5: (2d) if r4 > r0 goto pc+0\
+       last_idx 5 first_idx 5\
+       parent didn't have regs=10 stack=0 marks\
+       last_idx 4 first_idx 2\
+       regs=10 stack=0 before 4\
+       regs=10 stack=0 before 3\
+       regs=0 stack=1 before 2\
+       last_idx 5 first_idx 5\
+       parent didn't have regs=1 stack=0 marks",
+       .result = VERBOSE_ACCEPT,
+       .retval = -1,
+},
+{
+       "precise: STX insn causing spi > allocated_stack",
+       .insns = {
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+       BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
+       BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, -8),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+       BPF_MOV64_IMM(BPF_REG_0, -1),
+       BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .prog_type = BPF_PROG_TYPE_XDP,
+       .flags = BPF_F_TEST_STATE_FREQ,
+       .errstr = "last_idx 6 first_idx 6\
+       parent didn't have regs=10 stack=0 marks\
+       last_idx 5 first_idx 3\
+       regs=10 stack=0 before 5\
+       regs=10 stack=0 before 4\
+       regs=0 stack=1 before 3\
+       last_idx 6 first_idx 6\
+       parent didn't have regs=1 stack=0 marks\
+       last_idx 5 first_idx 3\
+       regs=1 stack=0 before 5",
+       .result = VERBOSE_ACCEPT,
+       .retval = -1,
+},
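
In the expected logs above, "regs=" is a hexadecimal bitmask of the registers
the precision-backtracking pass still needs exact values for (bit N stands
for rN: regs=4 is r2, regs=200 is r9, regs=300 is r8 and r9), and "stack=" is
the analogous mask of stack slots. A small hedged decoder sketch for reading
these masks (stand-alone, not part of the tests):

	#include <stdint.h>
	#include <stdio.h>

	static void print_precise_regs(uint32_t mask)
	{
		int i;

		printf("regs=%x ->", mask);
		for (i = 0; i <= 10; i++)	/* r0..r10 */
			if (mask & (1u << i))
				printf(" r%d", i);
		printf("\n");
	}

	int main(void)
	{
		print_precise_regs(0x4);	/* r2 */
		print_precise_regs(0x200);	/* r9 */
		print_precise_regs(0x300);	/* r8 r9 */
		return 0;
	}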