samples/bpf: xdp_monitor include cpumap tracepoints in monitoring
authorJesper Dangaard Brouer <brouer@redhat.com>
Fri, 19 Jan 2018 16:15:50 +0000 (17:15 +0100)
committerDaniel Borkmann <daniel@iogearbox.net>
Sat, 20 Jan 2018 01:10:55 +0000 (02:10 +0100)
The xdp_redirect_cpu sample have some "builtin" monitoring of the
tracepoints for xdp_cpumap_*, but it is practical to have an external
tool that can monitor these transpoint as an easy way to troubleshoot
an application using XDP + cpumap.

Specifically I need such external tool when working on Suricata and
XDP cpumap redirect. Extend the xdp_monitor tool sample with
monitoring of these xdp_cpumap_* tracepoints.  Model the output format
like xdp_redirect_cpu.

Given I needed to handle per CPU decoding for cpumap, this patch also
add per CPU info on the existing monitor events.  This resembles part
of the builtin monitoring output from sample xdp_rxq_info.  Thus, also
covering part of that sample in an external monitoring tool.

Performance wise, the cpumap tracepoints uses bulking, which cause
them to have very little overhead.  Thus, they are enabled by default.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
samples/bpf/xdp_monitor_kern.c
samples/bpf/xdp_monitor_user.c

index c969141bfa8bc2cff506a9fe3ea1c4043c7e03ee..211db8ded0de36e8514d3b91af0587cba78d89c2 100644 (file)
@@ -1,6 +1,7 @@
-/* XDP monitor tool, based on tracepoints
+/* SPDX-License-Identifier: GPL-2.0
+ *  Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc.
  *
- *  Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * XDP monitor tool, based on tracepoints
  */
 #include <uapi/linux/bpf.h>
 #include "bpf_helpers.h"
@@ -118,3 +119,92 @@ int trace_xdp_exception(struct xdp_exception_ctx *ctx)
 
        return 0;
 }
+
+/* Common stats data record shared with _user.c */
+struct datarec {
+       u64 processed;
+       u64 dropped;
+       u64 info;
+};
+#define MAX_CPUS 64
+
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+       .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
+       .key_size       = sizeof(u32),
+       .value_size     = sizeof(struct datarec),
+       .max_entries    = MAX_CPUS,
+};
+
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+       .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
+       .key_size       = sizeof(u32),
+       .value_size     = sizeof(struct datarec),
+       .max_entries    = 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+       u64 __pad;              // First 8 bytes are not accessible by bpf code
+       int map_id;             //      offset:8;  size:4; signed:1;
+       u32 act;                //      offset:12; size:4; signed:0;
+       int cpu;                //      offset:16; size:4; signed:1;
+       unsigned int drops;     //      offset:20; size:4; signed:0;
+       unsigned int processed; //      offset:24; size:4; signed:0;
+       int to_cpu;             //      offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+       u32 to_cpu = ctx->to_cpu;
+       struct datarec *rec;
+
+       if (to_cpu >= MAX_CPUS)
+               return 1;
+
+       rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+       if (!rec)
+               return 0;
+       rec->processed += ctx->processed;
+       rec->dropped   += ctx->drops;
+
+       /* Record bulk events, then userspace can calc average bulk size */
+       if (ctx->processed > 0)
+               rec->info += 1;
+
+       return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in:         kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+       u64 __pad;              // First 8 bytes are not accessible by bpf code
+       int map_id;             //      offset:8;  size:4; signed:1;
+       u32 act;                //      offset:12; size:4; signed:0;
+       int cpu;                //      offset:16; size:4; signed:1;
+       unsigned int drops;     //      offset:20; size:4; signed:0;
+       unsigned int processed; //      offset:24; size:4; signed:0;
+       int sched;              //      offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+       struct datarec *rec;
+       u32 key = 0;
+
+       rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+       if (!rec)
+               return 0;
+       rec->processed += ctx->processed;
+       rec->dropped   += ctx->drops;
+
+       /* Count times kthread yielded CPU via schedule call */
+       if (ctx->sched)
+               rec->info++;
+
+       return 0;
+}
index eaba165b35492e9a8d683762920833ee78e04f81..eec14520d5135f26bcfbdbf7b6d737274d14cd0c 100644 (file)
@@ -1,4 +1,5 @@
-/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
  */
 static const char *__doc__=
  "XDP monitor tool, based on tracepoints\n"
@@ -40,6 +41,9 @@ static const struct option long_options[] = {
        {0, 0, NULL,  0 }
 };
 
+/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
+#define EXIT_FAIL_MEM  5
+
 static void usage(char *argv[])
 {
        int i;
@@ -108,23 +112,93 @@ static const char *action2str(int action)
        return NULL;
 }
 
+/* Common stats data record shared with _kern.c */
+struct datarec {
+       __u64 processed;
+       __u64 dropped;
+       __u64 info;
+};
+#define MAX_CPUS 64
+
+/* Userspace structs for collection of stats from maps */
 struct record {
-       __u64 counter;
        __u64 timestamp;
+       struct datarec total;
+       struct datarec *cpu;
+};
+struct u64rec {
+       __u64 processed;
+};
+struct record_u64 {
+       /* record for _kern side __u64 values */
+       __u64 timestamp;
+       struct u64rec total;
+       struct u64rec *cpu;
 };
 
 struct stats_record {
-       struct record xdp_redir[REDIR_RES_MAX];
-       struct record xdp_exception[XDP_ACTION_MAX];
+       struct record_u64 xdp_redirect[REDIR_RES_MAX];
+       struct record_u64 xdp_exception[XDP_ACTION_MAX];
+       struct record xdp_cpumap_kthread;
+       struct record xdp_cpumap_enqueue[MAX_CPUS];
 };
 
-static void stats_print_headers(bool err_only)
+static bool map_collect_record(int fd, __u32 key, struct record *rec)
 {
-       if (err_only)
-               printf("\n%s\n", __doc_err_only__);
+       /* For percpu maps, userspace gets a value per possible CPU */
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       struct datarec values[nr_cpus];
+       __u64 sum_processed = 0;
+       __u64 sum_dropped = 0;
+       __u64 sum_info = 0;
+       int i;
+
+       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+               fprintf(stderr,
+                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+               return false;
+       }
+       /* Get time as close as possible to reading map contents */
+       rec->timestamp = gettime();
 
-       printf("%-14s %-11s %-10s %-18s %-9s\n",
-              "ACTION", "result", "pps ", "pps-human-readable", "measure-period");
+       /* Record and sum values from each CPU */
+       for (i = 0; i < nr_cpus; i++) {
+               rec->cpu[i].processed = values[i].processed;
+               sum_processed        += values[i].processed;
+               rec->cpu[i].dropped = values[i].dropped;
+               sum_dropped        += values[i].dropped;
+               rec->cpu[i].info = values[i].info;
+               sum_info        += values[i].info;
+       }
+       rec->total.processed = sum_processed;
+       rec->total.dropped   = sum_dropped;
+       rec->total.info      = sum_info;
+       return true;
+}
+
+static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec)
+{
+       /* For percpu maps, userspace gets a value per possible CPU */
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       struct u64rec values[nr_cpus];
+       __u64 sum_total = 0;
+       int i;
+
+       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+               fprintf(stderr,
+                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+               return false;
+       }
+       /* Get time as close as possible to reading map contents */
+       rec->timestamp = gettime();
+
+       /* Record and sum values from each CPU */
+       for (i = 0; i < nr_cpus; i++) {
+               rec->cpu[i].processed = values[i].processed;
+               sum_total            += values[i].processed;
+       }
+       rec->total.processed = sum_total;
+       return true;
 }
 
 static double calc_period(struct record *r, struct record *p)
@@ -139,77 +213,203 @@ static double calc_period(struct record *r, struct record *p)
        return period_;
 }
 
-static double calc_pps(struct record *r, struct record *p, double period)
+static double calc_period_u64(struct record_u64 *r, struct record_u64 *p)
+{
+       double period_ = 0;
+       __u64 period = 0;
+
+       period = r->timestamp - p->timestamp;
+       if (period > 0)
+               period_ = ((double) period / NANOSEC_PER_SEC);
+
+       return period_;
+}
+
+static double calc_pps(struct datarec *r, struct datarec *p, double period)
+{
+       __u64 packets = 0;
+       double pps = 0;
+
+       if (period > 0) {
+               packets = r->processed - p->processed;
+               pps = packets / period;
+       }
+       return pps;
+}
+
+static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period)
+{
+       __u64 packets = 0;
+       double pps = 0;
+
+       if (period > 0) {
+               packets = r->processed - p->processed;
+               pps = packets / period;
+       }
+       return pps;
+}
+
+static double calc_drop(struct datarec *r, struct datarec *p, double period)
+{
+       __u64 packets = 0;
+       double pps = 0;
+
+       if (period > 0) {
+               packets = r->dropped - p->dropped;
+               pps = packets / period;
+       }
+       return pps;
+}
+
+static double calc_info(struct datarec *r, struct datarec *p, double period)
 {
        __u64 packets = 0;
        double pps = 0;
 
        if (period > 0) {
-               packets = r->counter - p->counter;
+               packets = r->info - p->info;
                pps = packets / period;
        }
        return pps;
 }
 
-static void stats_print(struct stats_record *rec,
-                       struct stats_record *prev,
+static void stats_print(struct stats_record *stats_rec,
+                       struct stats_record *stats_prev,
                        bool err_only)
 {
-       double period = 0, pps = 0;
-       struct record *r, *p;
-       int i = 0;
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       int rec_i = 0, i, to_cpu;
+       double t = 0, pps = 0;
 
-       char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n";
+       /* Header */
+       printf("%-15s %-7s %-12s %-12s %-9s\n",
+              "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info");
 
        /* tracepoint: xdp:xdp_redirect_* */
        if (err_only)
-               i = REDIR_ERROR;
-
-       for (; i < REDIR_RES_MAX; i++) {
-               r = &rec->xdp_redir[i];
-               p = &prev->xdp_redir[i];
-
-               if (p->timestamp) {
-                       period = calc_period(r, p);
-                       pps = calc_pps(r, p, period);
+               rec_i = REDIR_ERROR;
+
+       for (; rec_i < REDIR_RES_MAX; rec_i++) {
+               struct record_u64 *rec, *prev;
+               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+               rec  =  &stats_rec->xdp_redirect[rec_i];
+               prev = &stats_prev->xdp_redirect[rec_i];
+               t = calc_period_u64(rec, prev);
+
+               for (i = 0; i < nr_cpus; i++) {
+                       struct u64rec *r = &rec->cpu[i];
+                       struct u64rec *p = &prev->cpu[i];
+
+                       pps = calc_pps_u64(r, p, t);
+                       if (pps > 0)
+                               printf(fmt1, "XDP_REDIRECT", i,
+                                      rec_i ? 0.0: pps, rec_i ? pps : 0.0,
+                                      err2str(rec_i));
                }
-               printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period);
+               pps = calc_pps_u64(&rec->total, &prev->total, t);
+               printf(fmt2, "XDP_REDIRECT", "total",
+                      rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i));
        }
 
        /* tracepoint: xdp:xdp_exception */
-       for (i = 0; i < XDP_ACTION_MAX; i++) {
-               r = &rec->xdp_exception[i];
-               p = &prev->xdp_exception[i];
-               if (p->timestamp) {
-                       period = calc_period(r, p);
-                       pps = calc_pps(r, p, period);
+       for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
+               struct record_u64 *rec, *prev;
+               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+               rec  =  &stats_rec->xdp_exception[rec_i];
+               prev = &stats_prev->xdp_exception[rec_i];
+               t = calc_period_u64(rec, prev);
+
+               for (i = 0; i < nr_cpus; i++) {
+                       struct u64rec *r = &rec->cpu[i];
+                       struct u64rec *p = &prev->cpu[i];
+
+                       pps = calc_pps_u64(r, p, t);
+                       if (pps > 0)
+                               printf(fmt1, "Exception", i,
+                                      0.0, pps, err2str(rec_i));
                }
+               pps = calc_pps_u64(&rec->total, &prev->total, t);
                if (pps > 0)
-                       printf(fmt, action2str(i), "Exception",
-                              pps, pps, period);
+                       printf(fmt2, "Exception", "total",
+                              0.0, pps, action2str(rec_i));
        }
-       printf("\n");
-}
 
-static __u64 get_key32_value64_percpu(int fd, __u32 key)
-{
-       /* For percpu maps, userspace gets a value per possible CPU */
-       unsigned int nr_cpus = bpf_num_possible_cpus();
-       __u64 values[nr_cpus];
-       __u64 sum = 0;
-       int i;
-
-       if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
-               fprintf(stderr,
-                       "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
-               return 0;
+       /* cpumap enqueue stats */
+       for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+               char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+               char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+               struct record *rec, *prev;
+               char *info_str = "";
+               double drop, info;
+
+               rec  =  &stats_rec->xdp_cpumap_enqueue[to_cpu];
+               prev = &stats_prev->xdp_cpumap_enqueue[to_cpu];
+               t = calc_period(rec, prev);
+               for (i = 0; i < nr_cpus; i++) {
+                       struct datarec *r = &rec->cpu[i];
+                       struct datarec *p = &prev->cpu[i];
+
+                       pps  = calc_pps(r, p, t);
+                       drop = calc_drop(r, p, t);
+                       info = calc_info(r, p, t);
+                       if (info > 0) {
+                               info_str = "bulk-average";
+                               info = pps / info; /* calc average bulk size */
+                       }
+                       if (pps > 0)
+                               printf(fmt1, "cpumap-enqueue",
+                                      i, to_cpu, pps, drop, info, info_str);
+               }
+               pps = calc_pps(&rec->total, &prev->total, t);
+               if (pps > 0) {
+                       drop = calc_drop(&rec->total, &prev->total, t);
+                       info = calc_info(&rec->total, &prev->total, t);
+                       if (info > 0) {
+                               info_str = "bulk-average";
+                               info = pps / info; /* calc average bulk size */
+                       }
+                       printf(fmt2, "cpumap-enqueue",
+                              "sum", to_cpu, pps, drop, info, info_str);
+               }
        }
 
-       /* Sum values from each CPU */
-       for (i = 0; i < nr_cpus; i++) {
-               sum += values[i];
+       /* cpumap kthread stats */
+       {
+               char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n";
+               char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n";
+               struct record *rec, *prev;
+               double drop, info;
+               char *i_str = "";
+
+               rec  =  &stats_rec->xdp_cpumap_kthread;
+               prev = &stats_prev->xdp_cpumap_kthread;
+               t = calc_period(rec, prev);
+               for (i = 0; i < nr_cpus; i++) {
+                       struct datarec *r = &rec->cpu[i];
+                       struct datarec *p = &prev->cpu[i];
+
+                       pps  = calc_pps(r, p, t);
+                       drop = calc_drop(r, p, t);
+                       info = calc_info(r, p, t);
+                       if (info > 0)
+                               i_str = "sched";
+                       if (pps > 0)
+                               printf(fmt1, "cpumap-kthread",
+                                      i, pps, drop, info, i_str);
+               }
+               pps = calc_pps(&rec->total, &prev->total, t);
+               drop = calc_drop(&rec->total, &prev->total, t);
+               info = calc_info(&rec->total, &prev->total, t);
+               if (info > 0)
+                       i_str = "sched-sum";
+               printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
        }
-       return sum;
+
+       printf("\n");
 }
 
 static bool stats_collect(struct stats_record *rec)
@@ -222,25 +422,109 @@ static bool stats_collect(struct stats_record *rec)
         */
 
        fd = map_data[0].fd; /* map0: redirect_err_cnt */
-       for (i = 0; i < REDIR_RES_MAX; i++) {
-               rec->xdp_redir[i].timestamp = gettime();
-               rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
-       }
+       for (i = 0; i < REDIR_RES_MAX; i++)
+               map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
 
        fd = map_data[1].fd; /* map1: exception_cnt */
        for (i = 0; i < XDP_ACTION_MAX; i++) {
-               rec->xdp_exception[i].timestamp = gettime();
-               rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i);
+               map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
        }
 
+       fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
+       for (i = 0; i < MAX_CPUS; i++)
+               map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
+
+       fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
+       map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
+
        return true;
 }
 
+static void *alloc_rec_per_cpu(int record_size)
+{
+       unsigned int nr_cpus = bpf_num_possible_cpus();
+       void *array;
+       size_t size;
+
+       size = record_size * nr_cpus;
+       array = malloc(size);
+       memset(array, 0, size);
+       if (!array) {
+               fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+               exit(EXIT_FAIL_MEM);
+       }
+       return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+       struct stats_record *rec;
+       int rec_sz;
+       int i;
+
+       /* Alloc main stats_record structure */
+       rec = malloc(sizeof(*rec));
+       memset(rec, 0, sizeof(*rec));
+       if (!rec) {
+               fprintf(stderr, "Mem alloc error\n");
+               exit(EXIT_FAIL_MEM);
+       }
+
+       /* Alloc stats stored per CPU for each record */
+       rec_sz = sizeof(struct u64rec);
+       for (i = 0; i < REDIR_RES_MAX; i++)
+               rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+       for (i = 0; i < XDP_ACTION_MAX; i++)
+               rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+       rec_sz = sizeof(struct datarec);
+       rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+
+       for (i = 0; i < MAX_CPUS; i++)
+               rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+       return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+       int i;
+
+       for (i = 0; i < REDIR_RES_MAX; i++)
+               free(r->xdp_redirect[i].cpu);
+
+       for (i = 0; i < XDP_ACTION_MAX; i++)
+               free(r->xdp_exception[i].cpu);
+
+       free(r->xdp_cpumap_kthread.cpu);
+
+       for (i = 0; i < MAX_CPUS; i++)
+               free(r->xdp_cpumap_enqueue[i].cpu);
+
+       free(r);
+}
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+       struct stats_record *tmp;
+
+       tmp = *a;
+       *a = *b;
+       *b = tmp;
+}
+
 static void stats_poll(int interval, bool err_only)
 {
-       struct stats_record rec, prev;
+       struct stats_record *rec, *prev;
 
-       memset(&rec, 0, sizeof(rec));
+       rec  = alloc_stats_record();
+       prev = alloc_stats_record();
+       stats_collect(rec);
+
+       if (err_only)
+               printf("\n%s\n", __doc_err_only__);
 
        /* Trick to pretty printf with thousands separators use %' */
        setlocale(LC_NUMERIC, "en_US");
@@ -258,13 +542,15 @@ static void stats_poll(int interval, bool err_only)
        fflush(stdout);
 
        while (1) {
-               memcpy(&prev, &rec, sizeof(rec));
-               stats_collect(&rec);
-               stats_print_headers(err_only);
-               stats_print(&rec, &prev, err_only);
+               swap(&prev, &rec);
+               stats_collect(rec);
+               stats_print(rec, prev, err_only);
                fflush(stdout);
                sleep(interval);
        }
+
+       free_stats_record(rec);
+       free_stats_record(prev);
 }
 
 static void print_bpf_prog_info(void)