/* samples/bpf/xdp_redirect_cpu_kern.c
 *
 * XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

#define MAX_CPUS 12 /* WARNING - sync with _user.c */

/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
	.type		= BPF_MAP_TYPE_CPUMAP,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};
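
/* A minimal userspace sketch (not part of this kernel-side object) of how
 * the companion _user.c is expected to populate cpu_map: the key is the
 * destination CPU id and the value is the queue size for that CPU's
 * cpumap kthread.  Identifiers such as "cpu_map_fd" and "qsize" are
 * illustrative assumptions, not names taken from this file:
 *
 *	__u32 cpu = 2;		// destination CPU id (map key)
 *	__u32 qsize = 192;	// frames that may be queued towards the CPU (map value)
 *
 *	if (bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0) < 0)
 *		perror("bpf_map_update_elem(cpu_map)");
 */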

/* Common stats data record to keep userspace simpler */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};

/* Count RX packets, as an XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct bpf_map_def SEC("maps") rx_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};
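
/* Hedged userspace sketch of reading this per-CPU map: a single lookup
 * returns one struct datarec per possible CPU, which userspace then sums.
 * "rx_cnt_fd" is an illustrative assumption; the CPU count would come from
 * e.g. libbpf_num_possible_cpus():
 *
 *	int nr_cpus = libbpf_num_possible_cpus();
 *	struct datarec values[nr_cpus];
 *	__u32 key = 0;
 *	__u64 sum = 0;
 *
 *	if (bpf_map_lookup_elem(rx_cnt_fd, &key, values) == 0) {
 *		for (int i = 0; i < nr_cpus; i++)
 *			sum += values[i].processed;
 *	}
 */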

/* Used by trace point */
struct bpf_map_def SEC("maps") redirect_err_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 2,
	/* TODO: have entries for all possible errno's */
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= MAX_CPUS,
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Set of maps controlling available CPUs, and for iterating through
 * selectable redirect CPUs.
 */
struct bpf_map_def SEC("maps") cpus_available = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};
struct bpf_map_def SEC("maps") cpus_count = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};
struct bpf_map_def SEC("maps") cpus_iterator = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};
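
/* Hedged sketch of how the userspace loader is expected to configure these
 * maps: cpus_available[i] holds the i'th selectable destination CPU and
 * cpus_count[0] holds how many entries are valid.  The fd variable names
 * and chosen CPU ids below are illustrative assumptions:
 *
 *	__u32 i, key0 = 0, ncpus = 3;
 *	__u32 cpus[3] = { 2, 3, 4 };	// CPUs selected as redirect targets
 *
 *	for (i = 0; i < ncpus; i++)
 *		bpf_map_update_elem(cpus_available_fd, &i, &cpus[i], 0);
 *	bpf_map_update_elem(cpus_count_fd, &key0, &ncpus, 0);
 */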

/* Used by trace point */
struct bpf_map_def SEC("maps") exception_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error or unsupported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* TODO: Handle double VLAN tagged packet (see sketch below) */

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}
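
/* Hedged sketch of the double-VLAN TODO above: a QinQ frame could be
 * handled by repeating the same bounds-checked step for a possible second
 * tag.  This is illustrative only and not wired into parse_eth():
 *
 *	if (eth_type == htons(ETH_P_8021Q) ||
 *	    eth_type == htons(ETH_P_8021AD)) {
 *		struct vlan_hdr *vlan_hdr2;
 *
 *		vlan_hdr2 = (void *)eth + offset;
 *		offset += sizeof(*vlan_hdr2);
 *		if ((void *)eth + offset > data_end)
 *			return false;
 *		eth_type = vlan_hdr2->h_vlan_encapsulated_proto;
 *	}
 */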

static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (iph->protocol != IPPROTO_UDP)
		return 0;

	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}

static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}

static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}

SEC("xdp_cpu_map0")
int  xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
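
/* Hedged userspace sketch of attaching one of these programs to a netdev.
 * It assumes a recent libbpf with bpf_xdp_attach(); the original sample's
 * _user.c loader may use older APIs.  "obj" and "ifindex" are illustrative:
 *
 *	struct bpf_program *prog;
 *	int prog_fd;
 *
 *	prog = bpf_object__find_program_by_name(obj, "xdp_prognum0_no_touch");
 *	prog_fd = bpf_program__fd(prog);
 *	if (bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_DRV_MODE, NULL) < 0)
 *		fprintf(stderr, "XDP attach failed\n");
 */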

SEC("xdp_cpu_map1_touch_data")
int  xdp_prognum1_touch_data(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u16 eth_type;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Validate packet length is at least the size of an Eth header */
	if (eth + 1 > data_end)
		return XDP_ABORTED;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
	eth_type = eth->h_proto;
	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
		rec->dropped++;
		return XDP_DROP;
	}

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map2_round_robin")
int  xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 cpu_dest;
	u32 key0 = 0;

	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_idx;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;
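
	/* Note: cpus_iterator is a PERCPU_ARRAY, so this round-robin position
	 * is kept per RX CPU; packets arriving on different RX CPUs rotate
	 * independently over the same cpus_available set.
	 */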

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map3_proto_separate")
int  xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!parse_eth(eth, data_end, &eth_proto, &l3_offset))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map4_ddos_filter_pktgen")
int  xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!parse_eth(eth, data_end, &eth_proto, &l3_offset))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

char _license[] SEC("license") = "GPL";

/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in:                kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
	int err;	//	offset:20; size:4; signed:1;
	int to_ifindex;	//	offset:24; size:4; signed:1;
	u32 map_id;	//	offset:28; size:4; signed:0;
	int map_index;	//	offset:32; size:4; signed:1;
};			//	offset:36

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};

static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but that doesn't work well
	 * in practice, as stopping perf-record also unloads this
	 * bpf_prog.  Plus, there is additional overhead of doing so.
	 */
}

SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}
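
/* Hedged sketch of attaching these tracepoint programs from userspace with
 * libbpf; the sample's own loader attaches by section name, so treat the
 * exact calls and the "obj" variable below as assumptions:
 *
 *	struct bpf_program *tp_prog;
 *	struct bpf_link *link;
 *
 *	tp_prog = bpf_object__find_program_by_name(obj, "trace_xdp_redirect_err");
 *	link = bpf_program__attach_tracepoint(tp_prog, "xdp", "xdp_redirect_err");
 *	if (!link)
 *		fprintf(stderr, "tracepoint attach failed\n");
 */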

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in:                kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in:         kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;
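
	/* Illustrative userspace calculation (names are assumptions):
	 *   avg_bulk = processed_delta / issue_delta, when issue_delta > 0
	 */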

	/* Inception: It's possible to detect overload situations via this
	 * tracepoint.  This can be used to create a feedback loop to XDP,
	 * which can then take appropriate actions to mitigate the overload.
	 */
	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in:         kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int sched;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}