1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 // Copyright (c) 2021 Facebook
3 // Copyright (c) 2021 Google
5 #include <bpf/bpf_helpers.h>
6 #include <bpf/bpf_tracing.h>
7 #include <bpf/bpf_core_read.h>
9 #define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary
10 #define MAX_EVENTS 32 // max events per cgroup: arbitrary
12 // NOTE: many of the maps and global data will be modified from
13 // userspace (by the perf tool) using the skeleton helpers before loading.
15 // single set of global perf events to measure
// Read in bperf_cgroup_count() through bpf_perf_event_read_value() with the
// key "idx * num_cpus + cpu", i.e. one slot per (event index, cpu) pair.
// NOTE(review): max_entries is 1 here; presumably userspace resizes the map
// before loading (see the NOTE near the top of the file) -- confirm in the loader.
17 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
18 __uint(key_size, sizeof(__u32));
19 __uint(value_size, sizeof(int));
20 __uint(max_entries, 1);
21 } events SEC(".maps");
23 // from cgroup id to event index
// Key: 64-bit cgroup id. Value: 32-bit index that get_cgroup_v1_idx() and
// get_cgroup_v2_idx() copy into the caller's cgrps[] array.
// NOTE(review): bperf_cgroup_count() builds the cgrp_readings key as
// "cgrp * num_events + idx", so the stored value looks like a per-cgroup index
// rather than an event index -- confirm against the userspace side.
// NOTE(review): max_entries is 1 here; presumably resized from userspace before load.
25 __uint(type, BPF_MAP_TYPE_HASH);
26 __uint(key_size, sizeof(__u64));
27 __uint(value_size, sizeof(__u32));
28 __uint(max_entries, 1);
29 } cgrp_idx SEC(".maps");
31 // per-cpu event snapshots to calculate delta
// bperf_cgroup_count() looks an entry up here and subtracts its
// counter/enabled/running fields from a fresh bpf_perf_event_read_value()
// result to obtain the delta since the previous reading.
// No max_entries is declared in this definition.
// NOTE(review): presumably it is set from userspace before loading -- confirm.
33 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
34 __uint(key_size, sizeof(__u32));
35 __uint(value_size, sizeof(struct bpf_perf_event_value));
36 } prev_readings SEC(".maps");
38 // aggregated event values for each cgroup (per-cpu)
39 // will be read from the user-space
// Indexed in bperf_cgroup_count() with the key "cgrp * num_events + idx";
// the matching entry has the computed delta added to its counter, enabled
// and running fields.
// No max_entries is declared in this definition.
// NOTE(review): presumably it is set from userspace before loading -- confirm.
41 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
42 __uint(key_size, sizeof(__u32));
43 __uint(value_size, sizeof(struct bpf_perf_event_value));
44 } cgrp_readings SEC(".maps");
46 /* new kernel cgroup definition */
/*
 * Layout in which a cgroup exposes an array of ancestor cgroup pointers.
 * get_cgroup_v1_ancestor_id() uses this type when
 * bpf_core_field_exists(cgrp_new->ancestors) is true.
 * NOTE(review): the opening "struct ... {" line and any other members are
 * not visible in this view.
 */
49 struct cgroup *ancestors[];
50 } __attribute__((preserve_access_index));
52 /* old kernel cgroup definition */
/*
 * Fallback layout used by get_cgroup_v1_ancestor_id(), which reads
 * ancestor_ids[level] through a pointer of this type.
 * NOTE(review): the struct's opening line and members are not visible in
 * this view; only the closing line below is.
 */
56 } __attribute__((preserve_access_index));
// Number of events and CPUs, used to compute map keys in bperf_cgroup_count():
// "idx * num_cpus + cpu" for `events` and "cgrp * num_events + idx" for
// `cgrp_readings`. Both default to 1 here; per the NOTE near the top of the
// file, global data is expected to be modified from userspace before loading.
58 const volatile __u32 num_events = 1;
59 const volatile __u32 num_cpus = 1;
// NOTE(review): presumably selects get_cgroup_v2_idx() over get_cgroup_v1_idx()
// in bperf_cgroup_count(); the condition that reads it is not visible here.
62 int use_cgroup_v2 = 0;
// Cached perf_event cgroup subsystem id; -1 means "not resolved yet", and
// get_cgroup_v1_idx() fills it in the first time it sees -1.
63 int perf_subsys_id = -1;
/*
 * Return the cgroup id of the ancestor of @cgrp at hierarchy depth @level.
 *
 * Two kernel layouts are handled with a CO-RE field-existence check:
 *  - if struct cgroup has an `ancestors` member, the id is read as
 *    cgrp->ancestors[level]->kn->id;
 *  - otherwise it is read from cgrp->ancestor_ids[level].
 *
 * @cgrp:  cgroup whose ancestor is wanted
 * @level: index into the ancestor array; no range check is visible here, so
 *         the caller is presumably responsible for keeping it in bounds
 */
65 static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
67 /* recast pointer to capture new type for compiler */
68 struct cgroup___new *cgrp_new = (void *)cgrp;
70 if (bpf_core_field_exists(cgrp_new->ancestors)) {
71 return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
73 /* recast pointer to capture old type for compiler */
74 struct cgroup___old *cgrp_old = (void *)cgrp;
76 return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
/*
 * Collect the map indexes of the current task's perf_event cgroup (cgroup v1)
 * and its ancestors into @cgrps.
 *
 * For each hierarchy level i the ancestor's cgroup id is looked up in the
 * cgrp_idx hash map and the mapped value is appended to @cgrps.
 *
 * @cgrps: output array receiving the mapped indexes
 * @size:  NOTE(review): presumably the capacity of @cgrps; the check that
 *         uses it is not visible in this view
 *
 * NOTE(review): the declaration of `i`, the handling of `level` and of a
 * failed lookup, and the return statement are not visible here; the function
 * presumably returns the number of entries written (cnt).
 */
80 static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
82 struct task_struct *p = (void *)bpf_get_current_task();
// Resolve the perf_event subsystem id only once; the result is cached in the
// global perf_subsys_id.
89 if (perf_subsys_id == -1) {
// Use a CO-RE relocated enum value when the compiler supports it, otherwise
// fall back to the constant perf_event_cgrp_id.
90 #if __has_builtin(__builtin_preserve_enum_value)
91 perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
94 perf_subsys_id = perf_event_cgrp_id;
// Follow task->cgroups->subsys[perf_subsys_id]->cgroup, then read its level.
97 cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
98 level = BPF_CORE_READ(cgrp, level);
// Only cnt is initialised in the loop header; `i` must already hold its
// starting value from a declaration that is not visible here.
100 for (cnt = 0; i < MAX_LEVELS; i++) {
106 // convert cgroup-id to a map index
107 cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
108 elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
112 cgrps[cnt++] = *elem;
/*
 * cgroup v2 counterpart of get_cgroup_v1_idx(): for each hierarchy level i,
 * obtain the current task's ancestor cgroup id with
 * bpf_get_current_ancestor_cgroup_id(i), look it up in the cgrp_idx hash map
 * and append the mapped value to @cgrps.
 *
 * @cgrps: output array receiving the mapped indexes
 * @size:  NOTE(review): presumably the capacity of @cgrps; the check that
 *         uses it is not visible in this view
 *
 * NOTE(review): the declaration of `i`, the handling of a failed lookup and
 * the return statement are not visible here; the function presumably returns
 * the number of entries written (cnt).
 */
120 static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
// Only cnt is initialised in the loop header; `i` must already hold its
// starting value from a declaration that is not visible here.
126 for (cnt = 0; i < MAX_LEVELS; i++) {
127 __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);
132 // convert cgroup-id to a map index
133 elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
137 cgrps[cnt++] = *elem;
/*
 * Core accounting routine shared by the BPF programs in this file.
 *
 * Visible steps:
 *  1. Resolve the current task's cgroup indexes into a local array via
 *     get_cgroup_v2_idx() or get_cgroup_v1_idx().
 *  2. For each event index idx (bounded by MAX_EVENTS and num_events):
 *     a. look up the previous snapshot in prev_readings, writing a zeroed
 *        value and looking it up again on one path;
 *     b. read the current value from the `events` perf-event array at
 *        key "idx * num_cpus + cpu";
 *     c. compute delta = current - previous for counter/enabled/running;
 *     d. loop over up to MAX_LEVELS cgroup slots and add the delta to the
 *        cgrp_readings entry at key "cgrp * num_events + idx".
 *
 * NOTE(review): several control-flow lines are not visible in this view (the
 * v1/v2 selection, NULL and error checks, loop exits, the store of the new
 * snapshot, the return value) -- check the complete file before changing
 * statement order; the code is written to satisfy the BPF verifier.
 */
145 static int bperf_cgroup_count(void)
147 register __u32 idx = 0; // to have it in a register to pass BPF verifier
149 struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
150 __u32 cpu = bpf_get_smp_processor_id();
// Local array of cgroup indexes; inside this function it shadows the global
// cgrp_idx hash map of the same name.
151 __u32 cgrp_idx[MAX_LEVELS];
// NOTE(review): only one of the next two calls is expected to run; the
// condition selecting between them is not visible here.
157 cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
159 cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);
// idx is initialised at its declaration, hence the empty init clause.
161 for ( ; idx < MAX_EVENTS; idx++) {
162 if (idx == num_events)
165 // XXX: do not pass idx directly (for verifier)
167 // this is per-cpu array for diff
168 prev_val = bpf_map_lookup_elem(&prev_readings, &key);
// Write a zeroed snapshot, then look the entry up again to get a pointer to it.
// NOTE(review): presumably this path runs only when the first lookup returned
// NULL -- the guarding condition is not visible here.
170 val.counter = val.enabled = val.running = 0;
171 bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);
173 prev_val = bpf_map_lookup_elem(&prev_readings, &key);
178 // read from global perf_event array
179 key = idx * num_cpus + cpu;
180 err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
// Difference between the fresh reading and the stored snapshot.
185 delta.counter = val.counter - prev_val->counter;
186 delta.enabled = val.enabled - prev_val->enabled;
187 delta.running = val.running - prev_val->running;
189 for (c = 0; c < MAX_LEVELS; c++) {
195 // aggregate the result by cgroup
196 key = cgrp * num_events + idx;
197 cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
199 cgrp_val->counter += delta.counter;
200 cgrp_val->enabled += delta.enabled;
201 cgrp_val->running += delta.running;
// NOTE(review): presumably the fallback for a missing cgrp_readings entry;
// the remaining arguments of this call are not visible here.
203 bpf_map_update_elem(&cgrp_readings, &key,
214 // This will be attached to cgroup-switches event for each cpu
// Thin entry point: all work is done in bperf_cgroup_count(), whose return
// value is passed through unchanged.
// NOTE(review): the SEC() annotation for this program is not visible in this view.
216 int BPF_PROG(on_cgrp_switch)
218 return bperf_cgroup_count();
// Program placed in the raw_tp/sched_switch section. Like on_cgrp_switch, it
// only calls bperf_cgroup_count() and returns its result.
// NOTE(review): the name suggests it exists so a reading can be triggered on
// demand -- confirm how userspace attaches or runs it.
221 SEC("raw_tp/sched_switch")
222 int BPF_PROG(trigger_read)
224 return bperf_cgroup_count();
// License string emitted into the "license" section of the BPF object.
227 char LICENSE[] SEC("license") = "Dual BSD/GPL";