Merge tag 'pm-5.2-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael...
[sfrench/cifs-2.6.git] / samples / bpf / cpustat_kern.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include <linux/version.h>
4 #include <linux/ptrace.h>
5 #include <uapi/linux/bpf.h>
6 #include "bpf_helpers.h"
7
8 /*
9  * The CPU number, cstate number and pstate number are based
10  * on 96boards Hikey with octa CA53 CPUs.
11  *
12  * Every CPU has three idle states for cstate:
13  *   WFI, CPU_OFF, CLUSTER_OFF
14  *
15  * Every CPU has 5 operating points:
16  *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
17  *
18  * This code is based on these assumptions; other platforms
19  * need to adjust these definitions.
20  */
#define MAX_CPU 8                       /* number of CPUs tracked */
#define MAX_PSTATE_ENTRIES 5            /* operating points per CPU */
#define MAX_CSTATE_ENTRIES 3            /* idle states per CPU */

/*
 * Operating-point frequencies in kHz, lowest first.  The position of a
 * frequency in this table is the pstate index used for pstate_duration.
 */
static int cpu_opps[] = {
        208000,
        432000,
        729000,
        960000,
        1200000,
};
26
27 /*
28  * my_map structure is used to record cstate and pstate index and
29  * timestamp (Idx, Ts); when a new event arrives we need to update
30  * the combination with the new state index and timestamp (Idx`, Ts`).
31  *
32  * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
33  * interval for the previous state: Duration(Idx) = Ts` - Ts.
34  *
35  * Every CPU has one array as below for recording state index and
36  * timestamp, recorded separately for cstate and pstate:
37  *
38  * +--------------------------+
39  * | cstate timestamp         |
40  * +--------------------------+
41  * | cstate index             |
42  * +--------------------------+
43  * | pstate timestamp         |
44  * +--------------------------+
45  * | pstate index             |
46  * +--------------------------+
47  */
/* Slot offsets within one CPU's group of my_map entries */
#define MAP_OFF_CSTATE_TIME 0           /* timestamp of last cstate event */
#define MAP_OFF_CSTATE_IDX 1            /* last recorded cstate value */
#define MAP_OFF_PSTATE_TIME 2           /* timestamp of last pstate event */
#define MAP_OFF_PSTATE_IDX 3            /* last recorded pstate value */
#define MAP_OFF_NUM 4                   /* slots per CPU */
53
54 struct bpf_map_def SEC("maps") my_map = {
55         .type = BPF_MAP_TYPE_ARRAY,
56         .key_size = sizeof(u32),
57         .value_size = sizeof(u64),
58         .max_entries = MAX_CPU * MAP_OFF_NUM,
59 };
60
61 /* cstate_duration records duration time for every idle state per CPU */
62 struct bpf_map_def SEC("maps") cstate_duration = {
63         .type = BPF_MAP_TYPE_ARRAY,
64         .key_size = sizeof(u32),
65         .value_size = sizeof(u64),
66         .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES,
67 };
68
69 /* pstate_duration records duration time for every operating point per CPU */
70 struct bpf_map_def SEC("maps") pstate_duration = {
71         .type = BPF_MAP_TYPE_ARRAY,
72         .key_size = sizeof(u32),
73         .value_size = sizeof(u64),
74         .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES,
75 };
76
77 /*
78  * The trace events for cpu_idle and cpu_frequency are taken from:
79  * /sys/kernel/debug/tracing/events/power/cpu_idle/format
80  * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
81  *
82  * These two events have the same format, so define one common structure.
83  */
84 struct cpu_args {
85         u64 pad;
86         u32 state;
87         u32 cpu_id;
88 };
89
90 /* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
91 static u32 find_cpu_pstate_idx(u32 frequency)
92 {
93         u32 i;
94
95         for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
96                 if (frequency == cpu_opps[i])
97                         return i;
98         }
99
100         return i;
101 }
102
103 SEC("tracepoint/power/cpu_idle")
104 int bpf_prog1(struct cpu_args *ctx)
105 {
106         u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
107         u32 key, cpu, pstate_idx;
108         u64 *val;
109
110         if (ctx->cpu_id > MAX_CPU)
111                 return 0;
112
113         cpu = ctx->cpu_id;
114
115         key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
116         cts = bpf_map_lookup_elem(&my_map, &key);
117         if (!cts)
118                 return 0;
119
120         key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
121         cstate = bpf_map_lookup_elem(&my_map, &key);
122         if (!cstate)
123                 return 0;
124
125         key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
126         pts = bpf_map_lookup_elem(&my_map, &key);
127         if (!pts)
128                 return 0;
129
130         key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
131         pstate = bpf_map_lookup_elem(&my_map, &key);
132         if (!pstate)
133                 return 0;
134
135         prev_state = *cstate;
136         *cstate = ctx->state;
137
138         if (!*cts) {
139                 *cts = bpf_ktime_get_ns();
140                 return 0;
141         }
142
143         cur_ts = bpf_ktime_get_ns();
144         delta = cur_ts - *cts;
145         *cts = cur_ts;
146
147         /*
148          * When state doesn't equal to (u32)-1, the cpu will enter
149          * one idle state; for this case we need to record interval
150          * for the pstate.
151          *
152          *                 OPP2
153          *            +---------------------+
154          *     OPP1   |                     |
155          *   ---------+                     |
156          *                                  |  Idle state
157          *                                  +---------------
158          *
159          *            |<- pstate duration ->|
160          *            ^                     ^
161          *           pts                  cur_ts
162          */
163         if (ctx->state != (u32)-1) {
164
165                 /* record pstate after have first cpu_frequency event */
166                 if (!*pts)
167                         return 0;
168
169                 delta = cur_ts - *pts;
170
171                 pstate_idx = find_cpu_pstate_idx(*pstate);
172                 if (pstate_idx >= MAX_PSTATE_ENTRIES)
173                         return 0;
174
175                 key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
176                 val = bpf_map_lookup_elem(&pstate_duration, &key);
177                 if (val)
178                         __sync_fetch_and_add((long *)val, delta);
179
180         /*
181          * When state equal to (u32)-1, the cpu just exits from one
182          * specific idle state; for this case we need to record
183          * interval for the pstate.
184          *
185          *       OPP2
186          *   -----------+
187          *              |                          OPP1
188          *              |                     +-----------
189          *              |     Idle state      |
190          *              +---------------------+
191          *
192          *              |<- cstate duration ->|
193          *              ^                     ^
194          *             cts                  cur_ts
195          */
196         } else {
197
198                 key = cpu * MAX_CSTATE_ENTRIES + prev_state;
199                 val = bpf_map_lookup_elem(&cstate_duration, &key);
200                 if (val)
201                         __sync_fetch_and_add((long *)val, delta);
202         }
203
204         /* Update timestamp for pstate as new start time */
205         if (*pts)
206                 *pts = cur_ts;
207
208         return 0;
209 }
210
211 SEC("tracepoint/power/cpu_frequency")
212 int bpf_prog2(struct cpu_args *ctx)
213 {
214         u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
215         u32 key, cpu, pstate_idx;
216         u64 *val;
217
218         cpu = ctx->cpu_id;
219
220         key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
221         pts = bpf_map_lookup_elem(&my_map, &key);
222         if (!pts)
223                 return 0;
224
225         key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
226         pstate = bpf_map_lookup_elem(&my_map, &key);
227         if (!pstate)
228                 return 0;
229
230         key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
231         cstate = bpf_map_lookup_elem(&my_map, &key);
232         if (!cstate)
233                 return 0;
234
235         prev_state = *pstate;
236         *pstate = ctx->state;
237
238         if (!*pts) {
239                 *pts = bpf_ktime_get_ns();
240                 return 0;
241         }
242
243         cur_ts = bpf_ktime_get_ns();
244         delta = cur_ts - *pts;
245         *pts = cur_ts;
246
247         /* When CPU is in idle, bail out to skip pstate statistics */
248         if (*cstate != (u32)(-1))
249                 return 0;
250
251         /*
252          * The cpu changes to another different OPP (in below diagram
253          * change frequency from OPP3 to OPP1), need recording interval
254          * for previous frequency OPP3 and update timestamp as start
255          * time for new frequency OPP1.
256          *
257          *                 OPP3
258          *            +---------------------+
259          *     OPP2   |                     |
260          *   ---------+                     |
261          *                                  |    OPP1
262          *                                  +---------------
263          *
264          *            |<- pstate duration ->|
265          *            ^                     ^
266          *           pts                  cur_ts
267          */
268         pstate_idx = find_cpu_pstate_idx(*pstate);
269         if (pstate_idx >= MAX_PSTATE_ENTRIES)
270                 return 0;
271
272         key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
273         val = bpf_map_lookup_elem(&pstate_duration, &key);
274         if (val)
275                 __sync_fetch_and_add((long *)val, delta);
276
277         return 0;
278 }
279
280 char _license[] SEC("license") = "GPL";
281 u32 _version SEC("version") = LINUX_VERSION_CODE;