// SPDX-License-Identifier: GPL-2.0-only
/*
 * thread-stack.c: Synthesize a thread's stack using call / return events
 * Copyright (c) 2014, Intel Corporation.
 */

#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/zalloc.h>
#include <errno.h>
#include <stdlib.h>
#include "thread.h"
#include "event.h"
#include "machine.h"
#include "env.h"
#include "debug.h"
#include "symbol.h"
#include "comm.h"
#include "call-path.h"
#include "thread-stack.h"

#define STACK_GROWTH 2048

/*
 * State of retpoline detection.
 *
 * RETPOLINE_NONE: no retpoline detection
 * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
 * X86_RETPOLINE_DETECTED: x86 retpoline detected
 */
enum retpoline_state_t {
        RETPOLINE_NONE,
        X86_RETPOLINE_POSSIBLE,
        X86_RETPOLINE_DETECTED,
};

/**
 * struct thread_stack_entry - thread stack entry.
 * @ret_addr: return address
 * @timestamp: timestamp (if known)
 * @ref: external reference (e.g. db_id of sample)
 * @branch_count: the branch count when the entry was created
 * @insn_count: the instruction count when the entry was created
 * @cyc_count: the cycle count when the entry was created
 * @db_id: id used for db-export
 * @cp: call path
 * @no_call: a 'call' was not seen
 * @trace_end: a 'call' but trace ended
 * @non_call: a branch but not a 'call' to the start of a different symbol
 */
struct thread_stack_entry {
        u64 ret_addr;
        u64 timestamp;
        u64 ref;
        u64 branch_count;
        u64 insn_count;
        u64 cyc_count;
        u64 db_id;
        struct call_path *cp;
        bool no_call;
        bool trace_end;
        bool non_call;
};

/**
 * struct thread_stack - thread stack constructed from 'call' and 'return'
 *                       branch samples.
 * @stack: array that holds the stack
 * @cnt: number of entries in the stack
 * @sz: current maximum stack size
 * @trace_nr: current trace number
 * @branch_count: running branch count
 * @insn_count: running instruction count
 * @cyc_count: running cycle count
 * @kernel_start: kernel start address
 * @last_time: last timestamp
 * @crp: call/return processor
 * @comm: current comm
 * @arr_sz: size of array if this is the first element of an array
 * @rstate: used to detect retpolines
 */
struct thread_stack {
        struct thread_stack_entry *stack;
        size_t cnt;
        size_t sz;
        u64 trace_nr;
        u64 branch_count;
        u64 insn_count;
        u64 cyc_count;
        u64 kernel_start;
        u64 last_time;
        struct call_return_processor *crp;
        struct comm *comm;
        unsigned int arr_sz;
        enum retpoline_state_t rstate;
};

/*
 * Assume pid == tid == 0 identifies the idle task as defined by
 * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
 * and therefore requires a stack for each cpu.
 */
static inline bool thread_stack__per_cpu(struct thread *thread)
{
        return !(thread->tid || thread->pid_);
}

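/*
 * Grow the stack array by STACK_GROWTH entries. Existing entries are
 * preserved by realloc(); on failure the old allocation is left intact and
 * -ENOMEM is returned.
 */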
static int thread_stack__grow(struct thread_stack *ts)
{
        struct thread_stack_entry *new_stack;
        size_t sz, new_sz;

        new_sz = ts->sz + STACK_GROWTH;
        sz = new_sz * sizeof(struct thread_stack_entry);

        new_stack = realloc(ts->stack, sz);
        if (!new_stack)
                return -ENOMEM;

        ts->stack = new_stack;
        ts->sz = new_sz;

        return 0;
}

static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
                              struct call_return_processor *crp)
{
        int err;

        err = thread_stack__grow(ts);
        if (err)
                return err;

        if (thread->mg && thread->mg->machine) {
                struct machine *machine = thread->mg->machine;
                const char *arch = perf_env__arch(machine->env);

                ts->kernel_start = machine__kernel_start(machine);
                if (!strcmp(arch, "x86"))
                        ts->rstate = X86_RETPOLINE_POSSIBLE;
        } else {
                ts->kernel_start = 1ULL << 63;
        }
        ts->crp = crp;

        return 0;
}

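/*
 * Allocate (or extend) the thread's stack array and return the entry for
 * @cpu. For the idle task there is one thread_stack per cpu, so the array is
 * grown to a power-of-two size that covers @cpu; ordinary threads use a
 * single-element array.
 */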
static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
                                              struct call_return_processor *crp)
{
        struct thread_stack *ts = thread->ts, *new_ts;
        unsigned int old_sz = ts ? ts->arr_sz : 0;
        unsigned int new_sz = 1;

        if (thread_stack__per_cpu(thread) && cpu > 0)
                new_sz = roundup_pow_of_two(cpu + 1);

        if (!ts || new_sz > old_sz) {
                new_ts = calloc(new_sz, sizeof(*ts));
                if (!new_ts)
                        return NULL;
                if (ts)
                        memcpy(new_ts, ts, old_sz * sizeof(*ts));
                new_ts->arr_sz = new_sz;
                zfree(&thread->ts);
                thread->ts = new_ts;
                ts = new_ts;
        }

        if (thread_stack__per_cpu(thread) && cpu > 0 &&
            (unsigned int)cpu < ts->arr_sz)
                ts += cpu;

        if (!ts->stack &&
            thread_stack__init(ts, thread, crp))
                return NULL;

        return ts;
}

static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
{
        struct thread_stack *ts = thread->ts;

        if (cpu < 0)
                cpu = 0;

        if (!ts || (unsigned int)cpu >= ts->arr_sz)
                return NULL;

        ts += cpu;

        if (!ts->stack)
                return NULL;

        return ts;
}

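/*
 * Return the thread_stack to use for @thread. The idle task gets a per-cpu
 * stack (see thread_stack__per_cpu()); all other threads use a single stack
 * regardless of cpu.
 */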
static inline struct thread_stack *thread__stack(struct thread *thread,
                                                 int cpu)
{
        if (!thread)
                return NULL;

        if (thread_stack__per_cpu(thread))
                return thread__cpu_stack(thread, cpu);

        return thread->ts;
}

static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
                              bool trace_end)
{
        int err = 0;

        if (ts->cnt == ts->sz) {
                err = thread_stack__grow(ts);
                if (err) {
                        pr_warning("Out of memory: discarding thread stack\n");
                        ts->cnt = 0;
                }
        }

        ts->stack[ts->cnt].trace_end = trace_end;
        ts->stack[ts->cnt++].ret_addr = ret_addr;

        return err;
}

static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr)
{
        size_t i;

        /*
         * In some cases there may be functions which are not seen to return.
         * For example, when setjmp() / longjmp() has been used, or when the
         * perf context switch in the kernel does not stop and start tracing
         * in exactly the same code path.  When that happens the return
         * address will be further down the stack.  If the return address is
         * not found at all, we assume the opposite (i.e. this is a return for
         * a call that wasn't seen for some reason) and leave the stack alone.
         */
        for (i = ts->cnt; i; ) {
                if (ts->stack[--i].ret_addr == ret_addr) {
                        ts->cnt = i;
                        return;
                }
        }
}

static void thread_stack__pop_trace_end(struct thread_stack *ts)
{
        size_t i;

        for (i = ts->cnt; i; ) {
                if (ts->stack[--i].trace_end)
                        ts->cnt = i;
                else
                        return;
        }
}

static bool thread_stack__in_kernel(struct thread_stack *ts)
{
        if (!ts->cnt)
                return false;

        return ts->stack[ts->cnt - 1].cp->in_kernel;
}

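/*
 * Pass a completed call/return pair for stack entry @idx to the call/return
 * processor. Branch, instruction and cycle counts are reported as deltas
 * between the running counts now and when the entry was pushed.
 */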
static int thread_stack__call_return(struct thread *thread,
                                     struct thread_stack *ts, size_t idx,
                                     u64 timestamp, u64 ref, bool no_return)
{
        struct call_return_processor *crp = ts->crp;
        struct thread_stack_entry *tse;
        struct call_return cr = {
                .thread = thread,
                .comm = ts->comm,
                .db_id = 0,
        };
        u64 *parent_db_id;

        tse = &ts->stack[idx];
        cr.cp = tse->cp;
        cr.call_time = tse->timestamp;
        cr.return_time = timestamp;
        cr.branch_count = ts->branch_count - tse->branch_count;
        cr.insn_count = ts->insn_count - tse->insn_count;
        cr.cyc_count = ts->cyc_count - tse->cyc_count;
        cr.db_id = tse->db_id;
        cr.call_ref = tse->ref;
        cr.return_ref = ref;
        if (tse->no_call)
                cr.flags |= CALL_RETURN_NO_CALL;
        if (no_return)
                cr.flags |= CALL_RETURN_NO_RETURN;
        if (tse->non_call)
                cr.flags |= CALL_RETURN_NON_CALL;

        /*
         * The parent db_id must be assigned before exporting the child. Note
         * it is not possible to export the parent first because its
         * information is not yet complete: its 'return' has not yet been
         * processed.
         */
        parent_db_id = idx ? &(tse - 1)->db_id : NULL;

        return crp->process(&cr, parent_db_id, crp->data);
}

static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
{
        struct call_return_processor *crp = ts->crp;
        int err;

        if (!crp) {
                ts->cnt = 0;
                return 0;
        }

        while (ts->cnt) {
                err = thread_stack__call_return(thread, ts, --ts->cnt,
                                                ts->last_time, 0, true);
                if (err) {
                        pr_err("Error flushing thread stack!\n");
                        ts->cnt = 0;
                        return err;
                }
        }

        return 0;
}

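/*
 * Flush all of @thread's stacks. Entries still on a stack are reported to
 * the call/return processor, if there is one, as calls that did not return.
 */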
int thread_stack__flush(struct thread *thread)
{
        struct thread_stack *ts = thread->ts;
        unsigned int pos;
        int err = 0;

        if (ts) {
                for (pos = 0; pos < ts->arr_sz; pos++) {
                        int ret = __thread_stack__flush(thread, ts + pos);

                        if (ret)
                                err = ret;
                }
        }

        return err;
}

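/*
 * Record a branch on the thread's stack. This is used when only the stack
 * itself is needed, e.g. to synthesize call chains with
 * thread_stack__sample(). An illustrative call from a trace decoder, using
 * hypothetical local variables for the decoded branch, might look like:
 *
 *	err = thread_stack__event(thread, cpu, flags, from_ip, to_ip,
 *				  insn_len, trace_nr);
 *
 * When a call/return processor is in use (see thread_stack__process()), this
 * function only maintains the trace number (flushing the stack when it
 * changes) and does not push or pop entries.
 */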
int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
                        u64 to_ip, u16 insn_len, u64 trace_nr)
{
        struct thread_stack *ts = thread__stack(thread, cpu);

        if (!thread)
                return -EINVAL;

        if (!ts) {
                ts = thread_stack__new(thread, cpu, NULL);
                if (!ts) {
                        pr_warning("Out of memory: no thread stack\n");
                        return -ENOMEM;
                }
                ts->trace_nr = trace_nr;
        }

        /*
         * When the trace is discontinuous, the trace_nr changes.  In that case
         * the stack might be completely invalid.  Better to report nothing than
         * to report something misleading, so flush the stack.
         */
        if (trace_nr != ts->trace_nr) {
                if (ts->trace_nr)
                        __thread_stack__flush(thread, ts);
                ts->trace_nr = trace_nr;
        }

        /* Stop here if thread_stack__process() is in use */
        if (ts->crp)
                return 0;

        if (flags & PERF_IP_FLAG_CALL) {
                u64 ret_addr;

                if (!to_ip)
                        return 0;
                ret_addr = from_ip + insn_len;
                if (ret_addr == to_ip)
                        return 0; /* Zero-length calls are excluded */
                return thread_stack__push(ts, ret_addr,
                                          flags & PERF_IP_FLAG_TRACE_END);
        } else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
                /*
                 * If the caller did not change the trace number (which would
                 * have flushed the stack) then try to make sense of the stack.
                 * Possibly, tracing began after returning to the current
                 * address, so try to pop that. Also, do not expect a call made
                 * when the trace ended to return, so pop that too.
                 */
                thread_stack__pop(ts, to_ip);
                thread_stack__pop_trace_end(ts);
        } else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
                thread_stack__pop(ts, to_ip);
        }

        return 0;
}

void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
{
        struct thread_stack *ts = thread__stack(thread, cpu);

        if (!ts)
                return;

        if (trace_nr != ts->trace_nr) {
                if (ts->trace_nr)
                        __thread_stack__flush(thread, ts);
                ts->trace_nr = trace_nr;
        }
}

static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
{
        __thread_stack__flush(thread, ts);
        zfree(&ts->stack);
}

static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
{
        unsigned int arr_sz = ts->arr_sz;

        __thread_stack__free(thread, ts);
        memset(ts, 0, sizeof(*ts));
        ts->arr_sz = arr_sz;
}

void thread_stack__free(struct thread *thread)
{
        struct thread_stack *ts = thread->ts;
        unsigned int pos;

        if (ts) {
                for (pos = 0; pos < ts->arr_sz; pos++)
                        __thread_stack__free(thread, ts + pos);
                zfree(&thread->ts);
        }
}

static inline u64 callchain_context(u64 ip, u64 kernel_start)
{
        return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
}

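/*
 * Synthesize a call chain from the thread's stack. The chain starts with a
 * context entry (PERF_CONTEXT_USER or PERF_CONTEXT_KERNEL) followed by the
 * sample ip, then the stacked return addresses from innermost to outermost,
 * with a new context entry inserted wherever the addresses cross the
 * user/kernel boundary. For example (illustrative addresses only), a
 * user-space sample with two stacked return addresses would produce:
 *
 *	chain->ips[] = { PERF_CONTEXT_USER, ip, ret_addr1, ret_addr0 }
 *	chain->nr    = 4
 */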
void thread_stack__sample(struct thread *thread, int cpu,
                          struct ip_callchain *chain,
                          size_t sz, u64 ip, u64 kernel_start)
{
        struct thread_stack *ts = thread__stack(thread, cpu);
        u64 context = callchain_context(ip, kernel_start);
        u64 last_context;
        size_t i, j;

        if (sz < 2) {
                chain->nr = 0;
                return;
        }

        chain->ips[0] = context;
        chain->ips[1] = ip;

        if (!ts) {
                chain->nr = 2;
                return;
        }

        last_context = context;

        for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
                ip = ts->stack[ts->cnt - j].ret_addr;
                context = callchain_context(ip, kernel_start);
                if (context != last_context) {
                        if (i >= sz - 1)
                                break;
                        chain->ips[i++] = context;
                        last_context = context;
                }
                chain->ips[i] = ip;
        }

        chain->nr = i;
}

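/*
 * Create a call/return processor. @process is called once for each completed
 * (or synthesized) call/return pair. An illustrative setup, with a
 * hypothetical callback and data pointer, might be:
 *
 *	static int my_process(struct call_return *cr, u64 *parent_db_id,
 *			      void *data)
 *	{
 *		...
 *		return 0;
 *	}
 *
 *	crp = call_return_processor__new(my_process, my_data);
 *
 * The processor is then passed to thread_stack__process() for each branch
 * sample.
 */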
struct call_return_processor *
call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
                           void *data)
{
        struct call_return_processor *crp;

        crp = zalloc(sizeof(struct call_return_processor));
        if (!crp)
                return NULL;
        crp->cpr = call_path_root__new();
        if (!crp->cpr)
                goto out_free;
        crp->process = process;
        crp->data = data;
        return crp;

out_free:
        free(crp);
        return NULL;
}

void call_return_processor__free(struct call_return_processor *crp)
{
        if (crp) {
                call_path_root__free(crp->cpr);
                free(crp);
        }
}

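/*
 * Push an entry together with its call path, for use with the call/return
 * processor. Unlike thread_stack__push(), an allocation failure here is
 * returned to the caller rather than discarding the stack.
 */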
static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
                                 u64 timestamp, u64 ref, struct call_path *cp,
                                 bool no_call, bool trace_end)
{
        struct thread_stack_entry *tse;
        int err;

        if (!cp)
                return -ENOMEM;

        if (ts->cnt == ts->sz) {
                err = thread_stack__grow(ts);
                if (err)
                        return err;
        }

        tse = &ts->stack[ts->cnt++];
        tse->ret_addr = ret_addr;
        tse->timestamp = timestamp;
        tse->ref = ref;
        tse->branch_count = ts->branch_count;
        tse->insn_count = ts->insn_count;
        tse->cyc_count = ts->cyc_count;
        tse->cp = cp;
        tse->no_call = no_call;
        tse->trace_end = trace_end;
        tse->non_call = false;
        tse->db_id = 0;

        return 0;
}

static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
                                u64 ret_addr, u64 timestamp, u64 ref,
                                struct symbol *sym)
{
        int err;

        if (!ts->cnt)
                return 1;

        if (ts->cnt == 1) {
                struct thread_stack_entry *tse = &ts->stack[0];

                if (tse->cp->sym == sym)
                        return thread_stack__call_return(thread, ts, --ts->cnt,
                                                         timestamp, ref, false);
        }

        if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
            !ts->stack[ts->cnt - 1].non_call) {
                return thread_stack__call_return(thread, ts, --ts->cnt,
                                                 timestamp, ref, false);
        } else {
                size_t i = ts->cnt - 1;

                while (i--) {
                        if (ts->stack[i].ret_addr != ret_addr ||
                            ts->stack[i].non_call)
                                continue;
                        i += 1;
                        while (ts->cnt > i) {
                                err = thread_stack__call_return(thread, ts,
                                                                --ts->cnt,
                                                                timestamp, ref,
                                                                true);
                                if (err)
                                        return err;
                        }
                        return thread_stack__call_return(thread, ts, --ts->cnt,
                                                         timestamp, ref, false);
                }
        }

        return 1;
}

static int thread_stack__bottom(struct thread_stack *ts,
                                struct perf_sample *sample,
                                struct addr_location *from_al,
                                struct addr_location *to_al, u64 ref)
{
        struct call_path_root *cpr = ts->crp->cpr;
        struct call_path *cp;
        struct symbol *sym;
        u64 ip;

        if (sample->ip) {
                ip = sample->ip;
                sym = from_al->sym;
        } else if (sample->addr) {
                ip = sample->addr;
                sym = to_al->sym;
        } else {
                return 0;
        }

        cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
                                ts->kernel_start);

        return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
                                     true, false);
}

static int thread_stack__pop_ks(struct thread *thread, struct thread_stack *ts,
                                struct perf_sample *sample, u64 ref)
{
        u64 tm = sample->time;
        int err;

        /* Return to userspace, so pop all kernel addresses */
        while (thread_stack__in_kernel(ts)) {
                err = thread_stack__call_return(thread, ts, --ts->cnt,
                                                tm, ref, true);
                if (err)
                        return err;
        }

        return 0;
}

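/*
 * Handle a 'return' for which no matching 'call' is on the stack, e.g.
 * because the call happened before the trace started or the trace is
 * discontinuous. Kernel entries are popped on a return to user space, and
 * plausible entries are pushed/popped so that the call graph stays
 * consistent.
 */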
static int thread_stack__no_call_return(struct thread *thread,
                                        struct thread_stack *ts,
                                        struct perf_sample *sample,
                                        struct addr_location *from_al,
                                        struct addr_location *to_al, u64 ref)
{
        struct call_path_root *cpr = ts->crp->cpr;
        struct call_path *root = &cpr->call_path;
        struct symbol *fsym = from_al->sym;
        struct symbol *tsym = to_al->sym;
        struct call_path *cp, *parent;
        u64 ks = ts->kernel_start;
        u64 addr = sample->addr;
        u64 tm = sample->time;
        u64 ip = sample->ip;
        int err;

        if (ip >= ks && addr < ks) {
                /* Return to userspace, so pop all kernel addresses */
                err = thread_stack__pop_ks(thread, ts, sample, ref);
                if (err)
                        return err;

                /* If the stack is empty, push the userspace address */
                if (!ts->cnt) {
                        cp = call_path__findnew(cpr, root, tsym, addr, ks);
                        return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
                                                     false);
                }
        } else if (thread_stack__in_kernel(ts) && ip < ks) {
                /* Return to userspace, so pop all kernel addresses */
                err = thread_stack__pop_ks(thread, ts, sample, ref);
                if (err)
                        return err;
        }

        if (ts->cnt)
                parent = ts->stack[ts->cnt - 1].cp;
        else
                parent = root;

        if (parent->sym == from_al->sym) {
                /*
                 * At the bottom of the stack, assume the missing 'call' was
                 * before the trace started. So, pop the current symbol and push
                 * the 'to' symbol.
                 */
                if (ts->cnt == 1) {
                        err = thread_stack__call_return(thread, ts, --ts->cnt,
                                                        tm, ref, false);
                        if (err)
                                return err;
                }

                if (!ts->cnt) {
                        cp = call_path__findnew(cpr, root, tsym, addr, ks);

                        return thread_stack__push_cp(ts, addr, tm, ref, cp,
                                                     true, false);
                }

                /*
                 * Otherwise assume the 'return' is being used as a jump (e.g.
                 * retpoline) and just push the 'to' symbol.
                 */
                cp = call_path__findnew(cpr, parent, tsym, addr, ks);

                err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
                if (!err)
                        ts->stack[ts->cnt - 1].non_call = true;

                return err;
        }

        /*
         * Assume 'parent' has not yet returned, so push 'to', and then push and
         * pop 'from'.
         */

        cp = call_path__findnew(cpr, parent, tsym, addr, ks);

        err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
        if (err)
                return err;

        cp = call_path__findnew(cpr, cp, fsym, ip, ks);

        err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
        if (err)
                return err;

        return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
}

static int thread_stack__trace_begin(struct thread *thread,
                                     struct thread_stack *ts, u64 timestamp,
                                     u64 ref)
{
        struct thread_stack_entry *tse;
        int err;

        if (!ts->cnt)
                return 0;

        /* Pop trace end */
        tse = &ts->stack[ts->cnt - 1];
        if (tse->trace_end) {
                err = thread_stack__call_return(thread, ts, --ts->cnt,
                                                timestamp, ref, false);
                if (err)
                        return err;
        }

        return 0;
}

static int thread_stack__trace_end(struct thread_stack *ts,
                                   struct perf_sample *sample, u64 ref)
{
        struct call_path_root *cpr = ts->crp->cpr;
        struct call_path *cp;
        u64 ret_addr;

        /* No point having 'trace end' on the bottom of the stack */
        if (!ts->cnt || (ts->cnt == 1 && ts->stack[0].ref == ref))
                return 0;

        cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
                                ts->kernel_start);

        ret_addr = sample->ip + sample->insn_len;

        return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
                                     false, true);
}

static bool is_x86_retpoline(const char *name)
{
        const char *p = strstr(name, "__x86_indirect_thunk_");

        return p == name || !strcmp(name, "__indirect_thunk_start");
}

/*
 * x86 retpoline functions pollute the call graph. This function removes them.
 * This does not handle function return thunks, nor is there any improvement
 * for the handling of inline thunks or extern thunks.
 */
static int thread_stack__x86_retpoline(struct thread_stack *ts,
                                       struct perf_sample *sample,
                                       struct addr_location *to_al)
{
        struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
        struct call_path_root *cpr = ts->crp->cpr;
        struct symbol *sym = tse->cp->sym;
        struct symbol *tsym = to_al->sym;
        struct call_path *cp;

        if (sym && is_x86_retpoline(sym->name)) {
                /*
                 * This is an x86 retpoline function. It pollutes the call
                 * graph by showing up everywhere there is an indirect branch,
                 * but does not itself mean anything. Here the top-of-stack is
                 * removed, by decrementing the stack count, and then further
                 * down, the resulting top-of-stack is replaced with the actual
                 * target. The result is that the retpoline functions will no
                 * longer appear in the call graph. Note this only affects the
                 * call graph, since all the original branches are left
                 * unchanged.
                 */
                ts->cnt -= 1;
                sym = ts->stack[ts->cnt - 2].cp->sym;
                if (sym && sym == tsym && to_al->addr != tsym->start) {
                        /*
                         * Target is back to the middle of the symbol we came
                         * from so assume it is an indirect jmp and forget it
                         * altogether.
                         */
                        ts->cnt -= 1;
                        return 0;
                }
        } else if (sym && sym == tsym) {
                /*
                 * Target is back to the symbol we came from so assume it is an
                 * indirect jmp and forget it altogether.
                 */
                ts->cnt -= 1;
                return 0;
        }

        cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
                                sample->addr, ts->kernel_start);
        if (!cp)
                return -ENOMEM;

        /* Replace the top-of-stack with the actual target */
        ts->stack[ts->cnt - 1].cp = cp;

        return 0;
}

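/*
 * Process one branch sample and feed completed call/return pairs to @crp,
 * e.g. for exporting call/return information. This is the entry point used
 * when a full call graph is wanted; it supersedes any stack previously built
 * by thread_stack__event() for the same thread.
 */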
int thread_stack__process(struct thread *thread, struct comm *comm,
                          struct perf_sample *sample,
                          struct addr_location *from_al,
                          struct addr_location *to_al, u64 ref,
                          struct call_return_processor *crp)
{
        struct thread_stack *ts = thread__stack(thread, sample->cpu);
        enum retpoline_state_t rstate;
        int err = 0;

        if (ts && !ts->crp) {
                /* Supersede thread_stack__event() */
                thread_stack__reset(thread, ts);
                ts = NULL;
        }

        if (!ts) {
                ts = thread_stack__new(thread, sample->cpu, crp);
                if (!ts)
                        return -ENOMEM;
                ts->comm = comm;
        }

        rstate = ts->rstate;
        if (rstate == X86_RETPOLINE_DETECTED)
                ts->rstate = X86_RETPOLINE_POSSIBLE;

        /* Flush stack on exec */
        if (ts->comm != comm && thread->pid_ == thread->tid) {
                err = __thread_stack__flush(thread, ts);
                if (err)
                        return err;
                ts->comm = comm;
        }

        /* If the stack is empty, put the current symbol on the stack */
        if (!ts->cnt) {
                err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
                if (err)
                        return err;
        }

        ts->branch_count += 1;
        ts->insn_count += sample->insn_cnt;
        ts->cyc_count += sample->cyc_cnt;
        ts->last_time = sample->time;

        if (sample->flags & PERF_IP_FLAG_CALL) {
                bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
                struct call_path_root *cpr = ts->crp->cpr;
                struct call_path *cp;
                u64 ret_addr;

                if (!sample->ip || !sample->addr)
                        return 0;

                ret_addr = sample->ip + sample->insn_len;
                if (ret_addr == sample->addr)
                        return 0; /* Zero-length calls are excluded */

                cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
                                        to_al->sym, sample->addr,
                                        ts->kernel_start);
                err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
                                            cp, false, trace_end);

                /*
                 * A call to the same symbol, but not to the start of the
                 * symbol, may be the start of an x86 retpoline.
                 */
                if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
                    from_al->sym == to_al->sym &&
                    to_al->addr != to_al->sym->start)
                        ts->rstate = X86_RETPOLINE_DETECTED;

        } else if (sample->flags & PERF_IP_FLAG_RETURN) {
                if (!sample->addr) {
                        u32 return_from_kernel = PERF_IP_FLAG_SYSCALLRET |
                                                 PERF_IP_FLAG_INTERRUPT;

                        if (!(sample->flags & return_from_kernel))
                                return 0;

                        /* Pop kernel stack */
                        return thread_stack__pop_ks(thread, ts, sample, ref);
                }

                if (!sample->ip)
                        return 0;

                /* x86 retpoline 'return' doesn't match the stack */
                if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
                    ts->stack[ts->cnt - 1].ret_addr != sample->addr)
                        return thread_stack__x86_retpoline(ts, sample, to_al);

                err = thread_stack__pop_cp(thread, ts, sample->addr,
                                           sample->time, ref, from_al->sym);
                if (err) {
                        if (err < 0)
                                return err;
                        err = thread_stack__no_call_return(thread, ts, sample,
                                                           from_al, to_al, ref);
                }
        } else if (sample->flags & PERF_IP_FLAG_TRACE_BEGIN) {
                err = thread_stack__trace_begin(thread, ts, sample->time, ref);
        } else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
                err = thread_stack__trace_end(ts, sample, ref);
        } else if (sample->flags & PERF_IP_FLAG_BRANCH &&
                   from_al->sym != to_al->sym && to_al->sym &&
                   to_al->addr == to_al->sym->start) {
                struct call_path_root *cpr = ts->crp->cpr;
                struct call_path *cp;

                /*
                 * The compiler might optimize a call/ret combination by making
                 * it a jmp. Make that visible by recording on the stack a
                 * branch to the start of a different symbol. Note, that means
                 * when a ret pops the stack, all jmps must be popped off first.
                 */
                cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
                                        to_al->sym, sample->addr,
                                        ts->kernel_start);
                err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
                                            false);
                if (!err)
                        ts->stack[ts->cnt - 1].non_call = true;
        }

        return err;
}

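/* Current number of entries on the thread's stack for @cpu, or zero if none */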
size_t thread_stack__depth(struct thread *thread, int cpu)
{
        struct thread_stack *ts = thread__stack(thread, cpu);

        if (!ts)
                return 0;
        return ts->cnt;
}