tools/perf/builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system-wide, etc.  The default format is loosely strace-like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60
61 #include "sane_ctype.h"
62
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC              02000000
65 #endif
66
67 struct trace {
68         struct perf_tool        tool;
69         struct syscalltbl       *sctbl;
70         struct {
71                 int             max;
72                 struct syscall  *table;
73                 struct {
74                         struct perf_evsel *sys_enter,
75                                           *sys_exit;
76                 }               events;
77         } syscalls;
78         struct record_opts      opts;
79         struct perf_evlist      *evlist;
80         struct machine          *host;
81         struct thread           *current;
82         u64                     base_time;
83         FILE                    *output;
84         unsigned long           nr_events;
85         struct strlist          *ev_qualifier;
86         struct {
87                 size_t          nr;
88                 int             *entries;
89         }                       ev_qualifier_ids;
90         struct {
91                 size_t          nr;
92                 pid_t           *entries;
93         }                       filter_pids;
94         double                  duration_filter;
95         double                  runtime_ms;
96         struct {
97                 u64             vfs_getname,
98                                 proc_getname;
99         } stats;
100         unsigned int            max_stack;
101         unsigned int            min_stack;
102         bool                    not_ev_qualifier;
103         bool                    live;
104         bool                    full_time;
105         bool                    sched;
106         bool                    multiple_threads;
107         bool                    summary;
108         bool                    summary_only;
109         bool                    show_comm;
110         bool                    show_tool_stats;
111         bool                    trace_syscalls;
112         bool                    kernel_syscallchains;
113         bool                    force;
114         bool                    vfs_getname;
115         int                     trace_pgfaults;
116         int                     open_id;
117 };
118
119 struct tp_field {
120         int offset;
121         union {
122                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
123                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
124         };
125 };
126
127 #define TP_UINT_FIELD(bits) \
128 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
129 { \
130         u##bits value; \
131         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
132         return value;  \
133 }
134
135 TP_UINT_FIELD(8);
136 TP_UINT_FIELD(16);
137 TP_UINT_FIELD(32);
138 TP_UINT_FIELD(64);
139
140 #define TP_UINT_FIELD__SWAPPED(bits) \
141 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
142 { \
143         u##bits value; \
144         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
145         return bswap_##bits(value);\
146 }
147
148 TP_UINT_FIELD__SWAPPED(16);
149 TP_UINT_FIELD__SWAPPED(32);
150 TP_UINT_FIELD__SWAPPED(64);
151
152 static int tp_field__init_uint(struct tp_field *field,
153                                struct format_field *format_field,
154                                bool needs_swap)
155 {
156         field->offset = format_field->offset;
157
158         switch (format_field->size) {
159         case 1:
160                 field->integer = tp_field__u8;
161                 break;
162         case 2:
163                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
164                 break;
165         case 4:
166                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
167                 break;
168         case 8:
169                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
170                 break;
171         default:
172                 return -1;
173         }
174
175         return 0;
176 }
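
/*
 * E.g. a 4-byte tracepoint field recorded on a machine with the opposite
 * endianness (evsel->needs_swap set) is read via tp_field__swapped_u32,
 * while the common same-endianness case uses tp_field__u32 directly.
 */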
177
178 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
179 {
180         return sample->raw_data + field->offset;
181 }
182
183 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
184 {
185         field->offset = format_field->offset;
186         field->pointer = tp_field__ptr;
187         return 0;
188 }
189
190 struct syscall_tp {
191         struct tp_field id;
192         union {
193                 struct tp_field args, ret;
194         };
195 };
196
197 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
198                                           struct tp_field *field,
199                                           const char *name)
200 {
201         struct format_field *format_field = perf_evsel__field(evsel, name);
202
203         if (format_field == NULL)
204                 return -1;
205
206         return tp_field__init_uint(field, format_field, evsel->needs_swap);
207 }
208
209 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
210         ({ struct syscall_tp *sc = evsel->priv;\
211            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
212
213 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
214                                          struct tp_field *field,
215                                          const char *name)
216 {
217         struct format_field *format_field = perf_evsel__field(evsel, name);
218
219         if (format_field == NULL)
220                 return -1;
221
222         return tp_field__init_ptr(field, format_field);
223 }
224
225 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
226         ({ struct syscall_tp *sc = evsel->priv;\
227            perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
228
229 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
230 {
231         zfree(&evsel->priv);
232         perf_evsel__delete(evsel);
233 }
234
235 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
236 {
237         evsel->priv = malloc(sizeof(struct syscall_tp));
238         if (evsel->priv != NULL) {
239                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
240                         goto out_delete;
241
242                 evsel->handler = handler;
243                 return 0;
244         }
245
246         return -ENOMEM;
247
248 out_delete:
249         zfree(&evsel->priv);
250         return -ENOENT;
251 }
252
253 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
254 {
255         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
256
257         /* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
258         if (IS_ERR(evsel))
259                 evsel = perf_evsel__newtp("syscalls", direction);
260
261         if (IS_ERR(evsel))
262                 return NULL;
263
264         if (perf_evsel__init_syscall_tp(evsel, handler))
265                 goto out_delete;
266
267         return evsel;
268
269 out_delete:
270         perf_evsel__delete_priv(evsel);
271         return NULL;
272 }
273
274 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
275         ({ struct syscall_tp *fields = evsel->priv; \
276            fields->name.integer(&fields->name, sample); })
277
278 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
279         ({ struct syscall_tp *fields = evsel->priv; \
280            fields->name.pointer(&fields->name, sample); })
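
/*
 * Illustrative usage (a sketch of how these accessors are meant to be used):
 * once an evsel has been set up via perf_evsel__syscall_newtp(), the syscall
 * id and the raw argument block of a sample are read with:
 *
 *      u64 id     = perf_evsel__sc_tp_uint(evsel, id, sample);
 *      void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 *
 * i.e. the tp_field set up from the tracepoint format decides how the raw
 * sample bytes are decoded (integer width, byte swapping, or raw pointer).
 */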
281
282 struct strarray {
283         int         offset;
284         int         nr_entries;
285         const char **entries;
286 };
287
288 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
289         .nr_entries = ARRAY_SIZE(array), \
290         .entries = array, \
291 }
292
293 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
294         .offset     = off, \
295         .nr_entries = ARRAY_SIZE(array), \
296         .entries = array, \
297 }
298
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300                                                 const char *intfmt,
301                                                 struct syscall_arg *arg)
302 {
303         struct strarray *sa = arg->parm;
304         int idx = arg->val - sa->offset;
305
306         if (idx < 0 || idx >= sa->nr_entries)
307                 return scnprintf(bf, size, intfmt, arg->val);
308
309         return scnprintf(bf, size, "%s", sa->entries[idx]);
310 }
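
/*
 * Worked example (assuming the tables defined below): epoll_ctl_ops is
 * declared with DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1), so an epoll_ctl()
 * 'op' of 1 (EPOLL_CTL_ADD) yields idx = 1 - 1 = 0 and is printed as "ADD",
 * while an out-of-range value falls back to the plain integer format passed
 * in via 'intfmt' ("%d" or "%#x").
 */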
311
312 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
313                                               struct syscall_arg *arg)
314 {
315         return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
316 }
317
318 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
319
320 #if defined(__i386__) || defined(__x86_64__)
321 /*
322  * FIXME: Make this available to all arches as soon as the ioctl beautifier
323  *        gets rewritten to support all arches.
324  */
325 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
326                                                  struct syscall_arg *arg)
327 {
328         return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
329 }
330
331 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
332 #endif /* defined(__i386__) || defined(__x86_64__) */
333
334 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
335                                         struct syscall_arg *arg);
336
337 #define SCA_FD syscall_arg__scnprintf_fd
338
339 #ifndef AT_FDCWD
340 #define AT_FDCWD        -100
341 #endif
342
343 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
344                                            struct syscall_arg *arg)
345 {
346         int fd = arg->val;
347
348         if (fd == AT_FDCWD)
349                 return scnprintf(bf, size, "CWD");
350
351         return syscall_arg__scnprintf_fd(bf, size, arg);
352 }
353
354 #define SCA_FDAT syscall_arg__scnprintf_fd_at
355
356 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
357                                               struct syscall_arg *arg);
358
359 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
360
361 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
362                                          struct syscall_arg *arg)
363 {
364         return scnprintf(bf, size, "%#lx", arg->val);
365 }
366
367 #define SCA_HEX syscall_arg__scnprintf_hex
368
369 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
370                                          struct syscall_arg *arg)
371 {
372         return scnprintf(bf, size, "%d", arg->val);
373 }
374
375 #define SCA_INT syscall_arg__scnprintf_int
376
377 static const char *bpf_cmd[] = {
378         "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
379         "MAP_GET_NEXT_KEY", "PROG_LOAD",
380 };
381 static DEFINE_STRARRAY(bpf_cmd);
382
383 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
384 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
385
386 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
387 static DEFINE_STRARRAY(itimers);
388
389 static const char *keyctl_options[] = {
390         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
391         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
392         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
393         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
394         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
395 };
396 static DEFINE_STRARRAY(keyctl_options);
397
398 static const char *whences[] = { "SET", "CUR", "END",
399 #ifdef SEEK_DATA
400 "DATA",
401 #endif
402 #ifdef SEEK_HOLE
403 "HOLE",
404 #endif
405 };
406 static DEFINE_STRARRAY(whences);
407
408 static const char *fcntl_cmds[] = {
409         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
410         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
411         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
412         "F_GETOWNER_UIDS",
413 };
414 static DEFINE_STRARRAY(fcntl_cmds);
415
416 static const char *rlimit_resources[] = {
417         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
418         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
419         "RTTIME",
420 };
421 static DEFINE_STRARRAY(rlimit_resources);
422
423 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
424 static DEFINE_STRARRAY(sighow);
425
426 static const char *clockid[] = {
427         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
428         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
429         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
430 };
431 static DEFINE_STRARRAY(clockid);
432
433 static const char *socket_families[] = {
434         "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
435         "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
436         "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
437         "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
438         "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
439         "ALG", "NFC", "VSOCK",
440 };
441 static DEFINE_STRARRAY(socket_families);
442
443 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
444                                                  struct syscall_arg *arg)
445 {
446         size_t printed = 0;
447         int mode = arg->val;
448
449         if (mode == F_OK) /* 0 */
450                 return scnprintf(bf, size, "F");
451 #define P_MODE(n) \
452         if (mode & n##_OK) { \
453                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
454                 mode &= ~n##_OK; \
455         }
456
457         P_MODE(R);
458         P_MODE(W);
459         P_MODE(X);
460 #undef P_MODE
461
462         if (mode)
463                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
464
465         return printed;
466 }
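
/*
 * Example output (illustrative): access(path, R_OK|W_OK) has mode == R_OK|W_OK
 * and prints "RW", access(path, F_OK) prints "F", and any unknown leftover
 * bits are appended in hex, e.g. "R|0x40".
 */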
467
468 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
469
470 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
471                                               struct syscall_arg *arg);
472
473 #define SCA_FILENAME syscall_arg__scnprintf_filename
474
475 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
476                                                 struct syscall_arg *arg)
477 {
478         int printed = 0, flags = arg->val;
479
480 #define P_FLAG(n) \
481         if (flags & O_##n) { \
482                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
483                 flags &= ~O_##n; \
484         }
485
486         P_FLAG(CLOEXEC);
487         P_FLAG(NONBLOCK);
488 #undef P_FLAG
489
490         if (flags)
491                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
492
493         return printed;
494 }
495
496 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
497
498 #if defined(__i386__) || defined(__x86_64__)
499 /*
500  * FIXME: Make this available to all arches.
501  */
502 #define TCGETS          0x5401
503
504 static const char *tioctls[] = {
505         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
506         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
507         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
508         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
509         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
510         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
511         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
512         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
513         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
514         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
515         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
516         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
517         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
518         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
519         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
520 };
521
522 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
523 #endif /* defined(__i386__) || defined(__x86_64__) */
524
525 #ifndef GRND_NONBLOCK
526 #define GRND_NONBLOCK   0x0001
527 #endif
528 #ifndef GRND_RANDOM
529 #define GRND_RANDOM     0x0002
530 #endif
531
532 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
533                                                    struct syscall_arg *arg)
534 {
535         int printed = 0, flags = arg->val;
536
537 #define P_FLAG(n) \
538         if (flags & GRND_##n) { \
539                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
540                 flags &= ~GRND_##n; \
541         }
542
543         P_FLAG(RANDOM);
544         P_FLAG(NONBLOCK);
545 #undef P_FLAG
546
547         if (flags)
548                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
549
550         return printed;
551 }
552
553 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
554
555 #define STRARRAY(arg, name, array) \
556           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
557           .arg_parm      = { [arg] = &strarray__##array, }
558
559 #include "trace/beauty/eventfd.c"
560 #include "trace/beauty/flock.c"
561 #include "trace/beauty/futex_op.c"
562 #include "trace/beauty/mmap.c"
563 #include "trace/beauty/mode_t.c"
564 #include "trace/beauty/msg_flags.c"
565 #include "trace/beauty/open_flags.c"
566 #include "trace/beauty/perf_event_open.c"
567 #include "trace/beauty/pid.c"
568 #include "trace/beauty/sched_policy.c"
569 #include "trace/beauty/seccomp.c"
570 #include "trace/beauty/signum.c"
571 #include "trace/beauty/socket_type.c"
572 #include "trace/beauty/waitid_options.c"
573
574 static struct syscall_fmt {
575         const char *name;
576         const char *alias;
577         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
578         void       *arg_parm[6];
579         bool       errmsg;
580         bool       errpid;
581         bool       timeout;
582         bool       hexret;
583 } syscall_fmts[] = {
584         { .name     = "access",     .errmsg = true,
585           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
586         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
587         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
588         { .name     = "brk",        .hexret = true,
589           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
590         { .name     = "chdir",      .errmsg = true, },
591         { .name     = "chmod",      .errmsg = true, },
592         { .name     = "chroot",     .errmsg = true, },
593         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
594         { .name     = "clone",      .errpid = true, },
595         { .name     = "close",      .errmsg = true,
596           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
597         { .name     = "connect",    .errmsg = true, },
598         { .name     = "creat",      .errmsg = true, },
599         { .name     = "dup",        .errmsg = true, },
600         { .name     = "dup2",       .errmsg = true, },
601         { .name     = "dup3",       .errmsg = true, },
602         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
603         { .name     = "eventfd2",   .errmsg = true,
604           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
605         { .name     = "faccessat",  .errmsg = true, },
606         { .name     = "fadvise64",  .errmsg = true, },
607         { .name     = "fallocate",  .errmsg = true, },
608         { .name     = "fchdir",     .errmsg = true, },
609         { .name     = "fchmod",     .errmsg = true, },
610         { .name     = "fchmodat",   .errmsg = true,
611           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
612         { .name     = "fchown",     .errmsg = true, },
613         { .name     = "fchownat",   .errmsg = true,
614           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
615         { .name     = "fcntl",      .errmsg = true,
616           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
617           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
618         { .name     = "fdatasync",  .errmsg = true, },
619         { .name     = "flock",      .errmsg = true,
620           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
621         { .name     = "fsetxattr",  .errmsg = true, },
622         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
623         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
624         { .name     = "fstatfs",    .errmsg = true, },
625         { .name     = "fsync",    .errmsg = true, },
626         { .name     = "ftruncate", .errmsg = true, },
627         { .name     = "futex",      .errmsg = true,
628           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
629         { .name     = "futimesat", .errmsg = true,
630           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
631         { .name     = "getdents",   .errmsg = true, },
632         { .name     = "getdents64", .errmsg = true, },
633         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
634         { .name     = "getpgid",    .errpid = true, },
635         { .name     = "getpid",     .errpid = true, },
636         { .name     = "getppid",    .errpid = true, },
637         { .name     = "getrandom",  .errmsg = true,
638           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
639         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
640         { .name     = "getxattr",   .errmsg = true, },
641         { .name     = "inotify_add_watch",          .errmsg = true, },
642         { .name     = "ioctl",      .errmsg = true,
643           .arg_scnprintf = {
644 #if defined(__i386__) || defined(__x86_64__)
645 /*
646  * FIXME: Make this available to all arches.
647  */
648                              [1] = SCA_STRHEXARRAY, /* cmd */
649                              [2] = SCA_HEX, /* arg */ },
650           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
651 #else
652                              [2] = SCA_HEX, /* arg */ }, },
653 #endif
654         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
655         { .name     = "kill",       .errmsg = true,
656           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
657         { .name     = "lchown",    .errmsg = true, },
658         { .name     = "lgetxattr",  .errmsg = true, },
659         { .name     = "linkat",     .errmsg = true,
660           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
661         { .name     = "listxattr",  .errmsg = true, },
662         { .name     = "llistxattr", .errmsg = true, },
663         { .name     = "lremovexattr",  .errmsg = true, },
664         { .name     = "lseek",      .errmsg = true,
665           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
666           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
667         { .name     = "lsetxattr",  .errmsg = true, },
668         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
669         { .name     = "lsxattr",    .errmsg = true, },
670         { .name     = "madvise",    .errmsg = true,
671           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
672                              [2] = SCA_MADV_BHV, /* behavior */ }, },
673         { .name     = "mkdir",    .errmsg = true, },
674         { .name     = "mkdirat",    .errmsg = true,
675           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
676         { .name     = "mknod",      .errmsg = true, },
677         { .name     = "mknodat",    .errmsg = true,
678           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
679         { .name     = "mlock",      .errmsg = true,
680           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681         { .name     = "mlockall",   .errmsg = true,
682           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
683         { .name     = "mmap",       .hexret = true,
684           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
685                              [2] = SCA_MMAP_PROT, /* prot */
686                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
687         { .name     = "mprotect",   .errmsg = true,
688           .arg_scnprintf = { [0] = SCA_HEX, /* start */
689                              [2] = SCA_MMAP_PROT, /* prot */ }, },
690         { .name     = "mq_unlink", .errmsg = true,
691           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
692         { .name     = "mremap",     .hexret = true,
693           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
694                              [3] = SCA_MREMAP_FLAGS, /* flags */
695                              [4] = SCA_HEX, /* new_addr */ }, },
696         { .name     = "munlock",    .errmsg = true,
697           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
698         { .name     = "munmap",     .errmsg = true,
699           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
700         { .name     = "name_to_handle_at", .errmsg = true,
701           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
702         { .name     = "newfstatat", .errmsg = true,
703           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
704         { .name     = "open",       .errmsg = true,
705           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
706         { .name     = "open_by_handle_at", .errmsg = true,
707           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
708                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
709         { .name     = "openat",     .errmsg = true,
710           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
711                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
712         { .name     = "perf_event_open", .errmsg = true,
713           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
714                              [3] = SCA_FD,  /* group_fd */
715                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
716         { .name     = "pipe2",      .errmsg = true,
717           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
718         { .name     = "poll",       .errmsg = true, .timeout = true, },
719         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
720         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
721         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
722         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
723         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
724         { .name     = "pwritev",    .errmsg = true, },
725         { .name     = "read",       .errmsg = true, },
726         { .name     = "readlink",   .errmsg = true, },
727         { .name     = "readlinkat", .errmsg = true,
728           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
729         { .name     = "readv",      .errmsg = true, },
730         { .name     = "recvfrom",   .errmsg = true,
731           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
732         { .name     = "recvmmsg",   .errmsg = true,
733           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
734         { .name     = "recvmsg",    .errmsg = true,
735           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
736         { .name     = "removexattr", .errmsg = true, },
737         { .name     = "renameat",   .errmsg = true,
738           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
739         { .name     = "rmdir",    .errmsg = true, },
740         { .name     = "rt_sigaction", .errmsg = true,
741           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
742         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
743         { .name     = "rt_sigqueueinfo", .errmsg = true,
744           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
745         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
746           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
747         { .name     = "sched_getattr",        .errmsg = true, },
748         { .name     = "sched_setattr",        .errmsg = true, },
749         { .name     = "sched_setscheduler",   .errmsg = true,
750           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
751         { .name     = "seccomp", .errmsg = true,
752           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
753                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
754         { .name     = "select",     .errmsg = true, .timeout = true, },
755         { .name     = "sendmmsg",    .errmsg = true,
756           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
757         { .name     = "sendmsg",    .errmsg = true,
758           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
759         { .name     = "sendto",     .errmsg = true,
760           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
761         { .name     = "set_tid_address", .errpid = true, },
762         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
763         { .name     = "setpgid",    .errmsg = true, },
764         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
765         { .name     = "setxattr",   .errmsg = true, },
766         { .name     = "shutdown",   .errmsg = true, },
767         { .name     = "socket",     .errmsg = true,
768           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
769                              [1] = SCA_SK_TYPE, /* type */ },
770           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
771         { .name     = "socketpair", .errmsg = true,
772           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
773                              [1] = SCA_SK_TYPE, /* type */ },
774           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
775         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
776         { .name     = "statfs",     .errmsg = true, },
777         { .name     = "statx",      .errmsg = true,
778           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
779                              [2] = SCA_STATX_FLAGS, /* flags */
780                              [3] = SCA_STATX_MASK, /* mask */ }, },
781         { .name     = "swapoff",    .errmsg = true,
782           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
783         { .name     = "swapon",     .errmsg = true,
784           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
785         { .name     = "symlinkat",  .errmsg = true,
786           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
787         { .name     = "tgkill",     .errmsg = true,
788           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
789         { .name     = "tkill",      .errmsg = true,
790           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
791         { .name     = "truncate",   .errmsg = true, },
792         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
793         { .name     = "unlinkat",   .errmsg = true,
794           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
795         { .name     = "utime",  .errmsg = true, },
796         { .name     = "utimensat",  .errmsg = true,
797           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
798         { .name     = "utimes",  .errmsg = true, },
799         { .name     = "vmsplice",  .errmsg = true, },
800         { .name     = "wait4",      .errpid = true,
801           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
802         { .name     = "waitid",     .errpid = true,
803           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
804         { .name     = "write",      .errmsg = true, },
805         { .name     = "writev",     .errmsg = true, },
806 };
807
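/*
 * Note: syscall_fmt__find() looks names up with bsearch(), so syscall_fmts[]
 * above must be kept sorted by ->name in strcmp() order, otherwise lookups
 * for misplaced entries will silently fail.
 */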
808 static int syscall_fmt__cmp(const void *name, const void *fmtp)
809 {
810         const struct syscall_fmt *fmt = fmtp;
811         return strcmp(name, fmt->name);
812 }
813
814 static struct syscall_fmt *syscall_fmt__find(const char *name)
815 {
816         const int nmemb = ARRAY_SIZE(syscall_fmts);
817         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
818 }
819
820 struct syscall {
821         struct event_format *tp_format;
822         int                 nr_args;
823         struct format_field *args;
824         const char          *name;
825         bool                is_exit;
826         struct syscall_fmt  *fmt;
827         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
828         void                **arg_parm;
829 };
830
831 /*
832  * We need this 'calculated' boolean because in some cases we really don't
833  * know the duration of a syscall: for instance, when we start a session some
834  * threads may already be waiting for a syscall to finish, say 'poll', in
835  * which case all we can do is print "( ? )" for both the duration and the
836  * start timestamp.
837  */
838 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
839 {
840         double duration = (double)t / NSEC_PER_MSEC;
841         size_t printed = fprintf(fp, "(");
842
843         if (!calculated)
844                 printed += fprintf(fp, "     ?   ");
845         else if (duration >= 1.0)
846                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
847         else if (duration >= 0.01)
848                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
849         else
850                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
851         return printed + fprintf(fp, "): ");
852 }
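
/*
 * Example (illustrative): t = 1234567ns prints "( 1.235 ms): " in red,
 * t = 42000ns prints "( 0.042 ms): " in yellow, and a syscall whose start
 * was not seen (calculated == false) prints "(     ?   ): ".
 */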
853
854 /**
855  * filename.ptr: The filename char pointer that will be vfs_getname'd
856  * filename.entry_str_pos: Where to insert the string translated from
857  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
858  */
859 struct thread_trace {
860         u64               entry_time;
861         bool              entry_pending;
862         unsigned long     nr_events;
863         unsigned long     pfmaj, pfmin;
864         char              *entry_str;
865         double            runtime_ms;
866         struct {
867                 unsigned long ptr;
868                 short int     entry_str_pos;
869                 bool          pending_open;
870                 unsigned int  namelen;
871                 char          *name;
872         } filename;
873         struct {
874                 int       max;
875                 char      **table;
876         } paths;
877
878         struct intlist *syscall_stats;
879 };
880
881 static struct thread_trace *thread_trace__new(void)
882 {
883         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
884
885         if (ttrace) {
886                 ttrace->paths.max = -1;
887                 ttrace->syscall_stats = intlist__new(NULL);
888         }
889
890         return ttrace;
891 }
892
893 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
894 {
895         struct thread_trace *ttrace;
896
897         if (thread == NULL)
898                 goto fail;
899
900         if (thread__priv(thread) == NULL)
901                 thread__set_priv(thread, thread_trace__new());
902
903         if (thread__priv(thread) == NULL)
904                 goto fail;
905
906         ttrace = thread__priv(thread);
907         ++ttrace->nr_events;
908
909         return ttrace;
910 fail:
911         color_fprintf(fp, PERF_COLOR_RED,
912                       "WARNING: not enough memory, dropping samples!\n");
913         return NULL;
914 }
915
916 #define TRACE_PFMAJ             (1 << 0)
917 #define TRACE_PFMIN             (1 << 1)
918
919 static const size_t trace__entry_str_size = 2048;
920
921 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
922 {
923         struct thread_trace *ttrace = thread__priv(thread);
924
925         if (fd > ttrace->paths.max) {
926                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
927
928                 if (npath == NULL)
929                         return -1;
930
931                 if (ttrace->paths.max != -1) {
932                         memset(npath + ttrace->paths.max + 1, 0,
933                                (fd - ttrace->paths.max) * sizeof(char *));
934                 } else {
935                         memset(npath, 0, (fd + 1) * sizeof(char *));
936                 }
937
938                 ttrace->paths.table = npath;
939                 ttrace->paths.max   = fd;
940         }
941
942         ttrace->paths.table[fd] = strdup(pathname);
943
944         return ttrace->paths.table[fd] != NULL ? 0 : -1;
945 }
946
947 static int thread__read_fd_path(struct thread *thread, int fd)
948 {
949         char linkname[PATH_MAX], pathname[PATH_MAX];
950         struct stat st;
951         int ret;
952
953         if (thread->pid_ == thread->tid) {
954                 scnprintf(linkname, sizeof(linkname),
955                           "/proc/%d/fd/%d", thread->pid_, fd);
956         } else {
957                 scnprintf(linkname, sizeof(linkname),
958                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
959         }
960
961         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
962                 return -1;
963
964         ret = readlink(linkname, pathname, sizeof(pathname));
965
966         if (ret < 0 || ret > st.st_size)
967                 return -1;
968
969         pathname[ret] = '\0';
970         return trace__set_fd_pathname(thread, fd, pathname);
971 }
972
973 static const char *thread__fd_path(struct thread *thread, int fd,
974                                    struct trace *trace)
975 {
976         struct thread_trace *ttrace = thread__priv(thread);
977
978         if (ttrace == NULL)
979                 return NULL;
980
981         if (fd < 0)
982                 return NULL;
983
984         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
985                 if (!trace->live)
986                         return NULL;
987                 ++trace->stats.proc_getname;
988                 if (thread__read_fd_path(thread, fd))
989                         return NULL;
990         }
991
992         return ttrace->paths.table[fd];
993 }
994
995 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
996                                         struct syscall_arg *arg)
997 {
998         int fd = arg->val;
999         size_t printed = scnprintf(bf, size, "%d", fd);
1000         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1001
1002         if (path)
1003                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1004
1005         return printed;
1006 }
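
/*
 * Example (illustrative): for fd 3 whose path resolves, via the thread's
 * cached fd->path table or /proc/<pid>/fd/3 on a live session, to
 * "/etc/passwd", this prints "3</etc/passwd>"; if the path cannot be
 * resolved, it prints just "3".
 */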
1007
1008 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1009                                               struct syscall_arg *arg)
1010 {
1011         int fd = arg->val;
1012         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1013         struct thread_trace *ttrace = thread__priv(arg->thread);
1014
1015         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1016                 zfree(&ttrace->paths.table[fd]);
1017
1018         return printed;
1019 }
1020
1021 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1022                                      unsigned long ptr)
1023 {
1024         struct thread_trace *ttrace = thread__priv(thread);
1025
1026         ttrace->filename.ptr = ptr;
1027         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1028 }
1029
1030 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1031                                               struct syscall_arg *arg)
1032 {
1033         unsigned long ptr = arg->val;
1034
1035         if (!arg->trace->vfs_getname)
1036                 return scnprintf(bf, size, "%#lx", ptr);
1037
1038         thread__set_filename_pos(arg->thread, bf, ptr);
1039         return 0;
1040 }
1041
1042 static bool trace__filter_duration(struct trace *trace, double t)
1043 {
1044         return t < (trace->duration_filter * NSEC_PER_MSEC);
1045 }
1046
1047 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1048 {
1049         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1050
1051         return fprintf(fp, "%10.3f ", ts);
1052 }
1053
1054 /*
1055  * We handle tstamp=0 as an undefined tstamp, i.e. like when we are using
1056  * ttrace->entry_time for a thread that receives a sys_exit without first
1057  * having received a sys_enter ("poll" issued before the tracing session
1058  * starts, or the sys_enter lost due to ring buffer overflow).
1059  */
1060 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061 {
1062         if (tstamp > 0)
1063                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1064
1065         return fprintf(fp, "         ? ");
1066 }
1067
1068 static bool done = false;
1069 static bool interrupted = false;
1070
1071 static void sig_handler(int sig)
1072 {
1073         done = true;
1074         interrupted = sig == SIGINT;
1075 }
1076
1077 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1078                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1079 {
1080         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1081         printed += fprintf_duration(duration, duration_calculated, fp);
1082
1083         if (trace->multiple_threads) {
1084                 if (trace->show_comm)
1085                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1086                 printed += fprintf(fp, "%d ", thread->tid);
1087         }
1088
1089         return printed;
1090 }
1091
1092 static int trace__process_event(struct trace *trace, struct machine *machine,
1093                                 union perf_event *event, struct perf_sample *sample)
1094 {
1095         int ret = 0;
1096
1097         switch (event->header.type) {
1098         case PERF_RECORD_LOST:
1099                 color_fprintf(trace->output, PERF_COLOR_RED,
1100                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1101                 ret = machine__process_lost_event(machine, event, sample);
1102                 break;
1103         default:
1104                 ret = machine__process_event(machine, event, sample);
1105                 break;
1106         }
1107
1108         return ret;
1109 }
1110
1111 static int trace__tool_process(struct perf_tool *tool,
1112                                union perf_event *event,
1113                                struct perf_sample *sample,
1114                                struct machine *machine)
1115 {
1116         struct trace *trace = container_of(tool, struct trace, tool);
1117         return trace__process_event(trace, machine, event, sample);
1118 }
1119
1120 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1121 {
1122         struct machine *machine = vmachine;
1123
1124         if (machine->kptr_restrict_warned)
1125                 return NULL;
1126
1127         if (symbol_conf.kptr_restrict) {
1128                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1129                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1130                            "Kernel samples will not be resolved.\n");
1131                 machine->kptr_restrict_warned = true;
1132                 return NULL;
1133         }
1134
1135         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1136 }
1137
1138 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1139 {
1140         int err = symbol__init(NULL);
1141
1142         if (err)
1143                 return err;
1144
1145         trace->host = machine__new_host();
1146         if (trace->host == NULL)
1147                 return -ENOMEM;
1148
1149         if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1150                 return -errno;
1151
1152         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1153                                             evlist->threads, trace__tool_process, false,
1154                                             trace->opts.proc_map_timeout);
1155         if (err)
1156                 symbol__exit();
1157
1158         return err;
1159 }
1160
1161 static int syscall__set_arg_fmts(struct syscall *sc)
1162 {
1163         struct format_field *field;
1164         int idx = 0, len;
1165
1166         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1167         if (sc->arg_scnprintf == NULL)
1168                 return -1;
1169
1170         if (sc->fmt)
1171                 sc->arg_parm = sc->fmt->arg_parm;
1172
1173         for (field = sc->args; field; field = field->next) {
1174                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1175                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1176                 else if (strcmp(field->type, "const char *") == 0 &&
1177                          (strcmp(field->name, "filename") == 0 ||
1178                           strcmp(field->name, "path") == 0 ||
1179                           strcmp(field->name, "pathname") == 0))
1180                         sc->arg_scnprintf[idx] = SCA_FILENAME;
1181                 else if (field->flags & FIELD_IS_POINTER)
1182                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1183                 else if (strcmp(field->type, "pid_t") == 0)
1184                         sc->arg_scnprintf[idx] = SCA_PID;
1185                 else if (strcmp(field->type, "umode_t") == 0)
1186                         sc->arg_scnprintf[idx] = SCA_MODE_T;
1187                 else if ((strcmp(field->type, "int") == 0 ||
1188                           strcmp(field->type, "unsigned int") == 0 ||
1189                           strcmp(field->type, "long") == 0) &&
1190                          (len = strlen(field->name)) >= 2 &&
1191                          strcmp(field->name + len - 2, "fd") == 0) {
1192                         /*
1193                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1194                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1195                          * 65 int
1196                          * 23 unsigned int
1197                          * 7 unsigned long
1198                          */
1199                         sc->arg_scnprintf[idx] = SCA_FD;
1200                 }
1201                 ++idx;
1202         }
1203
1204         return 0;
1205 }
1206
1207 static int trace__read_syscall_info(struct trace *trace, int id)
1208 {
1209         char tp_name[128];
1210         struct syscall *sc;
1211         const char *name = syscalltbl__name(trace->sctbl, id);
1212
1213         if (name == NULL)
1214                 return -1;
1215
1216         if (id > trace->syscalls.max) {
1217                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1218
1219                 if (nsyscalls == NULL)
1220                         return -1;
1221
1222                 if (trace->syscalls.max != -1) {
1223                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1224                                (id - trace->syscalls.max) * sizeof(*sc));
1225                 } else {
1226                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1227                 }
1228
1229                 trace->syscalls.table = nsyscalls;
1230                 trace->syscalls.max   = id;
1231         }
1232
1233         sc = trace->syscalls.table + id;
1234         sc->name = name;
1235
1236         sc->fmt  = syscall_fmt__find(sc->name);
1237
1238         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1239         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1240
1241         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1242                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1243                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1244         }
1245
1246         if (IS_ERR(sc->tp_format))
1247                 return -1;
1248
1249         sc->args = sc->tp_format->format.fields;
1250         sc->nr_args = sc->tp_format->format.nr_fields;
1251         /*
1252          * The first field, '__syscall_nr' ('nr' on older kernels), holds the
1253          * syscall number, which is redundant here, so check for it and drop
1254          * it, adjusting the argument count accordingly.
1255          */
1256         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1257                 sc->args = sc->args->next;
1258                 --sc->nr_args;
1259         }
1260
1261         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1262
1263         return syscall__set_arg_fmts(sc);
1264 }
1265
1266 static int trace__validate_ev_qualifier(struct trace *trace)
1267 {
1268         int err = 0, i;
1269         struct str_node *pos;
1270
1271         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1272         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1273                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1274
1275         if (trace->ev_qualifier_ids.entries == NULL) {
1276                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1277                        trace->output);
1278                 err = -EINVAL;
1279                 goto out;
1280         }
1281
1282         i = 0;
1283
1284         strlist__for_each_entry(pos, trace->ev_qualifier) {
1285                 const char *sc = pos->s;
1286                 int id = syscalltbl__id(trace->sctbl, sc);
1287
1288                 if (id < 0) {
1289                         if (err == 0) {
1290                                 fputs("Error:\tInvalid syscall ", trace->output);
1291                                 err = -EINVAL;
1292                         } else {
1293                                 fputs(", ", trace->output);
1294                         }
1295
1296                         fputs(sc, trace->output);
1297                 }
1298
1299                 trace->ev_qualifier_ids.entries[i++] = id;
1300         }
1301
1302         if (err < 0) {
1303                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1304                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1305                 zfree(&trace->ev_qualifier_ids.entries);
1306                 trace->ev_qualifier_ids.nr = 0;
1307         }
1308 out:
1309         return err;
1310 }
1311
1312 /*
1313  * args is to be interpreted as a series of longs, but we need to handle
1314  * 8-byte unaligned accesses: args points into raw_data within the event,
1315  * and raw_data is guaranteed not to be 8-byte aligned because it is
1316  * preceded by raw_size, which is a u32. So we need to copy args to a temp
1317  * variable to read it. Most notably this avoids extended load instructions
1318  * on unaligned addresses.
1319  */
1320
1321 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1322                                       unsigned char *args, struct trace *trace,
1323                                       struct thread *thread)
1324 {
1325         size_t printed = 0;
1326         unsigned char *p;
1327         unsigned long val;
1328
1329         if (sc->args != NULL) {
1330                 struct format_field *field;
1331                 u8 bit = 1;
1332                 struct syscall_arg arg = {
1333                         .idx    = 0,
1334                         .mask   = 0,
1335                         .trace  = trace,
1336                         .thread = thread,
1337                 };
1338
1339                 for (field = sc->args; field;
1340                      field = field->next, ++arg.idx, bit <<= 1) {
1341                         if (arg.mask & bit)
1342                                 continue;
1343
1344                         /* special care for unaligned accesses */
1345                         p = args + sizeof(unsigned long) * arg.idx;
1346                         memcpy(&val, p, sizeof(val));
1347
1348                         /*
1349                          * Suppress this argument if its value is zero and
1350                          * we don't have a string associated with it in a
1351                          * strarray.
1352                          */
1353                         if (val == 0 &&
1354                             !(sc->arg_scnprintf &&
1355                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1356                               sc->arg_parm[arg.idx]))
1357                                 continue;
1358
1359                         printed += scnprintf(bf + printed, size - printed,
1360                                              "%s%s: ", printed ? ", " : "", field->name);
1361                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1362                                 arg.val = val;
1363                                 if (sc->arg_parm)
1364                                         arg.parm = sc->arg_parm[arg.idx];
1365                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1366                                                                       size - printed, &arg);
1367                         } else {
1368                                 printed += scnprintf(bf + printed, size - printed,
1369                                                      "%ld", val);
1370                         }
1371                 }
1372         } else if (IS_ERR(sc->tp_format)) {
1373                 /*
1374                  * If we managed to read the tracepoint /format file, then we
1375                  * may end up not having any args, like with gettid(), so only
1376                  * print the raw args when we didn't manage to read it.
1377                  */
1378                 int i = 0;
1379
1380                 while (i < 6) {
1381                         /* special care for unaligned accesses */
1382                         p = args + sizeof(unsigned long) * i;
1383                         memcpy(&val, p, sizeof(val));
1384                         printed += scnprintf(bf + printed, size - printed,
1385                                              "%sarg%d: %ld",
1386                                              printed ? ", " : "", i, val);
1387                         ++i;
1388                 }
1389         }
1390
1391         return printed;
1392 }
1393
1394 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1395                                   union perf_event *event,
1396                                   struct perf_sample *sample);
1397
1398 static struct syscall *trace__syscall_info(struct trace *trace,
1399                                            struct perf_evsel *evsel, int id)
1400 {
1401
1402         if (id < 0) {
1403
1404                 /*
1405                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1406                  * before that, leaving at a higher verbosity level till that is
1407                  * explained. Reproduced with plain ftrace with:
1408                  *
1409                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1410                  * grep "NR -1 " /t/trace_pipe
1411                  *
1412                  * After generating some load on the machine.
1413                  */
1414                 if (verbose > 1) {
1415                         static u64 n;
1416                         fprintf(trace->output, "Invalid syscall id %d, skipping (%s, %" PRIu64 ") ...\n",
1417                                 id, perf_evsel__name(evsel), ++n);
1418                 }
1419                 return NULL;
1420         }
1421
1422         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1423             trace__read_syscall_info(trace, id))
1424                 goto out_cant_read;
1425
1426         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1427                 goto out_cant_read;
1428
1429         return &trace->syscalls.table[id];
1430
1431 out_cant_read:
1432         if (verbose > 0) {
1433                 fprintf(trace->output, "Problems reading syscall %d", id);
1434                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1435                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1436                 fputs(" information\n", trace->output);
1437         }
1438         return NULL;
1439 }
1440
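/*
 * Accumulate per-thread, per-syscall stats (call count and duration) in an
 * intlist keyed by syscall id; thread__dump_stats() turns this into the
 * summary table when trace->summary is set.
 */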
1441 static void thread__update_stats(struct thread_trace *ttrace,
1442                                  int id, struct perf_sample *sample)
1443 {
1444         struct int_node *inode;
1445         struct stats *stats;
1446         u64 duration = 0;
1447
1448         inode = intlist__findnew(ttrace->syscall_stats, id);
1449         if (inode == NULL)
1450                 return;
1451
1452         stats = inode->priv;
1453         if (stats == NULL) {
1454                 stats = malloc(sizeof(struct stats));
1455                 if (stats == NULL)
1456                         return;
1457                 init_stats(stats);
1458                 inode->priv = stats;
1459         }
1460
1461         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1462                 duration = sample->time - ttrace->entry_time;
1463
1464         update_stats(stats, duration);
1465 }
1466
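/*
 * If a syscall entry is still pending for the current thread when another
 * event shows up, flush the partially formatted entry line with a trailing
 * "...", so the matching sys_exit can later be printed as "continued".
 */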
1467 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1468 {
1469         struct thread_trace *ttrace;
1470         u64 duration;
1471         size_t printed;
1472
1473         if (trace->current == NULL)
1474                 return 0;
1475
1476         ttrace = thread__priv(trace->current);
1477
1478         if (!ttrace->entry_pending)
1479                 return 0;
1480
1481         duration = sample->time - ttrace->entry_time;
1482
1483         printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1484         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1485         ttrace->entry_pending = false;
1486
1487         return printed;
1488 }
1489
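/*
 * raw_syscalls:sys_enter handler: formats "name(arg: val, ...)" into the
 * per-thread entry_str.  Printing is normally deferred until sys_exit so the
 * return value lands on the same line; syscalls that never return
 * (sc->is_exit, e.g. exit/exit_group) are printed right away.
 */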
1490 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1491                             union perf_event *event __maybe_unused,
1492                             struct perf_sample *sample)
1493 {
1494         char *msg;
1495         void *args;
1496         size_t printed = 0;
1497         struct thread *thread;
1498         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1499         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1500         struct thread_trace *ttrace;
1501
1502         if (sc == NULL)
1503                 return -1;
1504
1505         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1506         ttrace = thread__trace(thread, trace->output);
1507         if (ttrace == NULL)
1508                 goto out_put;
1509
1510         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1511
1512         if (ttrace->entry_str == NULL) {
1513                 ttrace->entry_str = malloc(trace__entry_str_size);
1514                 if (!ttrace->entry_str)
1515                         goto out_put;
1516         }
1517
1518         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1519                 trace__printf_interrupted_entry(trace, sample);
1520
1521         ttrace->entry_time = sample->time;
1522         msg = ttrace->entry_str;
1523         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1524
1525         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1526                                            args, trace, thread);
1527
1528         if (sc->is_exit) {
1529                 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1530                         trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1531                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1532                 }
1533         } else {
1534                 ttrace->entry_pending = true;
1535                 /* See trace__vfs_getname & trace__sys_exit */
1536                 ttrace->filename.pending_open = false;
1537         }
1538
1539         if (trace->current != thread) {
1540                 thread__put(trace->current);
1541                 trace->current = thread__get(thread);
1542         }
1543         err = 0;
1544 out_put:
1545         thread__put(thread);
1546         return err;
1547 }
1548
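/*
 * Resolve the sample's callchain into the shared callchain_cursor, capped at
 * trace->max_stack entries, for later printing by trace__fprintf_callchain().
 */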
1549 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1550                                     struct perf_sample *sample,
1551                                     struct callchain_cursor *cursor)
1552 {
1553         struct addr_location al;
1554
1555         if (machine__resolve(trace->host, &al, sample) < 0 ||
1556             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1557                 return -1;
1558
1559         return 0;
1560 }
1561
1562 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1563 {
1564         /* TODO: user-configurable print_opts */
1565         const unsigned int print_opts = EVSEL__PRINT_SYM |
1566                                         EVSEL__PRINT_DSO |
1567                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1568
1569         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1570 }
1571
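/*
 * raw_syscalls:sys_exit handler: computes the syscall duration from the
 * recorded entry time, applies the duration filter, completes the pending
 * entry line (or prints a "continued" one) and then decodes the return
 * value: errno name, "Timeout", hex or a child pid/comm, as appropriate.
 */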
1572 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1573                            union perf_event *event __maybe_unused,
1574                            struct perf_sample *sample)
1575 {
1576         long ret;
1577         u64 duration = 0;
1578         bool duration_calculated = false;
1579         struct thread *thread;
1580         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1581         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1582         struct thread_trace *ttrace;
1583
1584         if (sc == NULL)
1585                 return -1;
1586
1587         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1588         ttrace = thread__trace(thread, trace->output);
1589         if (ttrace == NULL)
1590                 goto out_put;
1591
1592         if (trace->summary)
1593                 thread__update_stats(ttrace, id, sample);
1594
1595         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1596
1597         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1598                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1599                 ttrace->filename.pending_open = false;
1600                 ++trace->stats.vfs_getname;
1601         }
1602
1603         if (ttrace->entry_time) {
1604                 duration = sample->time - ttrace->entry_time;
1605                 if (trace__filter_duration(trace, duration))
1606                         goto out;
1607                 duration_calculated = true;
1608         } else if (trace->duration_filter)
1609                 goto out;
1610
1611         if (sample->callchain) {
1612                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1613                 if (callchain_ret == 0) {
1614                         if (callchain_cursor.nr < trace->min_stack)
1615                                 goto out;
1616                         callchain_ret = 1;
1617                 }
1618         }
1619
1620         if (trace->summary_only)
1621                 goto out;
1622
1623         trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1624
1625         if (ttrace->entry_pending) {
1626                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1627         } else {
1628                 fprintf(trace->output, " ... [");
1629                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1630                 fprintf(trace->output, "]: %s()", sc->name);
1631         }
1632
1633         if (sc->fmt == NULL) {
1634 signed_print:
1635                 fprintf(trace->output, ") = %ld", ret);
1636         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1637                 char bf[STRERR_BUFSIZE];
1638                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1639                            *e = audit_errno_to_name(-ret);
1640
1641                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1642         } else if (ret == 0 && sc->fmt->timeout)
1643                 fprintf(trace->output, ") = 0 Timeout");
1644         else if (sc->fmt->hexret)
1645                 fprintf(trace->output, ") = %#lx", ret);
1646         else if (sc->fmt->errpid) {
1647                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1648
1649                 if (child != NULL) {
1650                         fprintf(trace->output, ") = %ld", ret);
1651                         if (child->comm_set)
1652                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1653                         thread__put(child);
1654                 }
1655         } else
1656                 goto signed_print;
1657
1658         fputc('\n', trace->output);
1659
1660         if (callchain_ret > 0)
1661                 trace__fprintf_callchain(trace, sample);
1662         else if (callchain_ret < 0)
1663                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1664 out:
1665         ttrace->entry_pending = false;
1666         err = 0;
1667 out_put:
1668         thread__put(thread);
1669         return err;
1670 }
1671
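/*
 * probe:vfs_getname handler: copies the pathname being looked up and, when a
 * syscall entry is pending with a placeholder recorded at
 * filename.entry_str_pos, splices the string into entry_str so pointer
 * arguments such as open()'s filename are shown as strings.
 */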
1672 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1673                               union perf_event *event __maybe_unused,
1674                               struct perf_sample *sample)
1675 {
1676         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1677         struct thread_trace *ttrace;
1678         size_t filename_len, entry_str_len, to_move;
1679         ssize_t remaining_space;
1680         char *pos;
1681         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1682
1683         if (!thread)
1684                 goto out;
1685
1686         ttrace = thread__priv(thread);
1687         if (!ttrace)
1688                 goto out_put;
1689
1690         filename_len = strlen(filename);
1691         if (filename_len == 0)
1692                 goto out_put;
1693
1694         if (ttrace->filename.namelen < filename_len) {
1695                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1696
1697                 if (f == NULL)
1698                         goto out_put;
1699
1700                 ttrace->filename.namelen = filename_len;
1701                 ttrace->filename.name = f;
1702         }
1703
1704         strcpy(ttrace->filename.name, filename);
1705         ttrace->filename.pending_open = true;
1706
1707         if (!ttrace->filename.ptr)
1708                 goto out_put;
1709
1710         entry_str_len = strlen(ttrace->entry_str);
1711         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1712         if (remaining_space <= 0)
1713                 goto out_put;
1714
1715         if (filename_len > (size_t)remaining_space) {
1716                 filename += filename_len - remaining_space;
1717                 filename_len = remaining_space;
1718         }
1719
1720         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1721         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1722         memmove(pos + filename_len, pos, to_move);
1723         memcpy(pos, filename, filename_len);
1724
1725         ttrace->filename.ptr = 0;
1726         ttrace->filename.entry_str_pos = 0;
1727 out_put:
1728         thread__put(thread);
1729 out:
1730         return 0;
1731 }
1732
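/*
 * sched:sched_stat_runtime handler (added when trace->sched is set):
 * accumulates runtime per thread and for the whole session, in msecs, for
 * the thread summary.
 */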
1733 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1734                                      union perf_event *event __maybe_unused,
1735                                      struct perf_sample *sample)
1736 {
1737         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1738         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1739         struct thread *thread = machine__findnew_thread(trace->host,
1740                                                         sample->pid,
1741                                                         sample->tid);
1742         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1743
1744         if (ttrace == NULL)
1745                 goto out_dump;
1746
1747         ttrace->runtime_ms += runtime_ms;
1748         trace->runtime_ms += runtime_ms;
1749 out_put:
1750         thread__put(thread);
1751         return 0;
1752
1753 out_dump:
1754         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1755                evsel->name,
1756                perf_evsel__strval(evsel, sample, "comm"),
1757                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1758                runtime,
1759                perf_evsel__intval(evsel, sample, "vruntime"));
1760         goto out_put;
1761 }
1762
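/*
 * print_binary() callback for BPF_OUTPUT events: emit printable bytes as-is
 * and '.' for the rest, ignoring the hexdump framing ops so only the
 * character view of the payload is printed.
 */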
1763 static void bpf_output__printer(enum binary_printer_ops op,
1764                                 unsigned int val, void *extra)
1765 {
1766         FILE *output = extra;
1767         unsigned char ch = (unsigned char)val;
1768
1769         switch (op) {
1770         case BINARY_PRINT_CHAR_DATA:
1771                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1772                 break;
1773         case BINARY_PRINT_DATA_BEGIN:
1774         case BINARY_PRINT_LINE_BEGIN:
1775         case BINARY_PRINT_ADDR:
1776         case BINARY_PRINT_NUM_DATA:
1777         case BINARY_PRINT_NUM_PAD:
1778         case BINARY_PRINT_SEP:
1779         case BINARY_PRINT_CHAR_PAD:
1780         case BINARY_PRINT_LINE_END:
1781         case BINARY_PRINT_DATA_END:
1782         default:
1783                 break;
1784         }
1785 }
1786
1787 static void bpf_output__fprintf(struct trace *trace,
1788                                 struct perf_sample *sample)
1789 {
1790         print_binary(sample->raw_data, sample->raw_size, 8,
1791                      bpf_output__printer, trace->output);
1792 }
1793
1794 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1795                                 union perf_event *event __maybe_unused,
1796                                 struct perf_sample *sample)
1797 {
1798         int callchain_ret = 0;
1799
1800         if (sample->callchain) {
1801                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1802                 if (callchain_ret == 0) {
1803                         if (callchain_cursor.nr < trace->min_stack)
1804                                 goto out;
1805                         callchain_ret = 1;
1806                 }
1807         }
1808
1809         trace__printf_interrupted_entry(trace, sample);
1810         trace__fprintf_tstamp(trace, sample->time, trace->output);
1811
1812         if (trace->trace_syscalls)
1813                 fprintf(trace->output, "(         ): ");
1814
1815         fprintf(trace->output, "%s:", evsel->name);
1816
1817         if (perf_evsel__is_bpf_output(evsel)) {
1818                 bpf_output__fprintf(trace, sample);
1819         } else if (evsel->tp_format) {
1820                 event_format__fprintf(evsel->tp_format, sample->cpu,
1821                                       sample->raw_data, sample->raw_size,
1822                                       trace->output);
1823         }
1824
1825         fprintf(trace->output, ")\n");
1826
1827         if (callchain_ret > 0)
1828                 trace__fprintf_callchain(trace, sample);
1829         else if (callchain_ret < 0)
1830                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1831 out:
1832         return 0;
1833 }
1834
1835 static void print_location(FILE *f, struct perf_sample *sample,
1836                            struct addr_location *al,
1837                            bool print_dso, bool print_sym)
1838 {
1839
1840         if ((verbose > 0 || print_dso) && al->map)
1841                 fprintf(f, "%s@", al->map->dso->long_name);
1842
1843         if ((verbose > 0 || print_sym) && al->sym)
1844                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1845                         al->addr - al->sym->start);
1846         else if (al->map)
1847                 fprintf(f, "0x%" PRIx64, al->addr);
1848         else
1849                 fprintf(f, "0x%" PRIx64, sample->addr);
1850 }
1851
1852 static int trace__pgfault(struct trace *trace,
1853                           struct perf_evsel *evsel,
1854                           union perf_event *event __maybe_unused,
1855                           struct perf_sample *sample)
1856 {
1857         struct thread *thread;
1858         struct addr_location al;
1859         char map_type = 'd';
1860         struct thread_trace *ttrace;
1861         int err = -1;
1862         int callchain_ret = 0;
1863
1864         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1865
1866         if (sample->callchain) {
1867                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1868                 if (callchain_ret == 0) {
1869                         if (callchain_cursor.nr < trace->min_stack)
1870                                 goto out_put;
1871                         callchain_ret = 1;
1872                 }
1873         }
1874
1875         ttrace = thread__trace(thread, trace->output);
1876         if (ttrace == NULL)
1877                 goto out_put;
1878
1879         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1880                 ttrace->pfmaj++;
1881         else
1882                 ttrace->pfmin++;
1883
1884         if (trace->summary_only)
1885                 goto out;
1886
1887         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1888                               sample->ip, &al);
1889
1890         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1891
1892         fprintf(trace->output, "%sfault [",
1893                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1894                 "maj" : "min");
1895
1896         print_location(trace->output, sample, &al, false, true);
1897
1898         fprintf(trace->output, "] => ");
1899
1900         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1901                                    sample->addr, &al);
1902
1903         if (!al.map) {
1904                 thread__find_addr_location(thread, sample->cpumode,
1905                                            MAP__FUNCTION, sample->addr, &al);
1906
1907                 if (al.map)
1908                         map_type = 'x';
1909                 else
1910                         map_type = '?';
1911         }
1912
1913         print_location(trace->output, sample, &al, true, false);
1914
1915         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1916
1917         if (callchain_ret > 0)
1918                 trace__fprintf_callchain(trace, sample);
1919         else if (callchain_ret < 0)
1920                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1921 out:
1922         err = 0;
1923 out_put:
1924         thread__put(thread);
1925         return err;
1926 }
1927
1928 static void trace__set_base_time(struct trace *trace,
1929                                  struct perf_evsel *evsel,
1930                                  struct perf_sample *sample)
1931 {
1932         /*
1933          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1934          * and don't use sample->time unconditionally; we may end up having
1935          * some other event in the future without PERF_SAMPLE_TIME for a good
1936          * reason, i.e. we may not be interested in its timestamps, just in
1937          * the fact that it took place, picking some piece of information when
1938          * it appears in our event stream (vfs_getname comes to mind).
1939          */
1940         if (trace->base_time == 0 && !trace->full_time &&
1941             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1942                 trace->base_time = sample->time;
1943 }
1944
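/*
 * perf_tool sample callback used when replaying a perf.data file: dispatches
 * to the per-evsel handler wired up in trace__replay(), skipping filtered
 * threads.
 */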
1945 static int trace__process_sample(struct perf_tool *tool,
1946                                  union perf_event *event,
1947                                  struct perf_sample *sample,
1948                                  struct perf_evsel *evsel,
1949                                  struct machine *machine __maybe_unused)
1950 {
1951         struct trace *trace = container_of(tool, struct trace, tool);
1952         struct thread *thread;
1953         int err = 0;
1954
1955         tracepoint_handler handler = evsel->handler;
1956
1957         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1958         if (thread && thread__is_filtered(thread))
1959                 goto out;
1960
1961         trace__set_base_time(trace, evsel, sample);
1962
1963         if (handler) {
1964                 ++trace->nr_events;
1965                 handler(trace, evsel, event, sample);
1966         }
1967 out:
1968         thread__put(thread);
1969         return err;
1970 }
1971
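/*
 * 'perf trace record' is a thin wrapper around 'perf record': it builds an
 * argv along the lines of
 *
 *   perf record -R -m 1024 -c 1 \
 *       -e raw_syscalls:sys_enter,raw_syscalls:sys_exit \
 *       [-e major-faults] [-e minor-faults] <user args>
 *
 * falling back to the syscalls:* tracepoints on older kernels, and hands it
 * to cmd_record().
 */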
1972 static int trace__record(struct trace *trace, int argc, const char **argv)
1973 {
1974         unsigned int rec_argc, i, j;
1975         const char **rec_argv;
1976         const char * const record_args[] = {
1977                 "record",
1978                 "-R",
1979                 "-m", "1024",
1980                 "-c", "1",
1981         };
1982
1983         const char * const sc_args[] = { "-e", };
1984         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1985         const char * const majpf_args[] = { "-e", "major-faults" };
1986         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1987         const char * const minpf_args[] = { "-e", "minor-faults" };
1988         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1989
1990         /* +1 is for the event string below */
1991         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1992                 majpf_args_nr + minpf_args_nr + argc;
1993         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1994
1995         if (rec_argv == NULL)
1996                 return -ENOMEM;
1997
1998         j = 0;
1999         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2000                 rec_argv[j++] = record_args[i];
2001
2002         if (trace->trace_syscalls) {
2003                 for (i = 0; i < sc_args_nr; i++)
2004                         rec_argv[j++] = sc_args[i];
2005
2006                 /* event string may be different for older kernels - e.g., RHEL6 */
2007                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2008                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2009                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2010                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2011                 else {
2012                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
                        free(rec_argv);
2013                         return -1;
2014                 }
2015         }
2016
2017         if (trace->trace_pgfaults & TRACE_PFMAJ)
2018                 for (i = 0; i < majpf_args_nr; i++)
2019                         rec_argv[j++] = majpf_args[i];
2020
2021         if (trace->trace_pgfaults & TRACE_PFMIN)
2022                 for (i = 0; i < minpf_args_nr; i++)
2023                         rec_argv[j++] = minpf_args[i];
2024
2025         for (i = 0; i < (unsigned int)argc; i++)
2026                 rec_argv[j++] = argv[i];
2027
2028         return cmd_record(j, rec_argv);
2029 }
2030
2031 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2032
2033 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2034 {
2035         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2036
2037         if (IS_ERR(evsel))
2038                 return false;
2039
2040         if (perf_evsel__field(evsel, "pathname") == NULL) {
2041                 perf_evsel__delete(evsel);
2042                 return false;
2043         }
2044
2045         evsel->handler = trace__vfs_getname;
2046         perf_evlist__add(evlist, evsel);
2047         return true;
2048 }
2049
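/*
 * Software page fault event (major or minor, depending on 'config'),
 * sampling every fault (period 1) with mmap_data set so the faulting data
 * address can be resolved in trace__pgfault().
 */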
2050 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2051 {
2052         struct perf_evsel *evsel;
2053         struct perf_event_attr attr = {
2054                 .type = PERF_TYPE_SOFTWARE,
2055                 .mmap_data = 1,
2056         };
2057
2058         attr.config = config;
2059         attr.sample_period = 1;
2060
2061         event_attr_init(&attr);
2062
2063         evsel = perf_evsel__new(&attr);
2064         if (evsel)
2065                 evsel->handler = trace__pgfault;
2066
2067         return evsel;
2068 }
2069
2070 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2071 {
2072         const u32 type = event->header.type;
2073         struct perf_evsel *evsel;
2074
2075         if (type != PERF_RECORD_SAMPLE) {
2076                 trace__process_event(trace, trace->host, event, sample);
2077                 return;
2078         }
2079
2080         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2081         if (evsel == NULL) {
2082                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2083                 return;
2084         }
2085
2086         trace__set_base_time(trace, evsel, sample);
2087
2088         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2089             sample->raw_data == NULL) {
2090                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2091                        perf_evsel__name(evsel), sample->tid,
2092                        sample->cpu, sample->raw_size);
2093         } else {
2094                 tracepoint_handler handler = evsel->handler;
2095                 handler(trace, evsel, event, sample);
2096         }
2097 }
2098
2099 static int trace__add_syscall_newtp(struct trace *trace)
2100 {
2101         int ret = -1;
2102         struct perf_evlist *evlist = trace->evlist;
2103         struct perf_evsel *sys_enter, *sys_exit;
2104
2105         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2106         if (sys_enter == NULL)
2107                 goto out;
2108
2109         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2110                 goto out_delete_sys_enter;
2111
2112         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2113         if (sys_exit == NULL)
2114                 goto out_delete_sys_enter;
2115
2116         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2117                 goto out_delete_sys_exit;
2118
2119         perf_evlist__add(evlist, sys_enter);
2120         perf_evlist__add(evlist, sys_exit);
2121
2122         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2123                 /*
2124                  * We're interested only in the user space callchain
2125                  * leading to the syscall; allow overriding that for
2126                  * debugging reasons using --kernel_syscall_callchains.
2127                  */
2128                 sys_exit->attr.exclude_callchain_kernel = 1;
2129         }
2130
2131         trace->syscalls.events.sys_enter = sys_enter;
2132         trace->syscalls.events.sys_exit  = sys_exit;
2133
2134         ret = 0;
2135 out:
2136         return ret;
2137
2138 out_delete_sys_exit:
2139         perf_evsel__delete_priv(sys_exit);
2140 out_delete_sys_enter:
2141         perf_evsel__delete_priv(sys_enter);
2142         goto out;
2143 }
2144
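/*
 * Turn the syscall qualifier list (-e) into a tracepoint filter appended to
 * both the sys_enter and sys_exit events.  asprintf_expr_inout_ints() is
 * expected to build an expression on the "id" field, something like
 * "id == 0 || id == 1", or its negation when the list was prefixed with '!'.
 */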
2145 static int trace__set_ev_qualifier_filter(struct trace *trace)
2146 {
2147         int err = -1;
2148         struct perf_evsel *sys_exit;
2149         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2150                                                 trace->ev_qualifier_ids.nr,
2151                                                 trace->ev_qualifier_ids.entries);
2152
2153         if (filter == NULL)
2154                 goto out_enomem;
2155
2156         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2157                                           filter)) {
2158                 sys_exit = trace->syscalls.events.sys_exit;
2159                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2160         }
2161
2162         free(filter);
2163 out:
2164         return err;
2165 out_enomem:
2166         errno = ENOMEM;
2167         goto out;
2168 }
2169
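/*
 * Live mode: set up the evsels (syscall tracepoints, vfs_getname, page
 * faults, sched_stat_runtime), apply pid and syscall filters, mmap the ring
 * buffers, optionally fork and start the workload, then consume events until
 * interrupted or the workload exits.
 */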
2170 static int trace__run(struct trace *trace, int argc, const char **argv)
2171 {
2172         struct perf_evlist *evlist = trace->evlist;
2173         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2174         int err = -1, i;
2175         unsigned long before;
2176         const bool forks = argc > 0;
2177         bool draining = false;
2178
2179         trace->live = true;
2180
2181         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2182                 goto out_error_raw_syscalls;
2183
2184         if (trace->trace_syscalls)
2185                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2186
2187         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2188                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2189                 if (pgfault_maj == NULL)
2190                         goto out_error_mem;
2191                 perf_evlist__add(evlist, pgfault_maj);
2192         }
2193
2194         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2195                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2196                 if (pgfault_min == NULL)
2197                         goto out_error_mem;
2198                 perf_evlist__add(evlist, pgfault_min);
2199         }
2200
2201         if (trace->sched &&
2202             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2203                                    trace__sched_stat_runtime))
2204                 goto out_error_sched_stat_runtime;
2205
2206         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2207         if (err < 0) {
2208                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2209                 goto out_delete_evlist;
2210         }
2211
2212         err = trace__symbols_init(trace, evlist);
2213         if (err < 0) {
2214                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2215                 goto out_delete_evlist;
2216         }
2217
2218         perf_evlist__config(evlist, &trace->opts, NULL);
2219
2220         if (callchain_param.enabled) {
2221                 bool use_identifier = false;
2222
2223                 if (trace->syscalls.events.sys_exit) {
2224                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2225                                                      &trace->opts, &callchain_param);
2226                         use_identifier = true;
2227                 }
2228
2229                 if (pgfault_maj) {
2230                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2231                         use_identifier = true;
2232                 }
2233
2234                 if (pgfault_min) {
2235                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2236                         use_identifier = true;
2237                 }
2238
2239                 if (use_identifier) {
2240                        /*
2241                         * Now we have evsels with different sample_ids, use
2242                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2243                         * from a fixed position in each ring buffer record.
2244                         *
2245                         * As of the changeset introducing this comment, this
2246                         * isn't strictly needed, as the fields that can come before
2247                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2248                         * some of those for things like copying the payload of
2249                         * pointer syscall arguments, and for vfs_getname we don't
2250                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2251                         * here as a reminder that we need to use PERF_SAMPLE_IDENTIFIER.
2252                         */
2253                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2254                         perf_evlist__reset_sample_bit(evlist, ID);
2255                 }
2256         }
2257
2258         signal(SIGCHLD, sig_handler);
2259         signal(SIGINT, sig_handler);
2260
2261         if (forks) {
2262                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2263                                                     argv, false, NULL);
2264                 if (err < 0) {
2265                         fprintf(trace->output, "Couldn't run the workload!\n");
2266                         goto out_delete_evlist;
2267                 }
2268         }
2269
2270         err = perf_evlist__open(evlist);
2271         if (err < 0)
2272                 goto out_error_open;
2273
2274         err = bpf__apply_obj_config();
2275         if (err) {
2276                 char errbuf[BUFSIZ];
2277
2278                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2279                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2280                          errbuf);
2281                 goto out_error_open;
2282         }
2283
2284         /*
2285          * Better not use !target__has_task() here because we need to cover the
2286          * case where no threads were specified in the command line, but a
2287          * workload was, and in that case we will fill in the thread_map when
2288          * we fork the workload in perf_evlist__prepare_workload.
2289          */
2290         if (trace->filter_pids.nr > 0)
2291                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2292         else if (thread_map__pid(evlist->threads, 0) == -1)
2293                 err = perf_evlist__set_filter_pid(evlist, getpid());
2294
2295         if (err < 0)
2296                 goto out_error_mem;
2297
2298         if (trace->ev_qualifier_ids.nr > 0) {
2299                 err = trace__set_ev_qualifier_filter(trace);
2300                 if (err < 0)
2301                         goto out_errno;
2302
2303                 pr_debug("event qualifier tracepoint filter: %s\n",
2304                          trace->syscalls.events.sys_exit->filter);
2305         }
2306
2307         err = perf_evlist__apply_filters(evlist, &evsel);
2308         if (err < 0)
2309                 goto out_error_apply_filters;
2310
2311         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2312         if (err < 0)
2313                 goto out_error_mmap;
2314
2315         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2316                 perf_evlist__enable(evlist);
2317
2318         if (forks)
2319                 perf_evlist__start_workload(evlist);
2320
2321         if (trace->opts.initial_delay) {
2322                 usleep(trace->opts.initial_delay * 1000);
2323                 perf_evlist__enable(evlist);
2324         }
2325
2326         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2327                                   evlist->threads->nr > 1 ||
2328                                   perf_evlist__first(evlist)->attr.inherit;
2329 again:
2330         before = trace->nr_events;
2331
2332         for (i = 0; i < evlist->nr_mmaps; i++) {
2333                 union perf_event *event;
2334
2335                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2336                         struct perf_sample sample;
2337
2338                         ++trace->nr_events;
2339
2340                         err = perf_evlist__parse_sample(evlist, event, &sample);
2341                         if (err) {
2342                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2343                                 goto next_event;
2344                         }
2345
2346                         trace__handle_event(trace, event, &sample);
2347 next_event:
2348                         perf_evlist__mmap_consume(evlist, i);
2349
2350                         if (interrupted)
2351                                 goto out_disable;
2352
2353                         if (done && !draining) {
2354                                 perf_evlist__disable(evlist);
2355                                 draining = true;
2356                         }
2357                 }
2358         }
2359
2360         if (trace->nr_events == before) {
2361                 int timeout = done ? 100 : -1;
2362
2363                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2364                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2365                                 draining = true;
2366
2367                         goto again;
2368                 }
2369         } else {
2370                 goto again;
2371         }
2372
2373 out_disable:
2374         thread__zput(trace->current);
2375
2376         perf_evlist__disable(evlist);
2377
2378         if (!err) {
2379                 if (trace->summary)
2380                         trace__fprintf_thread_summary(trace, trace->output);
2381
2382                 if (trace->show_tool_stats) {
2383                         fprintf(trace->output, "Stats:\n "
2384                                                " vfs_getname : %" PRIu64 "\n"
2385                                                " proc_getname: %" PRIu64 "\n",
2386                                 trace->stats.vfs_getname,
2387                                 trace->stats.proc_getname);
2388                 }
2389         }
2390
2391 out_delete_evlist:
2392         perf_evlist__delete(evlist);
2393         trace->evlist = NULL;
2394         trace->live = false;
2395         return err;
2396 {
2397         char errbuf[BUFSIZ];
2398
2399 out_error_sched_stat_runtime:
2400         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2401         goto out_error;
2402
2403 out_error_raw_syscalls:
2404         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2405         goto out_error;
2406
2407 out_error_mmap:
2408         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2409         goto out_error;
2410
2411 out_error_open:
2412         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2413
2414 out_error:
2415         fprintf(trace->output, "%s\n", errbuf);
2416         goto out_delete_evlist;
2417
2418 out_error_apply_filters:
2419         fprintf(trace->output,
2420                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2421                 evsel->filter, perf_evsel__name(evsel), errno,
2422                 str_error_r(errno, errbuf, sizeof(errbuf)));
2423         goto out_delete_evlist;
2424 }
2425 out_error_mem:
2426         fprintf(trace->output, "Not enough memory to run!\n");
2427         goto out_delete_evlist;
2428
2429 out_errno:
2430         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2431         goto out_delete_evlist;
2432 }
2433
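/*
 * Replay mode (perf trace -i): wire up the tool callbacks, attach the
 * syscall and page fault handlers to the events found in the perf.data file
 * and process the session through the ordered-events machinery.
 */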
2434 static int trace__replay(struct trace *trace)
2435 {
2436         const struct perf_evsel_str_handler handlers[] = {
2437                 { "probe:vfs_getname",       trace__vfs_getname, },
2438         };
2439         struct perf_data_file file = {
2440                 .path  = input_name,
2441                 .mode  = PERF_DATA_MODE_READ,
2442                 .force = trace->force,
2443         };
2444         struct perf_session *session;
2445         struct perf_evsel *evsel;
2446         int err = -1;
2447
2448         trace->tool.sample        = trace__process_sample;
2449         trace->tool.mmap          = perf_event__process_mmap;
2450         trace->tool.mmap2         = perf_event__process_mmap2;
2451         trace->tool.comm          = perf_event__process_comm;
2452         trace->tool.exit          = perf_event__process_exit;
2453         trace->tool.fork          = perf_event__process_fork;
2454         trace->tool.attr          = perf_event__process_attr;
2455         trace->tool.tracing_data  = perf_event__process_tracing_data;
2456         trace->tool.build_id      = perf_event__process_build_id;
2457         trace->tool.namespaces    = perf_event__process_namespaces;
2458
2459         trace->tool.ordered_events = true;
2460         trace->tool.ordering_requires_timestamps = true;
2461
2462         /* add tid to output */
2463         trace->multiple_threads = true;
2464
2465         session = perf_session__new(&file, false, &trace->tool);
2466         if (session == NULL)
2467                 return -1;
2468
2469         if (trace->opts.target.pid)
2470                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2471
2472         if (trace->opts.target.tid)
2473                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2474
2475         if (symbol__init(&session->header.env) < 0)
2476                 goto out;
2477
2478         trace->host = &session->machines.host;
2479
2480         err = perf_session__set_tracepoints_handlers(session, handlers);
2481         if (err)
2482                 goto out;
2483
2484         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2485                                                      "raw_syscalls:sys_enter");
2486         /* older kernels have syscalls tp versus raw_syscalls */
2487         if (evsel == NULL)
2488                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2489                                                              "syscalls:sys_enter");
2490
2491         if (evsel &&
2492             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2493             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2494                 pr_err("Error during initialization of the raw_syscalls:sys_enter event\n");
2495                 goto out;
2496         }
2497
2498         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2499                                                      "raw_syscalls:sys_exit");
2500         if (evsel == NULL)
2501                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2502                                                              "syscalls:sys_exit");
2503         if (evsel &&
2504             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2505             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2506                 pr_err("Error during initialization of the raw_syscalls:sys_exit event\n");
2507                 goto out;
2508         }
2509
2510         evlist__for_each_entry(session->evlist, evsel) {
2511                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2512                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2513                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2514                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2515                         evsel->handler = trace__pgfault;
2516         }
2517
2518         setup_pager();
2519
2520         err = perf_session__process_events(session);
2521         if (err)
2522                 pr_err("Failed to process events, error %d\n", err);
2524         else if (trace->summary)
2525                 trace__fprintf_thread_summary(trace, trace->output);
2526
2527 out:
2528         perf_session__delete(session);
2529
2530         return err;
2531 }
2532
2533 static size_t trace__fprintf_threads_header(FILE *fp)
2534 {
2535         size_t printed;
2536
2537         printed  = fprintf(fp, "\n Summary of events:\n\n");
2538
2539         return printed;
2540 }
2541
2542 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2543         struct stats    *stats;
2544         double          msecs;
2545         int             syscall;
2546 )
2547 {
2548         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2549         struct stats *stats = source->priv;
2550
2551         entry->syscall = source->i;
2552         entry->stats   = stats;
2553         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2554 }
2555
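/*
 * Per-thread summary table: syscalls sorted by total time spent (msecs),
 * with call count, min/avg/max and the stddev as a percentage of the
 * average.
 */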
2556 static size_t thread__dump_stats(struct thread_trace *ttrace,
2557                                  struct trace *trace, FILE *fp)
2558 {
2559         size_t printed = 0;
2560         struct syscall *sc;
2561         struct rb_node *nd;
2562         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2563
2564         if (syscall_stats == NULL)
2565                 return 0;
2566
2567         printed += fprintf(fp, "\n");
2568
2569         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2570         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2571         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2572
2573         resort_rb__for_each_entry(nd, syscall_stats) {
2574                 struct stats *stats = syscall_stats_entry->stats;
2575                 if (stats) {
2576                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2577                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2578                         double avg = avg_stats(stats);
2579                         double pct;
2580                         u64 n = (u64) stats->n;
2581
2582                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2583                         avg /= NSEC_PER_MSEC;
2584
2585                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2586                         printed += fprintf(fp, "   %-15s", sc->name);
2587                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2588                                            n, syscall_stats_entry->msecs, min, avg);
2589                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2590                 }
2591         }
2592
2593         resort_rb__delete(syscall_stats);
2594         printed += fprintf(fp, "\n\n");
2595
2596         return printed;
2597 }
2598
2599 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2600 {
2601         size_t printed = 0;
2602         struct thread_trace *ttrace = thread__priv(thread);
2603         double ratio;
2604
2605         if (ttrace == NULL)
2606                 return 0;
2607
2608         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2609
2610         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2611         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2612         printed += fprintf(fp, "%.1f%%", ratio);
2613         if (ttrace->pfmaj)
2614                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2615         if (ttrace->pfmin)
2616                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2617         if (trace->sched)
2618                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2619         else if (fputc('\n', fp) != EOF)
2620                 ++printed;
2621
2622         printed += thread__dump_stats(ttrace, trace, fp);
2623
2624         return printed;
2625 }
2626
2627 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2628 {
2629         return ttrace ? ttrace->nr_events : 0;
2630 }
2631
2632 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2633         struct thread *thread;
2634 )
2635 {
2636         entry->thread = rb_entry(nd, struct thread, rb_node);
2637 }
2638
2639 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2640 {
2641         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2642         size_t printed = trace__fprintf_threads_header(fp);
2643         struct rb_node *nd;
2644
2645         if (threads == NULL) {
2646                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2647                 return 0;
2648         }
2649
2650         resort_rb__for_each_entry(nd, threads)
2651                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2652
2653         resort_rb__delete(threads);
2654
2655         return printed;
2656 }
2657
2658 static int trace__set_duration(const struct option *opt, const char *str,
2659                                int unset __maybe_unused)
2660 {
2661         struct trace *trace = opt->value;
2662
2663         trace->duration_filter = atof(str);
2664         return 0;
2665 }
2666
2667 static int trace__set_filter_pids(const struct option *opt, const char *str,
2668                                   int unset __maybe_unused)
2669 {
2670         int ret = -1;
2671         size_t i;
2672         struct trace *trace = opt->value;
2673         /*
2674          * FIXME: introduce an intarray class, plainly parse the csv and create
2675          * a { int nr, int entries[] } struct...
2676          */
2677         struct intlist *list = intlist__new(str);
2678
2679         if (list == NULL)
2680                 return -1;
2681
2682         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2683         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2684
2685         if (trace->filter_pids.entries == NULL)
2686                 goto out;
2687
2688         trace->filter_pids.entries[0] = getpid();
2689
2690         for (i = 1; i < trace->filter_pids.nr; ++i)
2691                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2692
2693         intlist__delete(list);
2694         ret = 0;
2695 out:
2696         return ret;
2697 }
2698
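/*
 * Open the -o/--output file, first rotating an existing non-empty file to
 * "<name>.old" so a previous trace isn't silently overwritten.
 */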
2699 static int trace__open_output(struct trace *trace, const char *filename)
2700 {
2701         struct stat st;
2702
2703         if (!stat(filename, &st) && st.st_size) {
2704                 char oldname[PATH_MAX];
2705
2706                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2707                 unlink(oldname);
2708                 rename(filename, oldname);
2709         }
2710
2711         trace->output = fopen(filename, "w");
2712
2713         return trace->output == NULL ? -errno : 0;
2714 }
2715
2716 static int parse_pagefaults(const struct option *opt, const char *str,
2717                             int unset __maybe_unused)
2718 {
2719         int *trace_pgfaults = opt->value;
2720
2721         if (strcmp(str, "all") == 0)
2722                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2723         else if (strcmp(str, "maj") == 0)
2724                 *trace_pgfaults |= TRACE_PFMAJ;
2725         else if (strcmp(str, "min") == 0)
2726                 *trace_pgfaults |= TRACE_PFMIN;
2727         else
2728                 return -1;
2729
2730         return 0;
2731 }
2732
2733 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2734 {
2735         struct perf_evsel *evsel;
2736
2737         evlist__for_each_entry(evlist, evsel)
2738                 evsel->handler = handler;
2739 }
2740
2741 /*
2742  * XXX: Hackish, just splitting the combined -e/--event option (syscalls
2743  * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc.)) to use
2744  * the existing facilities unchanged (trace->ev_qualifier + parse_options()).
2745  *
2746  * It'd be better to introduce a parse_options() variant that would return a
2747  * list with the terms it didn't match to an event...
2748  */
2749 static int trace__parse_events_option(const struct option *opt, const char *str,
2750                                       int unset __maybe_unused)
2751 {
2752         struct trace *trace = (struct trace *)opt->value;
2753         const char *s = str;
2754         char *sep = NULL, *lists[2] = { NULL, NULL, };
2755         int len = strlen(str), err = -1, list;
2756         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2757         char group_name[PATH_MAX];
2758
2759         if (strace_groups_dir == NULL)
2760                 return -1;
2761
2762         if (*s == '!') {
2763                 ++s;
2764                 trace->not_ev_qualifier = true;
2765         }
2766
2767         while (1) {
2768                 if ((sep = strchr(s, ',')) != NULL)
2769                         *sep = '\0';
2770
2771                 list = 0;
2772                 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2773                         list = 1;
2774                 } else {
2775                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2776                         if (access(group_name, R_OK) == 0)
2777                                 list = 1;
2778                 }
2779
2780                 if (lists[list]) {
2781                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2782                 } else {
2783                         lists[list] = malloc(len + 1); /* worst case: all of 'str' lands in one list, plus NUL */
2784                         if (lists[list] == NULL)
2785                                 goto out;
2786                         strcpy(lists[list], s);
2787                 }
2788
2789                 if (!sep)
2790                         break;
2791
2792                 *sep = ',';
2793                 s = sep + 1;
2794         }
2795
2796         if (lists[1] != NULL) {
2797                 struct strlist_config slist_config = {
2798                         .dirname = strace_groups_dir,
2799                 };
2800
2801                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2802                 if (trace->ev_qualifier == NULL) {
2803                         fputs("Not enough memory to parse event qualifier\n", trace->output);
2804                         goto out;
2805                 }
2806
2807                 if (trace__validate_ev_qualifier(trace))
2808                         goto out;
2809         }
2810
2811         err = 0;
2812
2813         if (lists[0]) {
2814                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2815                                                "event selector. use 'perf list' to list available events",
2816                                                parse_events_option);
2817                 err = parse_events_option(&o, lists[0], 0);
2818         }
2819 out:
2820         if (sep)
2821                 *sep = ',';
2822
2823         return err;
2824 }
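/*
 * Hedged example of the split performed above, assuming something like
 * "perf trace -e open,close,sched:sched_switch":
 *
 *   lists[1] = "open,close"         -> syscall names (or strace group files
 *                                      under STRACE_GROUPS_DIR), turned into
 *                                      trace->ev_qualifier
 *   lists[0] = "sched:sched_switch" -> handed back to parse_events_option()
 *                                      as a regular event selector
 *
 * A leading '!' (e.g. -e \!open) only flips trace->not_ev_qualifier, turning
 * the syscall list into an exclusion filter.
 */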
2825
2826 int cmd_trace(int argc, const char **argv)
2827 {
2828         const char *trace_usage[] = {
2829                 "perf trace [<options>] [<command>]",
2830                 "perf trace [<options>] -- <command> [<options>]",
2831                 "perf trace record [<options>] [<command>]",
2832                 "perf trace record [<options>] -- <command> [<options>]",
2833                 NULL
2834         };
2835         struct trace trace = {
2836                 .syscalls = {
2837                         .max = -1,
2838                 },
2839                 .opts = {
2840                         .target = {
2841                                 .uid       = UINT_MAX,
2842                                 .uses_mmap = true,
2843                         },
2844                         .user_freq     = UINT_MAX,
2845                         .user_interval = ULLONG_MAX,
2846                         .no_buffering  = true,
2847                         .mmap_pages    = UINT_MAX,
2848                         .proc_map_timeout  = 500,
2849                 },
2850                 .output = stderr,
2851                 .show_comm = true,
2852                 .trace_syscalls = true,
2853                 .kernel_syscallchains = false,
2854                 .max_stack = UINT_MAX,
2855         };
2856         const char *output_name = NULL;
2857         const struct option trace_options[] = {
2858         OPT_CALLBACK('e', "event", &trace, "event",
2859                      "event/syscall selector. use 'perf list' to list available events",
2860                      trace__parse_events_option),
2861         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2862                     "show the thread COMM next to its id"),
2863         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2864         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2865                      trace__parse_events_option),
2866         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2867         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2868         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2869                     "trace events on existing process id"),
2870         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2871                     "trace events on existing thread id"),
2872         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2873                      "pids to filter (by the kernel)", trace__set_filter_pids),
2874         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2875                     "system-wide collection from all CPUs"),
2876         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2877                     "list of cpus to monitor"),
2878         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2879                     "child tasks do not inherit counters"),
2880         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2881                      "number of mmap data pages",
2882                      perf_evlist__parse_mmap_pages),
2883         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2884                    "user to profile"),
2885         OPT_CALLBACK(0, "duration", &trace, "float",
2886                      "show only events with duration > N.M ms",
2887                      trace__set_duration),
2888         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2889         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2890         OPT_BOOLEAN('T', "time", &trace.full_time,
2891                     "Show full timestamp, not time relative to first start"),
2892         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2893                     "Show only syscall summary with statistics"),
2894         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2895                     "Show all syscalls and summary with statistics"),
2896         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2897                      "Trace pagefaults", parse_pagefaults, "maj"),
2898         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2899         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2900         OPT_CALLBACK(0, "call-graph", &trace.opts,
2901                      "record_mode[,record_size]", record_callchain_help,
2902                      &record_parse_callchain_opt),
2903         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2904                     "Show the kernel callchains on the syscall exit path"),
2905         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2906                      "Set the minimum stack depth when parsing the callchain, "
2907                      "anything below the specified depth will be ignored."),
2908         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2909                      "Set the maximum stack depth when parsing the callchain, "
2910                      "anything beyond the specified depth will be ignored. "
2911                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2912         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2913                         "per thread proc mmap processing timeout in ms"),
2914         OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2915                      "ms to wait before starting measurement after program "
2916                      "start"),
2917         OPT_END()
2918         };
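        /*
         * A few illustrative invocations wired to the options above (examples
         * only, 'perf trace -h' has the authoritative list):
         *
         *   perf trace -p 1234 --duration 1.5              # existing pid, events > 1.5ms
         *   perf trace -e open,close -s -- make            # syscall summary for a workload
         *   perf trace --no-syscalls -e sched:sched_switch # tracepoints only
         */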
2919         bool __maybe_unused max_stack_user_set = true;
2920         bool mmap_pages_user_set = true;
2921         const char * const trace_subcommands[] = { "record", NULL };
2922         int err;
2923         char bf[BUFSIZ];
2924
2925         signal(SIGSEGV, sighandler_dump_stack);
2926         signal(SIGFPE, sighandler_dump_stack);
2927
2928         trace.evlist = perf_evlist__new();
2929         trace.sctbl = syscalltbl__new();
2930
2931         if (trace.evlist == NULL || trace.sctbl == NULL) {
2932                 pr_err("Not enough memory to run!\n");
2933                 err = -ENOMEM;
2934                 goto out;
2935         }
2936
2937         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2938                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2939
2940         err = bpf__setup_stdout(trace.evlist);
2941         if (err) {
2942                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2943                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2944                 goto out;
2945         }
2946
2947         err = -1;
2948
2949         if (trace.trace_pgfaults) {
2950                 trace.opts.sample_address = true;
2951                 trace.opts.sample_time = true;
2952         }
2953
2954         if (trace.opts.mmap_pages == UINT_MAX)
2955                 mmap_pages_user_set = false;
2956
2957         if (trace.max_stack == UINT_MAX) {
2958                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2959                 max_stack_user_set = false;
2960         }
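        /*
         * Rationale (an assumption, not documented here): when replaying from a
         * perf.data file (-i) the recording machine's sysctl is unknown, so the
         * compile time PERF_MAX_STACK_DEPTH is used; live tracing defaults to
         * the current kernel.perf_event_max_stack value instead.
         */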
2961
2962 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2963         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2964                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2965 #endif
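        /*
         * Sketch of the intent above: if the user asked for stack related
         * options (--min-stack/--max-stack) without configuring --call-graph,
         * and syscalls are being traced, DWARF unwinding is picked as the
         * default call-graph mode, presumably because frame pointer unwinding
         * from syscall tracepoints often stops short in userspace.  Only
         * compiled in with HAVE_DWARF_UNWIND_SUPPORT.
         */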
2966
2967         if (callchain_param.enabled) {
2968                 if (!mmap_pages_user_set && geteuid() == 0)
2969                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2970
2971                 symbol_conf.use_callchain = true;
2972         }
2973
2974         if (trace.evlist->nr_entries > 0)
2975                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2976
2977         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2978                 return trace__record(&trace, argc-1, &argv[1]);
2979
2980         /* summary_only implies summary option, but don't overwrite summary if set */
2981         if (trace.summary_only)
2982                 trace.summary = trace.summary_only;
2983
2984         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2985             trace.evlist->nr_entries == 0 /* Was -e/--event used? */) {
2986                 pr_err("Please specify something to trace.\n");
2987                 return -1;
2988         }
2989
2990         if (!trace.trace_syscalls && trace.ev_qualifier) {
2991                 pr_err("The -e option can't be used with --no-syscalls.\n");
2992                 goto out;
2993         }
2994
2995         if (output_name != NULL) {
2996                 err = trace__open_output(&trace, output_name);
2997                 if (err < 0) {
2998                         perror("failed to create output file");
2999                         goto out;
3000                 }
3001         }
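        /*
         * With -o/--output the strace-like text goes to that file (rotated to
         * <file>.old first by trace__open_output()); otherwise trace.output
         * stays on stderr, as initialised in the struct above.
         */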
3002
3003         trace.open_id = syscalltbl__id(trace.sctbl, "open");
3004
3005         err = target__validate(&trace.opts.target);
3006         if (err) {
3007                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3008                 fprintf(trace.output, "%s", bf);
3009                 goto out_close;
3010         }
3011
3012         err = target__parse_uid(&trace.opts.target);
3013         if (err) {
3014                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3015                 fprintf(trace.output, "%s", bf);
3016                 goto out_close;
3017         }
3018
3019         if (!argc && target__none(&trace.opts.target))
3020                 trace.opts.target.system_wide = true;
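        /* No workload and no -p/-t/-u/-C target: default to system wide tracing. */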
3021
3022         if (input_name)
3023                 err = trace__replay(&trace);
3024         else
3025                 err = trace__run(&trace, argc, argv);
3026
3027 out_close:
3028         if (output_name != NULL)
3029                 fclose(trace.output);
3030 out:
3031         return err;
3032 }