/* kill dentry_update_name_case() — [sfrench/cifs-2.6.git] / tools/perf/builtin-trace.c */
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61
62 #include "sane_ctype.h"
63
/* Fallback definitions for libc headers too old to provide them. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif
71
/*
 * All the state for one 'perf trace' run: the perf_tool callbacks, the
 * syscall description table, the event list, per-run counters and the
 * knobs filled in from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;
		struct syscall	*table;	/* indexed by syscall id, see 'max' */
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;	/* the sys_enter/sys_exit tracepoints */
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names selected by the user */
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids; /* presumably ev_qualifier resolved to ids — verify at use site */
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier; /* presumably inverts ev_qualifier into a skip list — verify */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};
126
/*
 * Accessor for one tracepoint field: 'offset' indexes into the sample's
 * raw_data, and the union holds the reader matching the field's kind —
 * a fixed-width integer or a raw pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
134
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian fixed-width
 * integer out of the sample payload.  memcpy() is used so that fields
 * at unaligned offsets are safe to read.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
147
/*
 * Same as TP_UINT_FIELD(), but byte-swap the value — used when the data
 * was recorded with the opposite endianness to the host.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
159
160 static int tp_field__init_uint(struct tp_field *field,
161                                struct format_field *format_field,
162                                bool needs_swap)
163 {
164         field->offset = format_field->offset;
165
166         switch (format_field->size) {
167         case 1:
168                 field->integer = tp_field__u8;
169                 break;
170         case 2:
171                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
172                 break;
173         case 4:
174                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
175                 break;
176         case 8:
177                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
178                 break;
179         default:
180                 return -1;
181         }
182
183         return 0;
184 }
185
/* Return a pointer straight into the sample payload at the field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Point 'field' at a variable-length/raw tracepoint field.  Always succeeds. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
197
/*
 * Field accessors for a sys_enter/sys_exit tracepoint: the syscall id,
 * plus either the argument array (sys_enter) or the return value
 * (sys_exit) — hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
204
205 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
206                                           struct tp_field *field,
207                                           const char *name)
208 {
209         struct format_field *format_field = perf_evsel__field(evsel, name);
210
211         if (format_field == NULL)
212                 return -1;
213
214         return tp_field__init_uint(field, format_field, evsel->needs_swap);
215 }
216
/*
 * Initialize member 'name' of the evsel's private struct syscall_tp from
 * the tracepoint field of the same name (the #name stringification).
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
220
/*
 * Set up 'field' to hand back a raw pointer into the payload for the
 * evsel's tracepoint field 'name'.  Returns -1 when the field is absent.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt == NULL ? -1 : tp_field__init_ptr(field, fmt);
}
232
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
236
/* Free the evsel's private data (the syscall_tp) before deleting the evsel. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
242
243 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
244 {
245         evsel->priv = malloc(sizeof(struct syscall_tp));
246         if (evsel->priv != NULL) {
247                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
248                         goto out_delete;
249
250                 evsel->handler = handler;
251                 return 0;
252         }
253
254         return -ENOMEM;
255
256 out_delete:
257         zfree(&evsel->priv);
258         return -ENOENT;
259 }
260
/*
 * Create an evsel for the raw_syscalls:sys_{enter,exit} tracepoint named
 * by 'direction', falling back to the older "syscalls" subsystem name,
 * then attach 'handler' and the syscall_tp field accessors.
 *
 * Returns NULL if neither tracepoint exists or initialization fails.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
281
/* Read tracepoint field 'name' of this sample as an unsigned integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Get a raw pointer to tracepoint field 'name' inside this sample's payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
289
290 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
291 {
292         int idx = val - sa->offset;
293
294         if (idx < 0 || idx >= sa->nr_entries)
295                 return scnprintf(bf, size, intfmt, val);
296
297         return scnprintf(bf, size, "%s", sa->entries[idx]);
298 }
299
/* Print arg->val via the strarray stashed in arg->parm, with custom int fallback. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}
306
/* Common case: unknown values are printed as plain signed decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
314
/*
 * A set of strarrays searched in order, for arguments whose value space
 * is split across several tables (e.g. fcntl's base + Linux-specific cmds).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
324
325 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
326                                         struct syscall_arg *arg)
327 {
328         struct strarrays *sas = arg->parm;
329         int i;
330
331         for (i = 0; i < sas->nr_entries; ++i) {
332                 struct strarray *sa = sas->entries[i];
333                 int idx = arg->val - sa->offset;
334
335                 if (idx >= 0 && idx < sa->nr_entries) {
336                         if (sa->entries[idx] == NULL)
337                                 break;
338                         return scnprintf(bf, size, "%s", sa->entries[idx]);
339                 }
340         }
341
342         return scnprintf(bf, size, "%d", arg->val);
343 }
344
#ifndef AT_FDCWD
#define AT_FDCWD        -100
#endif

/*
 * Print a dirfd argument of the *at() family: the special AT_FDCWD value
 * is shown as "CWD", anything else as a regular file descriptor.
 */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at
361
/* Forward declaration; defined later in the file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

/* Print an argument in hexadecimal ("0x..." form). */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/*
 * Print an argument as a signed int.
 * NOTE(review): arg->val is handed to "%d" — assumes the value fits in an
 * int; confirm against struct syscall_arg's 'val' type in beauty.h.
 */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

/* Print an argument as a signed long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
381
/* bpf(2) 'cmd' values, in BPF_* numbering order. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops; the offset 1 matches EPOLL_CTL_ADD being 1, not 0. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' values. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) 'option' values. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) 'whence'; DATA/HOLE only when the build host's headers have them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) base commands, starting at 0. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/*
 * Linux-specific fcntl(2) commands, numbered from F_LINUX_SPECIFIC_BASE.
 * The [5] designator leaves NULL holes for the unused command numbers.
 */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, searched in order by syscall_arg__scnprintf_strarrays(). */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

/* {get,set}rlimit(2)/prlimit64(2) resource names. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' values. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) clock ids. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) address families, in AF_* numbering order. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
462
/*
 * Print an access(2) mode symbolically: F for F_OK (0), otherwise a
 * concatenation of R/W/X for the *_OK bits, with any leftover bits
 * appended in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
489
/* Forward declaration; defined later in the file. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

/*
 * Print pipe2(2) flags as a '|'-separated list of O_* names known to be
 * valid here (CLOEXEC, NONBLOCK), with any leftover bits in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
517
/* Fallbacks for libc headers predating getrandom(2); values from the UAPI. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Print getrandom(2) flags as a '|'-separated list of GRND_* names,
 * with any leftover bits in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
547
/* Initializer for a syscall_arg_fmt entry that prints via a strarray. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
551
552 #include "trace/beauty/arch_errno_names.c"
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/futex_op.c"
555 #include "trace/beauty/futex_val3.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567
/*
 * How to pretty-print one syscall argument: the formatter, its optional
 * parameter (e.g. a strarray), an override name, and whether to print
 * the argument even when its value is zero.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
574
575 static struct syscall_fmt {
576         const char *name;
577         const char *alias;
578         struct syscall_arg_fmt arg[6];
579         u8         nr_args;
580         bool       errpid;
581         bool       timeout;
582         bool       hexret;
583 } syscall_fmts[] = {
584         { .name     = "access",
585           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
586         { .name     = "bpf",
587           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
588         { .name     = "brk",        .hexret = true,
589           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
590         { .name     = "clock_gettime",
591           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
592         { .name     = "clone",      .errpid = true, .nr_args = 5,
593           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
594                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
595                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
596                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
597                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
598         { .name     = "close",
599           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
600         { .name     = "epoll_ctl",
601           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
602         { .name     = "eventfd2",
603           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
604         { .name     = "fchmodat",
605           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
606         { .name     = "fchownat",
607           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
608         { .name     = "fcntl",
609           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
610                            .parm      = &strarrays__fcntl_cmds_arrays,
611                            .show_zero = true, },
612                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
613         { .name     = "flock",
614           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
615         { .name     = "fstat", .alias = "newfstat", },
616         { .name     = "fstatat", .alias = "newfstatat", },
617         { .name     = "futex",
618           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
619                    [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
620         { .name     = "futimesat",
621           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
622         { .name     = "getitimer",
623           .arg = { [0] = STRARRAY(which, itimers), }, },
624         { .name     = "getpid",     .errpid = true, },
625         { .name     = "getpgid",    .errpid = true, },
626         { .name     = "getppid",    .errpid = true, },
627         { .name     = "getrandom",
628           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
629         { .name     = "getrlimit",
630           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
631         { .name     = "gettid",     .errpid = true, },
632         { .name     = "ioctl",
633           .arg = {
634 #if defined(__i386__) || defined(__x86_64__)
635 /*
636  * FIXME: Make this available to all arches.
637  */
638                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
639                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
640 #else
641                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
642 #endif
643         { .name     = "kcmp",       .nr_args = 5,
644           .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
645                    [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
646                    [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
647                    [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
648                    [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
649         { .name     = "keyctl",
650           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
651         { .name     = "kill",
652           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
653         { .name     = "linkat",
654           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
655         { .name     = "lseek",
656           .arg = { [2] = STRARRAY(whence, whences), }, },
657         { .name     = "lstat", .alias = "newlstat", },
658         { .name     = "madvise",
659           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
660                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
661         { .name     = "mkdirat",
662           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
663         { .name     = "mknodat",
664           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
665         { .name     = "mlock",
666           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
667         { .name     = "mlockall",
668           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
669         { .name     = "mmap",       .hexret = true,
670 /* The standard mmap maps to old_mmap on s390x */
671 #if defined(__s390x__)
672         .alias = "old_mmap",
673 #endif
674           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
675                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
676                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
677         { .name     = "mprotect",
678           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
679                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
680         { .name     = "mq_unlink",
681           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
682         { .name     = "mremap",     .hexret = true,
683           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
684                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
685                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
686         { .name     = "munlock",
687           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
688         { .name     = "munmap",
689           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
690         { .name     = "name_to_handle_at",
691           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
692         { .name     = "newfstatat",
693           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
694         { .name     = "open",
695           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696         { .name     = "open_by_handle_at",
697           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
698                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699         { .name     = "openat",
700           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
701                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
702         { .name     = "perf_event_open",
703           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
704                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
705                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
706         { .name     = "pipe2",
707           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
708         { .name     = "pkey_alloc",
709           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
710         { .name     = "pkey_free",
711           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
712         { .name     = "pkey_mprotect",
713           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
714                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
715                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
716         { .name     = "poll", .timeout = true, },
717         { .name     = "ppoll", .timeout = true, },
718         { .name     = "prctl", .alias = "arch_prctl",
719           .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
720                    [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
721                    [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
722         { .name     = "pread", .alias = "pread64", },
723         { .name     = "preadv", .alias = "pread", },
724         { .name     = "prlimit64",
725           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
726         { .name     = "pwrite", .alias = "pwrite64", },
727         { .name     = "readlinkat",
728           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
729         { .name     = "recvfrom",
730           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
731         { .name     = "recvmmsg",
732           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
733         { .name     = "recvmsg",
734           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
735         { .name     = "renameat",
736           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
737         { .name     = "rt_sigaction",
738           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
739         { .name     = "rt_sigprocmask",
740           .arg = { [0] = STRARRAY(how, sighow), }, },
741         { .name     = "rt_sigqueueinfo",
742           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
743         { .name     = "rt_tgsigqueueinfo",
744           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
745         { .name     = "sched_setscheduler",
746           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
747         { .name     = "seccomp",
748           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
749                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
750         { .name     = "select", .timeout = true, },
751         { .name     = "sendmmsg",
752           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
753         { .name     = "sendmsg",
754           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
755         { .name     = "sendto",
756           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
757         { .name     = "set_tid_address", .errpid = true, },
758         { .name     = "setitimer",
759           .arg = { [0] = STRARRAY(which, itimers), }, },
760         { .name     = "setrlimit",
761           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
762         { .name     = "socket",
763           .arg = { [0] = STRARRAY(family, socket_families),
764                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
765         { .name     = "socketpair",
766           .arg = { [0] = STRARRAY(family, socket_families),
767                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
768         { .name     = "stat", .alias = "newstat", },
769         { .name     = "statx",
770           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
771                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
772                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
773         { .name     = "swapoff",
774           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
775         { .name     = "swapon",
776           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
777         { .name     = "symlinkat",
778           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
779         { .name     = "tgkill",
780           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
781         { .name     = "tkill",
782           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
783         { .name     = "uname", .alias = "newuname", },
784         { .name     = "unlinkat",
785           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
786         { .name     = "utimensat",
787           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
788         { .name     = "wait4",      .errpid = true,
789           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
790         { .name     = "waitid",     .errpid = true,
791           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
792 };
793
794 static int syscall_fmt__cmp(const void *name, const void *fmtp)
795 {
796         const struct syscall_fmt *fmt = fmtp;
797         return strcmp(name, fmt->name);
798 }
799
800 static struct syscall_fmt *syscall_fmt__find(const char *name)
801 {
802         const int nmemb = ARRAY_SIZE(syscall_fmts);
803         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
804 }
805
/*
 * Everything needed to trace one syscall: its tracepoint format, argument
 * descriptions and the pretty-printers chosen for them.
 */
struct syscall {
	struct event_format *tp_format;	/* syscalls:sys_enter_NAME format */
	int                 nr_args;	/* arg count (minus the leading nr field) */
	struct format_field *args;	/* tracepoint fields describing the args */
	const char          *name;	/* name from the syscall table */
	bool                is_exit;	/* true for "exit" and "exit_group" */
	struct syscall_fmt  *fmt;	/* hand-written syscall_fmts[] entry, if any */
	struct syscall_arg_fmt *arg_fmt; /* per-argument formatter state */
};
815
816 /*
817  * We need to have this 'calculated' boolean because in some cases we really
818  * don't know what is the duration of a syscall, for instance, when we start
819  * a session and some threads are waiting for a syscall to finish, say 'poll',
820  * in which case all we can do is to print "( ? ) for duration and for the
821  * start timestamp.
822  */
823 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
824 {
825         double duration = (double)t / NSEC_PER_MSEC;
826         size_t printed = fprintf(fp, "(");
827
828         if (!calculated)
829                 printed += fprintf(fp, "         ");
830         else if (duration >= 1.0)
831                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
832         else if (duration >= 0.01)
833                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
834         else
835                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
836         return printed + fprintf(fp, "): ");
837 }
838
839 /**
840  * filename.ptr: The filename char pointer that will be vfs_getname'd
841  * filename.entry_str_pos: Where to insert the string translated from
842  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
843  * ret_scnprintf: syscall args may set this to a different syscall return
844  *                formatter, for instance, fcntl may return fds, file flags, etc.
845  */
struct thread_trace {
	u64               entry_time;	/* timestamp of the last sys_enter */
	bool              entry_pending;	/* sys_enter formatted, awaiting sys_exit */
	unsigned long     nr_events;	/* events seen on this thread */
	unsigned long     pfmaj, pfmin;	/* page fault counts; cf. TRACE_PFMAJ/TRACE_PFMIN */
	char              *entry_str;	/* buffer for assembling the sys_enter line */
	double            runtime_ms;	/* accumulated runtime — presumably from sched events, confirm */
	size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;	/* see the comment block above this struct */
		short int     entry_str_pos;	/* where to splice the resolved name */
		bool          pending_open;
		unsigned int  namelen;
		char          *name;
	} filename;
	struct {
		int       max;	/* highest cached fd, -1 while table is empty */
		char      **table;	/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id struct stats (lazily allocated) */
};
868
869 static struct thread_trace *thread_trace__new(void)
870 {
871         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
872
873         if (ttrace)
874                 ttrace->paths.max = -1;
875
876         ttrace->syscall_stats = intlist__new(NULL);
877
878         return ttrace;
879 }
880
881 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
882 {
883         struct thread_trace *ttrace;
884
885         if (thread == NULL)
886                 goto fail;
887
888         if (thread__priv(thread) == NULL)
889                 thread__set_priv(thread, thread_trace__new());
890
891         if (thread__priv(thread) == NULL)
892                 goto fail;
893
894         ttrace = thread__priv(thread);
895         ++ttrace->nr_events;
896
897         return ttrace;
898 fail:
899         color_fprintf(fp, PERF_COLOR_RED,
900                       "WARNING: not enough memory, dropping samples!\n");
901         return NULL;
902 }
903
904
/*
 * Let an argument formatter (e.g. fcntl's 'cmd') override how this
 * syscall's return value will be formatted on sys_exit; the pointer is
 * stored in the per-thread state and reset for each new sys_enter.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
                                    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
        struct thread_trace *ttrace = thread__priv(arg->thread);

        ttrace->ret_scnprintf = ret_scnprintf;
}
912
/* Page fault tracing bits — presumably major/minor faults; confirm at the users */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size of the per-thread buffer used to assemble the sys_enter line */
static const size_t trace__entry_str_size = 2048;
917
/*
 * Cache 'pathname' for descriptor 'fd' in this thread's fd->path table,
 * growing the table as needed.  Returns 0 on success, -1 on allocation
 * failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
        struct thread_trace *ttrace = thread__priv(thread);

        if (fd > ttrace->paths.max) {
                /* Grow the table so it holds entries [0..fd] */
                char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

                if (npath == NULL)
                        return -1;

                if (ttrace->paths.max != -1) {
                        /* Zero only the newly added slots */
                        memset(npath + ttrace->paths.max + 1, 0,
                               (fd - ttrace->paths.max) * sizeof(char *));
                } else {
                        /* First allocation: zero the whole table */
                        memset(npath, 0, (fd + 1) * sizeof(char *));
                }

                ttrace->paths.table = npath;
                ttrace->paths.max   = fd;
        }

        ttrace->paths.table[fd] = strdup(pathname);

        return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
943
/*
 * Resolve the path behind 'fd' by reading the /proc/<pid>/fd/<fd> (or the
 * per-task variant for non-leader threads) symlink, then cache it in the
 * thread's fd->path table.  Returns 0 on success, -1 on failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
        char linkname[PATH_MAX], pathname[PATH_MAX];
        struct stat st;
        int ret;

        if (thread->pid_ == thread->tid) {
                /* Thread group leader: the top level fd directory suffices */
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/fd/%d", thread->pid_, fd);
        } else {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
        }

        /* lstat() the symlink itself to learn the target length up front */
        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
                return -1;

        ret = readlink(linkname, pathname, sizeof(pathname));

        /* The link may have changed between lstat() and readlink() */
        if (ret < 0 || ret > st.st_size)
                return -1;

        pathname[ret] = '\0';
        return trace__set_fd_pathname(thread, fd, pathname);
}
969
970 static const char *thread__fd_path(struct thread *thread, int fd,
971                                    struct trace *trace)
972 {
973         struct thread_trace *ttrace = thread__priv(thread);
974
975         if (ttrace == NULL)
976                 return NULL;
977
978         if (fd < 0)
979                 return NULL;
980
981         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
982                 if (!trace->live)
983                         return NULL;
984                 ++trace->stats.proc_getname;
985                 if (thread__read_fd_path(thread, fd))
986                         return NULL;
987         }
988
989         return ttrace->paths.table[fd];
990 }
991
992 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
993 {
994         int fd = arg->val;
995         size_t printed = scnprintf(bf, size, "%d", fd);
996         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
997
998         if (path)
999                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1000
1001         return printed;
1002 }
1003
1004 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1005 {
1006         size_t printed = scnprintf(bf, size, "%d", fd);
1007         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1008
1009         if (thread) {
1010                 const char *path = thread__fd_path(thread, fd, trace);
1011
1012                 if (path)
1013                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1014
1015                 thread__put(thread);
1016         }
1017
1018         return printed;
1019 }
1020
/*
 * Format a close() fd argument, then drop the cached fd->path entry:
 * after close() the descriptor number may be reused for another file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        int fd = arg->val;
        size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
        struct thread_trace *ttrace = thread__priv(arg->thread);

        if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
                zfree(&ttrace->paths.table[fd]);

        return printed;
}
1033
/*
 * Remember the filename pointer argument and where in the entry string
 * its resolved name should later be spliced in by the vfs_getname
 * tracepoint handler.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
                                     unsigned long ptr)
{
        struct thread_trace *ttrace = thread__priv(thread);

        ttrace->filename.ptr = ptr;
        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1042
1043 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1044                                               struct syscall_arg *arg)
1045 {
1046         unsigned long ptr = arg->val;
1047
1048         if (!arg->trace->vfs_getname)
1049                 return scnprintf(bf, size, "%#x", ptr);
1050
1051         thread__set_filename_pos(arg->thread, bf, ptr);
1052         return 0;
1053 }
1054
1055 static bool trace__filter_duration(struct trace *trace, double t)
1056 {
1057         return t < (trace->duration_filter * NSEC_PER_MSEC);
1058 }
1059
1060 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061 {
1062         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1063
1064         return fprintf(fp, "%10.3f ", ts);
1065 }
1066
1067 /*
1068  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1069  * using ttrace->entry_time for a thread that receives a sys_exit without
1070  * first having received a sys_enter ("poll" issued before tracing session
1071  * starts, lost sys_enter exit due to ring buffer overflow).
1072  */
1073 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1074 {
1075         if (tstamp > 0)
1076                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1077
1078         return fprintf(fp, "         ? ");
1079 }
1080
/* Main-loop termination flags, set from the signal handler below. */
static bool done = false;
static bool interrupted = false;

/*
 * Ask the main loop to stop; 'interrupted' distinguishes SIGINT (user
 * pressed Ctrl-C) from other termination signals.
 * NOTE(review): these are plain bools, not volatile sig_atomic_t — works
 * with the polling loop here, but worth confirming/tightening.
 */
static void sig_handler(int sig)
{
        done = true;
        interrupted = sig == SIGINT;
}
1089
1090 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1091                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1092 {
1093         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1094         printed += fprintf_duration(duration, duration_calculated, fp);
1095
1096         if (trace->multiple_threads) {
1097                 if (trace->show_comm)
1098                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1099                 printed += fprintf(fp, "%d ", thread->tid);
1100         }
1101
1102         return printed;
1103 }
1104
1105 static int trace__process_event(struct trace *trace, struct machine *machine,
1106                                 union perf_event *event, struct perf_sample *sample)
1107 {
1108         int ret = 0;
1109
1110         switch (event->header.type) {
1111         case PERF_RECORD_LOST:
1112                 color_fprintf(trace->output, PERF_COLOR_RED,
1113                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1114                 ret = machine__process_lost_event(machine, event, sample);
1115                 break;
1116         default:
1117                 ret = machine__process_event(machine, event, sample);
1118                 break;
1119         }
1120
1121         return ret;
1122 }
1123
1124 static int trace__tool_process(struct perf_tool *tool,
1125                                union perf_event *event,
1126                                struct perf_sample *sample,
1127                                struct machine *machine)
1128 {
1129         struct trace *trace = container_of(tool, struct trace, tool);
1130         return trace__process_event(trace, machine, event, sample);
1131 }
1132
/*
 * Wrapper around machine__resolve_kernel_addr() that warns once, then
 * bails out, when kptr_restrict hides kernel addresses — kernel samples
 * can't be resolved in that case.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
        struct machine *machine = vmachine;

        /* Only warn once per machine */
        if (machine->kptr_restrict_warned)
                return NULL;

        if (symbol_conf.kptr_restrict) {
                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
                           "Kernel samples will not be resolved.\n");
                machine->kptr_restrict_warned = true;
                return NULL;
        }

        return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1150
/*
 * Initialize symbol resolution and the host machine representation, then
 * synthesize events for already-running threads so their samples can be
 * resolved.  Returns 0 on success or a negative error code, undoing the
 * symbol setup on failure.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
        int err = symbol__init(NULL);

        if (err)
                return err;

        trace->host = machine__new_host();
        if (trace->host == NULL)
                return -ENOMEM;

        err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
        if (err < 0)
                goto out;

        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
                                            evlist->threads, trace__tool_process, false,
                                            trace->opts.proc_map_timeout, 1);
out:
        if (err)
                symbol__exit();

        return err;
}
1175
/* Undo trace__symbols_init(): release the host machine and symbol state. */
static void trace__symbols__exit(struct trace *trace)
{
        machine__exit(trace->host);
        trace->host = NULL;

        symbol__exit();
}
1183
1184 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1185 {
1186         int idx;
1187
1188         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1189                 nr_args = sc->fmt->nr_args;
1190
1191         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1192         if (sc->arg_fmt == NULL)
1193                 return -1;
1194
1195         for (idx = 0; idx < nr_args; ++idx) {
1196                 if (sc->fmt)
1197                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1198         }
1199
1200         sc->nr_args = nr_args;
1201         return 0;
1202 }
1203
/*
 * Choose a default pretty-printer for every syscall argument that doesn't
 * already have one from syscall_fmts[], based on heuristics over the
 * tracepoint field type and name.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
        struct format_field *field;
        int idx = 0, len;

        for (field = sc->args; field; field = field->next, ++idx) {
                /* A hand-picked formatter from syscall_fmts[] wins */
                if (sc->fmt && sc->fmt->arg[idx].scnprintf)
                        continue;

                if (strcmp(field->type, "const char *") == 0 &&
                         (strcmp(field->name, "filename") == 0 ||
                          strcmp(field->name, "path") == 0 ||
                          strcmp(field->name, "pathname") == 0))
                        sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
                else if (field->flags & FIELD_IS_POINTER)
                        sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_fmt[idx].scnprintf = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
                else if ((strcmp(field->type, "int") == 0 ||
                          strcmp(field->type, "unsigned int") == 0 ||
                          strcmp(field->type, "long") == 0) &&
                         (len = strlen(field->name)) >= 2 &&
                         strcmp(field->name + len - 2, "fd") == 0) {
                        /*
                         * /sys/kernel/tracing/events/syscalls/sys_enter*
                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
                         * 65 int
                         * 23 unsigned int
                         * 7 unsigned long
                         */
                        sc->arg_fmt[idx].scnprintf = SCA_FD;
                }
        }

        return 0;
}
1242
/*
 * Fill in trace->syscalls.table[id]: syscall name, hand-written formatter
 * entry, tracepoint format and per-argument formatters, growing the table
 * when 'id' is beyond its current size.  Returns 0 on success, -1 on
 * failure (unknown id, OOM or unreadable tracepoint format).
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
        char tp_name[128];
        struct syscall *sc;
        const char *name = syscalltbl__name(trace->sctbl, id);

        if (name == NULL)
                return -1;

        if (id > trace->syscalls.max) {
                /* Grow the table so it holds entries [0..id] */
                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

                if (nsyscalls == NULL)
                        return -1;

                if (trace->syscalls.max != -1) {
                        /* Zero only the newly added slots */
                        memset(nsyscalls + trace->syscalls.max + 1, 0,
                               (id - trace->syscalls.max) * sizeof(*sc));
                } else {
                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
                }

                trace->syscalls.table = nsyscalls;
                trace->syscalls.max   = id;
        }

        sc = trace->syscalls.table + id;
        sc->name = name;

        sc->fmt  = syscall_fmt__find(sc->name);

        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
        sc->tp_format = trace_event__tp_format("syscalls", tp_name);

        /* Retry under the table's alias, e.g. "stat" -> "newstat" */
        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
        }

        if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
                return -1;

        if (IS_ERR(sc->tp_format))
                return -1;

        sc->args = sc->tp_format->format.fields;
        /*
         * The first field, '__syscall_nr' or 'nr', carries the syscall
         * number, which we already know, so drop it.  Older kernels don't
         * have it at all, hence the check.
         */
        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                sc->args = sc->args->next;
                --sc->nr_args;
        }

        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

        return syscall__set_arg_fmts(sc);
}
1303
/*
 * Translate the -e syscall name list in trace->ev_qualifier — entries may
 * be plain names or globs — into the syscall id array
 * trace->ev_qualifier_ids.  All invalid names are reported before
 * failing.  Returns 0 on success or a negative error code.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
        int err = 0, i;
        size_t nr_allocated;
        struct str_node *pos;

        /* Start with one slot per name; globs may force growth below */
        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
                                                 sizeof(trace->ev_qualifier_ids.entries[0]));

        if (trace->ev_qualifier_ids.entries == NULL) {
                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
                       trace->output);
                err = -EINVAL;
                goto out;
        }

        nr_allocated = trace->ev_qualifier_ids.nr;
        i = 0;

        strlist__for_each_entry(pos, trace->ev_qualifier) {
                const char *sc = pos->s;
                int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

                if (id < 0) {
                        /* Not an exact name: try it as a glob */
                        id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
                        if (id >= 0)
                                goto matches;

                        /* Accumulate all invalid names into one message */
                        if (err == 0) {
                                fputs("Error:\tInvalid syscall ", trace->output);
                                err = -EINVAL;
                        } else {
                                fputs(", ", trace->output);
                        }

                        fputs(sc, trace->output);
                }
matches:
                trace->ev_qualifier_ids.entries[i++] = id;
                if (match_next == -1)
                        continue;

                /* A glob may match several syscalls; collect the rest */
                while (1) {
                        id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
                        if (id < 0)
                                break;
                        if (nr_allocated == trace->ev_qualifier_ids.nr) {
                                void *entries;

                                nr_allocated += 8;
                                entries = realloc(trace->ev_qualifier_ids.entries,
                                                  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
                                if (entries == NULL) {
                                        err = -ENOMEM;
                                        fputs("\nError:\t Not enough memory for parsing\n", trace->output);
                                        goto out_free;
                                }
                                trace->ev_qualifier_ids.entries = entries;
                        }
                        trace->ev_qualifier_ids.nr++;
                        trace->ev_qualifier_ids.entries[i++] = id;
                }
        }

        if (err < 0) {
                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
                      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
                zfree(&trace->ev_qualifier_ids.entries);
                trace->ev_qualifier_ids.nr = 0;
        }
out:
        return err;
}
1379
1380 /*
1381  * args is to be interpreted as a series of longs but we need to handle
1382  * 8-byte unaligned accesses. args points to raw_data within the event
1383  * and raw_data is guaranteed to be 8-byte unaligned because it is
1384  * preceded by raw_size which is a u32. So we need to copy args to a temp
1385  * variable to read it. Most notably this avoids extended load instructions
1386  * on unaligned addresses
1387  */
1388 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1389 {
1390         unsigned long val;
1391         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1392
1393         memcpy(&val, p, sizeof(val));
1394         return val;
1395 }
1396
1397 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1398                                       struct syscall_arg *arg)
1399 {
1400         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1401                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1402
1403         return scnprintf(bf, size, "arg%d: ", arg->idx);
1404 }
1405
/*
 * Print one syscall argument value, using the argument's registered
 * formatter when there is one, otherwise as a plain decimal.
 */
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
                                     struct syscall_arg *arg, unsigned long val)
{
        if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
                arg->val = val;
                /* Some formatters take a parameter, e.g. a strarray */
                if (sc->arg_fmt[arg->idx].parm)
                        arg->parm = sc->arg_fmt[arg->idx].parm;
                return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
        }
        /* NOTE(review): val is unsigned long but printed signed with %ld */
        return scnprintf(bf, size, "%ld", val);
}
1417
/*
 * Format all of a syscall's arguments into 'bf'.  When the tracepoint
 * format was read, walk its field list; otherwise fall back to printing
 * the raw "argN: val" pairs.  Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                                      unsigned char *args, struct trace *trace,
                                      struct thread *thread)
{
        size_t printed = 0;
        unsigned long val;
        u8 bit = 1;	/* tracks arg.mask, one bit per argument index */
        struct syscall_arg arg = {
                .args   = args,
                .idx    = 0,
                .mask   = 0,
                .trace  = trace,
                .thread = thread,
        };
        struct thread_trace *ttrace = thread__priv(thread);

        /*
         * Things like fcntl will set this in its 'cmd' formatter to pick the
         * right formatter for the return value (an fd? file flags?), which is
         * not needed for syscalls that always return a given type, say an fd.
         */
        ttrace->ret_scnprintf = NULL;

        if (sc->args != NULL) {
                struct format_field *field;

                for (field = sc->args; field;
                     field = field->next, ++arg.idx, bit <<= 1) {
                        /* An earlier formatter may have consumed this arg */
                        if (arg.mask & bit)
                                continue;

                        val = syscall_arg__val(&arg, arg.idx);

                        /*
                         * Suppress this argument if its value is zero and
                         * we don't have a string associated in a strarray
                         * for it.
                         */
                        if (val == 0 &&
                            !(sc->arg_fmt &&
                              (sc->arg_fmt[arg.idx].show_zero ||
                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
                              sc->arg_fmt[arg.idx].parm))
                                continue;

                        printed += scnprintf(bf + printed, size - printed,
                                             "%s%s: ", printed ? ", " : "", field->name);
                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
                }
        } else if (IS_ERR(sc->tp_format)) {
                /*
                 * If we managed to read the tracepoint /format file, then we
                 * may end up not having any args, like with gettid(), so only
                 * print the raw args when we didn't manage to read it.
                 */
                while (arg.idx < sc->nr_args) {
                        if (arg.mask & bit)
                                goto next_arg;
                        val = syscall_arg__val(&arg, arg.idx);
                        if (printed)
                                printed += scnprintf(bf + printed, size - printed, ", ");
                        printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
                        ++arg.idx;
                        bit <<= 1;
                }
        }

        return printed;
}
1490
1491 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1492                                   union perf_event *event,
1493                                   struct perf_sample *sample);
1494
/*
 * Return the struct syscall for 'id', lazily reading its tracepoint
 * format and formatters on first sight, or NULL when the id is invalid
 * or its information can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
                                           struct perf_evsel *evsel, int id)
{

        if (id < 0) {

                /*
                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
                 * before that, leaving at a higher verbosity level till that is
                 * explained. Reproduced with plain ftrace with:
                 *
                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
                 * grep "NR -1 " /t/trace_pipe
                 *
                 * After generating some load on the machine.
                 */
                if (verbose > 1) {
                        static u64 n;
                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
                                id, perf_evsel__name(evsel), ++n);
                }
                return NULL;
        }

        /* First sight of this id: read its info into the table */
        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
            trace__read_syscall_info(trace, id))
                goto out_cant_read;

        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
                goto out_cant_read;

        return &trace->syscalls.table[id];

out_cant_read:
        if (verbose > 0) {
                fprintf(trace->output, "Problems reading syscall %d", id);
                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
                fputs(" information\n", trace->output);
        }
        return NULL;
}
1537
1538 static void thread__update_stats(struct thread_trace *ttrace,
1539                                  int id, struct perf_sample *sample)
1540 {
1541         struct int_node *inode;
1542         struct stats *stats;
1543         u64 duration = 0;
1544
1545         inode = intlist__findnew(ttrace->syscall_stats, id);
1546         if (inode == NULL)
1547                 return;
1548
1549         stats = inode->priv;
1550         if (stats == NULL) {
1551                 stats = malloc(sizeof(struct stats));
1552                 if (stats == NULL)
1553                         return;
1554                 init_stats(stats);
1555                 inode->priv = stats;
1556         }
1557
1558         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1559                 duration = sample->time - ttrace->entry_time;
1560
1561         update_stats(stats, duration);
1562 }
1563
1564 static int trace__printf_interrupted_entry(struct trace *trace)
1565 {
1566         struct thread_trace *ttrace;
1567         size_t printed;
1568
1569         if (trace->failure_only || trace->current == NULL)
1570                 return 0;
1571
1572         ttrace = thread__priv(trace->current);
1573
1574         if (!ttrace->entry_pending)
1575                 return 0;
1576
1577         printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1578         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1579         ttrace->entry_pending = false;
1580
1581         return printed;
1582 }
1583
1584 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1585                                  struct perf_sample *sample, struct thread *thread)
1586 {
1587         int printed = 0;
1588
1589         if (trace->print_sample) {
1590                 double ts = (double)sample->time / NSEC_PER_MSEC;
1591
1592                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1593                                    perf_evsel__name(evsel), ts,
1594                                    thread__comm_str(thread),
1595                                    sample->pid, sample->tid, sample->cpu);
1596         }
1597
1598         return printed;
1599 }
1600
/*
 * Handler for the raw_syscalls:sys_enter tracepoint: format the syscall
 * name and arguments into the per-thread entry buffer.  For most
 * syscalls the line is left pending so the return value can be appended
 * when the matching sys_exit arrives; syscalls flagged is_exit (which
 * never return) are printed immediately.
 *
 * Returns 0 on success, -1 when the syscall table entry or per-thread
 * state could not be set up.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer where the entry line is composed. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/*
	 * Flush any interrupted entry of the previous thread, but not when
	 * filtering/summarizing, where that partial line would be noise.
	 */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come; print the whole line right now. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the most recently seen thread for interrupted-entry output. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1661
1662 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1663                                     struct perf_sample *sample,
1664                                     struct callchain_cursor *cursor)
1665 {
1666         struct addr_location al;
1667         int max_stack = evsel->attr.sample_max_stack ?
1668                         evsel->attr.sample_max_stack :
1669                         trace->max_stack;
1670
1671         if (machine__resolve(trace->host, &al, sample) < 0 ||
1672             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1673                 return -1;
1674
1675         return 0;
1676 }
1677
1678 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1679 {
1680         /* TODO: user-configurable print_opts */
1681         const unsigned int print_opts = EVSEL__PRINT_SYM |
1682                                         EVSEL__PRINT_DSO |
1683                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1684
1685         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1686 }
1687
/*
 * Map an errno value to its symbolic name ("ENOENT", ...) using the
 * architecture recorded in the session's perf environment, so a data
 * file from another arch still decodes correctly.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	return arch_syscalls__strerrno(perf_env__arch(perf_evsel__env(evsel)), err);
}
1695
/*
 * Handler for the raw_syscalls:sys_exit tracepoint: complete the pending
 * entry line (or print a "... continued" stub if the entry was lost or
 * interleaved), format the return value according to the syscall's
 * format descriptor and optionally print the callchain.
 *
 * Note the control flow: the 'signed_print' and 'errno_print' labels are
 * jumped into from other branches of the if/else ladder, so statement
 * order here is load-bearing - treat with care.
 *
 * Returns 0 on success, -1 when syscall/thread state couldn't be set up.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * A successful open-like syscall with a pathname captured by the
	 * vfs_getname probe: remember fd -> path for later beautification.
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The matching sys_enter output was interrupted/lost. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		/* Negative return: decode as -1 + errno name + strerror text. */
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot return-value beautifier armed by the entry handler. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (fork/wait-style): show its comm too. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1811
1812 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1813                               union perf_event *event __maybe_unused,
1814                               struct perf_sample *sample)
1815 {
1816         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1817         struct thread_trace *ttrace;
1818         size_t filename_len, entry_str_len, to_move;
1819         ssize_t remaining_space;
1820         char *pos;
1821         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1822
1823         if (!thread)
1824                 goto out;
1825
1826         ttrace = thread__priv(thread);
1827         if (!ttrace)
1828                 goto out_put;
1829
1830         filename_len = strlen(filename);
1831         if (filename_len == 0)
1832                 goto out_put;
1833
1834         if (ttrace->filename.namelen < filename_len) {
1835                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1836
1837                 if (f == NULL)
1838                         goto out_put;
1839
1840                 ttrace->filename.namelen = filename_len;
1841                 ttrace->filename.name = f;
1842         }
1843
1844         strcpy(ttrace->filename.name, filename);
1845         ttrace->filename.pending_open = true;
1846
1847         if (!ttrace->filename.ptr)
1848                 goto out_put;
1849
1850         entry_str_len = strlen(ttrace->entry_str);
1851         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1852         if (remaining_space <= 0)
1853                 goto out_put;
1854
1855         if (filename_len > (size_t)remaining_space) {
1856                 filename += filename_len - remaining_space;
1857                 filename_len = remaining_space;
1858         }
1859
1860         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1861         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1862         memmove(pos + filename_len, pos, to_move);
1863         memcpy(pos, filename, filename_len);
1864
1865         ttrace->filename.ptr = 0;
1866         ttrace->filename.entry_str_pos = 0;
1867 out_put:
1868         thread__put(thread);
1869 out:
1870         return 0;
1871 }
1872
1873 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1874                                      union perf_event *event __maybe_unused,
1875                                      struct perf_sample *sample)
1876 {
1877         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1878         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1879         struct thread *thread = machine__findnew_thread(trace->host,
1880                                                         sample->pid,
1881                                                         sample->tid);
1882         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1883
1884         if (ttrace == NULL)
1885                 goto out_dump;
1886
1887         ttrace->runtime_ms += runtime_ms;
1888         trace->runtime_ms += runtime_ms;
1889 out_put:
1890         thread__put(thread);
1891         return 0;
1892
1893 out_dump:
1894         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1895                evsel->name,
1896                perf_evsel__strval(evsel, sample, "comm"),
1897                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1898                runtime,
1899                perf_evsel__intval(evsel, sample, "vruntime"));
1900         goto out_put;
1901 }
1902
1903 static int bpf_output__printer(enum binary_printer_ops op,
1904                                unsigned int val, void *extra __maybe_unused, FILE *fp)
1905 {
1906         unsigned char ch = (unsigned char)val;
1907
1908         switch (op) {
1909         case BINARY_PRINT_CHAR_DATA:
1910                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1911         case BINARY_PRINT_DATA_BEGIN:
1912         case BINARY_PRINT_LINE_BEGIN:
1913         case BINARY_PRINT_ADDR:
1914         case BINARY_PRINT_NUM_DATA:
1915         case BINARY_PRINT_NUM_PAD:
1916         case BINARY_PRINT_SEP:
1917         case BINARY_PRINT_CHAR_PAD:
1918         case BINARY_PRINT_LINE_END:
1919         case BINARY_PRINT_DATA_END:
1920         default:
1921                 break;
1922         }
1923
1924         return 0;
1925 }
1926
/*
 * Print the raw payload of a BPF output sample, 8 bytes per line, as
 * printable characters (see bpf_output__printer).
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
1933
1934 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1935                                 union perf_event *event __maybe_unused,
1936                                 struct perf_sample *sample)
1937 {
1938         int callchain_ret = 0;
1939
1940         if (sample->callchain) {
1941                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1942                 if (callchain_ret == 0) {
1943                         if (callchain_cursor.nr < trace->min_stack)
1944                                 goto out;
1945                         callchain_ret = 1;
1946                 }
1947         }
1948
1949         trace__printf_interrupted_entry(trace);
1950         trace__fprintf_tstamp(trace, sample->time, trace->output);
1951
1952         if (trace->trace_syscalls)
1953                 fprintf(trace->output, "(         ): ");
1954
1955         fprintf(trace->output, "%s:", evsel->name);
1956
1957         if (perf_evsel__is_bpf_output(evsel)) {
1958                 bpf_output__fprintf(trace, sample);
1959         } else if (evsel->tp_format) {
1960                 event_format__fprintf(evsel->tp_format, sample->cpu,
1961                                       sample->raw_data, sample->raw_size,
1962                                       trace->output);
1963         }
1964
1965         fprintf(trace->output, "\n");
1966
1967         if (callchain_ret > 0)
1968                 trace__fprintf_callchain(trace, sample);
1969         else if (callchain_ret < 0)
1970                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1971 out:
1972         return 0;
1973 }
1974
1975 static void print_location(FILE *f, struct perf_sample *sample,
1976                            struct addr_location *al,
1977                            bool print_dso, bool print_sym)
1978 {
1979
1980         if ((verbose > 0 || print_dso) && al->map)
1981                 fprintf(f, "%s@", al->map->dso->long_name);
1982
1983         if ((verbose > 0 || print_sym) && al->sym)
1984                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1985                         al->addr - al->sym->start);
1986         else if (al->map)
1987                 fprintf(f, "0x%" PRIx64, al->addr);
1988         else
1989                 fprintf(f, "0x%" PRIx64, sample->addr);
1990 }
1991
1992 static int trace__pgfault(struct trace *trace,
1993                           struct perf_evsel *evsel,
1994                           union perf_event *event __maybe_unused,
1995                           struct perf_sample *sample)
1996 {
1997         struct thread *thread;
1998         struct addr_location al;
1999         char map_type = 'd';
2000         struct thread_trace *ttrace;
2001         int err = -1;
2002         int callchain_ret = 0;
2003
2004         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2005
2006         if (sample->callchain) {
2007                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2008                 if (callchain_ret == 0) {
2009                         if (callchain_cursor.nr < trace->min_stack)
2010                                 goto out_put;
2011                         callchain_ret = 1;
2012                 }
2013         }
2014
2015         ttrace = thread__trace(thread, trace->output);
2016         if (ttrace == NULL)
2017                 goto out_put;
2018
2019         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2020                 ttrace->pfmaj++;
2021         else
2022                 ttrace->pfmin++;
2023
2024         if (trace->summary_only)
2025                 goto out;
2026
2027         thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2028
2029         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2030
2031         fprintf(trace->output, "%sfault [",
2032                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2033                 "maj" : "min");
2034
2035         print_location(trace->output, sample, &al, false, true);
2036
2037         fprintf(trace->output, "] => ");
2038
2039         thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2040
2041         if (!al.map) {
2042                 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2043
2044                 if (al.map)
2045                         map_type = 'x';
2046                 else
2047                         map_type = '?';
2048         }
2049
2050         print_location(trace->output, sample, &al, true, false);
2051
2052         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2053
2054         if (callchain_ret > 0)
2055                 trace__fprintf_callchain(trace, sample);
2056         else if (callchain_ret < 0)
2057                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2058 out:
2059         err = 0;
2060 out_put:
2061         thread__put(thread);
2062         return err;
2063 }
2064
2065 static void trace__set_base_time(struct trace *trace,
2066                                  struct perf_evsel *evsel,
2067                                  struct perf_sample *sample)
2068 {
2069         /*
2070          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2071          * and don't use sample->time unconditionally, we may end up having
2072          * some other event in the future without PERF_SAMPLE_TIME for good
2073          * reason, i.e. we may not be interested in its timestamps, just in
2074          * it taking place, picking some piece of information when it
2075          * appears in our event stream (vfs_getname comes to mind).
2076          */
2077         if (trace->base_time == 0 && !trace->full_time &&
2078             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2079                 trace->base_time = sample->time;
2080 }
2081
2082 static int trace__process_sample(struct perf_tool *tool,
2083                                  union perf_event *event,
2084                                  struct perf_sample *sample,
2085                                  struct perf_evsel *evsel,
2086                                  struct machine *machine __maybe_unused)
2087 {
2088         struct trace *trace = container_of(tool, struct trace, tool);
2089         struct thread *thread;
2090         int err = 0;
2091
2092         tracepoint_handler handler = evsel->handler;
2093
2094         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2095         if (thread && thread__is_filtered(thread))
2096                 goto out;
2097
2098         trace__set_base_time(trace, evsel, sample);
2099
2100         if (handler) {
2101                 ++trace->nr_events;
2102                 handler(trace, evsel, event, sample);
2103         }
2104 out:
2105         thread__put(thread);
2106         return err;
2107 }
2108
2109 static int trace__record(struct trace *trace, int argc, const char **argv)
2110 {
2111         unsigned int rec_argc, i, j;
2112         const char **rec_argv;
2113         const char * const record_args[] = {
2114                 "record",
2115                 "-R",
2116                 "-m", "1024",
2117                 "-c", "1",
2118         };
2119
2120         const char * const sc_args[] = { "-e", };
2121         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2122         const char * const majpf_args[] = { "-e", "major-faults" };
2123         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2124         const char * const minpf_args[] = { "-e", "minor-faults" };
2125         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2126
2127         /* +1 is for the event string below */
2128         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2129                 majpf_args_nr + minpf_args_nr + argc;
2130         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2131
2132         if (rec_argv == NULL)
2133                 return -ENOMEM;
2134
2135         j = 0;
2136         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2137                 rec_argv[j++] = record_args[i];
2138
2139         if (trace->trace_syscalls) {
2140                 for (i = 0; i < sc_args_nr; i++)
2141                         rec_argv[j++] = sc_args[i];
2142
2143                 /* event string may be different for older kernels - e.g., RHEL6 */
2144                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2145                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2146                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2147                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2148                 else {
2149                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2150                         free(rec_argv);
2151                         return -1;
2152                 }
2153         }
2154
2155         if (trace->trace_pgfaults & TRACE_PFMAJ)
2156                 for (i = 0; i < majpf_args_nr; i++)
2157                         rec_argv[j++] = majpf_args[i];
2158
2159         if (trace->trace_pgfaults & TRACE_PFMIN)
2160                 for (i = 0; i < minpf_args_nr; i++)
2161                         rec_argv[j++] = minpf_args[i];
2162
2163         for (i = 0; i < (unsigned int)argc; i++)
2164                 rec_argv[j++] = argv[i];
2165
2166         return cmd_record(j, rec_argv);
2167 }
2168
2169 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2170
2171 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2172 {
2173         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2174
2175         if (IS_ERR(evsel))
2176                 return false;
2177
2178         if (perf_evsel__field(evsel, "pathname") == NULL) {
2179                 perf_evsel__delete(evsel);
2180                 return false;
2181         }
2182
2183         evsel->handler = trace__vfs_getname;
2184         perf_evlist__add(evlist, evsel);
2185         return true;
2186 }
2187
2188 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2189 {
2190         struct perf_evsel *evsel;
2191         struct perf_event_attr attr = {
2192                 .type = PERF_TYPE_SOFTWARE,
2193                 .mmap_data = 1,
2194         };
2195
2196         attr.config = config;
2197         attr.sample_period = 1;
2198
2199         event_attr_init(&attr);
2200
2201         evsel = perf_evsel__new(&attr);
2202         if (evsel)
2203                 evsel->handler = trace__pgfault;
2204
2205         return evsel;
2206 }
2207
2208 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2209 {
2210         const u32 type = event->header.type;
2211         struct perf_evsel *evsel;
2212
2213         if (type != PERF_RECORD_SAMPLE) {
2214                 trace__process_event(trace, trace->host, event, sample);
2215                 return;
2216         }
2217
2218         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2219         if (evsel == NULL) {
2220                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2221                 return;
2222         }
2223
2224         trace__set_base_time(trace, evsel, sample);
2225
2226         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2227             sample->raw_data == NULL) {
2228                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2229                        perf_evsel__name(evsel), sample->tid,
2230                        sample->cpu, sample->raw_size);
2231         } else {
2232                 tracepoint_handler handler = evsel->handler;
2233                 handler(trace, evsel, event, sample);
2234         }
2235 }
2236
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels
 * to the evlist, wiring up their handlers, the cached tracepoint field
 * offsets and the callchain configuration.
 *
 * Returns 0 on success, -1 on failure (partially created evsels are
 * cleaned up via the goto ladder).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the "args" field for perf_evsel__sc_tp_ptr(). */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Cache the offset of the "ret" field for perf_evsel__sc_tp_uint(). */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2285
2286 static int trace__set_ev_qualifier_filter(struct trace *trace)
2287 {
2288         int err = -1;
2289         struct perf_evsel *sys_exit;
2290         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2291                                                 trace->ev_qualifier_ids.nr,
2292                                                 trace->ev_qualifier_ids.entries);
2293
2294         if (filter == NULL)
2295                 goto out_enomem;
2296
2297         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2298                                           filter)) {
2299                 sys_exit = trace->syscalls.events.sys_exit;
2300                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2301         }
2302
2303         free(filter);
2304 out:
2305         return err;
2306 out_enomem:
2307         errno = ENOMEM;
2308         goto out;
2309 }
2310
/*
 * Build the list of pids whose events must be filtered out to avoid a
 * feedback loop: our own pid plus, when running under ssh, the sshd
 * ancestor that is pushing our output over the network (whose writes
 * would otherwise generate more syscalls to trace, ad infinitum).
 * Walks up the parent chain, bounded by the pids[] array size.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	while (thread && nr < ARRAY_SIZE(pids)) {
		/*
		 * NOTE(review): machine__find_thread() appears to return a
		 * reference that is never put here - presumably harmless for
		 * this one-shot setup path, but worth confirming.
		 */
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2334
2335 static int trace__run(struct trace *trace, int argc, const char **argv)
2336 {
2337         struct perf_evlist *evlist = trace->evlist;
2338         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2339         int err = -1, i;
2340         unsigned long before;
2341         const bool forks = argc > 0;
2342         bool draining = false;
2343
2344         trace->live = true;
2345
2346         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2347                 goto out_error_raw_syscalls;
2348
2349         if (trace->trace_syscalls)
2350                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2351
2352         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2353                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2354                 if (pgfault_maj == NULL)
2355                         goto out_error_mem;
2356                 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2357                 perf_evlist__add(evlist, pgfault_maj);
2358         }
2359
2360         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2361                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2362                 if (pgfault_min == NULL)
2363                         goto out_error_mem;
2364                 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2365                 perf_evlist__add(evlist, pgfault_min);
2366         }
2367
2368         if (trace->sched &&
2369             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2370                                    trace__sched_stat_runtime))
2371                 goto out_error_sched_stat_runtime;
2372
2373         /*
2374          * If a global cgroup was set, apply it to all the events without an
2375          * explicit cgroup. I.e.:
2376          *
2377          *      trace -G A -e sched:*switch
2378          *
2379          * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2380          * _and_ sched:sched_switch to the 'A' cgroup, while:
2381          *
2382          * trace -e sched:*switch -G A
2383          *
2384          * will only set the sched:sched_switch event to the 'A' cgroup, all the
2385          * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
2386          * a cgroup (on the root cgroup, sys wide, etc).
2387          *
2388          * Multiple cgroups:
2389          *
2390          * trace -G A -e sched:*switch -G B
2391          *
2392          * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2393          * to the 'B' cgroup.
2394          *
2395          * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2396          * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2397          */
2398         if (trace->cgroup)
2399                 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2400
2401         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2402         if (err < 0) {
2403                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2404                 goto out_delete_evlist;
2405         }
2406
2407         err = trace__symbols_init(trace, evlist);
2408         if (err < 0) {
2409                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2410                 goto out_delete_evlist;
2411         }
2412
2413         perf_evlist__config(evlist, &trace->opts, &callchain_param);
2414
2415         signal(SIGCHLD, sig_handler);
2416         signal(SIGINT, sig_handler);
2417
2418         if (forks) {
2419                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2420                                                     argv, false, NULL);
2421                 if (err < 0) {
2422                         fprintf(trace->output, "Couldn't run the workload!\n");
2423                         goto out_delete_evlist;
2424                 }
2425         }
2426
2427         err = perf_evlist__open(evlist);
2428         if (err < 0)
2429                 goto out_error_open;
2430
2431         err = bpf__apply_obj_config();
2432         if (err) {
2433                 char errbuf[BUFSIZ];
2434
2435                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2436                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2437                          errbuf);
2438                 goto out_error_open;
2439         }
2440
2441         /*
2442          * Better not use !target__has_task() here because we need to cover the
2443          * case where no threads were specified in the command line, but a
2444          * workload was, and in that case we will fill in the thread_map when
2445          * we fork the workload in perf_evlist__prepare_workload.
2446          */
2447         if (trace->filter_pids.nr > 0)
2448                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2449         else if (thread_map__pid(evlist->threads, 0) == -1)
2450                 err = trace__set_filter_loop_pids(trace);
2451
2452         if (err < 0)
2453                 goto out_error_mem;
2454
2455         if (trace->ev_qualifier_ids.nr > 0) {
2456                 err = trace__set_ev_qualifier_filter(trace);
2457                 if (err < 0)
2458                         goto out_errno;
2459
2460                 pr_debug("event qualifier tracepoint filter: %s\n",
2461                          trace->syscalls.events.sys_exit->filter);
2462         }
2463
2464         err = perf_evlist__apply_filters(evlist, &evsel);
2465         if (err < 0)
2466                 goto out_error_apply_filters;
2467
2468         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2469         if (err < 0)
2470                 goto out_error_mmap;
2471
2472         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2473                 perf_evlist__enable(evlist);
2474
2475         if (forks)
2476                 perf_evlist__start_workload(evlist);
2477
2478         if (trace->opts.initial_delay) {
2479                 usleep(trace->opts.initial_delay * 1000);
2480                 perf_evlist__enable(evlist);
2481         }
2482
2483         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2484                                   evlist->threads->nr > 1 ||
2485                                   perf_evlist__first(evlist)->attr.inherit;
2486
2487         /*
2488          * Now that we already used evsel->attr to ask the kernel to setup the
2489          * events, lets reuse evsel->attr.sample_max_stack as the limit in
2490          * trace__resolve_callchain(), allowing per-event max-stack settings
2491          * to override an explicitely set --max-stack global setting.
2492          */
2493         evlist__for_each_entry(evlist, evsel) {
2494                 if (evsel__has_callchain(evsel) &&
2495                     evsel->attr.sample_max_stack == 0)
2496                         evsel->attr.sample_max_stack = trace->max_stack;
2497         }
2498 again:
2499         before = trace->nr_events;
2500
2501         for (i = 0; i < evlist->nr_mmaps; i++) {
2502                 union perf_event *event;
2503                 struct perf_mmap *md;
2504
2505                 md = &evlist->mmap[i];
2506                 if (perf_mmap__read_init(md) < 0)
2507                         continue;
2508
2509                 while ((event = perf_mmap__read_event(md)) != NULL) {
2510                         struct perf_sample sample;
2511
2512                         ++trace->nr_events;
2513
2514                         err = perf_evlist__parse_sample(evlist, event, &sample);
2515                         if (err) {
2516                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2517                                 goto next_event;
2518                         }
2519
2520                         trace__handle_event(trace, event, &sample);
2521 next_event:
2522                         perf_mmap__consume(md);
2523
2524                         if (interrupted)
2525                                 goto out_disable;
2526
2527                         if (done && !draining) {
2528                                 perf_evlist__disable(evlist);
2529                                 draining = true;
2530                         }
2531                 }
2532                 perf_mmap__read_done(md);
2533         }
2534
2535         if (trace->nr_events == before) {
2536                 int timeout = done ? 100 : -1;
2537
2538                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2539                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2540                                 draining = true;
2541
2542                         goto again;
2543                 }
2544         } else {
2545                 goto again;
2546         }
2547
2548 out_disable:
2549         thread__zput(trace->current);
2550
2551         perf_evlist__disable(evlist);
2552
2553         if (!err) {
2554                 if (trace->summary)
2555                         trace__fprintf_thread_summary(trace, trace->output);
2556
2557                 if (trace->show_tool_stats) {
2558                         fprintf(trace->output, "Stats:\n "
2559                                                " vfs_getname : %" PRIu64 "\n"
2560                                                " proc_getname: %" PRIu64 "\n",
2561                                 trace->stats.vfs_getname,
2562                                 trace->stats.proc_getname);
2563                 }
2564         }
2565
2566 out_delete_evlist:
2567         trace__symbols__exit(trace);
2568
2569         perf_evlist__delete(evlist);
2570         cgroup__put(trace->cgroup);
2571         trace->evlist = NULL;
2572         trace->live = false;
2573         return err;
2574 {
2575         char errbuf[BUFSIZ];
2576
2577 out_error_sched_stat_runtime:
2578         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2579         goto out_error;
2580
2581 out_error_raw_syscalls:
2582         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2583         goto out_error;
2584
2585 out_error_mmap:
2586         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2587         goto out_error;
2588
2589 out_error_open:
2590         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2591
2592 out_error:
2593         fprintf(trace->output, "%s\n", errbuf);
2594         goto out_delete_evlist;
2595
2596 out_error_apply_filters:
2597         fprintf(trace->output,
2598                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2599                 evsel->filter, perf_evsel__name(evsel), errno,
2600                 str_error_r(errno, errbuf, sizeof(errbuf)));
2601         goto out_delete_evlist;
2602 }
2603 out_error_mem:
2604         fprintf(trace->output, "Not enough memory to run!\n");
2605         goto out_delete_evlist;
2606
2607 out_errno:
2608         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2609         goto out_delete_evlist;
2610 }
2611
/*
 * 'perf trace -i perf.data' mode: instead of tracing live, read a
 * previously recorded session and produce the same strace-like output
 * (and/or summary) from its events.  Returns 0 on success, < 0 on error.
 */
static int trace__replay(struct trace *trace)
{
	/* Tracepoint handlers to hook up on events found in the data file. */
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",       trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	/* Install the callbacks perf_session will invoke per record type. */
	trace->tool.sample        = trace__process_sample;
	trace->tool.mmap          = perf_event__process_mmap;
	trace->tool.mmap2         = perf_event__process_mmap2;
	trace->tool.comm          = perf_event__process_comm;
	trace->tool.exit          = perf_event__process_exit;
	trace->tool.fork          = perf_event__process_fork;
	trace->tool.attr          = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id      = perf_event__process_build_id;
	trace->tool.namespaces    = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	/* Restrict symbol resolution to the requested pids/tids, if given. */
	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route all page fault software events to the pgfault handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2712
/* Emit the banner that precedes the per-thread summary table. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2721
/*
 * Resorted view of a thread's per-syscall stats intlist, ordered by the
 * 'a->msecs > b->msecs' comparison (total time spent in each syscall).
 * The body below fills one resort entry from one intlist node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	/* Each source node's key is the syscall id, its priv the stats. */
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* Total time = number of calls * average duration (ns -> msec). */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2735
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max and relative stddev), sorted by total time per syscall.
 * Returns the number of characters printed, 0 if sorting failed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* Sorted-by-msecs view of ttrace->syscall_stats (see resort above). */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Durations are accumulated in ns, reported in msec. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* Relative stddev, as a percentage of the average. */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2778
2779 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2780 {
2781         size_t printed = 0;
2782         struct thread_trace *ttrace = thread__priv(thread);
2783         double ratio;
2784
2785         if (ttrace == NULL)
2786                 return 0;
2787
2788         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2789
2790         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2791         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2792         printed += fprintf(fp, "%.1f%%", ratio);
2793         if (ttrace->pfmaj)
2794                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2795         if (ttrace->pfmin)
2796                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2797         if (trace->sched)
2798                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2799         else if (fputc('\n', fp) != EOF)
2800                 ++printed;
2801
2802         printed += thread__dump_stats(ttrace, trace, fp);
2803
2804         return printed;
2805 }
2806
2807 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2808 {
2809         return ttrace ? ttrace->nr_events : 0;
2810 }
2811
/*
 * Resorted view of a machine's threads, keyed by each thread's event
 * count per the comparison below (ttrace lives in thread->priv).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2818
/*
 * Print the end-of-run summary: a header followed by one section per
 * thread, each bucket of the threads hash table resorted by event count.
 * Returns the number of characters printed, 0 if a resort failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	/* Threads are hashed into THREADS__TABLE_SIZE buckets; walk each. */
	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2840
2841 static int trace__set_duration(const struct option *opt, const char *str,
2842                                int unset __maybe_unused)
2843 {
2844         struct trace *trace = opt->value;
2845
2846         trace->duration_filter = atof(str);
2847         return 0;
2848 }
2849
2850 static int trace__set_filter_pids(const struct option *opt, const char *str,
2851                                   int unset __maybe_unused)
2852 {
2853         int ret = -1;
2854         size_t i;
2855         struct trace *trace = opt->value;
2856         /*
2857          * FIXME: introduce a intarray class, plain parse csv and create a
2858          * { int nr, int entries[] } struct...
2859          */
2860         struct intlist *list = intlist__new(str);
2861
2862         if (list == NULL)
2863                 return -1;
2864
2865         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2866         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2867
2868         if (trace->filter_pids.entries == NULL)
2869                 goto out;
2870
2871         trace->filter_pids.entries[0] = getpid();
2872
2873         for (i = 1; i < trace->filter_pids.nr; ++i)
2874                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2875
2876         intlist__delete(list);
2877         ret = 0;
2878 out:
2879         return ret;
2880 }
2881
2882 static int trace__open_output(struct trace *trace, const char *filename)
2883 {
2884         struct stat st;
2885
2886         if (!stat(filename, &st) && st.st_size) {
2887                 char oldname[PATH_MAX];
2888
2889                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2890                 unlink(oldname);
2891                 rename(filename, oldname);
2892         }
2893
2894         trace->output = fopen(filename, "w");
2895
2896         return trace->output == NULL ? -errno : 0;
2897 }
2898
2899 static int parse_pagefaults(const struct option *opt, const char *str,
2900                             int unset __maybe_unused)
2901 {
2902         int *trace_pgfaults = opt->value;
2903
2904         if (strcmp(str, "all") == 0)
2905                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2906         else if (strcmp(str, "maj") == 0)
2907                 *trace_pgfaults |= TRACE_PFMAJ;
2908         else if (strcmp(str, "min") == 0)
2909                 *trace_pgfaults |= TRACE_PFMIN;
2910         else
2911                 return -1;
2912
2913         return 0;
2914 }
2915
/* Install 'handler' as the sample callback on every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2923
/*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	/* lists[0]: regular perf events, lists[1]: syscalls/strace groups. */
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	/* A leading '!' negates the whole syscall qualifier list. */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/* Temporarily NUL-terminate the current CSV token in place. */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		/*
		 * A token goes to the syscall list if it matches the syscall
		 * table (exactly or by glob) or names a readable strace
		 * group file; everything else is treated as a perf event.
		 */
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			/* Tokens split across both lists can't exceed len. */
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		/* Restore the ',' we overwrote and advance to the next token. */
		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
	}

	err = 0;

	if (lists[0]) {
		/* Feed the non-syscall terms to the stock -e/--event parser. */
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* Undo the last in-place split if we bailed out mid-string. */
	if (sep)
		*sep = ',';

	/* NOTE(review): lists[0]/lists[1] are never freed here — leaked,
	 * though only once per option parse.  Confirm before "fixing". */
	return err;
}
3009
3010 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3011 {
3012         struct trace *trace = opt->value;
3013
3014         if (!list_empty(&trace->evlist->entries))
3015                 return parse_cgroups(opt, str, unset);
3016
3017         trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3018
3019         return 0;
3020 }
3021
3022 int cmd_trace(int argc, const char **argv)
3023 {
3024         const char *trace_usage[] = {
3025                 "perf trace [<options>] [<command>]",
3026                 "perf trace [<options>] -- <command> [<options>]",
3027                 "perf trace record [<options>] [<command>]",
3028                 "perf trace record [<options>] -- <command> [<options>]",
3029                 NULL
3030         };
3031         struct trace trace = {
3032                 .syscalls = {
3033                         . max = -1,
3034                 },
3035                 .opts = {
3036                         .target = {
3037                                 .uid       = UINT_MAX,
3038                                 .uses_mmap = true,
3039                         },
3040                         .user_freq     = UINT_MAX,
3041                         .user_interval = ULLONG_MAX,
3042                         .no_buffering  = true,
3043                         .mmap_pages    = UINT_MAX,
3044                         .proc_map_timeout  = 500,
3045                 },
3046                 .output = stderr,
3047                 .show_comm = true,
3048                 .trace_syscalls = true,
3049                 .kernel_syscallchains = false,
3050                 .max_stack = UINT_MAX,
3051         };
3052         const char *output_name = NULL;
3053         const struct option trace_options[] = {
3054         OPT_CALLBACK('e', "event", &trace, "event",
3055                      "event/syscall selector. use 'perf list' to list available events",
3056                      trace__parse_events_option),
3057         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3058                     "show the thread COMM next to its id"),
3059         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3060         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3061                      trace__parse_events_option),
3062         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3063         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3064         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3065                     "trace events on existing process id"),
3066         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3067                     "trace events on existing thread id"),
3068         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3069                      "pids to filter (by the kernel)", trace__set_filter_pids),
3070         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3071                     "system-wide collection from all CPUs"),
3072         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3073                     "list of cpus to monitor"),
3074         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3075                     "child tasks do not inherit counters"),
3076         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3077                      "number of mmap data pages",
3078                      perf_evlist__parse_mmap_pages),
3079         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3080                    "user to profile"),
3081         OPT_CALLBACK(0, "duration", &trace, "float",
3082                      "show only events with duration > N.M ms",
3083                      trace__set_duration),
3084         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3085         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3086         OPT_BOOLEAN('T', "time", &trace.full_time,
3087                     "Show full timestamp, not time relative to first start"),
3088         OPT_BOOLEAN(0, "failure", &trace.failure_only,
3089                     "Show only syscalls that failed"),
3090         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3091                     "Show only syscall summary with statistics"),
3092         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3093                     "Show all syscalls and summary with statistics"),
3094         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3095                      "Trace pagefaults", parse_pagefaults, "maj"),
3096         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3097         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3098         OPT_CALLBACK(0, "call-graph", &trace.opts,
3099                      "record_mode[,record_size]", record_callchain_help,
3100                      &record_parse_callchain_opt),
3101         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3102                     "Show the kernel callchains on the syscall exit path"),
3103         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3104                      "Set the minimum stack depth when parsing the callchain, "
3105                      "anything below the specified depth will be ignored."),
3106         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3107                      "Set the maximum stack depth when parsing the callchain, "
3108                      "anything beyond the specified depth will be ignored. "
3109                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3110         OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3111                         "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3112         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3113                         "per thread proc mmap processing timeout in ms"),
3114         OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3115                      trace__parse_cgroups),
3116         OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3117                      "ms to wait before starting measurement after program "
3118                      "start"),
3119         OPT_END()
3120         };
3121         bool __maybe_unused max_stack_user_set = true;
3122         bool mmap_pages_user_set = true;
3123         const char * const trace_subcommands[] = { "record", NULL };
3124         int err;
3125         char bf[BUFSIZ];
3126
3127         signal(SIGSEGV, sighandler_dump_stack);
3128         signal(SIGFPE, sighandler_dump_stack);
3129
3130         trace.evlist = perf_evlist__new();
3131         trace.sctbl = syscalltbl__new();
3132
3133         if (trace.evlist == NULL || trace.sctbl == NULL) {
3134                 pr_err("Not enough memory to run!\n");
3135                 err = -ENOMEM;
3136                 goto out;
3137         }
3138
3139         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3140                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3141
3142         if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3143                 usage_with_options_msg(trace_usage, trace_options,
3144                                        "cgroup monitoring only available in system-wide mode");
3145         }
3146
3147         err = bpf__setup_stdout(trace.evlist);
3148         if (err) {
3149                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3150                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3151                 goto out;
3152         }
3153
3154         err = -1;
3155
3156         if (trace.trace_pgfaults) {
3157                 trace.opts.sample_address = true;
3158                 trace.opts.sample_time = true;
3159         }
3160
3161         if (trace.opts.mmap_pages == UINT_MAX)
3162                 mmap_pages_user_set = false;
3163
3164         if (trace.max_stack == UINT_MAX) {
3165                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3166                 max_stack_user_set = false;
3167         }
3168
3169 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3170         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3171                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3172         }
3173 #endif
3174
3175         if (callchain_param.enabled) {
3176                 if (!mmap_pages_user_set && geteuid() == 0)
3177                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3178
3179                 symbol_conf.use_callchain = true;
3180         }
3181
3182         if (trace.evlist->nr_entries > 0)
3183                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3184
3185         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3186                 return trace__record(&trace, argc-1, &argv[1]);
3187
3188         /* summary_only implies summary option, but don't overwrite summary if set */
3189         if (trace.summary_only)
3190                 trace.summary = trace.summary_only;
3191
3192         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3193             trace.evlist->nr_entries == 0 /* Was --events used? */) {
3194                 pr_err("Please specify something to trace.\n");
3195                 return -1;
3196         }
3197
3198         if (!trace.trace_syscalls && trace.ev_qualifier) {
3199                 pr_err("The -e option can't be used with --no-syscalls.\n");
3200                 goto out;
3201         }
3202
3203         if (output_name != NULL) {
3204                 err = trace__open_output(&trace, output_name);
3205                 if (err < 0) {
3206                         perror("failed to create output file");
3207                         goto out;
3208                 }
3209         }
3210
3211         trace.open_id = syscalltbl__id(trace.sctbl, "open");
3212
3213         err = target__validate(&trace.opts.target);
3214         if (err) {
3215                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3216                 fprintf(trace.output, "%s", bf);
3217                 goto out_close;
3218         }
3219
3220         err = target__parse_uid(&trace.opts.target);
3221         if (err) {
3222                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3223                 fprintf(trace.output, "%s", bf);
3224                 goto out_close;
3225         }
3226
3227         if (!argc && target__none(&trace.opts.target))
3228                 trace.opts.target.system_wide = true;
3229
3230         if (input_name)
3231                 err = trace__replay(&trace);
3232         else
3233                 err = trace__run(&trace, argc, argv);
3234
3235 out_close:
3236         if (output_name != NULL)
3237                 fclose(trace.output);
3238 out:
3239         return err;
3240 }