Merge tag 'for-linus-20170904' of git://git.infradead.org/linux-mtd
[sfrench/cifs-2.6.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60
61 #include "sane_ctype.h"
62
/* Fallbacks for libc headers that predate these definitions. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif
70
/*
 * Global state for one 'perf trace' invocation: the tool callbacks, the
 * syscall table, the target evlist/machine, option flags and counters.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* highest syscall id in ->table */
		struct syscall  *table;	/* indexed by syscall id */
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;	/* the raw_syscalls tracepoints */
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* qualifier as syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier; /* invert ev_qualifier match */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* NOTE(review): presumably the id of open(2) — confirm where it is set */
};
122
/*
 * Accessor for one tracepoint sample field: the field's byte offset in
 * the raw payload plus a callback extracting it either as an integer
 * or as a pointer into the payload (the two are mutually exclusive).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
130
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned
 * integer of the given width from sample->raw_data at field->offset.
 * memcpy() avoids unaligned-access problems on strict architectures.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Byte-swapping variants, for samples recorded with the opposite
 * endianness from the host running 'perf trace'.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
155
156 static int tp_field__init_uint(struct tp_field *field,
157                                struct format_field *format_field,
158                                bool needs_swap)
159 {
160         field->offset = format_field->offset;
161
162         switch (format_field->size) {
163         case 1:
164                 field->integer = tp_field__u8;
165                 break;
166         case 2:
167                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168                 break;
169         case 4:
170                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171                 break;
172         case 8:
173                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174                 break;
175         default:
176                 return -1;
177         }
178
179         return 0;
180 }
181
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
183 {
184         return sample->raw_data + field->offset;
185 }
186
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
188 {
189         field->offset = format_field->offset;
190         field->pointer = tp_field__ptr;
191         return 0;
192 }
193
/*
 * Per-evsel state (hung off evsel->priv) for the sys_enter/sys_exit
 * tracepoints: the syscall id field plus either the args array
 * (sys_enter) or the return value (sys_exit) — never both, hence the
 * union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202                                           struct tp_field *field,
203                                           const char *name)
204 {
205         struct format_field *format_field = perf_evsel__field(evsel, name);
206
207         if (format_field == NULL)
208                 return -1;
209
210         return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212
/*
 * Initialize the 'name' member of the syscall_tp hanging off
 * evsel->priv from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
216
/*
 * Look up tracepoint field 'name' in the evsel's format and bind the
 * pointer extractor for it.  Returns 0 on success, -1 when the field
 * does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
228
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
238
/*
 * Allocate the per-evsel syscall_tp and bind its 'id' field from the
 * tracepoint format, attaching 'handler' as the sample callback.
 * Returns 0 on success, -ENOMEM on allocation failure, -ENOENT when
 * the tracepoint has no usable 'id' field.
 */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}
256
/*
 * Create an evsel for the raw_syscalls:sys_{enter,exit} tracepoint
 * named by 'direction', falling back to the older "syscalls" group,
 * and wire up its syscall_tp state and sample handler.
 * Returns NULL on failure.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
277
/*
 * Extract the tracepoint field 'name' (bound by the init helpers
 * above) from a sample, as an integer or as a pointer into raw data.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288         int idx = val - sa->offset;
289
290         if (idx < 0 || idx >= sa->nr_entries)
291                 return scnprintf(bf, size, intfmt, val);
292
293         return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295
/* Format arg->val via the strarray in arg->parm, 'intfmt' as fallback. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

/* Same, with a plain decimal fallback for out-of-range values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
310
/*
 * A set of strarrays covering disjoint value ranges, tried in order
 * (used e.g. for the plain + Linux-specific fcntl command ranges).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322                                         struct syscall_arg *arg)
323 {
324         struct strarrays *sas = arg->parm;
325         int i;
326
327         for (i = 0; i < sas->nr_entries; ++i) {
328                 struct strarray *sa = sas->entries[i];
329                 int idx = arg->val - sa->offset;
330
331                 if (idx >= 0 && idx < sa->nr_entries) {
332                         if (sa->entries[idx] == NULL)
333                                 break;
334                         return scnprintf(bf, size, "%s", sa->entries[idx]);
335                 }
336         }
337
338         return scnprintf(bf, size, "%d", arg->val);
339 }
340
341 #ifndef AT_FDCWD
342 #define AT_FDCWD        -100
343 #endif
344
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346                                            struct syscall_arg *arg)
347 {
348         int fd = arg->val;
349
350         if (fd == AT_FDCWD)
351                 return scnprintf(bf, size, "CWD");
352
353         return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
357
/*
 * Forward declaration: close(2)'s fd beautifier is defined later in
 * this file (NOTE(review): presumably because it needs per-thread
 * state declared below — confirm at the definition site).
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362
/* Catch-all argument formatters: hexadecimal, signed int, signed long. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
377
/*
 * Symbolic-name tables for enum-like syscall arguments, wrapped in a
 * 'struct strarray' by DEFINE_STRARRAY*() for use with SCA_STRARRAY.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops start at 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE may be missing from older libc headers */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* [5] skips a hole in the F_LINUX_SPECIFIC_BASE numbering */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* both fcntl cmd ranges, tried in order by SCA_FCNTL_CMD */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458
/*
 * Format access(2)'s mode argument: "F" for F_OK, otherwise the set
 * R/W/X bits concatenated, with any unknown leftover bits in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

/* forward declaration: the definition appears later in this file */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
490
/*
 * Format pipe2(2)'s flags argument as a '|' separated list of the
 * known O_* bits, with any leftover bits printed in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513
/* fallbacks for libc headers lacking the getrandom(2) flags */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Format getrandom(2)'s flags argument as a '|' separated list of the
 * known GRND_* bits, with any leftover bits printed in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543
/*
 * Shorthand for a syscall_arg_fmt initializer printing the argument
 * via the given strarray; 'name' only documents the arg, it is unused.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562
/*
 * How to format one syscall argument: a scnprintf-style callback, an
 * opaque parameter for it (e.g. a strarray), an optional name override
 * and whether a zero value should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
569
570 static struct syscall_fmt {
571         const char *name;
572         const char *alias;
573         struct syscall_arg_fmt arg[6];
574         u8         nr_args;
575         bool       errpid;
576         bool       timeout;
577         bool       hexret;
578 } syscall_fmts[] = {
579         { .name     = "access",
580           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
581         { .name     = "arch_prctl", .alias = "prctl", },
582         { .name     = "bpf",
583           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
584         { .name     = "brk",        .hexret = true,
585           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
586         { .name     = "clock_gettime",
587           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
588         { .name     = "clone",      .errpid = true, .nr_args = 5,
589           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
590                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
591                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
592                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
593                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
594         { .name     = "close",
595           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
596         { .name     = "epoll_ctl",
597           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
598         { .name     = "eventfd2",
599           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
600         { .name     = "fchmodat",
601           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
602         { .name     = "fchownat",
603           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
604         { .name     = "fcntl",
605           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
606                            .parm      = &strarrays__fcntl_cmds_arrays,
607                            .show_zero = true, },
608                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
609         { .name     = "flock",
610           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
611         { .name     = "fstat", .alias = "newfstat", },
612         { .name     = "fstatat", .alias = "newfstatat", },
613         { .name     = "futex",
614           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
615         { .name     = "futimesat",
616           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
617         { .name     = "getitimer",
618           .arg = { [0] = STRARRAY(which, itimers), }, },
619         { .name     = "getpid",     .errpid = true, },
620         { .name     = "getpgid",    .errpid = true, },
621         { .name     = "getppid",    .errpid = true, },
622         { .name     = "getrandom",
623           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
624         { .name     = "getrlimit",
625           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
626         { .name     = "ioctl",
627           .arg = {
628 #if defined(__i386__) || defined(__x86_64__)
629 /*
630  * FIXME: Make this available to all arches.
631  */
632                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
633                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
634 #else
635                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
636 #endif
637         { .name     = "keyctl",
638           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
639         { .name     = "kill",
640           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
641         { .name     = "linkat",
642           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
643         { .name     = "lseek",
644           .arg = { [2] = STRARRAY(whence, whences), }, },
645         { .name     = "lstat", .alias = "newlstat", },
646         { .name     = "madvise",
647           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
648                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
649         { .name     = "mkdirat",
650           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
651         { .name     = "mknodat",
652           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
653         { .name     = "mlock",
654           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
655         { .name     = "mlockall",
656           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
657         { .name     = "mmap",       .hexret = true,
658 /* The standard mmap maps to old_mmap on s390x */
659 #if defined(__s390x__)
660         .alias = "old_mmap",
661 #endif
662           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
663                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
664                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
665         { .name     = "mprotect",
666           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
667                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
668         { .name     = "mq_unlink",
669           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
670         { .name     = "mremap",     .hexret = true,
671           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
672                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
673                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
674         { .name     = "munlock",
675           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
676         { .name     = "munmap",
677           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
678         { .name     = "name_to_handle_at",
679           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
680         { .name     = "newfstatat",
681           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
682         { .name     = "open",
683           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
684         { .name     = "open_by_handle_at",
685           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
686                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
687         { .name     = "openat",
688           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
689                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
690         { .name     = "perf_event_open",
691           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
692                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
693                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
694         { .name     = "pipe2",
695           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
696         { .name     = "pkey_alloc",
697           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
698         { .name     = "pkey_free",
699           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
700         { .name     = "pkey_mprotect",
701           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
702                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
703                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
704         { .name     = "poll", .timeout = true, },
705         { .name     = "ppoll", .timeout = true, },
706         { .name     = "pread", .alias = "pread64", },
707         { .name     = "preadv", .alias = "pread", },
708         { .name     = "prlimit64",
709           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
710         { .name     = "pwrite", .alias = "pwrite64", },
711         { .name     = "readlinkat",
712           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
713         { .name     = "recvfrom",
714           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
715         { .name     = "recvmmsg",
716           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
717         { .name     = "recvmsg",
718           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
719         { .name     = "renameat",
720           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
721         { .name     = "rt_sigaction",
722           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
723         { .name     = "rt_sigprocmask",
724           .arg = { [0] = STRARRAY(how, sighow), }, },
725         { .name     = "rt_sigqueueinfo",
726           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
727         { .name     = "rt_tgsigqueueinfo",
728           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
729         { .name     = "sched_setscheduler",
730           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
731         { .name     = "seccomp",
732           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
733                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
734         { .name     = "select", .timeout = true, },
735         { .name     = "sendmmsg",
736           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
737         { .name     = "sendmsg",
738           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
739         { .name     = "sendto",
740           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
741         { .name     = "set_tid_address", .errpid = true, },
742         { .name     = "setitimer",
743           .arg = { [0] = STRARRAY(which, itimers), }, },
744         { .name     = "setrlimit",
745           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
746         { .name     = "socket",
747           .arg = { [0] = STRARRAY(family, socket_families),
748                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
749         { .name     = "socketpair",
750           .arg = { [0] = STRARRAY(family, socket_families),
751                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
752         { .name     = "stat", .alias = "newstat", },
753         { .name     = "statx",
754           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
755                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
756                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
757         { .name     = "swapoff",
758           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
759         { .name     = "swapon",
760           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
761         { .name     = "symlinkat",
762           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
763         { .name     = "tgkill",
764           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
765         { .name     = "tkill",
766           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
767         { .name     = "uname", .alias = "newuname", },
768         { .name     = "unlinkat",
769           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
770         { .name     = "utimensat",
771           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
772         { .name     = "wait4",      .errpid = true,
773           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
774         { .name     = "waitid",     .errpid = true,
775           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
776 };
777
778 static int syscall_fmt__cmp(const void *name, const void *fmtp)
779 {
780         const struct syscall_fmt *fmt = fmtp;
781         return strcmp(name, fmt->name);
782 }
783
784 static struct syscall_fmt *syscall_fmt__find(const char *name)
785 {
786         const int nmemb = ARRAY_SIZE(syscall_fmts);
787         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
788 }
789
/*
 * Everything known about one syscall: its tracepoint format, argument
 * fields and the formatting overrides found in syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;	/* NOTE(review): presumably syscalls that never return (exit, exit_group) — confirm at the users */
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};
799
/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "(     ?   )" for the duration and
 * for the start timestamp.
 */
807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
808 {
809         double duration = (double)t / NSEC_PER_MSEC;
810         size_t printed = fprintf(fp, "(");
811
812         if (!calculated)
813                 printed += fprintf(fp, "     ?   ");
814         else if (duration >= 1.0)
815                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816         else if (duration >= 0.01)
817                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
818         else
819                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820         return printed + fprintf(fp, "): ");
821 }
822
823 /**
824  * filename.ptr: The filename char pointer that will be vfs_getname'd
825  * filename.entry_str_pos: Where to insert the string translated from
826  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
827  * ret_scnprintf: syscall args may set this to a different syscall return
828  *                formatter, for instance, fcntl may return fds, file flags, etc.
829  */
struct thread_trace {
	u64               entry_time;	/* timestamp of the last sys_enter */
	bool              entry_pending;	/* enter line built, its sys_exit not yet seen */
	unsigned long     nr_events;	/* samples seen for this thread */
	unsigned long     pfmaj, pfmin;	/* page fault counters (major/minor) */
	char              *entry_str;	/* buffer the sys_enter line is assembled in */
	double            runtime_ms;	/* NOTE(review): presumably sched runtime; updated outside this chunk */
	size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;	/* filename pointer arg awaiting vfs_getname */
		short int     entry_str_pos;	/* where in entry_str to splice the resolved name */
		bool          pending_open;	/* open-like syscall in flight, see trace__sys_exit */
		unsigned int  namelen;
		char          *name;	/* last name captured via vfs_getname */
	} filename;
	struct {
		int       max;		/* highest fd with a cached path, -1 when empty */
		char      **table;	/* fd -> pathname cache, grown lazily */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id stats, for --summary */
};
852
853 static struct thread_trace *thread_trace__new(void)
854 {
855         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
856
857         if (ttrace)
858                 ttrace->paths.max = -1;
859
860         ttrace->syscall_stats = intlist__new(NULL);
861
862         return ttrace;
863 }
864
865 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
866 {
867         struct thread_trace *ttrace;
868
869         if (thread == NULL)
870                 goto fail;
871
872         if (thread__priv(thread) == NULL)
873                 thread__set_priv(thread, thread_trace__new());
874
875         if (thread__priv(thread) == NULL)
876                 goto fail;
877
878         ttrace = thread__priv(thread);
879         ++ttrace->nr_events;
880
881         return ttrace;
882 fail:
883         color_fprintf(fp, PERF_COLOR_RED,
884                       "WARNING: not enough memory, dropping samples!\n");
885         return NULL;
886 }
887
888
889 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
890                                     size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
891 {
892         struct thread_trace *ttrace = thread__priv(arg->thread);
893
894         ttrace->ret_scnprintf = ret_scnprintf;
895 }
896
/* Page fault selection flags (major/minor); combined as a bitmask. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer where the sys_enter line is assembled. */
static const size_t trace__entry_str_size = 2048;
901
902 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
903 {
904         struct thread_trace *ttrace = thread__priv(thread);
905
906         if (fd > ttrace->paths.max) {
907                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
908
909                 if (npath == NULL)
910                         return -1;
911
912                 if (ttrace->paths.max != -1) {
913                         memset(npath + ttrace->paths.max + 1, 0,
914                                (fd - ttrace->paths.max) * sizeof(char *));
915                 } else {
916                         memset(npath, 0, (fd + 1) * sizeof(char *));
917                 }
918
919                 ttrace->paths.table = npath;
920                 ttrace->paths.max   = fd;
921         }
922
923         ttrace->paths.table[fd] = strdup(pathname);
924
925         return ttrace->paths.table[fd] != NULL ? 0 : -1;
926 }
927
/*
 * Resolve 'fd' to a pathname by reading the /proc/<pid>[/task/<tid>]/fd/<fd>
 * symlink and cache the result. Returns 0 on success, -1 on any failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	/* For the thread group leader the short /proc/<pid>/fd form suffices. */
	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* st.st_size is the symlink target length; bail if it won't fit. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/*
	 * NOTE(review): the link can change between lstat() and readlink()
	 * (TOCTOU); the 'ret > st.st_size' check rejects a target that grew.
	 */
	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';	/* readlink() does not NUL-terminate */
	return trace__set_fd_pathname(thread, fd, pathname);
}
953
954 static const char *thread__fd_path(struct thread *thread, int fd,
955                                    struct trace *trace)
956 {
957         struct thread_trace *ttrace = thread__priv(thread);
958
959         if (ttrace == NULL)
960                 return NULL;
961
962         if (fd < 0)
963                 return NULL;
964
965         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
966                 if (!trace->live)
967                         return NULL;
968                 ++trace->stats.proc_getname;
969                 if (thread__read_fd_path(thread, fd))
970                         return NULL;
971         }
972
973         return ttrace->paths.table[fd];
974 }
975
976 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
977 {
978         int fd = arg->val;
979         size_t printed = scnprintf(bf, size, "%d", fd);
980         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
981
982         if (path)
983                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
984
985         return printed;
986 }
987
988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
989                                               struct syscall_arg *arg)
990 {
991         int fd = arg->val;
992         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
993         struct thread_trace *ttrace = thread__priv(arg->thread);
994
995         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
996                 zfree(&ttrace->paths.table[fd]);
997
998         return printed;
999 }
1000
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1002                                      unsigned long ptr)
1003 {
1004         struct thread_trace *ttrace = thread__priv(thread);
1005
1006         ttrace->filename.ptr = ptr;
1007         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1008 }
1009
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011                                               struct syscall_arg *arg)
1012 {
1013         unsigned long ptr = arg->val;
1014
1015         if (!arg->trace->vfs_getname)
1016                 return scnprintf(bf, size, "%#x", ptr);
1017
1018         thread__set_filename_pos(arg->thread, bf, ptr);
1019         return 0;
1020 }
1021
1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024         return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026
1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030
1031         return fprintf(fp, "%10.3f ", ts);
1032 }
1033
1034 /*
1035  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036  * using ttrace->entry_time for a thread that receives a sys_exit without
1037  * first having received a sys_enter ("poll" issued before tracing session
1038  * starts, lost sys_enter exit due to ring buffer overflow).
1039  */
1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1041 {
1042         if (tstamp > 0)
1043                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1044
1045         return fprintf(fp, "         ? ");
1046 }
1047
/*
 * Flags set from the signal handler and polled by the main loop. Use
 * 'volatile sig_atomic_t' -- the only object type the C standard guarantees
 * can be safely written from an async signal handler and not cached by the
 * compiler across it (plain 'bool' gives neither guarantee).
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;	/* SIGINT aborts, others just stop */
}
1056
1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1058                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1059 {
1060         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1061         printed += fprintf_duration(duration, duration_calculated, fp);
1062
1063         if (trace->multiple_threads) {
1064                 if (trace->show_comm)
1065                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1066                 printed += fprintf(fp, "%d ", thread->tid);
1067         }
1068
1069         return printed;
1070 }
1071
1072 static int trace__process_event(struct trace *trace, struct machine *machine,
1073                                 union perf_event *event, struct perf_sample *sample)
1074 {
1075         int ret = 0;
1076
1077         switch (event->header.type) {
1078         case PERF_RECORD_LOST:
1079                 color_fprintf(trace->output, PERF_COLOR_RED,
1080                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1081                 ret = machine__process_lost_event(machine, event, sample);
1082                 break;
1083         default:
1084                 ret = machine__process_event(machine, event, sample);
1085                 break;
1086         }
1087
1088         return ret;
1089 }
1090
1091 static int trace__tool_process(struct perf_tool *tool,
1092                                union perf_event *event,
1093                                struct perf_sample *sample,
1094                                struct machine *machine)
1095 {
1096         struct trace *trace = container_of(tool, struct trace, tool);
1097         return trace__process_event(trace, machine, event, sample);
1098 }
1099
1100 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1101 {
1102         struct machine *machine = vmachine;
1103
1104         if (machine->kptr_restrict_warned)
1105                 return NULL;
1106
1107         if (symbol_conf.kptr_restrict) {
1108                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1109                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1110                            "Kernel samples will not be resolved.\n");
1111                 machine->kptr_restrict_warned = true;
1112                 return NULL;
1113         }
1114
1115         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1116 }
1117
/*
 * Set up symbol resolution and the host machine object, then synthesize
 * events for already-running threads so their maps can be resolved.
 * Returns 0 on success or a negative error code.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Resolver that warns (once) instead of lying under kptr_restrict. */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1140
1141 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1142 {
1143         int idx;
1144
1145         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1146                 nr_args = sc->fmt->nr_args;
1147
1148         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1149         if (sc->arg_fmt == NULL)
1150                 return -1;
1151
1152         for (idx = 0; idx < nr_args; ++idx) {
1153                 if (sc->fmt)
1154                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1155         }
1156
1157         sc->nr_args = nr_args;
1158         return 0;
1159 }
1160
/*
 * Pick a default scnprintf formatter for each tracepoint field that wasn't
 * given one by hand in syscall_fmts[], using type/name heuristics.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* Hand-written formatters take precedence. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Heuristic: integer fields whose name ends in "fd"
			 * are file descriptors. Field types observed in
			 * /sys/kernel/tracing/events/syscalls/sys_enter*:
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1199
/*
 * Lazily fill trace->syscalls.table[id]: look up the syscall's name, any
 * hand-written formatter entry and its sys_enter tracepoint format, then
 * set up per-argument formatters. Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table to hold ids [0..id], zeroing only the new slots. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are traced under an alias, e.g. stat -> newstat. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* No tracepoint format: assume the 6-argument maximum. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	/* exit/exit_group never return, so no sys_exit event will pair up. */
	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1260
/*
 * Translate the -e syscall-name list (trace->ev_qualifier) into ids in
 * trace->ev_qualifier_ids. On any invalid name, print all offenders plus a
 * hint, free the id list and return -EINVAL.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* First bad name opens the error line, the rest append. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1306
1307 /*
1308  * args is to be interpreted as a series of longs but we need to handle
1309  * 8-byte unaligned accesses. args points to raw_data within the event
1310  * and raw_data is guaranteed to be 8-byte unaligned because it is
1311  * preceded by raw_size which is a u32. So we need to copy args to a temp
1312  * variable to read it. Most notably this avoids extended load instructions
1313  * on unaligned addresses
1314  */
1315 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1316 {
1317         unsigned long val;
1318         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1319
1320         memcpy(&val, p, sizeof(val));
1321         return val;
1322 }
1323
1324 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1325                                       struct syscall_arg *arg)
1326 {
1327         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1328                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1329
1330         return scnprintf(bf, size, "arg%d: ", arg->idx);
1331 }
1332
1333 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1334                                      struct syscall_arg *arg, unsigned long val)
1335 {
1336         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1337                 arg->val = val;
1338                 if (sc->arg_fmt[arg->idx].parm)
1339                         arg->parm = sc->arg_fmt[arg->idx].parm;
1340                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1341         }
1342         return scnprintf(bf, size, "%ld", val);
1343 }
1344
/*
 * Format all of syscall 'sc''s arguments from the raw 'args' blob into
 * 'bf', using per-argument formatters when available. Returns the number
 * of characters printed.
 *
 * NOTE(review): assumes thread__priv(thread) is non-NULL -- callers reach
 * this only after a successful thread__trace(); confirm for new call sites.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args   = args,
		.idx    = 0,
		.mask   = 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter may consume several args via 'mask'. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1417
/* Per-tracepoint sample handler (e.g. trace__sys_enter, trace__sys_exit). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1421
/*
 * Return the struct syscall for 'id', reading its info on first use.
 * Returns NULL (with diagnostics at sufficient verbosity) for invalid ids
 * or when the syscall info cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Lazily read the syscall's info the first time this id shows up. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* trace__read_syscall_info() may still have left this slot empty. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1464
1465 static void thread__update_stats(struct thread_trace *ttrace,
1466                                  int id, struct perf_sample *sample)
1467 {
1468         struct int_node *inode;
1469         struct stats *stats;
1470         u64 duration = 0;
1471
1472         inode = intlist__findnew(ttrace->syscall_stats, id);
1473         if (inode == NULL)
1474                 return;
1475
1476         stats = inode->priv;
1477         if (stats == NULL) {
1478                 stats = malloc(sizeof(struct stats));
1479                 if (stats == NULL)
1480                         return;
1481                 init_stats(stats);
1482                 inode->priv = stats;
1483         }
1484
1485         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1486                 duration = sample->time - ttrace->entry_time;
1487
1488         update_stats(stats, duration);
1489 }
1490
1491 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1492 {
1493         struct thread_trace *ttrace;
1494         u64 duration;
1495         size_t printed;
1496
1497         if (trace->current == NULL)
1498                 return 0;
1499
1500         ttrace = thread__priv(trace->current);
1501
1502         if (!ttrace->entry_pending)
1503                 return 0;
1504
1505         duration = sample->time - ttrace->entry_time;
1506
1507         printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1508         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1509         ttrace->entry_pending = false;
1510
1511         return printed;
1512 }
1513
/*
 * raw_syscalls:sys_enter handler: build the "name(args" line in the
 * thread's entry_str buffer. It is printed immediately for exit-like
 * syscalls (which never get a sys_exit), otherwise kept pending until the
 * matching sys_exit or until another thread's event interrupts it.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer where the enter line is assembled. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush any still-pending entry from another thread first. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come, close the line right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the current thread so interrupted entries can be flushed. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1572
1573 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1574                                     struct perf_sample *sample,
1575                                     struct callchain_cursor *cursor)
1576 {
1577         struct addr_location al;
1578
1579         if (machine__resolve(trace->host, &al, sample) < 0 ||
1580             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1581                 return -1;
1582
1583         return 0;
1584 }
1585
1586 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1587 {
1588         /* TODO: user-configurable print_opts */
1589         const unsigned int print_opts = EVSEL__PRINT_SYM |
1590                                         EVSEL__PRINT_DSO |
1591                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1592
1593         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1594 }
1595
/*
 * Handle the raw_syscalls:sys_exit tracepoint: pair the exit with the
 * pending sys_enter for this thread, compute the syscall duration and
 * pretty-print the "name(args) = retval" line, strace style.
 *
 * Returns 0 on success, -1 if the syscall table lookup or per-thread
 * state allocation fails.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * An open() just returned a valid fd and a vfs_getname event captured
	 * its pathname: associate the two. See trace__vfs_getname.
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* Duration is exit time minus entry time; honor --duration filtering. */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The entry line was already flushed by another event. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return value formatting: errno name/message for failures, then
	 * per-syscall special cases (timeout, custom scnprintf, hex, child
	 * pid), falling back to a plain signed decimal via signed_print.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		/* One-shot formatter armed at sys_enter time; disarm it. */
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* e.g. fork/clone/wait: the return value is a pid. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1709
/*
 * Handle the probe:vfs_getname tracepoint: stash the pathname the kernel
 * just resolved so that the matching syscall exit (see trace__sys_exit)
 * can associate it with the returned fd, and splice it into the pending
 * entry string at the position recorded for the pointer argument.
 *
 * Always returns 0; failures just mean the pathname is not captured.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread pathname buffer if this name is longer. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No pointer-arg placeholder pending in the entry string? Done. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the pathname if it doesn't fit whole. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at entry_str_pos and copy the pathname into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1770
1771 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1772                                      union perf_event *event __maybe_unused,
1773                                      struct perf_sample *sample)
1774 {
1775         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1776         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1777         struct thread *thread = machine__findnew_thread(trace->host,
1778                                                         sample->pid,
1779                                                         sample->tid);
1780         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1781
1782         if (ttrace == NULL)
1783                 goto out_dump;
1784
1785         ttrace->runtime_ms += runtime_ms;
1786         trace->runtime_ms += runtime_ms;
1787 out_put:
1788         thread__put(thread);
1789         return 0;
1790
1791 out_dump:
1792         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1793                evsel->name,
1794                perf_evsel__strval(evsel, sample, "comm"),
1795                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1796                runtime,
1797                perf_evsel__intval(evsel, sample, "vruntime"));
1798         goto out_put;
1799 }
1800
1801 static void bpf_output__printer(enum binary_printer_ops op,
1802                                 unsigned int val, void *extra)
1803 {
1804         FILE *output = extra;
1805         unsigned char ch = (unsigned char)val;
1806
1807         switch (op) {
1808         case BINARY_PRINT_CHAR_DATA:
1809                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1810                 break;
1811         case BINARY_PRINT_DATA_BEGIN:
1812         case BINARY_PRINT_LINE_BEGIN:
1813         case BINARY_PRINT_ADDR:
1814         case BINARY_PRINT_NUM_DATA:
1815         case BINARY_PRINT_NUM_PAD:
1816         case BINARY_PRINT_SEP:
1817         case BINARY_PRINT_CHAR_PAD:
1818         case BINARY_PRINT_LINE_END:
1819         case BINARY_PRINT_DATA_END:
1820         default:
1821                 break;
1822         }
1823 }
1824
/* Dump a BPF output event's raw payload, 8 bytes per line, chars only. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1831
1832 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1833                                 union perf_event *event __maybe_unused,
1834                                 struct perf_sample *sample)
1835 {
1836         int callchain_ret = 0;
1837
1838         if (sample->callchain) {
1839                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1840                 if (callchain_ret == 0) {
1841                         if (callchain_cursor.nr < trace->min_stack)
1842                                 goto out;
1843                         callchain_ret = 1;
1844                 }
1845         }
1846
1847         trace__printf_interrupted_entry(trace, sample);
1848         trace__fprintf_tstamp(trace, sample->time, trace->output);
1849
1850         if (trace->trace_syscalls)
1851                 fprintf(trace->output, "(         ): ");
1852
1853         fprintf(trace->output, "%s:", evsel->name);
1854
1855         if (perf_evsel__is_bpf_output(evsel)) {
1856                 bpf_output__fprintf(trace, sample);
1857         } else if (evsel->tp_format) {
1858                 event_format__fprintf(evsel->tp_format, sample->cpu,
1859                                       sample->raw_data, sample->raw_size,
1860                                       trace->output);
1861         }
1862
1863         fprintf(trace->output, ")\n");
1864
1865         if (callchain_ret > 0)
1866                 trace__fprintf_callchain(trace, sample);
1867         else if (callchain_ret < 0)
1868                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1869 out:
1870         return 0;
1871 }
1872
1873 static void print_location(FILE *f, struct perf_sample *sample,
1874                            struct addr_location *al,
1875                            bool print_dso, bool print_sym)
1876 {
1877
1878         if ((verbose > 0 || print_dso) && al->map)
1879                 fprintf(f, "%s@", al->map->dso->long_name);
1880
1881         if ((verbose > 0 || print_sym) && al->sym)
1882                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1883                         al->addr - al->sym->start);
1884         else if (al->map)
1885                 fprintf(f, "0x%" PRIx64, al->addr);
1886         else
1887                 fprintf(f, "0x%" PRIx64, sample->addr);
1888 }
1889
/*
 * Handle major/minor page fault software events: bump the per-thread
 * counters and print "maj/minfault [faulting-ip] => (target-addr) (xd)"
 * with both locations symbolized where possible.
 *
 * Returns 0 on success, -1 if per-thread state can't be set up or the
 * callchain fails the --min-stack filter.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata, 'x'ecutable or '?' unknown */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	/* With --summary-only we just keep the counters, no per-event line. */
	if (trace->summary_only)
		goto out;

	/* Symbolize the faulting instruction pointer. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Now symbolize the faulted-on address, first as data ... */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* ... then as code (e.g. faulting in text pages). */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1965
1966 static void trace__set_base_time(struct trace *trace,
1967                                  struct perf_evsel *evsel,
1968                                  struct perf_sample *sample)
1969 {
1970         /*
1971          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1972          * and don't use sample->time unconditionally, we may end up having
1973          * some other event in the future without PERF_SAMPLE_TIME for good
1974          * reason, i.e. we may not be interested in its timestamps, just in
1975          * it taking place, picking some piece of information when it
1976          * appears in our event stream (vfs_getname comes to mind).
1977          */
1978         if (trace->base_time == 0 && !trace->full_time &&
1979             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1980                 trace->base_time = sample->time;
1981 }
1982
1983 static int trace__process_sample(struct perf_tool *tool,
1984                                  union perf_event *event,
1985                                  struct perf_sample *sample,
1986                                  struct perf_evsel *evsel,
1987                                  struct machine *machine __maybe_unused)
1988 {
1989         struct trace *trace = container_of(tool, struct trace, tool);
1990         struct thread *thread;
1991         int err = 0;
1992
1993         tracepoint_handler handler = evsel->handler;
1994
1995         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1996         if (thread && thread__is_filtered(thread))
1997                 goto out;
1998
1999         trace__set_base_time(trace, evsel, sample);
2000
2001         if (handler) {
2002                 ++trace->nr_events;
2003                 handler(trace, evsel, event, sample);
2004         }
2005 out:
2006         thread__put(thread);
2007         return err;
2008 }
2009
2010 static int trace__record(struct trace *trace, int argc, const char **argv)
2011 {
2012         unsigned int rec_argc, i, j;
2013         const char **rec_argv;
2014         const char * const record_args[] = {
2015                 "record",
2016                 "-R",
2017                 "-m", "1024",
2018                 "-c", "1",
2019         };
2020
2021         const char * const sc_args[] = { "-e", };
2022         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2023         const char * const majpf_args[] = { "-e", "major-faults" };
2024         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2025         const char * const minpf_args[] = { "-e", "minor-faults" };
2026         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2027
2028         /* +1 is for the event string below */
2029         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2030                 majpf_args_nr + minpf_args_nr + argc;
2031         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2032
2033         if (rec_argv == NULL)
2034                 return -ENOMEM;
2035
2036         j = 0;
2037         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2038                 rec_argv[j++] = record_args[i];
2039
2040         if (trace->trace_syscalls) {
2041                 for (i = 0; i < sc_args_nr; i++)
2042                         rec_argv[j++] = sc_args[i];
2043
2044                 /* event string may be different for older kernels - e.g., RHEL6 */
2045                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2046                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2047                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2048                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2049                 else {
2050                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2051                         return -1;
2052                 }
2053         }
2054
2055         if (trace->trace_pgfaults & TRACE_PFMAJ)
2056                 for (i = 0; i < majpf_args_nr; i++)
2057                         rec_argv[j++] = majpf_args[i];
2058
2059         if (trace->trace_pgfaults & TRACE_PFMIN)
2060                 for (i = 0; i < minpf_args_nr; i++)
2061                         rec_argv[j++] = minpf_args[i];
2062
2063         for (i = 0; i < (unsigned int)argc; i++)
2064                 rec_argv[j++] = argv[i];
2065
2066         return cmd_record(j, rec_argv);
2067 }
2068
2069 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2070
2071 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2072 {
2073         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2074
2075         if (IS_ERR(evsel))
2076                 return false;
2077
2078         if (perf_evsel__field(evsel, "pathname") == NULL) {
2079                 perf_evsel__delete(evsel);
2080                 return false;
2081         }
2082
2083         evsel->handler = trace__vfs_getname;
2084         perf_evlist__add(evlist, evsel);
2085         return true;
2086 }
2087
2088 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2089 {
2090         struct perf_evsel *evsel;
2091         struct perf_event_attr attr = {
2092                 .type = PERF_TYPE_SOFTWARE,
2093                 .mmap_data = 1,
2094         };
2095
2096         attr.config = config;
2097         attr.sample_period = 1;
2098
2099         event_attr_init(&attr);
2100
2101         evsel = perf_evsel__new(&attr);
2102         if (evsel)
2103                 evsel->handler = trace__pgfault;
2104
2105         return evsel;
2106 }
2107
2108 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2109 {
2110         const u32 type = event->header.type;
2111         struct perf_evsel *evsel;
2112
2113         if (type != PERF_RECORD_SAMPLE) {
2114                 trace__process_event(trace, trace->host, event, sample);
2115                 return;
2116         }
2117
2118         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2119         if (evsel == NULL) {
2120                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2121                 return;
2122         }
2123
2124         trace__set_base_time(trace, evsel, sample);
2125
2126         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2127             sample->raw_data == NULL) {
2128                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2129                        perf_evsel__name(evsel), sample->tid,
2130                        sample->cpu, sample->raw_size);
2131         } else {
2132                 tracepoint_handler handler = evsel->handler;
2133                 handler(trace, evsel, event, sample);
2134         }
2135 }
2136
2137 static int trace__add_syscall_newtp(struct trace *trace)
2138 {
2139         int ret = -1;
2140         struct perf_evlist *evlist = trace->evlist;
2141         struct perf_evsel *sys_enter, *sys_exit;
2142
2143         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2144         if (sys_enter == NULL)
2145                 goto out;
2146
2147         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2148                 goto out_delete_sys_enter;
2149
2150         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2151         if (sys_exit == NULL)
2152                 goto out_delete_sys_enter;
2153
2154         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2155                 goto out_delete_sys_exit;
2156
2157         perf_evlist__add(evlist, sys_enter);
2158         perf_evlist__add(evlist, sys_exit);
2159
2160         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2161                 /*
2162                  * We're interested only in the user space callchain
2163                  * leading to the syscall, allow overriding that for
2164                  * debugging reasons using --kernel_syscall_callchains
2165                  */
2166                 sys_exit->attr.exclude_callchain_kernel = 1;
2167         }
2168
2169         trace->syscalls.events.sys_enter = sys_enter;
2170         trace->syscalls.events.sys_exit  = sys_exit;
2171
2172         ret = 0;
2173 out:
2174         return ret;
2175
2176 out_delete_sys_exit:
2177         perf_evsel__delete_priv(sys_exit);
2178 out_delete_sys_enter:
2179         perf_evsel__delete_priv(sys_enter);
2180         goto out;
2181 }
2182
2183 static int trace__set_ev_qualifier_filter(struct trace *trace)
2184 {
2185         int err = -1;
2186         struct perf_evsel *sys_exit;
2187         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2188                                                 trace->ev_qualifier_ids.nr,
2189                                                 trace->ev_qualifier_ids.entries);
2190
2191         if (filter == NULL)
2192                 goto out_enomem;
2193
2194         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2195                                           filter)) {
2196                 sys_exit = trace->syscalls.events.sys_exit;
2197                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2198         }
2199
2200         free(filter);
2201 out:
2202         return err;
2203 out_enomem:
2204         errno = ENOMEM;
2205         goto out;
2206 }
2207
2208 static int trace__set_filter_loop_pids(struct trace *trace)
2209 {
2210         unsigned int nr = 1;
2211         pid_t pids[32] = {
2212                 getpid(),
2213         };
2214         struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2215
2216         while (thread && nr < ARRAY_SIZE(pids)) {
2217                 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2218
2219                 if (parent == NULL)
2220                         break;
2221
2222                 if (!strcmp(thread__comm_str(parent), "sshd")) {
2223                         pids[nr++] = parent->tid;
2224                         break;
2225                 }
2226                 thread = parent;
2227         }
2228
2229         return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2230 }
2231
2232 static int trace__run(struct trace *trace, int argc, const char **argv)
2233 {
2234         struct perf_evlist *evlist = trace->evlist;
2235         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2236         int err = -1, i;
2237         unsigned long before;
2238         const bool forks = argc > 0;
2239         bool draining = false;
2240
2241         trace->live = true;
2242
2243         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2244                 goto out_error_raw_syscalls;
2245
2246         if (trace->trace_syscalls)
2247                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2248
2249         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2250                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2251                 if (pgfault_maj == NULL)
2252                         goto out_error_mem;
2253                 perf_evlist__add(evlist, pgfault_maj);
2254         }
2255
2256         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2257                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2258                 if (pgfault_min == NULL)
2259                         goto out_error_mem;
2260                 perf_evlist__add(evlist, pgfault_min);
2261         }
2262
2263         if (trace->sched &&
2264             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2265                                    trace__sched_stat_runtime))
2266                 goto out_error_sched_stat_runtime;
2267
2268         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2269         if (err < 0) {
2270                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2271                 goto out_delete_evlist;
2272         }
2273
2274         err = trace__symbols_init(trace, evlist);
2275         if (err < 0) {
2276                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2277                 goto out_delete_evlist;
2278         }
2279
2280         perf_evlist__config(evlist, &trace->opts, NULL);
2281
2282         if (callchain_param.enabled) {
2283                 bool use_identifier = false;
2284
2285                 if (trace->syscalls.events.sys_exit) {
2286                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2287                                                      &trace->opts, &callchain_param);
2288                         use_identifier = true;
2289                 }
2290
2291                 if (pgfault_maj) {
2292                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2293                         use_identifier = true;
2294                 }
2295
2296                 if (pgfault_min) {
2297                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2298                         use_identifier = true;
2299                 }
2300
2301                 if (use_identifier) {
2302                        /*
2303                         * Now we have evsels with different sample_ids, use
2304                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2305                         * from a fixed position in each ring buffer record.
2306                         *
2307                         * As of this the changeset introducing this comment, this
2308                         * isn't strictly needed, as the fields that can come before
2309                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2310                         * some of those for things like copying the payload of
2311                         * pointer syscall arguments, and for vfs_getname we don't
2312                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2313                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2314                         */
2315                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2316                         perf_evlist__reset_sample_bit(evlist, ID);
2317                 }
2318         }
2319
2320         signal(SIGCHLD, sig_handler);
2321         signal(SIGINT, sig_handler);
2322
2323         if (forks) {
2324                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2325                                                     argv, false, NULL);
2326                 if (err < 0) {
2327                         fprintf(trace->output, "Couldn't run the workload!\n");
2328                         goto out_delete_evlist;
2329                 }
2330         }
2331
2332         err = perf_evlist__open(evlist);
2333         if (err < 0)
2334                 goto out_error_open;
2335
2336         err = bpf__apply_obj_config();
2337         if (err) {
2338                 char errbuf[BUFSIZ];
2339
2340                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2341                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2342                          errbuf);
2343                 goto out_error_open;
2344         }
2345
2346         /*
2347          * Better not use !target__has_task() here because we need to cover the
2348          * case where no threads were specified in the command line, but a
2349          * workload was, and in that case we will fill in the thread_map when
2350          * we fork the workload in perf_evlist__prepare_workload.
2351          */
2352         if (trace->filter_pids.nr > 0)
2353                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2354         else if (thread_map__pid(evlist->threads, 0) == -1)
2355                 err = trace__set_filter_loop_pids(trace);
2356
2357         if (err < 0)
2358                 goto out_error_mem;
2359
2360         if (trace->ev_qualifier_ids.nr > 0) {
2361                 err = trace__set_ev_qualifier_filter(trace);
2362                 if (err < 0)
2363                         goto out_errno;
2364
2365                 pr_debug("event qualifier tracepoint filter: %s\n",
2366                          trace->syscalls.events.sys_exit->filter);
2367         }
2368
2369         err = perf_evlist__apply_filters(evlist, &evsel);
2370         if (err < 0)
2371                 goto out_error_apply_filters;
2372
2373         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2374         if (err < 0)
2375                 goto out_error_mmap;
2376
2377         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2378                 perf_evlist__enable(evlist);
2379
2380         if (forks)
2381                 perf_evlist__start_workload(evlist);
2382
2383         if (trace->opts.initial_delay) {
2384                 usleep(trace->opts.initial_delay * 1000);
2385                 perf_evlist__enable(evlist);
2386         }
2387
2388         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2389                                   evlist->threads->nr > 1 ||
2390                                   perf_evlist__first(evlist)->attr.inherit;
2391 again:
2392         before = trace->nr_events;
2393
2394         for (i = 0; i < evlist->nr_mmaps; i++) {
2395                 union perf_event *event;
2396
2397                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2398                         struct perf_sample sample;
2399
2400                         ++trace->nr_events;
2401
2402                         err = perf_evlist__parse_sample(evlist, event, &sample);
2403                         if (err) {
2404                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2405                                 goto next_event;
2406                         }
2407
2408                         trace__handle_event(trace, event, &sample);
2409 next_event:
2410                         perf_evlist__mmap_consume(evlist, i);
2411
2412                         if (interrupted)
2413                                 goto out_disable;
2414
2415                         if (done && !draining) {
2416                                 perf_evlist__disable(evlist);
2417                                 draining = true;
2418                         }
2419                 }
2420         }
2421
2422         if (trace->nr_events == before) {
2423                 int timeout = done ? 100 : -1;
2424
2425                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2426                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2427                                 draining = true;
2428
2429                         goto again;
2430                 }
2431         } else {
2432                 goto again;
2433         }
2434
2435 out_disable:
2436         thread__zput(trace->current);
2437
2438         perf_evlist__disable(evlist);
2439
2440         if (!err) {
2441                 if (trace->summary)
2442                         trace__fprintf_thread_summary(trace, trace->output);
2443
2444                 if (trace->show_tool_stats) {
2445                         fprintf(trace->output, "Stats:\n "
2446                                                " vfs_getname : %" PRIu64 "\n"
2447                                                " proc_getname: %" PRIu64 "\n",
2448                                 trace->stats.vfs_getname,
2449                                 trace->stats.proc_getname);
2450                 }
2451         }
2452
2453 out_delete_evlist:
2454         perf_evlist__delete(evlist);
2455         trace->evlist = NULL;
2456         trace->live = false;
2457         return err;
2458 {
2459         char errbuf[BUFSIZ];
2460
2461 out_error_sched_stat_runtime:
2462         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2463         goto out_error;
2464
2465 out_error_raw_syscalls:
2466         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2467         goto out_error;
2468
2469 out_error_mmap:
2470         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2471         goto out_error;
2472
2473 out_error_open:
2474         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2475
2476 out_error:
2477         fprintf(trace->output, "%s\n", errbuf);
2478         goto out_delete_evlist;
2479
2480 out_error_apply_filters:
2481         fprintf(trace->output,
2482                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2483                 evsel->filter, perf_evsel__name(evsel), errno,
2484                 str_error_r(errno, errbuf, sizeof(errbuf)));
2485         goto out_delete_evlist;
2486 }
2487 out_error_mem:
2488         fprintf(trace->output, "Not enough memory to run!\n");
2489         goto out_delete_evlist;
2490
2491 out_errno:
2492         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2493         goto out_delete_evlist;
2494 }
2495
/*
 * 'perf trace -i perf.data' mode: wire up the tool callbacks and replay a
 * previously recorded session through the same syscall/pagefault printers
 * used for live tracing.  Returns 0 on success, negative/-1 on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",       trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	/* Samples go to our printer, side-band events to the stock processors. */
	trace->tool.sample        = trace__process_sample;
	trace->tool.mmap          = perf_event__process_mmap;
	trace->tool.mmap2         = perf_event__process_mmap2;
	trace->tool.comm          = perf_event__process_comm;
	trace->tool.exit          = perf_event__process_exit;
	trace->tool.fork          = perf_event__process_fork;
	trace->tool.attr          = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id      = perf_event__process_build_id;
	trace->tool.namespaces    = perf_event__process_namespaces;

	/* sys_enter/sys_exit pairing needs events delivered in timestamp order */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Recorded software page fault events get the pagefault printer. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2594
/* Print the banner that precedes the per-thread summary; returns chars written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2603
/*
 * Re-sort a thread's per-syscall stats intlist by total time spent (msecs),
 * via the resort_rb helpers.  Each resort entry caches the syscall id, its
 * struct stats and the precomputed total in milliseconds.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats    *stats;
	double          msecs;
	int             syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr_calls * average, scaled from ns to msec */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2617
/*
 * Print one thread's per-syscall statistics table (calls, total/min/avg/max
 * time in msec, stddev as a percentage of the average), ordered by the
 * total time spent in each syscall.  Returns the number of chars printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are kept in nanoseconds; table shows msec */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative stddev, guarded against a zero average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2660
2661 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2662 {
2663         size_t printed = 0;
2664         struct thread_trace *ttrace = thread__priv(thread);
2665         double ratio;
2666
2667         if (ttrace == NULL)
2668                 return 0;
2669
2670         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2671
2672         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2673         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2674         printed += fprintf(fp, "%.1f%%", ratio);
2675         if (ttrace->pfmaj)
2676                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2677         if (ttrace->pfmin)
2678                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2679         if (trace->sched)
2680                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2681         else if (fputc('\n', fp) != EOF)
2682                 ++printed;
2683
2684         printed += thread__dump_stats(ttrace, trace, fp);
2685
2686         return printed;
2687 }
2688
2689 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2690 {
2691         return ttrace ? ttrace->nr_events : 0;
2692 }
2693
/*
 * Re-sort the machine's threads rbtree by per-thread event count via the
 * resort_rb helpers, so the summary is ordered by activity.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2700
/*
 * Print the end-of-run summary: one block per traced thread, ordered by
 * per-thread event count.  Returns the number of characters printed,
 * 0 if the resort tree could not be allocated.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2719
2720 static int trace__set_duration(const struct option *opt, const char *str,
2721                                int unset __maybe_unused)
2722 {
2723         struct trace *trace = opt->value;
2724
2725         trace->duration_filter = atof(str);
2726         return 0;
2727 }
2728
2729 static int trace__set_filter_pids(const struct option *opt, const char *str,
2730                                   int unset __maybe_unused)
2731 {
2732         int ret = -1;
2733         size_t i;
2734         struct trace *trace = opt->value;
2735         /*
2736          * FIXME: introduce a intarray class, plain parse csv and create a
2737          * { int nr, int entries[] } struct...
2738          */
2739         struct intlist *list = intlist__new(str);
2740
2741         if (list == NULL)
2742                 return -1;
2743
2744         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2745         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2746
2747         if (trace->filter_pids.entries == NULL)
2748                 goto out;
2749
2750         trace->filter_pids.entries[0] = getpid();
2751
2752         for (i = 1; i < trace->filter_pids.nr; ++i)
2753                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2754
2755         intlist__delete(list);
2756         ret = 0;
2757 out:
2758         return ret;
2759 }
2760
2761 static int trace__open_output(struct trace *trace, const char *filename)
2762 {
2763         struct stat st;
2764
2765         if (!stat(filename, &st) && st.st_size) {
2766                 char oldname[PATH_MAX];
2767
2768                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2769                 unlink(oldname);
2770                 rename(filename, oldname);
2771         }
2772
2773         trace->output = fopen(filename, "w");
2774
2775         return trace->output == NULL ? -errno : 0;
2776 }
2777
2778 static int parse_pagefaults(const struct option *opt, const char *str,
2779                             int unset __maybe_unused)
2780 {
2781         int *trace_pgfaults = opt->value;
2782
2783         if (strcmp(str, "all") == 0)
2784                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2785         else if (strcmp(str, "maj") == 0)
2786                 *trace_pgfaults |= TRACE_PFMAJ;
2787         else if (strcmp(str, "min") == 0)
2788                 *trace_pgfaults |= TRACE_PFMIN;
2789         else
2790                 return -1;
2791
2792         return 0;
2793 }
2794
/* Set the same sample handler on every evsel in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2802
2803 /*
2804  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2805  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2806  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2807  *
2808  * It'd be better to introduce a parse_options() variant that would return a
2809  * list with the terms it didn't match to an event...
2810  */
2811 static int trace__parse_events_option(const struct option *opt, const char *str,
2812                                       int unset __maybe_unused)
2813 {
2814         struct trace *trace = (struct trace *)opt->value;
2815         const char *s = str;
2816         char *sep = NULL, *lists[2] = { NULL, NULL, };
2817         int len = strlen(str) + 1, err = -1, list;
2818         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2819         char group_name[PATH_MAX];
2820
2821         if (strace_groups_dir == NULL)
2822                 return -1;
2823
2824         if (*s == '!') {
2825                 ++s;
2826                 trace->not_ev_qualifier = true;
2827         }
2828
2829         while (1) {
2830                 if ((sep = strchr(s, ',')) != NULL)
2831                         *sep = '\0';
2832
2833                 list = 0;
2834                 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2835                         list = 1;
2836                 } else {
2837                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2838                         if (access(group_name, R_OK) == 0)
2839                                 list = 1;
2840                 }
2841
2842                 if (lists[list]) {
2843                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2844                 } else {
2845                         lists[list] = malloc(len);
2846                         if (lists[list] == NULL)
2847                                 goto out;
2848                         strcpy(lists[list], s);
2849                 }
2850
2851                 if (!sep)
2852                         break;
2853
2854                 *sep = ',';
2855                 s = sep + 1;
2856         }
2857
2858         if (lists[1] != NULL) {
2859                 struct strlist_config slist_config = {
2860                         .dirname = strace_groups_dir,
2861                 };
2862
2863                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2864                 if (trace->ev_qualifier == NULL) {
2865                         fputs("Not enough memory to parse event qualifier", trace->output);
2866                         goto out;
2867                 }
2868
2869                 if (trace__validate_ev_qualifier(trace))
2870                         goto out;
2871         }
2872
2873         err = 0;
2874
2875         if (lists[0]) {
2876                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2877                                                "event selector. use 'perf list' to list available events",
2878                                                parse_events_option);
2879                 err = parse_events_option(&o, lists[0], 0);
2880         }
2881 out:
2882         if (sep)
2883                 *sep = ',';
2884
2885         return err;
2886 }
2887
/*
 * 'perf trace' entry point: parse options, then either delegate to
 * 'perf trace record', replay a perf.data file (-i), or trace live.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid       = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* dump a stack trace instead of dying silently on fatal signals */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* pagefault tracing needs the fault address and a timestamp */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX in these fields means the user didn't set them explicitly */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* stack-depth options imply callchains; default to DWARF unwinding */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* callchains need bigger mmap buffers; only root can raise them */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no workload and no target: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}