Merge tag 'for-4.17-part2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave...
[sfrench/cifs-2.6.git] / tools / perf / builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "perf.h"
12
13 #include "util/build-id.h"
14 #include "util/util.h"
15 #include <subcmd/parse-options.h>
16 #include "util/parse-events.h"
17 #include "util/config.h"
18
19 #include "util/callchain.h"
20 #include "util/cgroup.h"
21 #include "util/header.h"
22 #include "util/event.h"
23 #include "util/evlist.h"
24 #include "util/evsel.h"
25 #include "util/debug.h"
26 #include "util/drv_configs.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/cpumap.h"
31 #include "util/thread_map.h"
32 #include "util/data.h"
33 #include "util/perf_regs.h"
34 #include "util/auxtrace.h"
35 #include "util/tsc.h"
36 #include "util/parse-branch-options.h"
37 #include "util/parse-regs-options.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "asm/bug.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <locale.h>
49 #include <poll.h>
50 #include <unistd.h>
51 #include <sched.h>
52 #include <signal.h>
53 #include <sys/mman.h>
54 #include <sys/wait.h>
55 #include <linux/time64.h>
56
/*
 * Settings controlling when the output file is rotated
 * (see the 'perf record --switch-output' handling in this file).
 */
struct switch_output {
	/* When set, __cmd_record() arms switch_output_trigger. */
	bool		 enabled;
	/* Rotation may be requested by signal, see switch_output_signal(). */
	bool		 signal;
	/* Byte threshold compared against record::bytes_written; 0 disables. */
	unsigned long	 size;
	/* Non-zero enables time based rotation, see switch_output_time(). */
	unsigned long	 time;
	/* NOTE(review): presumably the raw option string - confirm at the parser. */
	const char	*str;
	/* NOTE(review): presumably "option was given" - confirm at the parser. */
	bool		 set;
};
65
66 struct record {
67         struct perf_tool        tool;
68         struct record_opts      opts;
69         u64                     bytes_written;
70         struct perf_data        data;
71         struct auxtrace_record  *itr;
72         struct perf_evlist      *evlist;
73         struct perf_session     *session;
74         int                     realtime_prio;
75         bool                    no_buildid;
76         bool                    no_buildid_set;
77         bool                    no_buildid_cache;
78         bool                    no_buildid_cache_set;
79         bool                    buildid_all;
80         bool                    timestamp_filename;
81         bool                    timestamp_boundary;
82         struct switch_output    switch_output;
83         unsigned long long      samples;
84 };
85
/* File-scope state shared by the recording code in this file. */
static volatile int auxtrace_record__snapshot_started;

/* Trigger for AUX area snapshots, armed in __cmd_record(). */
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);

/* Trigger for output file rotation, hit by record__write() on size. */
static DEFINE_TRIGGER(switch_output_trigger);
89
90 static bool switch_output_signal(struct record *rec)
91 {
92         return rec->switch_output.signal &&
93                trigger_is_ready(&switch_output_trigger);
94 }
95
96 static bool switch_output_size(struct record *rec)
97 {
98         return rec->switch_output.size &&
99                trigger_is_ready(&switch_output_trigger) &&
100                (rec->bytes_written >= rec->switch_output.size);
101 }
102
103 static bool switch_output_time(struct record *rec)
104 {
105         return rec->switch_output.time &&
106                trigger_is_ready(&switch_output_trigger);
107 }
108
109 static int record__write(struct record *rec, void *bf, size_t size)
110 {
111         if (perf_data__write(rec->session->data, bf, size) < 0) {
112                 pr_err("failed to write perf data, error: %m\n");
113                 return -1;
114         }
115
116         rec->bytes_written += size;
117
118         if (switch_output_size(rec))
119                 trigger_hit(&switch_output_trigger);
120
121         return 0;
122 }
123
124 static int process_synthesized_event(struct perf_tool *tool,
125                                      union perf_event *event,
126                                      struct perf_sample *sample __maybe_unused,
127                                      struct machine *machine __maybe_unused)
128 {
129         struct record *rec = container_of(tool, struct record, tool);
130         return record__write(rec, event, event->header.size);
131 }
132
133 static int record__pushfn(void *to, void *bf, size_t size)
134 {
135         struct record *rec = to;
136
137         rec->samples++;
138         return record__write(rec, bf, size);
139 }
140
/* Set by the signal handlers below. */
static volatile int done;
/* Last terminating signal seen, -1 while none was received. */
static volatile int signr = -1;
static volatile int child_finished;

/*
 * Common handler: SIGCHLD only flags the child as finished, any other
 * signal is remembered in signr.  Either way the recording is marked done.
 */
static void sig_handler(int sig)
{
	switch (sig) {
	case SIGCHLD:
		child_finished = 1;
		break;
	default:
		signr = sig;
		break;
	}

	done = 1;
}
154
/*
 * SIGSEGV handler: let the perf hooks recover first, then dump the
 * stack for the faulting signal.
 */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
160
161 static void record__sig_exit(void)
162 {
163         if (signr == -1)
164                 return;
165
166         signal(signr, SIG_DFL);
167         raise(signr);
168 }
169
170 #ifdef HAVE_AUXTRACE_SUPPORT
171
172 static int record__process_auxtrace(struct perf_tool *tool,
173                                     union perf_event *event, void *data1,
174                                     size_t len1, void *data2, size_t len2)
175 {
176         struct record *rec = container_of(tool, struct record, tool);
177         struct perf_data *data = &rec->data;
178         size_t padding;
179         u8 pad[8] = {0};
180
181         if (!perf_data__is_pipe(data)) {
182                 off_t file_offset;
183                 int fd = perf_data__fd(data);
184                 int err;
185
186                 file_offset = lseek(fd, 0, SEEK_CUR);
187                 if (file_offset == -1)
188                         return -1;
189                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
190                                                      event, file_offset);
191                 if (err)
192                         return err;
193         }
194
195         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
196         padding = (len1 + len2) & 7;
197         if (padding)
198                 padding = 8 - padding;
199
200         record__write(rec, event, event->header.size);
201         record__write(rec, data1, len1);
202         if (len2)
203                 record__write(rec, data2, len2);
204         record__write(rec, &pad, padding);
205
206         return 0;
207 }
208
209 static int record__auxtrace_mmap_read(struct record *rec,
210                                       struct auxtrace_mmap *mm)
211 {
212         int ret;
213
214         ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
215                                   record__process_auxtrace);
216         if (ret < 0)
217                 return ret;
218
219         if (ret)
220                 rec->samples++;
221
222         return 0;
223 }
224
225 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
226                                                struct auxtrace_mmap *mm)
227 {
228         int ret;
229
230         ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
231                                            record__process_auxtrace,
232                                            rec->opts.auxtrace_snapshot_size);
233         if (ret < 0)
234                 return ret;
235
236         if (ret)
237                 rec->samples++;
238
239         return 0;
240 }
241
242 static int record__auxtrace_read_snapshot_all(struct record *rec)
243 {
244         int i;
245         int rc = 0;
246
247         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
248                 struct auxtrace_mmap *mm =
249                                 &rec->evlist->mmap[i].auxtrace_mmap;
250
251                 if (!mm->base)
252                         continue;
253
254                 if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
255                         rc = -1;
256                         goto out;
257                 }
258         }
259 out:
260         return rc;
261 }
262
263 static void record__read_auxtrace_snapshot(struct record *rec)
264 {
265         pr_debug("Recording AUX area tracing snapshot\n");
266         if (record__auxtrace_read_snapshot_all(rec) < 0) {
267                 trigger_error(&auxtrace_snapshot_trigger);
268         } else {
269                 if (auxtrace_record__snapshot_finish(rec->itr))
270                         trigger_error(&auxtrace_snapshot_trigger);
271                 else
272                         trigger_ready(&auxtrace_snapshot_trigger);
273         }
274 }
275
276 static int record__auxtrace_init(struct record *rec)
277 {
278         int err;
279
280         if (!rec->itr) {
281                 rec->itr = auxtrace_record__init(rec->evlist, &err);
282                 if (err)
283                         return err;
284         }
285
286         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
287                                               rec->opts.auxtrace_snapshot_opts);
288         if (err)
289                 return err;
290
291         return auxtrace_parse_filters(rec->evlist);
292 }
293
294 #else
295
296 static inline
297 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
298                                struct auxtrace_mmap *mm __maybe_unused)
299 {
300         return 0;
301 }
302
303 static inline
304 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
305 {
306 }
307
308 static inline
309 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
310 {
311         return 0;
312 }
313
314 static int record__auxtrace_init(struct record *rec __maybe_unused)
315 {
316         return 0;
317 }
318
319 #endif
320
321 static int record__mmap_evlist(struct record *rec,
322                                struct perf_evlist *evlist)
323 {
324         struct record_opts *opts = &rec->opts;
325         char msg[512];
326
327         if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
328                                  opts->auxtrace_mmap_pages,
329                                  opts->auxtrace_snapshot_mode) < 0) {
330                 if (errno == EPERM) {
331                         pr_err("Permission error mapping pages.\n"
332                                "Consider increasing "
333                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
334                                "or try again with a smaller value of -m/--mmap_pages.\n"
335                                "(current value: %u,%u)\n",
336                                opts->mmap_pages, opts->auxtrace_mmap_pages);
337                         return -errno;
338                 } else {
339                         pr_err("failed to mmap with %d (%s)\n", errno,
340                                 str_error_r(errno, msg, sizeof(msg)));
341                         if (errno)
342                                 return -errno;
343                         else
344                                 return -EINVAL;
345                 }
346         }
347         return 0;
348 }
349
350 static int record__mmap(struct record *rec)
351 {
352         return record__mmap_evlist(rec, rec->evlist);
353 }
354
355 static int record__open(struct record *rec)
356 {
357         char msg[BUFSIZ];
358         struct perf_evsel *pos;
359         struct perf_evlist *evlist = rec->evlist;
360         struct perf_session *session = rec->session;
361         struct record_opts *opts = &rec->opts;
362         struct perf_evsel_config_term *err_term;
363         int rc = 0;
364
365         /*
366          * For initial_delay we need to add a dummy event so that we can track
367          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
368          * real events, the ones asked by the user.
369          */
370         if (opts->initial_delay) {
371                 if (perf_evlist__add_dummy(evlist))
372                         return -ENOMEM;
373
374                 pos = perf_evlist__first(evlist);
375                 pos->tracking = 0;
376                 pos = perf_evlist__last(evlist);
377                 pos->tracking = 1;
378                 pos->attr.enable_on_exec = 1;
379         }
380
381         perf_evlist__config(evlist, opts, &callchain_param);
382
383         evlist__for_each_entry(evlist, pos) {
384 try_again:
385                 if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
386                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
387                                 if (verbose > 0)
388                                         ui__warning("%s\n", msg);
389                                 goto try_again;
390                         }
391
392                         rc = -errno;
393                         perf_evsel__open_strerror(pos, &opts->target,
394                                                   errno, msg, sizeof(msg));
395                         ui__error("%s\n", msg);
396                         goto out;
397                 }
398
399                 pos->supported = true;
400         }
401
402         if (perf_evlist__apply_filters(evlist, &pos)) {
403                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
404                         pos->filter, perf_evsel__name(pos), errno,
405                         str_error_r(errno, msg, sizeof(msg)));
406                 rc = -1;
407                 goto out;
408         }
409
410         if (perf_evlist__apply_drv_configs(evlist, &pos, &err_term)) {
411                 pr_err("failed to set config \"%s\" on event %s with %d (%s)\n",
412                       err_term->val.drv_cfg, perf_evsel__name(pos), errno,
413                       str_error_r(errno, msg, sizeof(msg)));
414                 rc = -1;
415                 goto out;
416         }
417
418         rc = record__mmap(rec);
419         if (rc)
420                 goto out;
421
422         session->evlist = evlist;
423         perf_session__set_id_hdr_size(session);
424 out:
425         return rc;
426 }
427
428 static int process_sample_event(struct perf_tool *tool,
429                                 union perf_event *event,
430                                 struct perf_sample *sample,
431                                 struct perf_evsel *evsel,
432                                 struct machine *machine)
433 {
434         struct record *rec = container_of(tool, struct record, tool);
435
436         if (rec->evlist->first_sample_time == 0)
437                 rec->evlist->first_sample_time = sample->time;
438
439         rec->evlist->last_sample_time = sample->time;
440
441         if (rec->buildid_all)
442                 return 0;
443
444         rec->samples++;
445         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
446 }
447
448 static int process_buildids(struct record *rec)
449 {
450         struct perf_data *data = &rec->data;
451         struct perf_session *session = rec->session;
452
453         if (data->size == 0)
454                 return 0;
455
456         /*
457          * During this process, it'll load kernel map and replace the
458          * dso->long_name to a real pathname it found.  In this case
459          * we prefer the vmlinux path like
460          *   /lib/modules/3.16.4/build/vmlinux
461          *
462          * rather than build-id path (in debug directory).
463          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
464          */
465         symbol_conf.ignore_vmlinux_buildid = true;
466
467         /*
468          * If --buildid-all is given, it marks all DSO regardless of hits,
469          * so no need to process samples. But if timestamp_boundary is enabled,
470          * it still needs to walk on all samples to get the timestamps of
471          * first/last samples.
472          */
473         if (rec->buildid_all && !rec->timestamp_boundary)
474                 rec->tool.sample = NULL;
475
476         return perf_session__process_events(session);
477 }
478
479 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
480 {
481         int err;
482         struct perf_tool *tool = data;
483         /*
484          *As for guest kernel when processing subcommand record&report,
485          *we arrange module mmap prior to guest kernel mmap and trigger
486          *a preload dso because default guest module symbols are loaded
487          *from guest kallsyms instead of /lib/modules/XXX/XXX. This
488          *method is used to avoid symbol missing when the first addr is
489          *in module instead of in guest kernel.
490          */
491         err = perf_event__synthesize_modules(tool, process_synthesized_event,
492                                              machine);
493         if (err < 0)
494                 pr_err("Couldn't record guest kernel [%d]'s reference"
495                        " relocation symbol.\n", machine->pid);
496
497         /*
498          * We use _stext for guest kernel because guest kernel's /proc/kallsyms
499          * have no _text sometimes.
500          */
501         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
502                                                  machine);
503         if (err < 0)
504                 pr_err("Couldn't record guest kernel [%d]'s reference"
505                        " relocation symbol.\n", machine->pid);
506 }
507
508 static struct perf_event_header finished_round_event = {
509         .size = sizeof(struct perf_event_header),
510         .type = PERF_RECORD_FINISHED_ROUND,
511 };
512
513 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
514                                     bool overwrite)
515 {
516         u64 bytes_written = rec->bytes_written;
517         int i;
518         int rc = 0;
519         struct perf_mmap *maps;
520
521         if (!evlist)
522                 return 0;
523
524         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
525         if (!maps)
526                 return 0;
527
528         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
529                 return 0;
530
531         for (i = 0; i < evlist->nr_mmaps; i++) {
532                 struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap;
533
534                 if (maps[i].base) {
535                         if (perf_mmap__push(&maps[i], rec, record__pushfn) != 0) {
536                                 rc = -1;
537                                 goto out;
538                         }
539                 }
540
541                 if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
542                     record__auxtrace_mmap_read(rec, mm) != 0) {
543                         rc = -1;
544                         goto out;
545                 }
546         }
547
548         /*
549          * Mark the round finished in case we wrote
550          * at least one event.
551          */
552         if (bytes_written != rec->bytes_written)
553                 rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
554
555         if (overwrite)
556                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
557 out:
558         return rc;
559 }
560
561 static int record__mmap_read_all(struct record *rec)
562 {
563         int err;
564
565         err = record__mmap_read_evlist(rec, rec->evlist, false);
566         if (err)
567                 return err;
568
569         return record__mmap_read_evlist(rec, rec->evlist, true);
570 }
571
572 static void record__init_features(struct record *rec)
573 {
574         struct perf_session *session = rec->session;
575         int feat;
576
577         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
578                 perf_header__set_feat(&session->header, feat);
579
580         if (rec->no_buildid)
581                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
582
583         if (!have_tracepoints(&rec->evlist->entries))
584                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
585
586         if (!rec->opts.branch_stack)
587                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
588
589         if (!rec->opts.full_auxtrace)
590                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
591
592         perf_header__clear_feat(&session->header, HEADER_STAT);
593 }
594
595 static void
596 record__finish_output(struct record *rec)
597 {
598         struct perf_data *data = &rec->data;
599         int fd = perf_data__fd(data);
600
601         if (data->is_pipe)
602                 return;
603
604         rec->session->header.data_size += rec->bytes_written;
605         data->size = lseek(perf_data__fd(data), 0, SEEK_CUR);
606
607         if (!rec->no_buildid) {
608                 process_buildids(rec);
609
610                 if (rec->buildid_all)
611                         dsos__hit_all(rec->session);
612         }
613         perf_session__write_header(rec->session, rec->evlist, fd, true);
614
615         return;
616 }
617
618 static int record__synthesize_workload(struct record *rec, bool tail)
619 {
620         int err;
621         struct thread_map *thread_map;
622
623         if (rec->opts.tail_synthesize != tail)
624                 return 0;
625
626         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
627         if (thread_map == NULL)
628                 return -1;
629
630         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
631                                                  process_synthesized_event,
632                                                  &rec->session->machines.host,
633                                                  rec->opts.sample_address,
634                                                  rec->opts.proc_map_timeout);
635         thread_map__put(thread_map);
636         return err;
637 }
638
639 static int record__synthesize(struct record *rec, bool tail);
640
641 static int
642 record__switch_output(struct record *rec, bool at_exit)
643 {
644         struct perf_data *data = &rec->data;
645         int fd, err;
646
647         /* Same Size:      "2015122520103046"*/
648         char timestamp[] = "InvalidTimestamp";
649
650         record__synthesize(rec, true);
651         if (target__none(&rec->opts.target))
652                 record__synthesize_workload(rec, true);
653
654         rec->samples = 0;
655         record__finish_output(rec);
656         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
657         if (err) {
658                 pr_err("Failed to get current timestamp\n");
659                 return -EINVAL;
660         }
661
662         fd = perf_data__switch(data, timestamp,
663                                     rec->session->header.data_offset,
664                                     at_exit);
665         if (fd >= 0 && !at_exit) {
666                 rec->bytes_written = 0;
667                 rec->session->header.data_size = 0;
668         }
669
670         if (!quiet)
671                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
672                         data->file.path, timestamp);
673
674         /* Output tracking events */
675         if (!at_exit) {
676                 record__synthesize(rec, false);
677
678                 /*
679                  * In 'perf record --switch-output' without -a,
680                  * record__synthesize() in record__switch_output() won't
681                  * generate tracking events because there's no thread_map
682                  * in evlist. Which causes newly created perf.data doesn't
683                  * contain map and comm information.
684                  * Create a fake thread_map and directly call
685                  * perf_event__synthesize_thread_map() for those events.
686                  */
687                 if (target__none(&rec->opts.target))
688                         record__synthesize_workload(rec, false);
689         }
690         return fd;
691 }
692
693 static volatile int workload_exec_errno;
694
695 /*
696  * perf_evlist__prepare_workload will send a SIGUSR1
697  * if the fork fails, since we asked by setting its
698  * want_signal to true.
699  */
700 static void workload_exec_failed_signal(int signo __maybe_unused,
701                                         siginfo_t *info,
702                                         void *ucontext __maybe_unused)
703 {
704         workload_exec_errno = info->si_value.sival_int;
705         done = 1;
706         child_finished = 1;
707 }
708
709 static void snapshot_sig_handler(int sig);
710 static void alarm_sig_handler(int sig);
711
712 int __weak
713 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
714                             struct perf_tool *tool __maybe_unused,
715                             perf_event__handler_t process __maybe_unused,
716                             struct machine *machine __maybe_unused)
717 {
718         return 0;
719 }
720
721 static const struct perf_event_mmap_page *
722 perf_evlist__pick_pc(struct perf_evlist *evlist)
723 {
724         if (evlist) {
725                 if (evlist->mmap && evlist->mmap[0].base)
726                         return evlist->mmap[0].base;
727                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
728                         return evlist->overwrite_mmap[0].base;
729         }
730         return NULL;
731 }
732
733 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
734 {
735         const struct perf_event_mmap_page *pc;
736
737         pc = perf_evlist__pick_pc(rec->evlist);
738         if (pc)
739                 return pc;
740         return NULL;
741 }
742
743 static int record__synthesize(struct record *rec, bool tail)
744 {
745         struct perf_session *session = rec->session;
746         struct machine *machine = &session->machines.host;
747         struct perf_data *data = &rec->data;
748         struct record_opts *opts = &rec->opts;
749         struct perf_tool *tool = &rec->tool;
750         int fd = perf_data__fd(data);
751         int err = 0;
752
753         if (rec->opts.tail_synthesize != tail)
754                 return 0;
755
756         if (data->is_pipe) {
757                 /*
758                  * We need to synthesize events first, because some
759                  * features works on top of them (on report side).
760                  */
761                 err = perf_event__synthesize_attrs(tool, session,
762                                                    process_synthesized_event);
763                 if (err < 0) {
764                         pr_err("Couldn't synthesize attrs.\n");
765                         goto out;
766                 }
767
768                 err = perf_event__synthesize_features(tool, session, rec->evlist,
769                                                       process_synthesized_event);
770                 if (err < 0) {
771                         pr_err("Couldn't synthesize features.\n");
772                         return err;
773                 }
774
775                 if (have_tracepoints(&rec->evlist->entries)) {
776                         /*
777                          * FIXME err <= 0 here actually means that
778                          * there were no tracepoints so its not really
779                          * an error, just that we don't need to
780                          * synthesize anything.  We really have to
781                          * return this more properly and also
782                          * propagate errors that now are calling die()
783                          */
784                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
785                                                                   process_synthesized_event);
786                         if (err <= 0) {
787                                 pr_err("Couldn't record tracing data.\n");
788                                 goto out;
789                         }
790                         rec->bytes_written += err;
791                 }
792         }
793
794         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
795                                           process_synthesized_event, machine);
796         if (err)
797                 goto out;
798
799         if (rec->opts.full_auxtrace) {
800                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
801                                         session, process_synthesized_event);
802                 if (err)
803                         goto out;
804         }
805
806         if (!perf_evlist__exclude_kernel(rec->evlist)) {
807                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
808                                                          machine);
809                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
810                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
811                                    "Check /proc/kallsyms permission or run as root.\n");
812
813                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
814                                                      machine);
815                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
816                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
817                                    "Check /proc/modules permission or run as root.\n");
818         }
819
820         if (perf_guest) {
821                 machines__process_guests(&session->machines,
822                                          perf_event__synthesize_guest_os, tool);
823         }
824
825         err = perf_event__synthesize_extra_attr(&rec->tool,
826                                                 rec->evlist,
827                                                 process_synthesized_event,
828                                                 data->is_pipe);
829         if (err)
830                 goto out;
831
832         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
833                                                  process_synthesized_event,
834                                                 NULL);
835         if (err < 0) {
836                 pr_err("Couldn't synthesize thread map.\n");
837                 return err;
838         }
839
840         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
841                                              process_synthesized_event, NULL);
842         if (err < 0) {
843                 pr_err("Couldn't synthesize cpu map.\n");
844                 return err;
845         }
846
847         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
848                                             process_synthesized_event, opts->sample_address,
849                                             opts->proc_map_timeout, 1);
850 out:
851         return err;
852 }
853
/*
 * Run one 'perf record' session.
 *
 * Installs the exit/signal handlers, creates the perf session, prepares the
 * workload given in argv when argc > 0, opens the events via record__open(),
 * writes the header (pipe or file flavour), synthesizes the initial events
 * and then loops reading the mmap buffers.  The loop ends once a pass yields
 * no new samples and either 'done' is set or the poll set has drained
 * (perf_evlist__filter_pollfd() returned 0).  On the way out it reaps the
 * workload, calls record__synthesize(rec, true), finishes or switches the
 * output, prints a summary and deletes the session.
 *
 * @rec:  record state (options, evlist, output data, tool callbacks)
 * @argc: number of workload arguments; > 0 means a workload is forked
 * @argv: workload command line, passed to perf_evlist__prepare_workload()
 *
 * Returns -1 if the session cannot be created.  Otherwise returns 'status':
 * err when it is negative or when no workload was forked, else the workload's
 * exit status if it exited normally.  A negative return from the final
 * record__switch_output() overrides it.
 */
854 static int __cmd_record(struct record *rec, int argc, const char **argv)
855 {
856         int err;
857         int status = 0;
858         unsigned long waking = 0;
859         const bool forks = argc > 0;
860         struct perf_tool *tool = &rec->tool;
861         struct record_opts *opts = &rec->opts;
862         struct perf_data *data = &rec->data;
863         struct perf_session *session;
864         bool disabled = false, draining = false;
865         int fd;
866
867         atexit(record__sig_exit);
868         signal(SIGCHLD, sig_handler);
869         signal(SIGINT, sig_handler);
870         signal(SIGTERM, sig_handler);
871         signal(SIGSEGV, sigsegv_handler);
872
873         if (rec->opts.record_namespaces)
874                 tool->namespace_events = true;
875
            /*
             * SIGUSR2 drives both AUX snapshots and output switching; it is
             * ignored when neither feature is in use.
             */
876         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
877                 signal(SIGUSR2, snapshot_sig_handler);
878                 if (rec->opts.auxtrace_snapshot_mode)
879                         trigger_on(&auxtrace_snapshot_trigger);
880                 if (rec->switch_output.enabled)
881                         trigger_on(&switch_output_trigger);
882         } else {
883                 signal(SIGUSR2, SIG_IGN);
884         }
885
886         session = perf_session__new(data, false, tool);
887         if (session == NULL) {
888                 pr_err("Perf session creation failed.\n");
889                 return -1;
890         }
891
892         fd = perf_data__fd(data);
893         rec->session = session;
894
895         record__init_features(rec);
896
897         if (forks) {
898                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
899                                                     argv, data->is_pipe,
900                                                     workload_exec_failed_signal);
901                 if (err < 0) {
902                         pr_err("Couldn't run the workload!\n");
903                         status = err;
904                         goto out_delete_session;
905                 }
906         }
907
908         /*
909          * If we have just single event and are sending data
910          * through pipe, we need to force the ids allocation,
911          * because we synthesize event name through the pipe
912          * and need the id for that.
913          */
914         if (data->is_pipe && rec->evlist->nr_entries == 1)
915                 rec->opts.sample_id = true;
916
917         if (record__open(rec) != 0) {
918                 err = -1;
919                 goto out_child;
920         }
921
922         err = bpf__apply_obj_config();
923         if (err) {
924                 char errbuf[BUFSIZ];
925
926                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
927                 pr_err("ERROR: Apply config to BPF failed: %s\n",
928                          errbuf);
929                 goto out_child;
930         }
931
932         /*
933          * Normally perf_session__new would do this, but it doesn't have the
934          * evlist.
935          */
936         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
937                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
938                 rec->tool.ordered_events = false;
939         }
940
941         if (!rec->evlist->nr_groups)
942                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
943
944         if (data->is_pipe) {
945                 err = perf_header__write_pipe(fd);
946                 if (err < 0)
947                         goto out_child;
948         } else {
949                 err = perf_session__write_header(session, rec->evlist, fd, false);
950                 if (err < 0)
951                         goto out_child;
952         }
953
954         if (!rec->no_buildid
955             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
956                 pr_err("Couldn't generate buildids. "
957                        "Use --no-buildid to profile anyway.\n");
958                 err = -1;
959                 goto out_child;
960         }
961
962         err = record__synthesize(rec, false);
963         if (err < 0)
964                 goto out_child;
965
966         if (rec->realtime_prio) {
967                 struct sched_param param;
968
969                 param.sched_priority = rec->realtime_prio;
970                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
971                         pr_err("Could not set realtime priority.\n");
972                         err = -1;
973                         goto out_child;
974                 }
975         }
976
977         /*
978          * When perf is starting the traced process, all the events
979          * (apart from group members) have enable_on_exec=1 set,
980          * so don't spoil it by prematurely enabling them.
981          */
982         if (!target__none(&opts->target) && !opts->initial_delay)
983                 perf_evlist__enable(rec->evlist);
984
985         /*
986          * Let the child rip
987          */
988         if (forks) {
989                 struct machine *machine = &session->machines.host;
990                 union perf_event *event;
991                 pid_t tgid;
992
993                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
994                 if (event == NULL) {
995                         err = -ENOMEM;
996                         goto out_child;
997                 }
998
999                 /*
1000                  * Some H/W events are generated before COMM event
1001                  * which is emitted during exec(), so perf script
1002                  * cannot see a correct process name for those events.
1003                  * Synthesize COMM event to prevent it.
1004                  */
1005                 tgid = perf_event__synthesize_comm(tool, event,
1006                                                    rec->evlist->workload.pid,
1007                                                    process_synthesized_event,
1008                                                    machine);
1009                 free(event);
1010
                     /*
                      * NOTE(review): err is not updated on this path, so a
                      * failed COMM synthesis reaches out_child with the
                      * non-negative err left by record__synthesize() above.
                      */
1011                 if (tgid == -1)
1012                         goto out_child;
1013
1014                 event = malloc(sizeof(event->namespaces) +
1015                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1016                                machine->id_hdr_size);
1017                 if (event == NULL) {
1018                         err = -ENOMEM;
1019                         goto out_child;
1020                 }
1021
1022                 /*
1023                  * Synthesize NAMESPACES event for the command specified.
1024                  */
1025                 perf_event__synthesize_namespaces(tool, event,
1026                                                   rec->evlist->workload.pid,
1027                                                   tgid, process_synthesized_event,
1028                                                   machine);
1029                 free(event);
1030
1031                 perf_evlist__start_workload(rec->evlist);
1032         }
1033
             /* initial_delay is in milliseconds: it is scaled by USEC_PER_MSEC for usleep(). */
1034         if (opts->initial_delay) {
1035                 usleep(opts->initial_delay * USEC_PER_MSEC);
1036                 perf_evlist__enable(rec->evlist);
1037         }
1038
1039         trigger_ready(&auxtrace_snapshot_trigger);
1040         trigger_ready(&switch_output_trigger);
1041         perf_hooks__invoke_record_start();
1042         for (;;) {
                     /* Snapshot of the sample count, used below to detect an idle pass. */
1043                 unsigned long long hits = rec->samples;
1044
1045                 /*
1046                  * rec->evlist->bkw_mmap_state is possible to be
1047                  * BKW_MMAP_EMPTY here: when done == true and
1048                  * hits != rec->samples in previous round.
1049                  *
1050                  * perf_evlist__toggle_bkw_mmap ensure we never
1051                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1052                  */
1053                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1054                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1055
1056                 if (record__mmap_read_all(rec) < 0) {
1057                         trigger_error(&auxtrace_snapshot_trigger);
1058                         trigger_error(&switch_output_trigger);
1059                         err = -1;
1060                         goto out_child;
1061                 }
1062
1063                 if (auxtrace_record__snapshot_started) {
1064                         auxtrace_record__snapshot_started = 0;
1065                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1066                                 record__read_auxtrace_snapshot(rec);
1067                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1068                                 pr_err("AUX area tracing snapshot failed\n");
1069                                 err = -1;
1070                                 goto out_child;
1071                         }
1072                 }
1073
1074                 if (trigger_is_hit(&switch_output_trigger)) {
1075                         /*
1076                          * If switch_output_trigger is hit, the data in
1077                          * overwritable ring buffer should have been collected,
1078                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1079                          *
1080                          * If SIGUSR2 raise after or during record__mmap_read_all(),
1081                          * record__mmap_read_all() didn't collect data from
1082                          * overwritable ring buffer. Read again.
1083                          */
1084                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1085                                 continue;
1086                         trigger_ready(&switch_output_trigger);
1087
1088                         /*
1089                          * Reenable events in overwrite ring buffer after
1090                          * record__mmap_read_all(): we should have collected
1091                          * data from it.
1092                          */
1093                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1094
                             /* NOTE(review): waking is unsigned long but is printed with %ld. */
1095                         if (!quiet)
1096                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1097                                         waking);
1098                         waking = 0;
1099                         fd = record__switch_output(rec, false);
1100                         if (fd < 0) {
1101                                 pr_err("Failed to switch to new file\n");
1102                                 trigger_error(&switch_output_trigger);
1103                                 err = fd;
1104                                 goto out_child;
1105                         }
1106
1107                         /* re-arm the alarm */
1108                         if (rec->switch_output.time)
1109                                 alarm(rec->switch_output.time);
1110                 }
1111
                     /* No new samples in this pass: either stop, or block in poll. */
1112                 if (hits == rec->samples) {
1113                         if (done || draining)
1114                                 break;
1115                         err = perf_evlist__poll(rec->evlist, -1);
1116                         /*
1117                          * Propagate error, only if there's any. Ignore positive
1118                          * number of returned events and interrupt error.
1119                          */
1120                         if (err > 0 || (err < 0 && errno == EINTR))
1121                                 err = 0;
1122                         waking++;
1123
1124                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1125                                 draining = true;
1126                 }
1127
1128                 /*
1129                  * When perf is starting the traced process, at the end events
1130                  * die with the process and we wait for that. Thus no need to
1131                  * disable events in this case.
1132                  */
1133                 if (done && !disabled && !target__none(&opts->target)) {
1134                         trigger_off(&auxtrace_snapshot_trigger);
1135                         perf_evlist__disable(rec->evlist);
1136                         disabled = true;
1137                 }
1138         }
1139         trigger_off(&auxtrace_snapshot_trigger);
1140         trigger_off(&switch_output_trigger);
1141
1142         if (forks && workload_exec_errno) {
1143                 char msg[STRERR_BUFSIZE];
1144                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1145                 pr_err("Workload failed: %s\n", emsg);
1146                 err = -1;
1147                 goto out_child;
1148         }
1149
1150         if (!quiet)
1151                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1152
1153         if (target__none(&rec->opts.target))
1154                 record__synthesize_workload(rec, true);
1155
             /*
              * Common exit path once events may have been opened: stop and
              * reap the workload (if any) and derive 'status' from err or
              * from the child's exit status.
              */
1156 out_child:
1157         if (forks) {
1158                 int exit_status;
1159
1160                 if (!child_finished)
1161                         kill(rec->evlist->workload.pid, SIGTERM);
1162
1163                 wait(&exit_status);
1164
1165                 if (err < 0)
1166                         status = err;
1167                 else if (WIFEXITED(exit_status))
1168                         status = WEXITSTATUS(exit_status);
1169                 else if (WIFSIGNALED(exit_status))
1170                         signr = WTERMSIG(exit_status);
1171         } else
1172                 status = err;
1173
1174         record__synthesize(rec, true);
1175         /* this will be recalculated during process_buildids() */
1176         rec->samples = 0;
1177
1178         if (!err) {
1179                 if (!rec->timestamp_filename) {
1180                         record__finish_output(rec);
1181                 } else {
1182                         fd = record__switch_output(rec, true);
1183                         if (fd < 0) {
1184                                 status = fd;
1185                                 goto out_delete_session;
1186                         }
1187                 }
1188         }
1189
1190         perf_hooks__invoke_record_end();
1191
1192         if (!err && !quiet) {
1193                 char samples[128];
1194                 const char *postfix = rec->timestamp_filename ?
1195                                         ".<timestamp>" : "";
1196
1197                 if (rec->samples && !rec->opts.full_auxtrace)
1198                         scnprintf(samples, sizeof(samples),
1199                                   " (%" PRIu64 " samples)", rec->samples);
1200                 else
1201                         samples[0] = '\0';
1202
1203                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n",
1204                         perf_data__size(data) / 1024.0 / 1024.0,
1205                         data->file.path, postfix, samples);
1206         }
1207
1208 out_delete_session:
1209         perf_session__delete(session);
1210         return status;
1211 }
1212
1213 static void callchain_debug(struct callchain_param *callchain)
1214 {
1215         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1216
1217         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1218
1219         if (callchain->record_mode == CALLCHAIN_DWARF)
1220                 pr_debug("callchain: stack dump size %d\n",
1221                          callchain->dump_size);
1222 }
1223
1224 int record_opts__parse_callchain(struct record_opts *record,
1225                                  struct callchain_param *callchain,
1226                                  const char *arg, bool unset)
1227 {
1228         int ret;
1229         callchain->enabled = !unset;
1230
1231         /* --no-call-graph */
1232         if (unset) {
1233                 callchain->record_mode = CALLCHAIN_NONE;
1234                 pr_debug("callchain: disabled\n");
1235                 return 0;
1236         }
1237
1238         ret = parse_callchain_record_opt(arg, callchain);
1239         if (!ret) {
1240                 /* Enable data address sampling for DWARF unwind. */
1241                 if (callchain->record_mode == CALLCHAIN_DWARF)
1242                         record->sample_address = true;
1243                 callchain_debug(callchain);
1244         }
1245
1246         return ret;
1247 }
1248
1249 int record_parse_callchain_opt(const struct option *opt,
1250                                const char *arg,
1251                                int unset)
1252 {
1253         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1254 }
1255
1256 int record_callchain_opt(const struct option *opt,
1257                          const char *arg __maybe_unused,
1258                          int unset __maybe_unused)
1259 {
1260         struct callchain_param *callchain = opt->value;
1261
1262         callchain->enabled = true;
1263
1264         if (callchain->record_mode == CALLCHAIN_NONE)
1265                 callchain->record_mode = CALLCHAIN_FP;
1266
1267         callchain_debug(callchain);
1268         return 0;
1269 }
1270
1271 static int perf_record_config(const char *var, const char *value, void *cb)
1272 {
1273         struct record *rec = cb;
1274
1275         if (!strcmp(var, "record.build-id")) {
1276                 if (!strcmp(value, "cache"))
1277                         rec->no_buildid_cache = false;
1278                 else if (!strcmp(value, "no-cache"))
1279                         rec->no_buildid_cache = true;
1280                 else if (!strcmp(value, "skip"))
1281                         rec->no_buildid = true;
1282                 else
1283                         return -1;
1284                 return 0;
1285         }
1286         if (!strcmp(var, "record.call-graph")) {
1287                 var = "call-graph.record-mode";
1288                 return perf_default_config(var, value, cb);
1289         }
1290
1291         return 0;
1292 }
1293
/* One entry of the name -> clockid table searched by parse_clockid(). */
1294 struct clockid_map {
1295         const char *name;
1296         int clockid;
1297 };
1298
/* Build a table entry from a name string and a clockid expression. */
1299 #define CLOCKID_MAP(n, c)       \
1300         { .name = n, .clockid = (c), }
1301
/* Table terminator: parse_clockid() stops at the first NULL name. */
1302 #define CLOCKID_END     { .name = NULL, }
1303
1304
1305 /*
1306  * Add the missing ones, we need to build on many distros...
1307  */
1308 #ifndef CLOCK_MONOTONIC_RAW
1309 #define CLOCK_MONOTONIC_RAW 4
1310 #endif
1311 #ifndef CLOCK_BOOTTIME
1312 #define CLOCK_BOOTTIME 7
1313 #endif
1314 #ifndef CLOCK_TAI
1315 #define CLOCK_TAI 11
1316 #endif
1317
/*
 * Clock names accepted by parse_clockid().  Names are compared
 * case-insensitively there, and an optional "CLOCK_" prefix is stripped
 * before the lookup, so only the bare lower-case names are listed.
 */
1318 static const struct clockid_map clockids[] = {
1319         /* available for all events, NMI safe */
1320         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1321         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1322
1323         /* available for some events */
1324         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1325         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1326         CLOCKID_MAP("tai", CLOCK_TAI),
1327
1328         /* available for the lazy */
1329         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1330         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1331         CLOCKID_MAP("real", CLOCK_REALTIME),
1332         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1333
1334         CLOCKID_END,
1335 };
1336
1337 static int parse_clockid(const struct option *opt, const char *str, int unset)
1338 {
1339         struct record_opts *opts = (struct record_opts *)opt->value;
1340         const struct clockid_map *cm;
1341         const char *ostr = str;
1342
1343         if (unset) {
1344                 opts->use_clockid = 0;
1345                 return 0;
1346         }
1347
1348         /* no arg passed */
1349         if (!str)
1350                 return 0;
1351
1352         /* no setting it twice */
1353         if (opts->use_clockid)
1354                 return -1;
1355
1356         opts->use_clockid = true;
1357
1358         /* if its a number, we're done */
1359         if (sscanf(str, "%d", &opts->clockid) == 1)
1360                 return 0;
1361
1362         /* allow a "CLOCK_" prefix to the name */
1363         if (!strncasecmp(str, "CLOCK_", 6))
1364                 str += 6;
1365
1366         for (cm = clockids; cm->name; cm++) {
1367                 if (!strcasecmp(str, cm->name)) {
1368                         opts->clockid = cm->clockid;
1369                         return 0;
1370                 }
1371         }
1372
1373         opts->use_clockid = false;
1374         ui__warning("unknown clockid %s, check man page\n", ostr);
1375         return -1;
1376 }
1377
1378 static int record__parse_mmap_pages(const struct option *opt,
1379                                     const char *str,
1380                                     int unset __maybe_unused)
1381 {
1382         struct record_opts *opts = opt->value;
1383         char *s, *p;
1384         unsigned int mmap_pages;
1385         int ret;
1386
1387         if (!str)
1388                 return -EINVAL;
1389
1390         s = strdup(str);
1391         if (!s)
1392                 return -ENOMEM;
1393
1394         p = strchr(s, ',');
1395         if (p)
1396                 *p = '\0';
1397
1398         if (*s) {
1399                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1400                 if (ret)
1401                         goto out_free;
1402                 opts->mmap_pages = mmap_pages;
1403         }
1404
1405         if (!p) {
1406                 ret = 0;
1407                 goto out_free;
1408         }
1409
1410         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1411         if (ret)
1412                 goto out_free;
1413
1414         opts->auxtrace_mmap_pages = mmap_pages;
1415
1416 out_free:
1417         free(s);
1418         return ret;
1419 }
1420
1421 static void switch_output_size_warn(struct record *rec)
1422 {
1423         u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1424         struct switch_output *s = &rec->switch_output;
1425
1426         wakeup_size /= 2;
1427
1428         if (s->size < wakeup_size) {
1429                 char buf[100];
1430
1431                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1432                 pr_warning("WARNING: switch-output data size lower than "
1433                            "wakeup kernel buffer size (%s) "
1434                            "expect bigger perf.data sizes\n", buf);
1435         }
1436 }
1437
1438 static int switch_output_setup(struct record *rec)
1439 {
1440         struct switch_output *s = &rec->switch_output;
1441         static struct parse_tag tags_size[] = {
1442                 { .tag  = 'B', .mult = 1       },
1443                 { .tag  = 'K', .mult = 1 << 10 },
1444                 { .tag  = 'M', .mult = 1 << 20 },
1445                 { .tag  = 'G', .mult = 1 << 30 },
1446                 { .tag  = 0 },
1447         };
1448         static struct parse_tag tags_time[] = {
1449                 { .tag  = 's', .mult = 1        },
1450                 { .tag  = 'm', .mult = 60       },
1451                 { .tag  = 'h', .mult = 60*60    },
1452                 { .tag  = 'd', .mult = 60*60*24 },
1453                 { .tag  = 0 },
1454         };
1455         unsigned long val;
1456
1457         if (!s->set)
1458                 return 0;
1459
1460         if (!strcmp(s->str, "signal")) {
1461                 s->signal = true;
1462                 pr_debug("switch-output with SIGUSR2 signal\n");
1463                 goto enabled;
1464         }
1465
1466         val = parse_tag_value(s->str, tags_size);
1467         if (val != (unsigned long) -1) {
1468                 s->size = val;
1469                 pr_debug("switch-output with %s size threshold\n", s->str);
1470                 goto enabled;
1471         }
1472
1473         val = parse_tag_value(s->str, tags_time);
1474         if (val != (unsigned long) -1) {
1475                 s->time = val;
1476                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
1477                          s->str, s->time);
1478                 goto enabled;
1479         }
1480
1481         return -1;
1482
1483 enabled:
1484         rec->timestamp_filename = true;
1485         s->enabled              = true;
1486
1487         if (s->size && !rec->opts.no_buffering)
1488                 switch_output_size_warn(rec);
1489
1490         return 0;
1491 }
1492
/* Usage synopsis lines for 'perf record'; the array is NULL-terminated. */
1493 static const char * const __record_usage[] = {
1494         "perf record [<options>] [<command>]",
1495         "perf record [<options>] -- <command> [<options>]",
1496         NULL
1497 };
/* Externally visible pointer to the usage lines above. */
1498 const char * const *record_usage = __record_usage;
1499
1500 /*
1501  * XXX Ideally would be local to cmd_record() and passed to a record__new
1502  * because we need to have access to it in record__exit, that is called
1503  * after cmd_record() exits, but since record_options need to be accessible to
1504  * builtin-script, leave it here.
1505  *
1506  * At least we don't touch it in all the other functions here directly.
1507  *
1508  * Just say no to tons of global variables, sigh.
1509  */
/*
 * The single global record instance, with its built-in defaults.  The
 * option table in this file stores parsed values directly into its fields.
 */
1510 static struct record record = {
1511         .opts = {
1512                 .sample_time         = true,
                     /* NOTE(review): UINT_MAX/ULLONG_MAX presumably mean "not set by the user" - confirm. */
1513                 .mmap_pages          = UINT_MAX,
1514                 .user_freq           = UINT_MAX,
1515                 .user_interval       = ULLONG_MAX,
1516                 .freq                = 4000,
1517                 .target              = {
1518                         .uses_mmap   = true,
1519                         .default_per_cpu = true,
1520                 },
                     /* In ms, per the --proc-map-timeout help text. */
1521                 .proc_map_timeout     = 500,
1522         },
             /* Tool callbacks installed for this session. */
1523         .tool = {
1524                 .sample         = process_sample_event,
1525                 .fork           = perf_event__process_fork,
1526                 .exit           = perf_event__process_exit,
1527                 .comm           = perf_event__process_comm,
1528                 .namespaces     = perf_event__process_namespaces,
1529                 .mmap           = perf_event__process_mmap,
1530                 .mmap2          = perf_event__process_mmap2,
1531                 .ordered_events = true,
1532         },
1533 };
1534
/*
 * Help text of the --call-graph option: the CALLCHAIN_RECORD_HELP text
 * followed by a line naming the default mode.
 */
1535 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
1536         "\n\t\t\t\tDefault: fp";
1537
/* NOTE(review): not referenced in the visible part of the file; presumably set by an option defined further down - confirm. */
1538 static bool dry_run;
1539
1540 /*
1541  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1542  * with it and switch to use the library functions in perf_evlist that came
1543  * from builtin-record.c, i.e. use record_opts,
1544  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1545  * using pipes, etc.
1546  */
1547 static struct option __record_options[] = {
1548         OPT_CALLBACK('e', "event", &record.evlist, "event",
1549                      "event selector. use 'perf list' to list available events",
1550                      parse_events_option),
1551         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1552                      "event filter", parse_filter),
1553         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1554                            NULL, "don't record events from perf itself",
1555                            exclude_perf),
1556         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1557                     "record events on existing process id"),
1558         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1559                     "record events on existing thread id"),
1560         OPT_INTEGER('r', "realtime", &record.realtime_prio,
1561                     "collect data with this RT SCHED_FIFO priority"),
1562         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1563                     "collect data without buffering"),
1564         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1565                     "collect raw sample records from all opened counters"),
1566         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1567                             "system-wide collection from all CPUs"),
1568         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1569                     "list of cpus to monitor"),
1570         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1571         OPT_STRING('o', "output", &record.data.file.path, "file",
1572                     "output file name"),
1573         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1574                         &record.opts.no_inherit_set,
1575                         "child tasks do not inherit counters"),
1576         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
1577                     "synthesize non-sample events at the end of output"),
1578         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
1579         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
1580                     "Fail if the specified frequency can't be used"),
1581         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
1582                      "profile at this frequency",
1583                       record__parse_freq),
1584         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1585                      "number of mmap data pages and AUX area tracing mmap pages",
1586                      record__parse_mmap_pages),
1587         OPT_BOOLEAN(0, "group", &record.opts.group,
1588                     "put the counters into a counter group"),
1589         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
1590                            NULL, "enables call-graph recording" ,
1591                            &record_callchain_opt),
1592         OPT_CALLBACK(0, "call-graph", &record.opts,
1593                      "record_mode[,record_size]", record_callchain_help,
1594                      &record_parse_callchain_opt),
1595         OPT_INCR('v', "verbose", &verbose,
1596                     "be more verbose (show counter open errors, etc)"),
1597         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1598         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1599                     "per thread counts"),
1600         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
1601         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
1602                     "Record the sample physical addresses"),
1603         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
1604         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
1605                         &record.opts.sample_time_set,
1606                         "Record the sample timestamps"),
1607         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
1608                         "Record the sample period"),
1609         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1610                     "don't sample"),
1611         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
1612                         &record.no_buildid_cache_set,
1613                         "do not update the buildid cache"),
1614         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
1615                         &record.no_buildid_set,
1616                         "do not collect buildids in perf.data"),
1617         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1618                      "monitor event in cgroup name only",
1619                      parse_cgroups),
1620         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1621                   "ms to wait before starting measurement after program start"),
1622         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1623                    "user to profile"),
1624
1625         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1626                      "branch any", "sample any taken branches",
1627                      parse_branch_stack),
1628
1629         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1630                      "branch filter mask", "branch stack filter modes",
1631                      parse_branch_stack),
1632         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
1633                     "sample by weight (on special events only)"),
1634         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
1635                     "sample transaction flags (special events only)"),
1636         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
1637                     "use per-thread mmaps"),
1638         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
1639                     "sample selected machine registers on interrupt,"
1640                     " use -I ? to list register names", parse_regs),
1641         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
1642                     "sample selected machine registers on interrupt,"
1643                     " use -I ? to list register names", parse_regs),
1644         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
1645                     "Record running/enabled time of read (:S) events"),
1646         OPT_CALLBACK('k', "clockid", &record.opts,
1647         "clockid", "clockid to use for events, see clock_gettime()",
1648         parse_clockid),
1649         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
1650                           "opts", "AUX area tracing Snapshot Mode", ""),
1651         OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
1652                         "per thread proc mmap processing timeout in ms"),
1653         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
1654                     "Record namespaces events"),
1655         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
1656                     "Record context switch events"),
1657         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
1658                          "Configure all used events to run in kernel space.",
1659                          PARSE_OPT_EXCLUSIVE),
1660         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
1661                          "Configure all used events to run in user space.",
1662                          PARSE_OPT_EXCLUSIVE),
1663         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
1664                    "clang binary to use for compiling BPF scriptlets"),
1665         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
1666                    "options passed to clang when compiling BPF scriptlets"),
1667         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
1668                    "file", "vmlinux pathname"),
1669         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
1670                     "Record build-id of all DSOs regardless of hits"),
1671         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
1672                     "append timestamp to output filename"),
1673         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
1674                     "Record timestamp boundary (time of first/last samples)"),
1675         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
1676                           &record.switch_output.set, "signal,size,time",
1677                           "Switch output when receive SIGUSR2 or cross size,time threshold",
1678                           "signal"),
1679         OPT_BOOLEAN(0, "dry-run", &dry_run,
1680                     "Parse options then exit"),
1681         OPT_END()
1682 };
1683
1684 struct option *record_options = __record_options;
1685
1686 int cmd_record(int argc, const char **argv)
1687 {
1688         int err;
1689         struct record *rec = &record;
1690         char errbuf[BUFSIZ];
1691
1692         setlocale(LC_ALL, "");
1693
1694 #ifndef HAVE_LIBBPF_SUPPORT
1695 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
1696         set_nobuild('\0', "clang-path", true);
1697         set_nobuild('\0', "clang-opt", true);
1698 # undef set_nobuild
1699 #endif
1700
1701 #ifndef HAVE_BPF_PROLOGUE
1702 # if !defined (HAVE_DWARF_SUPPORT)
1703 #  define REASON  "NO_DWARF=1"
1704 # elif !defined (HAVE_LIBBPF_SUPPORT)
1705 #  define REASON  "NO_LIBBPF=1"
1706 # else
1707 #  define REASON  "this architecture doesn't support BPF prologue"
1708 # endif
1709 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
1710         set_nobuild('\0', "vmlinux", true);
1711 # undef set_nobuild
1712 # undef REASON
1713 #endif
1714
1715         rec->evlist = perf_evlist__new();
1716         if (rec->evlist == NULL)
1717                 return -ENOMEM;
1718
1719         err = perf_config(perf_record_config, rec);
1720         if (err)
1721                 return err;
1722
1723         argc = parse_options(argc, argv, record_options, record_usage,
1724                             PARSE_OPT_STOP_AT_NON_OPTION);
1725         if (quiet)
1726                 perf_quiet_option();
1727
1728         /* Make system wide (-a) the default target. */
1729         if (!argc && target__none(&rec->opts.target))
1730                 rec->opts.target.system_wide = true;
1731
1732         if (nr_cgroups && !rec->opts.target.system_wide) {
1733                 usage_with_options_msg(record_usage, record_options,
1734                         "cgroup monitoring only available in system-wide mode");
1735
1736         }
1737         if (rec->opts.record_switch_events &&
1738             !perf_can_record_switch_events()) {
1739                 ui__error("kernel does not support recording context switch events\n");
1740                 parse_options_usage(record_usage, record_options, "switch-events", 0);
1741                 return -EINVAL;
1742         }
1743
1744         if (switch_output_setup(rec)) {
1745                 parse_options_usage(record_usage, record_options, "switch-output", 0);
1746                 return -EINVAL;
1747         }
1748
1749         if (rec->switch_output.time) {
1750                 signal(SIGALRM, alarm_sig_handler);
1751                 alarm(rec->switch_output.time);
1752         }
1753
1754         /*
1755          * Allow aliases to facilitate the lookup of symbols for address
1756          * filters. Refer to auxtrace_parse_filters().
1757          */
1758         symbol_conf.allow_aliases = true;
1759
1760         symbol__init(NULL);
1761
1762         err = record__auxtrace_init(rec);
1763         if (err)
1764                 goto out;
1765
1766         if (dry_run)
1767                 goto out;
1768
1769         err = bpf__setup_stdout(rec->evlist);
1770         if (err) {
1771                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
1772                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
1773                          errbuf);
1774                 goto out;
1775         }
1776
1777         err = -ENOMEM;
1778
1779         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
1780                 pr_warning(
1781 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1782 "check /proc/sys/kernel/kptr_restrict.\n\n"
1783 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1784 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1785 "Samples in kernel modules won't be resolved at all.\n\n"
1786 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1787 "even with a suitable vmlinux or kallsyms file.\n\n");
1788
1789         if (rec->no_buildid_cache || rec->no_buildid) {
1790                 disable_buildid_cache();
1791         } else if (rec->switch_output.enabled) {
1792                 /*
1793                  * In 'perf record --switch-output', disable buildid
1794                  * generation by default to reduce data file switching
1795                  * overhead. Still generate buildid if they are required
1796                  * explicitly using
1797                  *
1798                  *  perf record --switch-output --no-no-buildid \
1799                  *              --no-no-buildid-cache
1800                  *
1801                  * Following code equals to:
1802                  *
1803                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
1804                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
1805                  *         disable_buildid_cache();
1806                  */
1807                 bool disable = true;
1808
1809                 if (rec->no_buildid_set && !rec->no_buildid)
1810                         disable = false;
1811                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
1812                         disable = false;
1813                 if (disable) {
1814                         rec->no_buildid = true;
1815                         rec->no_buildid_cache = true;
1816                         disable_buildid_cache();
1817                 }
1818         }
1819
1820         if (record.opts.overwrite)
1821                 record.opts.tail_synthesize = true;
1822
1823         if (rec->evlist->nr_entries == 0 &&
1824             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
1825                 pr_err("Not enough memory for event selector list\n");
1826                 goto out;
1827         }
1828
1829         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1830                 rec->opts.no_inherit = true;
1831
1832         err = target__validate(&rec->opts.target);
1833         if (err) {
1834                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1835                 ui__warning("%s\n", errbuf);
1836         }
1837
1838         err = target__parse_uid(&rec->opts.target);
1839         if (err) {
1840                 int saved_errno = errno;
1841
1842                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1843                 ui__error("%s", errbuf);
1844
1845                 err = -saved_errno;
1846                 goto out;
1847         }
1848
1849         /* Enable ignoring missing threads when -u/-p option is defined. */
1850         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
1851
1852         err = -ENOMEM;
1853         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1854                 usage_with_options(record_usage, record_options);
1855
1856         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1857         if (err)
1858                 goto out;
1859
1860         /*
1861          * We take all buildids when the file contains
1862          * AUX area tracing data because we do not decode the
1863          * trace because it would take too long.
1864          */
1865         if (rec->opts.full_auxtrace)
1866                 rec->buildid_all = true;
1867
1868         if (record_opts__config(&rec->opts)) {
1869                 err = -EINVAL;
1870                 goto out;
1871         }
1872
1873         err = __cmd_record(&record, argc, argv);
1874 out:
1875         perf_evlist__delete(rec->evlist);
1876         symbol__exit();
1877         auxtrace_record__free(rec->itr);
1878         return err;
1879 }
1880
1881 static void snapshot_sig_handler(int sig __maybe_unused)
1882 {
1883         struct record *rec = &record;
1884
1885         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1886                 trigger_hit(&auxtrace_snapshot_trigger);
1887                 auxtrace_record__snapshot_started = 1;
1888                 if (auxtrace_record__snapshot_start(record.itr))
1889                         trigger_error(&auxtrace_snapshot_trigger);
1890         }
1891
1892         if (switch_output_signal(rec))
1893                 trigger_hit(&switch_output_trigger);
1894 }
1895
1896 static void alarm_sig_handler(int sig __maybe_unused)
1897 {
1898         struct record *rec = &record;
1899
1900         if (switch_output_time(rec))
1901                 trigger_hit(&switch_output_trigger);
1902 }