blk-mq: fix corruption with direct issue
[sfrench/cifs-2.6.git] / tools / perf / builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "perf.h"
12
13 #include "util/build-id.h"
14 #include "util/util.h"
15 #include <subcmd/parse-options.h>
16 #include "util/parse-events.h"
17 #include "util/config.h"
18
19 #include "util/callchain.h"
20 #include "util/cgroup.h"
21 #include "util/header.h"
22 #include "util/event.h"
23 #include "util/evlist.h"
24 #include "util/evsel.h"
25 #include "util/debug.h"
26 #include "util/drv_configs.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/cpumap.h"
31 #include "util/thread_map.h"
32 #include "util/data.h"
33 #include "util/perf_regs.h"
34 #include "util/auxtrace.h"
35 #include "util/tsc.h"
36 #include "util/parse-branch-options.h"
37 #include "util/parse-regs-options.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "asm/bug.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <locale.h>
49 #include <poll.h>
50 #include <unistd.h>
51 #include <sched.h>
52 #include <signal.h>
53 #include <sys/mman.h>
54 #include <sys/wait.h>
55 #include <linux/time64.h>
56
/*
 * Settings controlling when the output file is switched
 * ('perf record --switch-output').
 */
struct switch_output {
	/* feature is on; __cmd_record() arms switch_output_trigger when set */
	bool		 enabled;
	/* switch when the trigger is ready and signal-based switching is on */
	bool		 signal;
	/* non-zero: switch once rec->bytes_written reaches this many bytes */
	unsigned long	 size;
	/* non-zero: time-based switching is on (unit not visible here) */
	unsigned long	 time;
	/* presumably the raw option string; verify against the option parser */
	const char	*str;
	/* presumably records that the option was given; verify against users */
	bool		 set;
};
65
66 struct record {
67         struct perf_tool        tool;
68         struct record_opts      opts;
69         u64                     bytes_written;
70         struct perf_data        data;
71         struct auxtrace_record  *itr;
72         struct perf_evlist      *evlist;
73         struct perf_session     *session;
74         int                     realtime_prio;
75         bool                    no_buildid;
76         bool                    no_buildid_set;
77         bool                    no_buildid_cache;
78         bool                    no_buildid_cache_set;
79         bool                    buildid_all;
80         bool                    timestamp_filename;
81         bool                    timestamp_boundary;
82         struct switch_output    switch_output;
83         unsigned long long      samples;
84 };
85
/* Not written by any code visible in this part of the file. */
static volatile int auxtrace_record__snapshot_started;
/* Trigger driving AUX area snapshot reads (see record__read_auxtrace_snapshot()). */
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
/* Trigger driving output file switching (see the switch_output_*() helpers). */
static DEFINE_TRIGGER(switch_output_trigger);
89
90 static bool switch_output_signal(struct record *rec)
91 {
92         return rec->switch_output.signal &&
93                trigger_is_ready(&switch_output_trigger);
94 }
95
96 static bool switch_output_size(struct record *rec)
97 {
98         return rec->switch_output.size &&
99                trigger_is_ready(&switch_output_trigger) &&
100                (rec->bytes_written >= rec->switch_output.size);
101 }
102
103 static bool switch_output_time(struct record *rec)
104 {
105         return rec->switch_output.time &&
106                trigger_is_ready(&switch_output_trigger);
107 }
108
109 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
110                          void *bf, size_t size)
111 {
112         struct perf_data_file *file = &rec->session->data->file;
113
114         if (perf_data_file__write(file, bf, size) < 0) {
115                 pr_err("failed to write perf data, error: %m\n");
116                 return -1;
117         }
118
119         rec->bytes_written += size;
120
121         if (switch_output_size(rec))
122                 trigger_hit(&switch_output_trigger);
123
124         return 0;
125 }
126
127 static int process_synthesized_event(struct perf_tool *tool,
128                                      union perf_event *event,
129                                      struct perf_sample *sample __maybe_unused,
130                                      struct machine *machine __maybe_unused)
131 {
132         struct record *rec = container_of(tool, struct record, tool);
133         return record__write(rec, NULL, event, event->header.size);
134 }
135
136 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
137 {
138         struct record *rec = to;
139
140         rec->samples++;
141         return record__write(rec, map, bf, size);
142 }
143
/* Set by the signal handlers to request that recording stops. */
static volatile int done;
/* Last terminating signal seen, or -1 when none was received. */
static volatile int signr = -1;
/* Set once SIGCHLD has been seen. */
static volatile int child_finished;

/*
 * Common handler: SIGCHLD marks the child as finished, any other
 * signal is remembered in signr.  Either way recording is asked to stop.
 */
static void sig_handler(int sig)
{
	switch (sig) {
	case SIGCHLD:
		child_finished = 1;
		break;
	default:
		signr = sig;
		break;
	}

	done = 1;
}
157
/*
 * SIGSEGV handler: let the perf hooks code attempt recovery first,
 * then dump the stack for @sig.
 */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
163
164 static void record__sig_exit(void)
165 {
166         if (signr == -1)
167                 return;
168
169         signal(signr, SIG_DFL);
170         raise(signr);
171 }
172
173 #ifdef HAVE_AUXTRACE_SUPPORT
174
175 static int record__process_auxtrace(struct perf_tool *tool,
176                                     struct perf_mmap *map,
177                                     union perf_event *event, void *data1,
178                                     size_t len1, void *data2, size_t len2)
179 {
180         struct record *rec = container_of(tool, struct record, tool);
181         struct perf_data *data = &rec->data;
182         size_t padding;
183         u8 pad[8] = {0};
184
185         if (!perf_data__is_pipe(data)) {
186                 off_t file_offset;
187                 int fd = perf_data__fd(data);
188                 int err;
189
190                 file_offset = lseek(fd, 0, SEEK_CUR);
191                 if (file_offset == -1)
192                         return -1;
193                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
194                                                      event, file_offset);
195                 if (err)
196                         return err;
197         }
198
199         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
200         padding = (len1 + len2) & 7;
201         if (padding)
202                 padding = 8 - padding;
203
204         record__write(rec, map, event, event->header.size);
205         record__write(rec, map, data1, len1);
206         if (len2)
207                 record__write(rec, map, data2, len2);
208         record__write(rec, map, &pad, padding);
209
210         return 0;
211 }
212
213 static int record__auxtrace_mmap_read(struct record *rec,
214                                       struct perf_mmap *map)
215 {
216         int ret;
217
218         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
219                                   record__process_auxtrace);
220         if (ret < 0)
221                 return ret;
222
223         if (ret)
224                 rec->samples++;
225
226         return 0;
227 }
228
229 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
230                                                struct perf_mmap *map)
231 {
232         int ret;
233
234         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
235                                            record__process_auxtrace,
236                                            rec->opts.auxtrace_snapshot_size);
237         if (ret < 0)
238                 return ret;
239
240         if (ret)
241                 rec->samples++;
242
243         return 0;
244 }
245
246 static int record__auxtrace_read_snapshot_all(struct record *rec)
247 {
248         int i;
249         int rc = 0;
250
251         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
252                 struct perf_mmap *map = &rec->evlist->mmap[i];
253
254                 if (!map->auxtrace_mmap.base)
255                         continue;
256
257                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
258                         rc = -1;
259                         goto out;
260                 }
261         }
262 out:
263         return rc;
264 }
265
266 static void record__read_auxtrace_snapshot(struct record *rec)
267 {
268         pr_debug("Recording AUX area tracing snapshot\n");
269         if (record__auxtrace_read_snapshot_all(rec) < 0) {
270                 trigger_error(&auxtrace_snapshot_trigger);
271         } else {
272                 if (auxtrace_record__snapshot_finish(rec->itr))
273                         trigger_error(&auxtrace_snapshot_trigger);
274                 else
275                         trigger_ready(&auxtrace_snapshot_trigger);
276         }
277 }
278
279 static int record__auxtrace_init(struct record *rec)
280 {
281         int err;
282
283         if (!rec->itr) {
284                 rec->itr = auxtrace_record__init(rec->evlist, &err);
285                 if (err)
286                         return err;
287         }
288
289         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
290                                               rec->opts.auxtrace_snapshot_opts);
291         if (err)
292                 return err;
293
294         return auxtrace_parse_filters(rec->evlist);
295 }
296
297 #else
298
299 static inline
300 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
301                                struct perf_mmap *map __maybe_unused)
302 {
303         return 0;
304 }
305
306 static inline
307 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
308 {
309 }
310
311 static inline
312 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
313 {
314         return 0;
315 }
316
317 static int record__auxtrace_init(struct record *rec __maybe_unused)
318 {
319         return 0;
320 }
321
322 #endif
323
324 static int record__mmap_evlist(struct record *rec,
325                                struct perf_evlist *evlist)
326 {
327         struct record_opts *opts = &rec->opts;
328         char msg[512];
329
330         if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
331                                  opts->auxtrace_mmap_pages,
332                                  opts->auxtrace_snapshot_mode) < 0) {
333                 if (errno == EPERM) {
334                         pr_err("Permission error mapping pages.\n"
335                                "Consider increasing "
336                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
337                                "or try again with a smaller value of -m/--mmap_pages.\n"
338                                "(current value: %u,%u)\n",
339                                opts->mmap_pages, opts->auxtrace_mmap_pages);
340                         return -errno;
341                 } else {
342                         pr_err("failed to mmap with %d (%s)\n", errno,
343                                 str_error_r(errno, msg, sizeof(msg)));
344                         if (errno)
345                                 return -errno;
346                         else
347                                 return -EINVAL;
348                 }
349         }
350         return 0;
351 }
352
353 static int record__mmap(struct record *rec)
354 {
355         return record__mmap_evlist(rec, rec->evlist);
356 }
357
358 static int record__open(struct record *rec)
359 {
360         char msg[BUFSIZ];
361         struct perf_evsel *pos;
362         struct perf_evlist *evlist = rec->evlist;
363         struct perf_session *session = rec->session;
364         struct record_opts *opts = &rec->opts;
365         struct perf_evsel_config_term *err_term;
366         int rc = 0;
367
368         /*
369          * For initial_delay we need to add a dummy event so that we can track
370          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
371          * real events, the ones asked by the user.
372          */
373         if (opts->initial_delay) {
374                 if (perf_evlist__add_dummy(evlist))
375                         return -ENOMEM;
376
377                 pos = perf_evlist__first(evlist);
378                 pos->tracking = 0;
379                 pos = perf_evlist__last(evlist);
380                 pos->tracking = 1;
381                 pos->attr.enable_on_exec = 1;
382         }
383
384         perf_evlist__config(evlist, opts, &callchain_param);
385
386         evlist__for_each_entry(evlist, pos) {
387 try_again:
388                 if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
389                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
390                                 if (verbose > 0)
391                                         ui__warning("%s\n", msg);
392                                 goto try_again;
393                         }
394
395                         rc = -errno;
396                         perf_evsel__open_strerror(pos, &opts->target,
397                                                   errno, msg, sizeof(msg));
398                         ui__error("%s\n", msg);
399                         goto out;
400                 }
401
402                 pos->supported = true;
403         }
404
405         if (perf_evlist__apply_filters(evlist, &pos)) {
406                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
407                         pos->filter, perf_evsel__name(pos), errno,
408                         str_error_r(errno, msg, sizeof(msg)));
409                 rc = -1;
410                 goto out;
411         }
412
413         if (perf_evlist__apply_drv_configs(evlist, &pos, &err_term)) {
414                 pr_err("failed to set config \"%s\" on event %s with %d (%s)\n",
415                       err_term->val.drv_cfg, perf_evsel__name(pos), errno,
416                       str_error_r(errno, msg, sizeof(msg)));
417                 rc = -1;
418                 goto out;
419         }
420
421         rc = record__mmap(rec);
422         if (rc)
423                 goto out;
424
425         session->evlist = evlist;
426         perf_session__set_id_hdr_size(session);
427 out:
428         return rc;
429 }
430
431 static int process_sample_event(struct perf_tool *tool,
432                                 union perf_event *event,
433                                 struct perf_sample *sample,
434                                 struct perf_evsel *evsel,
435                                 struct machine *machine)
436 {
437         struct record *rec = container_of(tool, struct record, tool);
438
439         if (rec->evlist->first_sample_time == 0)
440                 rec->evlist->first_sample_time = sample->time;
441
442         rec->evlist->last_sample_time = sample->time;
443
444         if (rec->buildid_all)
445                 return 0;
446
447         rec->samples++;
448         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
449 }
450
451 static int process_buildids(struct record *rec)
452 {
453         struct perf_data *data = &rec->data;
454         struct perf_session *session = rec->session;
455
456         if (data->size == 0)
457                 return 0;
458
459         /*
460          * During this process, it'll load kernel map and replace the
461          * dso->long_name to a real pathname it found.  In this case
462          * we prefer the vmlinux path like
463          *   /lib/modules/3.16.4/build/vmlinux
464          *
465          * rather than build-id path (in debug directory).
466          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
467          */
468         symbol_conf.ignore_vmlinux_buildid = true;
469
470         /*
471          * If --buildid-all is given, it marks all DSO regardless of hits,
472          * so no need to process samples. But if timestamp_boundary is enabled,
473          * it still needs to walk on all samples to get the timestamps of
474          * first/last samples.
475          */
476         if (rec->buildid_all && !rec->timestamp_boundary)
477                 rec->tool.sample = NULL;
478
479         return perf_session__process_events(session);
480 }
481
482 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
483 {
484         int err;
485         struct perf_tool *tool = data;
486         /*
487          *As for guest kernel when processing subcommand record&report,
488          *we arrange module mmap prior to guest kernel mmap and trigger
489          *a preload dso because default guest module symbols are loaded
490          *from guest kallsyms instead of /lib/modules/XXX/XXX. This
491          *method is used to avoid symbol missing when the first addr is
492          *in module instead of in guest kernel.
493          */
494         err = perf_event__synthesize_modules(tool, process_synthesized_event,
495                                              machine);
496         if (err < 0)
497                 pr_err("Couldn't record guest kernel [%d]'s reference"
498                        " relocation symbol.\n", machine->pid);
499
500         /*
501          * We use _stext for guest kernel because guest kernel's /proc/kallsyms
502          * have no _text sometimes.
503          */
504         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
505                                                  machine);
506         if (err < 0)
507                 pr_err("Couldn't record guest kernel [%d]'s reference"
508                        " relocation symbol.\n", machine->pid);
509 }
510
511 static struct perf_event_header finished_round_event = {
512         .size = sizeof(struct perf_event_header),
513         .type = PERF_RECORD_FINISHED_ROUND,
514 };
515
516 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
517                                     bool overwrite)
518 {
519         u64 bytes_written = rec->bytes_written;
520         int i;
521         int rc = 0;
522         struct perf_mmap *maps;
523
524         if (!evlist)
525                 return 0;
526
527         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
528         if (!maps)
529                 return 0;
530
531         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
532                 return 0;
533
534         for (i = 0; i < evlist->nr_mmaps; i++) {
535                 struct perf_mmap *map = &maps[i];
536
537                 if (map->base) {
538                         if (perf_mmap__push(map, rec, record__pushfn) != 0) {
539                                 rc = -1;
540                                 goto out;
541                         }
542                 }
543
544                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
545                     record__auxtrace_mmap_read(rec, map) != 0) {
546                         rc = -1;
547                         goto out;
548                 }
549         }
550
551         /*
552          * Mark the round finished in case we wrote
553          * at least one event.
554          */
555         if (bytes_written != rec->bytes_written)
556                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
557
558         if (overwrite)
559                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
560 out:
561         return rc;
562 }
563
564 static int record__mmap_read_all(struct record *rec)
565 {
566         int err;
567
568         err = record__mmap_read_evlist(rec, rec->evlist, false);
569         if (err)
570                 return err;
571
572         return record__mmap_read_evlist(rec, rec->evlist, true);
573 }
574
575 static void record__init_features(struct record *rec)
576 {
577         struct perf_session *session = rec->session;
578         int feat;
579
580         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
581                 perf_header__set_feat(&session->header, feat);
582
583         if (rec->no_buildid)
584                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
585
586         if (!have_tracepoints(&rec->evlist->entries))
587                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
588
589         if (!rec->opts.branch_stack)
590                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
591
592         if (!rec->opts.full_auxtrace)
593                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
594
595         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
596                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
597
598         perf_header__clear_feat(&session->header, HEADER_STAT);
599 }
600
601 static void
602 record__finish_output(struct record *rec)
603 {
604         struct perf_data *data = &rec->data;
605         int fd = perf_data__fd(data);
606
607         if (data->is_pipe)
608                 return;
609
610         rec->session->header.data_size += rec->bytes_written;
611         data->size = lseek(perf_data__fd(data), 0, SEEK_CUR);
612
613         if (!rec->no_buildid) {
614                 process_buildids(rec);
615
616                 if (rec->buildid_all)
617                         dsos__hit_all(rec->session);
618         }
619         perf_session__write_header(rec->session, rec->evlist, fd, true);
620
621         return;
622 }
623
624 static int record__synthesize_workload(struct record *rec, bool tail)
625 {
626         int err;
627         struct thread_map *thread_map;
628
629         if (rec->opts.tail_synthesize != tail)
630                 return 0;
631
632         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
633         if (thread_map == NULL)
634                 return -1;
635
636         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
637                                                  process_synthesized_event,
638                                                  &rec->session->machines.host,
639                                                  rec->opts.sample_address,
640                                                  rec->opts.proc_map_timeout);
641         thread_map__put(thread_map);
642         return err;
643 }
644
645 static int record__synthesize(struct record *rec, bool tail);
646
647 static int
648 record__switch_output(struct record *rec, bool at_exit)
649 {
650         struct perf_data *data = &rec->data;
651         int fd, err;
652
653         /* Same Size:      "2015122520103046"*/
654         char timestamp[] = "InvalidTimestamp";
655
656         record__synthesize(rec, true);
657         if (target__none(&rec->opts.target))
658                 record__synthesize_workload(rec, true);
659
660         rec->samples = 0;
661         record__finish_output(rec);
662         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
663         if (err) {
664                 pr_err("Failed to get current timestamp\n");
665                 return -EINVAL;
666         }
667
668         fd = perf_data__switch(data, timestamp,
669                                     rec->session->header.data_offset,
670                                     at_exit);
671         if (fd >= 0 && !at_exit) {
672                 rec->bytes_written = 0;
673                 rec->session->header.data_size = 0;
674         }
675
676         if (!quiet)
677                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
678                         data->file.path, timestamp);
679
680         /* Output tracking events */
681         if (!at_exit) {
682                 record__synthesize(rec, false);
683
684                 /*
685                  * In 'perf record --switch-output' without -a,
686                  * record__synthesize() in record__switch_output() won't
687                  * generate tracking events because there's no thread_map
688                  * in evlist. Which causes newly created perf.data doesn't
689                  * contain map and comm information.
690                  * Create a fake thread_map and directly call
691                  * perf_event__synthesize_thread_map() for those events.
692                  */
693                 if (target__none(&rec->opts.target))
694                         record__synthesize_workload(rec, false);
695         }
696         return fd;
697 }
698
699 static volatile int workload_exec_errno;
700
701 /*
702  * perf_evlist__prepare_workload will send a SIGUSR1
703  * if the fork fails, since we asked by setting its
704  * want_signal to true.
705  */
706 static void workload_exec_failed_signal(int signo __maybe_unused,
707                                         siginfo_t *info,
708                                         void *ucontext __maybe_unused)
709 {
710         workload_exec_errno = info->si_value.sival_int;
711         done = 1;
712         child_finished = 1;
713 }
714
715 static void snapshot_sig_handler(int sig);
716 static void alarm_sig_handler(int sig);
717
718 int __weak
719 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
720                             struct perf_tool *tool __maybe_unused,
721                             perf_event__handler_t process __maybe_unused,
722                             struct machine *machine __maybe_unused)
723 {
724         return 0;
725 }
726
727 static const struct perf_event_mmap_page *
728 perf_evlist__pick_pc(struct perf_evlist *evlist)
729 {
730         if (evlist) {
731                 if (evlist->mmap && evlist->mmap[0].base)
732                         return evlist->mmap[0].base;
733                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
734                         return evlist->overwrite_mmap[0].base;
735         }
736         return NULL;
737 }
738
739 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
740 {
741         const struct perf_event_mmap_page *pc;
742
743         pc = perf_evlist__pick_pc(rec->evlist);
744         if (pc)
745                 return pc;
746         return NULL;
747 }
748
749 static int record__synthesize(struct record *rec, bool tail)
750 {
751         struct perf_session *session = rec->session;
752         struct machine *machine = &session->machines.host;
753         struct perf_data *data = &rec->data;
754         struct record_opts *opts = &rec->opts;
755         struct perf_tool *tool = &rec->tool;
756         int fd = perf_data__fd(data);
757         int err = 0;
758
759         if (rec->opts.tail_synthesize != tail)
760                 return 0;
761
762         if (data->is_pipe) {
763                 /*
764                  * We need to synthesize events first, because some
765                  * features works on top of them (on report side).
766                  */
767                 err = perf_event__synthesize_attrs(tool, rec->evlist,
768                                                    process_synthesized_event);
769                 if (err < 0) {
770                         pr_err("Couldn't synthesize attrs.\n");
771                         goto out;
772                 }
773
774                 err = perf_event__synthesize_features(tool, session, rec->evlist,
775                                                       process_synthesized_event);
776                 if (err < 0) {
777                         pr_err("Couldn't synthesize features.\n");
778                         return err;
779                 }
780
781                 if (have_tracepoints(&rec->evlist->entries)) {
782                         /*
783                          * FIXME err <= 0 here actually means that
784                          * there were no tracepoints so its not really
785                          * an error, just that we don't need to
786                          * synthesize anything.  We really have to
787                          * return this more properly and also
788                          * propagate errors that now are calling die()
789                          */
790                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
791                                                                   process_synthesized_event);
792                         if (err <= 0) {
793                                 pr_err("Couldn't record tracing data.\n");
794                                 goto out;
795                         }
796                         rec->bytes_written += err;
797                 }
798         }
799
800         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
801                                           process_synthesized_event, machine);
802         if (err)
803                 goto out;
804
805         if (rec->opts.full_auxtrace) {
806                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
807                                         session, process_synthesized_event);
808                 if (err)
809                         goto out;
810         }
811
812         if (!perf_evlist__exclude_kernel(rec->evlist)) {
813                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
814                                                          machine);
815                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
816                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
817                                    "Check /proc/kallsyms permission or run as root.\n");
818
819                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
820                                                      machine);
821                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
822                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
823                                    "Check /proc/modules permission or run as root.\n");
824         }
825
826         if (perf_guest) {
827                 machines__process_guests(&session->machines,
828                                          perf_event__synthesize_guest_os, tool);
829         }
830
831         err = perf_event__synthesize_extra_attr(&rec->tool,
832                                                 rec->evlist,
833                                                 process_synthesized_event,
834                                                 data->is_pipe);
835         if (err)
836                 goto out;
837
838         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
839                                                  process_synthesized_event,
840                                                 NULL);
841         if (err < 0) {
842                 pr_err("Couldn't synthesize thread map.\n");
843                 return err;
844         }
845
846         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
847                                              process_synthesized_event, NULL);
848         if (err < 0) {
849                 pr_err("Couldn't synthesize cpu map.\n");
850                 return err;
851         }
852
853         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
854                                             process_synthesized_event, opts->sample_address,
855                                             opts->proc_map_timeout, 1);
856 out:
857         return err;
858 }
859
/*
 * __cmd_record - run one complete recording session.
 *
 * @rec:  recording state (tool callbacks, options, output data, evlist).
 * @argc: number of workload arguments; a workload is prepared and started
 *        only when argc > 0.
 * @argv: workload command line, handed to perf_evlist__prepare_workload().
 *
 * Installs the signal handlers, creates the session, opens the events,
 * writes the header, synthesizes the initial events and then loops over
 * record__mmap_read_all() until 'done' or 'draining' is set and a pass
 * reads no new samples.  The out_child path reaps the workload (if one was
 * forked), calls record__synthesize(rec, true), finishes or switches the
 * output file and deletes the session.
 *
 * Returns -1 if the session cannot be created.  Otherwise returns 'status':
 * err when err < 0 (or err unconditionally when no workload was forked),
 * else the workload's exit code if it exited normally, else 0.  A failing
 * final record__switch_output() overrides it with that negative fd.
 */
860 static int __cmd_record(struct record *rec, int argc, const char **argv)
861 {
862         int err;
863         int status = 0;
864         unsigned long waking = 0;
865         const bool forks = argc > 0;
866         struct perf_tool *tool = &rec->tool;
867         struct record_opts *opts = &rec->opts;
868         struct perf_data *data = &rec->data;
869         struct perf_session *session;
870         bool disabled = false, draining = false;
871         int fd;
872
873         atexit(record__sig_exit);
874         signal(SIGCHLD, sig_handler);
875         signal(SIGINT, sig_handler);
876         signal(SIGTERM, sig_handler);
877         signal(SIGSEGV, sigsegv_handler);
878
879         if (rec->opts.record_namespaces)
880                 tool->namespace_events = true;
881
            /*
             * SIGUSR2 gets snapshot_sig_handler only when AUX snapshot mode or
             * switch-output is enabled; in every other case it is ignored.
             */
882         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
883                 signal(SIGUSR2, snapshot_sig_handler);
884                 if (rec->opts.auxtrace_snapshot_mode)
885                         trigger_on(&auxtrace_snapshot_trigger);
886                 if (rec->switch_output.enabled)
887                         trigger_on(&switch_output_trigger);
888         } else {
889                 signal(SIGUSR2, SIG_IGN);
890         }
891
892         session = perf_session__new(data, false, tool);
893         if (session == NULL) {
894                 pr_err("Perf session creation failed.\n");
895                 return -1;
896         }
897
898         fd = perf_data__fd(data);
899         rec->session = session;
900
901         record__init_features(rec);
902
903         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
904                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
905
            /*
             * Only prepare the workload here; it is released further down by
             * perf_evlist__start_workload() once the events are set up.
             */
906         if (forks) {
907                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
908                                                     argv, data->is_pipe,
909                                                     workload_exec_failed_signal);
910                 if (err < 0) {
911                         pr_err("Couldn't run the workload!\n");
912                         status = err;
913                         goto out_delete_session;
914                 }
915         }
916
917         /*
918          * If we have just single event and are sending data
919          * through pipe, we need to force the ids allocation,
920          * because we synthesize event name through the pipe
921          * and need the id for that.
922          */
923         if (data->is_pipe && rec->evlist->nr_entries == 1)
924                 rec->opts.sample_id = true;
925
926         if (record__open(rec) != 0) {
927                 err = -1;
928                 goto out_child;
929         }
930
931         err = bpf__apply_obj_config();
932         if (err) {
933                 char errbuf[BUFSIZ];
934
935                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
936                 pr_err("ERROR: Apply config to BPF failed: %s\n",
937                          errbuf);
938                 goto out_child;
939         }
940
941         /*
942          * Normally perf_session__new would do this, but it doesn't have the
943          * evlist.
944          */
945         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
946                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
947                 rec->tool.ordered_events = false;
948         }
949
950         if (!rec->evlist->nr_groups)
951                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
952
            /* Pipe output gets the pipe header; a file gets the full session header. */
953         if (data->is_pipe) {
954                 err = perf_header__write_pipe(fd);
955                 if (err < 0)
956                         goto out_child;
957         } else {
958                 err = perf_session__write_header(session, rec->evlist, fd, false);
959                 if (err < 0)
960                         goto out_child;
961         }
962
963         if (!rec->no_buildid
964             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
965                 pr_err("Couldn't generate buildids. "
966                        "Use --no-buildid to profile anyway.\n");
967                 err = -1;
968                 goto out_child;
969         }
970
971         err = record__synthesize(rec, false);
972         if (err < 0)
973                 goto out_child;
974
975         if (rec->realtime_prio) {
976                 struct sched_param param;
977
978                 param.sched_priority = rec->realtime_prio;
979                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
980                         pr_err("Could not set realtime priority.\n");
981                         err = -1;
982                         goto out_child;
983                 }
984         }
985
986         /*
987          * When perf is starting the traced process, all the events
988          * (apart from group members) have enable_on_exec=1 set,
989          * so don't spoil it by prematurely enabling them.
990          */
991         if (!target__none(&opts->target) && !opts->initial_delay)
992                 perf_evlist__enable(rec->evlist);
993
994         /*
995          * Let the child rip
996          */
997         if (forks) {
998                 struct machine *machine = &session->machines.host;
999                 union perf_event *event;
1000                 pid_t tgid;
1001
1002                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1003                 if (event == NULL) {
1004                         err = -ENOMEM;
1005                         goto out_child;
1006                 }
1007
1008                 /*
1009                  * Some H/W events are generated before COMM event
1010                  * which is emitted during exec(), so perf script
1011                  * cannot see a correct process name for those events.
1012                  * Synthesize COMM event to prevent it.
1013                  */
1014                 tgid = perf_event__synthesize_comm(tool, event,
1015                                                    rec->evlist->workload.pid,
1016                                                    process_synthesized_event,
1017                                                    machine);
1018                 free(event);
1019
                     /*
                      * NOTE(review): err is not updated on this path, so it
                      * still holds the non-negative result of the earlier
                      * record__synthesize() call and out_child will treat
                      * this as a non-error exit -- confirm that is intended.
                      */
1020                 if (tgid == -1)
1021                         goto out_child;
1022
1023                 event = malloc(sizeof(event->namespaces) +
1024                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1025                                machine->id_hdr_size);
1026                 if (event == NULL) {
1027                         err = -ENOMEM;
1028                         goto out_child;
1029                 }
1030
1031                 /*
1032                  * Synthesize NAMESPACES event for the command specified.
1033                  */
1034                 perf_event__synthesize_namespaces(tool, event,
1035                                                   rec->evlist->workload.pid,
1036                                                   tgid, process_synthesized_event,
1037                                                   machine);
1038                 free(event);
1039
1040                 perf_evlist__start_workload(rec->evlist);
1041         }
1042
             /* With an initial delay the events are enabled only after sleeping. */
1043         if (opts->initial_delay) {
1044                 usleep(opts->initial_delay * USEC_PER_MSEC);
1045                 perf_evlist__enable(rec->evlist);
1046         }
1047
1048         trigger_ready(&auxtrace_snapshot_trigger);
1049         trigger_ready(&switch_output_trigger);
1050         perf_hooks__invoke_record_start();
1051         for (;;) {
                     /* Sample count before this pass; compared below to detect an idle pass. */
1052                 unsigned long long hits = rec->samples;
1053
1054                 /*
1055                  * rec->evlist->bkw_mmap_state is possible to be
1056                  * BKW_MMAP_EMPTY here: when done == true and
1057                  * hits != rec->samples in previous round.
1058                  *
1059                  * perf_evlist__toggle_bkw_mmap ensure we never
1060                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1061                  */
1062                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1063                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1064
1065                 if (record__mmap_read_all(rec) < 0) {
1066                         trigger_error(&auxtrace_snapshot_trigger);
1067                         trigger_error(&switch_output_trigger);
1068                         err = -1;
1069                         goto out_child;
1070                 }
1071
1072                 if (auxtrace_record__snapshot_started) {
1073                         auxtrace_record__snapshot_started = 0;
1074                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1075                                 record__read_auxtrace_snapshot(rec);
1076                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1077                                 pr_err("AUX area tracing snapshot failed\n");
1078                                 err = -1;
1079                                 goto out_child;
1080                         }
1081                 }
1082
1083                 if (trigger_is_hit(&switch_output_trigger)) {
1084                         /*
1085                          * If switch_output_trigger is hit, the data in
1086                          * overwritable ring buffer should have been collected,
1087                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1088                          *
1089                          * If SIGUSR2 raise after or during record__mmap_read_all(),
1090                          * record__mmap_read_all() didn't collect data from
1091                          * overwritable ring buffer. Read again.
1092                          */
1093                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1094                                 continue;
1095                         trigger_ready(&switch_output_trigger);
1096
1097                         /*
1098                          * Reenable events in overwrite ring buffer after
1099                          * record__mmap_read_all(): we should have collected
1100                          * data from it.
1101                          */
1102                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1103
                             /*
                              * NOTE(review): 'waking' is unsigned long but is
                              * printed with %ld here and after the loop.
                              */
1104                         if (!quiet)
1105                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1106                                         waking);
1107                         waking = 0;
1108                         fd = record__switch_output(rec, false);
1109                         if (fd < 0) {
1110                                 pr_err("Failed to switch to new file\n");
1111                                 trigger_error(&switch_output_trigger);
1112                                 err = fd;
1113                                 goto out_child;
1114                         }
1115
1116                         /* re-arm the alarm */
1117                         if (rec->switch_output.time)
1118                                 alarm(rec->switch_output.time);
1119                 }
1120
                     /*
                      * Nothing new was read in this pass: leave the loop if we
                      * were asked to stop, otherwise wait in poll.
                      */
1121                 if (hits == rec->samples) {
1122                         if (done || draining)
1123                                 break;
1124                         err = perf_evlist__poll(rec->evlist, -1);
1125                         /*
1126                          * Propagate error, only if there's any. Ignore positive
1127                          * number of returned events and interrupt error.
1128                          */
1129                         if (err > 0 || (err < 0 && errno == EINTR))
1130                                 err = 0;
1131                         waking++;
1132
                             /* No pollable fds left after filtering: start draining. */
1133                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1134                                 draining = true;
1135                 }
1136
1137                 /*
1138                  * When perf is starting the traced process, at the end events
1139                  * die with the process and we wait for that. Thus no need to
1140                  * disable events in this case.
1141                  */
1142                 if (done && !disabled && !target__none(&opts->target)) {
1143                         trigger_off(&auxtrace_snapshot_trigger);
1144                         perf_evlist__disable(rec->evlist);
1145                         disabled = true;
1146                 }
1147         }
1148         trigger_off(&auxtrace_snapshot_trigger);
1149         trigger_off(&switch_output_trigger);
1150
1151         if (forks && workload_exec_errno) {
1152                 char msg[STRERR_BUFSIZE];
1153                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1154                 pr_err("Workload failed: %s\n", emsg);
1155                 err = -1;
1156                 goto out_child;
1157         }
1158
1159         if (!quiet)
1160                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1161
1162         if (target__none(&rec->opts.target))
1163                 record__synthesize_workload(rec, true);
1164
             /*
              * Common exit path, reached both after a normal run and from the
              * error gotos above; 'err' tells the two apart.
              */
1165 out_child:
1166         if (forks) {
1167                 int exit_status;
1168
                     /* Terminate a workload that is still running, then reap it. */
1169                 if (!child_finished)
1170                         kill(rec->evlist->workload.pid, SIGTERM);
1171
1172                 wait(&exit_status);
1173
                     /*
                      * Our own error wins; otherwise report the workload's exit
                      * code, or remember the signal that killed it in 'signr'.
                      */
1174                 if (err < 0)
1175                         status = err;
1176                 else if (WIFEXITED(exit_status))
1177                         status = WEXITSTATUS(exit_status);
1178                 else if (WIFSIGNALED(exit_status))
1179                         signr = WTERMSIG(exit_status);
1180         } else
1181                 status = err;
1182
             /* The return value of this second synthesis pass is not checked. */
1183         record__synthesize(rec, true);
1184         /* this will be recalculated during process_buildids() */
1185         rec->samples = 0;
1186
             /*
              * Only when err == 0: finish the single output file, or make a
              * final switch when timestamped file names are in use.
              */
1187         if (!err) {
1188                 if (!rec->timestamp_filename) {
1189                         record__finish_output(rec);
1190                 } else {
1191                         fd = record__switch_output(rec, true);
1192                         if (fd < 0) {
1193                                 status = fd;
1194                                 goto out_delete_session;
1195                         }
1196                 }
1197         }
1198
1199         perf_hooks__invoke_record_end();
1200
1201         if (!err && !quiet) {
1202                 char samples[128];
1203                 const char *postfix = rec->timestamp_filename ?
1204                                         ".<timestamp>" : "";
1205
1206                 if (rec->samples && !rec->opts.full_auxtrace)
1207                         scnprintf(samples, sizeof(samples),
1208                                   " (%" PRIu64 " samples)", rec->samples);
1209                 else
1210                         samples[0] = '\0';
1211
1212                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n",
1213                         perf_data__size(data) / 1024.0 / 1024.0,
1214                         data->file.path, postfix, samples);
1215         }
1216
1217 out_delete_session:
1218         perf_session__delete(session);
1219         return status;
1220 }
1221
1222 static void callchain_debug(struct callchain_param *callchain)
1223 {
1224         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1225
1226         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1227
1228         if (callchain->record_mode == CALLCHAIN_DWARF)
1229                 pr_debug("callchain: stack dump size %d\n",
1230                          callchain->dump_size);
1231 }
1232
1233 int record_opts__parse_callchain(struct record_opts *record,
1234                                  struct callchain_param *callchain,
1235                                  const char *arg, bool unset)
1236 {
1237         int ret;
1238         callchain->enabled = !unset;
1239
1240         /* --no-call-graph */
1241         if (unset) {
1242                 callchain->record_mode = CALLCHAIN_NONE;
1243                 pr_debug("callchain: disabled\n");
1244                 return 0;
1245         }
1246
1247         ret = parse_callchain_record_opt(arg, callchain);
1248         if (!ret) {
1249                 /* Enable data address sampling for DWARF unwind. */
1250                 if (callchain->record_mode == CALLCHAIN_DWARF)
1251                         record->sample_address = true;
1252                 callchain_debug(callchain);
1253         }
1254
1255         return ret;
1256 }
1257
1258 int record_parse_callchain_opt(const struct option *opt,
1259                                const char *arg,
1260                                int unset)
1261 {
1262         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1263 }
1264
1265 int record_callchain_opt(const struct option *opt,
1266                          const char *arg __maybe_unused,
1267                          int unset __maybe_unused)
1268 {
1269         struct callchain_param *callchain = opt->value;
1270
1271         callchain->enabled = true;
1272
1273         if (callchain->record_mode == CALLCHAIN_NONE)
1274                 callchain->record_mode = CALLCHAIN_FP;
1275
1276         callchain_debug(callchain);
1277         return 0;
1278 }
1279
1280 static int perf_record_config(const char *var, const char *value, void *cb)
1281 {
1282         struct record *rec = cb;
1283
1284         if (!strcmp(var, "record.build-id")) {
1285                 if (!strcmp(value, "cache"))
1286                         rec->no_buildid_cache = false;
1287                 else if (!strcmp(value, "no-cache"))
1288                         rec->no_buildid_cache = true;
1289                 else if (!strcmp(value, "skip"))
1290                         rec->no_buildid = true;
1291                 else
1292                         return -1;
1293                 return 0;
1294         }
1295         if (!strcmp(var, "record.call-graph")) {
1296                 var = "call-graph.record-mode";
1297                 return perf_default_config(var, value, cb);
1298         }
1299
1300         return 0;
1301 }
1302
/* One entry of the clock name table: a name and the clockid it selects. */
1303 struct clockid_map {
1304         const char *name;
1305         int clockid;
1306 };
1307
/* Build a table entry mapping name 'n' to clockid 'c'. */
1308 #define CLOCKID_MAP(n, c)       \
1309         { .name = n, .clockid = (c), }
1310
/* Table terminator: an entry whose name is NULL. */
1311 #define CLOCKID_END     { .name = NULL, }
1312
1313
1314 /*
1315  * Fallback definitions for clock ids the system headers may not provide;
      * perf needs to build on many distros.  Each one is defined only when it
      * is missing.  NOTE(review): the numeric values are presumably the Linux
      * ones -- verify against the kernel UAPI headers.
1316  */
1317 #ifndef CLOCK_MONOTONIC_RAW
1318 #define CLOCK_MONOTONIC_RAW 4
1319 #endif
1320 #ifndef CLOCK_BOOTTIME
1321 #define CLOCK_BOOTTIME 7
1322 #endif
1323 #ifndef CLOCK_TAI
1324 #define CLOCK_TAI 11
1325 #endif
1326
/*
 * Clock names accepted by parse_clockid(), which walks the table in order
 * and compares names case-insensitively.  The short aliases at the end map
 * to the same ids as the long names.  Terminated by CLOCKID_END.
 */
1327 static const struct clockid_map clockids[] = {
1328         /* available for all events, NMI safe */
1329         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1330         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1331
1332         /* available for some events */
1333         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1334         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1335         CLOCKID_MAP("tai", CLOCK_TAI),
1336
1337         /* available for the lazy */
1338         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1339         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1340         CLOCKID_MAP("real", CLOCK_REALTIME),
1341         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1342
1343         CLOCKID_END,
1344 };
1345
1346 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1347 {
1348         struct timespec res;
1349
1350         *res_ns = 0;
1351         if (!clock_getres(clk_id, &res))
1352                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1353         else
1354                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1355
1356         return 0;
1357 }
1358
1359 static int parse_clockid(const struct option *opt, const char *str, int unset)
1360 {
1361         struct record_opts *opts = (struct record_opts *)opt->value;
1362         const struct clockid_map *cm;
1363         const char *ostr = str;
1364
1365         if (unset) {
1366                 opts->use_clockid = 0;
1367                 return 0;
1368         }
1369
1370         /* no arg passed */
1371         if (!str)
1372                 return 0;
1373
1374         /* no setting it twice */
1375         if (opts->use_clockid)
1376                 return -1;
1377
1378         opts->use_clockid = true;
1379
1380         /* if its a number, we're done */
1381         if (sscanf(str, "%d", &opts->clockid) == 1)
1382                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1383
1384         /* allow a "CLOCK_" prefix to the name */
1385         if (!strncasecmp(str, "CLOCK_", 6))
1386                 str += 6;
1387
1388         for (cm = clockids; cm->name; cm++) {
1389                 if (!strcasecmp(str, cm->name)) {
1390                         opts->clockid = cm->clockid;
1391                         return get_clockid_res(opts->clockid,
1392                                                &opts->clockid_res_ns);
1393                 }
1394         }
1395
1396         opts->use_clockid = false;
1397         ui__warning("unknown clockid %s, check man page\n", ostr);
1398         return -1;
1399 }
1400
1401 static int record__parse_mmap_pages(const struct option *opt,
1402                                     const char *str,
1403                                     int unset __maybe_unused)
1404 {
1405         struct record_opts *opts = opt->value;
1406         char *s, *p;
1407         unsigned int mmap_pages;
1408         int ret;
1409
1410         if (!str)
1411                 return -EINVAL;
1412
1413         s = strdup(str);
1414         if (!s)
1415                 return -ENOMEM;
1416
1417         p = strchr(s, ',');
1418         if (p)
1419                 *p = '\0';
1420
1421         if (*s) {
1422                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1423                 if (ret)
1424                         goto out_free;
1425                 opts->mmap_pages = mmap_pages;
1426         }
1427
1428         if (!p) {
1429                 ret = 0;
1430                 goto out_free;
1431         }
1432
1433         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1434         if (ret)
1435                 goto out_free;
1436
1437         opts->auxtrace_mmap_pages = mmap_pages;
1438
1439 out_free:
1440         free(s);
1441         return ret;
1442 }
1443
1444 static void switch_output_size_warn(struct record *rec)
1445 {
1446         u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1447         struct switch_output *s = &rec->switch_output;
1448
1449         wakeup_size /= 2;
1450
1451         if (s->size < wakeup_size) {
1452                 char buf[100];
1453
1454                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1455                 pr_warning("WARNING: switch-output data size lower than "
1456                            "wakeup kernel buffer size (%s) "
1457                            "expect bigger perf.data sizes\n", buf);
1458         }
1459 }
1460
1461 static int switch_output_setup(struct record *rec)
1462 {
1463         struct switch_output *s = &rec->switch_output;
1464         static struct parse_tag tags_size[] = {
1465                 { .tag  = 'B', .mult = 1       },
1466                 { .tag  = 'K', .mult = 1 << 10 },
1467                 { .tag  = 'M', .mult = 1 << 20 },
1468                 { .tag  = 'G', .mult = 1 << 30 },
1469                 { .tag  = 0 },
1470         };
1471         static struct parse_tag tags_time[] = {
1472                 { .tag  = 's', .mult = 1        },
1473                 { .tag  = 'm', .mult = 60       },
1474                 { .tag  = 'h', .mult = 60*60    },
1475                 { .tag  = 'd', .mult = 60*60*24 },
1476                 { .tag  = 0 },
1477         };
1478         unsigned long val;
1479
1480         if (!s->set)
1481                 return 0;
1482
1483         if (!strcmp(s->str, "signal")) {
1484                 s->signal = true;
1485                 pr_debug("switch-output with SIGUSR2 signal\n");
1486                 goto enabled;
1487         }
1488
1489         val = parse_tag_value(s->str, tags_size);
1490         if (val != (unsigned long) -1) {
1491                 s->size = val;
1492                 pr_debug("switch-output with %s size threshold\n", s->str);
1493                 goto enabled;
1494         }
1495
1496         val = parse_tag_value(s->str, tags_time);
1497         if (val != (unsigned long) -1) {
1498                 s->time = val;
1499                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
1500                          s->str, s->time);
1501                 goto enabled;
1502         }
1503
1504         return -1;
1505
1506 enabled:
1507         rec->timestamp_filename = true;
1508         s->enabled              = true;
1509
1510         if (s->size && !rec->opts.no_buffering)
1511                 switch_output_size_warn(rec);
1512
1513         return 0;
1514 }
1515
/* Usage strings for 'perf record'; the array is NULL-terminated. */
1516 static const char * const __record_usage[] = {
1517         "perf record [<options>] [<command>]",
1518         "perf record [<options>] -- <command> [<options>]",
1519         NULL
1520 };
/*
 * Non-static pointer to the usage table.
 * NOTE(review): presumably exported for use outside this file -- confirm.
 */
1521 const char * const *record_usage = __record_usage;
1522
1523 /*
1524  * XXX Ideally would be local to cmd_record() and passed to a record__new
1525  * because we need to have access to it in record__exit, that is called
1526  * after cmd_record() exits, but since record_options need to be accessible to
1527  * builtin-script, leave it here.
1528  *
1529  * At least we don't touch it in all the other functions here directly.
1530  *
1531  * Just say no to tons of global variables, sigh.
1532  */
/*
 * The single 'struct record' instance: initial option values plus the tool
 * callbacks.  The option table below stores parsed values directly into
 * its fields.
 */
1533 static struct record record = {
1534         .opts = {
1535                 .sample_time         = true,
                     /*
                      * NOTE(review): the *_MAX initializers below look like
                      * "not set by the user" sentinels -- confirm against the
                      * code that consumes them.
                      */
1536                 .mmap_pages          = UINT_MAX,
1537                 .user_freq           = UINT_MAX,
1538                 .user_interval       = ULLONG_MAX,
1539                 .freq                = 4000,
1540                 .target              = {
1541                         .uses_mmap   = true,
1542                         .default_per_cpu = true,
1543                 },
1544                 .proc_map_timeout     = 500,
1545         },
             /* Event handlers installed in the perf_tool used by the session. */
1546         .tool = {
1547                 .sample         = process_sample_event,
1548                 .fork           = perf_event__process_fork,
1549                 .exit           = perf_event__process_exit,
1550                 .comm           = perf_event__process_comm,
1551                 .namespaces     = perf_event__process_namespaces,
1552                 .mmap           = perf_event__process_mmap,
1553                 .mmap2          = perf_event__process_mmap2,
1554                 .ordered_events = true,
1555         },
1556 };
1557
/*
 * Help text for the --call-graph option: the shared CALLCHAIN_RECORD_HELP
 * text followed by a note that the default mode is fp.
 */
1558 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
1559         "\n\t\t\t\tDefault: fp";
1560
/* NOTE(review): presumably set by a dry-run option outside this excerpt -- confirm. */
1561 static bool dry_run;
1562
1563 /*
1564  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1565  * with it and switch to use the library functions in perf_evlist that came
1566  * from builtin-record.c, i.e. use record_opts,
1567  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1568  * using pipes, etc.
1569  */
1570 static struct option __record_options[] = {
1571         OPT_CALLBACK('e', "event", &record.evlist, "event",
1572                      "event selector. use 'perf list' to list available events",
1573                      parse_events_option),
1574         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1575                      "event filter", parse_filter),
1576         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1577                            NULL, "don't record events from perf itself",
1578                            exclude_perf),
1579         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1580                     "record events on existing process id"),
1581         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1582                     "record events on existing thread id"),
1583         OPT_INTEGER('r', "realtime", &record.realtime_prio,
1584                     "collect data with this RT SCHED_FIFO priority"),
1585         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1586                     "collect data without buffering"),
1587         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1588                     "collect raw sample records from all opened counters"),
1589         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1590                             "system-wide collection from all CPUs"),
1591         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1592                     "list of cpus to monitor"),
1593         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1594         OPT_STRING('o', "output", &record.data.file.path, "file",
1595                     "output file name"),
1596         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1597                         &record.opts.no_inherit_set,
1598                         "child tasks do not inherit counters"),
1599         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
1600                     "synthesize non-sample events at the end of output"),
1601         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
1602         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
1603                     "Fail if the specified frequency can't be used"),
1604         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
1605                      "profile at this frequency",
1606                       record__parse_freq),
1607         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1608                      "number of mmap data pages and AUX area tracing mmap pages",
1609                      record__parse_mmap_pages),
1610         OPT_BOOLEAN(0, "group", &record.opts.group,
1611                     "put the counters into a counter group"),
1612         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
1613                            NULL, "enables call-graph recording" ,
1614                            &record_callchain_opt),
1615         OPT_CALLBACK(0, "call-graph", &record.opts,
1616                      "record_mode[,record_size]", record_callchain_help,
1617                      &record_parse_callchain_opt),
1618         OPT_INCR('v', "verbose", &verbose,
1619                     "be more verbose (show counter open errors, etc)"),
1620         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1621         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1622                     "per thread counts"),
1623         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
1624         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
1625                     "Record the sample physical addresses"),
1626         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
1627         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
1628                         &record.opts.sample_time_set,
1629                         "Record the sample timestamps"),
1630         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
1631                         "Record the sample period"),
1632         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1633                     "don't sample"),
1634         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
1635                         &record.no_buildid_cache_set,
1636                         "do not update the buildid cache"),
1637         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
1638                         &record.no_buildid_set,
1639                         "do not collect buildids in perf.data"),
1640         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1641                      "monitor event in cgroup name only",
1642                      parse_cgroups),
1643         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1644                   "ms to wait before starting measurement after program start"),
1645         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1646                    "user to profile"),
1647
1648         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1649                      "branch any", "sample any taken branches",
1650                      parse_branch_stack),
1651
1652         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1653                      "branch filter mask", "branch stack filter modes",
1654                      parse_branch_stack),
1655         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
1656                     "sample by weight (on special events only)"),
1657         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
1658                     "sample transaction flags (special events only)"),
1659         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
1660                     "use per-thread mmaps"),
1661         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
1662                     "sample selected machine registers on interrupt,"
1663                     " use -I ? to list register names", parse_regs),
1664         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
1665                     "sample selected machine registers on interrupt,"
1666                     " use -I ? to list register names", parse_regs),
1667         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
1668                     "Record running/enabled time of read (:S) events"),
1669         OPT_CALLBACK('k', "clockid", &record.opts,
1670         "clockid", "clockid to use for events, see clock_gettime()",
1671         parse_clockid),
1672         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
1673                           "opts", "AUX area tracing Snapshot Mode", ""),
1674         OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
1675                         "per thread proc mmap processing timeout in ms"),
1676         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
1677                     "Record namespaces events"),
1678         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
1679                     "Record context switch events"),
1680         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
1681                          "Configure all used events to run in kernel space.",
1682                          PARSE_OPT_EXCLUSIVE),
1683         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
1684                          "Configure all used events to run in user space.",
1685                          PARSE_OPT_EXCLUSIVE),
1686         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
1687                    "clang binary to use for compiling BPF scriptlets"),
1688         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
1689                    "options passed to clang when compiling BPF scriptlets"),
1690         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
1691                    "file", "vmlinux pathname"),
1692         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
1693                     "Record build-id of all DSOs regardless of hits"),
1694         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
1695                     "append timestamp to output filename"),
1696         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
1697                     "Record timestamp boundary (time of first/last samples)"),
1698         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
1699                           &record.switch_output.set, "signal,size,time",
1700                           "Switch output when receive SIGUSR2 or cross size,time threshold",
1701                           "signal"),
1702         OPT_BOOLEAN(0, "dry-run", &dry_run,
1703                     "Parse options then exit"),
1704         OPT_END()
1705 };
1706
/*
 * Non-static pointer initialised from __record_options.  cmd_record()
 * hands it to parse_options(), set_option_nobuild() and the usage helpers.
 */
1707 struct option *record_options = __record_options;
1708
1709 int cmd_record(int argc, const char **argv)
1710 {
1711         int err;
1712         struct record *rec = &record;
1713         char errbuf[BUFSIZ];
1714
1715         setlocale(LC_ALL, "");
1716
1717 #ifndef HAVE_LIBBPF_SUPPORT
1718 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
1719         set_nobuild('\0', "clang-path", true);
1720         set_nobuild('\0', "clang-opt", true);
1721 # undef set_nobuild
1722 #endif
1723
1724 #ifndef HAVE_BPF_PROLOGUE
1725 # if !defined (HAVE_DWARF_SUPPORT)
1726 #  define REASON  "NO_DWARF=1"
1727 # elif !defined (HAVE_LIBBPF_SUPPORT)
1728 #  define REASON  "NO_LIBBPF=1"
1729 # else
1730 #  define REASON  "this architecture doesn't support BPF prologue"
1731 # endif
1732 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
1733         set_nobuild('\0', "vmlinux", true);
1734 # undef set_nobuild
1735 # undef REASON
1736 #endif
1737
1738         rec->evlist = perf_evlist__new();
1739         if (rec->evlist == NULL)
1740                 return -ENOMEM;
1741
1742         err = perf_config(perf_record_config, rec);
1743         if (err)
1744                 return err;
1745
1746         argc = parse_options(argc, argv, record_options, record_usage,
1747                             PARSE_OPT_STOP_AT_NON_OPTION);
1748         if (quiet)
1749                 perf_quiet_option();
1750
1751         /* Make system wide (-a) the default target. */
1752         if (!argc && target__none(&rec->opts.target))
1753                 rec->opts.target.system_wide = true;
1754
1755         if (nr_cgroups && !rec->opts.target.system_wide) {
1756                 usage_with_options_msg(record_usage, record_options,
1757                         "cgroup monitoring only available in system-wide mode");
1758
1759         }
1760         if (rec->opts.record_switch_events &&
1761             !perf_can_record_switch_events()) {
1762                 ui__error("kernel does not support recording context switch events\n");
1763                 parse_options_usage(record_usage, record_options, "switch-events", 0);
1764                 return -EINVAL;
1765         }
1766
1767         if (switch_output_setup(rec)) {
1768                 parse_options_usage(record_usage, record_options, "switch-output", 0);
1769                 return -EINVAL;
1770         }
1771
1772         if (rec->switch_output.time) {
1773                 signal(SIGALRM, alarm_sig_handler);
1774                 alarm(rec->switch_output.time);
1775         }
1776
1777         /*
1778          * Allow aliases to facilitate the lookup of symbols for address
1779          * filters. Refer to auxtrace_parse_filters().
1780          */
1781         symbol_conf.allow_aliases = true;
1782
1783         symbol__init(NULL);
1784
1785         err = record__auxtrace_init(rec);
1786         if (err)
1787                 goto out;
1788
1789         if (dry_run)
1790                 goto out;
1791
1792         err = bpf__setup_stdout(rec->evlist);
1793         if (err) {
1794                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
1795                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
1796                          errbuf);
1797                 goto out;
1798         }
1799
1800         err = -ENOMEM;
1801
1802         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
1803                 pr_warning(
1804 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1805 "check /proc/sys/kernel/kptr_restrict.\n\n"
1806 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1807 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1808 "Samples in kernel modules won't be resolved at all.\n\n"
1809 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1810 "even with a suitable vmlinux or kallsyms file.\n\n");
1811
1812         if (rec->no_buildid_cache || rec->no_buildid) {
1813                 disable_buildid_cache();
1814         } else if (rec->switch_output.enabled) {
1815                 /*
1816                  * In 'perf record --switch-output', disable buildid
1817                  * generation by default to reduce data file switching
1818                  * overhead. Still generate buildid if they are required
1819                  * explicitly using
1820                  *
1821                  *  perf record --switch-output --no-no-buildid \
1822                  *              --no-no-buildid-cache
1823                  *
1824                  * Following code equals to:
1825                  *
1826                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
1827                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
1828                  *         disable_buildid_cache();
1829                  */
1830                 bool disable = true;
1831
1832                 if (rec->no_buildid_set && !rec->no_buildid)
1833                         disable = false;
1834                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
1835                         disable = false;
1836                 if (disable) {
1837                         rec->no_buildid = true;
1838                         rec->no_buildid_cache = true;
1839                         disable_buildid_cache();
1840                 }
1841         }
1842
1843         if (record.opts.overwrite)
1844                 record.opts.tail_synthesize = true;
1845
1846         if (rec->evlist->nr_entries == 0 &&
1847             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
1848                 pr_err("Not enough memory for event selector list\n");
1849                 goto out;
1850         }
1851
1852         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1853                 rec->opts.no_inherit = true;
1854
1855         err = target__validate(&rec->opts.target);
1856         if (err) {
1857                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1858                 ui__warning("%s\n", errbuf);
1859         }
1860
1861         err = target__parse_uid(&rec->opts.target);
1862         if (err) {
1863                 int saved_errno = errno;
1864
1865                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1866                 ui__error("%s", errbuf);
1867
1868                 err = -saved_errno;
1869                 goto out;
1870         }
1871
1872         /* Enable ignoring missing threads when -u/-p option is defined. */
1873         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
1874
1875         err = -ENOMEM;
1876         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1877                 usage_with_options(record_usage, record_options);
1878
1879         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1880         if (err)
1881                 goto out;
1882
1883         /*
1884          * We take all buildids when the file contains
1885          * AUX area tracing data because we do not decode the
1886          * trace because it would take too long.
1887          */
1888         if (rec->opts.full_auxtrace)
1889                 rec->buildid_all = true;
1890
1891         if (record_opts__config(&rec->opts)) {
1892                 err = -EINVAL;
1893                 goto out;
1894         }
1895
1896         err = __cmd_record(&record, argc, argv);
1897 out:
1898         perf_evlist__delete(rec->evlist);
1899         symbol__exit();
1900         auxtrace_record__free(rec->itr);
1901         return err;
1902 }
1903
1904 static void snapshot_sig_handler(int sig __maybe_unused)
1905 {
1906         struct record *rec = &record;
1907
1908         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1909                 trigger_hit(&auxtrace_snapshot_trigger);
1910                 auxtrace_record__snapshot_started = 1;
1911                 if (auxtrace_record__snapshot_start(record.itr))
1912                         trigger_error(&auxtrace_snapshot_trigger);
1913         }
1914
1915         if (switch_output_signal(rec))
1916                 trigger_hit(&switch_output_trigger);
1917 }
1918
1919 static void alarm_sig_handler(int sig __maybe_unused)
1920 {
1921         struct record *rec = &record;
1922
1923         if (switch_output_time(rec))
1924                 trigger_hit(&switch_output_trigger);
1925 }