From: Linus Torvalds Date: Sun, 15 Jan 2017 19:37:43 +0000 (-0800) Subject: Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel... X-Git-Tag: v4.10-rc4~9 X-Git-Url: http://git.samba.org/samba.git/?p=sfrench%2Fcifs-2.6.git;a=commitdiff_plain;h=79078c53baabee12dfefb0cfe00ca94cb2c35570;hp=255e6140fa76ec9d0e24f201427e7e9a9573f681 Merge branch 'perf-urgent-for-linus' of git://git./linux/kernel/git/tip/tip Pull perf fixes from Ingo Molnar: "Misc race fixes uncovered by fuzzing efforts, a Sparse fix, two PMU driver fixes, plus miscellanous tooling fixes" * 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: perf/x86: Reject non sampling events with precise_ip perf/x86/intel: Account interrupts for PEBS errors perf/core: Fix concurrent sys_perf_event_open() vs. 'move_group' race perf/core: Fix sys_perf_event_open() vs. hotplug perf/x86/intel: Use ULL constant to prevent undefined shift behaviour perf/x86/intel/uncore: Fix hardcoded socket 0 assumption in the Haswell init code perf/x86: Set pmu->module in Intel PMU modules perf probe: Fix to probe on gcc generated symbols for offline kernel perf probe: Fix --funcs to show correct symbols for offline module perf symbols: Robustify reading of build-id from sysfs perf tools: Install tools/lib/traceevent plugins with install-bin tools lib traceevent: Fix prev/next_prio for deadline tasks perf record: Fix --switch-output documentation and comment perf record: Make __record_options static tools lib subcmd: Add OPT_STRING_OPTARG_SET option perf probe: Fix to get correct modname from elf header samples/bpf trace_output_user: Remove duplicate sys/ioctl.h include samples/bpf sock_example: Avoid getting ethhdr from two includes perf sched timehist: Show total scheduling time --- diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 019c5887b698..1635c0c8df23 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -505,6 +505,10 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + + /* There's no sense in having PEBS for non sampling events: */ + if (!is_sampling_event(event)) + return -EINVAL; } /* * check that PEBS LBR correction does not conflict with diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 86138267b68a..d611cab214a6 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3987,7 +3987,7 @@ __init int intel_pmu_init(void) x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index fec8a461bdef..1076c9a77292 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu cstate_pkg_pmu = { @@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = { .stop = cstate_pmu_event_stop, .read = cstate_pmu_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static const struct cstate_model nhm_cstates __initconst = { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index be202390bbd3..9dfeeeca0ea8 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; /* log dropped samples number */ - if (error[bit]) + if (error[bit]) { perf_log_lost_samples(event, error[bit]); + if (perf_event_account_interrupt(event)) + x86_pmu_stop(event, 0); + } + if (counts[bit]) { __intel_pmu_pebs_event(event, iregs, base, top, bit, counts[bit]); diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index bd34124449b0..17c3564d087a 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -697,6 +697,7 @@ static int __init init_rapl_pmus(void) rapl_pmus->pmu.start = rapl_pmu_event_start; rapl_pmus->pmu.stop = rapl_pmu_event_stop; rapl_pmus->pmu.read = rapl_pmu_event_read; + rapl_pmus->pmu.module = THIS_MODULE; return 0; } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 97c246f84dea..8c4ccdc3a3f3 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -733,6 +733,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .start = uncore_pmu_event_start, .stop = uncore_pmu_event_stop, .read = uncore_pmu_event_read, + .module = THIS_MODULE, }; } else { pmu->pmu = *pmu->type->pmu; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index e6832be714bc..dae2fedc1601 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(0); + int pkg = boot_cpu_data.logical_proc_id; if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4741ecdb9817..78ed8105e64d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); +extern int perf_event_account_interrupt(struct perf_event *event); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, diff --git a/kernel/events/core.c b/kernel/events/core.c index ab15509fab8c..110b38a58493 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2249,7 +2249,7 @@ static int __perf_install_in_context(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - bool activate = true; + bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); @@ -2257,27 +2257,26 @@ static int __perf_install_in_context(void *info) raw_spin_lock(&ctx->lock); task_ctx = ctx; - /* If we're on the wrong CPU, try again */ - if (task_cpu(ctx->task) != smp_processor_id()) { - ret = -ESRCH; - goto unlock; - } + reprogram = (ctx->task == current); /* - * If we're on the right CPU, see if the task we target is - * current, if not we don't have to activate the ctx, a future - * context switch will do that for us. + * If the task is running, it must be running on this CPU, + * otherwise we cannot reprogram things. + * + * If its not running, we don't care, ctx->lock will + * serialize against it becoming runnable. */ - if (ctx->task != current) - activate = false; - else - WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); + if (task_curr(ctx->task) && !reprogram) { + ret = -ESRCH; + goto unlock; + } + WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } - if (activate) { + if (reprogram) { ctx_sched_out(ctx, cpuctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx); @@ -2328,13 +2327,36 @@ perf_install_in_context(struct perf_event_context *ctx, /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. + * + * Instead we use task_curr(), which tells us if the task is running. + * However, since we use task_curr() outside of rq::lock, we can race + * against the actual state. This means the result can be wrong. + * + * If we get a false positive, we retry, this is harmless. + * + * If we get a false negative, things are complicated. If we are after + * perf_event_context_sched_in() ctx::lock will serialize us, and the + * value must be correct. If we're before, it doesn't matter since + * perf_event_context_sched_in() will program the counter. + * + * However, this hinges on the remote context switch having observed + * our task->perf_event_ctxp[] store, such that it will in fact take + * ctx::lock in perf_event_context_sched_in(). + * + * We do this by task_function_call(), if the IPI fails to hit the task + * we know any future context switch of task must see the + * perf_event_ctpx[] store. */ -again: + /* - * Cannot use task_function_call() because we need to run on the task's - * CPU regardless of whether its current or not. + * This smp_mb() orders the task->perf_event_ctxp[] store with the + * task_cpu() load, such that if the IPI then does not find the task + * running, a future context switch of that task must observe the + * store. */ - if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + smp_mb(); +again: + if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); @@ -2348,12 +2370,16 @@ again: raw_spin_unlock_irq(&ctx->lock); return; } - raw_spin_unlock_irq(&ctx->lock); /* - * Since !ctx->is_active doesn't mean anything, we must IPI - * unconditionally. + * If the task is not running, ctx->lock will avoid it becoming so, + * thus we can safely install the event. */ - goto again; + if (task_curr(task)) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); } /* @@ -7034,25 +7060,12 @@ static void perf_log_itrace_start(struct perf_event *event) perf_output_end(&handle); } -/* - * Generic event overflow handling, sampling. - */ - -static int __perf_event_overflow(struct perf_event *event, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) +static int +__perf_event_account_interrupt(struct perf_event *event, int throttle) { - int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; - u64 seq; int ret = 0; - - /* - * Non-sampling counters might still use the PMI to fold short - * hardware counters, ignore those. - */ - if (unlikely(!is_sampling_event(event))) - return 0; + u64 seq; seq = __this_cpu_read(perf_throttled_seq); if (seq != hwc->interrupts_seq) { @@ -7080,6 +7093,34 @@ static int __perf_event_overflow(struct perf_event *event, perf_adjust_period(event, delta, hwc->last_period, true); } + return ret; +} + +int perf_event_account_interrupt(struct perf_event *event) +{ + return __perf_event_account_interrupt(event, 1); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + int ret = 0; + + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. + */ + if (unlikely(!is_sampling_event(event))) + return 0; + + ret = __perf_event_account_interrupt(event, throttle); + /* * XXX event_limit might not quite work as expected on inherited * events @@ -9503,6 +9544,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } +/* + * Variation on perf_event_ctx_lock_nested(), except we take two context + * mutexes. + */ +static struct perf_event_context * +__perf_event_ctx_lock_double(struct perf_event *group_leader, + struct perf_event_context *ctx) +{ + struct perf_event_context *gctx; + +again: + rcu_read_lock(); + gctx = READ_ONCE(group_leader->ctx); + if (!atomic_inc_not_zero(&gctx->refcount)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + mutex_lock_double(&gctx->mutex, &ctx->mutex); + + if (group_leader->ctx != gctx) { + mutex_unlock(&ctx->mutex); + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + goto again; + } + + return gctx; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -9746,12 +9818,31 @@ SYSCALL_DEFINE5(perf_event_open, } if (move_group) { - gctx = group_leader->ctx; - mutex_lock_double(&gctx->mutex, &ctx->mutex); + gctx = __perf_event_ctx_lock_double(group_leader, ctx); + if (gctx->task == TASK_TOMBSTONE) { err = -ESRCH; goto err_locked; } + + /* + * Check if we raced against another sys_perf_event_open() call + * moving the software group underneath us. + */ + if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * If someone moved the group out from under us, check + * if this new event wound up on the same ctx, if so + * its the regular !move_group case, otherwise fail. + */ + if (gctx != ctx) { + err = -EINVAL; + goto err_locked; + } else { + perf_event_ctx_unlock(group_leader, gctx); + move_group = 0; + } + } } else { mutex_lock(&ctx->mutex); } @@ -9853,7 +9944,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_unpin_context(ctx); if (move_group) - mutex_unlock(&gctx->mutex); + perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { @@ -9879,7 +9970,7 @@ SYSCALL_DEFINE5(perf_event_open, err_locked: if (move_group) - mutex_unlock(&gctx->mutex); + perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); /* err_file: */ fput(event_file); diff --git a/samples/bpf/sock_example.h b/samples/bpf/sock_example.h index 09f7fe7e5fd7..d8014065d479 100644 --- a/samples/bpf/sock_example.h +++ b/samples/bpf/sock_example.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c index f4fa6af22def..ccca1e348017 100644 --- a/samples/bpf/trace_output_user.c +++ b/samples/bpf/trace_output_user.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/lib/subcmd/parse-options.c b/tools/lib/subcmd/parse-options.c index 3284bb14ae78..8aad81151d50 100644 --- a/tools/lib/subcmd/parse-options.c +++ b/tools/lib/subcmd/parse-options.c @@ -213,6 +213,9 @@ static int get_value(struct parse_opt_ctx_t *p, else err = get_arg(p, opt, flags, (const char **)opt->value); + if (opt->set) + *(bool *)opt->set = true; + /* PARSE_OPT_NOEMPTY: Allow NULL but disallow empty string. */ if (opt->flags & PARSE_OPT_NOEMPTY) { const char *val = *(const char **)opt->value; diff --git a/tools/lib/subcmd/parse-options.h b/tools/lib/subcmd/parse-options.h index 8866ac438b34..11c3be3bcce7 100644 --- a/tools/lib/subcmd/parse-options.h +++ b/tools/lib/subcmd/parse-options.h @@ -137,6 +137,11 @@ struct option { { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ .value = check_vtype(v, const char **), (a), .help = (h), \ .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) } +#define OPT_STRING_OPTARG_SET(s, l, v, os, a, h, d) \ + { .type = OPTION_STRING, .short_name = (s), .long_name = (l), \ + .value = check_vtype(v, const char **), (a), .help = (h), \ + .flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d), \ + .set = check_vtype(os, bool *)} #define OPT_STRING_NOEMPTY(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY} #define OPT_DATE(s, l, v, h) \ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb } diff --git a/tools/lib/traceevent/plugin_sched_switch.c b/tools/lib/traceevent/plugin_sched_switch.c index f1ce60065258..ec30c2fcbac0 100644 --- a/tools/lib/traceevent/plugin_sched_switch.c +++ b/tools/lib/traceevent/plugin_sched_switch.c @@ -111,7 +111,7 @@ static int sched_switch_handler(struct trace_seq *s, trace_seq_printf(s, "%lld ", val); if (pevent_get_field_val(s, event, "prev_prio", record, &val, 0) == 0) - trace_seq_printf(s, "[%lld] ", val); + trace_seq_printf(s, "[%d] ", (int) val); if (pevent_get_field_val(s, event, "prev_state", record, &val, 0) == 0) write_state(s, val); @@ -129,7 +129,7 @@ static int sched_switch_handler(struct trace_seq *s, trace_seq_printf(s, "%lld", val); if (pevent_get_field_val(s, event, "next_prio", record, &val, 0) == 0) - trace_seq_printf(s, " [%lld]", val); + trace_seq_printf(s, " [%d]", (int) val); return 0; } diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 27fc3617c6a4..5054d9147f0f 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -430,6 +430,10 @@ that gets then processed, possibly via a perf script, to decide if that particular perf.data snapshot should be kept or not. Implies --timestamp-filename, --no-buildid and --no-buildid-cache. +The reason for the latter two is to reduce the data file switching +overhead. You can still switch them on with: + + --switch-output --no-no-buildid --no-no-buildid-cache --dry-run:: Parse options then exit. --dry-run can be used to detect errors in cmdline diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 8fc24824705e..8bb16aa9d661 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -704,9 +704,9 @@ install-tests: all install-gtk $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \ $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr' -install-bin: install-tools install-tests +install-bin: install-tools install-tests install-traceevent-plugins -install: install-bin try-install-man install-traceevent-plugins +install: install-bin try-install-man install-python_ext: $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)' diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 74d6a035133a..4ec10e9427d9 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1405,7 +1405,7 @@ static bool dry_run; * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record', * using pipes, etc. */ -struct option __record_options[] = { +static struct option __record_options[] = { OPT_CALLBACK('e', "event", &record.evlist, "event", "event selector. use 'perf list' to list available events", parse_events_option), @@ -1636,7 +1636,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused) * overhead. Still generate buildid if they are required * explicitly using * - * perf record --signal-trigger --no-no-buildid \ + * perf record --switch-output --no-no-buildid \ * --no-no-buildid-cache * * Following code equals to: diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index d53e706a6f17..5b134b0d1ff3 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -209,6 +209,7 @@ struct perf_sched { u64 skipped_samples; const char *time_str; struct perf_time_interval ptime; + struct perf_time_interval hist_time; }; /* per thread run time data */ @@ -2460,6 +2461,11 @@ static int timehist_sched_change_event(struct perf_tool *tool, timehist_print_sample(sched, sample, &al, thread, t); out: + if (sched->hist_time.start == 0 && t >= ptime->start) + sched->hist_time.start = t; + if (ptime->end == 0 || t <= ptime->end) + sched->hist_time.end = t; + if (tr) { /* time of this sched_switch event becomes last time task seen */ tr->last_time = sample->time; @@ -2624,6 +2630,7 @@ static void timehist_print_summary(struct perf_sched *sched, struct thread *t; struct thread_runtime *r; int i; + u64 hist_time = sched->hist_time.end - sched->hist_time.start; memset(&totals, 0, sizeof(totals)); @@ -2665,7 +2672,7 @@ static void timehist_print_summary(struct perf_sched *sched, totals.sched_count += r->run_stats.n; printf(" CPU %2d idle for ", i); print_sched_time(r->total_run_time, 6); - printf(" msec\n"); + printf(" msec (%6.2f%%)\n", 100.0 * r->total_run_time / hist_time); } else printf(" CPU %2d idle entire time window\n", i); } @@ -2701,12 +2708,16 @@ static void timehist_print_summary(struct perf_sched *sched, printf("\n" " Total number of unique tasks: %" PRIu64 "\n" - "Total number of context switches: %" PRIu64 "\n" - " Total run time (msec): ", + "Total number of context switches: %" PRIu64 "\n", totals.task_count, totals.sched_count); + printf(" Total run time (msec): "); print_sched_time(totals.total_run_time, 2); printf("\n"); + + printf(" Total scheduling time (msec): "); + print_sched_time(hist_time, 2); + printf(" (x %d)\n", sched->max_cpu); } typedef int (*sched_handler)(struct perf_tool *tool, diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index d281ae2b54e8..4a57c8a60bd9 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -163,7 +163,7 @@ static struct map *kernel_get_module_map(const char *module) /* A file path -- this is an offline module */ if (module && strchr(module, '/')) - return machine__findnew_module_map(host_machine, 0, module); + return dso__new_map(module); if (!module) module = "kernel"; @@ -173,6 +173,7 @@ static struct map *kernel_get_module_map(const char *module) if (strncmp(pos->dso->short_name + 1, module, pos->dso->short_name_len - 2) == 0 && module[pos->dso->short_name_len - 2] == '\0') { + map__get(pos); return pos; } } @@ -188,15 +189,6 @@ struct map *get_target_map(const char *target, bool user) return kernel_get_module_map(target); } -static void put_target_map(struct map *map, bool user) -{ - if (map && user) { - /* Only the user map needs to be released */ - map__put(map); - } -} - - static int convert_exec_to_group(const char *exec, char **result) { char *ptr1, *ptr2, *exec_copy; @@ -267,21 +259,6 @@ static bool kprobe_warn_out_range(const char *symbol, unsigned long address) return true; } -/* - * NOTE: - * '.gnu.linkonce.this_module' section of kernel module elf directly - * maps to 'struct module' from linux/module.h. This section contains - * actual module name which will be used by kernel after loading it. - * But, we cannot use 'struct module' here since linux/module.h is not - * exposed to user-space. Offset of 'name' has remained same from long - * time, so hardcoding it here. - */ -#ifdef __LP64__ -#define MOD_NAME_OFFSET 24 -#else -#define MOD_NAME_OFFSET 12 -#endif - /* * @module can be module name of module file path. In case of path, * inspect elf and find out what is actual module name. @@ -296,6 +273,7 @@ static char *find_module_name(const char *module) Elf_Data *data; Elf_Scn *sec; char *mod_name = NULL; + int name_offset; fd = open(module, O_RDONLY); if (fd < 0) @@ -317,7 +295,21 @@ static char *find_module_name(const char *module) if (!data || !data->d_buf) goto ret_err; - mod_name = strdup((char *)data->d_buf + MOD_NAME_OFFSET); + /* + * NOTE: + * '.gnu.linkonce.this_module' section of kernel module elf directly + * maps to 'struct module' from linux/module.h. This section contains + * actual module name which will be used by kernel after loading it. + * But, we cannot use 'struct module' here since linux/module.h is not + * exposed to user-space. Offset of 'name' has remained same from long + * time, so hardcoding it here. + */ + if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) + name_offset = 12; + else /* expect ELFCLASS64 by default */ + name_offset = 24; + + mod_name = strdup((char *)data->d_buf + name_offset); ret_err: elf_end(elf); @@ -412,7 +404,7 @@ static int find_alternative_probe_point(struct debuginfo *dinfo, } out: - put_target_map(map, uprobes); + map__put(map); return ret; } @@ -618,6 +610,51 @@ error: return ret ? : -ENOENT; } +/* + * Rename DWARF symbols to ELF symbols -- gcc sometimes optimizes functions + * and generate new symbols with suffixes such as .constprop.N or .isra.N + * etc. Since those symbols are not recorded in DWARF, we have to find + * correct generated symbols from offline ELF binary. + * For online kernel or uprobes we don't need this because those are + * rebased on _text, or already a section relative address. + */ +static int +post_process_offline_probe_trace_events(struct probe_trace_event *tevs, + int ntevs, const char *pathname) +{ + struct symbol *sym; + struct map *map; + unsigned long stext = 0; + u64 addr; + int i; + + /* Prepare a map for offline binary */ + map = dso__new_map(pathname); + if (!map || get_text_start_address(pathname, &stext) < 0) { + pr_warning("Failed to get ELF symbols for %s\n", pathname); + return -EINVAL; + } + + for (i = 0; i < ntevs; i++) { + addr = tevs[i].point.address + tevs[i].point.offset - stext; + sym = map__find_symbol(map, addr); + if (!sym) + continue; + if (!strcmp(sym->name, tevs[i].point.symbol)) + continue; + /* If we have no realname, use symbol for it */ + if (!tevs[i].point.realname) + tevs[i].point.realname = tevs[i].point.symbol; + else + free(tevs[i].point.symbol); + tevs[i].point.symbol = strdup(sym->name); + tevs[i].point.offset = addr - sym->start; + } + map__put(map); + + return 0; +} + static int add_exec_to_probe_trace_events(struct probe_trace_event *tevs, int ntevs, const char *exec) { @@ -679,7 +716,8 @@ post_process_kernel_probe_trace_events(struct probe_trace_event *tevs, /* Skip post process if the target is an offline kernel */ if (symbol_conf.ignore_vmlinux_buildid) - return 0; + return post_process_offline_probe_trace_events(tevs, ntevs, + symbol_conf.vmlinux_name); reloc_sym = kernel_get_ref_reloc_sym(); if (!reloc_sym) { @@ -2869,7 +2907,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, } out: - put_target_map(map, pev->uprobes); + map__put(map); free(syms); return ret; @@ -3362,10 +3400,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter, return ret; /* Get a symbol map */ - if (user) - map = dso__new_map(target); - else - map = kernel_get_module_map(target); + map = get_target_map(target, user); if (!map) { pr_err("Failed to get a map for %s\n", (target) ? : "kernel"); return -EINVAL; @@ -3397,9 +3432,7 @@ int show_available_funcs(const char *target, struct strfilter *_filter, } end: - if (user) { - map__put(map); - } + map__put(map); exit_probe_symbol_maps(); return ret; diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 99400b0e8f2a..adbc6c02c3aa 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -537,6 +537,12 @@ int sysfs__read_build_id(const char *filename, void *build_id, size_t size) break; } else { int n = namesz + descsz; + + if (n > (int)sizeof(bf)) { + n = sizeof(bf); + pr_debug("%s: truncating reading of build id in sysfs file %s: n_namesz=%u, n_descsz=%u.\n", + __func__, filename, nhdr.n_namesz, nhdr.n_descsz); + } if (read(fd, bf, n) != n) break; }