Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 5 Dec 2009 23:30:21 +0000 (15:30 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 5 Dec 2009 23:30:21 +0000 (15:30 -0800)
* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (470 commits)
  x86: Fix comments of register/stack access functions
  perf tools: Replace %m with %a in sscanf
  hw-breakpoints: Keep track of user disabled breakpoints
  tracing/syscalls: Make syscall events print callbacks static
  tracing: Add DEFINE_EVENT(), DEFINE_SINGLE_EVENT() support to docbook
  perf: Don't free perf_mmap_data until work has been done
  perf_event: Fix compile error
  perf tools: Fix _GNU_SOURCE macro related strndup() build error
  trace_syscalls: Remove unused syscall_name_to_nr()
  trace_syscalls: Simplify syscall profile
  trace_syscalls: Remove duplicate init_enter_##sname()
  trace_syscalls: Add syscall_nr field to struct syscall_metadata
  trace_syscalls: Remove enter_id exit_id
  trace_syscalls: Set event_enter_##sname->data to its metadata
  trace_syscalls: Remove unused event_syscall_enter and event_syscall_exit
  perf_event: Initialize data.period in perf_swevent_hrtimer()
  perf probe: Simplify event naming
  perf probe: Add --list option for listing current probe events
  perf probe: Add argv_split() from lib/argv_split.c
  perf probe: Move probe event utility functions to probe-event.c
  ...

251 files changed:
Documentation/DocBook/tracepoint.tmpl
Documentation/trace/kprobetrace.txt [new file with mode: 0644]
arch/Kconfig
arch/powerpc/Kconfig.debug
arch/powerpc/configs/pseries_defconfig
arch/powerpc/include/asm/emulated_ops.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/trace.h [new file with mode: 0644]
arch/powerpc/kernel/align.c
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/perf_event.c
arch/powerpc/kernel/power5+-pmu.c
arch/powerpc/kernel/power5-pmu.c
arch/powerpc/kernel/power6-pmu.c
arch/powerpc/kernel/power7-pmu.c
arch/powerpc/kernel/ppc970-pmu.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/time.c
arch/powerpc/kernel/traps.c
arch/powerpc/lib/copypage_64.S
arch/powerpc/platforms/pseries/hvCall.S
arch/powerpc/platforms/pseries/hvCall_inst.c
arch/powerpc/platforms/pseries/lpar.c
arch/x86/Kconfig
arch/x86/Kconfig.debug
arch/x86/Makefile
arch/x86/include/asm/Kbuild
arch/x86/include/asm/a.out-core.h
arch/x86/include/asm/debugreg.h
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/hw_breakpoint.h [new file with mode: 0644]
arch/x86/include/asm/inat.h [new file with mode: 0644]
arch/x86/include/asm/inat_types.h [new file with mode: 0644]
arch/x86/include/asm/insn.h [new file with mode: 0644]
arch/x86/include/asm/mce.h
arch/x86/include/asm/perf_event.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/ptrace.h
arch/x86/kernel/Makefile
arch/x86/kernel/cpu/Makefile
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/cpu/perf_event.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/hw_breakpoint.c [new file with mode: 0644]
arch/x86/kernel/irq.c
arch/x86/kernel/kgdb.c
arch/x86/kernel/kprobes.c
arch/x86/kernel/machine_kexec_32.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/setup.c
arch/x86/kernel/signal.c
arch/x86/kernel/traps.c
arch/x86/kvm/x86.c
arch/x86/lib/.gitignore [new file with mode: 0644]
arch/x86/lib/Makefile
arch/x86/lib/inat.c [new file with mode: 0644]
arch/x86/lib/insn.c [new file with mode: 0644]
arch/x86/lib/x86-opcode-map.txt [new file with mode: 0644]
arch/x86/mm/fault.c
arch/x86/mm/kmmio.c
arch/x86/power/cpu.c
arch/x86/tools/Makefile [new file with mode: 0644]
arch/x86/tools/chkobjdump.awk [new file with mode: 0644]
arch/x86/tools/distill.awk [new file with mode: 0644]
arch/x86/tools/gen-insn-attr-x86.awk [new file with mode: 0644]
arch/x86/tools/test_get_len.c [new file with mode: 0644]
drivers/edac/edac_mce_amd.c
include/linux/ftrace_event.h
include/linux/hw_breakpoint.h [new file with mode: 0644]
include/linux/kprobes.h
include/linux/perf_counter.h
include/linux/perf_event.h
include/linux/syscalls.h
include/linux/tracepoint.h
include/trace/define_trace.h
include/trace/events/bkl.h
include/trace/events/block.h
include/trace/events/ext4.h
include/trace/events/irq.h
include/trace/events/jbd2.h
include/trace/events/kmem.h
include/trace/events/lock.h [moved from include/trace/events/lockdep.h with 92% similarity]
include/trace/events/mce.h [new file with mode: 0644]
include/trace/events/module.h
include/trace/events/power.h
include/trace/events/sched.h
include/trace/events/signal.h [new file with mode: 0644]
include/trace/events/timer.h
include/trace/events/workqueue.h
include/trace/ftrace.h
include/trace/syscall.h
kernel/Makefile
kernel/exit.c
kernel/hw_breakpoint.c [new file with mode: 0644]
kernel/kallsyms.c
kernel/kprobes.c
kernel/lockdep.c
kernel/notifier.c
kernel/perf_event.c
kernel/signal.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/ring_buffer.c
kernel/trace/trace.h
kernel/trace/trace_entries.h
kernel/trace/trace_event_profile.c
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_export.c
kernel/trace/trace_kprobe.c [new file with mode: 0644]
kernel/trace/trace_ksym.c [new file with mode: 0644]
kernel/trace/trace_selftest.c
kernel/trace/trace_syscalls.c
samples/Kconfig
samples/Makefile
samples/hw_breakpoint/Makefile [new file with mode: 0644]
samples/hw_breakpoint/data_breakpoint.c [new file with mode: 0644]
scripts/kernel-doc
tools/perf/.gitignore
tools/perf/Documentation/perf-bench.txt [new file with mode: 0644]
tools/perf/Documentation/perf-buildid-list.txt [new file with mode: 0644]
tools/perf/Documentation/perf-kmem.txt [new file with mode: 0644]
tools/perf/Documentation/perf-probe.txt [new file with mode: 0644]
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-timechart.txt
tools/perf/Documentation/perf-trace-perl.txt [new file with mode: 0644]
tools/perf/Documentation/perf-trace.txt
tools/perf/Makefile
tools/perf/bench/bench.h [new file with mode: 0644]
tools/perf/bench/mem-memcpy.c [new file with mode: 0644]
tools/perf/bench/sched-messaging.c [new file with mode: 0644]
tools/perf/bench/sched-pipe.c [new file with mode: 0644]
tools/perf/builtin-annotate.c
tools/perf/builtin-bench.c [new file with mode: 0644]
tools/perf/builtin-buildid-list.c [new file with mode: 0644]
tools/perf/builtin-help.c
tools/perf/builtin-kmem.c [new file with mode: 0644]
tools/perf/builtin-probe.c [new file with mode: 0644]
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-stat.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/builtin-trace.c
tools/perf/builtin.h
tools/perf/command-list.txt
tools/perf/design.txt
tools/perf/perf.c
tools/perf/perf.h
tools/perf/scripts/perl/Perf-Trace-Util/Context.c [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/Context.xs [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/Makefile.PL [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/README [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm [new file with mode: 0644]
tools/perf/scripts/perl/Perf-Trace-Util/typemap [new file with mode: 0644]
tools/perf/scripts/perl/bin/check-perf-trace-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/check-perf-trace-report [new file with mode: 0644]
tools/perf/scripts/perl/bin/rw-by-file-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/rw-by-file-report [new file with mode: 0644]
tools/perf/scripts/perl/bin/rw-by-pid-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/rw-by-pid-report [new file with mode: 0644]
tools/perf/scripts/perl/bin/wakeup-latency-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/wakeup-latency-report [new file with mode: 0644]
tools/perf/scripts/perl/bin/workqueue-stats-record [new file with mode: 0644]
tools/perf/scripts/perl/bin/workqueue-stats-report [new file with mode: 0644]
tools/perf/scripts/perl/check-perf-trace.pl [new file with mode: 0644]
tools/perf/scripts/perl/rw-by-file.pl [new file with mode: 0644]
tools/perf/scripts/perl/rw-by-pid.pl [new file with mode: 0644]
tools/perf/scripts/perl/wakeup-latency.pl [new file with mode: 0644]
tools/perf/scripts/perl/workqueue-stats.pl [new file with mode: 0644]
tools/perf/util/cache.h
tools/perf/util/callchain.c
tools/perf/util/callchain.h
tools/perf/util/color.h
tools/perf/util/ctype.c
tools/perf/util/data_map.c [new file with mode: 0644]
tools/perf/util/data_map.h [new file with mode: 0644]
tools/perf/util/debug.c
tools/perf/util/debug.h
tools/perf/util/debugfs.c [new file with mode: 0644]
tools/perf/util/debugfs.h [new file with mode: 0644]
tools/perf/util/event.c [new file with mode: 0644]
tools/perf/util/event.h
tools/perf/util/exec_cmd.h
tools/perf/util/header.c
tools/perf/util/header.h
tools/perf/util/help.h
tools/perf/util/hist.c [new file with mode: 0644]
tools/perf/util/hist.h [new file with mode: 0644]
tools/perf/util/include/asm/asm-offsets.h [new file with mode: 0644]
tools/perf/util/include/asm/bitops.h [new file with mode: 0644]
tools/perf/util/include/asm/bug.h [new file with mode: 0644]
tools/perf/util/include/asm/byteorder.h [new file with mode: 0644]
tools/perf/util/include/asm/swab.h [new file with mode: 0644]
tools/perf/util/include/asm/uaccess.h [new file with mode: 0644]
tools/perf/util/include/linux/bitmap.h [new file with mode: 0644]
tools/perf/util/include/linux/bitops.h [new file with mode: 0644]
tools/perf/util/include/linux/compiler.h [new file with mode: 0644]
tools/perf/util/include/linux/ctype.h [new file with mode: 0644]
tools/perf/util/include/linux/kernel.h
tools/perf/util/include/linux/string.h [new file with mode: 0644]
tools/perf/util/include/linux/types.h [new file with mode: 0644]
tools/perf/util/levenshtein.h
tools/perf/util/map.c
tools/perf/util/module.c [deleted file]
tools/perf/util/module.h [deleted file]
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/parse-options.h
tools/perf/util/probe-event.c [new file with mode: 0644]
tools/perf/util/probe-event.h [new file with mode: 0644]
tools/perf/util/probe-finder.c [new file with mode: 0644]
tools/perf/util/probe-finder.h [new file with mode: 0644]
tools/perf/util/quote.h
tools/perf/util/run-command.h
tools/perf/util/sigchain.h
tools/perf/util/sort.c [new file with mode: 0644]
tools/perf/util/sort.h [new file with mode: 0644]
tools/perf/util/strbuf.h
tools/perf/util/string.c
tools/perf/util/string.h
tools/perf/util/strlist.h
tools/perf/util/svghelper.h
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/thread.c
tools/perf/util/thread.h
tools/perf/util/trace-event-info.c
tools/perf/util/trace-event-parse.c
tools/perf/util/trace-event-perl.c [new file with mode: 0644]
tools/perf/util/trace-event-perl.h [new file with mode: 0644]
tools/perf/util/trace-event-read.c
tools/perf/util/trace-event.h
tools/perf/util/types.h
tools/perf/util/util.h
tools/perf/util/values.h
tools/perf/util/wrapper.c

index b0756d0fd57910a44d2f2d29fdb4edf87dac21ca..8bca1d5cec09a8bf6c8c5f0e6f159d770fcf806c 100644 (file)
@@ -86,4 +86,9 @@
 !Iinclude/trace/events/irq.h
   </chapter>
 
+  <chapter id="signal">
+   <title>SIGNAL</title>
+!Iinclude/trace/events/signal.h
+  </chapter>
+
 </book>
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
new file mode 100644 (file)
index 0000000..47aabee
--- /dev/null
@@ -0,0 +1,149 @@
+                        Kprobe-based Event Tracing
+                        ==========================
+
+                 Written by Masami Hiramatsu
+
+
+Overview
+--------
+These events are similar to tracepoint-based events. Instead of tracepoints,
+they are based on kprobes (kprobe and kretprobe), so they can probe wherever
+kprobes can probe (that is, anywhere in a function body except in __kprobes
+functions). Unlike tracepoint-based events, they can be added and removed
+dynamically, on the fly.
+
+To enable this feature, build your kernel with CONFIG_KPROBE_EVENT=y.
+
+Similar to the event tracer, this doesn't need to be activated via
+current_tracer. Instead, add probe points via
+/sys/kernel/debug/tracing/kprobe_events, and enable them via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/enable.
+
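+For example, the whole lifecycle looks like this (a minimal sketch;
+do_sys_open is just an illustrative symbol, used again in the usage
+examples below):
+
+  echo p:myprobe do_sys_open > /sys/kernel/debug/tracing/kprobe_events
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
+  echo 0 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
+  echo > /sys/kernel/debug/tracing/kprobe_events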
+
+Synopsis of kprobe_events
+-------------------------
+  p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS]    : Set a probe
+  r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS]               : Set a return probe
+
+ GRP           : Group name. If omitted, "kprobes" is used.
+ EVENT         : Event name. If omitted, the event name is generated
+                 from SYMBOL+offs or MEMADDR.
+ SYMBOL[+offs] : Symbol+offset where the probe is inserted.
+ MEMADDR       : Address where the probe is inserted.
+
+ FETCHARGS     : Arguments. Each probe can have up to 128 args.
+  %REG         : Fetch register REG
+  @ADDR                : Fetch memory at ADDR (ADDR should be in kernel)
+  @SYM[+|-offs]        : Fetch memory at SYM +|- offs (SYM should be a data symbol)
+  $stackN      : Fetch Nth entry of stack (N >= 0)
+  $stack       : Fetch stack address.
+  $argN                : Fetch function argument. (N >= 0)(*)
+  $retval      : Fetch return value.(**)
+  +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
+  NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
+
+  (*) $argN may not be correct for asmlinkage functions or in the
+      middle of a function body.
+  (**) Only available for return probes.
+  (***) Useful for fetching a field of a data structure.
+
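+ For example, the following definition (a hypothetical sketch; the
+symbol, argument positions and names are purely illustrative) combines
+several of the fetch methods above, naming each one:
+
+  echo 'p:mysketch do_sys_open flags=$arg2 sp=$stack top=$stack0' > /sys/kernel/debug/tracing/kprobe_events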
+
+Per-Probe Event Filtering
+-------------------------
+ The per-probe event filtering feature allows you to set a different filter
+on each probe and to control which arguments are shown in the trace buffer.
+If an event name is specified right after 'p:' or 'r:' in kprobe_events, an
+event is added under tracing/events/kprobes/<EVENT>; in that directory you
+can see 'id', 'enable', 'format' and 'filter'.
+
+enable:
+  You can enable/disable the probe by writing 1 or 0 to it.
+
+format:
+  This shows the format of this probe event.
+
+filter:
+  You can write filtering rules for this event here.
+
+id:
+  This shows the id of this probe event.
+
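+ For example, assuming the "myprobe" event defined in the usage examples
+below, the standard ftrace filter syntax can restrict the trace to a
+particular descriptor value (an illustrative sketch):
+
+  echo 'dfd == 1' > /sys/kernel/debug/tracing/events/kprobes/myprobe/filter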
+
+Event Profiling
+---------------
+ You can check the total number of probe hits and probe miss-hits via
+/sys/kernel/debug/tracing/kprobe_profile.
+ The first column is the event name, the second is the number of probe hits,
+and the third is the number of probe miss-hits.
+
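+ For example (the output line is illustrative, for a hypothetical
+"myprobe" event that has been hit 52 times with no miss-hits):
+
+  cat /sys/kernel/debug/tracing/kprobe_profile
+    myprobe                                              52             0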
+
+Usage examples
+--------------
+To add a probe as a new event, write a new definition to kprobe_events
+as below.
+
+  echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events
+
+ This sets a kprobe at the entry of the do_sys_open() function, recording
+its 1st to 4th arguments as the "myprobe" event. As this example shows,
+users can choose more familiar names for each argument.
+
+  echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events
+
+ This sets a kretprobe at the return point of the do_sys_open() function,
+recording its return value as the "myretprobe" event.
+ You can see the format of these events via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/format.
+
+  cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
+name: myprobe
+ID: 75
+format:
+       field:unsigned short common_type;       offset:0;       size:2;
+       field:unsigned char common_flags;       offset:2;       size:1;
+       field:unsigned char common_preempt_count;       offset:3;       size:1;
+       field:int common_pid;   offset:4;       size:4;
+       field:int common_tgid;  offset:8;       size:4;
+
+       field: unsigned long ip;        offset:16;      size:8;
+       field: int nargs;       offset:24;      size:4;
+       field: unsigned long dfd;       offset:32;      size:8;
+       field: unsigned long filename;  offset:40;      size:8;
+       field: unsigned long flags;     offset:48;      size:8;
+       field: unsigned long mode;      offset:56;      size:8;
+
+print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode
+
+
+ You can see that the event has 4 arguments, named as in the expressions
+you specified.
+
+  echo > /sys/kernel/debug/tracing/kprobe_events
+
+ This clears all probe points.
+
+ Right after definition, each event is disabled by default. To trace these
+events, you need to enable them:
+
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
+
+ You can then see the traced information via /sys/kernel/debug/tracing/trace:
+
+  cat /sys/kernel/debug/tracing/trace
+# tracer: nop
+#
+#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+#              | |       |          |         |
+           <...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
+           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $retval=fffffffffffffffe
+           <...>-1447  [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6
+           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
+           <...>-1447  [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10
+           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
+
+
+ Each line shows when the kernel hits an event, and <- SYMBOL means the
+kernel returns from SYMBOL (e.g. "sys_open+0x1b/0x1d <- do_sys_open" means
+the kernel returns from do_sys_open to sys_open+0x1b).
+
+
index 7f418bbc261a0f825ab6072f935f1f974a2f746c..eef3bbb970753c1d840cb9c1520147850dda6800 100644 (file)
@@ -126,4 +126,11 @@ config HAVE_DMA_API_DEBUG
 config HAVE_DEFAULT_NO_SPIN_MUTEXES
        bool
 
+config HAVE_HW_BREAKPOINT
+       bool
+       depends on HAVE_PERF_EVENTS
+       select ANON_INODES
+       select PERF_EVENTS
+
+
 source "kernel/gcov/Kconfig"
index 3b1005185390557558c99d3efb971824b8b76b68..bf3382f1904d04d0f0807490c1449691f540a4b7 100644 (file)
@@ -46,7 +46,7 @@ config DEBUG_STACK_USAGE
 
 config HCALL_STATS
        bool "Hypervisor call instrumentation"
-       depends on PPC_PSERIES && DEBUG_FS
+       depends on PPC_PSERIES && DEBUG_FS && TRACEPOINTS
        help
          Adds code to keep track of the number of hypervisor calls made and
          the amount of time spent in hypervisor calls.  Wall time spent in
index f1889abb89b1a196e61861413237f0ecd423368c..c568329723b8d5371f9cb920dbd9f811af9bcb81 100644 (file)
@@ -1683,7 +1683,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 # CONFIG_DEBUG_STACK_USAGE is not set
 # CONFIG_DEBUG_PAGEALLOC is not set
-CONFIG_HCALL_STATS=y
+# CONFIG_HCALL_STATS is not set
 # CONFIG_CODE_PATCHING_SELFTEST is not set
 # CONFIG_FTR_FIXUP_SELFTEST is not set
 # CONFIG_MSI_BITMAP_SELFTEST is not set
index 9154e8526732cf2f3c217f450244f4d8ed5f0f04..f0fb4fc1f6e6878a2617fbdf6782c5e2883af39b 100644 (file)
@@ -19,6 +19,7 @@
 #define _ASM_POWERPC_EMULATED_OPS_H
 
 #include <asm/atomic.h>
+#include <linux/perf_event.h>
 
 
 #ifdef CONFIG_PPC_EMULATED_STATS
@@ -57,7 +58,7 @@ extern u32 ppc_warn_emulated;
 
 extern void ppc_warn_emulated_print(const char *type);
 
-#define PPC_WARN_EMULATED(type)                                                 \
+#define __PPC_WARN_EMULATED(type)                                       \
        do {                                                             \
                atomic_inc(&ppc_emulated.type.val);                      \
                if (ppc_warn_emulated)                                   \
@@ -66,8 +67,22 @@ extern void ppc_warn_emulated_print(const char *type);
 
 #else /* !CONFIG_PPC_EMULATED_STATS */
 
-#define PPC_WARN_EMULATED(type)        do { } while (0)
+#define __PPC_WARN_EMULATED(type)      do { } while (0)
 
 #endif /* !CONFIG_PPC_EMULATED_STATS */
 
+#define PPC_WARN_EMULATED(type, regs)                                  \
+       do {                                                            \
+               perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,           \
+                       1, 0, regs, 0);                                 \
+               __PPC_WARN_EMULATED(type);                              \
+       } while (0)
+
+#define PPC_WARN_ALIGNMENT(type, regs)                                 \
+       do {                                                            \
+               perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS,           \
+                       1, 0, regs, regs->dar);                         \
+               __PPC_WARN_EMULATED(type);                              \
+       } while (0)
+
 #endif /* _ASM_POWERPC_EMULATED_OPS_H */
index 6251a4b10be7a9a2004592b6aa7dff7994c09b0e..c27caac47ad1d6c92980ea78d13e03778053a13f 100644 (file)
@@ -274,6 +274,8 @@ struct hcall_stats {
        unsigned long   num_calls;      /* number of calls (on this CPU) */
        unsigned long   tb_total;       /* total wall time (mftb) of calls. */
        unsigned long   purr_total;     /* total cpu time (PURR) of calls. */
+       unsigned long   tb_start;
+       unsigned long   purr_start;
 };
 #define HCALL_STAT_ARRAY_SIZE  ((MAX_HCALL_OPCODE >> 2) + 1)
 
index 6315edc205d8673e1db6702224b477cee0ea602e..bc8dd53f718a1b201a16e91f4864bb221f5acb25 100644 (file)
 #define SPRN_MMCR1     798
 #define SPRN_MMCRA     0x312
 #define   MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
+#define   MMCRA_SDAR_DCACHE_MISS 0x40000000UL
+#define   MMCRA_SDAR_ERAT_MISS   0x20000000UL
 #define   MMCRA_SIHV   0x10000000UL /* state of MSR HV when SIAR set */
 #define   MMCRA_SIPR   0x08000000UL /* state of MSR PR when SIAR set */
 #define   MMCRA_SLOT   0x07000000UL /* SLOT bits (37-39) */
diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
new file mode 100644 (file)
index 0000000..cbe2297
--- /dev/null
@@ -0,0 +1,133 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM powerpc
+
+#if !defined(_TRACE_POWERPC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_POWERPC_H
+
+#include <linux/tracepoint.h>
+
+struct pt_regs;
+
+TRACE_EVENT(irq_entry,
+
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+               __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+               __entry->regs = regs;
+       ),
+
+       TP_printk("pt_regs=%p", __entry->regs)
+);
+
+TRACE_EVENT(irq_exit,
+
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+               __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+               __entry->regs = regs;
+       ),
+
+       TP_printk("pt_regs=%p", __entry->regs)
+);
+
+TRACE_EVENT(timer_interrupt_entry,
+
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+               __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+               __entry->regs = regs;
+       ),
+
+       TP_printk("pt_regs=%p", __entry->regs)
+);
+
+TRACE_EVENT(timer_interrupt_exit,
+
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+               __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+               __entry->regs = regs;
+       ),
+
+       TP_printk("pt_regs=%p", __entry->regs)
+);
+
+#ifdef CONFIG_PPC_PSERIES
+extern void hcall_tracepoint_regfunc(void);
+extern void hcall_tracepoint_unregfunc(void);
+
+TRACE_EVENT_FN(hcall_entry,
+
+       TP_PROTO(unsigned long opcode, unsigned long *args),
+
+       TP_ARGS(opcode, args),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, opcode)
+       ),
+
+       TP_fast_assign(
+               __entry->opcode = opcode;
+       ),
+
+       TP_printk("opcode=%lu", __entry->opcode),
+
+       hcall_tracepoint_regfunc, hcall_tracepoint_unregfunc
+);
+
+TRACE_EVENT_FN(hcall_exit,
+
+       TP_PROTO(unsigned long opcode, unsigned long retval,
+               unsigned long *retbuf),
+
+       TP_ARGS(opcode, retval, retbuf),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, opcode)
+               __field(unsigned long, retval)
+       ),
+
+       TP_fast_assign(
+               __entry->opcode = opcode;
+               __entry->retval = retval;
+       ),
+
+       TP_printk("opcode=%lu retval=%lu", __entry->opcode, __entry->retval),
+
+       hcall_tracepoint_regfunc, hcall_tracepoint_unregfunc
+);
+#endif
+
+#endif /* _TRACE_POWERPC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH asm
+#define TRACE_INCLUDE_FILE trace
+
+#include <trace/define_trace.h>
index a5b632e52faea9dab9c63921430d572ca1e9f0f2..3839839f83c79ad8544ef0bbf6ded33e0d96ce13 100644 (file)
@@ -732,7 +732,7 @@ int fix_alignment(struct pt_regs *regs)
 
 #ifdef CONFIG_SPE
        if ((instr >> 26) == 0x4) {
-               PPC_WARN_EMULATED(spe);
+               PPC_WARN_ALIGNMENT(spe, regs);
                return emulate_spe(regs, reg, instr);
        }
 #endif
@@ -786,7 +786,7 @@ int fix_alignment(struct pt_regs *regs)
                        flags |= SPLT;
                        nb = 8;
                }
-               PPC_WARN_EMULATED(vsx);
+               PPC_WARN_ALIGNMENT(vsx, regs);
                return emulate_vsx(addr, reg, areg, regs, flags, nb);
        }
 #endif
@@ -794,7 +794,7 @@ int fix_alignment(struct pt_regs *regs)
         * the exception of DCBZ which is handled as a special case here
         */
        if (instr == DCBZ) {
-               PPC_WARN_EMULATED(dcbz);
+               PPC_WARN_ALIGNMENT(dcbz, regs);
                return emulate_dcbz(regs, addr);
        }
        if (unlikely(nb == 0))
@@ -804,7 +804,7 @@ int fix_alignment(struct pt_regs *regs)
         * function
         */
        if (flags & M) {
-               PPC_WARN_EMULATED(multiple);
+               PPC_WARN_ALIGNMENT(multiple, regs);
                return emulate_multiple(regs, addr, reg, nb,
                                        flags, instr, swiz);
        }
@@ -825,11 +825,11 @@ int fix_alignment(struct pt_regs *regs)
 
        /* Special case for 16-byte FP loads and stores */
        if (nb == 16) {
-               PPC_WARN_EMULATED(fp_pair);
+               PPC_WARN_ALIGNMENT(fp_pair, regs);
                return emulate_fp_pair(addr, reg, flags);
        }
 
-       PPC_WARN_EMULATED(unaligned);
+       PPC_WARN_ALIGNMENT(unaligned, regs);
 
        /* If we are loading, get the data from user space, else
         * get it from register values
index 9763267e38b46cbcdb2d6c9bfffd83d61b1eeb08..bdcb557d470ab1a3f272f0783e0e77244f88cbe2 100644 (file)
@@ -551,7 +551,7 @@ restore:
 BEGIN_FW_FTR_SECTION
        ld      r5,SOFTE(r1)
 FW_FTR_SECTION_ELSE
-       b       iseries_check_pending_irqs
+       b       .Liseries_check_pending_irqs
 ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 2:
        TRACE_AND_RESTORE_IRQ(r5);
@@ -623,7 +623,7 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 
 #endif /* CONFIG_PPC_BOOK3E */
 
-iseries_check_pending_irqs:
+.Liseries_check_pending_irqs:
 #ifdef CONFIG_PPC_ISERIES
        ld      r5,SOFTE(r1)
        cmpdi   0,r5,0
index 1808876edcc91d4f43fdff8fde2c8e1770fc832f..c7eb4e0eb86cae27f3fcd510519d5177b124fa5e 100644 (file)
@@ -185,12 +185,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
         * prolog code of the PerformanceMonitor one. A little
         * trickery is thus necessary
         */
+performance_monitor_pSeries_1:
        . = 0xf00
        b       performance_monitor_pSeries
 
+altivec_unavailable_pSeries_1:
        . = 0xf20
        b       altivec_unavailable_pSeries
 
+vsx_unavailable_pSeries_1:
        . = 0xf40
        b       vsx_unavailable_pSeries
 
index e5d1211779840f029841e1591952c3b5ec9b4fae..02a334662cc02148bf6f1cb9ba85cf118d799059 100644 (file)
@@ -70,6 +70,8 @@
 #include <asm/firmware.h>
 #include <asm/lv1call.h>
 #endif
+#define CREATE_TRACE_POINTS
+#include <asm/trace.h>
 
 int __irq_offset_value;
 static int ppc_spurious_interrupts;
@@ -325,6 +327,8 @@ void do_IRQ(struct pt_regs *regs)
        struct pt_regs *old_regs = set_irq_regs(regs);
        unsigned int irq;
 
+       trace_irq_entry(regs);
+
        irq_enter();
 
        check_stack_overflow();
@@ -348,6 +352,8 @@ void do_IRQ(struct pt_regs *regs)
                timer_interrupt(regs);
        }
 #endif
+
+       trace_irq_exit(regs);
 }
 
 void __init init_IRQ(void)
index 87f1663584b054182b3f7719fe96088e637fb78c..1eb85fbf53a50277f48be5ae5d8c57772d216d61 100644 (file)
@@ -1165,7 +1165,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
         */
        if (record) {
                struct perf_sample_data data = {
-                       .addr   = 0,
+                       .addr   = ~0ULL,
                        .period = event->hw.last_period,
                };
 
index 0f4c1c73a6adea4ca405b3ec223faca3370e0ee3..199de527d411446918651bd374bc5e9586ace46f 100644 (file)
 #define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK       0x7f
 
-/*
- * Bits in MMCRA
- */
-
 /*
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
index c351b3a57fbb3361cb04103e95f514b86dcaff52..98b6a729a9dd127cc2c88e799b58bdbea2fa313c 100644 (file)
 #define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK       0x7f
 
-/*
- * Bits in MMCRA
- */
-
 /*
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
@@ -390,7 +386,7 @@ static int power5_compute_mmcr(u64 event[], int n_ev,
                               unsigned int hwc[], unsigned long mmcr[])
 {
        unsigned long mmcr1 = 0;
-       unsigned long mmcra = 0;
+       unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
        unsigned int pmc, unit, byte, psel;
        unsigned int ttm, grp;
        int i, isbus, bit, grsel;
index ca399ba5034c9d13373484476312b6c7bc8691f2..84a607bda8fbc129562943d7d2ce7fe0ee0f11bd 100644 (file)
@@ -178,7 +178,7 @@ static int p6_compute_mmcr(u64 event[], int n_ev,
                           unsigned int hwc[], unsigned long mmcr[])
 {
        unsigned long mmcr1 = 0;
-       unsigned long mmcra = 0;
+       unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
        int i;
        unsigned int pmc, ev, b, u, s, psel;
        unsigned int ttmset = 0;
index 28a4daacdc0222cc6b80c3824bf1529ea2f9556f..852f7b7f6b4045801df807b997c7b110426ce7a6 100644 (file)
 #define MMCR1_PMCSEL_SH(n)     (MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK       0xff
 
-/*
- * Bits in MMCRA
- */
-
 /*
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
@@ -230,7 +226,7 @@ static int power7_compute_mmcr(u64 event[], int n_ev,
                               unsigned int hwc[], unsigned long mmcr[])
 {
        unsigned long mmcr1 = 0;
-       unsigned long mmcra = 0;
+       unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS;
        unsigned int pmc, unit, combine, l2sel, psel;
        unsigned int pmc_inuse = 0;
        int i;
index 479574413a93fa5c7aa3b38e28537b2830517f90..8eff48e20dba8ae056f9f4469642a5cdc453aefc 100644 (file)
@@ -83,10 +83,6 @@ static short mmcr1_adder_bits[8] = {
        MMCR1_PMC8_ADDER_SEL_SH
 };
 
-/*
- * Bits in MMCRA
- */
-
 /*
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
index 4271f7a655a3adfb5e0636e83c8f6e59b61e1868..845c72ab7357884c580a1c6f3f3217ed4987ee75 100644 (file)
@@ -660,6 +660,7 @@ late_initcall(check_cache_coherency);
 
 #ifdef CONFIG_DEBUG_FS
 struct dentry *powerpc_debugfs_root;
+EXPORT_SYMBOL(powerpc_debugfs_root);
 
 static int powerpc_debugfs_init(void)
 {
index a136a11c490d0f36a9b32812132e682c38262f8f..36707dec94d775376c0b59eac099ff5e8da75116 100644 (file)
@@ -54,6 +54,7 @@
 #include <linux/irq.h>
 #include <linux/delay.h>
 #include <linux/perf_event.h>
+#include <asm/trace.h>
 
 #include <asm/io.h>
 #include <asm/processor.h>
@@ -571,6 +572,8 @@ void timer_interrupt(struct pt_regs * regs)
        struct clock_event_device *evt = &decrementer->event;
        u64 now;
 
+       trace_timer_interrupt_entry(regs);
+
        /* Ensure a positive value is written to the decrementer, or else
         * some CPUs will continue to take decrementer exceptions */
        set_dec(DECREMENTER_MAX);
@@ -590,6 +593,7 @@ void timer_interrupt(struct pt_regs * regs)
                now = decrementer->next_tb - now;
                if (now <= DECREMENTER_MAX)
                        set_dec((int)now);
+               trace_timer_interrupt_exit(regs);
                return;
        }
        old_regs = set_irq_regs(regs);
@@ -620,6 +624,8 @@ void timer_interrupt(struct pt_regs * regs)
 
        irq_exit();
        set_irq_regs(old_regs);
+
+       trace_timer_interrupt_exit(regs);
 }
 
 void wakeup_decrementer(void)
index 6f0ae1a9bfae6d78b31afdf15f1c22d554e5900d..9d1f9354d6cafcf12fd7719d599615e31e55b2d9 100644 (file)
@@ -759,7 +759,7 @@ static int emulate_instruction(struct pt_regs *regs)
 
        /* Emulate the mfspr rD, PVR. */
        if ((instword & PPC_INST_MFSPR_PVR_MASK) == PPC_INST_MFSPR_PVR) {
-               PPC_WARN_EMULATED(mfpvr);
+               PPC_WARN_EMULATED(mfpvr, regs);
                rd = (instword >> 21) & 0x1f;
                regs->gpr[rd] = mfspr(SPRN_PVR);
                return 0;
@@ -767,7 +767,7 @@ static int emulate_instruction(struct pt_regs *regs)
 
        /* Emulating the dcba insn is just a no-op.  */
        if ((instword & PPC_INST_DCBA_MASK) == PPC_INST_DCBA) {
-               PPC_WARN_EMULATED(dcba);
+               PPC_WARN_EMULATED(dcba, regs);
                return 0;
        }
 
@@ -776,7 +776,7 @@ static int emulate_instruction(struct pt_regs *regs)
                int shift = (instword >> 21) & 0x1c;
                unsigned long msk = 0xf0000000UL >> shift;
 
-               PPC_WARN_EMULATED(mcrxr);
+               PPC_WARN_EMULATED(mcrxr, regs);
                regs->ccr = (regs->ccr & ~msk) | ((regs->xer >> shift) & msk);
                regs->xer &= ~0xf0000000UL;
                return 0;
@@ -784,19 +784,19 @@ static int emulate_instruction(struct pt_regs *regs)
 
        /* Emulate load/store string insn. */
        if ((instword & PPC_INST_STRING_GEN_MASK) == PPC_INST_STRING) {
-               PPC_WARN_EMULATED(string);
+               PPC_WARN_EMULATED(string, regs);
                return emulate_string_inst(regs, instword);
        }
 
        /* Emulate the popcntb (Population Count Bytes) instruction. */
        if ((instword & PPC_INST_POPCNTB_MASK) == PPC_INST_POPCNTB) {
-               PPC_WARN_EMULATED(popcntb);
+               PPC_WARN_EMULATED(popcntb, regs);
                return emulate_popcntb_inst(regs, instword);
        }
 
        /* Emulate isel (Integer Select) instruction */
        if ((instword & PPC_INST_ISEL_MASK) == PPC_INST_ISEL) {
-               PPC_WARN_EMULATED(isel);
+               PPC_WARN_EMULATED(isel, regs);
                return emulate_isel(regs, instword);
        }
 
@@ -995,7 +995,7 @@ void SoftwareEmulation(struct pt_regs *regs)
 #ifdef CONFIG_MATH_EMULATION
        errcode = do_mathemu(regs);
        if (errcode >= 0)
-               PPC_WARN_EMULATED(math);
+               PPC_WARN_EMULATED(math, regs);
 
        switch (errcode) {
        case 0:
@@ -1018,7 +1018,7 @@ void SoftwareEmulation(struct pt_regs *regs)
 #elif defined(CONFIG_8XX_MINIMAL_FPEMU)
        errcode = Soft_emulate_8xx(regs);
        if (errcode >= 0)
-               PPC_WARN_EMULATED(8xx);
+               PPC_WARN_EMULATED(8xx, regs);
 
        switch (errcode) {
        case 0:
@@ -1129,7 +1129,7 @@ void altivec_assist_exception(struct pt_regs *regs)
 
        flush_altivec_to_thread(current);
 
-       PPC_WARN_EMULATED(altivec);
+       PPC_WARN_EMULATED(altivec, regs);
        err = emulate_altivec(regs);
        if (err == 0) {
                regs->nip += 4;         /* skip emulated instruction */
index 75f3267fdc300977f26ac6cc6e2a0db41add6525..e68beac0a171f41918e37a47aed0722bc0081218 100644 (file)
@@ -26,11 +26,11 @@ BEGIN_FTR_SECTION
        srd     r8,r5,r11
 
        mtctr   r8
-setup:
+.Lsetup:
        dcbt    r9,r4
        dcbz    r9,r3
        add     r9,r9,r12
-       bdnz    setup
+       bdnz    .Lsetup
 END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
        addi    r3,r3,-8
        srdi    r8,r5,7         /* page is copied in 128 byte strides */
index c1427b3634ec8341206d575e4643c7633ffc977d..383a5d0e9818a07758463bf3ddf29a835eb9af99 100644 (file)
        
 #define STK_PARM(i)     (48 + ((i)-3)*8)
 
-#ifdef CONFIG_HCALL_STATS
+#ifdef CONFIG_TRACEPOINTS
+
+       .section        ".toc","aw"
+
+       .globl hcall_tracepoint_refcount
+hcall_tracepoint_refcount:
+       .llong  0
+
+       .section        ".text"
+
 /*
  * precall must preserve all registers.  use unused STK_PARM()
- * areas to save snapshots and opcode.
+ * areas to save snapshots and opcode. We branch around this
+ * in early init (eg when populating the MMU hashtable) by using an
+ * unconditional cpu feature.
  */
-#define HCALL_INST_PRECALL                                     \
-       std     r3,STK_PARM(r3)(r1);    /* save opcode */       \
-       mftb    r0;                     /* get timebase and */  \
-       std     r0,STK_PARM(r5)(r1);    /* save for later */    \
+#define HCALL_INST_PRECALL(FIRST_REG)                          \
 BEGIN_FTR_SECTION;                                             \
-       mfspr   r0,SPRN_PURR;           /* get PURR and */      \
-       std     r0,STK_PARM(r6)(r1);    /* save for later */    \
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);
-       
+       b       1f;                                             \
+END_FTR_SECTION(0, 1);                                         \
+       ld      r12,hcall_tracepoint_refcount@toc(r2);          \
+       cmpdi   r12,0;                                          \
+       beq+    1f;                                             \
+       mflr    r0;                                             \
+       std     r3,STK_PARM(r3)(r1);                            \
+       std     r4,STK_PARM(r4)(r1);                            \
+       std     r5,STK_PARM(r5)(r1);                            \
+       std     r6,STK_PARM(r6)(r1);                            \
+       std     r7,STK_PARM(r7)(r1);                            \
+       std     r8,STK_PARM(r8)(r1);                            \
+       std     r9,STK_PARM(r9)(r1);                            \
+       std     r10,STK_PARM(r10)(r1);                          \
+       std     r0,16(r1);                                      \
+       addi    r4,r1,STK_PARM(FIRST_REG);                      \
+       stdu    r1,-STACK_FRAME_OVERHEAD(r1);                   \
+       bl      .__trace_hcall_entry;                           \
+       addi    r1,r1,STACK_FRAME_OVERHEAD;                     \
+       ld      r0,16(r1);                                      \
+       ld      r3,STK_PARM(r3)(r1);                            \
+       ld      r4,STK_PARM(r4)(r1);                            \
+       ld      r5,STK_PARM(r5)(r1);                            \
+       ld      r6,STK_PARM(r6)(r1);                            \
+       ld      r7,STK_PARM(r7)(r1);                            \
+       ld      r8,STK_PARM(r8)(r1);                            \
+       ld      r9,STK_PARM(r9)(r1);                            \
+       ld      r10,STK_PARM(r10)(r1);                          \
+       mtlr    r0;                                             \
+1:
+
 /*
  * postcall is performed immediately before function return which
  * allows liberal use of volatile registers.  We branch around this
  * in early init (eg when populating the MMU hashtable) by using an
  * unconditional cpu feature.
  */
-#define HCALL_INST_POSTCALL                                    \
+#define __HCALL_INST_POSTCALL                                  \
 BEGIN_FTR_SECTION;                                             \
        b       1f;                                             \
 END_FTR_SECTION(0, 1);                                         \
-       ld      r4,STK_PARM(r3)(r1);    /* validate opcode */   \
-       cmpldi  cr7,r4,MAX_HCALL_OPCODE;                        \
-       bgt-    cr7,1f;                                         \
-                                                               \
-       /* get time and PURR snapshots after hcall */           \
-       mftb    r7;                     /* timebase after */    \
-BEGIN_FTR_SECTION;                                             \
-       mfspr   r8,SPRN_PURR;           /* PURR after */        \
-       ld      r6,STK_PARM(r6)(r1);    /* PURR before */       \
-       subf    r6,r6,r8;               /* delta */             \
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);                           \
-       ld      r5,STK_PARM(r5)(r1);    /* timebase before */   \
-       subf    r5,r5,r7;               /* time delta */        \
-                                                               \
-       /* calculate address of stat structure r4 = opcode */   \
-       srdi    r4,r4,2;                /* index into array */  \
-       mulli   r4,r4,HCALL_STAT_SIZE;                          \
-       LOAD_REG_ADDR(r7, per_cpu__hcall_stats);                \
-       add     r4,r4,r7;                                       \
-       ld      r7,PACA_DATA_OFFSET(r13); /* per cpu offset */  \
-       add     r4,r4,r7;                                       \
-                                                               \
-       /* update stats */                                      \
-       ld      r7,HCALL_STAT_CALLS(r4); /* count */            \
-       addi    r7,r7,1;                                        \
-       std     r7,HCALL_STAT_CALLS(r4);                        \
-       ld      r7,HCALL_STAT_TB(r4);   /* timebase */          \
-       add     r7,r7,r5;                                       \
-       std     r7,HCALL_STAT_TB(r4);                           \
-BEGIN_FTR_SECTION;                                             \
-       ld      r7,HCALL_STAT_PURR(r4); /* PURR */              \
-       add     r7,r7,r6;                                       \
-       std     r7,HCALL_STAT_PURR(r4);                         \
-END_FTR_SECTION_IFSET(CPU_FTR_PURR);                           \
+       ld      r12,hcall_tracepoint_refcount@toc(r2);          \
+       cmpdi   r12,0;                                          \
+       beq+    1f;                                             \
+       mflr    r0;                                             \
+       ld      r6,STK_PARM(r3)(r1);                            \
+       std     r3,STK_PARM(r3)(r1);                            \
+       mr      r4,r3;                                          \
+       mr      r3,r6;                                          \
+       std     r0,16(r1);                                      \
+       stdu    r1,-STACK_FRAME_OVERHEAD(r1);                   \
+       bl      .__trace_hcall_exit;                            \
+       addi    r1,r1,STACK_FRAME_OVERHEAD;                     \
+       ld      r0,16(r1);                                      \
+       ld      r3,STK_PARM(r3)(r1);                            \
+       mtlr    r0;                                             \
 1:
+
+#define HCALL_INST_POSTCALL_NORETS                             \
+       li      r5,0;                                           \
+       __HCALL_INST_POSTCALL
+
+#define HCALL_INST_POSTCALL(BUFREG)                            \
+       mr      r5,BUFREG;                                      \
+       __HCALL_INST_POSTCALL
+
 #else
-#define HCALL_INST_PRECALL
-#define HCALL_INST_POSTCALL
+#define HCALL_INST_PRECALL(FIRST_ARG)
+#define HCALL_INST_POSTCALL_NORETS
+#define HCALL_INST_POSTCALL(BUFREG)
 #endif
 
        .text
@@ -86,11 +112,11 @@ _GLOBAL(plpar_hcall_norets)
        mfcr    r0
        stw     r0,8(r1)
 
-       HCALL_INST_PRECALL
+       HCALL_INST_PRECALL(r4)
 
        HVSC                            /* invoke the hypervisor */
 
-       HCALL_INST_POSTCALL
+       HCALL_INST_POSTCALL_NORETS
 
        lwz     r0,8(r1)
        mtcrf   0xff,r0
@@ -102,7 +128,7 @@ _GLOBAL(plpar_hcall)
        mfcr    r0
        stw     r0,8(r1)
 
-       HCALL_INST_PRECALL
+       HCALL_INST_PRECALL(r5)
 
        std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
 
@@ -121,7 +147,7 @@ _GLOBAL(plpar_hcall)
        std     r6, 16(r12)
        std     r7, 24(r12)
 
-       HCALL_INST_POSTCALL
+       HCALL_INST_POSTCALL(r12)
 
        lwz     r0,8(r1)
        mtcrf   0xff,r0
@@ -168,7 +194,7 @@ _GLOBAL(plpar_hcall9)
        mfcr    r0
        stw     r0,8(r1)
 
-       HCALL_INST_PRECALL
+       HCALL_INST_PRECALL(r5)
 
        std     r4,STK_PARM(r4)(r1)     /* Save ret buffer */
 
@@ -196,7 +222,7 @@ _GLOBAL(plpar_hcall9)
        std     r11,56(r12)
        std     r0, 64(r12)
 
-       HCALL_INST_POSTCALL
+       HCALL_INST_POSTCALL(r12)
 
        lwz     r0,8(r1)
        mtcrf   0xff,r0
index 3631a4f277eb2ab8aaf02676d7124d6a2a430e44..2f58c71b72593911694cc0cd6a33588a0259ef20 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/hvcall.h>
 #include <asm/firmware.h>
 #include <asm/cputable.h>
+#include <asm/trace.h>
 
 DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
 
@@ -100,6 +101,35 @@ static const struct file_operations hcall_inst_seq_fops = {
 #define        HCALL_ROOT_DIR          "hcall_inst"
 #define CPU_NAME_BUF_SIZE      32
 
+
+static void probe_hcall_entry(unsigned long opcode, unsigned long *args)
+{
+       struct hcall_stats *h;
+
+       if (opcode > MAX_HCALL_OPCODE)
+               return;
+
+       h = &get_cpu_var(hcall_stats)[opcode / 4];
+       h->tb_start = mftb();
+       h->purr_start = mfspr(SPRN_PURR);
+}
+
+static void probe_hcall_exit(unsigned long opcode, unsigned long retval,
+                            unsigned long *retbuf)
+{
+       struct hcall_stats *h;
+
+       if (opcode > MAX_HCALL_OPCODE)
+               return;
+
+       h = &__get_cpu_var(hcall_stats)[opcode / 4];
+       h->num_calls++;
+       h->tb_total = mftb() - h->tb_start;
+       h->purr_total = mfspr(SPRN_PURR) - h->purr_start;
+
+       put_cpu_var(hcall_stats);
+}
+
 static int __init hcall_inst_init(void)
 {
        struct dentry *hcall_root;
@@ -110,6 +140,14 @@ static int __init hcall_inst_init(void)
        if (!firmware_has_feature(FW_FEATURE_LPAR))
                return 0;
 
+       if (register_trace_hcall_entry(probe_hcall_entry))
+               return -EINVAL;
+
+       if (register_trace_hcall_exit(probe_hcall_exit)) {
+               unregister_trace_hcall_entry(probe_hcall_entry);
+               return -EINVAL;
+       }
+
        hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL);
        if (!hcall_root)
                return -ENOMEM;
index 903eb9eec687471628c37e30026ae6aad5c35548..0707653612bacbc4d7dcf71f45bb4436275a7202 100644 (file)
@@ -39,6 +39,7 @@
 #include <asm/cputable.h>
 #include <asm/udbg.h>
 #include <asm/smp.h>
+#include <asm/trace.h>
 
 #include "plpar_wrappers.h"
 #include "pseries.h"
@@ -661,3 +662,35 @@ void arch_free_page(struct page *page, int order)
 EXPORT_SYMBOL(arch_free_page);
 
 #endif
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * We optimise our hcall path by placing hcall_tracepoint_refcount
+ * directly in the TOC so we can check if the hcall tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long hcall_tracepoint_refcount;
+
+void hcall_tracepoint_regfunc(void)
+{
+       hcall_tracepoint_refcount++;
+}
+
+void hcall_tracepoint_unregfunc(void)
+{
+       hcall_tracepoint_refcount--;
+}
+
+void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
+{
+       trace_hcall_entry(opcode, args);
+}
+
+void __trace_hcall_exit(long opcode, unsigned long retval,
+                       unsigned long *retbuf)
+{
+       trace_hcall_exit(opcode, retval, retbuf);
+}
+#endif
index 72ace9515a07a44525778899e1ea04b32b3accbc..178084b4377ccbebb2fb089ea1c17742f9addbfe 100644 (file)
@@ -49,6 +49,7 @@ config X86
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_BZIP2
        select HAVE_KERNEL_LZMA
+       select HAVE_HW_BREAKPOINT
        select HAVE_ARCH_KMEMCHECK
 
 config OUTPUT_FORMAT
index d105f29bb6bb7c9b75fe3369d66f68f2f3ada5ba..7d0b681a132bb0b241717695e33f61c33e21cb46 100644 (file)
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
 config HAVE_MMIOTRACE_SUPPORT
        def_bool y
 
+config X86_DECODER_SELFTEST
+       bool "x86 instruction decoder selftest"
+       depends on DEBUG_KERNEL
+       ---help---
+        Perform x86 instruction decoder selftests at build time.
+        This option is useful for checking the sanity of x86 instruction
+        decoder code.
+        If unsure, say "N".
+
 #
 # IO delay types:
 #
index d2d24c9ee64d926de80a249330e938095623ca85..78b32be55e9e7a82dafcdf96a5b59be4a26decd8 100644 (file)
@@ -155,6 +155,9 @@ all: bzImage
 KBUILD_IMAGE := $(boot)/bzImage
 
 bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+       $(Q)$(MAKE) $(build)=arch/x86/tools posttest
+endif
        $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
        $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
        $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
index 4a8e80cdcfa57a7faff08a2042a6b6fb64f5ae66..9f828f87ca35f418d24d4b7674477f643eea773d 100644 (file)
@@ -10,6 +10,7 @@ header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
 header-y += processor-flags.h
+header-y += hw_breakpoint.h
 
 unifdef-y += e820.h
 unifdef-y += ist.h
index bb70e397aa84c0e7cfa452c1efa9298b8a58c985..7a15588e45d47265391ec508c787d98b374aaeb9 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <linux/user.h>
 #include <linux/elfcore.h>
+#include <asm/debugreg.h>
 
 /*
  * fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
                        >> PAGE_SHIFT;
        dump->u_dsize -= dump->u_tsize;
        dump->u_ssize = 0;
-       dump->u_debugreg[0] = current->thread.debugreg0;
-       dump->u_debugreg[1] = current->thread.debugreg1;
-       dump->u_debugreg[2] = current->thread.debugreg2;
-       dump->u_debugreg[3] = current->thread.debugreg3;
-       dump->u_debugreg[4] = 0;
-       dump->u_debugreg[5] = 0;
-       dump->u_debugreg[6] = current->thread.debugreg6;
-       dump->u_debugreg[7] = current->thread.debugreg7;
+       aout_dump_debugregs(dump);
 
        if (dump->start_stack < TASK_SIZE)
                dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
index 3ea6f37be9e2d29a69f6982bb3ddcc80554f1652..8240f76b531e0959be5a4fa823b1820d5d5952b5 100644 (file)
@@ -18,6 +18,7 @@
 #define DR_TRAP1       (0x2)           /* db1 */
 #define DR_TRAP2       (0x4)           /* db2 */
 #define DR_TRAP3       (0x8)           /* db3 */
+#define DR_TRAP_BITS   (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
 
 #define DR_STEP                (0x4000)        /* single-step */
 #define DR_SWITCH      (0x8000)        /* task switch */
@@ -49,6 +50,8 @@
 
 #define DR_LOCAL_ENABLE_SHIFT 0    /* Extra shift to the local enable bit */
 #define DR_GLOBAL_ENABLE_SHIFT 1   /* Extra shift to the global enable bit */
+#define DR_LOCAL_ENABLE (0x1)      /* Local enable for reg 0 */
+#define DR_GLOBAL_ENABLE (0x2)     /* Global enable for reg 0 */
 #define DR_ENABLE_SIZE 2           /* 2 enable bits per register */
 
 #define DR_LOCAL_ENABLE_MASK (0x55)  /* Set  local bits for all 4 regs */
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+/*
+ * HW breakpoint additions
+ */
+#ifdef __KERNEL__
+
+DECLARE_PER_CPU(unsigned long, cpu_dr7);
+
+static inline void hw_breakpoint_disable(void)
+{
+       /* Zero the control register for HW Breakpoint */
+       set_debugreg(0UL, 7);
+
+       /* Zero-out the individual HW breakpoint address registers */
+       set_debugreg(0UL, 0);
+       set_debugreg(0UL, 1);
+       set_debugreg(0UL, 2);
+       set_debugreg(0UL, 3);
+}
+
+static inline int hw_breakpoint_active(void)
+{
+       return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+}
+
+extern void aout_dump_debugregs(struct user *dump);
+
+extern void hw_breakpoint_restore(void);
+
+#endif /* __KERNEL__ */
+
 #endif /* _ASM_X86_DEBUGREG_H */
index 82e3e8f010439cde125a3a75c4f0d65a7068b573..108eb6fd1ae7f6da8c94ca72197fdf7f4e3ec74d 100644 (file)
@@ -20,11 +20,11 @@ typedef struct {
        unsigned int irq_call_count;
        unsigned int irq_tlb_count;
 #endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
        unsigned int irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
        unsigned int irq_threshold_count;
-# endif
 #endif
 } ____cacheline_aligned irq_cpustat_t;
 
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644 (file)
index 0000000..0675a7c
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef        _I386_HW_BREAKPOINT_H
+#define        _I386_HW_BREAKPOINT_H
+
+#ifdef __KERNEL__
+#define        __ARCH_HW_BREAKPOINT_H
+
+/*
+ * The name should probably be handled at a higher
+ * level, when dealing with the user
+ * (display/resolving).
+ */
+struct arch_hw_breakpoint {
+       char            *name; /* Contains name of the symbol to set bkpt */
+       unsigned long   address;
+       u8              len;
+       u8              type;
+};
+
+#include <linux/kdebug.h>
+#include <linux/percpu.h>
+#include <linux/list.h>
+
+/* Available HW breakpoint length encodings */
+#define X86_BREAKPOINT_LEN_1           0x40
+#define X86_BREAKPOINT_LEN_2           0x44
+#define X86_BREAKPOINT_LEN_4           0x4c
+#define X86_BREAKPOINT_LEN_EXECUTE     0x40
+
+#ifdef CONFIG_X86_64
+#define X86_BREAKPOINT_LEN_8           0x48
+#endif
+
+/* Available HW breakpoint type encodings */
+
+/* trigger on instruction execute */
+#define X86_BREAKPOINT_EXECUTE 0x80
+/* trigger on memory write */
+#define X86_BREAKPOINT_WRITE   0x81
+/* trigger on memory read or write */
+#define X86_BREAKPOINT_RW      0x83
+
+/* Total number of available HW breakpoint registers */
+#define HBP_NUM 4
+
+struct perf_event;
+struct pmu;
+
+extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
+                                        struct task_struct *tsk);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+                                          unsigned long val, void *data);
+
+
+int arch_install_hw_breakpoint(struct perf_event *bp);
+void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+void hw_breakpoint_pmu_read(struct perf_event *bp);
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
+
+extern void
+arch_fill_perf_breakpoint(struct perf_event *bp);
+
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
+
+extern int arch_bp_generic_fields(int x86_len, int x86_type,
+                                 int *gen_len, int *gen_type);
+
+extern struct pmu perf_ops_bp;
+
+#endif /* __KERNEL__ */
+#endif /* _I386_HW_BREAKPOINT_H */
+
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644 (file)
index 0000000..205b063
--- /dev/null
@@ -0,0 +1,220 @@
+#ifndef _ASM_X86_INAT_H
+#define _ASM_X86_INAT_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/inat_types.h>
+
+/*
+ * Internal bits. Don't use the bitmasks directly, because these bits are
+ * unstable; use the checking functions below instead.
+ */
+
+#define INAT_OPCODE_TABLE_SIZE 256
+#define INAT_GROUP_TABLE_SIZE 8
+
+/* Legacy last prefixes */
+#define INAT_PFX_OPNDSZ        1       /* 0x66 */ /* LPFX1 */
+#define INAT_PFX_REPE  2       /* 0xF3 */ /* LPFX2 */
+#define INAT_PFX_REPNE 3       /* 0xF2 */ /* LPFX3 */
+/* Other Legacy prefixes */
+#define INAT_PFX_LOCK  4       /* 0xF0 */
+#define INAT_PFX_CS    5       /* 0x2E */
+#define INAT_PFX_DS    6       /* 0x3E */
+#define INAT_PFX_ES    7       /* 0x26 */
+#define INAT_PFX_FS    8       /* 0x64 */
+#define INAT_PFX_GS    9       /* 0x65 */
+#define INAT_PFX_SS    10      /* 0x36 */
+#define INAT_PFX_ADDRSZ        11      /* 0x67 */
+/* x86-64 REX prefix */
+#define INAT_PFX_REX   12      /* 0x4X */
+/* AVX VEX prefixes */
+#define INAT_PFX_VEX2  13      /* 2-byte VEX prefix */
+#define INAT_PFX_VEX3  14      /* 3-byte VEX prefix */
+
+#define INAT_LSTPFX_MAX        3
+#define INAT_LGCPFX_MAX        11
+
+/* Immediate size */
+#define INAT_IMM_BYTE          1
+#define INAT_IMM_WORD          2
+#define INAT_IMM_DWORD         3
+#define INAT_IMM_QWORD         4
+#define INAT_IMM_PTR           5
+#define INAT_IMM_VWORD32       6
+#define INAT_IMM_VWORD         7
+
+/* Legacy prefix */
+#define INAT_PFX_OFFS  0
+#define INAT_PFX_BITS  4
+#define INAT_PFX_MAX    ((1 << INAT_PFX_BITS) - 1)
+#define INAT_PFX_MASK  (INAT_PFX_MAX << INAT_PFX_OFFS)
+/* Escape opcodes */
+#define INAT_ESC_OFFS  (INAT_PFX_OFFS + INAT_PFX_BITS)
+#define INAT_ESC_BITS  2
+#define INAT_ESC_MAX   ((1 << INAT_ESC_BITS) - 1)
+#define INAT_ESC_MASK  (INAT_ESC_MAX << INAT_ESC_OFFS)
+/* Group opcodes (1-16) */
+#define INAT_GRP_OFFS  (INAT_ESC_OFFS + INAT_ESC_BITS)
+#define INAT_GRP_BITS  5
+#define INAT_GRP_MAX   ((1 << INAT_GRP_BITS) - 1)
+#define INAT_GRP_MASK  (INAT_GRP_MAX << INAT_GRP_OFFS)
+/* Immediates */
+#define INAT_IMM_OFFS  (INAT_GRP_OFFS + INAT_GRP_BITS)
+#define INAT_IMM_BITS  3
+#define INAT_IMM_MASK  (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
+/* Flags */
+#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
+#define INAT_MODRM     (1 << (INAT_FLAG_OFFS))
+#define INAT_FORCE64   (1 << (INAT_FLAG_OFFS + 1))
+#define INAT_SCNDIMM   (1 << (INAT_FLAG_OFFS + 2))
+#define INAT_MOFFSET   (1 << (INAT_FLAG_OFFS + 3))
+#define INAT_VARIANT   (1 << (INAT_FLAG_OFFS + 4))
+#define INAT_VEXOK     (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_VEXONLY   (1 << (INAT_FLAG_OFFS + 6))
+/* Attribute making macros for attribute tables */
+#define INAT_MAKE_PREFIX(pfx)  (pfx << INAT_PFX_OFFS)
+#define INAT_MAKE_ESCAPE(esc)  (esc << INAT_ESC_OFFS)
+#define INAT_MAKE_GROUP(grp)   ((grp << INAT_GRP_OFFS) | INAT_MODRM)
+#define INAT_MAKE_IMM(imm)     (imm << INAT_IMM_OFFS)
+
+/* Attribute search APIs */
+extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
+                                            insn_byte_t last_pfx,
+                                            insn_attr_t esc_attr);
+extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
+                                           insn_byte_t last_pfx,
+                                           insn_attr_t esc_attr);
+extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
+                                         insn_byte_t vex_m,
+                                         insn_byte_t vex_pp);
+
+/* Attribute checking functions */
+static inline int inat_is_legacy_prefix(insn_attr_t attr)
+{
+       attr &= INAT_PFX_MASK;
+       return attr && attr <= INAT_LGCPFX_MAX;
+}
+
+static inline int inat_is_address_size_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
+}
+
+static inline int inat_is_operand_size_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
+}
+
+static inline int inat_is_rex_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
+}
+
+static inline int inat_last_prefix_id(insn_attr_t attr)
+{
+       if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
+               return 0;
+       else
+               return attr & INAT_PFX_MASK;
+}
+
+static inline int inat_is_vex_prefix(insn_attr_t attr)
+{
+       attr &= INAT_PFX_MASK;
+       return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_vex3_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_escape(insn_attr_t attr)
+{
+       return attr & INAT_ESC_MASK;
+}
+
+static inline int inat_escape_id(insn_attr_t attr)
+{
+       return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
+}
+
+static inline int inat_is_group(insn_attr_t attr)
+{
+       return attr & INAT_GRP_MASK;
+}
+
+static inline int inat_group_id(insn_attr_t attr)
+{
+       return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
+}
+
+static inline int inat_group_common_attribute(insn_attr_t attr)
+{
+       return attr & ~INAT_GRP_MASK;
+}
+
+static inline int inat_has_immediate(insn_attr_t attr)
+{
+       return attr & INAT_IMM_MASK;
+}
+
+static inline int inat_immediate_size(insn_attr_t attr)
+{
+       return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
+}
+
+static inline int inat_has_modrm(insn_attr_t attr)
+{
+       return attr & INAT_MODRM;
+}
+
+static inline int inat_is_force64(insn_attr_t attr)
+{
+       return attr & INAT_FORCE64;
+}
+
+static inline int inat_has_second_immediate(insn_attr_t attr)
+{
+       return attr & INAT_SCNDIMM;
+}
+
+static inline int inat_has_moffset(insn_attr_t attr)
+{
+       return attr & INAT_MOFFSET;
+}
+
+static inline int inat_has_variant(insn_attr_t attr)
+{
+       return attr & INAT_VARIANT;
+}
+
+static inline int inat_accept_vex(insn_attr_t attr)
+{
+       return attr & INAT_VEXOK;
+}
+
+static inline int inat_must_vex(insn_attr_t attr)
+{
+       return attr & INAT_VEXONLY;
+}
+#endif
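
A minimal sketch of how these attributes are meant to be consumed: via the
checking functions above, never the raw masks (the classification buckets
here are illustrative, not part of the API):

    #include <asm/inat.h>

    /* Classify a single opcode byte using the stable accessors. */
    static int classify_opcode(insn_byte_t b)
    {
            insn_attr_t attr = inat_get_opcode_attribute(b);

            if (inat_is_legacy_prefix(attr))
                    return 1;       /* legacy prefix byte */
            if (inat_is_escape(attr))
                    return 2;       /* escape to a 2/3-byte opcode table */
            if (inat_has_modrm(attr))
                    return 3;       /* opcode followed by a ModRM byte */
            return 0;               /* plain one-byte opcode */
    }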
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644 (file)
index 0000000..cb3c20c
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_INAT_TYPES_H
+#define _ASM_X86_INAT_TYPES_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* Instruction attributes */
+typedef unsigned int insn_attr_t;
+typedef unsigned char insn_byte_t;
+typedef signed int insn_value_t;
+
+#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644 (file)
index 0000000..96c2e0a
--- /dev/null
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_INSN_H
+#define _ASM_X86_INSN_H
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/* insn_attr_t is defined in inat.h */
+#include <asm/inat.h>
+
+struct insn_field {
+       union {
+               insn_value_t value;
+               insn_byte_t bytes[4];
+       };
+       /* !0 if we've run insn_get_xxx() for this field */
+       unsigned char got;
+       unsigned char nbytes;
+};
+
+struct insn {
+       struct insn_field prefixes;     /*
+                                        * Prefixes
+                                        * prefixes.bytes[3]: last prefix
+                                        */
+       struct insn_field rex_prefix;   /* REX prefix */
+       struct insn_field vex_prefix;   /* VEX prefix */
+       struct insn_field opcode;       /*
+                                        * opcode.bytes[0]: opcode1
+                                        * opcode.bytes[1]: opcode2
+                                        * opcode.bytes[2]: opcode3
+                                        */
+       struct insn_field modrm;
+       struct insn_field sib;
+       struct insn_field displacement;
+       union {
+               struct insn_field immediate;
+               struct insn_field moffset1;     /* for 64bit MOV */
+               struct insn_field immediate1;   /* for 64bit imm or off16/32 */
+       };
+       union {
+               struct insn_field moffset2;     /* for 64bit MOV */
+               struct insn_field immediate2;   /* for 64bit imm or seg16 */
+       };
+
+       insn_attr_t attr;
+       unsigned char opnd_bytes;
+       unsigned char addr_bytes;
+       unsigned char length;
+       unsigned char x86_64;
+
+       const insn_byte_t *kaddr;       /* kernel address of insn to analyze */
+       const insn_byte_t *next_byte;
+};
+
+#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
+#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
+#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
+
+#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
+#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
+#define X86_SIB_BASE(sib) ((sib) & 0x07)
+
+#define X86_REX_W(rex) ((rex) & 8)
+#define X86_REX_R(rex) ((rex) & 4)
+#define X86_REX_X(rex) ((rex) & 2)
+#define X86_REX_B(rex) ((rex) & 1)
+
+/* VEX bit flags  */
+#define X86_VEX_W(vex) ((vex) & 0x80)  /* VEX3 Byte2 */
+#define X86_VEX_R(vex) ((vex) & 0x80)  /* VEX2/3 Byte1 */
+#define X86_VEX_X(vex) ((vex) & 0x40)  /* VEX3 Byte1 */
+#define X86_VEX_B(vex) ((vex) & 0x20)  /* VEX3 Byte1 */
+#define X86_VEX_L(vex) ((vex) & 0x04)  /* VEX3 Byte2, VEX2 Byte1 */
+/* VEX bit fields */
+#define X86_VEX3_M(vex)        ((vex) & 0x1f)          /* VEX3 Byte1 */
+#define X86_VEX2_M     1                       /* VEX2.M always 1 */
+#define X86_VEX_V(vex) (((vex) & 0x78) >> 3)   /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_P(vex) ((vex) & 0x03)          /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_M_MAX  0x1f                    /* VEX3.M Maximum value */
+
+/* The last prefix is needed for two-byte and three-byte opcodes */
+static inline insn_byte_t insn_last_prefix(struct insn *insn)
+{
+       return insn->prefixes.bytes[3];
+}
+
+extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
+extern void insn_get_prefixes(struct insn *insn);
+extern void insn_get_opcode(struct insn *insn);
+extern void insn_get_modrm(struct insn *insn);
+extern void insn_get_sib(struct insn *insn);
+extern void insn_get_displacement(struct insn *insn);
+extern void insn_get_immediate(struct insn *insn);
+extern void insn_get_length(struct insn *insn);
+
+/* Attribute will be determined after getting ModRM (for opcode groups) */
+static inline void insn_get_attribute(struct insn *insn)
+{
+       insn_get_modrm(insn);
+}
+
+/* Instruction uses RIP-relative addressing */
+extern int insn_rip_relative(struct insn *insn);
+
+/* Init insn for kernel text */
+static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
+{
+#ifdef CONFIG_X86_64
+       insn_init(insn, kaddr, 1);
+#else /* CONFIG_X86_32 */
+       insn_init(insn, kaddr, 0);
+#endif
+}
+
+static inline int insn_is_avx(struct insn *insn)
+{
+       if (!insn->prefixes.got)
+               insn_get_prefixes(insn);
+       return (insn->vex_prefix.value != 0);
+}
+
+static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
+{
+       if (insn->vex_prefix.nbytes == 2)       /* 2-byte VEX */
+               return X86_VEX2_M;
+       else
+               return X86_VEX3_M(insn->vex_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
+{
+       if (insn->vex_prefix.nbytes == 2)       /* 2-byte VEX */
+               return X86_VEX_P(insn->vex_prefix.bytes[1]);
+       else
+               return X86_VEX_P(insn->vex_prefix.bytes[2]);
+}
+
+/* Offset of each field from kaddr */
+static inline int insn_offset_rex_prefix(struct insn *insn)
+{
+       return insn->prefixes.nbytes;
+}
+static inline int insn_offset_vex_prefix(struct insn *insn)
+{
+       return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
+}
+static inline int insn_offset_opcode(struct insn *insn)
+{
+       return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
+}
+static inline int insn_offset_modrm(struct insn *insn)
+{
+       return insn_offset_opcode(insn) + insn->opcode.nbytes;
+}
+static inline int insn_offset_sib(struct insn *insn)
+{
+       return insn_offset_modrm(insn) + insn->modrm.nbytes;
+}
+static inline int insn_offset_displacement(struct insn *insn)
+{
+       return insn_offset_sib(insn) + insn->sib.nbytes;
+}
+static inline int insn_offset_immediate(struct insn *insn)
+{
+       return insn_offset_displacement(insn) + insn->displacement.nbytes;
+}
+
+#endif /* _ASM_X86_INSN_H */
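
The insn_get_*() calls are cumulative: each one decodes every field before it,
so asking for the length is enough to drive the whole decoder. A minimal
sketch (the helper name is made up):

    #include <asm/insn.h>

    /* Return the byte length of the instruction at a kernel address. */
    static int insn_len_at(const void *kaddr)
    {
            struct insn insn;

            kernel_insn_init(&insn, kaddr);     /* picks 32/64-bit mode */
            insn_get_length(&insn);             /* decodes prefixes..immediate */
            return insn.length;
    }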
index f1363b72364f3e2a53609e77e52379f7f3998b3a..858baa061cfce365a51e48bf162f92cb00a8e8ba 100644 (file)
@@ -108,6 +108,8 @@ struct mce_log {
 #define K8_MCE_THRESHOLD_BANK_5    (MCE_THRESHOLD_BASE + 5 * 9)
 #define K8_MCE_THRESHOLD_DRAM_ECC  (MCE_THRESHOLD_BANK_4 + 0)
 
+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
 #ifdef __KERNEL__
 
 #include <linux/percpu.h>
@@ -118,9 +120,11 @@ extern int mce_disabled;
 extern int mce_p5_enabled;
 
 #ifdef CONFIG_X86_MCE
-void mcheck_init(struct cpuinfo_x86 *c);
+int mcheck_init(void);
+void mcheck_cpu_init(struct cpuinfo_x86 *c);
 #else
-static inline void mcheck_init(struct cpuinfo_x86 *c) {}
+static inline int mcheck_init(void) { return 0; }
+static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE
@@ -214,5 +218,11 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
 
 void mce_log_therm_throt_event(__u64 status);
 
+#ifdef CONFIG_X86_THERMAL_VECTOR
+extern void mcheck_intel_therm_init(void);
+#else
+static inline void mcheck_intel_therm_init(void) { }
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
index ad7ce3fd5065a8354035233de59fd1ba21b8d0e8..8d9f8548a870498e94e3de1ced5811692238962c 100644 (file)
  */
 #define ARCH_PERFMON_EVENT_MASK                                    0xffff
 
+/*
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
+ */
+#define ARCH_PERFMON_EVENT_FILTER_MASK                 0xff840000
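
(For reference, 0xff840000 decomposes as cmask (bits 31-24, 0xff000000) |
inv (bit 23, 0x00800000) | edge (bit 18, 0x00040000), exactly the filters
listed above.)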
+
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL                0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK                (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX                 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX                         0
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
                (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
 
index c9786480f0fe4d074e9575557316d7f4b358be4b..6f8ec1c37e0a8c999f9344082465fd2c9b4570f1 100644 (file)
@@ -30,6 +30,7 @@ struct mm_struct;
 #include <linux/math64.h>
 #include <linux/init.h>
 
+#define HBP_NUM 4
 /*
  * Default implementation of macro that returns current
  * instruction pointer ("program counter").
@@ -422,6 +423,8 @@ extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
 
+struct perf_event;
+
 struct thread_struct {
        /* Cached TLS descriptors: */
        struct desc_struct      tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -443,13 +446,10 @@ struct thread_struct {
        unsigned long           fs;
 #endif
        unsigned long           gs;
-       /* Hardware debugging registers: */
-       unsigned long           debugreg0;
-       unsigned long           debugreg1;
-       unsigned long           debugreg2;
-       unsigned long           debugreg3;
-       unsigned long           debugreg6;
-       unsigned long           debugreg7;
+       /* Saved state of the ptrace breakpoints */
+       struct perf_event       *ptrace_bps[HBP_NUM];
+       /* Debug status used for traps, single steps, etc. */
+       unsigned long           debugreg6;
        /* Fault info: */
        unsigned long           cr2;
        unsigned long           trap_no;
index 0f0d908349aa3f375e87f68802cbcb2e7753f725..3d11fd0f44c5f4f86c060207417710db58758094 100644 (file)
@@ -7,6 +7,7 @@
 
 #ifdef __KERNEL__
 #include <asm/segment.h>
+#include <asm/page_types.h>
 #endif
 
 #ifndef __ASSEMBLY__
@@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
        return regs->sp;
 }
 
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs:      pt_regs from which register value is gotten.
+ * @offset:    offset number of the register.
+ *
+ * regs_get_register() returns the value of a register. The @offset is the
+ * offset of the register within the struct pt_regs pointed to by @regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+                                             unsigned int offset)
+{
+       if (unlikely(offset > MAX_REG_OFFSET))
+               return 0;
+       return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs:      pt_regs which contains kernel stack pointer.
+ * @addr:      address which is checked.
+ *
+ * regs_within_kernel_stack() checks whether @addr is within the kernel
+ * stack page(s). It returns true if so, false otherwise.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+                                          unsigned long addr)
+{
+       return ((addr & ~(THREAD_SIZE - 1))  ==
+               (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs:      pt_regs which contains kernel stack pointer.
+ * @n:         stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns the @n'th entry of the kernel stack
+ * specified by @regs. If the @n'th entry is not within the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+                                                     unsigned int n)
+{
+       unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+       addr += n;
+       if (regs_within_kernel_stack(regs, (unsigned long)addr))
+               return *addr;
+       else
+               return 0;
+}
+
+/* Get Nth argument at function call */
+extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
+                                          unsigned int n);
+
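
These accessors are the building blocks for kprobe-based tracing. A minimal
sketch of a kprobe pre-handler using them (probe registration is omitted;
the register name and printout are illustrative):

    #include <linux/kernel.h>
    #include <linux/kprobes.h>
    #include <asm/ptrace.h>

    static int pre_handler(struct kprobe *p, struct pt_regs *regs)
    {
            /* Resolve a register by name once, then read it by offset. */
            int off = regs_query_register_offset("ax");
            unsigned long ax = regs_get_register(regs, off);

            /* Top entry of the kernel stack at probe time. */
            unsigned long top = regs_get_kernel_stack_nth(regs, 0);

            pr_info("ax=%lx stack0=%lx\n", ax, top);
            return 0;
    }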
 /*
  * These are defined as per linux/ptrace.h, which see.
  */
index d8e5d0cdd678d3b4396c0e7f859b7c3f6ac0d212..4f2e66e29ecc5cc35e749f18e194b2ad11f398d2 100644 (file)
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64)  += sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)   += syscall_64.o vsyscall_64.o
 obj-y                  += bootflag.o e820.o
 obj-y                  += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
-obj-y                  += alternative.o i8253.o pci-nommu.o
+obj-y                  += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y                  += tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)   += trampoline.o
index 68537e957a9b2cd621185e275103c201af9502e6..1d2cb383410ebef206fb48e2ccecbe8fd03f90ce 100644 (file)
@@ -5,6 +5,7 @@
 # Don't trace early stages of a secondary CPU boot
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
+CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
 # Make sure load_percpu_segment has no stackprotector
index cc25c2b4a567c2ca3e020127cefe87b2778f02ee..9053be5d95cd4fb21f4aae94f2e7335af6a6f83a 100644 (file)
@@ -837,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                        boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
        }
 
-#ifdef CONFIG_X86_MCE
        /* Init Machine Check Exception if available. */
-       mcheck_init(c);
-#endif
+       mcheck_cpu_init(c);
 
        select_idle_routine(c);
 
index 721a77ca811536eb2129e02449701948fe9aa9c3..0bcaa3875863aaefa98942e00d93ae5be0fecdb6 100644 (file)
@@ -46,6 +46,9 @@
 
 #include "mce-internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
 int mce_disabled __read_mostly;
 
 #define MISC_MCELOG_MINOR      227
@@ -85,18 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int                     cpu_missing;
 
-static void default_decode_mce(struct mce *m)
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
+
+static int default_decode_mce(struct notifier_block *nb, unsigned long val,
+                              void *data)
 {
        pr_emerg("No human readable MCE decoding support on this CPU type.\n");
        pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+
+       return NOTIFY_STOP;
 }
 
-/*
- * CPU/chipset specific EDAC code can register a callback here to print
- * MCE errors in a human-readable form:
- */
-void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
-EXPORT_SYMBOL(x86_mce_decode_callback);
+static struct notifier_block mce_dec_nb = {
+       .notifier_call = default_decode_mce,
+       .priority      = -1,
+};
 
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -141,6 +152,9 @@ void mce_log(struct mce *mce)
 {
        unsigned next, entry;
 
+       /* Emit the trace record: */
+       trace_mce_record(mce);
+
        mce->finished = 0;
        wmb();
        for (;;) {
@@ -204,9 +218,9 @@ static void print_mce(struct mce *m)
 
        /*
         * Print out human-readable details about the MCE error,
-        * (if the CPU has an implementation for that):
+        * (if the CPU has an implementation for that)
         */
-       x86_mce_decode_callback(m);
+       atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 }
 
 static void print_mce_head(void)
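
The notifier conversion means an EDAC driver can hook the chain at its
default (higher) priority and decode the error before the fallback message
above runs. A minimal sketch of such a consumer (the decoder body is
hypothetical):

    static int edac_decode_mce(struct notifier_block *nb, unsigned long val,
                               void *data)
    {
            struct mce *m = data;

            /* Translate m->status / m->addr into a human-readable report. */
            pr_info("MCE bank %d status %llx\n",
                    m->bank, (unsigned long long)m->status);

            return NOTIFY_STOP;     /* suppress the default fallback */
    }

    static struct notifier_block edac_mce_dec = {
            .notifier_call  = edac_decode_mce,
            /* default priority 0 runs before mce_dec_nb's -1 */
    };

    /* in the driver's init path: */
    atomic_notifier_chain_register(&x86_mce_decoder_chain, &edac_mce_dec);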
@@ -1122,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
 static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_timer(unsigned long data)
+static void mce_start_timer(unsigned long data)
 {
        struct timer_list *t = &per_cpu(mce_timer, data);
        int *n;
@@ -1187,7 +1201,7 @@ int mce_notify_irq(void)
 }
 EXPORT_SYMBOL_GPL(mce_notify_irq);
 
-static int mce_banks_init(void)
+static int __cpuinit __mcheck_cpu_mce_banks_init(void)
 {
        int i;
 
@@ -1206,7 +1220,7 @@ static int mce_banks_init(void)
 /*
  * Initialize Machine Checks for a CPU.
  */
-static int __cpuinit mce_cap_init(void)
+static int __cpuinit __mcheck_cpu_cap_init(void)
 {
        unsigned b;
        u64 cap;
@@ -1228,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
        WARN_ON(banks != 0 && b != banks);
        banks = b;
        if (!mce_banks) {
-               int err = mce_banks_init();
+               int err = __mcheck_cpu_mce_banks_init();
 
                if (err)
                        return err;
@@ -1244,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
        return 0;
 }
 
-static void mce_init(void)
+static void __mcheck_cpu_init_generic(void)
 {
        mce_banks_t all_banks;
        u64 cap;
@@ -1273,7 +1287,7 @@ static void mce_init(void)
 }
 
 /* Add per CPU specific workarounds here */
-static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
                pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1341,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
        return 0;
 }
 
-static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
+static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
 {
        if (c->x86 != 5)
                return;
@@ -1355,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
        }
 }
 
-static void mce_cpu_features(struct cpuinfo_x86 *c)
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 {
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
@@ -1369,7 +1383,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
        }
 }
 
-static void mce_init_timer(void)
+static void __mcheck_cpu_init_timer(void)
 {
        struct timer_list *t = &__get_cpu_var(mce_timer);
        int *n = &__get_cpu_var(mce_next_interval);
@@ -1380,7 +1394,7 @@ static void mce_init_timer(void)
        *n = check_interval * HZ;
        if (!*n)
                return;
-       setup_timer(t, mcheck_timer, smp_processor_id());
+       setup_timer(t, mce_start_timer, smp_processor_id());
        t->expires = round_jiffies(jiffies + *n);
        add_timer_on(t, smp_processor_id());
 }
@@ -1400,27 +1414,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off:
  */
-void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 {
        if (mce_disabled)
                return;
 
-       mce_ancient_init(c);
+       __mcheck_cpu_ancient_init(c);
 
        if (!mce_available(c))
                return;
 
-       if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
+       if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
                mce_disabled = 1;
                return;
        }
 
        machine_check_vector = do_machine_check;
 
-       mce_init();
-       mce_cpu_features(c);
-       mce_init_timer();
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_vendor(c);
+       __mcheck_cpu_init_timer();
        INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
 }
 
 /*
@@ -1640,6 +1655,15 @@ static int __init mcheck_enable(char *str)
 }
 __setup("mce", mcheck_enable);
 
+int __init mcheck_init(void)
+{
+       atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
+
+       mcheck_intel_therm_init();
+
+       return 0;
+}
+
 /*
  * Sysfs support
  */
@@ -1648,7 +1672,7 @@ __setup("mce", mcheck_enable);
  * Disable machine checks on suspend and shutdown. We can't really handle
  * them later.
  */
-static int mce_disable(void)
+static int mce_disable_error_reporting(void)
 {
        int i;
 
@@ -1663,12 +1687,12 @@ static int mce_disable(void)
 
 static int mce_suspend(struct sys_device *dev, pm_message_t state)
 {
-       return mce_disable();
+       return mce_disable_error_reporting();
 }
 
 static int mce_shutdown(struct sys_device *dev)
 {
-       return mce_disable();
+       return mce_disable_error_reporting();
 }
 
 /*
@@ -1678,8 +1702,8 @@ static int mce_shutdown(struct sys_device *dev)
  */
 static int mce_resume(struct sys_device *dev)
 {
-       mce_init();
-       mce_cpu_features(&current_cpu_data);
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_vendor(&current_cpu_data);
 
        return 0;
 }
@@ -1689,8 +1713,8 @@ static void mce_cpu_restart(void *data)
        del_timer_sync(&__get_cpu_var(mce_timer));
        if (!mce_available(&current_cpu_data))
                return;
-       mce_init();
-       mce_init_timer();
+       __mcheck_cpu_init_generic();
+       __mcheck_cpu_init_timer();
 }
 
 /* Reinit MCEs after user configuration changes */
@@ -1716,7 +1740,7 @@ static void mce_enable_ce(void *all)
        cmci_reenable();
        cmci_recheck();
        if (all)
-               mce_init_timer();
+               __mcheck_cpu_init_timer();
 }
 
 static struct sysdev_class mce_sysclass = {
@@ -1929,13 +1953,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void __cpuinit mce_disable_cpu(void *h)
 {
        unsigned long action = *(unsigned long *)h;
        int i;
 
        if (!mce_available(&current_cpu_data))
                return;
+
        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < banks; i++) {
@@ -1946,7 +1971,7 @@ static void mce_disable_cpu(void *h)
        }
 }
 
-static void mce_reenable_cpu(void *h)
+static void __cpuinit mce_reenable_cpu(void *h)
 {
        unsigned long action = *(unsigned long *)h;
        int i;
@@ -2025,7 +2050,7 @@ static __init void mce_init_banks(void)
        }
 }
 
-static __init int mce_init_device(void)
+static __init int mcheck_init_device(void)
 {
        int err;
        int i = 0;
@@ -2053,7 +2078,7 @@ static __init int mce_init_device(void)
        return err;
 }
 
-device_initcall(mce_init_device);
+device_initcall(mcheck_init_device);
 
 /*
  * Old style boot options parsing. Only for compatibility.
@@ -2101,7 +2126,7 @@ static int fake_panic_set(void *data, u64 val)
 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");
 
-static int __init mce_debugfs_init(void)
+static int __init mcheck_debugfs_init(void)
 {
        struct dentry *dmce, *ffake_panic;
 
@@ -2115,5 +2140,5 @@ static int __init mce_debugfs_init(void)
 
        return 0;
 }
-late_initcall(mce_debugfs_init);
+late_initcall(mcheck_debugfs_init);
 #endif
index b3a1dba75330a4891b7d49739bd8507076a642da..4fef985fc221622623473c25e9abadda053095c9 100644 (file)
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
 
 static atomic_t therm_throt_en = ATOMIC_INIT(0);
 
+static u32 lvtthmr_init __read_mostly;
+
 #ifdef CONFIG_SYSFS
 #define define_therm_throt_sysdev_one_ro(_name)                                \
        static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
        ack_APIC_irq();
 }
 
+void __init mcheck_intel_therm_init(void)
+{
+       /*
+        * This function is called only on the boot CPU. Save the initial
+        * thermal LVT value of the BSP; it is used later to restore the
+        * BIOS-programmed thermal LVT entry on the APs.
+        */
+       if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) &&
+               cpu_has(&boot_cpu_data, X86_FEATURE_ACC))
+               lvtthmr_init = apic_read(APIC_LVTTHMR);
+}
+
 void intel_init_thermal(struct cpuinfo_x86 *c)
 {
        unsigned int cpu = smp_processor_id();
@@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
         * since it might be delivered via SMI already:
         */
        rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-       h = apic_read(APIC_LVTTHMR);
+
+       /*
+        * The initial value of thermal LVT entries on all APs always reads
+        * 0x10000 because the APs are woken up by the BSP issuing the
+        * INIT-SIPI-SIPI sequence to them, and the LVT registers are reset
+        * to 0s except for the mask bits, which are set to 1s when the APs
+        * receive the INIT IPI. Always restore the BIOS-programmed value on
+        * the AP from the BSP's saved copy, since the BIOS sets the same
+        * value for all threads/cores.
+        */
+       apic_write(APIC_LVTTHMR, lvtthmr_init);
+
+       h = lvtthmr_init;
+
        if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
                printk(KERN_DEBUG
                       "CPU%d: Thermal monitoring handled by SMI\n", cpu);
index b5801c311846304f2fc2263732e93a6a83c07217..c1bbed1021d96c63e96593f1687d50bc7716b22e 100644 (file)
@@ -77,6 +77,18 @@ struct cpu_hw_events {
        struct debug_store      *ds;
 };
 
+struct event_constraint {
+       unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+       int             code;
+};
+
+#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
+#define EVENT_CONSTRAINT_END  { .code = 0, .idxmsk[0] = 0 }
+
+#define for_each_event_constraint(e, c) \
+       for ((e) = (c); (e)->idxmsk[0]; (e)++)
+
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -102,6 +114,8 @@ struct x86_pmu {
        u64             intel_ctrl;
        void            (*enable_bts)(u64 config);
        void            (*disable_bts)(void);
+       int             (*get_event_idx)(struct cpu_hw_events *cpuc,
+                                        struct hw_perf_event *hwc);
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
        .enabled = 1,
 };
 
+static const struct event_constraint *event_constraints;
+
 /*
  * Not sure about some of these
  */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
        return hw_event & P6_EVNTSEL_MASK;
 }
 
+static const struct event_constraint intel_p6_event_constraints[] =
+{
+       EVENT_CONSTRAINT(0xc1, 0x1),    /* FLOPS */
+       EVENT_CONSTRAINT(0x10, 0x1),    /* FP_COMP_OPS_EXE */
+       EVENT_CONSTRAINT(0x11, 0x1),    /* FP_ASSIST */
+       EVENT_CONSTRAINT(0x12, 0x2),    /* MUL */
+       EVENT_CONSTRAINT(0x13, 0x2),    /* DIV */
+       EVENT_CONSTRAINT(0x14, 0x1),    /* CYCLES_DIV_BUSY */
+       EVENT_CONSTRAINT_END
+};
 
 /*
  * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
   [PERF_COUNT_HW_BUS_CYCLES]           = 0x013c,
 };
 
+static const struct event_constraint intel_core_event_constraints[] =
+{
+       EVENT_CONSTRAINT(0x10, 0x1),    /* FP_COMP_OPS_EXE */
+       EVENT_CONSTRAINT(0x11, 0x2),    /* FP_ASSIST */
+       EVENT_CONSTRAINT(0x12, 0x2),    /* MUL */
+       EVENT_CONSTRAINT(0x13, 0x2),    /* DIV */
+       EVENT_CONSTRAINT(0x14, 0x1),    /* CYCLES_DIV_BUSY */
+       EVENT_CONSTRAINT(0x18, 0x1),    /* IDLE_DURING_DIV */
+       EVENT_CONSTRAINT(0x19, 0x2),    /* DELAYED_BYPASS */
+       EVENT_CONSTRAINT(0xa1, 0x1),    /* RS_UOPS_DISPATCH_CYCLES */
+       EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED */
+       EVENT_CONSTRAINT_END
+};
+
+static const struct event_constraint intel_nehalem_event_constraints[] =
+{
+       EVENT_CONSTRAINT(0x40, 0x3),    /* L1D_CACHE_LD */
+       EVENT_CONSTRAINT(0x41, 0x3),    /* L1D_CACHE_ST */
+       EVENT_CONSTRAINT(0x42, 0x3),    /* L1D_CACHE_LOCK */
+       EVENT_CONSTRAINT(0x43, 0x3),    /* L1D_ALL_REF */
+       EVENT_CONSTRAINT(0x4e, 0x3),    /* L1D_PREFETCH */
+       EVENT_CONSTRAINT(0x4c, 0x3),    /* LOAD_HIT_PRE */
+       EVENT_CONSTRAINT(0x51, 0x3),    /* L1D */
+       EVENT_CONSTRAINT(0x52, 0x3),    /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
+       EVENT_CONSTRAINT(0x53, 0x3),    /* L1D_CACHE_LOCK_FB_HIT */
+       EVENT_CONSTRAINT(0xc5, 0x3),    /* CACHE_LOCK_CYCLES */
+       EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
        return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
 
-static const u64 nehalem_hw_cache_event_ids
+static __initconst u64 nehalem_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
  },
 };
 
-static const u64 core2_hw_cache_event_ids
+static __initconst u64 core2_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
  },
 };
 
-static const u64 atom_hw_cache_event_ids
+static __initconst u64 atom_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
 #define CORE_EVNTSEL_UNIT_MASK         0x0000FF00ULL
 #define CORE_EVNTSEL_EDGE_MASK         0x00040000ULL
 #define CORE_EVNTSEL_INV_MASK          0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK  0xFF000000ULL
+#define CORE_EVNTSEL_REG_MASK          0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK              \
        (CORE_EVNTSEL_EVENT_MASK |      \
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
        return hw_event & CORE_EVNTSEL_MASK;
 }
 
-static const u64 amd_hw_cache_event_ids
+static __initconst u64 amd_hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
         */
        hwc->config = ARCH_PERFMON_EVENTSEL_INT;
 
+       hwc->idx = -1;
+
        /*
         * Count user and OS events unless requested not to.
         */
@@ -1334,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
                x86_pmu_enable_event(hwc, idx);
 }
 
-static int
-fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
+static int fixed_mode_idx(struct hw_perf_event *hwc)
 {
        unsigned int hw_event;
 
@@ -1349,6 +1405,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
        if (!x86_pmu.num_events_fixed)
                return -1;
 
+       /*
+        * fixed counters do not take all possible filters
+        */
+       if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
+               return -1;
+
        if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
                return X86_PMC_IDX_FIXED_INSTRUCTIONS;
        if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1360,22 +1422,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
 }
 
 /*
- * Find a PMC slot for the freshly enabled / scheduled in event:
+ * generic counter allocator: get next free counter
  */
-static int x86_pmu_enable(struct perf_event *event)
+static int
+gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
+{
+       int idx;
+
+       idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
+       return idx == x86_pmu.num_events ? -1 : idx;
+}
+
+/*
+ * intel-specific counter allocator: check event constraints
+ */
+static int
+intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
+{
+       const struct event_constraint *event_constraint;
+       int i, code;
+
+       if (!event_constraints)
+               goto skip;
+
+       code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
+
+       for_each_event_constraint(event_constraint, event_constraints) {
+               if (code == event_constraint->code) {
+                       for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
+                               if (!test_and_set_bit(i, cpuc->used_mask))
+                                       return i;
+                       }
+                       return -1;
+               }
+       }
+skip:
+       return gen_get_event_idx(cpuc, hwc);
+}
+
+static int
+x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
 {
-       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-       struct hw_perf_event *hwc = &event->hw;
        int idx;
 
-       idx = fixed_mode_idx(event, hwc);
+       idx = fixed_mode_idx(hwc);
        if (idx == X86_PMC_IDX_FIXED_BTS) {
                /* BTS is already occupied. */
                if (test_and_set_bit(idx, cpuc->used_mask))
                        return -EAGAIN;
 
                hwc->config_base        = 0;
-               hwc->event_base = 0;
+               hwc->event_base         = 0;
                hwc->idx                = idx;
        } else if (idx >= 0) {
                /*
@@ -1396,20 +1493,35 @@ static int x86_pmu_enable(struct perf_event *event)
        } else {
                idx = hwc->idx;
                /* Try to get the previous generic event again */
-               if (test_and_set_bit(idx, cpuc->used_mask)) {
+               if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
 try_generic:
-                       idx = find_first_zero_bit(cpuc->used_mask,
-                                                 x86_pmu.num_events);
-                       if (idx == x86_pmu.num_events)
+                       idx = x86_pmu.get_event_idx(cpuc, hwc);
+                       if (idx == -1)
                                return -EAGAIN;
 
                        set_bit(idx, cpuc->used_mask);
                        hwc->idx = idx;
                }
-               hwc->config_base  = x86_pmu.eventsel;
-               hwc->event_base = x86_pmu.perfctr;
+               hwc->config_base = x86_pmu.eventsel;
+               hwc->event_base  = x86_pmu.perfctr;
        }
 
+       return idx;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in event:
+ */
+static int x86_pmu_enable(struct perf_event *event)
+{
+       struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+       struct hw_perf_event *hwc = &event->hw;
+       int idx;
+
+       idx = x86_schedule_event(cpuc, hwc);
+       if (idx < 0)
+               return idx;
+
        perf_events_lapic_init();
 
        x86_pmu.disable(hwc, idx);
@@ -1852,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
        .priority               = 1
 };
 
-static struct x86_pmu p6_pmu = {
+static __initconst struct x86_pmu p6_pmu = {
        .name                   = "p6",
        .handle_irq             = p6_pmu_handle_irq,
        .disable_all            = p6_pmu_disable_all,
@@ -1877,9 +1989,10 @@ static struct x86_pmu p6_pmu = {
         */
        .event_bits             = 32,
        .event_mask             = (1ULL << 32) - 1,
+       .get_event_idx          = intel_get_event_idx,
 };
 
-static struct x86_pmu intel_pmu = {
+static __initconst struct x86_pmu intel_pmu = {
        .name                   = "Intel",
        .handle_irq             = intel_pmu_handle_irq,
        .disable_all            = intel_pmu_disable_all,
@@ -1900,9 +2013,10 @@ static struct x86_pmu intel_pmu = {
        .max_period             = (1ULL << 31) - 1,
        .enable_bts             = intel_pmu_enable_bts,
        .disable_bts            = intel_pmu_disable_bts,
+       .get_event_idx          = intel_get_event_idx,
 };
 
-static struct x86_pmu amd_pmu = {
+static __initconst struct x86_pmu amd_pmu = {
        .name                   = "AMD",
        .handle_irq             = amd_pmu_handle_irq,
        .disable_all            = amd_pmu_disable_all,
@@ -1920,9 +2034,10 @@ static struct x86_pmu amd_pmu = {
        .apic                   = 1,
        /* use highest bit to detect overflow */
        .max_period             = (1ULL << 47) - 1,
+       .get_event_idx          = gen_get_event_idx,
 };
 
-static int p6_pmu_init(void)
+static __init int p6_pmu_init(void)
 {
        switch (boot_cpu_data.x86_model) {
        case 1:
@@ -1932,10 +2047,12 @@ static int p6_pmu_init(void)
        case 7:
        case 8:
        case 11: /* Pentium III */
+               event_constraints = intel_p6_event_constraints;
                break;
        case 9:
        case 13:
                /* Pentium M */
+               event_constraints = intel_p6_event_constraints;
                break;
        default:
                pr_cont("unsupported p6 CPU model %d ",
@@ -1954,7 +2071,7 @@ static int p6_pmu_init(void)
        return 0;
 }
 
-static int intel_pmu_init(void)
+static __init int intel_pmu_init(void)
 {
        union cpuid10_edx edx;
        union cpuid10_eax eax;
@@ -2007,12 +2124,14 @@ static int intel_pmu_init(void)
                       sizeof(hw_cache_event_ids));
 
                pr_cont("Core2 events, ");
+               event_constraints = intel_core_event_constraints;
                break;
        default:
        case 26:
                memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
 
+               event_constraints = intel_nehalem_event_constraints;
                pr_cont("Nehalem/Corei7 events, ");
                break;
        case 28:
@@ -2025,7 +2144,7 @@ static int intel_pmu_init(void)
        return 0;
 }
 
-static int amd_pmu_init(void)
+static __init int amd_pmu_init(void)
 {
        /* Performance-monitoring supported from K7 and later: */
        if (boot_cpu_data.x86 < 6)
@@ -2105,11 +2224,47 @@ static const struct pmu pmu = {
        .unthrottle     = x86_pmu_unthrottle,
 };
 
+static int
+validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
+{
+       struct hw_perf_event fake_event = event->hw;
+
+       if (event->pmu && event->pmu != &pmu)
+               return 0;
+
+       return x86_schedule_event(cpuc, &fake_event) >= 0;
+}
+
+static int validate_group(struct perf_event *event)
+{
+       struct perf_event *sibling, *leader = event->group_leader;
+       struct cpu_hw_events fake_pmu;
+
+       memset(&fake_pmu, 0, sizeof(fake_pmu));
+
+       if (!validate_event(&fake_pmu, leader))
+               return -ENOSPC;
+
+       list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
+               if (!validate_event(&fake_pmu, sibling))
+                       return -ENOSPC;
+       }
+
+       if (!validate_event(&fake_pmu, event))
+               return -ENOSPC;
+
+       return 0;
+}
+
 const struct pmu *hw_perf_event_init(struct perf_event *event)
 {
        int err;
 
        err = __hw_perf_event_init(event);
+       if (!err) {
+               if (event->group_leader != event)
+                       err = validate_group(event);
+       }
        if (err) {
                if (event->destroy)
                        event->destroy(event);
index 7d52e9da5e0cc0d1942b6cd9624f83f504491e77..50b9c220e1213ceacf48e10f4f302a98b65ced4f 100644 (file)
@@ -333,6 +333,10 @@ ENTRY(ret_from_fork)
        CFI_ENDPROC
 END(ret_from_fork)
 
+/*
+ * Interrupt exit functions should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
 /*
  * Return to user mode is not as complex as all this looks,
  * but we want the default path for a system call return to
@@ -383,6 +387,10 @@ need_resched:
 END(resume_kernel)
 #endif
        CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
@@ -513,6 +521,10 @@ sysexit_audit:
        PTGS_TO_GS_EX
 ENDPROC(ia32_sysenter_target)
 
+/*
+ * syscall stub including irq exit should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
        # system call handler stub
 ENTRY(system_call)
        RING0_INT_FRAME                 # can't unwind into user space anyway
@@ -705,6 +717,10 @@ syscall_badsys:
        jmp resume_userspace
 END(syscall_badsys)
        CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /*
  * System calls that need a pt_regs pointer.
@@ -814,6 +830,10 @@ common_interrupt:
 ENDPROC(common_interrupt)
        CFI_ENDPROC
 
+/*
+ *  Irq entries should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
 #define BUILD_INTERRUPT3(name, nr, fn) \
 ENTRY(name)                            \
        RING0_INT_FRAME;                \
@@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug)
        jmp error_code
        CFI_ENDPROC
 END(spurious_interrupt_bug)
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 ENTRY(kernel_thread_helper)
        pushl $0                # fake return address for unwinder
index bd5bbddddf91eab956106bff572e03a0ba86b9f9..722df1b1152d57721568ef583e4f2c002922979b 100644 (file)
@@ -803,6 +803,10 @@ END(interrupt)
        call \func
        .endm
 
+/*
+ * Interrupt entry/exit should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
        /*
         * The interrupt stubs push (~vector+0x80) onto the stack and
         * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
 
        CFI_ENDPROC
 END(common_interrupt)
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /*
  * APIC interrupts.
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644 (file)
index 0000000..d42f65a
--- /dev/null
@@ -0,0 +1,555 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2007 Alan Stern
+ * Copyright (C) 2009 IBM Corporation
+ * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Authors: Alan Stern <stern@rowland.harvard.edu>
+ *          K.Prasad <prasad@linux.vnet.ibm.com>
+ *          Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+/*
+ * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
+ * using the CPU's debug registers.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/irqflags.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/hw_breakpoint.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+
+/* Per cpu debug control register value */
+DEFINE_PER_CPU(unsigned long, cpu_dr7);
+EXPORT_PER_CPU_SYMBOL(cpu_dr7);
+
+/* Per cpu debug address registers values */
+static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
+
+/*
+ * Stores the breakpoints currently in use on each breakpoint address
+ * register for each cpu.
+ */
+static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
+
+
+static inline unsigned long
+__encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+       unsigned long bp_info;
+
+       bp_info = (len | type) & 0xf;
+       bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
+       bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
+
+       return bp_info;
+}
+
+/*
+ * Encode the length, type, Exact, and Enable bits for a particular breakpoint
+ * as stored in debug register 7.
+ */
+unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
+{
+       return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
+}
+
+/*
+ * Decode the length and type bits for a particular breakpoint as
+ * stored in debug register 7.  Return the "enabled" status.
+ */
+int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
+{
+       int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
+
+       *len = (bp_info & 0xc) | 0x40;
+       *type = (bp_info & 0x3) | 0x80;
+
+       return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
+}
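
A quick round trip through the two helpers, for a 4-byte write breakpoint in
debug register 1 (a worked example; the values follow the encodings in
hw_breakpoint.h above):

    unsigned long dr7 = encode_dr7(1, X86_BREAKPOINT_LEN_4,
                                   X86_BREAKPOINT_WRITE);
    unsigned int len, type;
    int enabled = decode_dr7(dr7, 1, &len, &type);

    /*
     * Now len == X86_BREAKPOINT_LEN_4 (0x4c), type == X86_BREAKPOINT_WRITE
     * (0x81), and enabled is nonzero (the global-enable bits for slot 1).
     * The & 0xf in __encode_dr7() strips the 0x40/0x80 marker bits that
     * decode_dr7() adds back.
     */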
+
+/*
+ * Install a perf counter breakpoint.
+ *
+ * We seek a free debug address register and use it for this
+ * breakpoint, then enable it in the debug control register.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+       unsigned long *dr7;
+       int i;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+               if (!*slot) {
+                       *slot = bp;
+                       break;
+               }
+       }
+
+       if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+               return -EBUSY;
+
+       set_debugreg(info->address, i);
+       __get_cpu_var(cpu_debugreg[i]) = info->address;
+
+       dr7 = &__get_cpu_var(cpu_dr7);
+       *dr7 |= encode_dr7(i, info->len, info->type);
+
+       set_debugreg(*dr7, 7);
+
+       return 0;
+}
+
+/*
+ * Uninstall the breakpoint contained in the given counter.
+ *
+ * First we search the debug address register it uses and then we disable
+ * it.
+ *
+ * Atomic: we hold the counter->ctx->lock and we only handle variables
+ * and registers local to this cpu.
+ */
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+       unsigned long *dr7;
+       int i;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+
+               if (*slot == bp) {
+                       *slot = NULL;
+                       break;
+               }
+       }
+
+       if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
+               return;
+
+       dr7 = &__get_cpu_var(cpu_dr7);
+       *dr7 &= ~__encode_dr7(i, info->len, info->type);
+
+       set_debugreg(*dr7, 7);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+       unsigned int len_in_bytes = 0;
+
+       switch (hbp_len) {
+       case X86_BREAKPOINT_LEN_1:
+               len_in_bytes = 1;
+               break;
+       case X86_BREAKPOINT_LEN_2:
+               len_in_bytes = 2;
+               break;
+       case X86_BREAKPOINT_LEN_4:
+               len_in_bytes = 4;
+               break;
+#ifdef CONFIG_X86_64
+       case X86_BREAKPOINT_LEN_8:
+               len_in_bytes = 8;
+               break;
+#endif
+       }
+       return len_in_bytes;
+}
+
+/*
+ * Check for virtual address in user space.
+ */
+int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
+{
+       unsigned int len;
+
+       len = get_hbp_len(hbp_len);
+
+       return (va <= TASK_SIZE - len);
+}
+
+/*
+ * Check for virtual address in kernel space.
+ */
+static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+{
+       unsigned int len;
+
+       len = get_hbp_len(hbp_len);
+
+       return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+/*
+ * Store a breakpoint's encoded address, length, and type.
+ */
+static int arch_store_info(struct perf_event *bp)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+       /*
+        * For kernel addresses, either the address or symbol name can be
+        * specified.
+        */
+       if (info->name)
+               info->address = (unsigned long)
+                               kallsyms_lookup_name(info->name);
+       if (info->address)
+               return 0;
+
+       return -EINVAL;
+}
+
+int arch_bp_generic_fields(int x86_len, int x86_type,
+                          int *gen_len, int *gen_type)
+{
+       /* Len */
+       switch (x86_len) {
+       case X86_BREAKPOINT_LEN_1:
+               *gen_len = HW_BREAKPOINT_LEN_1;
+               break;
+       case X86_BREAKPOINT_LEN_2:
+               *gen_len = HW_BREAKPOINT_LEN_2;
+               break;
+       case X86_BREAKPOINT_LEN_4:
+               *gen_len = HW_BREAKPOINT_LEN_4;
+               break;
+#ifdef CONFIG_X86_64
+       case X86_BREAKPOINT_LEN_8:
+               *gen_len = HW_BREAKPOINT_LEN_8;
+               break;
+#endif
+       default:
+               return -EINVAL;
+       }
+
+       /* Type */
+       switch (x86_type) {
+       case X86_BREAKPOINT_EXECUTE:
+               *gen_type = HW_BREAKPOINT_X;
+               break;
+       case X86_BREAKPOINT_WRITE:
+               *gen_type = HW_BREAKPOINT_W;
+               break;
+       case X86_BREAKPOINT_RW:
+               *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
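+
(Usage sketch, with the constants used elsewhere in this patch: converting an x86 4-byte write watchpoint into the generic attributes consumed by the perf layer:

	int gen_len, gen_type;

	if (arch_bp_generic_fields(X86_BREAKPOINT_LEN_4, X86_BREAKPOINT_WRITE,
				   &gen_len, &gen_type) == 0) {
		/* gen_len == HW_BREAKPOINT_LEN_4, gen_type == HW_BREAKPOINT_W */
	}
)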
+
+static int arch_build_bp_info(struct perf_event *bp)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+       info->address = bp->attr.bp_addr;
+
+       /* Len */
+       switch (bp->attr.bp_len) {
+       case HW_BREAKPOINT_LEN_1:
+               info->len = X86_BREAKPOINT_LEN_1;
+               break;
+       case HW_BREAKPOINT_LEN_2:
+               info->len = X86_BREAKPOINT_LEN_2;
+               break;
+       case HW_BREAKPOINT_LEN_4:
+               info->len = X86_BREAKPOINT_LEN_4;
+               break;
+#ifdef CONFIG_X86_64
+       case HW_BREAKPOINT_LEN_8:
+               info->len = X86_BREAKPOINT_LEN_8;
+               break;
+#endif
+       default:
+               return -EINVAL;
+       }
+
+       /* Type */
+       switch (bp->attr.bp_type) {
+       case HW_BREAKPOINT_W:
+               info->type = X86_BREAKPOINT_WRITE;
+               break;
+       case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+               info->type = X86_BREAKPOINT_RW;
+               break;
+       case HW_BREAKPOINT_X:
+               info->type = X86_BREAKPOINT_EXECUTE;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings
+ */
+int arch_validate_hwbkpt_settings(struct perf_event *bp,
+                                 struct task_struct *tsk)
+{
+       struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+       unsigned int align;
+       int ret;
+
+       ret = arch_build_bp_info(bp);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+
+       if (info->type == X86_BREAKPOINT_EXECUTE)
+               /*
+                * For now, we allow instruction breakpoints only for
+                * user-space addresses.
+                */
+               if ((!arch_check_va_in_userspace(info->address, info->len)) &&
+                       info->len != X86_BREAKPOINT_EXECUTE)
+                       return ret;
+
+       switch (info->len) {
+       case X86_BREAKPOINT_LEN_1:
+               align = 0;
+               break;
+       case X86_BREAKPOINT_LEN_2:
+               align = 1;
+               break;
+       case X86_BREAKPOINT_LEN_4:
+               align = 3;
+               break;
+#ifdef CONFIG_X86_64
+       case X86_BREAKPOINT_LEN_8:
+               align = 7;
+               break;
+#endif
+       default:
+               return ret;
+       }
+
+       if (bp->callback)
+               ret = arch_store_info(bp);
+
+       if (ret < 0)
+               return ret;
+       /*
+        * Check that the low-order bits of the address are appropriate
+        * for the alignment implied by len.
+        */
+       if (info->address & align)
+               return -EINVAL;
+
+       /* Check that the virtual address is in the proper range */
+       if (tsk) {
+               if (!arch_check_va_in_userspace(info->address, info->len))
+                       return -EFAULT;
+       } else {
+               if (!arch_check_va_in_kernelspace(info->address, info->len))
+                       return -EFAULT;
+       }
+
+       return 0;
+}
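+
(The align computation above is just len - 1, so the low-order-bits test enforces that a watchpoint of length len sits on a len-aligned address. A stand-alone restatement of the rule, with illustrative addresses:

	#include <stdio.h>

	/* Same test as (info->address & align) above, for power-of-two lengths. */
	static int bp_addr_aligned(unsigned long addr, unsigned int len)
	{
		return (addr & (len - 1)) == 0;
	}

	int main(void)
	{
		printf("%d\n", bp_addr_aligned(0x601020, 4));	/* 1: accepted */
		printf("%d\n", bp_addr_aligned(0x601022, 4));	/* 0: -EINVAL above */
		return 0;
	}
)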
+
+/*
+ * Dump the debug register contents to the user.
+ * We can't dump our per-cpu values because they
+ * may contain cpu-wide breakpoints, which don't
+ * belong to the current task.
+ *
+ * TODO: include non-ptrace user breakpoints (perf)
+ */
+void aout_dump_debugregs(struct user *dump)
+{
+       int i;
+       int dr7 = 0;
+       struct perf_event *bp;
+       struct arch_hw_breakpoint *info;
+       struct thread_struct *thread = &current->thread;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               bp = thread->ptrace_bps[i];
+
+               if (bp && !bp->attr.disabled) {
+                       dump->u_debugreg[i] = bp->attr.bp_addr;
+                       info = counter_arch_bp(bp);
+                       dr7 |= encode_dr7(i, info->len, info->type);
+               } else {
+                       dump->u_debugreg[i] = 0;
+               }
+       }
+
+       dump->u_debugreg[4] = 0;
+       dump->u_debugreg[5] = 0;
+       dump->u_debugreg[6] = current->thread.debugreg6;
+
+       dump->u_debugreg[7] = dr7;
+}
+EXPORT_SYMBOL_GPL(aout_dump_debugregs);
+
+/*
+ * Release the user breakpoints used by ptrace
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+       int i;
+       struct thread_struct *t = &tsk->thread;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               unregister_hw_breakpoint(t->ptrace_bps[i]);
+               t->ptrace_bps[i] = NULL;
+       }
+}
+
+void hw_breakpoint_restore(void)
+{
+       set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
+       set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
+       set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
+       set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+       set_debugreg(current->thread.debugreg6, 6);
+       set_debugreg(__get_cpu_var(cpu_dr7), 7);
+}
+EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
+
+/*
+ * Handle debug exception notifications.
+ *
+ * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
+ *
+ * NOTIFY_DONE is returned if one of the following conditions is true:
+ * i) the causative address is from user space and the exception is a
+ * valid one, i.e. not triggered as a result of lazy debug register
+ * switching;
+ * ii) bits other than trap<n> are set in the DR6 register (such as BD,
+ * BS or BT), indicating that more than one debug condition is met and
+ * requires further action in do_debug().
+ *
+ * NOTIFY_STOP is returned for all other cases.
+ *
+ */
+static int __kprobes hw_breakpoint_handler(struct die_args *args)
+{
+       int i, cpu, rc = NOTIFY_STOP;
+       struct perf_event *bp;
+       unsigned long dr7, dr6;
+       unsigned long *dr6_p;
+
+       /* The DR6 value is pointed to by args->err */
+       dr6_p = (unsigned long *)ERR_PTR(args->err);
+       dr6 = *dr6_p;
+
+       /* Do an early return if no trap bits are set in DR6 */
+       if ((dr6 & DR_TRAP_BITS) == 0)
+               return NOTIFY_DONE;
+
+       get_debugreg(dr7, 7);
+       /* Disable breakpoints during exception handling */
+       set_debugreg(0UL, 7);
+       /*
+        * Assert that local interrupts are disabled.
+        * Reset the DRn bits in the virtualized register value.
+        * The ptrace trigger routine will add in whatever is needed.
+        */
+       current->thread.debugreg6 &= ~DR_TRAP_BITS;
+       cpu = get_cpu();
+
+       /* Handle all the breakpoints that were triggered */
+       for (i = 0; i < HBP_NUM; ++i) {
+               if (likely(!(dr6 & (DR_TRAP0 << i))))
+                       continue;
+
+               /*
+                * The counter may be concurrently released but that can only
+                * occur from a call_rcu() path. We can then safely fetch
+                * the breakpoint, use its callback and touch its counter
+                * while we are in an rcu_read_lock() path.
+                */
+               rcu_read_lock();
+
+               bp = per_cpu(bp_per_reg[i], cpu);
+               if (bp)
+                       rc = NOTIFY_DONE;
+               /*
+                * Reset the 'i'th TRAP bit in dr6 to denote completion of
+                * exception handling
+                */
+               (*dr6_p) &= ~(DR_TRAP0 << i);
+               /*
+                * bp can be NULL due to lazy debug register switching
+                * or due to concurrent perf counter removing.
+                */
+               if (!bp) {
+                       rcu_read_unlock();
+                       break;
+               }
+
+               (bp->callback)(bp, args->regs);
+
+               rcu_read_unlock();
+       }
+       if (dr6 & (~DR_TRAP_BITS))
+               rc = NOTIFY_DONE;
+
+       set_debugreg(dr7, 7);
+       put_cpu();
+
+       return rc;
+}
+
+/*
+ * Handle debug exception notifications.
+ */
+int __kprobes hw_breakpoint_exceptions_notify(
+               struct notifier_block *unused, unsigned long val, void *data)
+{
+       if (val != DIE_DEBUG)
+               return NOTIFY_DONE;
+
+       return hw_breakpoint_handler(data);
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+       /* TODO */
+}
+
+void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
+{
+       /* TODO */
+}
index 04bbd52785688c4152c449459d943a0482c68830..19212cb01558101c5fd861fc838e83230e94e15c 100644 (file)
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
                seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
        seq_printf(p, "  TLB shootdowns\n");
 #endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
        seq_printf(p, "%*s: ", prec, "TRM");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
        seq_printf(p, "  Thermal event interrupts\n");
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
        seq_printf(p, "%*s: ", prec, "THR");
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
        seq_printf(p, "  Threshold APIC interrupts\n");
-# endif
 #endif
 #ifdef CONFIG_X86_MCE
        seq_printf(p, "%*s: ", prec, "MCE");
@@ -194,11 +194,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
        sum += irq_stats(cpu)->irq_call_count;
        sum += irq_stats(cpu)->irq_tlb_count;
 #endif
-#ifdef CONFIG_X86_MCE
+#ifdef CONFIG_X86_THERMAL_VECTOR
        sum += irq_stats(cpu)->irq_thermal_count;
-# ifdef CONFIG_X86_MCE_THRESHOLD
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
        sum += irq_stats(cpu)->irq_threshold_count;
-# endif
 #endif
 #ifdef CONFIG_X86_MCE
        sum += per_cpu(mce_exception_count, cpu);
index 8d82a77a3f3b96ea3c0dc37e91551dddc7e10b51..34e86b67550c523d3bbefb0275ede78d526b0826 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/smp.h>
 #include <linux/nmi.h>
 
+#include <asm/debugreg.h>
 #include <asm/apicdef.h>
 #include <asm/system.h>
 
@@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
                        "resuming...\n");
        kgdb_arch_handle_exception(args->trapnr, args->signr,
                                   args->err, "c", "", regs);
+       /*
+        * Reset the BS bit in dr6 (pointed to by args->err) to
+        * denote completion of processing.
+        */
+       (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
 
        return NOTIFY_STOP;
 }
index 7b5169d2b00026272ed26874913c42ff315befb3..3fe86d706a1493ad59cb655478bd9b41cfd11f4d 100644 (file)
 #include <linux/preempt.h>
 #include <linux/module.h>
 #include <linux/kdebug.h>
+#include <linux/kallsyms.h>
 
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/alternative.h>
+#include <asm/insn.h>
+#include <asm/debugreg.h>
 
 void jprobe_return_end(void);
 
@@ -106,50 +109,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
        /*      -----------------------------------------------         */
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
 };
-static const u32 onebyte_has_modrm[256 / 32] = {
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-       /*      -----------------------------------------------         */
-       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
-       W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
-       W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
-       W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
-       W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
-       W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
-       W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
-       W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
-       W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
-       W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
-       W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
-       W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
-       W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
-       W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
-       W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
-       W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1)   /* f0 */
-       /*      -----------------------------------------------         */
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-       /*      -----------------------------------------------         */
-       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
-       W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
-       W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
-       W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
-       W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
-       W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
-       W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
-       W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
-       W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
-       W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
-       W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
-       W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
-       W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
-       W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
-       W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
-       W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* ff */
-       /*      -----------------------------------------------         */
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-};
 #undef W
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +203,75 @@ retry:
        }
 }
 
+/* Recover the probed instruction at addr for further analysis. */
+static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+       struct kprobe *kp;
+       kp = get_kprobe((void *)addr);
+       if (!kp)
+               return -EINVAL;
+
+       /*
+        *  Basically, kp->ainsn.insn has the original instruction.
+        *  However, a RIP-relative instruction cannot be single-stepped
+        *  at a different place, so fix_riprel() tweaks the displacement
+        *  of that instruction. In that case, we can't recover the
+        *  instruction from kp->ainsn.insn.
+        *
+        *  On the other hand, kp->opcode has a copy of the first byte of
+        *  the probed instruction, which is overwritten by int3. And
+        *  since the instruction at kp->addr is not modified by kprobes
+        *  except for the first byte, we can recover the original
+        *  instruction from it and kp->opcode.
+        */
+       memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+       buf[0] = kp->opcode;
+       return 0;
+}
+
+/* Dummy buffers for kallsyms_lookup */
+static char __dummy_buf[KSYM_NAME_LEN];
+
+/* Check if paddr is at an instruction boundary */
+static int __kprobes can_probe(unsigned long paddr)
+{
+       int ret;
+       unsigned long addr, offset = 0;
+       struct insn insn;
+       kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+       if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+               return 0;
+
+       /* Decode instructions */
+       addr = paddr - offset;
+       while (addr < paddr) {
+               kernel_insn_init(&insn, (void *)addr);
+               insn_get_opcode(&insn);
+
+               /*
+                * Check if the instruction has been modified by another
+                * kprobe, in which case we replace the breakpoint by the
+                * original instruction in our buffer.
+                */
+               if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+                       ret = recover_probed_instruction(buf, addr);
+                       if (ret)
+                               /*
+                                * Another debugging subsystem might insert
+                                * this breakpoint. In that case, we can't
+                                * recover it.
+                                */
+                               return 0;
+                       kernel_insn_init(&insn, buf);
+               }
+               insn_get_length(&insn);
+               addr += insn.length;
+       }
+
+       return (addr == paddr);
+}
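+
(The same walk, reduced to its core: starting from the symbol head, decode one instruction at a time and check whether the target lands on a boundary. A sketch over a plain byte buffer, with the kprobe-recovery path omitted; it uses the decoder this merge introduces in arch/x86/lib/insn.c:

	#include <asm/insn.h>

	static int is_insn_boundary(const unsigned char *buf, size_t head,
				    size_t target)
	{
		struct insn insn;
		size_t off = head;

		while (off < target) {
			kernel_insn_init(&insn, buf + off);
			insn_get_length(&insn);	/* decodes prefixes through immediate */
			off += insn.length;
		}
		return off == target;
	}
)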
+
 /*
  * Returns non-zero if opcode modifies the interrupt flag.
  */
@@ -277,68 +305,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 static void __kprobes fix_riprel(struct kprobe *p)
 {
 #ifdef CONFIG_X86_64
-       u8 *insn = p->ainsn.insn;
-       s64 disp;
-       int need_modrm;
-
-       /* Skip legacy instruction prefixes.  */
-       while (1) {
-               switch (*insn) {
-               case 0x66:
-               case 0x67:
-               case 0x2e:
-               case 0x3e:
-               case 0x26:
-               case 0x64:
-               case 0x65:
-               case 0x36:
-               case 0xf0:
-               case 0xf3:
-               case 0xf2:
-                       ++insn;
-                       continue;
-               }
-               break;
-       }
+       struct insn insn;
+       kernel_insn_init(&insn, p->ainsn.insn);
 
-       /* Skip REX instruction prefix.  */
-       if (is_REX_prefix(insn))
-               ++insn;
-
-       if (*insn == 0x0f) {
-               /* Two-byte opcode.  */
-               ++insn;
-               need_modrm = test_bit(*insn,
-                                     (unsigned long *)twobyte_has_modrm);
-       } else
-               /* One-byte opcode.  */
-               need_modrm = test_bit(*insn,
-                                     (unsigned long *)onebyte_has_modrm);
-
-       if (need_modrm) {
-               u8 modrm = *++insn;
-               if ((modrm & 0xc7) == 0x05) {
-                       /* %rip+disp32 addressing mode */
-                       /* Displacement follows ModRM byte.  */
-                       ++insn;
-                       /*
-                        * The copied instruction uses the %rip-relative
-                        * addressing mode.  Adjust the displacement for the
-                        * difference between the original location of this
-                        * instruction and the location of the copy that will
-                        * actually be run.  The tricky bit here is making sure
-                        * that the sign extension happens correctly in this
-                        * calculation, since we need a signed 32-bit result to
-                        * be sign-extended to 64 bits when it's added to the
-                        * %rip value and yield the same 64-bit result that the
-                        * sign-extension of the original signed 32-bit
-                        * displacement would have given.
-                        */
-                       disp = (u8 *) p->addr + *((s32 *) insn) -
-                              (u8 *) p->ainsn.insn;
-                       BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
-                       *(s32 *)insn = (s32) disp;
-               }
+       if (insn_rip_relative(&insn)) {
+               s64 newdisp;
+               u8 *disp;
+               insn_get_displacement(&insn);
+               /*
+                * The copied instruction uses the %rip-relative addressing
+                * mode.  Adjust the displacement for the difference between
+                * the original location of this instruction and the location
+                * of the copy that will actually be run.  The tricky bit here
+                * is making sure that the sign extension happens correctly in
+                * this calculation, since we need a signed 32-bit result to
+                * be sign-extended to 64 bits when it's added to the %rip
+                * value and yield the same 64-bit result that the sign-
+                * extension of the original signed 32-bit displacement would
+                * have given.
+                */
+               newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
+                         (u8 *) p->ainsn.insn;
+               BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
+               disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+               *(s32 *) disp = (s32) newdisp;
        }
 #endif
 }
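
(A worked example of the fix-up, with purely illustrative addresses: a %rip-relative displacement is taken relative to the end of the instruction, and since the original and the copy have the same length, the instruction length cancels out and only the distance between the two locations matters:

	/* target == p->addr + insn_len + disp == p->ainsn.insn + insn_len + newdisp */
	unsigned long orig = 0xffffffff81000000UL;	/* p->addr */
	unsigned long copy = 0xffffffff81200000UL;	/* p->ainsn.insn */
	s64 disp = 0x1234;				/* original disp32 */
	s64 newdisp = (s64)(orig - copy) + disp;	/* 0x1234 - 0x200000, fits s32 */
)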
@@ -359,6 +349,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
+       if (!can_probe((unsigned long)p->addr))
+               return -EILSEQ;
        /* insn: must be on special executable page on x86. */
        p->ainsn.insn = get_insn_slot();
        if (!p->ainsn.insn)
@@ -472,17 +464,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 {
        switch (kcb->kprobe_status) {
        case KPROBE_HIT_SSDONE:
-#ifdef CONFIG_X86_64
-               /* TODO: Provide re-entrancy from post_kprobes_handler() and
-                * avoid exception stack corruption while single-stepping on
-                * the instruction of the new probe.
-                */
-               arch_disarm_kprobe(p);
-               regs->ip = (unsigned long)p->addr;
-               reset_current_kprobe();
-               preempt_enable_no_resched();
-               break;
-#endif
        case KPROBE_HIT_ACTIVE:
                save_previous_kprobe(kcb);
                set_current_kprobe(p, regs, kcb);
@@ -491,18 +472,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
                kcb->kprobe_status = KPROBE_REENTER;
                break;
        case KPROBE_HIT_SS:
-               if (p == kprobe_running()) {
-                       regs->flags &= ~X86_EFLAGS_TF;
-                       regs->flags |= kcb->kprobe_saved_flags;
-                       return 0;
-               } else {
-                       /* A probe has been hit in the codepath leading up
-                        * to, or just after, single-stepping of a probed
-                        * instruction. This entire codepath should strictly
-                        * reside in .kprobes.text section. Raise a warning
-                        * to highlight this peculiar case.
-                        */
-               }
+               /* A probe has been hit in the codepath leading up to, or just
+                * after, single-stepping of a probed instruction. This entire
+                * codepath should strictly reside in .kprobes.text section.
+                * Raise a BUG or we'll continue in an endless reentering loop
+                * and eventually hit a stack overflow.
+                */
+               printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
+                      p->addr);
+               dump_kprobe(p);
+               BUG();
        default:
                /* impossible cases */
                WARN_ON(1);
@@ -967,8 +946,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
                        ret = NOTIFY_STOP;
                break;
        case DIE_DEBUG:
-               if (post_kprobe_handler(args->regs))
+               if (post_kprobe_handler(args->regs)) {
+                       /*
+                        * Reset the BS bit in dr6 (pointed to by args->err) to
+                        * denote completion of processing.
+                        */
+                       (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
                        ret = NOTIFY_STOP;
+               }
                break;
        case DIE_GPF:
                /*
index c1c429d00130c2a233b35f69449d5a02a9f6f2e9..c843f8406da2b95f416cc9a83c2bead2d4d17ae6 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/desc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
+#include <asm/debugreg.h>
 
 static void set_idt(void *newidt, __u16 limit)
 {
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
 
        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
+       hw_breakpoint_disable();
 
        if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
index 84c3bf209e98a390ff536e46fd8fec362479f22a..4a8bb82248ae8a9a945854105c19447fec8a6530 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/debugreg.h>
 
 static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                                unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
 
        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
+       hw_breakpoint_disable();
 
        if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
index 5284cd2b57769f53e79f520ecc6f8199720497cf..744508e7cfdd051e3896fe5ec28d5d3da0f3c16c 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/clockchips.h>
 #include <linux/random.h>
 #include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -17,6 +18,7 @@
 #include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -103,14 +105,7 @@ void flush_thread(void)
        }
 #endif
 
-       clear_tsk_thread_flag(tsk, TIF_DEBUG);
-
-       tsk->thread.debugreg0 = 0;
-       tsk->thread.debugreg1 = 0;
-       tsk->thread.debugreg2 = 0;
-       tsk->thread.debugreg3 = 0;
-       tsk->thread.debugreg6 = 0;
-       tsk->thread.debugreg7 = 0;
+       flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
        /*
         * Forget coprocessor state..
@@ -192,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
        else if (next->debugctlmsr != prev->debugctlmsr)
                update_debugctlmsr(next->debugctlmsr);
 
-       if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-               set_debugreg(next->debugreg0, 0);
-               set_debugreg(next->debugreg1, 1);
-               set_debugreg(next->debugreg2, 2);
-               set_debugreg(next->debugreg3, 3);
-               /* no 4 and 5 */
-               set_debugreg(next->debugreg6, 6);
-               set_debugreg(next->debugreg7, 7);
-       }
-
        if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
            test_tsk_thread_flag(next_p, TIF_NOTSC)) {
                /* prev and next are different */
index 4cf79567cdab0728b33c2f9698e3a5b535e4eb28..d5bd3132ee706d764510eec5058cf3e214cb2cc2 100644 (file)
@@ -58,6 +58,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
        task_user_gs(p) = get_user_gs(regs);
 
+       p->thread.io_bitmap_ptr = NULL;
        tsk = current;
+       err = -ENOMEM;
+
+       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
        if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
                                                IO_BITMAP_BYTES, GFP_KERNEL);
index eb62cbcaa490ad553ef5d70b6751a2288d25089e..70cf15873f3d65da38e42fbbb670555d2fc22722 100644 (file)
@@ -52,6 +52,7 @@
 #include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/ds.h>
+#include <asm/debugreg.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -297,12 +298,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
        p->thread.fs = me->thread.fs;
        p->thread.gs = me->thread.gs;
+       p->thread.io_bitmap_ptr = NULL;
 
        savesegment(gs, p->thread.gsindex);
        savesegment(fs, p->thread.fsindex);
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
 
+       err = -ENOMEM;
+       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
@@ -341,6 +346,7 @@ out:
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }
+
        return err;
 }
 
@@ -495,6 +501,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         */
        if (preload_fpu)
                __math_state_restore();
+
        return prev_p;
 }
 
index 7b058a2dc66afecdaeb58877102957dff77e7d81..04d182a7cfdbd6e3c88c41658f6d0a91878e6366 100644 (file)
@@ -22,6 +22,8 @@
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/workqueue.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -34,6 +36,7 @@
 #include <asm/prctl.h>
 #include <asm/proto.h>
 #include <asm/ds.h>
+#include <asm/hw_breakpoint.h>
 
 #include "tls.h"
 
@@ -49,6 +52,118 @@ enum x86_regset {
        REGSET_IOPERM32,
 };
 
+struct pt_regs_offset {
+       const char *name;
+       int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+       REG_OFFSET_NAME(r15),
+       REG_OFFSET_NAME(r14),
+       REG_OFFSET_NAME(r13),
+       REG_OFFSET_NAME(r12),
+       REG_OFFSET_NAME(r11),
+       REG_OFFSET_NAME(r10),
+       REG_OFFSET_NAME(r9),
+       REG_OFFSET_NAME(r8),
+#endif
+       REG_OFFSET_NAME(bx),
+       REG_OFFSET_NAME(cx),
+       REG_OFFSET_NAME(dx),
+       REG_OFFSET_NAME(si),
+       REG_OFFSET_NAME(di),
+       REG_OFFSET_NAME(bp),
+       REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+       REG_OFFSET_NAME(ds),
+       REG_OFFSET_NAME(es),
+       REG_OFFSET_NAME(fs),
+       REG_OFFSET_NAME(gs),
+#endif
+       REG_OFFSET_NAME(orig_ax),
+       REG_OFFSET_NAME(ip),
+       REG_OFFSET_NAME(cs),
+       REG_OFFSET_NAME(flags),
+       REG_OFFSET_NAME(sp),
+       REG_OFFSET_NAME(ss),
+       REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name:      the name of a register
+ *
+ * regs_query_register_offset() returns the offset of a register in struct
+ * pt_regs from its name. If the name is invalid, this returns -EINVAL.
+ */
+int regs_query_register_offset(const char *name)
+{
+       const struct pt_regs_offset *roff;
+       for (roff = regoffset_table; roff->name != NULL; roff++)
+               if (!strcmp(roff->name, name))
+                       return roff->offset;
+       return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset:    the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of a register from its
+ * offset in struct pt_regs. If the @offset is invalid, this returns NULL.
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+       const struct pt_regs_offset *roff;
+       for (roff = regoffset_table; roff->name != NULL; roff++)
+               if (roff->offset == offset)
+                       return roff->name;
+       return NULL;
+}
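+
(A usage sketch built on the two lookups, which are inverses of each other; read_reg_by_name() is a hypothetical helper, not part of this patch:

	static unsigned long read_reg_by_name(struct pt_regs *regs, const char *name)
	{
		int off = regs_query_register_offset(name);	/* e.g. "ip", "sp" */

		if (off < 0)
			return 0;
		return *(unsigned long *)((unsigned long)regs + off);
	}
)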
+
+static const int arg_offs_table[] = {
+#ifdef CONFIG_X86_32
+       [0] = offsetof(struct pt_regs, ax),
+       [1] = offsetof(struct pt_regs, dx),
+       [2] = offsetof(struct pt_regs, cx)
+#else /* CONFIG_X86_64 */
+       [0] = offsetof(struct pt_regs, di),
+       [1] = offsetof(struct pt_regs, si),
+       [2] = offsetof(struct pt_regs, dx),
+       [3] = offsetof(struct pt_regs, cx),
+       [4] = offsetof(struct pt_regs, r8),
+       [5] = offsetof(struct pt_regs, r9)
+#endif
+};
+
+/**
+ * regs_get_argument_nth() - get Nth argument at function call
+ * @regs:      pt_regs which contains registers at function entry.
+ * @n:         argument number.
+ *
+ * regs_get_argument_nth() returns the @n th argument of a function call.
+ * Since the kernel stack is usually changed right after function entry,
+ * you must use this at function entry. If the @n th entry is NOT on the
+ * kernel stack or in pt_regs, this returns 0.
+ */
+unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
+{
+       if (n < ARRAY_SIZE(arg_offs_table))
+               return *(unsigned long *)((char *)regs + arg_offs_table[n]);
+       else {
+               /*
+                * The typical case: arg n is on the stack.
+                * (Note: stack[0] = return address, so skip it)
+                */
+               n -= ARRAY_SIZE(arg_offs_table);
+               return regs_get_kernel_stack_nth(regs, 1 + n);
+       }
+}
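+
(Usage sketch: a hypothetical kprobe pre-handler tracing the first two arguments of a probed function; per the comment above this is only meaningful at function entry, and it assumes the probe was registered via symbol_name:

	static int handler_pre(struct kprobe *p, struct pt_regs *regs)
	{
		printk(KERN_INFO "%s(0x%lx, 0x%lx)\n", p->symbol_name,
		       regs_get_argument_nth(regs, 0),
		       regs_get_argument_nth(regs, 1));
		return 0;
	}
)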
+
 /*
  * does not yet catch signals sent when the child dies.
  * in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
        return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-       return TASK_SIZE - 3;
-}
-
 #else  /* CONFIG_X86_64 */
 
 #define FLAG_MASK              (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
        return 0;
 }
 
-static unsigned long debugreg_addr_limit(struct task_struct *task)
-{
-#ifdef CONFIG_IA32_EMULATION
-       if (test_tsk_thread_flag(task, TIF_IA32))
-               return IA32_PAGE_OFFSET - 3;
-#endif
-       return TASK_SIZE_MAX - 7;
-}
-
 #endif /* CONFIG_X86_32 */
 
 static unsigned long get_flags(struct task_struct *task)
@@ -454,98 +555,238 @@ static int genregs_set(struct task_struct *target,
        return ret;
 }
 
+static void ptrace_triggered(struct perf_event *bp, void *data)
+{
+       int i;
+       struct thread_struct *thread = &(current->thread);
+
+       /*
+        * Store in the virtual DR6 register the fact that the breakpoint
+        * was hit so the thread's debugger will see it.
+        */
+       for (i = 0; i < HBP_NUM; i++) {
+               if (thread->ptrace_bps[i] == bp)
+                       break;
+       }
+
+       thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
 /*
- * This function is trivial and will be inlined by the compiler.
- * Having it separates the implementation details of debug
- * registers from the interface details of ptrace.
+ * Walk through every ptrace breakpoint for this thread and
+ * build the dr7 value from their attributes.
  */
-static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
+static unsigned long ptrace_get_dr7(struct perf_event *bp[])
 {
-       switch (n) {
-       case 0:         return child->thread.debugreg0;
-       case 1:         return child->thread.debugreg1;
-       case 2:         return child->thread.debugreg2;
-       case 3:         return child->thread.debugreg3;
-       case 6:         return child->thread.debugreg6;
-       case 7:         return child->thread.debugreg7;
+       int i;
+       int dr7 = 0;
+       struct arch_hw_breakpoint *info;
+
+       for (i = 0; i < HBP_NUM; i++) {
+               if (bp[i] && !bp[i]->attr.disabled) {
+                       info = counter_arch_bp(bp[i]);
+                       dr7 |= encode_dr7(i, info->len, info->type);
+               }
        }
-       return 0;
+
+       return dr7;
 }
 
-static int ptrace_set_debugreg(struct task_struct *child,
-                              int n, unsigned long data)
+static struct perf_event *
+ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
+                        struct task_struct *tsk, int disabled)
 {
-       int i;
+       int err;
+       int gen_len, gen_type;
+       DEFINE_BREAKPOINT_ATTR(attr);
 
-       if (unlikely(n == 4 || n == 5))
-               return -EIO;
+       /*
+        * We should have at least an inactive breakpoint at this
+        * slot; if not, the user is writing dr7 without having
+        * written the address register first.
+        */
+       if (!bp)
+               return ERR_PTR(-EINVAL);
 
-       if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
-               return -EIO;
+       err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
+       if (err)
+               return ERR_PTR(err);
 
-       switch (n) {
-       case 0:         child->thread.debugreg0 = data; break;
-       case 1:         child->thread.debugreg1 = data; break;
-       case 2:         child->thread.debugreg2 = data; break;
-       case 3:         child->thread.debugreg3 = data; break;
+       attr = bp->attr;
+       attr.bp_len = gen_len;
+       attr.bp_type = gen_type;
+       attr.disabled = disabled;
 
-       case 6:
-               if ((data & ~0xffffffffUL) != 0)
-                       return -EIO;
-               child->thread.debugreg6 = data;
-               break;
+       return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
+}
+
+/*
+ * Handle ptrace writes to debug register 7.
+ */
+static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       unsigned long old_dr7;
+       int i, orig_ret = 0, rc = 0;
+       int enabled, second_pass = 0;
+       unsigned len, type;
+       struct perf_event *bp;
+
+       data &= ~DR_CONTROL_RESERVED;
+       old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
+restore:
+       /*
+        * Loop through all the hardware breakpoints, making the
+        * appropriate changes to each.
+        */
+       for (i = 0; i < HBP_NUM; i++) {
+               enabled = decode_dr7(data, i, &len, &type);
+               bp = thread->ptrace_bps[i];
+
+               if (!enabled) {
+                       if (bp) {
+                               /*
+                                * Don't unregister the breakpoints right away,
+                                * unless all register_user_hw_breakpoint()
+                                * requests have succeeded. This prevents
+                                * any window of opportunity for debug
+                                * register grabbing by other users.
+                                */
+                               if (!second_pass)
+                                       continue;
+
+                               thread->ptrace_bps[i] = NULL;
+                               bp = ptrace_modify_breakpoint(bp, len, type,
+                                                             tsk, 1);
+                               if (IS_ERR(bp)) {
+                                       rc = PTR_ERR(bp);
+                                       thread->ptrace_bps[i] = NULL;
+                                       break;
+                               }
+                               thread->ptrace_bps[i] = bp;
+                       }
+                       continue;
+               }
+
+               bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
+
+               /* Incorrect bp, or we have a bug in the bp API */
+               if (IS_ERR(bp)) {
+                       rc = PTR_ERR(bp);
+                       thread->ptrace_bps[i] = NULL;
+                       break;
+               }
+               thread->ptrace_bps[i] = bp;
+       }
+       /*
+        * Make a second pass to free the remaining unused breakpoints
+        * or to restore the original breakpoints if an error occurred.
+        */
+       if (!second_pass) {
+               second_pass = 1;
+               if (rc < 0) {
+                       orig_ret = rc;
+                       data = old_dr7;
+               }
+               goto restore;
+       }
+       return ((orig_ret < 0) ? orig_ret : rc);
+}
+
+/*
+ * Handle PTRACE_PEEKUSR calls for the debug register area.
+ */
+static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       unsigned long val = 0;
 
-       case 7:
+       if (n < HBP_NUM) {
+               struct perf_event *bp;
+               bp = thread->ptrace_bps[n];
+               if (!bp)
+                       return 0;
+               val = bp->hw.info.address;
+       } else if (n == 6) {
+               val = thread->debugreg6;
+       } else if (n == 7) {
+               val = ptrace_get_dr7(thread->ptrace_bps);
+       }
+       return val;
+}
+
+static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
+                                     unsigned long addr)
+{
+       struct perf_event *bp;
+       struct thread_struct *t = &tsk->thread;
+       DEFINE_BREAKPOINT_ATTR(attr);
+
+       if (!t->ptrace_bps[nr]) {
                /*
-                * Sanity-check data. Take one half-byte at once with
-                * check = (val >> (16 + 4*i)) & 0xf. It contains the
-                * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
-                * 2 and 3 are LENi. Given a list of invalid values,
-                * we do mask |= 1 << invalid_value, so that
-                * (mask >> check) & 1 is a correct test for invalid
-                * values.
-                *
-                * R/Wi contains the type of the breakpoint /
-                * watchpoint, LENi contains the length of the watched
-                * data in the watchpoint case.
-                *
-                * The invalid values are:
-                * - LENi == 0x10 (undefined), so mask |= 0x0f00.       [32-bit]
-                * - R/Wi == 0x10 (break on I/O reads or writes), so
-                *   mask |= 0x4444.
-                * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
-                *   0x1110.
-                *
-                * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
-                *
-                * See the Intel Manual "System Programming Guide",
-                * 15.2.4
-                *
-                * Note that LENi == 0x10 is defined on x86_64 in long
-                * mode (i.e. even for 32-bit userspace software, but
-                * 64-bit kernel), so the x86_64 mask value is 0x5454.
-                * See the AMD manual no. 24593 (AMD64 System Programming)
+                * Use stub len and type to register (reserve) an inactive
+                * but correct bp.
                 */
-#ifdef CONFIG_X86_32
-#define        DR7_MASK        0x5f54
-#else
-#define        DR7_MASK        0x5554
-#endif
-               data &= ~DR_CONTROL_RESERVED;
-               for (i = 0; i < 4; i++)
-                       if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
-                               return -EIO;
-               child->thread.debugreg7 = data;
-               if (data)
-                       set_tsk_thread_flag(child, TIF_DEBUG);
-               else
-                       clear_tsk_thread_flag(child, TIF_DEBUG);
-               break;
+               attr.bp_addr = addr;
+               attr.bp_len = HW_BREAKPOINT_LEN_1;
+               attr.bp_type = HW_BREAKPOINT_W;
+               attr.disabled = 1;
+
+               bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+       } else {
+               bp = t->ptrace_bps[nr];
+               t->ptrace_bps[nr] = NULL;
+
+               attr = bp->attr;
+               attr.bp_addr = addr;
+               bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
        }
+       /*
+        * CHECKME: the previous code returned -EIO if the addr wasn't a
+        * valid task virtual addr. The new one will return -EINVAL in this
+        * case.
+        * -EINVAL may be what we want for in-kernel breakpoint users, but
+        * -EIO looks better for ptrace, since we refuse a register write
+        * for the user. And anyway, that was the previous behaviour.
+        */
+       if (IS_ERR(bp))
+               return PTR_ERR(bp);
+
+       t->ptrace_bps[nr] = bp;
 
        return 0;
 }
 
+/*
+ * Handle PTRACE_POKEUSR calls for the debug register area.
+ */
+int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+{
+       struct thread_struct *thread = &(tsk->thread);
+       int rc = 0;
+
+       /* There are no DR4 or DR5 registers */
+       if (n == 4 || n == 5)
+               return -EIO;
+
+       if (n == 6) {
+               thread->debugreg6 = val;
+               goto ret_path;
+       }
+       if (n < HBP_NUM) {
+               rc = ptrace_set_breakpoint_addr(tsk, n, val);
+               if (rc)
+                       return rc;
+       }
+       /* All that's left is DR7 */
+       if (n == 7)
+               rc = ptrace_write_dr7(tsk, val);
+
+ret_path:
+       return rc;
+}
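+
(From user space this path is driven through PTRACE_POKEUSER on the u_debugreg area of struct user. A hedged sketch that arms a one-byte write watchpoint on a stopped tracee; the address and DR7 value are illustrative and error handling is trimmed:

	#include <stddef.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/user.h>

	static int set_write_watchpoint(pid_t pid, unsigned long addr)
	{
		/* DR0 = address: lands in ptrace_set_breakpoint_addr() above */
		if (ptrace(PTRACE_POKEUSER, pid,
			   (void *)offsetof(struct user, u_debugreg[0]),
			   (void *)addr) < 0)
			return -1;

		/* DR7: L0 enable, R/W0 = 01b (write), LEN0 = 00b (1 byte):
		 * lands in ptrace_write_dr7() above */
		if (ptrace(PTRACE_POKEUSER, pid,
			   (void *)offsetof(struct user, u_debugreg[7]),
			   (void *)(0x1UL | (0x1UL << 16))) < 0)
			return -1;
		return 0;
	}
)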
+
 /*
  * These access the current or another (stopped) task's io permission
  * bitmap for debugging or core dump.
index 2a34f9c5be214fd428c4197a1f09819d65759a37..c0ca8f921c91f24091cb89af9fad7360c6312313 100644 (file)
 #ifdef CONFIG_X86_64
 #include <asm/numa_64.h>
 #endif
+#include <asm/mce.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -1031,6 +1032,8 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif
        x86_init.oem.banner();
+
+       mcheck_init();
 }
 
 #ifdef CONFIG_X86_32
index 6a44a76055adcb781572978d1e332a9923f7e2d6..fbf3b07c856740805f0dbe64a85fa0d2131df37e 100644 (file)
@@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs)
 
        signr = get_signal_to_deliver(&info, &ka, regs, NULL);
        if (signr > 0) {
-               /*
-                * Re-enable any watchpoints before delivering the
-                * signal to user space. The processor register will
-                * have been cleared if the watchpoint triggered
-                * inside the kernel.
-                */
-               if (current->thread.debugreg7)
-                       set_debugreg(current->thread.debugreg7, 7);
-
                /* Whee! Actually deliver the signal.  */
                if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
                        /*
index 7e37dcee0cc352df1104e211d77cce2f99a9e4e8..33399176512a8a2c4c718d53ad76bdea631bd46e 100644 (file)
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
        struct task_struct *tsk = current;
-       unsigned long condition;
+       unsigned long dr6;
        int si_code;
 
-       get_debugreg(condition, 6);
+       get_debugreg(dr6, 6);
 
        /* Catch kmemcheck conditions first of all! */
-       if (condition & DR_STEP && kmemcheck_trap(regs))
+       if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
                return;
 
+       /* DR6 may or may not be cleared by the CPU */
+       set_debugreg(0, 6);
        /*
         * The processor cleared BTF, so don't mark that we need it set.
         */
        clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
        tsk->thread.debugctlmsr = 0;
 
-       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
-                                               SIGTRAP) == NOTIFY_STOP)
+       /* Store the virtualized DR6 value */
+       tsk->thread.debugreg6 = dr6;
+
+       if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
+                                                       SIGTRAP) == NOTIFY_STOP)
                return;
 
        /* It's safe to allow irq's after DR6 has been saved */
        preempt_conditional_sti(regs);
 
-       /* Mask out spurious debug traps due to lazy DR7 setting */
-       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-               if (!tsk->thread.debugreg7)
-                       goto clear_dr7;
+       if (regs->flags & X86_VM_MASK) {
+               handle_vm86_trap((struct kernel_vm86_regs *) regs,
+                               error_code, 1);
+               return;
        }
 
-#ifdef CONFIG_X86_32
-       if (regs->flags & X86_VM_MASK)
-               goto debug_vm86;
-#endif
-
-       /* Save debug status register where ptrace can see it */
-       tsk->thread.debugreg6 = condition;
-
        /*
-        * Single-stepping through TF: make sure we ignore any events in
-        * kernel space (but re-enable TF when returning to user mode).
+        * Single-stepping through system calls: ignore any exceptions in
+        * kernel space, but re-enable TF when returning to user mode.
+        *
+        * We already checked v86 mode above, so we can check for kernel mode
+        * by just checking the CPL of CS.
         */
-       if (condition & DR_STEP) {
-               if (!user_mode(regs))
-                       goto clear_TF_reenable;
+       if ((dr6 & DR_STEP) && !user_mode(regs)) {
+               tsk->thread.debugreg6 &= ~DR_STEP;
+               set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+               regs->flags &= ~X86_EFLAGS_TF;
        }
-
-       si_code = get_si_code(condition);
-       /* Ok, finally something we can handle */
-       send_sigtrap(tsk, regs, error_code, si_code);
-
-       /*
-        * Disable additional traps. They'll be re-enabled when
-        * the signal is delivered.
-        */
-clear_dr7:
-       set_debugreg(0, 7);
+       si_code = get_si_code(tsk->thread.debugreg6);
+       if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
+               send_sigtrap(tsk, regs, error_code, si_code);
        preempt_conditional_cli(regs);
-       return;
 
-#ifdef CONFIG_X86_32
-debug_vm86:
-       /* reenable preemption: handle_vm86_trap() might sleep */
-       dec_preempt_count();
-       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-       conditional_cli(regs);
-       return;
-#endif
-
-clear_TF_reenable:
-       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-       regs->flags &= ~X86_EFLAGS_TF;
-       preempt_conditional_cli(regs);
        return;
 }
 
index ae07d261527cba458ed1682118b19295bc997847..4fc80174191ce4b17549b643fe11dee645286c3f 100644 (file)
@@ -42,6 +42,7 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
+#include <asm/debugreg.h>
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
@@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        trace_kvm_entry(vcpu->vcpu_id);
        kvm_x86_ops->run(vcpu, kvm_run);
 
-       if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) {
-               set_debugreg(current->thread.debugreg0, 0);
-               set_debugreg(current->thread.debugreg1, 1);
-               set_debugreg(current->thread.debugreg2, 2);
-               set_debugreg(current->thread.debugreg3, 3);
-               set_debugreg(current->thread.debugreg6, 6);
-               set_debugreg(current->thread.debugreg7, 7);
-       }
+       /*
+        * If the guest has used debug registers, at least dr7
+        * will be disabled when returning to the host.
+        * If we don't have active breakpoints in the host, we don't
+        * care about the messed-up debug address registers. But if
+        * we have some of them active, restore the old state.
+        */
+       if (hw_breakpoint_active())
+               hw_breakpoint_restore();
 
        set_bit(KVM_REQ_KICK, &vcpu->requests);
        local_irq_enable();
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644 (file)
index 0000000..8df89f0
--- /dev/null
@@ -0,0 +1 @@
+inat-tables.c
index 85f5db95c60f03718f292080587f7faa337ef570..a2d6472895fb309884d442b100502447fcb4c5e9 100644 (file)
@@ -2,12 +2,25 @@
 # Makefile for x86 specific library files.
 #
 
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN     $@
+      cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
+
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+       $(call cmd,inat_tables)
+
+$(obj)/inat.o: $(obj)/inat-tables.c
+
+clean-files := inat-tables.c
+
 obj-$(CONFIG_SMP) := msr.o
 
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
+lib-y += insn.o inat.o
 
 obj-y += msr-reg.o msr-reg-export.o
 
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644 (file)
index 0000000..46fc4ee
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/insn.h>
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+       return inat_primary_table[opcode];
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
+                                     insn_attr_t esc_attr)
+{
+       const insn_attr_t *table;
+       insn_attr_t lpfx_attr;
+       int n, m = 0;
+
+       n = inat_escape_id(esc_attr);
+       if (last_pfx) {
+               lpfx_attr = inat_get_opcode_attribute(last_pfx);
+               m = inat_last_prefix_id(lpfx_attr);
+       }
+       table = inat_escape_tables[n][0];
+       if (!table)
+               return 0;
+       if (inat_has_variant(table[opcode]) && m) {
+               table = inat_escape_tables[n][m];
+               if (!table)
+                       return 0;
+       }
+       return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
+                                    insn_attr_t grp_attr)
+{
+       const insn_attr_t *table;
+       insn_attr_t lpfx_attr;
+       int n, m = 0;
+
+       n = inat_group_id(grp_attr);
+       if (last_pfx) {
+               lpfx_attr = inat_get_opcode_attribute(last_pfx);
+               m = inat_last_prefix_id(lpfx_attr);
+       }
+       table = inat_group_tables[n][0];
+       if (!table)
+               return inat_group_common_attribute(grp_attr);
+       if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
+               table = inat_group_tables[n][m];
+               if (!table)
+                       return inat_group_common_attribute(grp_attr);
+       }
+       return table[X86_MODRM_REG(modrm)] |
+              inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+                                  insn_byte_t vex_p)
+{
+       const insn_attr_t *table;
+       if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+               return 0;
+       table = inat_avx_tables[vex_m][vex_p];
+       if (!table)
+               return 0;
+       return table[opcode];
+}
+
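[Editor's aside] A minimal sketch of how these lookup helpers chain together for a two-byte (0x0f-escaped) opcode; it mirrors the escape loop in insn_get_opcode() in insn.c below, and the function name here is purely illustrative:

	/* Illustrative only: resolve the attribute of a "0x0f xx" opcode.
	 * last_pfx is the last legacy prefix (e.g. 0x66/0xf2/0xf3), or 0. */
	static insn_attr_t lookup_0f(insn_byte_t op2, insn_byte_t last_pfx)
	{
		insn_attr_t attr = inat_get_opcode_attribute(0x0f);

		if (!inat_is_escape(attr))	/* 0x0f is the 2-byte escape */
			return 0;
		return inat_get_escape_attribute(op2, last_pfx, attr);
	}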
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644 (file)
index 0000000..9f33b98
--- /dev/null
@@ -0,0 +1,516 @@
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004, 2009
+ */
+
+#include <linux/string.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+
+#define get_next(t, insn)      \
+       ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define peek_next(t, insn)     \
+       ({t r; r = *(t*)insn->next_byte; r; })
+
+#define peek_nbyte_next(t, insn, n)    \
+       ({t r; r = *(t*)((insn)->next_byte + n); r; })
+
+/**
+ * insn_init() - initialize struct insn
+ * @insn:      &struct insn to be initialized
+ * @kaddr:     address (in kernel memory) of instruction (or copy thereof)
+ * @x86_64:    !0 for 64-bit kernel or 64-bit app
+ */
+void insn_init(struct insn *insn, const void *kaddr, int x86_64)
+{
+       memset(insn, 0, sizeof(*insn));
+       insn->kaddr = kaddr;
+       insn->next_byte = kaddr;
+       insn->x86_64 = x86_64 ? 1 : 0;
+       insn->opnd_bytes = 4;
+       if (x86_64)
+               insn->addr_bytes = 8;
+       else
+               insn->addr_bytes = 4;
+}
+
+/**
+ * insn_get_prefixes - scan x86 instruction prefix bytes
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
+ * to point to the (first) opcode.  No effect if @insn->prefixes.got
+ * is already set.
+ */
+void insn_get_prefixes(struct insn *insn)
+{
+       struct insn_field *prefixes = &insn->prefixes;
+       insn_attr_t attr;
+       insn_byte_t b, lb;
+       int i, nb;
+
+       if (prefixes->got)
+               return;
+
+       nb = 0;
+       lb = 0;
+       b = peek_next(insn_byte_t, insn);
+       attr = inat_get_opcode_attribute(b);
+       while (inat_is_legacy_prefix(attr)) {
+               /* Skip if same prefix */
+               for (i = 0; i < nb; i++)
+                       if (prefixes->bytes[i] == b)
+                               goto found;
+               if (nb == 4)
+                       /* Invalid instruction */
+                       break;
+               prefixes->bytes[nb++] = b;
+               if (inat_is_address_size_prefix(attr)) {
+                       /* address size switches 2/4 or 4/8 */
+                       if (insn->x86_64)
+                               insn->addr_bytes ^= 12;
+                       else
+                               insn->addr_bytes ^= 6;
+               } else if (inat_is_operand_size_prefix(attr)) {
+                       /* operand size switches 2/4 */
+                       insn->opnd_bytes ^= 6;
+               }
+found:
+               prefixes->nbytes++;
+               insn->next_byte++;
+               lb = b;
+               b = peek_next(insn_byte_t, insn);
+               attr = inat_get_opcode_attribute(b);
+       }
+       /* Set the last prefix */
+       if (lb && lb != insn->prefixes.bytes[3]) {
+               if (unlikely(insn->prefixes.bytes[3])) {
+                       /* Swap the last prefix */
+                       b = insn->prefixes.bytes[3];
+                       for (i = 0; i < nb; i++)
+                               if (prefixes->bytes[i] == lb)
+                                       prefixes->bytes[i] = b;
+               }
+               insn->prefixes.bytes[3] = lb;
+       }
+
+       /* Decode REX prefix */
+       if (insn->x86_64) {
+               b = peek_next(insn_byte_t, insn);
+               attr = inat_get_opcode_attribute(b);
+               if (inat_is_rex_prefix(attr)) {
+                       insn->rex_prefix.value = b;
+                       insn->rex_prefix.nbytes = 1;
+                       insn->next_byte++;
+                       if (X86_REX_W(b))
+                               /* REX.W overrides opnd_size */
+                               insn->opnd_bytes = 8;
+               }
+       }
+       insn->rex_prefix.got = 1;
+
+       /* Decode VEX prefix */
+       b = peek_next(insn_byte_t, insn);
+       attr = inat_get_opcode_attribute(b);
+       if (inat_is_vex_prefix(attr)) {
+               insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
+               if (!insn->x86_64) {
+                       /*
+                        * In 32-bit mode, if the [7:6] bits (mod bits of
+                        * ModRM) on the second byte are not 11b, it is
+                        * LDS or LES.
+                        */
+                       if (X86_MODRM_MOD(b2) != 3)
+                               goto vex_end;
+               }
+               insn->vex_prefix.bytes[0] = b;
+               insn->vex_prefix.bytes[1] = b2;
+               if (inat_is_vex3_prefix(attr)) {
+                       b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+                       insn->vex_prefix.bytes[2] = b2;
+                       insn->vex_prefix.nbytes = 3;
+                       insn->next_byte += 3;
+                       if (insn->x86_64 && X86_VEX_W(b2))
+                               /* VEX.W overrides opnd_size */
+                               insn->opnd_bytes = 8;
+               } else {
+                       insn->vex_prefix.nbytes = 2;
+                       insn->next_byte += 2;
+               }
+       }
+vex_end:
+       insn->vex_prefix.got = 1;
+
+       prefixes->got = 1;
+       return;
+}
+
+/**
+ * insn_get_opcode - collect opcode(s)
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates @insn->opcode, updates @insn->next_byte to point past the
+ * opcode byte(s), and sets @insn->attr (except for groups).
+ * If necessary, first collects any preceding (prefix) bytes.
+ * Sets @insn->opcode.value = opcode1.  No effect if @insn->opcode.got
+ * is already 1.
+ */
+void insn_get_opcode(struct insn *insn)
+{
+       struct insn_field *opcode = &insn->opcode;
+       insn_byte_t op, pfx;
+       if (opcode->got)
+               return;
+       if (!insn->prefixes.got)
+               insn_get_prefixes(insn);
+
+       /* Get first opcode */
+       op = get_next(insn_byte_t, insn);
+       opcode->bytes[0] = op;
+       opcode->nbytes = 1;
+
+       /* Check if there is VEX prefix or not */
+       if (insn_is_avx(insn)) {
+               insn_byte_t m, p;
+               m = insn_vex_m_bits(insn);
+               p = insn_vex_p_bits(insn);
+               insn->attr = inat_get_avx_attribute(op, m, p);
+               if (!inat_accept_vex(insn->attr))
+                       insn->attr = 0; /* This instruction is bad */
+               goto end;       /* VEX has only 1 byte for opcode */
+       }
+
+       insn->attr = inat_get_opcode_attribute(op);
+       while (inat_is_escape(insn->attr)) {
+               /* Get escaped opcode */
+               op = get_next(insn_byte_t, insn);
+               opcode->bytes[opcode->nbytes++] = op;
+               pfx = insn_last_prefix(insn);
+               insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
+       }
+       if (inat_must_vex(insn->attr))
+               insn->attr = 0; /* This instruction is bad */
+end:
+       opcode->got = 1;
+}
+
+/**
+ * insn_get_modrm - collect ModRM byte, if any
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates @insn->modrm and updates @insn->next_byte to point past the
+ * ModRM byte, if any.  If necessary, first collects the preceding bytes
+ * (prefixes and opcode(s)).  No effect if @insn->modrm.got is already 1.
+ */
+void insn_get_modrm(struct insn *insn)
+{
+       struct insn_field *modrm = &insn->modrm;
+       insn_byte_t pfx, mod;
+       if (modrm->got)
+               return;
+       if (!insn->opcode.got)
+               insn_get_opcode(insn);
+
+       if (inat_has_modrm(insn->attr)) {
+               mod = get_next(insn_byte_t, insn);
+               modrm->value = mod;
+               modrm->nbytes = 1;
+               if (inat_is_group(insn->attr)) {
+                       pfx = insn_last_prefix(insn);
+                       insn->attr = inat_get_group_attribute(mod, pfx,
+                                                             insn->attr);
+               }
+       }
+
+       if (insn->x86_64 && inat_is_force64(insn->attr))
+               insn->opnd_bytes = 8;
+       modrm->got = 1;
+}
+
+
+/**
+ * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.  No effect if @insn->x86_64 is 0.
+ */
+int insn_rip_relative(struct insn *insn)
+{
+       struct insn_field *modrm = &insn->modrm;
+
+       if (!insn->x86_64)
+               return 0;
+       if (!modrm->got)
+               insn_get_modrm(insn);
+       /*
+        * For rip-relative instructions, the mod field (top 2 bits)
+        * is zero and the r/m field (bottom 3 bits) is 0x5.
+        */
+       return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+}
+
+/**
+ * insn_get_sib() - Get the SIB byte of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.
+ */
+void insn_get_sib(struct insn *insn)
+{
+       insn_byte_t modrm;
+
+       if (insn->sib.got)
+               return;
+       if (!insn->modrm.got)
+               insn_get_modrm(insn);
+       if (insn->modrm.nbytes) {
+               modrm = (insn_byte_t)insn->modrm.value;
+               if (insn->addr_bytes != 2 &&
+                   X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
+                       insn->sib.value = get_next(insn_byte_t, insn);
+                       insn->sib.nbytes = 1;
+               }
+       }
+       insn->sib.got = 1;
+}
+
+
+/**
+ * insn_get_displacement() - Get the displacement of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * SIB byte.
+ * The displacement value is sign-extended.
+ */
+void insn_get_displacement(struct insn *insn)
+{
+       insn_byte_t mod, rm, base;
+
+       if (insn->displacement.got)
+               return;
+       if (!insn->sib.got)
+               insn_get_sib(insn);
+       if (insn->modrm.nbytes) {
+               /*
+                * Interpreting the modrm byte:
+                * mod = 00 - no displacement fields (exceptions below)
+                * mod = 01 - 1-byte displacement field
+                * mod = 10 - displacement field is 4 bytes, or 2 bytes if
+                *      address size = 2 (0x67 prefix in 32-bit mode)
+                * mod = 11 - no memory operand
+                *
+                * If address size = 2...
+                * mod = 00, r/m = 110 - displacement field is 2 bytes
+                *
+                * If address size != 2...
+                * mod != 11, r/m = 100 - SIB byte exists
+                * mod = 00, SIB base = 101 - displacement field is 4 bytes
+                * mod = 00, r/m = 101 - rip-relative addressing, displacement
+                *      field is 4 bytes
+                */
+               mod = X86_MODRM_MOD(insn->modrm.value);
+               rm = X86_MODRM_RM(insn->modrm.value);
+               base = X86_SIB_BASE(insn->sib.value);
+               if (mod == 3)
+                       goto out;
+               if (mod == 1) {
+                       insn->displacement.value = get_next(char, insn);
+                       insn->displacement.nbytes = 1;
+               } else if (insn->addr_bytes == 2) {
+                       if ((mod == 0 && rm == 6) || mod == 2) {
+                               insn->displacement.value =
+                                        get_next(short, insn);
+                               insn->displacement.nbytes = 2;
+                       }
+               } else {
+                       if ((mod == 0 && rm == 5) || mod == 2 ||
+                           (mod == 0 && base == 5)) {
+                               insn->displacement.value = get_next(int, insn);
+                               insn->displacement.nbytes = 4;
+                       }
+               }
+       }
+out:
+       insn->displacement.got = 1;
+}
+
+/* Decode moffset16/32/64 */
+static void __get_moffset(struct insn *insn)
+{
+       switch (insn->addr_bytes) {
+       case 2:
+               insn->moffset1.value = get_next(short, insn);
+               insn->moffset1.nbytes = 2;
+               break;
+       case 4:
+               insn->moffset1.value = get_next(int, insn);
+               insn->moffset1.nbytes = 4;
+               break;
+       case 8:
+               insn->moffset1.value = get_next(int, insn);
+               insn->moffset1.nbytes = 4;
+               insn->moffset2.value = get_next(int, insn);
+               insn->moffset2.nbytes = 4;
+               break;
+       }
+       insn->moffset1.got = insn->moffset2.got = 1;
+}
+
+/* Decode imm v32(Iz) */
+static void __get_immv32(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate.value = get_next(short, insn);
+               insn->immediate.nbytes = 2;
+               break;
+       case 4:
+       case 8:
+               insn->immediate.value = get_next(int, insn);
+               insn->immediate.nbytes = 4;
+               break;
+       }
+}
+
+/* Decode imm v64(Iv/Ov) */
+static void __get_immv(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate1.value = get_next(short, insn);
+               insn->immediate1.nbytes = 2;
+               break;
+       case 4:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               break;
+       case 8:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               insn->immediate2.value = get_next(int, insn);
+               insn->immediate2.nbytes = 4;
+               break;
+       }
+       insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/* Decode ptr16:16/32(Ap) */
+static void __get_immptr(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate1.value = get_next(short, insn);
+               insn->immediate1.nbytes = 2;
+               break;
+       case 4:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               break;
+       case 8:
+               /* ptr16:64 does not exist (no segment) */
+               return;
+       }
+       insn->immediate2.value = get_next(unsigned short, insn);
+       insn->immediate2.nbytes = 2;
+       insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/**
+ * insn_get_immediate() - Get the immediates of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * displacement bytes.
+ * Most immediates are sign-extended; the unsigned value can be obtained
+ * by masking with ((1 << (nbytes * 8)) - 1).
+ */
+void insn_get_immediate(struct insn *insn)
+{
+       if (insn->immediate.got)
+               return;
+       if (!insn->displacement.got)
+               insn_get_displacement(insn);
+
+       if (inat_has_moffset(insn->attr)) {
+               __get_moffset(insn);
+               goto done;
+       }
+
+       if (!inat_has_immediate(insn->attr))
+               /* no immediates */
+               goto done;
+
+       switch (inat_immediate_size(insn->attr)) {
+       case INAT_IMM_BYTE:
+               insn->immediate.value = get_next(char, insn);
+               insn->immediate.nbytes = 1;
+               break;
+       case INAT_IMM_WORD:
+               insn->immediate.value = get_next(short, insn);
+               insn->immediate.nbytes = 2;
+               break;
+       case INAT_IMM_DWORD:
+               insn->immediate.value = get_next(int, insn);
+               insn->immediate.nbytes = 4;
+               break;
+       case INAT_IMM_QWORD:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               insn->immediate2.value = get_next(int, insn);
+               insn->immediate2.nbytes = 4;
+               break;
+       case INAT_IMM_PTR:
+               __get_immptr(insn);
+               break;
+       case INAT_IMM_VWORD32:
+               __get_immv32(insn);
+               break;
+       case INAT_IMM_VWORD:
+               __get_immv(insn);
+               break;
+       default:
+               break;
+       }
+       if (inat_has_second_immediate(insn->attr)) {
+               insn->immediate2.value = get_next(char, insn);
+               insn->immediate2.nbytes = 1;
+       }
+done:
+       insn->immediate.got = 1;
+}
+
+/**
+ * insn_get_length() - Get the length of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * immediate bytes.
+ */
+void insn_get_length(struct insn *insn)
+{
+       if (insn->length)
+               return;
+       if (!insn->immediate.got)
+               insn_get_immediate(insn);
+       insn->length = (unsigned char)((unsigned long)insn->next_byte
+                                    - (unsigned long)insn->kaddr);
+}
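[Editor's aside] For orientation, a hedged usage sketch (the buffer and the 64-bit mode flag are assumptions, not taken from this patch): since each stage collects its predecessors on demand, a caller that only needs the length can ask for it directly, and all earlier stages (prefixes, opcode, ModRM, SIB, displacement, immediates) run transitively:

	#include <asm/insn.h>

	static int example_insn_len(const void *buf)
	{
		struct insn insn;

		insn_init(&insn, buf, 1);	/* 1: decode as 64-bit code */
		insn_get_length(&insn);		/* runs all prior stages */
		return insn.length;		/* next_byte - kaddr, in bytes */
	}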
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644 (file)
index 0000000..a793da5
--- /dev/null
@@ -0,0 +1,893 @@
+# x86 Opcode Maps
+#
+#<Opcode maps>
+# Table: table-name
+# Referrer: escaped-name
+# AVXcode: avx-code
+# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...]] [| 2nd-mnemonic ...]
+# (or)
+# opcode: escape # escaped-name
+# EndTable
+#
+#<group maps>
+# GrpTable: GrpXXX
+# reg:  mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...]] [| 2nd-mnemonic ...]
+# EndTable
+#
+# AVX Superscripts
+#  (VEX): this opcode can accept VEX prefix.
+#  (oVEX): this opcode requires VEX prefix.
+#  (o128): this opcode only supports 128-bit VEX.
+#  (o256): this opcode only supports 256-bit VEX.
+#
+
+Table: one byte opcode
+Referrer:
+AVXcode:
+# 0x00 - 0x0f
+00: ADD Eb,Gb
+01: ADD Ev,Gv
+02: ADD Gb,Eb
+03: ADD Gv,Ev
+04: ADD AL,Ib
+05: ADD rAX,Iz
+06: PUSH ES (i64)
+07: POP ES (i64)
+08: OR Eb,Gb
+09: OR Ev,Gv
+0a: OR Gb,Eb
+0b: OR Gv,Ev
+0c: OR AL,Ib
+0d: OR rAX,Iz
+0e: PUSH CS (i64)
+0f: escape # 2-byte escape
+# 0x10 - 0x1f
+10: ADC Eb,Gb
+11: ADC Ev,Gv
+12: ADC Gb,Eb
+13: ADC Gv,Ev
+14: ADC AL,Ib
+15: ADC rAX,Iz
+16: PUSH SS (i64)
+17: POP SS (i64)
+18: SBB Eb,Gb
+19: SBB Ev,Gv
+1a: SBB Gb,Eb
+1b: SBB Gv,Ev
+1c: SBB AL,Ib
+1d: SBB rAX,Iz
+1e: PUSH DS (i64)
+1f: POP DS (i64)
+# 0x20 - 0x2f
+20: AND Eb,Gb
+21: AND Ev,Gv
+22: AND Gb,Eb
+23: AND Gv,Ev
+24: AND AL,Ib
+25: AND rAX,Iz
+26: SEG=ES (Prefix)
+27: DAA (i64)
+28: SUB Eb,Gb
+29: SUB Ev,Gv
+2a: SUB Gb,Eb
+2b: SUB Gv,Ev
+2c: SUB AL,Ib
+2d: SUB rAX,Iz
+2e: SEG=CS (Prefix)
+2f: DAS (i64)
+# 0x30 - 0x3f
+30: XOR Eb,Gb
+31: XOR Ev,Gv
+32: XOR Gb,Eb
+33: XOR Gv,Ev
+34: XOR AL,Ib
+35: XOR rAX,Iz
+36: SEG=SS (Prefix)
+37: AAA (i64)
+38: CMP Eb,Gb
+39: CMP Ev,Gv
+3a: CMP Gb,Eb
+3b: CMP Gv,Ev
+3c: CMP AL,Ib
+3d: CMP rAX,Iz
+3e: SEG=DS (Prefix)
+3f: AAS (i64)
+# 0x40 - 0x4f
+40: INC eAX (i64) | REX (o64)
+41: INC eCX (i64) | REX.B (o64)
+42: INC eDX (i64) | REX.X (o64)
+43: INC eBX (i64) | REX.XB (o64)
+44: INC eSP (i64) | REX.R (o64)
+45: INC eBP (i64) | REX.RB (o64)
+46: INC eSI (i64) | REX.RX (o64)
+47: INC eDI (i64) | REX.RXB (o64)
+48: DEC eAX (i64) | REX.W (o64)
+49: DEC eCX (i64) | REX.WB (o64)
+4a: DEC eDX (i64) | REX.WX (o64)
+4b: DEC eBX (i64) | REX.WXB (o64)
+4c: DEC eSP (i64) | REX.WR (o64)
+4d: DEC eBP (i64) | REX.WRB (o64)
+4e: DEC eSI (i64) | REX.WRX (o64)
+4f: DEC eDI (i64) | REX.WRXB (o64)
+# 0x50 - 0x5f
+50: PUSH rAX/r8 (d64)
+51: PUSH rCX/r9 (d64)
+52: PUSH rDX/r10 (d64)
+53: PUSH rBX/r11 (d64)
+54: PUSH rSP/r12 (d64)
+55: PUSH rBP/r13 (d64)
+56: PUSH rSI/r14 (d64)
+57: PUSH rDI/r15 (d64)
+58: POP rAX/r8 (d64)
+59: POP rCX/r9 (d64)
+5a: POP rDX/r10 (d64)
+5b: POP rBX/r11 (d64)
+5c: POP rSP/r12 (d64)
+5d: POP rBP/r13 (d64)
+5e: POP rSI/r14 (d64)
+5f: POP rDI/r15 (d64)
+# 0x60 - 0x6f
+60: PUSHA/PUSHAD (i64)
+61: POPA/POPAD (i64)
+62: BOUND Gv,Ma (i64)
+63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
+64: SEG=FS (Prefix)
+65: SEG=GS (Prefix)
+66: Operand-Size (Prefix)
+67: Address-Size (Prefix)
+68: PUSH Iz (d64)
+69: IMUL Gv,Ev,Iz
+6a: PUSH Ib (d64)
+6b: IMUL Gv,Ev,Ib
+6c: INS/INSB Yb,DX
+6d: INS/INSW/INSD Yz,DX
+6e: OUTS/OUTSB DX,Xb
+6f: OUTS/OUTSW/OUTSD DX,Xz
+# 0x70 - 0x7f
+70: JO Jb
+71: JNO Jb
+72: JB/JNAE/JC Jb
+73: JNB/JAE/JNC Jb
+74: JZ/JE Jb
+75: JNZ/JNE Jb
+76: JBE/JNA Jb
+77: JNBE/JA Jb
+78: JS Jb
+79: JNS Jb
+7a: JP/JPE Jb
+7b: JNP/JPO Jb
+7c: JL/JNGE Jb
+7d: JNL/JGE Jb
+7e: JLE/JNG Jb
+7f: JNLE/JG Jb
+# 0x80 - 0x8f
+80: Grp1 Eb,Ib (1A)
+81: Grp1 Ev,Iz (1A)
+82: Grp1 Eb,Ib (1A),(i64)
+83: Grp1 Ev,Ib (1A)
+84: TEST Eb,Gb
+85: TEST Ev,Gv
+86: XCHG Eb,Gb
+87: XCHG Ev,Gv
+88: MOV Eb,Gb
+89: MOV Ev,Gv
+8a: MOV Gb,Eb
+8b: MOV Gv,Ev
+8c: MOV Ev,Sw
+8d: LEA Gv,M
+8e: MOV Sw,Ew
+8f: Grp1A (1A) | POP Ev (d64)
+# 0x90 - 0x9f
+90: NOP | PAUSE (F3) | XCHG r8,rAX
+91: XCHG rCX/r9,rAX
+92: XCHG rDX/r10,rAX
+93: XCHG rBX/r11,rAX
+94: XCHG rSP/r12,rAX
+95: XCHG rBP/r13,rAX
+96: XCHG rSI/r14,rAX
+97: XCHG rDI/r15,rAX
+98: CBW/CWDE/CDQE
+99: CWD/CDQ/CQO
+9a: CALLF Ap (i64)
+9b: FWAIT/WAIT
+9c: PUSHF/D/Q Fv (d64)
+9d: POPF/D/Q Fv (d64)
+9e: SAHF
+9f: LAHF
+# 0xa0 - 0xaf
+a0: MOV AL,Ob
+a1: MOV rAX,Ov
+a2: MOV Ob,AL
+a3: MOV Ov,rAX
+a4: MOVS/B Xb,Yb
+a5: MOVS/W/D/Q Xv,Yv
+a6: CMPS/B Xb,Yb
+a7: CMPS/W/D Xv,Yv
+a8: TEST AL,Ib
+a9: TEST rAX,Iz
+aa: STOS/B Yb,AL
+ab: STOS/W/D/Q Yv,rAX
+ac: LODS/B AL,Xb
+ad: LODS/W/D/Q rAX,Xv
+ae: SCAS/B AL,Yb
+af: SCAS/W/D/Q rAX,Xv
+# 0xb0 - 0xbf
+b0: MOV AL/R8L,Ib
+b1: MOV CL/R9L,Ib
+b2: MOV DL/R10L,Ib
+b3: MOV BL/R11L,Ib
+b4: MOV AH/R12L,Ib
+b5: MOV CH/R13L,Ib
+b6: MOV DH/R14L,Ib
+b7: MOV BH/R15L,Ib
+b8: MOV rAX/r8,Iv
+b9: MOV rCX/r9,Iv
+ba: MOV rDX/r10,Iv
+bb: MOV rBX/r11,Iv
+bc: MOV rSP/r12,Iv
+bd: MOV rBP/r13,Iv
+be: MOV rSI/r14,Iv
+bf: MOV rDI/r15,Iv
+# 0xc0 - 0xcf
+c0: Grp2 Eb,Ib (1A)
+c1: Grp2 Ev,Ib (1A)
+c2: RETN Iw (f64)
+c3: RETN
+c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
+c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
+c6: Grp11 Eb,Ib (1A)
+c7: Grp11 Ev,Iz (1A)
+c8: ENTER Iw,Ib
+c9: LEAVE (d64)
+ca: RETF Iw
+cb: RETF
+cc: INT3
+cd: INT Ib
+ce: INTO (i64)
+cf: IRET/D/Q
+# 0xd0 - 0xdf
+d0: Grp2 Eb,1 (1A)
+d1: Grp2 Ev,1 (1A)
+d2: Grp2 Eb,CL (1A)
+d3: Grp2 Ev,CL (1A)
+d4: AAM Ib (i64)
+d5: AAD Ib (i64)
+d6:
+d7: XLAT/XLATB
+d8: ESC
+d9: ESC
+da: ESC
+db: ESC
+dc: ESC
+dd: ESC
+de: ESC
+df: ESC
+# 0xe0 - 0xef
+e0: LOOPNE/LOOPNZ Jb (f64)
+e1: LOOPE/LOOPZ Jb (f64)
+e2: LOOP Jb (f64)
+e3: JrCXZ Jb (f64)
+e4: IN AL,Ib
+e5: IN eAX,Ib
+e6: OUT Ib,AL
+e7: OUT Ib,eAX
+e8: CALL Jz (f64)
+e9: JMP-near Jz (f64)
+ea: JMP-far Ap (i64)
+eb: JMP-short Jb (f64)
+ec: IN AL,DX
+ed: IN eAX,DX
+ee: OUT DX,AL
+ef: OUT DX,eAX
+# 0xf0 - 0xff
+f0: LOCK (Prefix)
+f1:
+f2: REPNE (Prefix)
+f3: REP/REPE (Prefix)
+f4: HLT
+f5: CMC
+f6: Grp3_1 Eb (1A)
+f7: Grp3_2 Ev (1A)
+f8: CLC
+f9: STC
+fa: CLI
+fb: STI
+fc: CLD
+fd: STD
+fe: Grp4 (1A)
+ff: Grp5 (1A)
+EndTable
+
+Table: 2-byte opcode (0x0f)
+Referrer: 2-byte escape
+AVXcode: 1
+# 0x0f 0x00-0x0f
+00: Grp6 (1A)
+01: Grp7 (1A)
+02: LAR Gv,Ew
+03: LSL Gv,Ew
+04:
+05: SYSCALL (o64)
+06: CLTS
+07: SYSRET (o64)
+08: INVD
+09: WBINVD
+0a:
+0b: UD2 (1B)
+0c:
+0d: NOP Ev | GrpP
+0e: FEMMS
+# 3DNow! uses the last imm byte as opcode extension.
+0f: 3DNow! Pq,Qq,Ib
+# 0x0f 0x10-0x1f
+10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
+11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
+12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
+13: movlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
+14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
+15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
+16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlhps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
+17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
+18: Grp16 (1A)
+19:
+1a:
+1b:
+1c:
+1d:
+1e:
+1f: NOP Ev
+# 0x0f 0x20-0x2f
+20: MOV Rd,Cd
+21: MOV Rd,Dd
+22: MOV Cd,Rd
+23: MOV Dd,Rd
+24:
+25:
+26:
+27:
+28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
+29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
+2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
+2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
+2c: cvttps2pi Ppi,Wps | cvttss2si  Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd  Vsd,Wsd (66),(VEX),(o128)
+2f: comiss Vss,Wss (VEX),(o128) | comisd  Vsd,Wsd (66),(VEX),(o128)
+# 0x0f 0x30-0x3f
+30: WRMSR
+31: RDTSC
+32: RDMSR
+33: RDPMC
+34: SYSENTER
+35: SYSEXIT
+36:
+37: GETSEC
+38: escape # 3-byte escape 1
+39:
+3a: escape # 3-byte escape 2
+3b:
+3c:
+3d:
+3e:
+3f:
+# 0x0f 0x40-0x4f
+40: CMOVO Gv,Ev
+41: CMOVNO Gv,Ev
+42: CMOVB/C/NAE Gv,Ev
+43: CMOVAE/NB/NC Gv,Ev
+44: CMOVE/Z Gv,Ev
+45: CMOVNE/NZ Gv,Ev
+46: CMOVBE/NA Gv,Ev
+47: CMOVA/NBE Gv,Ev
+48: CMOVS Gv,Ev
+49: CMOVNS Gv,Ev
+4a: CMOVP/PE Gv,Ev
+4b: CMOVNP/PO Gv,Ev
+4c: CMOVL/NGE Gv,Ev
+4d: CMOVNL/GE Gv,Ev
+4e: CMOVLE/NG Gv,Ev
+4f: CMOVNLE/G Gv,Ev
+# 0x0f 0x50-0x5f
+50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
+51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
+52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
+53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
+54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
+55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
+56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
+57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
+58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
+59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
+5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
+5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
+5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
+5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
+5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
+5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
+# 0x0f 0x60-0x6f
+60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
+61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
+62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
+63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
+64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
+65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
+66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
+67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
+68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
+69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
+6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
+6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
+6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
+6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
+6e: movd/q Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
+6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
+# 0x0f 0x70-0x7f
+70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw Vdq,Wdq,Ib (F2),(VEX),(o128)
+71: Grp12 (1A)
+72: Grp13 (1A)
+73: Grp14 (1A)
+74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
+75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
+76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
+77: emms/vzeroupper/vzeroall (VEX)
+78: VMREAD Ed/q,Gd/q
+79: VMWRITE Gd/q,Ed/q
+7a:
+7b:
+7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
+7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
+7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
+7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
+# 0x0f 0x80-0x8f
+80: JO Jz (f64)
+81: JNO Jz (f64)
+82: JB/JNAE/JC Jz (f64)
+83: JNB/JAE/JNC Jz (f64)
+84: JZ/JE Jz (f64)
+85: JNZ/JNE Jz (f64)
+86: JBE/JNA Jz (f64)
+87: JNBE/JA Jz (f64)
+88: JS Jz (f64)
+89: JNS Jz (f64)
+8a: JP/JPE Jz (f64)
+8b: JNP/JPO Jz (f64)
+8c: JL/JNGE Jz (f64)
+8d: JNL/JGE Jz (f64)
+8e: JLE/JNG Jz (f64)
+8f: JNLE/JG Jz (f64)
+# 0x0f 0x90-0x9f
+90: SETO Eb
+91: SETNO Eb
+92: SETB/C/NAE Eb
+93: SETAE/NB/NC Eb
+94: SETE/Z Eb
+95: SETNE/NZ Eb
+96: SETBE/NA Eb
+97: SETA/NBE Eb
+98: SETS Eb
+99: SETNS Eb
+9a: SETP/PE Eb
+9b: SETNP/PO Eb
+9c: SETL/NGE Eb
+9d: SETNL/GE Eb
+9e: SETLE/NG Eb
+9f: SETNLE/G Eb
+# 0x0f 0xa0-0xaf
+a0: PUSH FS (d64)
+a1: POP FS (d64)
+a2: CPUID
+a3: BT Ev,Gv
+a4: SHLD Ev,Gv,Ib
+a5: SHLD Ev,Gv,CL
+a6: GrpPDLK
+a7: GrpRNG
+a8: PUSH GS (d64)
+a9: POP GS (d64)
+aa: RSM
+ab: BTS Ev,Gv
+ac: SHRD Ev,Gv,Ib
+ad: SHRD Ev,Gv,CL
+ae: Grp15 (1A),(1C)
+af: IMUL Gv,Ev
+# 0x0f 0xb0-0xbf
+b0: CMPXCHG Eb,Gb
+b1: CMPXCHG Ev,Gv
+b2: LSS Gv,Mp
+b3: BTR Ev,Gv
+b4: LFS Gv,Mp
+b5: LGS Gv,Mp
+b6: MOVZX Gv,Eb
+b7: MOVZX Gv,Ew
+b8: JMPE | POPCNT Gv,Ev (F3)
+b9: Grp10 (1A)
+ba: Grp8 Ev,Ib (1A)
+bb: BTC Ev,Gv
+bc: BSF Gv,Ev
+bd: BSR Gv,Ev
+be: MOVSX Gv,Eb
+bf: MOVSX Gv,Ew
+# 0x0f 0xc0-0xcf
+c0: XADD Eb,Gb
+c1: XADD Ev,Gv
+c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
+c3: movnti Md/q,Gd/q
+c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
+c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
+c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
+c7: Grp9 (1A)
+c8: BSWAP RAX/EAX/R8/R8D
+c9: BSWAP RCX/ECX/R9/R9D
+ca: BSWAP RDX/EDX/R10/R10D
+cb: BSWAP RBX/EBX/R11/R11D
+cc: BSWAP RSP/ESP/R12/R12D
+cd: BSWAP RBP/EBP/R13/R13D
+ce: BSWAP RSI/ESI/R14/R14D
+cf: BSWAP RDI/EDI/R15/R15D
+# 0x0f 0xd0-0xdf
+d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
+d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
+d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
+d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
+d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
+d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
+d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
+d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
+d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
+da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
+db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
+dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
+dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
+de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
+df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xe0-0xef
+e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
+e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
+e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
+e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
+e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
+e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
+e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
+e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
+e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
+e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
+ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
+eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
+ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
+ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
+ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
+ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xf0-0xff
+f0: lddqu Vdq,Mdq (F2),(VEX)
+f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
+f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
+f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
+f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
+f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
+f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
+f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
+f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
+f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
+fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
+fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
+fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
+fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
+fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
+ff:
+EndTable
+
+Table: 3-byte opcode 1 (0x0f 0x38)
+Referrer: 3-byte escape 1
+AVXcode: 2
+# 0x0f 0x38 0x00-0x0f
+00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
+01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
+02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
+03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
+04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
+05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
+06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
+07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
+08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
+09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
+0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
+0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
+0c: vpermilps /r (66),(oVEX)
+0d: vpermilpd /r (66),(oVEX)
+0e: vtestps /r (66),(oVEX)
+0f: vtestpd /r (66),(oVEX)
+# 0x0f 0x38 0x10-0x1f
+10: pblendvb Vdq,Wdq (66)
+11:
+12:
+13:
+14: blendvps Vdq,Wdq (66)
+15: blendvpd Vdq,Wdq (66)
+16:
+17: ptest Vdq,Wdq (66),(VEX)
+18: vbroadcastss /r (66),(oVEX)
+19: vbroadcastsd /r (66),(oVEX),(o256)
+1a: vbroadcastf128 /r (66),(oVEX),(o256)
+1b:
+1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
+1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
+1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
+1f:
+# 0x0f 0x38 0x20-0x2f
+20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
+21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
+22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
+23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
+24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
+25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
+26:
+27:
+28: pmuldq Vdq,Wdq (66),(VEX),(o128)
+29: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
+2a: movntdqa Vdq,Mdq (66),(VEX),(o128)
+2b: packusdw Vdq,Wdq (66),(VEX),(o128)
+2c: vmaskmovps(ld) /r (66),(oVEX)
+2d: vmaskmovpd(ld) /r (66),(oVEX)
+2e: vmaskmovps(st) /r (66),(oVEX)
+2f: vmaskmovpd(st) /r (66),(oVEX)
+# 0x0f 0x38 0x30-0x3f
+30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
+31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
+32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
+33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
+34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
+35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
+36:
+37: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
+38: pminsb Vdq,Wdq (66),(VEX),(o128)
+39: pminsd Vdq,Wdq (66),(VEX),(o128)
+3a: pminuw Vdq,Wdq (66),(VEX),(o128)
+3b: pminud Vdq,Wdq (66),(VEX),(o128)
+3c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
+3d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
+3e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
+3f: pmaxud Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0x38 0x40-0x8f
+40: pmulld Vdq,Wdq (66),(VEX),(o128)
+41: phminposuw Vdq,Wdq (66),(VEX),(o128)
+80: INVEPT Gd/q,Mdq (66)
+81: INVVPID Gd/q,Mdq (66)
+# 0x0f 0x38 0x90-0xbf (FMA)
+96: vfmaddsub132pd/ps /r (66),(VEX)
+97: vfmsubadd132pd/ps /r (66),(VEX)
+98: vfmadd132pd/ps /r (66),(VEX)
+99: vfmadd132sd/ss /r (66),(VEX),(o128)
+9a: vfmsub132pd/ps /r (66),(VEX)
+9b: vfmsub132sd/ss /r (66),(VEX),(o128)
+9c: vfnmadd132pd/ps /r (66),(VEX)
+9d: vfnmadd132sd/ss /r (66),(VEX),(o128)
+9e: vfnmsub132pd/ps /r (66),(VEX)
+9f: vfnmsub132sd/ss /r (66),(VEX),(o128)
+a6: vfmaddsub213pd/ps /r (66),(VEX)
+a7: vfmsubadd213pd/ps /r (66),(VEX)
+a8: vfmadd213pd/ps /r (66),(VEX)
+a9: vfmadd213sd/ss /r (66),(VEX),(o128)
+aa: vfmsub213pd/ps /r (66),(VEX)
+ab: vfmsub213sd/ss /r (66),(VEX),(o128)
+ac: vfnmadd213pd/ps /r (66),(VEX)
+ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
+ae: vfnmsub213pd/ps /r (66),(VEX)
+af: vfnmsub213sd/ss /r (66),(VEX),(o128)
+b6: vfmaddsub231pd/ps /r (66),(VEX)
+b7: vfmsubadd231pd/ps /r (66),(VEX)
+b8: vfmadd231pd/ps /r (66),(VEX)
+b9: vfmadd231sd/ss /r (66),(VEX),(o128)
+ba: vfmsub231pd/ps /r (66),(VEX)
+bb: vfmsub231sd/ss /r (66),(VEX),(o128)
+bc: vfnmadd231pd/ps /r (66),(VEX)
+bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
+be: vfnmsub231pd/ps /r (66),(VEX)
+bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
+# 0x0f 0x38 0xc0-0xff
+db: aesimc Vdq,Wdq (66),(VEX),(o128)
+dc: aesenc Vdq,Wdq (66),(VEX),(o128)
+dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
+de: aesdec Vdq,Wdq (66),(VEX),(o128)
+df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
+f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
+f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x3a)
+Referrer: 3-byte escape 2
+AVXcode: 3
+# 0x0f 0x3a 0x00-0xff
+04: vpermilps /r,Ib (66),(oVEX)
+05: vpermilpd /r,Ib (66),(oVEX)
+06: vperm2f128 /r,Ib (66),(oVEX),(o256)
+08: roundps Vdq,Wdq,Ib (66),(VEX)
+09: roundpd Vdq,Wdq,Ib (66),(VEX)
+0a: roundss Vss,Wss,Ib (66),(VEX),(o128)
+0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
+0c: blendps Vdq,Wdq,Ib (66),(VEX)
+0d: blendpd Vdq,Wdq,Ib (66),(VEX)
+0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
+0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
+14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
+15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
+16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
+17: extractps Ed,Vdq,Ib (66),(VEX),(o128)
+18: vinsertf128 /r,Ib (66),(oVEX),(o256)
+19: vextractf128 /r,Ib (66),(oVEX),(o256)
+20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
+21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
+22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
+40: dpps Vdq,Wdq,Ib (66),(VEX)
+41: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
+42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
+44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
+4a: vblendvps /r,Ib (66),(oVEX)
+4b: vblendvpd /r,Ib (66),(oVEX)
+4c: vpblendvb /r,Ib (66),(oVEX),(o128)
+60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
+61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
+62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
+63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
+df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
+EndTable
+
+GrpTable: Grp1
+0: ADD
+1: OR
+2: ADC
+3: SBB
+4: AND
+5: SUB
+6: XOR
+7: CMP
+EndTable
+
+GrpTable: Grp1A
+0: POP
+EndTable
+
+GrpTable: Grp2
+0: ROL
+1: ROR
+2: RCL
+3: RCR
+4: SHL/SAL
+5: SHR
+6:
+7: SAR
+EndTable
+
+GrpTable: Grp3_1
+0: TEST Eb,Ib
+1:
+2: NOT Eb
+3: NEG Eb
+4: MUL AL,Eb
+5: IMUL AL,Eb
+6: DIV AL,Eb
+7: IDIV AL,Eb
+EndTable
+
+GrpTable: Grp3_2
+0: TEST Ev,Iz
+1:
+2: NOT Ev
+3: NEG Ev
+4: MUL rAX,Ev
+5: IMUL rAX,Ev
+6: DIV rAX,Ev
+7: IDIV rAX,Ev
+EndTable
+
+GrpTable: Grp4
+0: INC Eb
+1: DEC Eb
+EndTable
+
+GrpTable: Grp5
+0: INC Ev
+1: DEC Ev
+2: CALLN Ev (f64)
+3: CALLF Ep
+4: JMPN Ev (f64)
+5: JMPF Ep
+6: PUSH Ev (d64)
+7:
+EndTable
+
+GrpTable: Grp6
+0: SLDT Rv/Mw
+1: STR Rv/Mw
+2: LLDT Ew
+3: LTR Ew
+4: VERR Ew
+5: VERW Ew
+EndTable
+
+GrpTable: Grp7
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
+3: LIDT Ms
+4: SMSW Mw/Rv
+5:
+6: LMSW Ew
+7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
+EndTable
+
+GrpTable: Grp8
+4: BT
+5: BTS
+6: BTR
+7: BTC
+EndTable
+
+GrpTable: Grp9
+1: CMPXCHG8B/16B Mq/Mdq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
+7: VMPTRST Mq
+EndTable
+
+GrpTable: Grp10
+EndTable
+
+GrpTable: Grp11
+0: MOV
+EndTable
+
+GrpTable: Grp12
+2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
+4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
+6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp13
+2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
+4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
+6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp14
+2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
+3: psrldq Udq,Ib (66),(11B),(VEX),(o128)
+6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
+7: pslldq Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp15
+0: fxsave
+1: fxrstor
+2: ldmxcsr (VEX)
+3: stmxcsr (VEX)
+4: XSAVE
+5: XRSTOR | lfence (11B)
+6: mfence (11B)
+7: clflush | sfence (11B)
+EndTable
+
+GrpTable: Grp16
+0: prefetch NTA
+1: prefetch T0
+2: prefetch T1
+3: prefetch T2
+EndTable
+
+# AMD's Prefetch Group
+GrpTable: GrpP
+0: PREFETCH
+1: PREFETCHW
+EndTable
+
+GrpTable: GrpPDLK
+0: MONTMUL
+1: XSHA1
+2: XSHA2
+EndTable
+
+GrpTable: GrpRNG
+0: xstore-rng
+1: xcrypt-ecb
+2: xcrypt-cbc
+4: xcrypt-cfb
+5: xcrypt-ofb
+EndTable
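[Editor's aside] To make the map-to-table step concrete (a sketch; the flag names come from the generator added below, the entries from the one-byte table above): "00: ADD Eb,Gb" has ModRM-encoded operands, so convert_operands() yields INAT_MODRM, while "04: ADD AL,Ib" carries only a byte immediate, and print_table() would emit roughly:

	/* Sketch of gen-insn-attr-x86.awk output; only non-zero entries
	 * are emitted into the generated inat-tables.c. */
	const insn_attr_t inat_primary_table[INAT_OPCODE_TABLE_SIZE] = {
		[0x00] = INAT_MODRM,			/* ADD Eb,Gb */
		[0x04] = INAT_MAKE_IMM(INAT_IMM_BYTE),	/* ADD AL,Ib */
		/* ... */
	};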
index f4cee9028cf0b01e11951662b625f63371f627e6..8f4e2ac93928edd82f4b34ac3bdead37eea289d4 100644 (file)
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  */
-static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+static inline int __kprobes
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
        return 0;
 }
 
-static inline int notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes notify_page_fault(struct pt_regs *regs)
 {
        int ret = 0;
 
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
  *
  *   Handle a fault on the vmalloc or module mapping area
  */
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
 {
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
  *
  * This assumes no large pages in there.
  */
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
 {
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
  */
-static noinline int
+static noinline __kprobes int
 spurious_fault(unsigned long error_code, unsigned long address)
 {
        pgd_t *pgd;
index 16ccbd77917f22c1693b9b41fcb8dc7485acee39..11a4ad4d62530ff58b7bfa56d086ad69e3fc0bd1 100644 (file)
@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
        struct die_args *arg = args;
 
        if (val == DIE_DEBUG && (arg->err & DR_STEP))
-               if (post_kmmio_handler(arg->err, arg->regs) == 1)
+               if (post_kmmio_handler(arg->err, arg->regs) == 1) {
+                       /*
+                        * Reset the BS bit in dr6 (pointed to by arg->err)
+                        * to denote completion of processing.
+                        */
+                       (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
                        return NOTIFY_STOP;
+               }
 
        return NOTIFY_DONE;
 }
index 8aa85f17667e5034cfd2fae3eecd217c878d0529..0a979f3e5b8a7596aaf7d402cf73b0bb7eace8a9 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/mce.h>
 #include <asm/xcr.h>
 #include <asm/suspend.h>
+#include <asm/debugreg.h>
 
 #ifdef CONFIG_X86_32
 static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
 #endif
        load_TR_desc();                         /* This does ltr */
        load_LDT(&current->active_mm->context); /* This does lldt */
-
-       /*
-        * Now maybe reload the debug registers
-        */
-       if (current->thread.debugreg7) {
-#ifdef CONFIG_X86_32
-               set_debugreg(current->thread.debugreg0, 0);
-               set_debugreg(current->thread.debugreg1, 1);
-               set_debugreg(current->thread.debugreg2, 2);
-               set_debugreg(current->thread.debugreg3, 3);
-               /* no 4 and 5 */
-               set_debugreg(current->thread.debugreg6, 6);
-               set_debugreg(current->thread.debugreg7, 7);
-#else
-               /* CONFIG_X86_64 */
-               loaddebug(&current->thread, 0);
-               loaddebug(&current->thread, 1);
-               loaddebug(&current->thread, 2);
-               loaddebug(&current->thread, 3);
-               /* no 4 and 5 */
-               loaddebug(&current->thread, 6);
-               loaddebug(&current->thread, 7);
-#endif
-       }
-
 }
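[Editor's note] The block removed above is not lost functionality: with this merge, debug register state is managed by the hw-breakpoints layer (compare the hw_breakpoint_restore() call added to the KVM path at the top of this section), so suspend/resume presumably no longer needs to reload thread.debugregN by hand. This reading is an inference from the files touched by the merge, not stated in the hunk itself.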
 
 /**
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644 (file)
index 0000000..f820826
--- /dev/null
@@ -0,0 +1,31 @@
+PHONY += posttest
+
+ifeq ($(KBUILD_VERBOSE),1)
+  posttest_verbose = -v
+else
+  posttest_verbose =
+endif
+
+ifeq ($(CONFIG_64BIT),y)
+  posttest_64bit = -y
+else
+  posttest_64bit = -n
+endif
+
+distill_awk = $(srctree)/arch/x86/tools/distill.awk
+chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
+
+quiet_cmd_posttest = TEST    $@
+      cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
+
+posttest: $(obj)/test_get_len vmlinux
+       $(call cmd,posttest)
+
+hostprogs-y    := test_get_len
+
+# -I is needed for the generated C source and for C sources in the kernel tree.
+HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+
+# Dependencies are also needed.
+$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
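[Editor's note] The hook that actually runs this target lives in arch/x86/Makefile (also touched by this merge), not here. The effect of cmd_posttest above: first verify that objdump is recent enough (chkobjdump.awk, next file), then disassemble vmlinux, distill the listing with distill.awk, and feed each instruction to test_get_len, which compares the decoder's computed length against the byte count objdump reported.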
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644 (file)
index 0000000..0d13cd9
--- /dev/null
@@ -0,0 +1,23 @@
+# GNU objdump version checker
+#
+# Usage:
+# objdump -v | awk -f chkobjdump.awk
+BEGIN {
+       # objdump version 2.19 or later is OK for the test.
+       od_ver = 2;
+       od_sver = 19;
+}
+
+/^GNU/ {
+       split($4, ver, ".");
+       if (ver[1] > od_ver ||
+           (ver[1] == od_ver && ver[2] >= od_sver)) {
+               exit 1;
+       } else {
+               printf("Warning: objdump version %s is older than %d.%d\n",
+                      $4, od_ver, od_sver);
+               print("Warning: Skipping posttest.");
+               # Logic is inverted, because we just skip the test without an error.
+               exit 0;
+       }
+}
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644 (file)
index 0000000..c13c0ee
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/awk -f
+# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+# Distills the disassembly as follows:
+# - Removes all lines except the disassembled instructions.
+# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
+# into a single line.
+# - Removes bad (or prefix-only) instructions.
+
+BEGIN {
+       prev_addr = ""
+       prev_hex = ""
+       prev_mnemonic = ""
+       bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+       fwait_expr = "^9b "
+       fwait_str="9b\tfwait"
+}
+
+/^ *[0-9a-f]+ <[^>]*>:/ {
+       # Symbol entry
+       printf("%s%s\n", $2, $1)
+}
+
+/^ *[0-9a-f]+:/ {
+       if (split($0, field, "\t") < 3) {
+               # This is a continuation of the same insn.
+               prev_hex = prev_hex field[2]
+       } else {
+               # Skip bad instructions
+               if (match(prev_mnemonic, bad_expr))
+                       prev_addr = ""
+               # Split fwait from other f* instructions
+               if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
+                       printf "%s\t%s\n", prev_addr, fwait_str
+                       sub(fwait_expr, "", prev_hex)
+               }
+               if (prev_addr != "")
+                       printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+               prev_addr = field[1]
+               prev_hex = field[2]
+               prev_mnemonic = field[3]
+       }
+}
+
+END {
+       if (prev_addr != "")
+               printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+}
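[Editor's aside] An illustrative fragment of the distilled stream (addresses and bytes made up for the example): one tab-separated record per instruction, i.e. the address field, all hex bytes merged onto one line, and the mnemonic, which is exactly what test_get_len needs to verify insn_get_length():

	ffffffff81001000:	55			push   %rbp
	ffffffff81001001:	48 89 e5		mov    %rsp,%rbp
	ffffffff81001004:	0f 1f 44 00 00		nopl   0x0(%rax,%rax,1)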
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644 (file)
index 0000000..e34e92a
--- /dev/null
@@ -0,0 +1,380 @@
+#!/bin/awk -f
+# gen-insn-attr-x86.awk: Instruction attribute table generator
+# Written by Masami Hiramatsu <mhiramat@redhat.com>
+#
+# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
+
+# Awk implementation sanity check
+function check_awk_implement() {
+       if (!match("abc", "[[:lower:]]+"))
+               return "Your awk doesn't support character classes."
+       if (sprintf("%x", 0) != "0")
+               return "Your awk has a printf-format problem."
+       return ""
+}
+
+# Clear working vars
+function clear_vars() {
+       delete table
+       delete lptable2
+       delete lptable1
+       delete lptable3
+       eid = -1 # escape id
+       gid = -1 # group id
+       aid = -1 # AVX id
+       tname = ""
+}
+
+BEGIN {
+       # Implementation error checking
+       awkchecked = check_awk_implement()
+       if (awkchecked != "") {
+               print "Error: " awkchecked > "/dev/stderr"
+               print "Please try to use gawk." > "/dev/stderr"
+               exit 1
+       }
+
+       # Setup generating tables
+       print "/* x86 opcode map generated from x86-opcode-map.txt */"
+       print "/* Do not change this code. */\n"
+       ggid = 1
+       geid = 1
+       gaid = 0
+       delete etable
+       delete gtable
+       delete atable
+
+       opnd_expr = "^[[:alpha:]/]"
+       ext_expr = "^\\("
+       sep_expr = "^\\|$"
+       group_expr = "^Grp[[:alnum:]]+"
+
+       imm_expr = "^[IJAO][[:lower:]]"
+       imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+       imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+       imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
+       imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
+       imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
+       imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
+       imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+       imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+       imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
+       imm_flag["Ob"] = "INAT_MOFFSET"
+       imm_flag["Ov"] = "INAT_MOFFSET"
+
+       modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])"
+       force64_expr = "\\([df]64\\)"
+       rex_expr = "^REX(\\.[XRWB]+)*"
+       fpu_expr = "^ESC" # TODO
+
+       lprefix1_expr = "\\(66\\)"
+       lprefix2_expr = "\\(F3\\)"
+       lprefix3_expr = "\\(F2\\)"
+       max_lprefix = 4
+
+       vexok_expr = "\\(VEX\\)"
+       vexonly_expr = "\\(oVEX\\)"
+
+       prefix_expr = "\\(Prefix\\)"
+       prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
+       prefix_num["REPNE"] = "INAT_PFX_REPNE"
+       prefix_num["REP/REPE"] = "INAT_PFX_REPE"
+       prefix_num["LOCK"] = "INAT_PFX_LOCK"
+       prefix_num["SEG=CS"] = "INAT_PFX_CS"
+       prefix_num["SEG=DS"] = "INAT_PFX_DS"
+       prefix_num["SEG=ES"] = "INAT_PFX_ES"
+       prefix_num["SEG=FS"] = "INAT_PFX_FS"
+       prefix_num["SEG=GS"] = "INAT_PFX_GS"
+       prefix_num["SEG=SS"] = "INAT_PFX_SS"
+       prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
+       prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
+       prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
+
+       clear_vars()
+}
+
+function semantic_error(msg) {
+       print "Semantic error at " NR ": " msg > "/dev/stderr"
+       exit 1
+}
+
+function debug(msg) {
+       print "DEBUG: " msg
+}
+
+function array_size(arr,   i,c) {
+       c = 0
+       for (i in arr)
+               c++
+       return c
+}
+
+/^Table:/ {
+       print "/* " $0 " */"
+       if (tname != "")
+               semantic_error("Hit Table: before EndTable:.");
+}
+
+/^Referrer:/ {
+       if (NF != 1) {
+               # escape opcode table
+               ref = ""
+               for (i = 2; i <= NF; i++)
+                       ref = ref $i
+               eid = escape[ref]
+               tname = sprintf("inat_escape_table_%d", eid)
+       }
+}
+
+/^AVXcode:/ {
+       if (NF != 1) {
+               # AVX/escape opcode table
+               aid = $2
+               if (gaid <= aid)
+                       gaid = aid + 1
+               if (tname == "")        # AVX only opcode table
+                       tname = sprintf("inat_avx_table_%d", $2)
+       }
+       if (aid == -1 && eid == -1)     # primary opcode table
+               tname = "inat_primary_table"
+}
+
+/^GrpTable:/ {
+       print "/* " $0 " */"
+       if (!($2 in group))
+               semantic_error("No group: " $2 )
+       gid = group[$2]
+       tname = "inat_group_table_" gid
+}
+
+function print_table(tbl,name,fmt,n)
+{
+       print "const insn_attr_t " name " = {"
+       for (i = 0; i < n; i++) {
+               id = sprintf(fmt, i)
+               if (tbl[id])
+                       print " [" id "] = " tbl[id] ","
+       }
+       print "};"
+}
+
+/^EndTable/ {
+       if (gid != -1) {
+               # print group tables
+               if (array_size(table) != 0) {
+                       print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
+                                   "0x%x", 8)
+                       gtable[gid,0] = tname
+               }
+               if (array_size(lptable1) != 0) {
+                       print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
+                                   "0x%x", 8)
+                       gtable[gid,1] = tname "_1"
+               }
+               if (array_size(lptable2) != 0) {
+                       print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
+                                   "0x%x", 8)
+                       gtable[gid,2] = tname "_2"
+               }
+               if (array_size(lptable3) != 0) {
+                       print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
+                                   "0x%x", 8)
+                       gtable[gid,3] = tname "_3"
+               }
+       } else {
+               # print primary/escaped tables
+               if (array_size(table) != 0) {
+                       print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
+                                   "0x%02x", 256)
+                       etable[eid,0] = tname
+                       if (aid >= 0)
+                               atable[aid,0] = tname
+               }
+               if (array_size(lptable1) != 0) {
+                       print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
+                                   "0x%02x", 256)
+                       etable[eid,1] = tname "_1"
+                       if (aid >= 0)
+                               atable[aid,1] = tname "_1"
+               }
+               if (array_size(lptable2) != 0) {
+                       print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
+                                   "0x%02x", 256)
+                       etable[eid,2] = tname "_2"
+                       if (aid >= 0)
+                               atable[aid,2] = tname "_2"
+               }
+               if (array_size(lptable3) != 0) {
+                       print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
+                                   "0x%02x", 256)
+                       etable[eid,3] = tname "_3"
+                       if (aid >= 0)
+                               atable[aid,3] = tname "_3"
+               }
+       }
+       print ""
+       clear_vars()
+}
+
+function add_flags(old,new) {
+       if (old && new)
+               return old " | " new
+       else if (old)
+               return old
+       else
+               return new
+}
+
+# convert operands to flags.
+function convert_operands(opnd,       i,imm,mod)
+{
+       imm = null
+       mod = null
+       for (i in opnd) {
+               i  = opnd[i]
+               if (match(i, imm_expr) == 1) {
+                       if (!imm_flag[i])
+                               semantic_error("Unknown imm opnd: " i)
+                       if (imm) {
+                               if (i != "Ib")
+                                       semantic_error("Second IMM error")
+                               imm = add_flags(imm, "INAT_SCNDIMM")
+                       } else
+                               imm = imm_flag[i]
+               } else if (match(i, modrm_expr))
+                       mod = "INAT_MODRM"
+       }
+       return add_flags(imm, mod)
+}
+
+/^[0-9a-f]+\:/ {
+       if (NR == 1)
+               next
+       # get index
+       idx = "0x" substr($1, 1, index($1,":") - 1)
+       if (idx in table)
+               semantic_error("Redefine " idx " in " tname)
+
+       # check if escape