Merge branch 'perf/core' into perf/probes
authorIngo Molnar <mingo@elte.hu>
Tue, 17 Nov 2009 09:16:43 +0000 (10:16 +0100)
committerIngo Molnar <mingo@elte.hu>
Tue, 17 Nov 2009 09:17:47 +0000 (10:17 +0100)
Resolved merge conflict in tools/perf/Makefile

Merge reason: we want to queue up a dependent patch.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
45 files changed:
Documentation/trace/kprobetrace.txt [new file with mode: 0644]
arch/x86/Kconfig.debug
arch/x86/Makefile
arch/x86/include/asm/inat.h [new file with mode: 0644]
arch/x86/include/asm/inat_types.h [new file with mode: 0644]
arch/x86/include/asm/insn.h [new file with mode: 0644]
arch/x86/include/asm/ptrace.h
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/kprobes.c
arch/x86/kernel/ptrace.c
arch/x86/lib/.gitignore [new file with mode: 0644]
arch/x86/lib/Makefile
arch/x86/lib/inat.c [new file with mode: 0644]
arch/x86/lib/insn.c [new file with mode: 0644]
arch/x86/lib/x86-opcode-map.txt [new file with mode: 0644]
arch/x86/mm/fault.c
arch/x86/tools/Makefile [new file with mode: 0644]
arch/x86/tools/distill.awk [new file with mode: 0644]
arch/x86/tools/gen-insn-attr-x86.awk [new file with mode: 0644]
arch/x86/tools/test_get_len.c [new file with mode: 0644]
include/linux/ftrace_event.h
include/linux/kprobes.h
include/linux/syscalls.h
include/trace/ftrace.h
include/trace/syscall.h
kernel/kprobes.c
kernel/notifier.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/trace.h
kernel/trace/trace_event_profile.c
kernel/trace/trace_events.c
kernel/trace/trace_export.c
kernel/trace/trace_kprobe.c [new file with mode: 0644]
kernel/trace/trace_syscalls.c
tools/perf/Documentation/perf-probe.txt [new file with mode: 0644]
tools/perf/Makefile
tools/perf/builtin-probe.c [new file with mode: 0644]
tools/perf/builtin.h
tools/perf/command-list.txt
tools/perf/perf.c
tools/perf/util/probe-finder.c [new file with mode: 0644]
tools/perf/util/probe-finder.h [new file with mode: 0644]
tools/perf/util/util.h

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
new file mode 100644 (file)
index 0000000..47aabee
--- /dev/null
@@ -0,0 +1,149 @@
+                        Kprobe-based Event Tracing
+                        ==========================
+
+                 Documentation is written by Masami Hiramatsu
+
+
+Overview
+--------
+These events are similar to tracepoint based events. Instead of Tracepoint,
+this is based on kprobes (kprobe and kretprobe). So it can probe wherever
+kprobes can probe (this means all function bodies except for __kprobes
+functions). Unlike the Tracepoint based event, this can be added and removed
+dynamically, on the fly.
+
+To enable this feature, build your kernel with CONFIG_KPROBE_TRACING=y.
+
+Similar to the events tracer, this doesn't need to be activated via
+current_tracer. Instead of that, add probe points via
+/sys/kernel/debug/tracing/kprobe_events, and enable it via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/enable.
+
+
+Synopsis of kprobe_events
+-------------------------
+  p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS]    : Set a probe
+  r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS]               : Set a return probe
+
+ GRP           : Group name. If omitted, use "kprobes" for it.
+ EVENT         : Event name. If omitted, the event name is generated
+                 based on SYMBOL+offs or MEMADDR.
+ SYMBOL[+offs] : Symbol+offset where the probe is inserted.
+ MEMADDR       : Address where the probe is inserted.
+
+ FETCHARGS     : Arguments. Each probe can have up to 128 args.
+  %REG         : Fetch register REG
+  @ADDR                : Fetch memory at ADDR (ADDR should be in kernel)
+  @SYM[+|-offs]        : Fetch memory at SYM +|- offs (SYM should be a data symbol)
+  $stackN      : Fetch Nth entry of stack (N >= 0)
+  $stack       : Fetch stack address.
+  $argN                : Fetch function argument. (N >= 0)(*)
+  $retval      : Fetch return value.(**)
+  +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(***)
+  NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
+
+  (*) $argN may not be correct on asmlinkage functions and in the middle of
+      a function body.
+  (**) only for return probe.
+  (***) this is useful for fetching a field of data structures.
+
+
+Per-Probe Event Filtering
+-------------------------
+ The per-probe event filtering feature lets you set a different filter on each
+probe and choose which arguments are shown in the trace buffer. If an event
+name is specified right after 'p:' or 'r:' in kprobe_events, it adds an event
+under tracing/events/kprobes/<EVENT>. In that directory you can see 'id',
+'enable', 'format' and 'filter'.
+
+enable:
+  You can enable/disable the probe by writing 1 or 0 to it.
+
+format:
+  This shows the format of this probe event.
+
+filter:
+  You can write filtering rules of this event.
+
+id:
+  This shows the id of this probe event.
+
+
+Event Profiling
+---------------
+ You can check the total number of probe hits and probe miss-hits via
+/sys/kernel/debug/tracing/kprobe_profile.
+ The first column is event name, the second is the number of probe hits,
+the third is the number of probe miss-hits.
+
+
+Usage examples
+--------------
+To add a probe as a new event, write a new definition to kprobe_events
+as below.
+
+  echo p:myprobe do_sys_open dfd=$arg0 filename=$arg1 flags=$arg2 mode=$arg3 > /sys/kernel/debug/tracing/kprobe_events
+
+ This sets a kprobe at the top of the do_sys_open() function, recording its
+1st to 4th arguments as the "myprobe" event. As this example shows, users can
+choose more familiar names for each argument.
+
+  echo r:myretprobe do_sys_open $retval >> /sys/kernel/debug/tracing/kprobe_events
+
+ This sets a kretprobe on the return point of the do_sys_open() function,
+recording the return value as the "myretprobe" event.
+ You can see the format of these events via
+/sys/kernel/debug/tracing/events/kprobes/<EVENT>/format.
+
+  cat /sys/kernel/debug/tracing/events/kprobes/myprobe/format
+name: myprobe
+ID: 75
+format:
+       field:unsigned short common_type;       offset:0;       size:2;
+       field:unsigned char common_flags;       offset:2;       size:1;
+       field:unsigned char common_preempt_count;       offset:3;       size:1;
+       field:int common_pid;   offset:4;       size:4;
+       field:int common_tgid;  offset:8;       size:4;
+
+       field: unsigned long ip;        offset:16;tsize:8;
+       field: int nargs;       offset:24;tsize:4;
+       field: unsigned long dfd;       offset:32;tsize:8;
+       field: unsigned long filename;  offset:40;tsize:8;
+       field: unsigned long flags;     offset:48;tsize:8;
+       field: unsigned long mode;      offset:56;tsize:8;
+
+print fmt: "(%lx) dfd=%lx filename=%lx flags=%lx mode=%lx", REC->ip, REC->dfd, REC->filename, REC->flags, REC->mode
+
+
+ You can see that the event has 4 arguments as in the expressions you specified.
+
+  echo > /sys/kernel/debug/tracing/kprobe_events
+
+ This clears all probe points.
+
+ Right after definition, each event is disabled by default. To trace these
+events, you need to enable them.
+
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
+  echo 1 > /sys/kernel/debug/tracing/events/kprobes/myretprobe/enable
+
+ And you can see the traced information via /sys/kernel/debug/tracing/trace.
+
+  cat /sys/kernel/debug/tracing/trace
+# tracer: nop
+#
+#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+#              | |       |          |         |
+           <...>-1447  [001] 1038282.286875: myprobe: (do_sys_open+0x0/0xd6) dfd=3 filename=7fffd1ec4440 flags=8000 mode=0
+           <...>-1447  [001] 1038282.286878: myretprobe: (sys_openat+0xc/0xe <- do_sys_open) $retval=fffffffffffffffe
+           <...>-1447  [001] 1038282.286885: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=40413c flags=8000 mode=1b6
+           <...>-1447  [001] 1038282.286915: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
+           <...>-1447  [001] 1038282.286969: myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c filename=4041c6 flags=98800 mode=10
+           <...>-1447  [001] 1038282.286976: myretprobe: (sys_open+0x1b/0x1d <- do_sys_open) $retval=3
+
+
+ Each line shows when the kernel hits an event, and <- SYMBOL means the kernel
+returns from SYMBOL (e.g. "sys_open+0x1b/0x1d <- do_sys_open" means the kernel
+returns from do_sys_open to sys_open+0x1b).
+
+
index d105f29bb6bb7c9b75fe3369d66f68f2f3ada5ba..7d0b681a132bb0b241717695e33f61c33e21cb46 100644 (file)
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
 config HAVE_MMIOTRACE_SUPPORT
        def_bool y
 
+config X86_DECODER_SELFTEST
+     bool "x86 instruction decoder selftest"
+     depends on DEBUG_KERNEL
+       ---help---
+        Perform x86 instruction decoder selftests at build time.
+        This option is useful for checking the sanity of x86 instruction
+        decoder code.
+        If unsure, say "N".
+
 #
 # IO delay types:
 #
index d2d24c9ee64d926de80a249330e938095623ca85..78b32be55e9e7a82dafcdf96a5b59be4a26decd8 100644 (file)
@@ -155,6 +155,9 @@ all: bzImage
 KBUILD_IMAGE := $(boot)/bzImage
 
 bzImage: vmlinux
+ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
+       $(Q)$(MAKE) $(build)=arch/x86/tools posttest
+endif
        $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
        $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
        $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644 (file)
index 0000000..205b063
--- /dev/null
@@ -0,0 +1,220 @@
+#ifndef _ASM_X86_INAT_H
+#define _ASM_X86_INAT_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/inat_types.h>
+
+/*
+ * Internal bits. Don't use bitmasks directly, because these bits are
+ * unstable. You should use checking functions.
+ */
+
+#define INAT_OPCODE_TABLE_SIZE 256
+#define INAT_GROUP_TABLE_SIZE 8
+
+/* Legacy last prefixes */
+#define INAT_PFX_OPNDSZ        1       /* 0x66 */ /* LPFX1 */
+#define INAT_PFX_REPE  2       /* 0xF3 */ /* LPFX2 */
+#define INAT_PFX_REPNE 3       /* 0xF2 */ /* LPFX3 */
+/* Other Legacy prefixes */
+#define INAT_PFX_LOCK  4       /* 0xF0 */
+#define INAT_PFX_CS    5       /* 0x2E */
+#define INAT_PFX_DS    6       /* 0x3E */
+#define INAT_PFX_ES    7       /* 0x26 */
+#define INAT_PFX_FS    8       /* 0x64 */
+#define INAT_PFX_GS    9       /* 0x65 */
+#define INAT_PFX_SS    10      /* 0x36 */
+#define INAT_PFX_ADDRSZ        11      /* 0x67 */
+/* x86-64 REX prefix */
+#define INAT_PFX_REX   12      /* 0x4X */
+/* AVX VEX prefixes */
+#define INAT_PFX_VEX2  13      /* 2-bytes VEX prefix */
+#define INAT_PFX_VEX3  14      /* 3-bytes VEX prefix */
+
+#define INAT_LSTPFX_MAX        3
+#define INAT_LGCPFX_MAX        11
+
+/* Immediate size */
+#define INAT_IMM_BYTE          1
+#define INAT_IMM_WORD          2
+#define INAT_IMM_DWORD         3
+#define INAT_IMM_QWORD         4
+#define INAT_IMM_PTR           5
+#define INAT_IMM_VWORD32       6
+#define INAT_IMM_VWORD         7
+
+/* Legacy prefix */
+#define INAT_PFX_OFFS  0
+#define INAT_PFX_BITS  4
+#define INAT_PFX_MAX    ((1 << INAT_PFX_BITS) - 1)
+#define INAT_PFX_MASK  (INAT_PFX_MAX << INAT_PFX_OFFS)
+/* Escape opcodes */
+#define INAT_ESC_OFFS  (INAT_PFX_OFFS + INAT_PFX_BITS)
+#define INAT_ESC_BITS  2
+#define INAT_ESC_MAX   ((1 << INAT_ESC_BITS) - 1)
+#define INAT_ESC_MASK  (INAT_ESC_MAX << INAT_ESC_OFFS)
+/* Group opcodes (1-16) */
+#define INAT_GRP_OFFS  (INAT_ESC_OFFS + INAT_ESC_BITS)
+#define INAT_GRP_BITS  5
+#define INAT_GRP_MAX   ((1 << INAT_GRP_BITS) - 1)
+#define INAT_GRP_MASK  (INAT_GRP_MAX << INAT_GRP_OFFS)
+/* Immediates */
+#define INAT_IMM_OFFS  (INAT_GRP_OFFS + INAT_GRP_BITS)
+#define INAT_IMM_BITS  3
+#define INAT_IMM_MASK  (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
+/* Flags */
+#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
+#define INAT_MODRM     (1 << (INAT_FLAG_OFFS))
+#define INAT_FORCE64   (1 << (INAT_FLAG_OFFS + 1))
+#define INAT_SCNDIMM   (1 << (INAT_FLAG_OFFS + 2))
+#define INAT_MOFFSET   (1 << (INAT_FLAG_OFFS + 3))
+#define INAT_VARIANT   (1 << (INAT_FLAG_OFFS + 4))
+#define INAT_VEXOK     (1 << (INAT_FLAG_OFFS + 5))
+#define INAT_VEXONLY   (1 << (INAT_FLAG_OFFS + 6))
+/* Attribute making macros for attribute tables */
+#define INAT_MAKE_PREFIX(pfx)  (pfx << INAT_PFX_OFFS)
+#define INAT_MAKE_ESCAPE(esc)  (esc << INAT_ESC_OFFS)
+#define INAT_MAKE_GROUP(grp)   ((grp << INAT_GRP_OFFS) | INAT_MODRM)
+#define INAT_MAKE_IMM(imm)     (imm << INAT_IMM_OFFS)
+
+/* Attribute search APIs */
+extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
+extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
+                                            insn_byte_t last_pfx,
+                                            insn_attr_t esc_attr);
+extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
+                                           insn_byte_t last_pfx,
+                                           insn_attr_t esc_attr);
+extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
+                                         insn_byte_t vex_m,
+                                         insn_byte_t vex_pp);
+
+/* Attribute checking functions */
+static inline int inat_is_legacy_prefix(insn_attr_t attr)
+{
+       attr &= INAT_PFX_MASK;
+       return attr && attr <= INAT_LGCPFX_MAX;
+}
+
+static inline int inat_is_address_size_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
+}
+
+static inline int inat_is_operand_size_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
+}
+
+static inline int inat_is_rex_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
+}
+
+static inline int inat_last_prefix_id(insn_attr_t attr)
+{
+       if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
+               return 0;
+       else
+               return attr & INAT_PFX_MASK;
+}
+
+static inline int inat_is_vex_prefix(insn_attr_t attr)
+{
+       attr &= INAT_PFX_MASK;
+       return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_vex3_prefix(insn_attr_t attr)
+{
+       return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
+}
+
+static inline int inat_is_escape(insn_attr_t attr)
+{
+       return attr & INAT_ESC_MASK;
+}
+
+static inline int inat_escape_id(insn_attr_t attr)
+{
+       return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
+}
+
+static inline int inat_is_group(insn_attr_t attr)
+{
+       return attr & INAT_GRP_MASK;
+}
+
+static inline int inat_group_id(insn_attr_t attr)
+{
+       return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
+}
+
+static inline int inat_group_common_attribute(insn_attr_t attr)
+{
+       return attr & ~INAT_GRP_MASK;
+}
+
+static inline int inat_has_immediate(insn_attr_t attr)
+{
+       return attr & INAT_IMM_MASK;
+}
+
+static inline int inat_immediate_size(insn_attr_t attr)
+{
+       return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
+}
+
+static inline int inat_has_modrm(insn_attr_t attr)
+{
+       return attr & INAT_MODRM;
+}
+
+static inline int inat_is_force64(insn_attr_t attr)
+{
+       return attr & INAT_FORCE64;
+}
+
+static inline int inat_has_second_immediate(insn_attr_t attr)
+{
+       return attr & INAT_SCNDIMM;
+}
+
+static inline int inat_has_moffset(insn_attr_t attr)
+{
+       return attr & INAT_MOFFSET;
+}
+
+static inline int inat_has_variant(insn_attr_t attr)
+{
+       return attr & INAT_VARIANT;
+}
+
+static inline int inat_accept_vex(insn_attr_t attr)
+{
+       return attr & INAT_VEXOK;
+}
+
+static inline int inat_must_vex(insn_attr_t attr)
+{
+       return attr & INAT_VEXONLY;
+}
+#endif
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644 (file)
index 0000000..cb3c20c
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _ASM_X86_INAT_TYPES_H
+#define _ASM_X86_INAT_TYPES_H
+/*
+ * x86 instruction attributes
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* Instruction attributes */
+typedef unsigned int insn_attr_t;
+typedef unsigned char insn_byte_t;
+typedef signed int insn_value_t;
+
+#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644 (file)
index 0000000..96c2e0a
--- /dev/null
@@ -0,0 +1,184 @@
+#ifndef _ASM_X86_INSN_H
+#define _ASM_X86_INSN_H
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+/* insn_attr_t is defined in inat_types.h, which inat.h includes */
+#include <asm/inat.h>
+
+struct insn_field {
+       union {
+               insn_value_t value;
+               insn_byte_t bytes[4];
+       };
+       /* !0 if we've run insn_get_xxx() for this field */
+       unsigned char got;
+       unsigned char nbytes;
+};
+
+struct insn {
+       struct insn_field prefixes;     /*
+                                        * Prefixes
+                                        * prefixes.bytes[3]: last prefix
+                                        */
+       struct insn_field rex_prefix;   /* REX prefix */
+       struct insn_field vex_prefix;   /* VEX prefix */
+       struct insn_field opcode;       /*
+                                        * opcode.bytes[0]: opcode1
+                                        * opcode.bytes[1]: opcode2
+                                        * opcode.bytes[2]: opcode3
+                                        */
+       struct insn_field modrm;
+       struct insn_field sib;
+       struct insn_field displacement;
+       union {
+               struct insn_field immediate;
+               struct insn_field moffset1;     /* for 64bit MOV */
+               struct insn_field immediate1;   /* for 64bit imm or off16/32 */
+       };
+       union {
+               struct insn_field moffset2;     /* for 64bit MOV */
+               struct insn_field immediate2;   /* for 64bit imm or seg16 */
+       };
+
+       insn_attr_t attr;
+       unsigned char opnd_bytes;
+       unsigned char addr_bytes;
+       unsigned char length;
+       unsigned char x86_64;
+
+       const insn_byte_t *kaddr;       /* kernel address of insn to analyze */
+       const insn_byte_t *next_byte;
+};
+
+#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
+#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
+#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
+
+#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
+#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
+#define X86_SIB_BASE(sib) ((sib) & 0x07)
+
+#define X86_REX_W(rex) ((rex) & 8)
+#define X86_REX_R(rex) ((rex) & 4)
+#define X86_REX_X(rex) ((rex) & 2)
+#define X86_REX_B(rex) ((rex) & 1)
+
+/* VEX bit flags  */
+#define X86_VEX_W(vex) ((vex) & 0x80)  /* VEX3 Byte2 */
+#define X86_VEX_R(vex) ((vex) & 0x80)  /* VEX2/3 Byte1 */
+#define X86_VEX_X(vex) ((vex) & 0x40)  /* VEX3 Byte1 */
+#define X86_VEX_B(vex) ((vex) & 0x20)  /* VEX3 Byte1 */
+#define X86_VEX_L(vex) ((vex) & 0x04)  /* VEX3 Byte2, VEX2 Byte1 */
+/* VEX bit fields */
+#define X86_VEX3_M(vex)        ((vex) & 0x1f)          /* VEX3 Byte1 */
+#define X86_VEX2_M     1                       /* VEX2.M always 1 */
+#define X86_VEX_V(vex) (((vex) & 0x78) >> 3)   /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_P(vex) ((vex) & 0x03)          /* VEX3 Byte2, VEX2 Byte1 */
+#define X86_VEX_M_MAX  0x1f                    /* VEX3.M Maximum value */
+
+/* The last prefix is needed for two-byte and three-byte opcodes */
+static inline insn_byte_t insn_last_prefix(struct insn *insn)
+{
+       return insn->prefixes.bytes[3];
+}
+
+extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
+extern void insn_get_prefixes(struct insn *insn);
+extern void insn_get_opcode(struct insn *insn);
+extern void insn_get_modrm(struct insn *insn);
+extern void insn_get_sib(struct insn *insn);
+extern void insn_get_displacement(struct insn *insn);
+extern void insn_get_immediate(struct insn *insn);
+extern void insn_get_length(struct insn *insn);
+
+/* Attribute will be determined after getting ModRM (for opcode groups) */
+static inline void insn_get_attribute(struct insn *insn)
+{
+       insn_get_modrm(insn);
+}
+
+/* Instruction uses RIP-relative addressing */
+extern int insn_rip_relative(struct insn *insn);
+
+/* Init insn for kernel text */
+static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
+{
+#ifdef CONFIG_X86_64
+       insn_init(insn, kaddr, 1);
+#else /* CONFIG_X86_32 */
+       insn_init(insn, kaddr, 0);
+#endif
+}
+
+static inline int insn_is_avx(struct insn *insn)
+{
+       if (!insn->prefixes.got)
+               insn_get_prefixes(insn);
+       return (insn->vex_prefix.value != 0);
+}
+
+static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
+{
+       if (insn->vex_prefix.nbytes == 2)       /* 2 bytes VEX */
+               return X86_VEX2_M;
+       else
+               return X86_VEX3_M(insn->vex_prefix.bytes[1]);
+}
+
+static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
+{
+       if (insn->vex_prefix.nbytes == 2)       /* 2 bytes VEX */
+               return X86_VEX_P(insn->vex_prefix.bytes[1]);
+       else
+               return X86_VEX_P(insn->vex_prefix.bytes[2]);
+}
+
+/* Offset of each field from kaddr */
+static inline int insn_offset_rex_prefix(struct insn *insn)
+{
+       return insn->prefixes.nbytes;
+}
+static inline int insn_offset_vex_prefix(struct insn *insn)
+{
+       return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
+}
+static inline int insn_offset_opcode(struct insn *insn)
+{
+       return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
+}
+static inline int insn_offset_modrm(struct insn *insn)
+{
+       return insn_offset_opcode(insn) + insn->opcode.nbytes;
+}
+static inline int insn_offset_sib(struct insn *insn)
+{
+       return insn_offset_modrm(insn) + insn->modrm.nbytes;
+}
+static inline int insn_offset_displacement(struct insn *insn)
+{
+       return insn_offset_sib(insn) + insn->sib.nbytes;
+}
+static inline int insn_offset_immediate(struct insn *insn)
+{
+       return insn_offset_displacement(insn) + insn->displacement.nbytes;
+}
+
+#endif /* _ASM_X86_INSN_H */
index 0f0d908349aa3f375e87f68802cbcb2e7753f725..a3d49dd7d26e1765d6345bf03a861bcdd91238a5 100644 (file)
@@ -7,6 +7,7 @@
 
 #ifdef __KERNEL__
 #include <asm/segment.h>
+#include <asm/page_types.h>
 #endif
 
 #ifndef __ASSEMBLY__
@@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
        return regs->sp;
 }
 
+/* Query offset/name of register from its name/offset */
+extern int regs_query_register_offset(const char *name);
+extern const char *regs_query_register_name(unsigned int offset);
+#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
+
+/**
+ * regs_get_register() - get register value from its offset
+ * @regs:      pt_regs from which register value is gotten.
+ * @offset:    offset number of the register.
+ *
+ * regs_get_register returns the value of a register whose offset from @regs
+ * is @offset. The @offset is the offset of the register in struct pt_regs.
+ * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
+ */
+static inline unsigned long regs_get_register(struct pt_regs *regs,
+                                             unsigned int offset)
+{
+       if (unlikely(offset > MAX_REG_OFFSET))
+               return 0;
+       return *(unsigned long *)((unsigned long)regs + offset);
+}
+
+/**
+ * regs_within_kernel_stack() - check the address in the stack
+ * @regs:      pt_regs which contains kernel stack pointer.
+ * @addr:      address which is checked.
+ *
+ * regs_within_kernel_stack() checks whether @addr is within the kernel stack
+ * page(s). It returns true if @addr is within the kernel stack, false if not.
+ */
+static inline int regs_within_kernel_stack(struct pt_regs *regs,
+                                          unsigned long addr)
+{
+       return ((addr & ~(THREAD_SIZE - 1))  ==
+               (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
+}
+
+/**
+ * regs_get_kernel_stack_nth() - get Nth entry of the stack
+ * @regs:      pt_regs which contains kernel stack pointer.
+ * @n:         stack entry number.
+ *
+ * regs_get_kernel_stack_nth() returns the @n th entry of the kernel stack
+ * which is specified by @regs. If the @n th entry is NOT in the kernel stack,
+ * this returns 0.
+ */
+static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
+                                                     unsigned int n)
+{
+       unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
+       addr += n;
+       if (regs_within_kernel_stack(regs, (unsigned long)addr))
+               return *addr;
+       else
+               return 0;
+}
+
+/* Get Nth argument at function call */
+extern unsigned long regs_get_argument_nth(struct pt_regs *regs,
+                                          unsigned int n);
+
 /*
  * These are defined as per linux/ptrace.h, which see.
  */
index 7d52e9da5e0cc0d1942b6cd9624f83f504491e77..50b9c220e1213ceacf48e10f4f302a98b65ced4f 100644 (file)
@@ -333,6 +333,10 @@ ENTRY(ret_from_fork)
        CFI_ENDPROC
 END(ret_from_fork)
 
+/*
+ * Interrupt exit functions should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
 /*
  * Return to user mode is not as complex as all this looks,
  * but we want the default path for a system call return to
@@ -383,6 +387,10 @@ need_resched:
 END(resume_kernel)
 #endif
        CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /* SYSENTER_RETURN points to after the "sysenter" instruction in
    the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
@@ -513,6 +521,10 @@ sysexit_audit:
        PTGS_TO_GS_EX
 ENDPROC(ia32_sysenter_target)
 
+/*
+ * syscall stub including irq exit should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
        # system call handler stub
 ENTRY(system_call)
        RING0_INT_FRAME                 # can't unwind into user space anyway
@@ -705,6 +717,10 @@ syscall_badsys:
        jmp resume_userspace
 END(syscall_badsys)
        CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /*
  * System calls that need a pt_regs pointer.
@@ -814,6 +830,10 @@ common_interrupt:
 ENDPROC(common_interrupt)
        CFI_ENDPROC
 
+/*
+ *  Irq entries should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
 #define BUILD_INTERRUPT3(name, nr, fn) \
 ENTRY(name)                            \
        RING0_INT_FRAME;                \
@@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug)
        jmp error_code
        CFI_ENDPROC
 END(spurious_interrupt_bug)
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 ENTRY(kernel_thread_helper)
        pushl $0                # fake return address for unwinder
index bd5bbddddf91eab956106bff572e03a0ba86b9f9..722df1b1152d57721568ef583e4f2c002922979b 100644 (file)
@@ -803,6 +803,10 @@ END(interrupt)
        call \func
        .endm
 
+/*
+ * Interrupt entry/exit should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
        /*
         * The interrupt stubs push (~vector+0x80) onto the stack and
         * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
 
        CFI_ENDPROC
 END(common_interrupt)
+/*
+ * End of kprobes section
+ */
+       .popsection
 
 /*
  * APIC interrupts.
index 7b5169d2b00026272ed26874913c42ff315befb3..c5f1f117e0c0577a527e788d4c80220bd49dd84d 100644 (file)
 #include <linux/preempt.h>
 #include <linux/module.h>
 #include <linux/kdebug.h>
+#include <linux/kallsyms.h>
 
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
 #include <asm/alternative.h>
+#include <asm/insn.h>
 
 void jprobe_return_end(void);
 
@@ -106,50 +108,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
        /*      -----------------------------------------------         */
        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
 };
-static const u32 onebyte_has_modrm[256 / 32] = {
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-       /*      -----------------------------------------------         */
-       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
-       W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
-       W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
-       W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
-       W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
-       W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
-       W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
-       W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
-       W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
-       W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
-       W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
-       W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
-       W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
-       W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
-       W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
-       W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1)   /* f0 */
-       /*      -----------------------------------------------         */
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-};
-static const u32 twobyte_has_modrm[256 / 32] = {
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-       /*      -----------------------------------------------         */
-       W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
-       W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
-       W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
-       W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
-       W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
-       W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
-       W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
-       W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
-       W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
-       W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
-       W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
-       W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
-       W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
-       W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
-       W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
-       W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* ff */
-       /*      -----------------------------------------------         */
-       /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
-};
 #undef W
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +202,75 @@ retry:
        }
 }
 
+/* Recover the original (pre-int3) instruction probed at addr into buf. */
+static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
+{
+       struct kprobe *kp;
+       kp = get_kprobe((void *)addr);
+       if (!kp)
+               return -EINVAL;
+
+       /*
+        *  Normally kp->ainsn.insn holds a copy of the original
+        *  instruction. However, a RIP-relative instruction cannot be
+        *  single-stepped at a different address as-is, so fix_riprel()
+        *  adjusts the displacement in that copy. In that case the
+        *  original instruction cannot be recovered from kp->ainsn.insn.
+        *
+        *  On the other hand, kp->opcode keeps a copy of the first byte
+        *  of the probed instruction, which is the byte overwritten by
+        *  int3. Since kprobes modifies nothing at kp->addr except that
+        *  first byte, the original instruction can be rebuilt from the
+        *  bytes at kp->addr plus kp->opcode.
+        */
+       memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+       buf[0] = kp->opcode;
+       return 0;
+}
+
+/* Dummy buffers for kallsyms_lookup */
+static char __dummy_buf[KSYM_NAME_LEN];
+
+/* Check if paddr is at an instruction boundary (returns 1 if so, else 0) */
+static int __kprobes can_probe(unsigned long paddr)
+{
+       int ret;
+       unsigned long addr, offset = 0;
+       struct insn insn;
+       kprobe_opcode_t buf[MAX_INSN_SIZE];
+
+       if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
+               return 0;
+
+       /* Decode instructions one by one, starting at paddr - offset */
+       addr = paddr - offset;
+       while (addr < paddr) {
+               kernel_insn_init(&insn, (void *)addr);
+               insn_get_opcode(&insn);
+
+               /*
+                * Check if the instruction has been modified by another
+                * kprobe, in which case we decode the original instruction
+                * recovered into our buffer instead of the breakpoint.
+                */
+               if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
+                       ret = recover_probed_instruction(buf, addr);
+                       if (ret)
+                               /*
+                                * Another debugging subsystem might have
+                                * inserted this breakpoint. In that case,
+                                * we can't recover the instruction.
+                                */
+                               return 0;
+                       kernel_insn_init(&insn, buf);
+               }
+               insn_get_length(&insn);
+               addr += insn.length;
+       }
+
+       return (addr == paddr);
+}
+
 /*
  * Returns non-zero if opcode modifies the interrupt flag.
  */
@@ -277,68 +304,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
 static void __kprobes fix_riprel(struct kprobe *p)
 {
 #ifdef CONFIG_X86_64
-       u8 *insn = p->ainsn.insn;
-       s64 disp;
-       int need_modrm;
-
-       /* Skip legacy instruction prefixes.  */
-       while (1) {
-               switch (*insn) {
-               case 0x66:
-               case 0x67:
-               case 0x2e:
-               case 0x3e:
-               case 0x26:
-               case 0x64:
-               case 0x65:
-               case 0x36:
-               case 0xf0:
-               case 0xf3:
-               case 0xf2:
-                       ++insn;
-                       continue;
-               }
-               break;
-       }
+       struct insn insn;
+       kernel_insn_init(&insn, p->ainsn.insn);
 
-       /* Skip REX instruction prefix.  */
-       if (is_REX_prefix(insn))
-               ++insn;
-
-       if (*insn == 0x0f) {
-               /* Two-byte opcode.  */
-               ++insn;
-               need_modrm = test_bit(*insn,
-                                     (unsigned long *)twobyte_has_modrm);
-       } else
-               /* One-byte opcode.  */
-               need_modrm = test_bit(*insn,
-                                     (unsigned long *)onebyte_has_modrm);
-
-       if (need_modrm) {
-               u8 modrm = *++insn;
-               if ((modrm & 0xc7) == 0x05) {
-                       /* %rip+disp32 addressing mode */
-                       /* Displacement follows ModRM byte.  */
-                       ++insn;
-                       /*
-                        * The copied instruction uses the %rip-relative
-                        * addressing mode.  Adjust the displacement for the
-                        * difference between the original location of this
-                        * instruction and the location of the copy that will
-                        * actually be run.  The tricky bit here is making sure
-                        * that the sign extension happens correctly in this
-                        * calculation, since we need a signed 32-bit result to
-                        * be sign-extended to 64 bits when it's added to the
-                        * %rip value and yield the same 64-bit result that the
-                        * sign-extension of the original signed 32-bit
-                        * displacement would have given.
-                        */
-                       disp = (u8 *) p->addr + *((s32 *) insn) -
-                              (u8 *) p->ainsn.insn;
-                       BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
-                       *(s32 *)insn = (s32) disp;
-               }
+       if (insn_rip_relative(&insn)) {
+               s64 newdisp;
+               u8 *disp;
+               insn_get_displacement(&insn);
+               /*
+                * The copied instruction uses the %rip-relative addressing
+                * mode.  Adjust the displacement for the difference between
+                * the original location of this instruction and the location
+                * of the copy that will actually be run.  The tricky bit here
+                * is making sure that the sign extension happens correctly in
+                * this calculation, since we need a signed 32-bit result to
+                * be sign-extended to 64 bits when it's added to the %rip
+                * value and yield the same 64-bit result that the sign-
+                * extension of the original signed 32-bit displacement would
+                * have given.
+                */
+               newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
+                         (u8 *) p->ainsn.insn;
+               BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check.  */
+               disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
+               *(s32 *) disp = (s32) newdisp;
        }
 #endif
 }
@@ -359,6 +348,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
 
 int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
+       if (!can_probe((unsigned long)p->addr))
+               return -EILSEQ;
        /* insn: must be on special executable page on x86. */
        p->ainsn.insn = get_insn_slot();
        if (!p->ainsn.insn)
@@ -472,17 +463,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 {
        switch (kcb->kprobe_status) {
        case KPROBE_HIT_SSDONE:
-#ifdef CONFIG_X86_64
-               /* TODO: Provide re-entrancy from post_kprobes_handler() and
-                * avoid exception stack corruption while single-stepping on
-                * the instruction of the new probe.
-                */
-               arch_disarm_kprobe(p);
-               regs->ip = (unsigned long)p->addr;
-               reset_current_kprobe();
-               preempt_enable_no_resched();
-               break;
-#endif
        case KPROBE_HIT_ACTIVE:
                save_previous_kprobe(kcb);
                set_current_kprobe(p, regs, kcb);
@@ -491,18 +471,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
                kcb->kprobe_status = KPROBE_REENTER;
                break;
        case KPROBE_HIT_SS:
-               if (p == kprobe_running()) {
-                       regs->flags &= ~X86_EFLAGS_TF;
-                       regs->flags |= kcb->kprobe_saved_flags;
-                       return 0;
-               } else {
-                       /* A probe has been hit in the codepath leading up
-                        * to, or just after, single-stepping of a probed
-                        * instruction. This entire codepath should strictly
-                        * reside in .kprobes.text section. Raise a warning
-                        * to highlight this peculiar case.
-                        */
-               }
+               /* A probe has been hit in the codepath leading up to, or just
+                * after, single-stepping of a probed instruction. This entire
+                * codepath should strictly reside in .kprobes.text section.
+                * Raise a BUG or we'll continue in an endless reentering loop
+                * and eventually a stack overflow.
+                */
+               printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
+                      p->addr);
+               dump_kprobe(p);
+               BUG();
        default:
                /* impossible cases */
                WARN_ON(1);
index 7b058a2dc66afecdaeb58877102957dff77e7d81..c4f76d275ee4cda38bed14268a4ac3b6f04d7436 100644 (file)
@@ -49,6 +49,118 @@ enum x86_regset {
        REGSET_IOPERM32,
 };
 
+struct pt_regs_offset {
+       const char *name;
+       int offset;
+};
+
+#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
+#define REG_OFFSET_END {.name = NULL, .offset = 0}
+
+static const struct pt_regs_offset regoffset_table[] = {
+#ifdef CONFIG_X86_64
+       REG_OFFSET_NAME(r15),
+       REG_OFFSET_NAME(r14),
+       REG_OFFSET_NAME(r13),
+       REG_OFFSET_NAME(r12),
+       REG_OFFSET_NAME(r11),
+       REG_OFFSET_NAME(r10),
+       REG_OFFSET_NAME(r9),
+       REG_OFFSET_NAME(r8),
+#endif
+       REG_OFFSET_NAME(bx),
+       REG_OFFSET_NAME(cx),
+       REG_OFFSET_NAME(dx),
+       REG_OFFSET_NAME(si),
+       REG_OFFSET_NAME(di),
+       REG_OFFSET_NAME(bp),
+       REG_OFFSET_NAME(ax),
+#ifdef CONFIG_X86_32
+       REG_OFFSET_NAME(ds),
+       REG_OFFSET_NAME(es),
+       REG_OFFSET_NAME(fs),
+       REG_OFFSET_NAME(gs),
+#endif
+       REG_OFFSET_NAME(orig_ax),
+       REG_OFFSET_NAME(ip),
+       REG_OFFSET_NAME(cs),
+       REG_OFFSET_NAME(flags),
+       REG_OFFSET_NAME(sp),
+       REG_OFFSET_NAME(ss),
+       REG_OFFSET_END,
+};
+
+/**
+ * regs_query_register_offset() - query register offset from its name
+ * @name:      the name of a register
+ *
+ * regs_query_register_offset() looks up @name in regoffset_table and returns
+ * its offset in struct pt_regs. If @name is not found, this returns -EINVAL.
+ */
+int regs_query_register_offset(const char *name)
+{
+       const struct pt_regs_offset *roff;
+       for (roff = regoffset_table; roff->name != NULL; roff++)
+               if (!strcmp(roff->name, name))
+                       return roff->offset;
+       return -EINVAL;
+}
+
+/**
+ * regs_query_register_name() - query register name from its offset
+ * @offset:    the offset of a register in struct pt_regs.
+ *
+ * regs_query_register_name() returns the name of the regoffset_table entry
+ * whose offset equals @offset. If there is no such entry, this returns NULL.
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+       const struct pt_regs_offset *roff;
+       for (roff = regoffset_table; roff->name != NULL; roff++)
+               if (roff->offset == offset)
+                       return roff->name;
+       return NULL;
+}
+
+static const int arg_offs_table[] = {
+#ifdef CONFIG_X86_32
+       [0] = offsetof(struct pt_regs, ax),
+       [1] = offsetof(struct pt_regs, dx),
+       [2] = offsetof(struct pt_regs, cx)
+#else /* CONFIG_X86_64 */
+       [0] = offsetof(struct pt_regs, di),
+       [1] = offsetof(struct pt_regs, si),
+       [2] = offsetof(struct pt_regs, dx),
+       [3] = offsetof(struct pt_regs, cx),
+       [4] = offsetof(struct pt_regs, r8),
+       [5] = offsetof(struct pt_regs, r9)
+#endif
+};
+
+/**
+ * regs_get_argument_nth() - get Nth argument at function call
+ * @regs:      pt_regs which contains registers at function entry.
+ * @n:         argument number.
+ *
+ * regs_get_argument_nth() returns the @n-th (0-based) argument of a function
+ * call. Since the kernel stack usually changes right after function entry,
+ * this must be used at function entry. If the @n-th entry is NOT in the
+ * kernel stack or pt_regs, this returns 0.
+ */
+unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
+{
+       if (n < ARRAY_SIZE(arg_offs_table))
+               return *(unsigned long *)((char *)regs + arg_offs_table[n]);
+       else {
+               /*
+                * Not covered by arg_offs_table: arg n is on the stack.
+                * (Note: stack[0] = return address, so skip it)
+                */
+               n -= ARRAY_SIZE(arg_offs_table);
+               return regs_get_kernel_stack_nth(regs, 1 + n);
+       }
+}
+
 /*
  * does not yet catch signals sent when the child dies.
  * in exit.c or in signal.c.
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644 (file)
index 0000000..8df89f0
--- /dev/null
@@ -0,0 +1 @@
+inat-tables.c
index 85f5db95c60f03718f292080587f7faa337ef570..a2d6472895fb309884d442b100502447fcb4c5e9 100644 (file)
@@ -2,12 +2,25 @@
 # Makefile for x86 specific library files.
 #
 
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN     $@
+      cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
+
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+       $(call cmd,inat_tables)
+
+$(obj)/inat.o: $(obj)/inat-tables.c
+
+clean-files := inat-tables.c
+
 obj-$(CONFIG_SMP) := msr.o
 
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
+lib-y += insn.o inat.o
 
 obj-y += msr-reg.o msr-reg-export.o
 
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644 (file)
index 0000000..46fc4ee
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * x86 instruction attribute tables
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#include <asm/insn.h>
+
+/* Attribute tables are generated from opcode map */
+#include "inat-tables.c"
+
+/* Attribute search APIs */
+insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
+{
+       return inat_primary_table[opcode];
+}
+
+insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
+                                     insn_attr_t esc_attr)
+{
+       const insn_attr_t *table;
+       insn_attr_t lpfx_attr;
+       int n, m = 0;
+
+       n = inat_escape_id(esc_attr);
+       if (last_pfx) {
+               lpfx_attr = inat_get_opcode_attribute(last_pfx);
+               m = inat_last_prefix_id(lpfx_attr);
+       }
+       table = inat_escape_tables[n][0];
+       if (!table)
+               return 0;
+       if (inat_has_variant(table[opcode]) && m) {
+               table = inat_escape_tables[n][m];
+               if (!table)
+                       return 0;
+       }
+       return table[opcode];
+}
+
+insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
+                                    insn_attr_t grp_attr)
+{
+       const insn_attr_t *table;
+       insn_attr_t lpfx_attr;
+       int n, m = 0;
+
+       n = inat_group_id(grp_attr);
+       if (last_pfx) {
+               lpfx_attr = inat_get_opcode_attribute(last_pfx);
+               m = inat_last_prefix_id(lpfx_attr);
+       }
+       table = inat_group_tables[n][0];
+       if (!table)
+               return inat_group_common_attribute(grp_attr);
+       if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
+               table = inat_group_tables[n][m];
+               if (!table)
+                       return inat_group_common_attribute(grp_attr);
+       }
+       return table[X86_MODRM_REG(modrm)] |
+              inat_group_common_attribute(grp_attr);
+}
+
+insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
+                                  insn_byte_t vex_p)
+{
+       const insn_attr_t *table;
+       if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
+               return 0;
+       table = inat_avx_tables[vex_m][vex_p];
+       if (!table)
+               return 0;
+       return table[opcode];
+}
+
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644 (file)
index 0000000..9f33b98
--- /dev/null
@@ -0,0 +1,516 @@
+/*
+ * x86 instruction analysis
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004, 2009
+ */
+
+#include <linux/string.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+
+#define get_next(t, insn)      \
+       ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+
+#define peek_next(t, insn)     \
+       ({t r; r = *(t*)insn->next_byte; r; })
+
+#define peek_nbyte_next(t, insn, n)    \
+       ({t r; r = *(t*)((insn)->next_byte + n); r; })
+
+/**
+ * insn_init() - initialize struct insn
+ * @insn:      &struct insn to be initialized
+ * @kaddr:     address (in kernel memory) of instruction (or copy thereof)
+ * @x86_64:    !0 for 64-bit kernel or 64-bit app
+ */
+void insn_init(struct insn *insn, const void *kaddr, int x86_64)
+{
+       memset(insn, 0, sizeof(*insn));
+       insn->kaddr = kaddr;
+       insn->next_byte = kaddr;
+       insn->x86_64 = x86_64 ? 1 : 0;
+       insn->opnd_bytes = 4;
+       if (x86_64)
+               insn->addr_bytes = 8;
+       else
+               insn->addr_bytes = 4;
+}
+
+/**
+ * insn_get_prefixes - scan x86 instruction prefix bytes
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates @insn->prefixes, @insn->rex_prefix and @insn->vex_prefix, and
+ * updates @insn->next_byte to point to the (first) opcode.  No effect if
+ * @insn->prefixes.got is already set.
+ */
+void insn_get_prefixes(struct insn *insn)
+{
+       struct insn_field *prefixes = &insn->prefixes;
+       insn_attr_t attr;
+       insn_byte_t b, lb;
+       int i, nb;
+
+       if (prefixes->got)
+               return;
+
+       nb = 0;
+       lb = 0;
+       b = peek_next(insn_byte_t, insn);
+       attr = inat_get_opcode_attribute(b);
+       while (inat_is_legacy_prefix(attr)) {
+               /* Do not store a prefix byte that was already recorded */
+               for (i = 0; i < nb; i++)
+                       if (prefixes->bytes[i] == b)
+                               goto found;
+               if (nb == 4)
+                       /* Invalid instruction */
+                       break;
+               prefixes->bytes[nb++] = b;
+               if (inat_is_address_size_prefix(attr)) {
+                       /* address size toggles 4<->8 (x86_64) or 2<->4 */
+                       if (insn->x86_64)
+                               insn->addr_bytes ^= 12;
+                       else
+                               insn->addr_bytes ^= 6;
+               } else if (inat_is_operand_size_prefix(attr)) {
+                       /* operand size toggles 2<->4 */
+                       insn->opnd_bytes ^= 6;
+               }
+found:
+               prefixes->nbytes++;
+               insn->next_byte++;
+               lb = b;
+               b = peek_next(insn_byte_t, insn);
+               attr = inat_get_opcode_attribute(b);
+       }
+       /* Keep the last prefix seen in prefixes.bytes[3] */
+       if (lb && lb != insn->prefixes.bytes[3]) {
+               if (unlikely(insn->prefixes.bytes[3])) {
+                       /* Swap the last prefix */
+                       b = insn->prefixes.bytes[3];
+                       for (i = 0; i < nb; i++)
+                               if (prefixes->bytes[i] == lb)
+                                       prefixes->bytes[i] = b;
+               }
+               insn->prefixes.bytes[3] = lb;
+       }
+
+       /* Decode REX prefix */
+       if (insn->x86_64) {
+               b = peek_next(insn_byte_t, insn);
+               attr = inat_get_opcode_attribute(b);
+               if (inat_is_rex_prefix(attr)) {
+                       insn->rex_prefix.value = b;
+                       insn->rex_prefix.nbytes = 1;
+                       insn->next_byte++;
+                       if (X86_REX_W(b))
+                               /* REX.W overrides opnd_size */
+                               insn->opnd_bytes = 8;
+               }
+       }
+       insn->rex_prefix.got = 1;
+
+       /* Decode VEX prefix */
+       b = peek_next(insn_byte_t, insn);
+       attr = inat_get_opcode_attribute(b);
+       if (inat_is_vex_prefix(attr)) {
+               insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
+               if (!insn->x86_64) {
+                       /*
+                        * In 32-bits mode, if the [7:6] bits (mod bits of
+                        * ModRM) on the second byte are not 11b, it is
+                        * LDS or LES.
+                        */
+                       if (X86_MODRM_MOD(b2) != 3)
+                               goto vex_end;
+               }
+               insn->vex_prefix.bytes[0] = b;
+               insn->vex_prefix.bytes[1] = b2;
+               if (inat_is_vex3_prefix(attr)) {
+                       b2 = peek_nbyte_next(insn_byte_t, insn, 2);
+                       insn->vex_prefix.bytes[2] = b2;
+                       insn->vex_prefix.nbytes = 3;
+                       insn->next_byte += 3;
+                       if (insn->x86_64 && X86_VEX_W(b2))
+                               /* VEX.W overrides opnd_size */
+                               insn->opnd_bytes = 8;
+               } else {
+                       insn->vex_prefix.nbytes = 2;
+                       insn->next_byte += 2;
+               }
+       }
+vex_end:
+       insn->vex_prefix.got = 1;
+
+       prefixes->got = 1;
+       return;
+}
+
+/**
+ * insn_get_opcode - collect opcode(s)
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates @insn->opcode, updates @insn->next_byte to point past the
+ * opcode byte(s), and set @insn->attr (except for groups).
+ * If necessary, first collects any preceding (prefix) bytes.
+ * Sets @insn->opcode.value = opcode1.  No effect if @insn->opcode.got
+ * is already 1.
+ */
+void insn_get_opcode(struct insn *insn)
+{
+       struct insn_field *opcode = &insn->opcode;
+       insn_byte_t op, pfx;
+       if (opcode->got)
+               return;
+       if (!insn->prefixes.got)
+               insn_get_prefixes(insn);
+
+       /* Get first opcode */
+       op = get_next(insn_byte_t, insn);
+       opcode->bytes[0] = op;
+       opcode->nbytes = 1;
+
+       /* Check if there is VEX prefix or not */
+       if (insn_is_avx(insn)) {
+               insn_byte_t m, p;
+               m = insn_vex_m_bits(insn);
+               p = insn_vex_p_bits(insn);
+               insn->attr = inat_get_avx_attribute(op, m, p);
+               if (!inat_accept_vex(insn->attr))
+                       insn->attr = 0; /* This instruction is bad */
+               goto end;       /* VEX has only 1 byte for opcode */
+       }
+
+       insn->attr = inat_get_opcode_attribute(op);
+       while (inat_is_escape(insn->attr)) {
+               /* Get escaped opcode */
+               op = get_next(insn_byte_t, insn);
+               opcode->bytes[opcode->nbytes++] = op;
+               pfx = insn_last_prefix(insn);
+               insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
+       }
+       if (inat_must_vex(insn->attr))
+               insn->attr = 0; /* This instruction is bad */
+end:
+       opcode->got = 1;
+}
+
+/**
+ * insn_get_modrm - collect ModRM byte, if any
+ * @insn:      &struct insn containing instruction
+ *
+ * Populates @insn->modrm and updates @insn->next_byte to point past the
+ * ModRM byte, if any.  If necessary, first collects the preceding bytes
+ * (prefixes and opcode(s)).  No effect if @insn->modrm.got is already 1.
+ */
+void insn_get_modrm(struct insn *insn)
+{
+       struct insn_field *modrm = &insn->modrm;
+       insn_byte_t pfx, mod;
+       if (modrm->got)
+               return;
+       if (!insn->opcode.got)
+               insn_get_opcode(insn);
+
+       if (inat_has_modrm(insn->attr)) {
+               mod = get_next(insn_byte_t, insn);
+               modrm->value = mod;
+               modrm->nbytes = 1;
+               if (inat_is_group(insn->attr)) {
+                       pfx = insn_last_prefix(insn);
+                       insn->attr = inat_get_group_attribute(mod, pfx,
+                                                             insn->attr);
+               }
+       }
+
+       if (insn->x86_64 && inat_is_force64(insn->attr))
+               insn->opnd_bytes = 8;
+       modrm->got = 1;
+}
+
+
+/**
+ * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.  Always returns 0 if @insn->x86_64 is 0.
+ */
+int insn_rip_relative(struct insn *insn)
+{
+       struct insn_field *modrm = &insn->modrm;
+
+       if (!insn->x86_64)
+               return 0;
+       if (!modrm->got)
+               insn_get_modrm(insn);
+       /*
+        * For rip-relative instructions, the mod field (top 2 bits)
+        * is zero and the r/m field (bottom 3 bits) is 0x5.
+        */
+       return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+}
+
+/**
+ * insn_get_sib() - Get the SIB byte of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * ModRM byte.
+ */
+void insn_get_sib(struct insn *insn)
+{
+       insn_byte_t modrm;
+
+       if (insn->sib.got)
+               return;
+       if (!insn->modrm.got)
+               insn_get_modrm(insn);
+       if (insn->modrm.nbytes) {
+               modrm = (insn_byte_t)insn->modrm.value;
+               if (insn->addr_bytes != 2 &&
+                   X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
+                       insn->sib.value = get_next(insn_byte_t, insn);
+                       insn->sib.nbytes = 1;
+               }
+       }
+       insn->sib.got = 1;
+}
+
+
+/**
+ * insn_get_displacement() - Get the displacement of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * SIB byte.
+ * The displacement value is sign-extended.
+ */
+void insn_get_displacement(struct insn *insn)
+{
+       insn_byte_t mod, rm, base;
+
+       if (insn->displacement.got)
+               return;
+       if (!insn->sib.got)
+               insn_get_sib(insn);
+       if (insn->modrm.nbytes) {
+               /*
+                * Interpreting the modrm byte:
+                * mod = 00 - no displacement fields (exceptions below)
+                * mod = 01 - 1-byte displacement field
+                * mod = 10 - displacement field is 4 bytes, or 2 bytes if
+                *      address size = 2 (0x67 prefix in 32-bit mode)
+                * mod = 11 - no memory operand
+                *
+                * If address size = 2...
+                * mod = 00, r/m = 110 - displacement field is 2 bytes
+                *
+                * If address size != 2...
+                * mod != 11, r/m = 100 - SIB byte exists
+                * mod = 00, SIB base = 101 - displacement field is 4 bytes
+                * mod = 00, r/m = 101 - rip-relative addressing, displacement
+                *      field is 4 bytes
+                */
+               mod = X86_MODRM_MOD(insn->modrm.value);
+               rm = X86_MODRM_RM(insn->modrm.value);
+               base = X86_SIB_BASE(insn->sib.value);
+               if (mod == 3)
+                       goto out;
+               if (mod == 1) {
+                       insn->displacement.value = get_next(char, insn);
+                       insn->displacement.nbytes = 1;
+               } else if (insn->addr_bytes == 2) {
+                       if ((mod == 0 && rm == 6) || mod == 2) {
+                               insn->displacement.value =
+                                        get_next(short, insn);
+                               insn->displacement.nbytes = 2;
+                       }
+               } else {
+                       if ((mod == 0 && rm == 5) || mod == 2 ||
+                           (mod == 0 && base == 5)) {
+                               insn->displacement.value = get_next(int, insn);
+                               insn->displacement.nbytes = 4;
+                       }
+               }
+       }
+out:
+       insn->displacement.got = 1;
+}
+
+/* Decode moffset16/32/64 */
+static void __get_moffset(struct insn *insn)
+{
+       switch (insn->addr_bytes) {
+       case 2:
+               insn->moffset1.value = get_next(short, insn);
+               insn->moffset1.nbytes = 2;
+               break;
+       case 4:
+               insn->moffset1.value = get_next(int, insn);
+               insn->moffset1.nbytes = 4;
+               break;
+       case 8:
+               insn->moffset1.value = get_next(int, insn);
+               insn->moffset1.nbytes = 4;
+               insn->moffset2.value = get_next(int, insn);
+               insn->moffset2.nbytes = 4;
+               break;
+       }
+       insn->moffset1.got = insn->moffset2.got = 1;
+}
+
+/* Decode imm v32(Iz) */
+static void __get_immv32(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate.value = get_next(short, insn);
+               insn->immediate.nbytes = 2;
+               break;
+       case 4:
+       case 8:
+               insn->immediate.value = get_next(int, insn);
+               insn->immediate.nbytes = 4;
+               break;
+       }
+}
+
+/* Decode imm v64(Iv/Ov) */
+static void __get_immv(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate1.value = get_next(short, insn);
+               insn->immediate1.nbytes = 2;
+               break;
+       case 4:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               break;
+       case 8:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               insn->immediate2.value = get_next(int, insn);
+               insn->immediate2.nbytes = 4;
+               break;
+       }
+       insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/* Decode ptr16:16/32(Ap) */
+static void __get_immptr(struct insn *insn)
+{
+       switch (insn->opnd_bytes) {
+       case 2:
+               insn->immediate1.value = get_next(short, insn);
+               insn->immediate1.nbytes = 2;
+               break;
+       case 4:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               break;
+       case 8:
+               /* ptr16:64 does not exist (no segment) */
+               return;
+       }
+       insn->immediate2.value = get_next(unsigned short, insn);
+       insn->immediate2.nbytes = 2;
+       insn->immediate1.got = insn->immediate2.got = 1;
+}
+
+/**
+ * insn_get_immediate() - Get the immediates of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * displacement bytes.
+ * Most immediates are sign-extended. The unsigned value can be obtained
+ * by masking with ((1 << (nbytes * 8)) - 1).
+ */
+void insn_get_immediate(struct insn *insn)
+{
+       if (insn->immediate.got)
+               return;
+       if (!insn->displacement.got)
+               insn_get_displacement(insn);
+
+       if (inat_has_moffset(insn->attr)) {
+               __get_moffset(insn);
+               goto done;
+       }
+
+       if (!inat_has_immediate(insn->attr))
+               /* no immediates */
+               goto done;
+
+       switch (inat_immediate_size(insn->attr)) {
+       case INAT_IMM_BYTE:
+               insn->immediate.value = get_next(char, insn);
+               insn->immediate.nbytes = 1;
+               break;
+       case INAT_IMM_WORD:
+               insn->immediate.value = get_next(short, insn);
+               insn->immediate.nbytes = 2;
+               break;
+       case INAT_IMM_DWORD:
+               insn->immediate.value = get_next(int, insn);
+               insn->immediate.nbytes = 4;
+               break;
+       case INAT_IMM_QWORD:
+               insn->immediate1.value = get_next(int, insn);
+               insn->immediate1.nbytes = 4;
+               insn->immediate2.value = get_next(int, insn);
+               insn->immediate2.nbytes = 4;
+               break;
+       case INAT_IMM_PTR:
+               __get_immptr(insn);
+               break;
+       case INAT_IMM_VWORD32:
+               __get_immv32(insn);
+               break;
+       case INAT_IMM_VWORD:
+               __get_immv(insn);
+               break;
+       default:
+               break;
+       }
+       if (inat_has_second_immediate(insn->attr)) {
+               insn->immediate2.value = get_next(char, insn);
+               insn->immediate2.nbytes = 1;
+       }
+done:
+       insn->immediate.got = 1;
+}
+
+/**
+ * insn_get_length() - Get the length of instruction
+ * @insn:      &struct insn containing instruction
+ *
+ * If necessary, first collects the instruction up to and including the
+ * immediate bytes.  No effect if @insn->length is already non-zero.
+ */
+void insn_get_length(struct insn *insn)
+{
+       if (insn->length)
+               return;
+       if (!insn->immediate.got)
+               insn_get_immediate(insn);
+       insn->length = (unsigned char)((unsigned long)insn->next_byte
+                                    - (unsigned long)insn->kaddr);
+}
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644 (file)
index 0000000..a793da5
--- /dev/null
@@ -0,0 +1,893 @@
+# x86 Opcode Maps
+#
+#<Opcode maps>
+# Table: table-name
+# Referrer: escaped-name
+# AVXcode: avx-code
+# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...]] [| 2nd-mnemonic ...]
+# (or)
+# opcode: escape # escaped-name
+# EndTable
+#
+#<group maps>
+# GrpTable: GrpXXX
+# reg:  mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...]] [| 2nd-mnemonic ...]
+# EndTable
+#
+# AVX Superscripts
+#  (VEX): this opcode can accept VEX prefix.
+#  (oVEX): this opcode requires VEX prefix.
+#  (o128): this opcode only supports 128bit VEX.
+#  (o256): this opcode only supports 256bit VEX.
+#
+
+Table: one byte opcode
+Referrer:
+AVXcode:
+# 0x00 - 0x0f
+00: ADD Eb,Gb
+01: ADD Ev,Gv
+02: ADD Gb,Eb
+03: ADD Gv,Ev
+04: ADD AL,Ib
+05: ADD rAX,Iz
+06: PUSH ES (i64)
+07: POP ES (i64)
+08: OR Eb,Gb
+09: OR Ev,Gv
+0a: OR Gb,Eb
+0b: OR Gv,Ev
+0c: OR AL,Ib
+0d: OR rAX,Iz
+0e: PUSH CS (i64)
+0f: escape # 2-byte escape
+# 0x10 - 0x1f
+10: ADC Eb,Gb
+11: ADC Ev,Gv
+12: ADC Gb,Eb
+13: ADC Gv,Ev
+14: ADC AL,Ib
+15: ADC rAX,Iz
+16: PUSH SS (i64)
+17: POP SS (i64)
+18: SBB Eb,Gb
+19: SBB Ev,Gv
+1a: SBB Gb,Eb
+1b: SBB Gv,Ev
+1c: SBB AL,Ib
+1d: SBB rAX,Iz
+1e: PUSH DS (i64)
+1f: POP DS (i64)
+# 0x20 - 0x2f
+20: AND Eb,Gb
+21: AND Ev,Gv
+22: AND Gb,Eb
+23: AND Gv,Ev
+24: AND AL,Ib
+25: AND rAX,Iz
+26: SEG=ES (Prefix)
+27: DAA (i64)
+28: SUB Eb,Gb
+29: SUB Ev,Gv
+2a: SUB Gb,Eb
+2b: SUB Gv,Ev
+2c: SUB AL,Ib
+2d: SUB rAX,Iz
+2e: SEG=CS (Prefix)
+2f: DAS (i64)
+# 0x30 - 0x3f
+30: XOR Eb,Gb
+31: XOR Ev,Gv
+32: XOR Gb,Eb
+33: XOR Gv,Ev
+34: XOR AL,Ib
+35: XOR rAX,Iz
+36: SEG=SS (Prefix)
+37: AAA (i64)
+38: CMP Eb,Gb
+39: CMP Ev,Gv
+3a: CMP Gb,Eb
+3b: CMP Gv,Ev
+3c: CMP AL,Ib
+3d: CMP rAX,Iz
+3e: SEG=DS (Prefix)
+3f: AAS (i64)
+# 0x40 - 0x4f
+40: INC eAX (i64) | REX (o64)
+41: INC eCX (i64) | REX.B (o64)
+42: INC eDX (i64) | REX.X (o64)
+43: INC eBX (i64) | REX.XB (o64)
+44: INC eSP (i64) | REX.R (o64)
+45: INC eBP (i64) | REX.RB (o64)
+46: INC eSI (i64) | REX.RX (o64)
+47: INC eDI (i64) | REX.RXB (o64)
+48: DEC eAX (i64) | REX.W (o64)
+49: DEC eCX (i64) | REX.WB (o64)
+4a: DEC eDX (i64) | REX.WX (o64)
+4b: DEC eBX (i64) | REX.WXB (o64)
+4c: DEC eSP (i64) | REX.WR (o64)
+4d: DEC eBP (i64) | REX.WRB (o64)
+4e: DEC eSI (i64) | REX.WRX (o64)
+4f: DEC eDI (i64) | REX.WRXB (o64)
+# 0x50 - 0x5f
+50: PUSH rAX/r8 (d64)
+51: PUSH rCX/r9 (d64)
+52: PUSH rDX/r10 (d64)
+53: PUSH rBX/r11 (d64)
+54: PUSH rSP/r12 (d64)
+55: PUSH rBP/r13 (d64)
+56: PUSH rSI/r14 (d64)
+57: PUSH rDI/r15 (d64)
+58: POP rAX/r8 (d64)
+59: POP rCX/r9 (d64)
+5a: POP rDX/r10 (d64)
+5b: POP rBX/r11 (d64)
+5c: POP rSP/r12 (d64)
+5d: POP rBP/r13 (d64)
+5e: POP rSI/r14 (d64)
+5f: POP rDI/r15 (d64)
+# 0x60 - 0x6f
+60: PUSHA/PUSHAD (i64)
+61: POPA/POPAD (i64)
+62: BOUND Gv,Ma (i64)
+63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
+64: SEG=FS (Prefix)
+65: SEG=GS (Prefix)
+66: Operand-Size (Prefix)
+67: Address-Size (Prefix)
+68: PUSH Iz (d64)
+69: IMUL Gv,Ev,Iz
+6a: PUSH Ib (d64)
+6b: IMUL Gv,Ev,Ib
+6c: INS/INSB Yb,DX
+6d: INS/INSW/INSD Yz,DX
+6e: OUTS/OUTSB DX,Xb
+6f: OUTS/OUTSW/OUTSD DX,Xz
+# 0x70 - 0x7f
+70: JO Jb
+71: JNO Jb
+72: JB/JNAE/JC Jb
+73: JNB/JAE/JNC Jb
+74: JZ/JE Jb
+75: JNZ/JNE Jb
+76: JBE/JNA Jb
+77: JNBE/JA Jb
+78: JS Jb
+79: JNS Jb
+7a: JP/JPE Jb
+7b: JNP/JPO Jb
+7c: JL/JNGE Jb
+7d: JNL/JGE Jb
+7e: JLE/JNG Jb
+7f: JNLE/JG Jb
+# 0x80 - 0x8f
+80: Grp1 Eb,Ib (1A)
+81: Grp1 Ev,Iz (1A)
+82: Grp1 Eb,Ib (1A),(i64)
+83: Grp1 Ev,Ib (1A)
+84: TEST Eb,Gb
+85: TEST Ev,Gv
+86: XCHG Eb,Gb
+87: XCHG Ev,Gv
+88: MOV Eb,Gb
+89: MOV Ev,Gv
+8a: MOV Gb,Eb
+8b: MOV Gv,Ev
+8c: MOV Ev,Sw
+8d: LEA Gv,M
+8e: MOV Sw,Ew
+8f: Grp1A (1A) | POP Ev (d64)
+# 0x90 - 0x9f
+90: NOP | PAUSE (F3) | XCHG r8,rAX
+91: XCHG rCX/r9,rAX
+92: XCHG rDX/r10,rAX
+93: XCHG rBX/r11,rAX
+94: XCHG rSP/r12,rAX
+95: XCHG rBP/r13,rAX
+96: XCHG rSI/r14,rAX
+97: XCHG rDI/r15,rAX
+98: CBW/CWDE/CDQE
+99: CWD/CDQ/CQO
+9a: CALLF Ap (i64)
+9b: FWAIT/WAIT
+9c: PUSHF/D/Q Fv (d64)
+9d: POPF/D/Q Fv (d64)
+9e: SAHF
+9f: LAHF
+# 0xa0 - 0xaf
+a0: MOV AL,Ob
+a1: MOV rAX,Ov
+a2: MOV Ob,AL
+a3: MOV Ov,rAX
+a4: MOVS/B Xb,Yb
+a5: MOVS/W/D/Q Xv,Yv
+a6: CMPS/B Xb,Yb
+a7: CMPS/W/D Xv,Yv
+a8: TEST AL,Ib
+a9: TEST rAX,Iz
+aa: STOS/B Yb,AL
+ab: STOS/W/D/Q Yv,rAX
+ac: LODS/B AL,Xb
+ad: LODS/W/D/Q rAX,Xv
+ae: SCAS/B AL,Yb
+af: SCAS/W/D/Q rAX,Yv
+# 0xb0 - 0xbf
+b0: MOV AL/R8L,Ib
+b1: MOV CL/R9L,Ib
+b2: MOV DL/R10L,Ib
+b3: MOV BL/R11L,Ib
+b4: MOV AH/R12L,Ib
+b5: MOV CH/R13L,Ib
+b6: MOV DH/R14L,Ib
+b7: MOV BH/R15L,Ib
+b8: MOV rAX/r8,Iv
+b9: MOV rCX/r9,Iv
+ba: MOV rDX/r10,Iv
+bb: MOV rBX/r11,Iv
+bc: MOV rSP/r12,Iv
+bd: MOV rBP/r13,Iv
+be: MOV rSI/r14,Iv
+bf: MOV rDI/r15,Iv
+# 0xc0 - 0xcf
+c0: Grp2 Eb,Ib (1A)
+c1: Grp2 Ev,Ib (1A)
+c2: RETN Iw (f64)
+c3: RETN
+c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
+c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
+c6: Grp11 Eb,Ib (1A)
+c7: Grp11 Ev,Iz (1A)
+c8: ENTER Iw,Ib
+c9: LEAVE (d64)
+ca: RETF Iw
+cb: RETF
+cc: INT3
+cd: INT Ib
+ce: INTO (i64)
+cf: IRET/D/Q
+# 0xd0 - 0xdf
+d0: Grp2 Eb,1 (1A)
+d1: Grp2 Ev,1 (1A)
+d2: Grp2 Eb,CL (1A)
+d3: Grp2 Ev,CL (1A)
+d4: AAM Ib (i64)
+d5: AAD Ib (i64)
+d6:
+d7: XLAT/XLATB
+d8: ESC
+d9: ESC
+da: ESC
+db: ESC
+dc: ESC
+dd: ESC
+de: ESC
+df: ESC
+# 0xe0 - 0xef
+e0: LOOPNE/LOOPNZ Jb (f64)
+e1: LOOPE/LOOPZ Jb (f64)
+e2: LOOP Jb (f64)
+e3: JrCXZ Jb (f64)
+e4: IN AL,Ib
+e5: IN eAX,Ib
+e6: OUT Ib,AL
+e7: OUT Ib,eAX
+e8: CALL Jz (f64)
+e9: JMP-near Jz (f64)
+ea: JMP-far Ap (i64)
+eb: JMP-short Jb (f64)
+ec: IN AL,DX
+ed: IN eAX,DX
+ee: OUT DX,AL
+ef: OUT DX,eAX
+# 0xf0 - 0xff
+f0: LOCK (Prefix)
+f1:
+f2: REPNE (Prefix)
+f3: REP/REPE (Prefix)
+f4: HLT
+f5: CMC
+f6: Grp3_1 Eb (1A)
+f7: Grp3_2 Ev (1A)
+f8: CLC
+f9: STC
+fa: CLI
+fb: STI
+fc: CLD
+fd: STD
+fe: Grp4 (1A)
+ff: Grp5 (1A)
+EndTable
+
+Table: 2-byte opcode (0x0f)
+Referrer: 2-byte escape
+AVXcode: 1
+# 0x0f 0x00-0x0f
+00: Grp6 (1A)
+01: Grp7 (1A)
+02: LAR Gv,Ew
+03: LSL Gv,Ew
+04:
+05: SYSCALL (o64)
+06: CLTS
+07: SYSRET (o64)
+08: INVD
+09: WBINVD
+0a:
+0b: UD2 (1B)
+0c:
+0d: NOP Ev | GrpP
+0e: FEMMS
+# 3DNow! uses the last imm byte as opcode extension.
+0f: 3DNow! Pq,Qq,Ib
+# 0x0f 0x10-0x1f
+10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
+11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
+12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
+13: movlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
+14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
+15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
+16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlhps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
+17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
+18: Grp16 (1A)
+19:
+1a:
+1b:
+1c:
+1d:
+1e:
+1f: NOP Ev
+# 0x0f 0x20-0x2f
+20: MOV Rd,Cd
+21: MOV Rd,Dd
+22: MOV Cd,Rd
+23: MOV Dd,Rd
+24:
+25:
+26:
+27:
+28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
+29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
+2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
+2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
+2c: cvttps2pi Ppi,Wps | cvttss2si  Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
+2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd  Vsd,Wsd (66),(VEX),(o128)
+2f: comiss Vss,Wss (VEX),(o128) | comisd  Vsd,Wsd (66),(VEX),(o128)
+# 0x0f 0x30-0x3f
+30: WRMSR
+31: RDTSC
+32: RDMSR
+33: RDPMC
+34: SYSENTER
+35: SYSEXIT
+36:
+37: GETSEC
+38: escape # 3-byte escape 1
+39:
+3a: escape # 3-byte escape 2
+3b:
+3c:
+3d:
+3e:
+3f:
+# 0x0f 0x40-0x4f
+40: CMOVO Gv,Ev
+41: CMOVNO Gv,Ev
+42: CMOVB/C/NAE Gv,Ev
+43: CMOVAE/NB/NC Gv,Ev
+44: CMOVE/Z Gv,Ev
+45: CMOVNE/NZ Gv,Ev
+46: CMOVBE/NA Gv,Ev
+47: CMOVA/NBE Gv,Ev
+48: CMOVS Gv,Ev
+49: CMOVNS Gv,Ev
+4a: CMOVP/PE Gv,Ev
+4b: CMOVNP/PO Gv,Ev
+4c: CMOVL/NGE Gv,Ev
+4d: CMOVNL/GE Gv,Ev
+4e: CMOVLE/NG Gv,Ev
+4f: CMOVNLE/G Gv,Ev
+# 0x0f 0x50-0x5f
+50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
+51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
+52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
+53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
+54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
+55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
+56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
+57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
+58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
+59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
+5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
+5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
+5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
+5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
+5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
+5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
+# 0x0f 0x60-0x6f
+60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
+61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
+62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
+63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
+64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
+65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
+66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
+67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
+68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
+69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
+6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
+6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
+6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
+6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
+6e: movd/q Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
+6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
+# 0x0f 0x70-0x7f
+70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw Vdq,Wdq,Ib (F2),(VEX),(o128)
+71: Grp12 (1A)
+72: Grp13 (1A)
+73: Grp14 (1A)
+74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
+75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
+76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
+77: emms/vzeroupper/vzeroall (VEX)
+78: VMREAD Ed/q,Gd/q
+79: VMWRITE Gd/q,Ed/q
+7a:
+7b:
+7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
+7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
+7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
+7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
+# 0x0f 0x80-0x8f
+80: JO Jz (f64)
+81: JNO Jz (f64)
+82: JB/JNAE/JC Jz (f64)
+83: JNB/JAE/JNC Jz (f64)
+84: JZ/JE Jz (f64)
+85: JNZ/JNE Jz (f64)
+86: JBE/JNA Jz (f64)
+87: JNBE/JA Jz (f64)
+88: JS Jz (f64)
+89: JNS Jz (f64)
+8a: JP/JPE Jz (f64)
+8b: JNP/JPO Jz (f64)
+8c: JL/JNGE Jz (f64)
+8d: JNL/JGE Jz (f64)
+8e: JLE/JNG Jz (f64)
+8f: JNLE/JG Jz (f64)
+# 0x0f 0x90-0x9f
+90: SETO Eb
+91: SETNO Eb
+92: SETB/C/NAE Eb
+93: SETAE/NB/NC Eb
+94: SETE/Z Eb
+95: SETNE/NZ Eb
+96: SETBE/NA Eb
+97: SETA/NBE Eb
+98: SETS Eb
+99: SETNS Eb
+9a: SETP/PE Eb
+9b: SETNP/PO Eb
+9c: SETL/NGE Eb
+9d: SETNL/GE Eb
+9e: SETLE/NG Eb
+9f: SETNLE/G Eb
+# 0x0f 0xa0-0xaf
+a0: PUSH FS (d64)
+a1: POP FS (d64)
+a2: CPUID
+a3: BT Ev,Gv
+a4: SHLD Ev,Gv,Ib
+a5: SHLD Ev,Gv,CL
+a6: GrpPDLK
+a7: GrpRNG
+a8: PUSH GS (d64)
+a9: POP GS (d64)
+aa: RSM
+ab: BTS Ev,Gv
+ac: SHRD Ev,Gv,Ib
+ad: SHRD Ev,Gv,CL
+ae: Grp15 (1A),(1C)
+af: IMUL Gv,Ev
+# 0x0f 0xb0-0xbf
+b0: CMPXCHG Eb,Gb
+b1: CMPXCHG Ev,Gv
+b2: LSS Gv,Mp
+b3: BTR Ev,Gv
+b4: LFS Gv,Mp
+b5: LGS Gv,Mp
+b6: MOVZX Gv,Eb
+b7: MOVZX Gv,Ew
+b8: JMPE | POPCNT Gv,Ev (F3)
+b9: Grp10 (1A)
+ba: Grp8 Ev,Ib (1A)
+bb: BTC Ev,Gv
+bc: BSF Gv,Ev
+bd: BSR Gv,Ev
+be: MOVSX Gv,Eb
+bf: MOVSX Gv,Ew
+# 0x0f 0xc0-0xcf
+c0: XADD Eb,Gb
+c1: XADD Ev,Gv
+c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
+c3: movnti Md/q,Gd/q
+c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
+c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
+c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
+c7: Grp9 (1A)
+c8: BSWAP RAX/EAX/R8/R8D
+c9: BSWAP RCX/ECX/R9/R9D
+ca: BSWAP RDX/EDX/R10/R10D
+cb: BSWAP RBX/EBX/R11/R11D
+cc: BSWAP RSP/ESP/R12/R12D
+cd: BSWAP RBP/EBP/R13/R13D
+ce: BSWAP RSI/ESI/R14/R14D
+cf: BSWAP RDI/EDI/R15/R15D
+# 0x0f 0xd0-0xdf
+d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
+d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
+d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
+d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
+d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
+d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
+d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
+d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
+d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
+da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
+db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
+dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
+dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
+de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
+df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xe0-0xef
+e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
+e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
+e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
+e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
+e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
+e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
+e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
+e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
+e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
+e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
+ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
+eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
+ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
+ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
+ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
+ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0xf0-0xff
+f0: lddqu Vdq,Mdq (F2),(VEX)
+f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
+f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
+f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
+f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
+f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
+f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
+f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
+f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
+f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
+fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
+fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
+fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
+fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
+fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
+ff:
+EndTable
+
+Table: 3-byte opcode 1 (0x0f 0x38)
+Referrer: 3-byte escape 1
+AVXcode: 2
+# 0x0f 0x38 0x00-0x0f
+00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
+01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
+02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
+03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
+04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
+05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
+06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
+07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
+08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
+09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
+0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
+0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
+0c: Vpermilps /r (66),(oVEX)
+0d: Vpermilpd /r (66),(oVEX)
+0e: vtestps /r (66),(oVEX)
+0f: vtestpd /r (66),(oVEX)
+# 0x0f 0x38 0x10-0x1f
+10: pblendvb Vdq,Wdq (66)
+11:
+12:
+13:
+14: blendvps Vdq,Wdq (66)
+15: blendvpd Vdq,Wdq (66)
+16:
+17: ptest Vdq,Wdq (66),(VEX)
+18: vbroadcastss /r (66),(oVEX)
+19: vbroadcastsd /r (66),(oVEX),(o256)
+1a: vbroadcastf128 /r (66),(oVEX),(o256)
+1b:
+1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
+1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
+1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
+1f:
+# 0x0f 0x38 0x20-0x2f
+20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
+21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
+22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
+23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
+24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
+25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
+26:
+27:
+28: pmuldq Vdq,Wdq (66),(VEX),(o128)
+29: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
+2a: movntdqa Vdq,Mdq (66),(VEX),(o128)
+2b: packusdw Vdq,Wdq (66),(VEX),(o128)
+2c: vmaskmovps(ld) /r (66),(oVEX)
+2d: vmaskmovpd(ld) /r (66),(oVEX)
+2e: vmaskmovps(st) /r (66),(oVEX)
+2f: vmaskmovpd(st) /r (66),(oVEX)
+# 0x0f 0x38 0x30-0x3f
+30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
+31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
+32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
+33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
+34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
+35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
+36:
+37: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
+38: pminsb Vdq,Wdq (66),(VEX),(o128)
+39: pminsd Vdq,Wdq (66),(VEX),(o128)
+3a: pminuw Vdq,Wdq (66),(VEX),(o128)
+3b: pminud Vdq,Wdq (66),(VEX),(o128)
+3c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
+3d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
+3e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
+3f: pmaxud Vdq,Wdq (66),(VEX),(o128)
+# 0x0f 0x38 0x40-0x8f
+40: pmulld Vdq,Wdq (66),(VEX),(o128)
+41: phminposuw Vdq,Wdq (66),(VEX),(o128)
+80: INVEPT Gd/q,Mdq (66)
+81: INVPID Gd/q,Mdq (66)
+# 0x0f 0x38 0x90-0xbf (FMA)
+96: vfmaddsub132pd/ps /r (66),(VEX)
+97: vfmsubadd132pd/ps /r (66),(VEX)
+98: vfmadd132pd/ps /r (66),(VEX)
+99: vfmadd132sd/ss /r (66),(VEX),(o128)
+9a: vfmsub132pd/ps /r (66),(VEX)
+9b: vfmsub132sd/ss /r (66),(VEX),(o128)
+9c: vfnmadd132pd/ps /r (66),(VEX)
+9d: vfnmadd132sd/ss /r (66),(VEX),(o128)
+9e: vfnmsub132pd/ps /r (66),(VEX)
+9f: vfnmsub132sd/ss /r (66),(VEX),(o128)
+a6: vfmaddsub213pd/ps /r (66),(VEX)
+a7: vfmsubadd213pd/ps /r (66),(VEX)
+a8: vfmadd213pd/ps /r (66),(VEX)
+a9: vfmadd213sd/ss /r (66),(VEX),(o128)
+aa: vfmsub213pd/ps /r (66),(VEX)
+ab: vfmsub213sd/ss /r (66),(VEX),(o128)
+ac: vfnmadd213pd/ps /r (66),(VEX)
+ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
+ae: vfnmsub213pd/ps /r (66),(VEX)
+af: vfnmsub213sd/ss /r (66),(VEX),(o128)
+b6: vfmaddsub231pd/ps /r (66),(VEX)
+b7: vfmsubadd231pd/ps /r (66),(VEX)
+b8: vfmadd231pd/ps /r (66),(VEX)
+b9: vfmadd231sd/ss /r (66),(VEX),(o128)
+ba: vfmsub231pd/ps /r (66),(VEX)
+bb: vfmsub231sd/ss /r (66),(VEX),(o128)
+bc: vfnmadd231pd/ps /r (66),(VEX)
+bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
+be: vfnmsub231pd/ps /r (66),(VEX)
+bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
+# 0x0f 0x38 0xc0-0xff
+db: aesimc Vdq,Wdq (66),(VEX),(o128)
+dc: aesenc Vdq,Wdq (66),(VEX),(o128)
+dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
+de: aesdec Vdq,Wdq (66),(VEX),(o128)
+df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
+f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
+f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
+EndTable
+
+Table: 3-byte opcode 2 (0x0f 0x3a)
+Referrer: 3-byte escape 2
+AVXcode: 3
+# 0x0f 0x3a 0x00-0xff
+04: vpermilps /r,Ib (66),(oVEX)
+05: vpermilpd /r,Ib (66),(oVEX)
+06: vperm2f128 /r,Ib (66),(oVEX),(o256)
+08: roundps Vdq,Wdq,Ib (66),(VEX)
+09: roundpd Vdq,Wdq,Ib (66),(VEX)
+0a: roundss Vss,Wss,Ib (66),(VEX),(o128)
+0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
+0c: blendps Vdq,Wdq,Ib (66),(VEX)
+0d: blendpd Vdq,Wdq,Ib (66),(VEX)
+0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
+0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
+14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
+15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
+16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
+17: extractps Ed,Vdq,Ib (66),(VEX),(o128)
+18: vinsertf128 /r,Ib (66),(oVEX),(o256)
+19: vextractf128 /r,Ib (66),(oVEX),(o256)
+20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
+21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
+22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
+40: dpps Vdq,Wdq,Ib (66),(VEX)
+41: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
+42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
+44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
+4a: vblendvps /r,Ib (66),(oVEX)
+4b: vblendvpd /r,Ib (66),(oVEX)
+4c: vpblendvb /r,Ib (66),(oVEX),(o128)
+60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
+61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
+62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
+63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
+df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
+EndTable
+
+GrpTable: Grp1
+0: ADD
+1: OR
+2: ADC
+3: SBB
+4: AND
+5: SUB
+6: XOR
+7: CMP
+EndTable
+
+GrpTable: Grp1A
+0: POP
+EndTable
+
+GrpTable: Grp2
+0: ROL
+1: ROR
+2: RCL
+3: RCR
+4: SHL/SAL
+5: SHR
+6:
+7: SAR
+EndTable
+
+GrpTable: Grp3_1
+0: TEST Eb,Ib
+1:
+2: NOT Eb
+3: NEG Eb
+4: MUL AL,Eb
+5: IMUL AL,Eb
+6: DIV AL,Eb
+7: IDIV AL,Eb
+EndTable
+
+GrpTable: Grp3_2
+0: TEST Ev,Iz
+1:
+2: NOT Ev
+3: NEG Ev
+4: MUL rAX,Ev
+5: IMUL rAX,Ev
+6: DIV rAX,Ev
+7: IDIV rAX,Ev
+EndTable
+
+GrpTable: Grp4
+0: INC Eb
+1: DEC Eb
+EndTable
+
+GrpTable: Grp5
+0: INC Ev
+1: DEC Ev
+2: CALLN Ev (f64)
+3: CALLF Ep
+4: JMPN Ev (f64)
+5: JMPF Ep
+6: PUSH Ev (d64)
+7:
+EndTable
+
+GrpTable: Grp6
+0: SLDT Rv/Mw
+1: STR Rv/Mw
+2: LLDT Ew
+3: LTR Ew
+4: VERR Ew
+5: VERW Ew
+EndTable
+
+GrpTable: Grp7
+0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
+1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
+3: LIDT Ms
+4: SMSW Mw/Rv
+5:
+6: LMSW Ew
+7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
+EndTable
+
+GrpTable: Grp8
+4: BT
+5: BTS
+6: BTR
+7: BTC
+EndTable
+
+GrpTable: Grp9
+1: CMPXCHG8B/16B Mq/Mdq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
+7: VMPTRST Mq
+EndTable
+
+GrpTable: Grp10
+EndTable
+
+GrpTable: Grp11
+0: MOV
+EndTable
+
+GrpTable: Grp12
+2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
+4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
+6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp13
+2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
+4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
+6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp14
+2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
+3: psrldq Udq,Ib (66),(11B),(VEX),(o128)
+6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
+7: pslldq Udq,Ib (66),(11B),(VEX),(o128)
+EndTable
+
+GrpTable: Grp15
+0: fxsave
+1: fxrstor
+2: ldmxcsr (VEX)
+3: stmxcsr (VEX)
+4: XSAVE
+5: XRSTOR | lfence (11B)
+6: mfence (11B)
+7: clflush | sfence (11B)
+EndTable
+
+GrpTable: Grp16
+0: prefetch NTA
+1: prefetch T0
+2: prefetch T1
+3: prefetch T2
+EndTable
+
+# AMD's Prefetch Group
+GrpTable: GrpP
+0: PREFETCH
+1: PREFETCHW
+EndTable
+
+GrpTable: GrpPDLK
+0: MONTMUL
+1: XSHA1
+2: XSHA2
+EndTable
+
+GrpTable: GrpRNG
+0: xstore-rng
+1: xcrypt-ecb
+2: xcrypt-cbc
+4: xcrypt-cfb
+5: xcrypt-ofb
+EndTable
index f4cee9028cf0b01e11951662b625f63371f627e6..8f4e2ac93928edd82f4b34ac3bdead37eea289d4 100644 (file)
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  */
-static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+static inline int __kprobes
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
        return 0;
 }
 
-static inline int notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes notify_page_fault(struct pt_regs *regs)
 {
        int ret = 0;
 
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
  *
  *   Handle a fault on the vmalloc or module mapping area
  */
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
 {
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
  *
  * This assumes no large pages in there.
  */
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
 {
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
  */
-static noinline int
+static noinline __kprobes int
 spurious_fault(unsigned long error_code, unsigned long address)
 {
        pgd_t *pgd;
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644 (file)
index 0000000..4688f90
--- /dev/null
@@ -0,0 +1,22 @@
+# Build-time self test of the x86 instruction decoder: disassemble vmlinux
+# with objdump, distill the output and feed it to the test_get_len host tool.
+PHONY += posttest
+
+# Pass -v to test_get_len when KBUILD_VERBOSE is 1. The variable name must
+# match the $(posttest_verbose) reference in cmd_posttest below.
+ifeq ($(KBUILD_VERBOSE),1)
+  posttest_verbose = -v
+else
+  posttest_verbose =
+endif
+
+quiet_cmd_posttest = TEST    $@
+      cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | awk -f $(srctree)/arch/x86/tools/distill.awk | $(obj)/test_get_len -$(CONFIG_64BIT) $(posttest_verbose)
+
+posttest: $(obj)/test_get_len vmlinux
+       $(call cmd,posttest)
+
+hostprogs-y    := test_get_len
+
+# -I is needed for the generated C source and for C sources that live in the kernel tree.
+HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+
+# Dependencies are also needed.
+$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644 (file)
index 0000000..c13c0ee
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/awk -f
+# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+# Distills the disassembly as follows:
+# - Removes all lines except the disassembled instructions and the symbol
+#   header lines (the latter are re-emitted as "<name>:address").
+# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
+#   into a single line.
+# - Removes bad (or prefix-only) instructions.
+#
+# Each instruction is emitted as "address<TAB>hex bytes<TAB>mnemonic".
+
+# State: the most recently seen instruction is held in prev_* and is only
+# printed once the next instruction line (or the end of input) is reached,
+# so that continuation lines can still be appended to its hex bytes.
+BEGIN {
+       prev_addr = ""
+       prev_hex = ""
+       prev_mnemonic = ""
+       # Mnemonic text that marks an instruction to be dropped: "(bad)",
+       # rex*, byte directives, and lone prefixes (rep, lock, segment,
+       # data/addr size).
+       bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+       # Hex field starting with the byte 9b, and the replacement line
+       # printed for it.
+       fwait_expr = "^9b "
+       fwait_str="9b\tfwait"
+}
+
+# Lines of the form "address <symbol>:".
+/^ *[0-9a-f]+ <[^>]*>:/ {
+       # Symbol entry: print the second field directly followed by the first
+       printf("%s%s\n", $2, $1)
+}
+
+# Lines of the form "address:" followed by tab-separated hex bytes and,
+# on the first line of an instruction, the mnemonic.
+/^ *[0-9a-f]+:/ {
+       if (split($0, field, "\t") < 3) {
+               # This is a continuation of the same insn.
+               # (fewer than three tab-separated fields: no mnemonic column)
+               prev_hex = prev_hex field[2]
+       } else {
+               # A new instruction starts here: flush the pending one first.
+               # Skip bad instructions
+               if (match(prev_mnemonic, bad_expr))
+                       prev_addr = ""
+               # Split fwait from other f* instructions
+               # NOTE(review): prev_addr may already have been cleared by the
+               # bad_expr check above, in which case the fwait line is printed
+               # with an empty address - confirm this cannot happen in practice.
+               if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
+                       printf "%s\t%s\n", prev_addr, fwait_str
+                       sub(fwait_expr, "", prev_hex)
+               }
+               if (prev_addr != "")
+                       printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+               # Remember the current line as the new pending instruction.
+               prev_addr = field[1]
+               prev_hex = field[2]
+               prev_mnemonic = field[3]
+       }
+}
+
+# Flush the last pending instruction.
+# NOTE(review): unlike the rule above, no bad_expr filtering or fwait
+# splitting is applied to this final instruction - confirm that is intended.
+END {
+       if (prev_addr != "")
+               printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
+}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644 (file)
index 0000000..e34e92a
--- /dev/null
@@ -0,0 +1,380 @@
+#!/bin/awk -f
+# gen-insn-attr-x86.awk: Instruction attribute table generator
+# Written by Masami Hiramatsu <mhiramat@redhat.com>
+#
+# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
+
+# Probe the running awk for the features this script relies on.
+# Returns an error message, or "" when both probes pass.
+function check_awk_implement(   problem) {
+       problem = ""
+       if (!match("abc", "[[:lower:]]+"))
+               problem = "Your awk doesn't support charactor-class."
+       else if (sprintf("%x", 0) != "0")
+               problem = "Your awk has a printf-format problem."
+       return problem
+}
+
+# Reset the per-table state: the current table name, the three table ids and
+# the four attribute arrays.
+function clear_vars() {
+       tname = ""
+       eid = -1 # escape id
+       gid = -1 # group id
+       aid = -1 # AVX id
+       delete table
+       delete lptable1
+       delete lptable2
+       delete lptable3
+}
+
+# Startup: verify the awk implementation, print the header of the generated
+# file and set up the id counters, patterns and lookup tables used below.
+BEGIN {
+       # Implementation error checking
+       awkchecked = check_awk_implement()
+       if (awkchecked != "") {
+               print "Error: " awkchecked > "/dev/stderr"
+               print "Please try to use gawk." > "/dev/stderr"
+               exit 1
+       }
+
+       # Setup generating tables
+       print "/* x86 opcode map generated from x86-opcode-map.txt */"
+       print "/* Do not change this code. */\n"
+       # Next free group/escape id (both start at 1) and the number of AVX
+       # table slots seen so far.
+       ggid = 1
+       geid = 1
+       gaid = 0
+       # Names of the emitted tables, keyed by [id, last-prefix index]
+       delete etable
+       delete gtable
+       delete atable
+
+       # Patterns that classify the fields of an opcode line
+       opnd_expr = "^[[:alpha:]/]"
+       ext_expr = "^\\("
+       sep_expr = "^\\|$"
+       group_expr = "^Grp[[:alnum:]]+"
+
+       # Immediate operand names and the attribute string each one maps to
+       imm_expr = "^[IJAO][[:lower:]]"
+       imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+       imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
+       imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
+       imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
+       imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
+       imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
+       imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+       imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
+       imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
+       imm_flag["Ob"] = "INAT_MOFFSET"
+       imm_flag["Ov"] = "INAT_MOFFSET"
+
+       # Operand names that set INAT_MODRM, and patterns for the remaining
+       # per-opcode attributes
+       modrm_expr = "^([CDEGMNPQRSUVW/][[:lower:]]+|NTA|T[012])"
+       force64_expr = "\\([df]64\\)"
+       rex_expr = "^REX(\\.[XRWB]+)*"
+       fpu_expr = "^ESC" # TODO
+
+       # Extensions that select one of the last-prefix variant tables
+       # (lptable1..3); max_lprefix bounds the variant index loops in END.
+       lprefix1_expr = "\\(66\\)"
+       lprefix2_expr = "\\(F3\\)"
+       lprefix3_expr = "\\(F2\\)"
+       max_lprefix = 4
+
+       vexok_expr = "\\(VEX\\)"
+       vexonly_expr = "\\(oVEX\\)"
+
+       # Prefix names (opcodes tagged "(Prefix)") and their INAT_PFX_* ids
+       prefix_expr = "\\(Prefix\\)"
+       prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
+       prefix_num["REPNE"] = "INAT_PFX_REPNE"
+       prefix_num["REP/REPE"] = "INAT_PFX_REPE"
+       prefix_num["LOCK"] = "INAT_PFX_LOCK"
+       prefix_num["SEG=CS"] = "INAT_PFX_CS"
+       prefix_num["SEG=DS"] = "INAT_PFX_DS"
+       prefix_num["SEG=ES"] = "INAT_PFX_ES"
+       prefix_num["SEG=FS"] = "INAT_PFX_FS"
+       prefix_num["SEG=GS"] = "INAT_PFX_GS"
+       prefix_num["SEG=SS"] = "INAT_PFX_SS"
+       prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
+       prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
+       prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
+
+       clear_vars()
+}
+
+# Report a problem at the current input record on stderr and exit with 1.
+function semantic_error(msg) {
+       printf "Semantic error at %d: %s\n", NR, msg > "/dev/stderr"
+       exit 1
+}
+
+# Debugging aid: print msg, tagged with "DEBUG: ", on standard output.
+function debug(msg) {
+       printf "DEBUG: %s\n", msg
+}
+
+# Return the number of elements stored in arr.
+function array_size(arr,   key,count) {
+       count = 0
+       for (key in arr)
+               count++
+       return count
+}
+
+# "Table:" line: echo it as a C comment; it is an error if a table name is
+# still set, i.e. the previous table was not closed by EndTable.
+/^Table:/ {
+       printf "/* %s */\n", $0
+       if (tname != "")
+               semantic_error("Hit Table: before EndTable:.")
+}
+
+# "Referrer: <name>" line: look up the escape id registered under <name> and
+# name the current table after it.  A line without a name changes nothing.
+/^Referrer:/ {
+       if (NF != 1) {
+               # escape opcode table
+               ref = ""
+               for (i = 2; i <= NF; i++)
+                       ref = ref $i
+               # An unknown name would otherwise silently produce the
+               # table name inat_escape_table_0.
+               if (!(ref in escape))
+                       semantic_error("No escape: " ref)
+               eid = escape[ref]
+               tname = sprintf("inat_escape_table_%d", eid)
+       }
+}
+
+# "AVXcode: <n>" line: record <n> as the AVX id of the current table.  This
+# rule also names the table "inat_primary_table" when neither an AVX id nor
+# an escape id has been set for it.
+/^AVXcode:/ {
+       if (NF != 1) {
+               # AVX/escape opcode table
+               aid = $2
+               # keep gaid one above the largest AVX id seen so far
+               if (gaid <= aid)
+                       gaid = aid + 1
+               if (tname == "")        # AVX only opcode table
+                       tname = sprintf("inat_avx_table_%d", $2)
+       }
+       if (aid == -1 && eid == -1)     # primary opcode table
+               tname = "inat_primary_table"
+}
+
+# "GrpTable: <GrpN>" line: echo it as a C comment and switch to the table of
+# that group, which must already have been referenced by an opcode line.
+/^GrpTable:/ {
+       printf "/* %s */\n", $0
+       if ($2 in group) {
+               gid = group[$2]
+               tname = "inat_group_table_" gid
+       } else
+               semantic_error("No group: " $2 )
+}
+
+# Print tbl as the C array definition "const insn_attr_t <name> = { ... };".
+#   tbl:  attribute strings, keyed by the formatted index
+#   name: declarator including the array size, e.g. "foo[SIZE]"
+#   fmt:  sprintf format that turns a numeric index into a key of tbl
+#   n:    number of indexes (0 .. n-1) to look up
+# Only non-empty entries are printed, as designated initializers.  i and id
+# are declared local so that a call never clobbers the global loop variable
+# i used by the pattern rules.
+function print_table(tbl,name,fmt,n,    i,id)
+{
+       print "const insn_attr_t " name " = {"
+       for (i = 0; i < n; i++) {
+               id = sprintf(fmt, i)
+               if (tbl[id])
+                       print " [" id "] = " tbl[id] ","
+       }
+       print "};"
+}
+
+# Print one variant of the current table, unless it is empty, and record its
+# name.  lpfx is the last-prefix index: 0 for the plain table, 1..3 for the
+# variants, whose names get a "_<lpfx>" suffix.  Group tables are recorded in
+# gtable; all others in etable and, when an AVX id is set, in atable too.
+function emit_table(tbl, lpfx,    name) {
+       if (array_size(tbl) == 0)
+               return
+       name = (lpfx == 0) ? tname : (tname "_" lpfx)
+       if (gid != -1) {
+               # print group tables
+               print_table(tbl, name "[INAT_GROUP_TABLE_SIZE]", "0x%x", 8)
+               gtable[gid,lpfx] = name
+       } else {
+               # print primary/escaped tables
+               print_table(tbl, name "[INAT_OPCODE_TABLE_SIZE]", "0x%02x", 256)
+               etable[eid,lpfx] = name
+               if (aid >= 0)
+                       atable[aid,lpfx] = name
+       }
+}
+
+# "EndTable" line: flush the plain table and its three last-prefix variants,
+# then reset the per-table state.
+/^EndTable/ {
+       emit_table(table, 0)
+       emit_table(lptable1, 1)
+       emit_table(lptable2, 2)
+       emit_table(lptable3, 3)
+       print ""
+       clear_vars()
+}
+
+# Join two flag expressions with " | "; an empty side is simply dropped.
+function add_flags(old,new) {
+       if (!old)
+               return new
+       if (!new)
+               return old
+       return old " | " new
+}
+
+# Convert the operands of one opcode to attribute flags.
+#   opnd: operand names as produced by split(), i.e. indexed 1, 2, ...
+# Returns the immediate flag (plus INAT_SCNDIMM for a second immediate, which
+# must be Ib) combined with INAT_MODRM if any operand needs a ModRM byte.
+# The operands are walked by index rather than with "for (i in opnd)": the
+# order of a for-in loop is unspecified, but the second-immediate check
+# depends on seeing the operands in their written order.
+function convert_operands(opnd,       i,imm,mod,n)
+{
+       imm = null
+       mod = null
+       for (n = 1; (n in opnd); n++) {
+               i  = opnd[n]
+               if (match(i, imm_expr) == 1) {
+                       if (!imm_flag[i])
+                               semantic_error("Unknown imm opnd: " i)
+                       if (imm) {
+                               if (i != "Ib")
+                                       semantic_error("Second IMM error")
+                               imm = add_flags(imm, "INAT_SCNDIMM")
+                       } else
+                               imm = imm_flag[i]
+               } else if (match(i, modrm_expr))
+                       mod = "INAT_MODRM"
+       }
+       return add_flags(imm, mod)
+}
+
+# Opcode line: "<hex index>: <opcode> [operands] [(ext)] [| <opcode> ...]".
+# Every alternative on the line is converted to attribute flags, which are
+# stored in table[] or, when the extension names a last prefix (66, F3 or
+# F2), in the matching lptable[].  An "escape" line registers a new escape id
+# instead.  The colon in the pattern is matched literally; it needs no
+# backslash.
+/^[0-9a-f]+:/ {
+       if (NR == 1)
+               next
+       # get index
+       idx = "0x" substr($1, 1, index($1,":") - 1)
+       if (idx in table)
+               semantic_error("Redefine " idx " in " tname)
+
+       # check if escaped opcode
+       if ("escape" == $2) {
+               if ($3 != "#")
+                       semantic_error("No escaped name")
+               # the escape name is the rest of the line, blanks removed
+               ref = ""
+               for (i = 4; i <= NF; i++)
+                       ref = ref $i
+               if (ref in escape)
+                       semantic_error("Redefine escape (" ref ")")
+               escape[ref] = geid
+               geid++
+               table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
+               next
+       }
+
+       variant = null
+       # converts
+       i = 2
+       while (i <= NF) {
+               opcode = $(i++)
+               delete opnds
+               ext = null
+               flags = null
+               opnd = null
+               # parse one opcode
+               if (match($i, opnd_expr)) {
+                       opnd = $i
+                       split($(i++), opnds, ",")
+                       flags = convert_operands(opnds)
+               }
+               if (match($i, ext_expr))
+                       ext = $(i++)
+               if (match($i, sep_expr))
+                       i++
+               else if (i < NF)
+                       semantic_error($i " is not a separator")
+
+               # check if group opcode
+               if (match(opcode, group_expr)) {
+                       # a group gets its id on first reference
+                       if (!(opcode in group)) {
+                               group[opcode] = ggid
+                               ggid++
+                       }
+                       flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
+               }
+               # check force(or default) 64bit
+               if (match(ext, force64_expr))
+                       flags = add_flags(flags, "INAT_FORCE64")
+
+               # check REX prefix
+               if (match(opcode, rex_expr))
+                       flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
+
+               # check coprocessor escape : TODO
+               if (match(opcode, fpu_expr))
+                       flags = add_flags(flags, "INAT_MODRM")
+
+               # check VEX only code
+               if (match(ext, vexonly_expr))
+                       flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
+
+               # check VEX ok code
+               if (match(ext, vexok_expr))
+                       flags = add_flags(flags, "INAT_VEXOK")
+
+               # check prefixes
+               if (match(ext, prefix_expr)) {
+                       if (!prefix_num[opcode])
+                               semantic_error("Unknown prefix: " opcode)
+                       flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
+               }
+               # an opcode without any attribute needs no table entry
+               if (length(flags) == 0)
+                       continue
+               # check if last prefix
+               if (match(ext, lprefix1_expr)) {
+                       lptable1[idx] = add_flags(lptable1[idx],flags)
+                       variant = "INAT_VARIANT"
+               } else if (match(ext, lprefix2_expr)) {
+                       lptable2[idx] = add_flags(lptable2[idx],flags)
+                       variant = "INAT_VARIANT"
+               } else if (match(ext, lprefix3_expr)) {
+                       lptable3[idx] = add_flags(lptable3[idx],flags)
+                       variant = "INAT_VARIANT"
+               } else {
+                       table[idx] = add_flags(table[idx],flags)
+               }
+       }
+       # mark the plain entry if any last-prefix variant exists for idx
+       if (variant)
+               table[idx] = add_flags(table[idx],variant)
+}
+
+# Print the three lookup arrays that map an [id][last-prefix index] pair to
+# the tables printed earlier.  The arrays are declared as const arrays of
+# pointers to const attributes ("const insn_attr_t * const"); repeating the
+# qualifier in front of the "*" would only qualify the attributes twice.
+END {
+       # nothing but the error message is printed if the awk check failed
+       if (awkchecked != "")
+               exit 1
+       # print escape opcode map's array
+       print "/* Escape opcode map array */"
+       print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \
+             "[INAT_LSTPFX_MAX + 1] = {"
+       for (i = 0; i < geid; i++)
+               for (j = 0; j < max_lprefix; j++)
+                       if (etable[i,j])
+                               print " ["i"]["j"] = "etable[i,j]","
+       print "};\n"
+       # print group opcode map's array
+       print "/* Group opcode map array */"
+       print "const insn_attr_t * const inat_group_tables[INAT_GRP_MAX + 1]"\
+             "[INAT_LSTPFX_MAX + 1] = {"
+       for (i = 0; i < ggid; i++)
+               for (j = 0; j < max_lprefix; j++)
+                       if (gtable[i,j])
+                               print " ["i"]["j"] = "gtable[i,j]","
+       print "};\n"
+       # print AVX opcode map's array
+       print "/* AVX opcode map array */"
+       print "const insn_attr_t * const inat_avx_tables[X86_VEX_M_MAX + 1]"\
+             "[INAT_LSTPFX_MAX + 1] = {"
+       for (i = 0; i < gaid; i++)
+               for (j = 0; j < max_lprefix; j++)
+                       if (atable[i,j])
+                               print " ["i"]["j"] = "atable[i,j]","
+       print "};"
+}
+
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
new file mode 100644 (file)
index 0000000..af75e07
--- /dev/null
@@ -0,0 +1,168 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+
+#define unlikely(cond) (cond)
+
+#include <asm/insn.h>
+#include <inat.c>
+#include <insn.c>
+
+/*
+ * Test of instruction analysis in general and insn_get_length() in
+ * particular.  See if insn_get_length() and the disassembler agree
+ * on the length of each instruction in an elf disassembly.
+ *
+ * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
+ */
+
+const char *prog;
+static int verbose;
+static int x86_64;
+
+/* Print the command line synopsis on stderr and terminate with status 1. */
+static void usage(void)
+{
+       fprintf(stderr,
+               "Usage: objdump -d a.out | awk -f distill.awk | %s [-y|-n] [-v] \n"
+               "\t-y   64bit mode\n"
+               "\t-n   32bit mode\n"
+               "\t-v   verbose mode\n",
+               prog);
+       exit(1);
+}
+
+/*
+ * Report an input line that could not be parsed, echo the line itself and
+ * terminate with status 3.
+ */
+static void malformed_line(const char *line, int line_nr)
+{
+       fprintf(stderr, "%s: malformed line %d:\n", prog, line_nr);
+       fputs(line, stderr);
+       exit(3);
+}
+
+/*
+ * Print one instruction field to fp as ".<name> = { ... }": its value, its
+ * first four bytes and its got/nbytes members.  Every output line is
+ * prefixed with indent.
+ */
+static void dump_field(FILE *fp, const char *name, const char *indent,
+                      struct insn_field *field)
+{
+       fprintf(fp,
+               "%s.%s = {\n"
+               "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n"
+               "%s\t.got = %d, .nbytes = %d},\n",
+               indent, name,
+               indent, field->value, field->bytes[0], field->bytes[1],
+               field->bytes[2], field->bytes[3],
+               indent, field->got, field->nbytes);
+}
+
+/*
+ * Print all fields of a decoded instruction to fp, for diagnosing a length
+ * mismatch.
+ */
+static void dump_insn(FILE *fp, struct insn *insn)
+{
+       fprintf(fp, "Instruction = { \n");
+       dump_field(fp, "prefixes", "\t",        &insn->prefixes);
+       dump_field(fp, "rex_prefix", "\t",      &insn->rex_prefix);
+       dump_field(fp, "vex_prefix", "\t",      &insn->vex_prefix);
+       dump_field(fp, "opcode", "\t",          &insn->opcode);
+       dump_field(fp, "modrm", "\t",           &insn->modrm);
+       dump_field(fp, "sib", "\t",             &insn->sib);
+       dump_field(fp, "displacement", "\t",    &insn->displacement);
+       dump_field(fp, "immediate1", "\t",      &insn->immediate1);
+       dump_field(fp, "immediate2", "\t",      &insn->immediate2);
+       fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
+               insn->attr, insn->opnd_bytes, insn->addr_bytes);
+       /* %p requires a pointer to void, hence the explicit conversion */
+       fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
+               insn->length, insn->x86_64, (const void *)insn->kaddr);
+}
+
+/*
+ * Parse the command line: -y and -n set and clear x86_64, -v sets verbose.
+ * Any other option prints the usage text and exits.  Also records argv[0]
+ * in prog for later messages.
+ */
+static void parse_args(int argc, char **argv)
+{
+       int opt;
+
+       prog = argv[0];
+       for (;;) {
+               opt = getopt(argc, argv, "ynv");
+               if (opt == -1)
+                       break;
+               if (opt == 'y')
+                       x86_64 = 1;
+               else if (opt == 'n')
+                       x86_64 = 0;
+               else if (opt == 'v')
+                       verbose = 1;
+               else
+                       usage();
+       }
+}
+
+#define BUFSIZE 256
+
+/*
+ * Read distilled objdump output from stdin and check, instruction by
+ * instruction, that insn_get_length() agrees with the number of hex bytes
+ * objdump printed.
+ *
+ * Lines that start with '<' are symbol lines and only remembered for error
+ * messages.  Every other line must look like "<addr>\t<hex bytes>\t<text>".
+ *
+ * Returns 0 if all instructions agree.  Exits with 2 on the first length
+ * mismatch; malformed input is reported through malformed_line().
+ */
+int main(int argc, char **argv)
+{
+       char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
+       unsigned char insn_buf[16];
+       struct insn insn;
+       int insns = 0;
+
+       parse_args(argc, argv);
+
+       while (fgets(line, BUFSIZE, stdin)) {
+               char copy[BUFSIZE], *s, *tab1, *tab2;
+               int nb = 0;
+               unsigned int b;
+
+               if (line[0] == '<') {
+                       /* Symbol line */
+                       strcpy(sym, line);
+                       continue;
+               }
+
+               insns++;
+               memset(insn_buf, 0, sizeof(insn_buf));
+               strcpy(copy, line);
+               tab1 = strchr(copy, '\t');
+               if (!tab1)
+                       malformed_line(line, insns);
+               s = tab1 + 1;
+               s += strspn(s, " ");
+               tab2 = strchr(s, '\t');
+               if (!tab2)
+                       malformed_line(line, insns);
+               *tab2 = '\0';   /* Characters beyond tab2 aren't examined */
+               /* Each byte is two hex digits followed by one separator */
+               while (s < tab2) {
+                       if (sscanf(s, "%x", &b) != 1)
+                               break;
+                       /* More bytes than insn_buf can hold: reject the line */
+                       if (nb >= (int)sizeof(insn_buf))
+                               malformed_line(line, insns);
+                       insn_buf[nb++] = (unsigned char) b;
+                       s += 3;
+               }
+               /* Decode an instruction */
+               insn_init(&insn, insn_buf, x86_64);
+               insn_get_length(&insn);
+               if (insn.length != nb) {
+                       fprintf(stderr, "Error: %s found a difference at %s\n",
+                               prog, sym);
+                       fprintf(stderr, "Error: %s", line);
+                       fprintf(stderr, "Error: objdump says %d bytes, but "
+                               "insn_get_length() says %d\n", nb,
+                               insn.length);
+                       if (verbose)
+                               dump_insn(stderr, &insn);
+                       exit(2);
+               }
+       }
+       fprintf(stderr, "Succeed: decoded and checked %d instructions\n",
+               insns);
+       return 0;
+}
index d11770472bc8b68987e4ee310d08814f2d363a39..43360c1d8f70a1d9bf7dc436bb1b82ee497eda6f 100644 (file)
@@ -117,12 +117,12 @@ struct ftrace_event_call {
        struct dentry           *dir;
        struct trace_event      *event;
        int                     enabled;
-       int                     (*regfunc)(void *);
-       void                    (*unregfunc)(void *);
+       int                     (*regfunc)(struct ftrace_event_call *);
+       void                    (*unregfunc)(struct ftrace_event_call *);
        int                     id;
-       int                     (*raw_init)(void);
-       int                     (*show_format)(struct ftrace_event_call *call,
-                                              struct trace_seq *s);
+       int                     (*raw_init)(struct ftrace_event_call *);
+       int                     (*show_format)(struct ftrace_event_call *,
+                                              struct trace_seq *);
        int                     (*define_fields)(struct ftrace_event_call *);
        struct list_head        fields;
        int                     filter_active;
@@ -131,14 +131,19 @@ struct ftrace_event_call {
        void                    *data;
 
        atomic_t                profile_count;
-       int                     (*profile_enable)(void);
-       void                    (*profile_disable)(void);
+       int                     (*profile_enable)(struct ftrace_event_call *);
+       void                    (*profile_disable)(struct ftrace_event_call *);
 };
 
 #define FTRACE_MAX_PROFILE_SIZE        2048
 
-extern char                    *trace_profile_buf;
-extern char                    *trace_profile_buf_nmi;
+/*
+ * Buffer for one profiled trace entry, of at most FTRACE_MAX_PROFILE_SIZE
+ * bytes, together with a counter its users increment to detect recursion.
+ */
+struct perf_trace_buf {
+       char    buf[FTRACE_MAX_PROFILE_SIZE];
+       int     recursion;
+};
+
+extern struct perf_trace_buf   *perf_trace_buf;
+extern struct perf_trace_buf   *perf_trace_buf_nmi;
 
 #define MAX_FILTER_PRED                32
 #define MAX_FILTER_STR_VAL     256     /* Should handle KSYM_SYMBOL_LEN */
@@ -157,11 +162,12 @@ enum {
        FILTER_PTR_STRING,
 };
 
-extern int trace_define_field(struct ftrace_event_call *call,
-                             const char *type, const char *name,
-                             int offset, int size, int is_signed,
-                             int filter_type);
 extern int trace_define_common_fields(struct ftrace_event_call *call);
+extern int trace_define_field(struct ftrace_event_call *call, const char *type,
+                             const char *name, int offset, int size,
+                             int is_signed, int filter_type);
+extern int trace_add_event_call(struct ftrace_event_call *call);
+extern void trace_remove_event_call(struct ftrace_event_call *call);
 
 #define is_signed_type(type)   (((type)(-1)) < 0)
 
index 3a46b7b7abb219c40bf39ce4d5f4e448da131212..1b672f74a32f0d76d27c277c759c6c3a24135674 100644 (file)
@@ -296,6 +296,8 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 int disable_kprobe(struct kprobe *kp);
 int enable_kprobe(struct kprobe *kp);
 
+void dump_kprobe(struct kprobe *kp);
+
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobes_built_in(void)
index a990ace1a8380f01901b742a6b0821aff46a5d9d..b50974a93af0b83d2c1e3a826f4bdc6a8848ba62 100644 (file)
@@ -100,23 +100,23 @@ struct perf_event_attr;
 
 #ifdef CONFIG_EVENT_PROFILE
 #define TRACE_SYS_ENTER_PROFILE(sname)                                        \
-static int prof_sysenter_enable_##sname(void)                                 \
+static int prof_sysenter_enable_##sname(struct ftrace_event_call *unused)      \
 {                                                                             \
        return reg_prof_syscall_enter("sys"#sname);                            \
 }                                                                             \
                                                                               \
-static void prof_sysenter_disable_##sname(void)                                       \
+static void prof_sysenter_disable_##sname(struct ftrace_event_call *unused)    \
 {                                                                             \
        unreg_prof_syscall_enter("sys"#sname);                                 \
 }
 
 #define TRACE_SYS_EXIT_PROFILE(sname)                                         \
-static int prof_sysexit_enable_##sname(void)                                  \
+static int prof_sysexit_enable_##sname(struct ftrace_event_call *unused)       \
 {                                                                             \
        return reg_prof_syscall_exit("sys"#sname);                             \
 }                                                                             \
                                                                               \
-static void prof_sysexit_disable_##sname(void)                                \
+static void prof_sysexit_disable_##sname(struct ftrace_event_call *unused)     \
 {                                                                              \
        unreg_prof_syscall_exit("sys"#sname);                                  \
 }
@@ -157,7 +157,7 @@ static void prof_sysexit_disable_##sname(void)                                     \
        struct trace_event enter_syscall_print_##sname = {              \
                .trace                  = print_syscall_enter,          \
        };                                                              \
-       static int init_enter_##sname(void)                             \
+       static int init_enter_##sname(struct ftrace_event_call *call)   \
        {                                                               \
                int num, id;                                            \
                num = syscall_name_to_nr("sys"#sname);                  \
@@ -193,7 +193,7 @@ static void prof_sysexit_disable_##sname(void)                                     \
        struct trace_event exit_syscall_print_##sname = {               \
                .trace                  = print_syscall_exit,           \
        };                                                              \
-       static int init_exit_##sname(void)                              \
+       static int init_exit_##sname(struct ftrace_event_call *call)    \
        {                                                               \
                int num, id;                                            \
                num = syscall_name_to_nr("sys"#sname);                  \
index c9bbcab95fbe7974c515ed52c85a8f8c208631c7..4945d1c998645548a818a939bf1ad30b0a497734 100644 (file)
@@ -402,12 +402,12 @@ static inline int ftrace_get_offsets_##call(                              \
                                                                        \
 static void ftrace_profile_##call(proto);                              \
                                                                        \
-static int ftrace_profile_enable_##call(void)                          \
+static int ftrace_profile_enable_##call(struct ftrace_event_call *unused)\
 {                                                                      \
        return register_trace_##call(ftrace_profile_##call);            \
 }                                                                      \
                                                                        \
-static void ftrace_profile_disable_##call(void)                                \
+static void ftrace_profile_disable_##call(struct ftrace_event_call *unused)\
 {                                                                      \
        unregister_trace_##call(ftrace_profile_##call);                 \
 }
@@ -426,7 +426,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     event_trace_printk(_RET_IP_, "<call>: " <fmt>);
  * }
  *
- * static int ftrace_reg_event_<call>(void)
+ * static int ftrace_reg_event_<call>(struct ftrace_event_call *unused)
  * {
  *     int ret;
  *
@@ -437,7 +437,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     return ret;
  * }
  *
- * static void ftrace_unreg_event_<call>(void)
+ * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
  * {
  *     unregister_trace_<call>(ftrace_event_<call>);
  * }
@@ -472,7 +472,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
  * }
  *
- * static int ftrace_raw_reg_event_<call>(void)
+ * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
  * {
  *     int ret;
  *
@@ -483,7 +483,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     return ret;
  * }
  *
- * static void ftrace_unreg_event_<call>(void)
+ * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
  * {
  *     unregister_trace_<call>(ftrace_raw_event_<call>);
  * }
@@ -492,7 +492,7 @@ static void ftrace_profile_disable_##call(void)                             \
  *     .trace                  = ftrace_raw_output_<call>, <-- stage 2
  * };
  *
- * static int ftrace_raw_init_event_<call>(void)
+ * static int ftrace_raw_init_event_<call>(struct ftrace_event_call *unused)
  * {
  *     int id;
  *
@@ -589,7 +589,7 @@ static void ftrace_raw_event_##call(proto)                          \
                                                  event, irq_flags, pc); \
 }                                                                      \
                                                                        \
-static int ftrace_raw_reg_event_##call(void *ptr)                      \
+static int ftrace_raw_reg_event_##call(struct ftrace_event_call *unused)\
 {                                                                      \
        int ret;                                                        \
                                                                        \
@@ -600,7 +600,7 @@ static int ftrace_raw_reg_event_##call(void *ptr)                   \
        return ret;                                                     \
 }                                                                      \
                                                                        \
-static void ftrace_raw_unreg_event_##call(void *ptr)                   \
+static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused)\
 {                                                                      \
        unregister_trace_##call(ftrace_raw_event_##call);               \
 }                                                                      \
@@ -609,7 +609,7 @@ static struct trace_event ftrace_event_type_##call = {                      \
        .trace                  = ftrace_raw_output_##call,             \
 };                                                                     \
                                                                        \
-static int ftrace_raw_init_event_##call(void)                          \
+static int ftrace_raw_init_event_##call(struct ftrace_event_call *unused)\
 {                                                                      \
        int id;                                                         \
                                                                        \
@@ -649,6 +649,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {         \
  *     struct ftrace_event_call *event_call = &event_<call>;
  *     extern void perf_tp_event(int, u64, u64, void *, int);
  *     struct ftrace_raw_##call *entry;
+ *     struct perf_trace_buf *trace_buf;
  *     u64 __addr = 0, __count = 1;
  *     unsigned long irq_flags;
  *     struct trace_entry *ent;
@@ -673,14 +674,25 @@ __attribute__((section("_ftrace_events"))) event_##call = {               \
  *     __cpu = smp_processor_id();
  *
  *     if (in_nmi())
- *             raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *             trace_buf = rcu_dereference(perf_trace_buf_nmi);
  *     else
- *             raw_data = rcu_dereference(trace_profile_buf);
+ *             trace_buf = rcu_dereference(perf_trace_buf);
  *
- *     if (!raw_data)
+ *     if (!trace_buf)
  *             goto end;
  *
- *     raw_data = per_cpu_ptr(raw_data, __cpu);
+ *     trace_buf = per_cpu_ptr(trace_buf, __cpu);
+ *
+ *     // Avoid recursion from perf that could mess up the buffer
+ *     if (trace_buf->recursion++)
+ *             goto end_recursion;
+ *
+ *     raw_data = trace_buf->buf;
+ *
+ *     // Make recursion update visible before entering perf_tp_event
+ *     // so that we protect from perf recursions.
+ *
+ *     barrier();
  *
  *     //zero dead bytes from alignment to avoid stack leak to userspace:
  *     *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
@@ -713,8 +725,9 @@ static void ftrace_profile_##call(proto)                            \
 {                                                                      \
        struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
        struct ftrace_event_call *event_call = &event_##call;           \
-       extern void perf_tp_event(int, u64, u64, void *, int);  \
+       extern void perf_tp_event(int, u64, u64, void *, int);          \
        struct ftrace_raw_##call *entry;                                \
+       struct perf_trace_buf *trace_buf;                               \
        u64 __addr = 0, __count = 1;                                    \
        unsigned long irq_flags;                                        \
        struct trace_entry *ent;                                        \
@@ -739,14 +752,20 @@ static void ftrace_profile_##call(proto)                          \
        __cpu = smp_processor_id();                                     \
                                                                        \
        if (in_nmi())                                                   \
-               raw_data = rcu_dereference(trace_profile_buf_nmi);              \
+               trace_buf = rcu_dereference(perf_trace_buf_nmi);        \
        else                                                            \
-               raw_data = rcu_dereference(trace_profile_buf);          \
+               trace_buf = rcu_dereference(perf_trace_buf);            \
                                                                        \
-       if (!raw_data)                                                  \
+       if (!trace_buf)                                                 \
                goto end;                                               \
                                                                        \
-       raw_data = per_cpu_ptr(raw_data, __cpu);                        \
+       trace_buf = per_cpu_ptr(trace_buf, __cpu);                      \
+       if (trace_buf->recursion++)                                     \
+               goto end_recursion;                                     \
+                                                                       \
+       barrier();                                                      \
+                                                                       \
+       raw_data = trace_buf->buf;                                      \
                                                                        \
        *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;         \
        entry = (struct ftrace_raw_##call *)raw_data;                   \
@@ -761,6 +780,8 @@ static void ftrace_profile_##call(proto)                            \
        perf_tp_event(event_call->id, __addr, __count, entry,           \
                             __entry_size);                             \
                                                                        \
+end_recursion:                                                         \
+       trace_buf->recursion--;                                         \
 end:                                                                   \
        local_irq_restore(irq_flags);                                   \
                                                                        \
index e972f0a40f8d02af648863793adb7e8383673f7b..51ee17d3632a4800dd06fa879a7d6bf92605ea71 100644 (file)
@@ -39,16 +39,19 @@ void set_syscall_enter_id(int num, int id);
 void set_syscall_exit_id(int num, int id);
 extern struct trace_event event_syscall_enter;
 extern struct trace_event event_syscall_exit;
-extern int reg_event_syscall_enter(void *ptr);
-extern void unreg_event_syscall_enter(void *ptr);
-extern int reg_event_syscall_exit(void *ptr);
-extern void unreg_event_syscall_exit(void *ptr);
+
 extern int syscall_enter_format(struct ftrace_event_call *call,
                                struct trace_seq *s);
 extern int syscall_exit_format(struct ftrace_event_call *call,
                                struct trace_seq *s);
 extern int syscall_enter_define_fields(struct ftrace_event_call *call);
 extern int syscall_exit_define_fields(struct ftrace_event_call *call);
+extern int reg_event_syscall_enter(struct ftrace_event_call *call);
+extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
+extern int reg_event_syscall_exit(struct ftrace_event_call *call);
+extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
+extern int
+ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
 enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags);
 #endif
index 5240d75f4c60e95f95eb55e0cc46b448e2d6eb7c..84495958e703366eef86906d1e0ff4aa1cb2d2cd 100644 (file)
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
  */
 static struct kprobe_blackpoint kprobe_blacklist[] = {
        {"preempt_schedule",},
+       {"native_get_debugreg",},
+       {"irq_entries_start",},
+       {"common_interrupt",},
        {NULL}    /* Terminator */
 };
 
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
        return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
 
+/*
+ * Check that the passed kprobe is valid and return the kprobe found in
+ * kprobe_table for p->addr.
+ *
+ * Returns the entry get_kprobe() finds for p->addr when that entry is p
+ * itself or has p on its ->list; returns NULL otherwise.
+ */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+       struct kprobe *old_p, *list_p;
+
+       old_p = get_kprobe(p->addr);
+       if (unlikely(!old_p))
+               return NULL;    /* nothing is registered at p->addr */
+
+       if (p != old_p) {       /* p may still hang off old_p's list */
+               list_for_each_entry_rcu(list_p, &old_p->list, list)
+                       if (list_p == p)
+                       /* kprobe p is a valid probe */
+                               goto valid;
+               return NULL;    /* p was not found on the list */
+       }
+valid:
+       return old_p;
+}
+
+/*
+ * Reject a kprobe that is being re-registered.
+ *
+ * Looks p up via __get_valid_kprobe() while holding kprobe_mutex and
+ * returns -EINVAL when it is found, 0 otherwise.
+ */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+       int already_registered;
+
+       mutex_lock(&kprobe_mutex);
+       already_registered = __get_valid_kprobe(p) != NULL;
+       mutex_unlock(&kprobe_mutex);
+
+       return already_registered ? -EINVAL : 0;
+}
+
 int __kprobes register_kprobe(struct kprobe *p)
 {
        int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
                return -EINVAL;
        p->addr = addr;
 
+       ret = check_kprobe_rereg(p);
+       if (ret)
+               return ret;
+
        preempt_disable();
        if (!kernel_text_address((unsigned long) p->addr) ||
            in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
-       struct kprobe *old_p, *list_p;
-
-       old_p = get_kprobe(p->addr);
-       if (unlikely(!old_p))
-               return NULL;
-
-       if (p != old_p) {
-               list_for_each_entry_rcu(list_p, &old_p->list, list)
-                       if (list_p == p)
-                       /* kprobe p is a valid probe */
-                               goto valid;
-               return NULL;
-       }
-valid:
-       return old_p;
-}
-
 /*
  * Unregister a kprobe without a scheduler synchronization.
  */
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
        arch_remove_kprobe(p);
 }
 
+/*
+ * Print the symbol name, address and offset of @kp to the kernel log at
+ * KERN_WARNING level.
+ */
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+       printk(KERN_WARNING "Dumping kprobe:\n");
+       printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+              kp->symbol_name, kp->addr, kp->offset);
+}
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
                                             unsigned long val, void *data)
index 61d5aa5eced3466393582e4f566b63c468ea7cc3..acd24e7643eb8ebdb9a0386b65931294b37c7eaf 100644 (file)
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
               struct pt_regs *regs, long err, int trap, int sig)
 {
        struct die_args args = {
index b416512ad17ff77eea13b6b391d907f32f76143b..f05671609a897dba01bd32641e301d418e490849 100644 (file)
@@ -428,6 +428,23 @@ config BLK_DEV_IO_TRACE
 
          If unsure, say N.
 
+config KPROBE_EVENT
+       depends on KPROBES
+       depends on X86
+       bool "Enable kprobes-based dynamic events"
+       select TRACING
+       default y
+       help
+         This allows the user to add tracing events (similar to tracepoints) on the fly
+         via the ftrace interface. See Documentation/trace/kprobetrace.txt
+         for more details.
+
+         Those events can be inserted wherever kprobes can probe, and record
+         various register and memory values.
+
+         This option is also required by the perf-probe subcommand of perf tools.
+         If you want to use perf tools, this option is strongly recommended.
+
 config DYNAMIC_FTRACE
        bool "enable/disable ftrace tracepoints dynamically"
        depends on FUNCTION_TRACER
index 26f03ac07c2bc2164ce809cea5a48fce15d09ff1..edc3a3cca1a16cbd4199a0b8035550f60f7ba593 100644 (file)
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 
 libftrace-y := ftrace.o
index 4959ada9e0bbd38e07407637656d15d2afd83b9c..b4e4212e66d7d6905d835c425387aad291d6eb30 100644 (file)
@@ -101,6 +101,29 @@ struct syscall_trace_exit {
        unsigned long           ret;
 };
 
+/*
+ * Trace record with a variable number of trailing argument words.
+ * NOTE(review): presumably the record emitted for a kprobe hit - confirm
+ * against the users in trace_kprobe.c.
+ */
+struct kprobe_trace_entry {
+       struct trace_entry      ent;    /* common trace entry header */
+       unsigned long           ip;
+       int                     nargs;  /* presumably the number of valid args[] - confirm */
+       unsigned long           args[]; /* flexible array, sized per record */
+};
+
+/* Size in bytes of a struct kprobe_trace_entry carrying n args[] words. */
+#define SIZEOF_KPROBE_TRACE_ENTRY(n)                   \
+       (offsetof(struct kprobe_trace_entry, args) +    \
+       (sizeof(unsigned long) * (n)))
+
+/*
+ * Trace record with a variable number of trailing argument words; unlike
+ * struct kprobe_trace_entry it carries two addresses (func and ret_ip).
+ * NOTE(review): presumably the record emitted for a kretprobe hit - confirm.
+ */
+struct kretprobe_trace_entry {
+       struct trace_entry      ent;    /* common trace entry header */
+       unsigned long           func;
+       unsigned long           ret_ip;
+       int                     nargs;  /* presumably the number of valid args[] - confirm */
+       unsigned long           args[]; /* flexible array, sized per record */
+};
+
+/* Size in bytes of a struct kretprobe_trace_entry carrying n args[] words. */
+#define SIZEOF_KRETPROBE_TRACE_ENTRY(n)                        \
+       (offsetof(struct kretprobe_trace_entry, args) + \
+       (sizeof(unsigned long) * (n)))
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
index 8d5c171cc9987d924f9fcfd3328fcb966288b9a0..e0d351b01f5ac9bacf230cf077cb1e5cbb82489a 100644 (file)
@@ -8,44 +8,39 @@
 #include <linux/module.h>
 #include "trace.h"
 
-/*
- * We can't use a size but a type in alloc_percpu()
- * So let's create a dummy type that matches the desired size
- */
-typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
 
-char           *trace_profile_buf;
-EXPORT_SYMBOL_GPL(trace_profile_buf);
+struct perf_trace_buf *perf_trace_buf;
+EXPORT_SYMBOL_GPL(perf_trace_buf);
 
-char           *trace_profile_buf_nmi;
-EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
+struct perf_trace_buf *perf_trace_buf_nmi;
+EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
 
 /* Count the events in use (per event id, not per instance) */
 static int     total_profile_count;
 
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
-       char *buf;
+       struct perf_trace_buf *buf;
        int ret = -ENOMEM;
 
        if (atomic_inc_return(&event->profile_count))
                return 0;
 
        if (!total_profile_count) {
-               buf = (char *)alloc_percpu(profile_buf_t);
+               buf = alloc_percpu(struct perf_trace_buf);
                if (!buf)
                        goto fail_buf;
 
-               rcu_assign_pointer(trace_profile_buf, buf);
+               rcu_assign_pointer(perf_trace_buf, buf);
 
-               buf = (char *)alloc_percpu(profile_buf_t);
+               buf = alloc_percpu(struct perf_trace_buf);
                if (!buf)
                        goto fail_buf_nmi;
 
-               rcu_assign_pointer(trace_profile_buf_nmi, buf);
+               rcu_assign_pointer(perf_trace_buf_nmi, buf);
        }
 
-       ret = event->profile_enable();
+       ret = event->profile_enable(event);
        if (!ret) {
                total_profile_count++;
                return 0;
@@ -53,10 +48,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 
 fail_buf_nmi:
        if (!total_profile_count) {
-               free_percpu(trace_profile_buf_nmi);
-               free_percpu(trace_profile_buf);
-               trace_profile_buf_nmi = NULL;
-               trace_profile_buf = NULL;
+               free_percpu(perf_trace_buf_nmi);
+               free_percpu(perf_trace_buf);
+               perf_trace_buf_nmi = NULL;
+               perf_trace_buf = NULL;
        }
 fail_buf:
        atomic_dec(&event->profile_count);
@@ -84,19 +79,19 @@ int ftrace_profile_enable(int event_id)
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
-       char *buf, *nmi_buf;
+       struct perf_trace_buf *buf, *nmi_buf;
 
        if (!atomic_add_negative(-1, &event->profile_count))
                return;
 
-       event->profile_disable();
+       event->profile_disable(event);
 
        if (!--total_profile_count) {
-               buf = trace_profile_buf;
-               rcu_assign_pointer(trace_profile_buf, NULL);
+               buf = perf_trace_buf;
+               rcu_assign_pointer(perf_trace_buf, NULL);
 
-               nmi_buf = trace_profile_buf_nmi;
-               rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+               nmi_buf = perf_trace_buf_nmi;
+               rcu_assign_pointer(perf_trace_buf_nmi, NULL);
 
                /*
                 * Ensure every events in profiling have finished before
index 7c18d154ea28e0e295fe580099e25849a9626431..1d18315dc836e6e15d50cc8d17b4eacec56fd6ba 100644 (file)
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(trace_define_common_fields);
 
-#ifdef CONFIG_MODULES
-
-static void trace_destroy_fields(struct ftrace_event_call *call)
+void trace_destroy_fields(struct ftrace_event_call *call)
 {
        struct ftrace_event_field *field, *next;
 
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
        }
 }
 
-#endif /* CONFIG_MODULES */
-
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
                                        int enable)
 {
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
                if (call->enabled) {
                        call->enabled = 0;
                        tracing_stop_cmdline_record();
-                       call->unregfunc(call->data);
+                       call->unregfunc(call);
                }
                break;
        case 1:
                if (!call->enabled) {
                        call->enabled = 1;
                        tracing_start_cmdline_record();
-                       call->regfunc(call->data);
+                       call->regfunc(call);
                }
                break;
        }
@@ -937,27 +933,46 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
        return 0;
 }
 
-#define for_each_event(event, start, end)                      \
-       for (event = start;                                     \
-            (unsigned long)event < (unsigned long)end;         \
-            event++)
+static int __trace_add_event_call(struct ftrace_event_call *call)
+{
+       struct dentry *d_events;
+       int ret;
 
-#ifdef CONFIG_MODULES
+       if (!call->name)
+               return -EINVAL;
 
-static LIST_HEAD(ftrace_module_file_list);
+       if (call->raw_init) {
+               ret = call->raw_init(call);
+               if (ret < 0) {
+                       if (ret != -ENOSYS)
+                               pr_warning("Could not initialize trace "
+                               "events/%s\n", call->name);
+                       return ret;
+               }
+       }
 
-/*
- * Modules must own their file_operations to keep up with
- * reference counting.
- */
-struct ftrace_module_file_ops {
-       struct list_head                list;
-       struct module                   *mod;
-       struct file_operations          id;
-       struct file_operations          enable;
-       struct file_operations          format;
-       struct file_operations          filter;
-};
+       d_events = event_trace_events_dir();
+       if (!d_events)
+               return -ENOENT;
+
+       ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+                               &ftrace_enable_fops, &ftrace_event_filter_fops,
+                               &ftrace_event_format_fops);
+       if (!ret)
+               list_add(&call->list, &ftrace_events);
+
+       return ret;
+}
+
+/*
+ * Add an additional event_call dynamically.
+ *
+ * Locked wrapper: runs __trace_add_event_call() under event_mutex and
+ * hands its return value back to the caller.
+ */
+int trace_add_event_call(struct ftrace_event_call *call)
+{
+       int err;
+
+       mutex_lock(&event_mutex);
+       err = __trace_add_event_call(call);
+       mutex_unlock(&event_mutex);
+
+       return err;
+}
 
 static void remove_subsystem_dir(const char *name)
 {
@@ -985,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
        }
 }
 
+/*
+ * Tear down one event_call: disable it, unregister its trace_event (if it
+ * has one), remove its debugfs directory, unlink it from the list it is on,
+ * then call trace_destroy_fields(), destroy_preds() and
+ * remove_subsystem_dir() for it.
+ *
+ * Must be called with both event_mutex and trace_event_mutex held.
+ */
+static void __trace_remove_event_call(struct ftrace_event_call *call)
+{
+       ftrace_event_enable_disable(call, 0);
+       if (call->event)
+               __unregister_ftrace_event(call->event);
+       debugfs_remove_recursive(call->dir);
+       list_del(&call->list);
+       trace_destroy_fields(call);
+       destroy_preds(call);
+       remove_subsystem_dir(call->system);
+}
+
+/*
+ * Remove an event_call.
+ *
+ * Takes event_mutex, then trace_event_mutex for writing, around
+ * __trace_remove_event_call(), and releases them in reverse order.
+ */
+void trace_remove_event_call(struct ftrace_event_call *call)
+{
+       mutex_lock(&event_mutex);
+       down_write(&trace_event_mutex);
+       __trace_remove_event_call(call);
+       up_write(&trace_event_mutex);
+       mutex_unlock(&event_mutex);
+}
+
+#define for_each_event(event, start, end)                      \
+       for (event = start;                                     \
+            (unsigned long)event < (unsigned long)end;         \
+            event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+       struct list_head                list;
+       struct module                   *mod;
+       struct file_operations          id;
+       struct file_operations          enable;
+       struct file_operations          format;
+       struct file_operations          filter;
+};
+
 static struct ftrace_module_file_ops *
 trace_create_file_ops(struct module *mod)
 {
@@ -1042,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
                if (!call->name)
                        continue;
                if (call->raw_init) {
-                       ret = call->raw_init();
+                       ret = call->raw_init(call);
                        if (ret < 0) {
                                if (ret != -ENOSYS)
                                        pr_warning("Could not initialize trace "
@@ -1060,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
                                return;
                }
                call->mod = mod;
-               list_add(&call->list, &ftrace_events);
-               event_create_dir(call, d_events,
-                                &file_ops->id, &file_ops->enable,
-                                &file_ops->filter, &file_ops->format);
+               ret = event_create_dir(call, d_events,
+                                      &file_ops->id, &file_ops->enable,
+                                      &file_ops->filter, &file_ops->format);
+               if (!ret)
+                       list_add(&call->list, &ftrace_events);
        }
 }
 
@@ -1077,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
        list_for_each_entry_safe(call, p, &ftrace_events, list) {
                if (call->mod == mod) {
                        found = true;
-                       ftrace_event_enable_disable(call, 0);
-                       if (call->event)
-                               __unregister_ftrace_event(call->event);
-                       debugfs_remove_recursive(call->dir);
-                       list_del(&call->list);
-                       trace_destroy_fields(call);
-                       destroy_preds(call);
-                       remove_subsystem_dir(call->system);
+                       __trace_remove_event_call(call);
                }
        }
 
@@ -1202,7 +1258,7 @@ static __init int event_trace_init(void)
                if (!call->name)
                        continue;
                if (call->raw_init) {
-                       ret = call->raw_init();
+                       ret = call->raw_init(call);
                        if (ret < 0) {
                                if (ret != -ENOSYS)
                                        pr_warning("Could not initialize trace "
@@ -1210,10 +1266,12 @@ static __init int event_trace_init(void)
                                continue;
                        }
                }
-               list_add(&call->list, &ftrace_events);
-               event_create_dir(call, d_events, &ftrace_event_id_fops,
-                                &ftrace_enable_fops, &ftrace_event_filter_fops,
-                                &ftrace_event_format_fops);
+               ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
+                                      &ftrace_enable_fops,
+                                      &ftrace_event_filter_fops,
+                                      &ftrace_event_format_fops);
+               if (!ret)
+                       list_add(&call->list, &ftrace_events);
        }
 
        while (true) {
index 31da218ee10f317e6076eaa016e2b28eff599027..934d81fb4ca46b2c8b654d8b7ac234b87f9ab481 100644 (file)
@@ -134,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused,                      \
 
 #include "trace_entries.h"
 
-
 #undef __field
 #define __field(type, item)                                            \
        ret = trace_define_field(event_call, #type, #item,              \
@@ -196,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)  \
 
 #include "trace_entries.h"
 
+/*
+ * Common raw_init callback: initializes the list head of @call's fields.
+ * Always returns 0.
+ */
+static int ftrace_raw_init_event(struct ftrace_event_call *call)
+{
+       INIT_LIST_HEAD(&call->fields);
+       return 0;
+}
 
 #undef __field
 #define __field(type, item)
@@ -214,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)   \
 
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, type, tstruct, print)          \
-static int ftrace_raw_init_event_##call(void);                         \
                                                                        \
 struct ftrace_event_call __used                                                \
 __attribute__((__aligned__(4)))                                                \
@@ -222,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = {                \
        .name                   = #call,                                \
        .id                     = type,                                 \
        .system                 = __stringify(TRACE_SYSTEM),            \
-       .raw_init               = ftrace_raw_init_event_##call,         \
+       .raw_init               = ftrace_raw_init_event,                \
        .show_format            = ftrace_format_##call,                 \
        .define_fields          = ftrace_define_fields_##call,          \
 };                                                                     \
-static int ftrace_raw_init_event_##call(void)                          \
-{                                                                      \
-       INIT_LIST_HEAD(&event_##call.fields);                           \
-       return 0;                                                       \
-}                                                                      \
 
 #include "trace_entries.h"
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644 (file)
index 0000000..3696476
--- /dev/null
@@ -0,0 +1,1513 @@
+/*
+ * Kprobes-based tracing events
+ *
+ * Created by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/ptrace.h>
+#include <linux/perf_event.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define KPROBE_EVENT_SYSTEM "kprobes"
+
+/*
+ * Reserved field names: the probe-specific names defined here plus the
+ * common_* names listed in reserved_field_names[] below.
+ */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_NARGS "__probe_nargs"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+
+const char *reserved_field_names[] = {
+       "common_type",
+       "common_flags",
+       "common_preempt_count",
+       "common_pid",
+       "common_tgid",
+       "common_lock_depth",
+       FIELD_STRING_IP,
+       FIELD_STRING_NARGS,
+       FIELD_STRING_RETIP,
+       FIELD_STRING_FUNC,
+};      /* NOTE(review): presumably checked against user-supplied argument names - confirm */
+
+/* A fetch handler together with the private data that is passed to it. */
+struct fetch_func {
+       unsigned long (*func)(struct pt_regs *, void *);        /* handler */
+       void *data;     /* second argument given to func, see call_fetch() */
+};
+
+/* Run fetch handler @f on @regs, handing it the handler's private data. */
+static __kprobes unsigned long call_fetch(struct fetch_func *f,
+                                         struct pt_regs *regs)
+{
+       void *priv = f->data;
+
+       return f->func(regs, priv);
+}
+
+/* fetch handlers */
+
+/*
+ * Fetch a register value: @offset is an integer smuggled through the void
+ * pointer and is handed to regs_get_register() as an unsigned int.
+ */
+static __kprobes unsigned long fetch_register(struct pt_regs *regs,
+                                             void *offset)
+{
+       unsigned int reg_offset = (unsigned int)((unsigned long)offset);
+
+       return regs_get_register(regs, reg_offset);
+}
+
+/*
+ * Fetch an entry of the kernel stack: @num is an integer smuggled through
+ * the void pointer and is handed to regs_get_kernel_stack_nth().
+ */
+static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
+                                          void *num)
+{
+       unsigned int nth = (unsigned int)((unsigned long)num);
+
+       return regs_get_kernel_stack_nth(regs, nth);
+}
+
+/*
+ * Fetch one unsigned long from kernel address @addr via
+ * probe_kernel_address(); yields 0 when that access reports failure.
+ */
+static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
+{
+       unsigned long value;
+
+       return probe_kernel_address(addr, value) ? 0 : value;
+}
+
+/*
+ * Fetch a function argument: @num is an integer smuggled through the void
+ * pointer and is handed to regs_get_argument_nth().
+ */
+static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
+{
+       unsigned int nth = (unsigned int)((unsigned long)num);
+
+       return regs_get_argument_nth(regs, nth);
+}
+
+/* Fetch what regs_return_value() reports for @regs; @dummy is unused. */
+static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
+                                             void *dummy)
+{
+       unsigned long retval = regs_return_value(regs);
+
+       return retval;
+}
+
+/* Fetch what kernel_stack_pointer() reports for @regs; @dummy is unused. */
+static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
+                                                  void *dummy)
+{
+       unsigned long sp = kernel_stack_pointer(regs);
+
+       return sp;
+}
+
+/*
+ * Memory fetching by symbol: caches the address resolved for
+ * "symbol + offset" (see update_symbol_cache()).
+ */
+struct symbol_cache {
+       char *symbol;           /* kstrdup()ed symbol name, freed in free_symbol_cache() */
+       long offset;            /* added to the looked-up symbol address */
+       unsigned long addr;     /* resolved address, 0 if the lookup failed */
+};
+
+/*
+ * Re-resolve sc->symbol through kallsyms_lookup_name() and store the result
+ * plus sc->offset in sc->addr; a failed lookup stores 0.
+ * Returns the new sc->addr.
+ */
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+       unsigned long base = (unsigned long)kallsyms_lookup_name(sc->symbol);
+
+       sc->addr = base ? base + sc->offset : 0;
+       return sc->addr;
+}
+
+/* Release a symbol_cache: first its duplicated symbol name, then itself. */
+static void free_symbol_cache(struct symbol_cache *sc)
+{
+       char *name = sc->symbol;
+
+       kfree(name);
+       kfree(sc);
+}
+
+/*
+ * Allocate a symbol_cache for @sym + @offset and resolve it once.
+ *
+ * Returns NULL when @sym is NULL or empty, or when an allocation fails.
+ * The result is released with free_symbol_cache().
+ */
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+       struct symbol_cache *cache;
+
+       if (!sym || sym[0] == '\0')
+               return NULL;
+
+       cache = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+       if (!cache)
+               return NULL;
+
+       cache->symbol = kstrdup(sym, GFP_KERNEL);
+       if (!cache->symbol)
+               goto out_free;
+
+       cache->offset = offset;
+       update_symbol_cache(cache);
+       return cache;
+
+out_free:
+       kfree(cache);
+       return NULL;
+}
+
+/*
+ * Fetch handler for symbol-based memory access: @data is a symbol_cache.
+ * Reads memory at the cached address, or yields 0 when none is cached.
+ */
+static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
+{
+       struct symbol_cache *cache = data;
+
+       if (!cache->addr)
+               return 0;
+
+       return fetch_memory(regs, (void *)cache->addr);
+}
+
+/*
+ * Special indirect memory access interface: fetch a base value with @orig,
+ * add @offset and read memory there (see fetch_indirect()).
+ */
+struct indirect_fetch_data {
+       struct fetch_func orig; /* inner fetch that produces the base address */
+       long offset;            /* added to the base before dereferencing */
+};
+
+/*
+ * Fetch handler for indirect access: @data is an indirect_fetch_data.
+ * Runs the inner fetch to get a base address; a zero base yields 0,
+ * otherwise memory at base + offset is read via fetch_memory().
+ */
+static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
+{
+       struct indirect_fetch_data *ind = data;
+       unsigned long base = call_fetch(&ind->orig, regs);
+
+       if (!base)
+               return 0;
+
+       return fetch_memory(regs, (void *)(base + ind->offset));
+}
+
+/*
+ * Free an indirect_fetch_data together with what its inner fetch owns:
+ * a nested indirect_fetch_data is freed recursively, a symbol_cache via
+ * free_symbol_cache(); other inner handlers have nothing to release here.
+ */
+static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
+{
+       struct fetch_func *inner = &data->orig;
+
+       if (inner->func == fetch_indirect) {
+               free_indirect_fetch_data(inner->data);
+       } else if (inner->func == fetch_symbol) {
+               free_symbol_cache(inner->data);
+       }
+
+       kfree(data);
+}
+
+/**
+ * Kprobe event core functions
+ */
+
+/* One probe argument: how to fetch it and the name it is given. */
+struct probe_arg {
+       struct fetch_func       fetch;  /* handler + data used to obtain the value */
+       const char              *name;
+};
+
+/* Flags for trace_probe (stored in trace_probe.flags) */
+#define TP_FLAG_TRACE  1
+#define TP_FLAG_PROFILE        2
+
+/*
+ * One kprobe-based event. Allocated with room for its args[], see
+ * SIZEOF_TRACE_PROBE().
+ */
+struct trace_probe {
+       struct list_head        list;   /* presumably links into probe_list - confirm */
+       struct kretprobe        rp;     /* Use rp.kp for kprobe use */
+       unsigned long           nhit;
+       unsigned int            flags;  /* For TP_FLAG_* */
+       const char              *symbol;        /* symbol name */
+       struct ftrace_event_call        call;
+       struct trace_event              event;
+       unsigned int            nr_args;        /* presumably the number of used args[] - confirm */
+       struct probe_arg        args[]; /* flexible array of argument descriptors */
+};
+
+/* Size in bytes of a struct trace_probe with room for n args[] entries. */
+#define SIZEOF_TRACE_PROBE(n)                  \
+       (offsetof(struct trace_probe, args) +   \
+       (sizeof(struct probe_arg) * (n)))
+
+/* Returns 1 when @tp has a kretprobe handler set (tp->rp.handler), else 0. */
+static __kprobes int probe_is_return(struct trace_probe *tp)
+{
+       if (tp->rp.handler)
+               return 1;
+
+       return 0;
+}
+
+/* Returns @tp's symbol name, or the literal "unknown" when none is set. */
+static __kprobes const char *probe_symbol(struct trace_probe *tp)
+{
+       if (!tp->symbol)
+               return "unknown";
+
+       return tp->symbol;
+}
+
+/*
+ * Format the textual form of fetch descriptor @ff into @buf (size @n).
+ *
+ * Indirect fetches are rendered as "<offset>(<inner>)" by recursing on the
+ * inner descriptor.
+ *
+ * Returns the number of characters written (excluding the trailing NUL),
+ * -ENOSPC when the text does not fit in @n bytes, or -EINVAL when the
+ * handler in @ff is not one of the known fetch functions.
+ */
+static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
+{
+       int ret = -EINVAL;
+
+       if (ff->func == fetch_argument)
+               ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
+       else if (ff->func == fetch_register) {
+               const char *name;
+               name = regs_query_register_name((unsigned int)((long)ff->data));
+               ret = snprintf(buf, n, "%%%s", name);
+       } else if (ff->func == fetch_stack)
+               ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
+       else if (ff->func == fetch_memory)
+               ret = snprintf(buf, n, "@0x%p", ff->data);
+       else if (ff->func == fetch_symbol) {
+               struct symbol_cache *sc = ff->data;
+               ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
+       } else if (ff->func == fetch_retvalue)
+               ret = snprintf(buf, n, "$retval");
+       else if (ff->func == fetch_stack_address)
+               ret = snprintf(buf, n, "$stack");
+       else if (ff->func == fetch_indirect) {
+               struct indirect_fetch_data *id = ff->data;
+               size_t l = 0;
+               ret = snprintf(buf, n, "%+ld(", id->offset);
+               if (ret < 0 || (size_t)ret >= n)
+                       goto end;
+               l += ret;
+               /* l < n here, so n - l cannot wrap */
+               ret = probe_arg_string(buf + l, n - l, &id->orig);
+               if (ret < 0)
+                       goto end;
+               l += ret;
+               ret = snprintf(buf + l, n - l, ")");
+               ret += l;
+       }
+end:
+       /*
+        * Check the sign before comparing against the unsigned size: a
+        * negative errno converted to size_t would otherwise always look
+        * ">= n" and be misreported as -ENOSPC.
+        */
+       if (ret < 0)
+               return ret;
+       if ((size_t)ret >= n)
+               return -ENOSPC;
+       return ret;
+}
+
+/* Event (un)registration helpers, defined later in this file */
+static int register_probe_event(struct trace_probe *tp);
+static void unregister_probe_event(struct trace_probe *tp);
+
+/* probe_lock is held while probe_list is modified or walked */
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+/* Handlers installed into the kprobe/kretprobe by alloc_trace_probe() */
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+                               struct pt_regs *regs);
+
+/*
+ * Allocate new trace_probe and initialize it (including kprobes).
+ *
+ * @group:     event group (system) name, duplicated; must not be NULL
+ * @event:     event name, duplicated; must not be NULL
+ * @addr:      probe address, used only when @symbol is NULL
+ * @symbol:    symbol name to probe, duplicated; may be NULL
+ * @offs:      offset from @symbol
+ * @nargs:     number of probe_arg slots to reserve
+ * @is_return: non-zero to set up a kretprobe handler, zero for a kprobe
+ *
+ * Returns the new probe, ERR_PTR(-EINVAL) when @event or @group is missing,
+ * or ERR_PTR(-ENOMEM) when an allocation fails.
+ */
+static struct trace_probe *alloc_trace_probe(const char *group,
+                                            const char *event,
+                                            void *addr,
+                                            const char *symbol,
+                                            unsigned long offs,
+                                            int nargs, int is_return)
+{
+       struct trace_probe *tp;
+
+       /* A missing name is a caller error, not an out-of-memory condition */
+       if (!event || !group)
+               return ERR_PTR(-EINVAL);
+
+       tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
+       if (!tp)
+               return ERR_PTR(-ENOMEM);
+
+       if (symbol) {
+               tp->symbol = kstrdup(symbol, GFP_KERNEL);
+               if (!tp->symbol)
+                       goto error;
+               tp->rp.kp.symbol_name = tp->symbol;
+               tp->rp.kp.offset = offs;
+       } else
+               tp->rp.kp.addr = addr;
+
+       if (is_return)
+               tp->rp.handler = kretprobe_dispatcher;
+       else
+               tp->rp.kp.pre_handler = kprobe_dispatcher;
+
+       tp->call.name = kstrdup(event, GFP_KERNEL);
+       if (!tp->call.name)
+               goto error;
+
+       tp->call.system = kstrdup(group, GFP_KERNEL);
+       if (!tp->call.system)
+               goto error;
+
+       INIT_LIST_HEAD(&tp->list);
+       return tp;
+error:
+       /* tp was zeroed, so members not yet duplicated are NULL here */
+       kfree(tp->call.name);
+       kfree(tp->symbol);
+       kfree(tp);
+       return ERR_PTR(-ENOMEM);
+}
+
+/* Release the fetch data owned by @arg (if any) and its name string */
+static void free_probe_arg(struct probe_arg *arg)
+{
+       struct fetch_func *ff = &arg->fetch;
+
+       if (ff->func == fetch_indirect)
+               free_indirect_fetch_data(ff->data);
+       else if (ff->func == fetch_symbol)
+               free_symbol_cache(ff->data);
+       kfree(arg->name);
+}
+
+/* Free @tp: its first nr_args arguments, its name strings, then @tp itself */
+static void free_trace_probe(struct trace_probe *tp)
+{
+       struct probe_arg *cur = tp->args;
+       struct probe_arg *last = tp->args + tp->nr_args;
+
+       while (cur < last) {
+               free_probe_arg(cur);
+               cur++;
+       }
+
+       kfree(tp->call.system);
+       kfree(tp->call.name);
+       kfree(tp->symbol);
+       kfree(tp);
+}
+
+/*
+ * Look up the probe on probe_list whose event name and group both match.
+ * Returns the matching probe, or NULL when there is none.
+ */
+static struct trace_probe *find_probe_event(const char *event,
+                                           const char *group)
+{
+       struct trace_probe *pos;
+
+       list_for_each_entry(pos, &probe_list, list) {
+               if (strcmp(pos->call.name, event) != 0)
+                       continue;
+               if (strcmp(pos->call.system, group) != 0)
+                       continue;
+               return pos;
+       }
+       return NULL;
+}
+
+/*
+ * Unregister a trace_probe and probe_event: call with locking probe_lock.
+ * Does not free @tp; callers follow up with free_trace_probe().
+ */
+static void unregister_trace_probe(struct trace_probe *tp)
+{
+       /* Remove the kprobe/kretprobe first, then drop list and event */
+       if (probe_is_return(tp))
+               unregister_kretprobe(&tp->rp);
+       else
+               unregister_kprobe(&tp->rp.kp);
+       list_del(&tp->list);
+       unregister_probe_event(tp);
+}
+
+/*
+ * Register a trace_probe and probe_event.
+ *
+ * Takes probe_lock.  An existing probe with the same event and group name
+ * is unregistered and freed first.  The event is registered before the
+ * k(ret)probe, and the probe is registered with KPROBE_FLAG_DISABLED set.
+ * On success @tp is appended to probe_list.
+ *
+ * Returns 0 on success or a negative error; -EILSEQ from probe
+ * registration is reported as -EINVAL.
+ *
+ * NOTE(review): the old probe is destroyed before the new one is known to
+ * register, so a failure leaves neither in place - confirm this is intended.
+ */
+static int register_trace_probe(struct trace_probe *tp)
+{
+       struct trace_probe *old_tp;
+       int ret;
+
+       mutex_lock(&probe_lock);
+
+       /* register as an event */
+       old_tp = find_probe_event(tp->call.name, tp->call.system);
+       if (old_tp) {
+               /* delete old event */
+               unregister_trace_probe(old_tp);
+               free_trace_probe(old_tp);
+       }
+       ret = register_probe_event(tp);
+       if (ret) {
+               pr_warning("Faild to register probe event(%d)\n", ret);
+               goto end;
+       }
+
+       tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
+       if (probe_is_return(tp))
+               ret = register_kretprobe(&tp->rp);
+       else
+               ret = register_kprobe(&tp->rp.kp);
+
+       if (ret) {
+               pr_warning("Could not insert probe(%d)\n", ret);
+               if (ret == -EILSEQ) {
+                       pr_warning("Probing address(0x%p) is not an "
+                                  "instruction boundary.\n",
+                                  tp->rp.kp.addr);
+                       ret = -EINVAL;
+               }
+               /* Roll back the event registered above */
+               unregister_probe_event(tp);
+       } else
+               list_add_tail(&tp->list, &probe_list);
+end:
+       mutex_unlock(&probe_lock);
+       return ret;
+}
+
+/*
+ * Split "SYMBOL+OFFSET" in place.
+ *
+ * On success @symbol is cut at the '+' and *@offset holds the parsed value;
+ * without a '+', @symbol is left alone and *@offset is set to 0.
+ * Returns 0 on success, -EINVAL when @offset is NULL, or the error from
+ * strict_strtoul() (in which case @symbol is not modified).
+ */
+static int split_symbol_offset(char *symbol, unsigned long *offset)
+{
+       char *plus;
+       int err;
+
+       if (!offset)
+               return -EINVAL;
+
+       plus = strchr(symbol, '+');
+       if (!plus) {
+               *offset = 0;
+               return 0;
+       }
+
+       /* Parse from the character after the sign; the '+' is skipped */
+       err = strict_strtoul(plus + 1, 0, offset);
+       if (err)
+               return err;
+
+       *plus = '\0';
+       return 0;
+}
+
+/* Upper bounds accepted by parse_probe_vars() for $argN and $stackN */
+#define PARAM_MAX_ARGS 16
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+/*
+ * Parse the part of a "$..." fetch argument after the '$' into @ff.
+ *
+ * Accepted forms: "retval" (return probes only), "stack", "stackN" and
+ * "argN".  Returns 0 on success and -EINVAL for anything else, including
+ * an unparsable or out-of-range N.
+ */
+static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
+{
+       unsigned long num;
+
+       if (strcmp(arg, "retval") == 0) {
+               if (!is_return)
+                       return -EINVAL;
+               ff->func = fetch_retvalue;
+               ff->data = NULL;
+               return 0;
+       }
+
+       if (strncmp(arg, "stack", 5) == 0) {
+               /* Bare "stack" means the stack address itself */
+               if (arg[5] == '\0') {
+                       ff->func = fetch_stack_address;
+                       ff->data = NULL;
+                       return 0;
+               }
+               if (!isdigit(arg[5]))
+                       return -EINVAL;
+               if (strict_strtoul(arg + 5, 10, &num) || num > PARAM_MAX_STACK)
+                       return -EINVAL;
+               ff->func = fetch_stack;
+               ff->data = (void *)num;
+               return 0;
+       }
+
+       if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
+               if (strict_strtoul(arg + 3, 10, &num) || num > PARAM_MAX_ARGS)
+                       return -EINVAL;
+               ff->func = fetch_argument;
+               ff->data = (void *)num;
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Parse one FETCHARG string into @ff, dispatching on its first character:
+ *   '$'      variables, handled by parse_probe_vars()
+ *   '%'      named register
+ *   '@'      memory at a numeric address, or at SYM[+offs]
+ *   '+'/'-'  indirect fetch "+|-offs(ARG)", parsed recursively
+ *
+ * @arg is modified in place ('(' and ')' are overwritten with NUL, and a
+ * symbol is cut at '+').  Returns 0 on success or a negative error.
+ *
+ * NOTE(review): the '@' symbol path passes &offset (long *) to
+ * split_symbol_offset(), which takes unsigned long * and only looks for
+ * '+', so a "-offs" suffix is not split off here - confirm.
+ */
+static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+{
+       int ret = 0;
+       unsigned long param;
+       long offset;
+       char *tmp;
+
+       switch (arg[0]) {
+       case '$':
+               ret = parse_probe_vars(arg + 1, ff, is_return);
+               break;
+       case '%':       /* named register */
+               ret = regs_query_register_offset(arg + 1);
+               if (ret >= 0) {
+                       ff->func = fetch_register;
+                       ff->data = (void *)(unsigned long)ret;
+                       ret = 0;
+               }
+               break;
+       case '@':       /* memory or symbol */
+               if (isdigit(arg[1])) {
+                       ret = strict_strtoul(arg + 1, 0, &param);
+                       if (ret)
+                               break;
+                       ff->func = fetch_memory;
+                       ff->data = (void *)param;
+               } else {
+                       ret = split_symbol_offset(arg + 1, &offset);
+                       if (ret)
+                               break;
+                       ff->data = alloc_symbol_cache(arg + 1, offset);
+                       if (ff->data)
+                               ff->func = fetch_symbol;
+                       else
+                               ret = -EINVAL;
+               }
+               break;
+       case '+':       /* indirect memory */
+       case '-':
+               tmp = strchr(arg, '(');
+               if (!tmp) {
+                       ret = -EINVAL;
+                       break;
+               }
+               /* Terminate the offset text so it can be parsed on its own */
+               *tmp = '\0';
+               ret = strict_strtol(arg + 1, 0, &offset);
+               if (ret)
+                       break;
+               if (arg[0] == '-')
+                       offset = -offset;
+               arg = tmp + 1;
+               /* The last ')' closes this level; inner levels keep theirs */
+               tmp = strrchr(arg, ')');
+               if (tmp) {
+                       struct indirect_fetch_data *id;
+                       *tmp = '\0';
+                       id = kzalloc(sizeof(struct indirect_fetch_data),
+                                    GFP_KERNEL);
+                       if (!id)
+                               return -ENOMEM;
+                       id->offset = offset;
+                       ret = parse_probe_arg(arg, &id->orig, is_return);
+                       if (ret)
+                               kfree(id);
+                       else {
+                               ff->func = fetch_indirect;
+                               ff->data = (void *)id;
+                       }
+               } else
+                       ret = -EINVAL;
+               break;
+       default:
+               /* TODO: support custom handler */
+               ret = -EINVAL;
+       }
+       return ret;
+}
+
+/*
+ * Check @name against the reserved field names and against the names of
+ * the first @narg entries of @args.  Returns 1 on a clash, 0 otherwise.
+ */
+static int conflict_field_name(const char *name,
+                              struct probe_arg *args, int narg)
+{
+       int idx;
+
+       for (idx = 0; idx < ARRAY_SIZE(reserved_field_names); idx++) {
+               if (!strcmp(reserved_field_names[idx], name))
+                       return 1;
+       }
+
+       for (idx = 0; idx < narg; idx++) {
+               if (!strcmp(args[idx].name, name))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Build and register a probe from an already tokenized definition line.
+ *
+ * @argc: number of words in @argv (at least 2: command and probe point)
+ * @argv: the words; they are modified in place while parsing
+ *
+ * Returns 0 on success or a negative error.  On failure nothing is left
+ * registered and every allocation made here is released.
+ */
+static int create_trace_probe(int argc, char **argv)
+{
+       /*
+        * Argument syntax:
+        *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
+        *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
+        * Fetch args:
+        *  $argN       : fetch Nth of function argument. (N:0-)
+        *  $retval     : fetch return value
+        *  $stack      : fetch stack address
+        *  $stackN     : fetch Nth of stack (N:0-)
+        *  @ADDR       : fetch memory at ADDR (ADDR should be in kernel)
+        *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+        *  %REG        : fetch register REG
+        * Indirect memory fetch:
+        *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
+        * Alias name of args:
+        *  NAME=FETCHARG : set NAME as alias of FETCHARG.
+        */
+       struct trace_probe *tp;
+       int i, ret = 0;
+       int is_return = 0;
+       char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
+       unsigned long offset = 0;
+       void *addr = NULL;
+       char buf[MAX_EVENT_NAME_LEN];
+
+       if (argc < 2) {
+               pr_info("Probe point is not specified.\n");
+               return -EINVAL;
+       }
+
+       if (argv[0][0] == 'p')
+               is_return = 0;
+       else if (argv[0][0] == 'r')
+               is_return = 1;
+       else {
+               pr_info("Probe definition must be started with 'p' or 'r'.\n");
+               return -EINVAL;
+       }
+
+       if (argv[0][1] == ':') {
+               event = &argv[0][2];
+               if (strchr(event, '/')) {
+                       group = event;
+                       event = strchr(group, '/') + 1;
+                       event[-1] = '\0';
+                       if (strlen(group) == 0) {
+                               pr_info("Group name is not specifiled\n");
+                               return -EINVAL;
+                       }
+               }
+               if (strlen(event) == 0) {
+                       pr_info("Event name is not specifiled\n");
+                       return -EINVAL;
+               }
+       }
+
+       if (isdigit(argv[1][0])) {
+               if (is_return) {
+                       pr_info("Return probe point must be a symbol.\n");
+                       return -EINVAL;
+               }
+               /*
+                * an address specified: it lives in the probe point word
+                * (argv[1]), not in the command word argv[0].
+                */
+               ret = strict_strtoul(argv[1], 0, (unsigned long *)&addr);
+               if (ret) {
+                       pr_info("Failed to parse address.\n");
+                       return ret;
+               }
+       } else {
+               /* a symbol specified */
+               symbol = argv[1];
+               /* TODO: support .init module functions */
+               ret = split_symbol_offset(symbol, &offset);
+               if (ret) {
+                       pr_info("Failed to parse symbol.\n");
+                       return ret;
+               }
+               if (offset && is_return) {
+                       pr_info("Return probe must be used without offset.\n");
+                       return -EINVAL;
+               }
+       }
+       argc -= 2; argv += 2;
+
+       /* setup a probe */
+       if (!group)
+               group = KPROBE_EVENT_SYSTEM;
+       if (!event) {
+               /* Make a new event name */
+               if (symbol)
+                       snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
+                                is_return ? 'r' : 'p', symbol, offset);
+               else
+                       snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
+                                is_return ? 'r' : 'p', addr);
+               event = buf;
+       }
+       tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
+                              is_return);
+       if (IS_ERR(tp)) {
+               pr_info("Failed to allocate trace_probe.(%d)\n",
+                       (int)PTR_ERR(tp));
+               return PTR_ERR(tp);
+       }
+
+       /*
+        * parse arguments.  tp->nr_args starts at 0 (tp is zero-allocated)
+        * and is bumped after each fully parsed argument, so that
+        * free_trace_probe() on the error path releases exactly the
+        * arguments completed so far.
+        */
+       ret = 0;
+       for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+               /* Parse argument name */
+               arg = strchr(argv[i], '=');
+               if (arg)
+                       *arg++ = '\0';
+               else
+                       arg = argv[i];
+
+               if (conflict_field_name(argv[i], tp->args, i)) {
+                       pr_info("Argument%d name '%s' conflicts with "
+                               "another field.\n", i, argv[i]);
+                       ret = -EINVAL;
+                       goto error;
+               }
+
+               tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+               if (!tp->args[i].name) {
+                       pr_info("Failed to allocate argument%d name '%s'.\n",
+                               i, argv[i]);
+                       ret = -ENOMEM;
+                       goto error;
+               }
+
+               /* Parse fetch argument */
+               if (strlen(arg) > MAX_ARGSTR_LEN) {
+                       pr_info("Argument%d(%s) is too long.\n", i, arg);
+                       /* Slot i is not counted yet: free its name here */
+                       kfree(tp->args[i].name);
+                       ret = -ENOSPC;
+                       goto error;
+               }
+               ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
+               if (ret) {
+                       pr_info("Parse error at argument%d. (%d)\n", i, ret);
+                       kfree(tp->args[i].name);
+                       goto error;
+               }
+
+               tp->nr_args++;
+       }
+
+       ret = register_trace_probe(tp);
+       if (ret)
+               goto error;
+       return 0;
+
+error:
+       free_trace_probe(tp);
+       return ret;
+}
+
+/* Unregister and free every probe on probe_list, holding probe_lock */
+static void cleanup_all_probes(void)
+{
+       struct trace_probe *victim;
+
+       mutex_lock(&probe_lock);
+       /* TODO: Use batch unregistration */
+       for (;;) {
+               if (list_empty(&probe_list))
+                       break;
+               victim = list_entry(probe_list.next, struct trace_probe, list);
+               unregister_trace_probe(victim);
+               free_trace_probe(victim);
+       }
+       mutex_unlock(&probe_lock);
+}
+
+
+/* Probes listing interfaces */
+/* seq_file start: take probe_lock (dropped in probes_seq_stop) and seek */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+       mutex_lock(&probe_lock);
+       return seq_list_start(&probe_list, *pos);
+}
+
+/* seq_file next: advance to the following entry of probe_list */
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       return seq_list_next(v, &probe_list, pos);
+}
+
+/* seq_file stop: release the probe_lock taken in probes_seq_start() */
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+       mutex_unlock(&probe_lock);
+}
+
+/*
+ * Print one probe as "p|r:EVENT SYMBOL+OFFS|0xADDR [NAME=FETCHARG ...]".
+ * Returns 0, or the negative error from probe_arg_string() if an argument
+ * cannot be rendered.
+ */
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+       struct trace_probe *tp = v;
+       int i, ret;
+       char buf[MAX_ARGSTR_LEN + 1];
+
+       seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
+       seq_printf(m, ":%s", tp->call.name);
+
+       if (tp->symbol)
+               seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
+       else
+               seq_printf(m, " 0x%p", tp->rp.kp.addr);
+
+       for (i = 0; i < tp->nr_args; i++) {
+               /* Rebuild the textual fetch argument from the fetch method */
+               ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
+               if (ret < 0) {
+                       pr_warning("Argument%d decoding error(%d).\n", i, ret);
+                       return ret;
+               }
+               seq_printf(m, " %s=%s", tp->args[i].name, buf);
+       }
+       seq_printf(m, "\n");
+       return 0;
+}
+
+/* seq_file iterator over probe_list that prints probe definitions */
+static const struct seq_operations probes_seq_op = {
+       .start  = probes_seq_start,
+       .next   = probes_seq_next,
+       .stop   = probes_seq_stop,
+       .show   = probes_seq_show
+};
+
+/*
+ * Open handler: opening for write with O_TRUNC removes all existing
+ * probes before the seq_file is set up.
+ */
+static int probes_open(struct inode *inode, struct file *file)
+{
+       int truncating = (file->f_mode & FMODE_WRITE) &&
+                        (file->f_flags & O_TRUNC);
+
+       if (truncating)
+               cleanup_all_probes();
+
+       return seq_open(file, &probes_seq_op);
+}
+
+/*
+ * Split one definition line into words and hand them to
+ * create_trace_probe().  A line without any word is accepted and ignored.
+ * Returns 0 on success, -ENOMEM if splitting fails, or the error from
+ * create_trace_probe().
+ */
+static int command_trace_probe(const char *buf)
+{
+       int nr_words = 0;
+       int err = 0;
+       char **words = argv_split(GFP_KERNEL, buf, &nr_words);
+
+       if (!words)
+               return -ENOMEM;
+
+       if (nr_words != 0)
+               err = create_trace_probe(nr_words, words);
+
+       argv_free(words);
+       return err;
+}
+
+/* Size of the line buffer used by probes_write(), including the NUL */
+#define WRITE_BUFSIZE 128
+
+/*
+ * Write handler: consume the user buffer one line at a time and run each
+ * line through command_trace_probe().  Text after '#' on a line is dropped.
+ *
+ * Returns the number of bytes consumed, -ENOMEM / -EFAULT on buffer
+ * problems, -EINVAL when a line does not fit into WRITE_BUFSIZE - 1 bytes,
+ * or the error of the first command that fails.
+ */
+static ssize_t probes_write(struct file *file, const char __user *buffer,
+                           size_t count, loff_t *ppos)
+{
+       char *kbuf, *tmp;
+       int ret;
+       size_t done;
+       size_t size;
+
+       kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+       if (!kbuf)
+               return -ENOMEM;
+
+       ret = done = 0;
+       while (done < count) {
+               /* Copy at most one buffer's worth, leaving room for the NUL */
+               size = count - done;
+               if (size >= WRITE_BUFSIZE)
+                       size = WRITE_BUFSIZE - 1;
+               if (copy_from_user(kbuf, buffer + done, size)) {
+                       ret = -EFAULT;
+                       goto out;
+               }
+               kbuf[size] = '\0';
+               tmp = strchr(kbuf, '\n');
+               if (tmp) {
+                       /* Consume only up to and including the newline */
+                       *tmp = '\0';
+                       size = tmp - kbuf + 1;
+               } else if (done + size < count) {
+                       /* No newline in a full chunk and more input follows */
+                       pr_warning("Line length is too long: "
+                                  "Should be less than %d.", WRITE_BUFSIZE);
+                       ret = -EINVAL;
+                       goto out;
+               }
+               done += size;
+               /* Remove comments */
+               tmp = strchr(kbuf, '#');
+               if (tmp)
+                       *tmp = '\0';
+
+               ret = command_trace_probe(kbuf);
+               if (ret)
+                       goto out;
+       }
+       ret = done;
+out:
+       kfree(kbuf);
+       return ret;
+}
+
+/* File operations for the probe definition file: list via seq_file, add via write */
+static const struct file_operations kprobe_events_ops = {
+       .owner          = THIS_MODULE,
+       .open           = probes_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+       .write          = probes_write,
+};
+
+/* Probes profiling interfaces */
+/* Print one line per probe: event name, hit count (nhit) and kprobe nmissed */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+       struct trace_probe *tp = v;
+
+       seq_printf(m, "  %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
+                  tp->rp.kp.nmissed);
+
+       return 0;
+}
+
+/* Same probe_list iteration as probes_seq_op, but showing hit statistics */
+static const struct seq_operations profile_seq_op = {
+       .start  = probes_seq_start,
+       .next   = probes_seq_next,
+       .stop   = probes_seq_stop,
+       .show   = probes_profile_seq_show
+};
+
+/* Open handler for the profile file: attach the profile seq_file iterator */
+static int profile_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &profile_seq_op);
+}
+
+/* File operations for the profile file; no .write handler is provided */
+static const struct file_operations kprobe_profile_ops = {
+       .owner          = THIS_MODULE,
+       .open           = profile_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+};
+
+/*
+ * Kprobe handler: count the hit, reserve a ring buffer event sized for
+ * the probe's arguments, fill in the probed address and each fetched
+ * argument value, and commit the event unless
+ * filter_current_check_discard() returns nonzero.  Always returns 0.
+ */
+static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+       struct kprobe_trace_entry *entry;
+       struct ring_buffer_event *event;
+       struct ring_buffer *buffer;
+       int size, i, pc;
+       unsigned long irq_flags;
+       struct ftrace_event_call *call = &tp->call;
+
+       tp->nhit++;
+
+       local_save_flags(irq_flags);
+       pc = preempt_count();
+
+       size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+
+       event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
+                                                 irq_flags, pc);
+       /* Nothing is recorded when no event could be reserved */
+       if (!event)
+               return 0;
+
+       entry = ring_buffer_event_data(event);
+       entry->nargs = tp->nr_args;
+       entry->ip = (unsigned long)kp->addr;
+       for (i = 0; i < tp->nr_args; i++)
+               entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+
+       if (!filter_current_check_discard(buffer, call, entry, event))
+               trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+       return 0;
+}
+
+/*
+ * Kretprobe handler: reserve a ring buffer event sized for the probe's
+ * arguments, record the probed function address, the return address and
+ * each fetched argument value, and commit the event unless
+ * filter_current_check_discard() returns nonzero.  Always returns 0.
+ * Unlike kprobe_trace_func(), tp->nhit is not updated here.
+ */
+static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+                                         struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+       struct kretprobe_trace_entry *entry;
+       struct ring_buffer_event *event;
+       struct ring_buffer *buffer;
+       int size, i, pc;
+       unsigned long irq_flags;
+       struct ftrace_event_call *call = &tp->call;
+
+       local_save_flags(irq_flags);
+       pc = preempt_count();
+
+       size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+
+       event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
+                                                 irq_flags, pc);
+       /* Nothing is recorded when no event could be reserved */
+       if (!event)
+               return 0;
+
+       entry = ring_buffer_event_data(event);
+       entry->nargs = tp->nr_args;
+       entry->func = (unsigned long)tp->rp.kp.addr;
+       entry->ret_ip = (unsigned long)ri->ret_addr;
+       for (i = 0; i < tp->nr_args; i++)
+               entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+
+       if (!filter_current_check_discard(buffer, call, entry, event))
+               trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+       return 0;
+}
+
+/* Event entry printers */
+/*
+ * Format one kprobe entry as "EVENT: (SYMBOL+OFFS) NAME=VALUE ...".
+ * Returns TRACE_TYPE_PARTIAL_LINE as soon as any piece fails to be
+ * written to the sequence, TRACE_TYPE_HANDLED otherwise.
+ */
+enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags)
+{
+       struct kprobe_trace_entry *rec;
+       struct trace_seq *seq = &iter->seq;
+       struct trace_event *ev;
+       struct trace_probe *tp;
+       int idx;
+
+       rec = (struct kprobe_trace_entry *)iter->ent;
+       ev = ftrace_find_event(rec->ent.type);
+       tp = container_of(ev, struct trace_probe, event);
+
+       if (!trace_seq_printf(seq, "%s: (", tp->call.name))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       if (!seq_print_ip_sym(seq, rec->ip, flags | TRACE_ITER_SYM_OFFSET))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       if (!trace_seq_puts(seq, ")"))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       for (idx = 0; idx < rec->nargs; idx++) {
+               if (!trace_seq_printf(seq, " %s=%lx",
+                                     tp->args[idx].name, rec->args[idx]))
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       if (!trace_seq_puts(seq, "\n"))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+/*
+ * Format one kretprobe entry as
+ * "EVENT: (RET_IP+OFFS <- FUNC) NAME=VALUE ...".
+ * Returns TRACE_TYPE_PARTIAL_LINE as soon as any piece fails to be
+ * written to the sequence, TRACE_TYPE_HANDLED otherwise.
+ */
+enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags)
+{
+       struct kretprobe_trace_entry *rec;
+       struct trace_seq *seq = &iter->seq;
+       struct trace_event *ev;
+       struct trace_probe *tp;
+       int idx;
+
+       rec = (struct kretprobe_trace_entry *)iter->ent;
+       ev = ftrace_find_event(rec->ent.type);
+       tp = container_of(ev, struct trace_probe, event);
+
+       if (!trace_seq_printf(seq, "%s: (", tp->call.name))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Return address is printed with its offset, the function without */
+       if (!seq_print_ip_sym(seq, rec->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       if (!trace_seq_puts(seq, " <- "))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       if (!seq_print_ip_sym(seq, rec->func, flags & ~TRACE_ITER_SYM_OFFSET))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       if (!trace_seq_puts(seq, ")"))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       for (idx = 0; idx < rec->nargs; idx++) {
+               if (!trace_seq_printf(seq, " %s=%lx",
+                                     tp->args[idx].name, rec->args[idx]))
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       if (!trace_seq_puts(seq, "\n"))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+/*
+ * Event enable callback: mark the probe as traced and enable the
+ * underlying kretprobe or kprobe, returning that call's result.
+ */
+static int probe_event_enable(struct ftrace_event_call *call)
+{
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       tp->flags |= TP_FLAG_TRACE;
+
+       return probe_is_return(tp) ? enable_kretprobe(&tp->rp)
+                                  : enable_kprobe(&tp->rp.kp);
+}
+
+/*
+ * Event disable callback: clear the trace flag and, once neither the
+ * trace nor the profile flag remains set, disable the underlying probe.
+ */
+static void probe_event_disable(struct ftrace_event_call *call)
+{
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       tp->flags &= ~TP_FLAG_TRACE;
+
+       /* Another user still needs the probe: leave it armed */
+       if (tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))
+               return;
+
+       if (probe_is_return(tp))
+               disable_kretprobe(&tp->rp);
+       else
+               disable_kprobe(&tp->rp.kp);
+}
+
+/* Initialize the event's (empty) field list; always returns 0 */
+static int probe_event_raw_init(struct ftrace_event_call *event_call)
+{
+       INIT_LIST_HEAD(&event_call->fields);
+
+       return 0;
+}
+
+/*
+ * Define one event field via trace_define_field(), taking offset and size
+ * from member @item of the local variable `field`.  Expects `ret`,
+ * `event_call` and `field` in the calling scope, and makes the calling
+ * function return the error if the definition fails.
+ */
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed)                      \
+       do {                                                            \
+               ret = trace_define_field(event_call, #type, name,       \
+                                        offsetof(typeof(field), item), \
+                                        sizeof(field.item), is_signed, \
+                                        FILTER_OTHER);                 \
+               if (ret)                                                \
+                       return ret;                                     \
+       } while (0)
+
+/*
+ * Define the fields of a kprobe event: the common fields, then the probed
+ * ip, the argument count and one unsigned long field per probe argument
+ * (named after the argument).
+ *
+ * Returns 0 on success or the first error reported while defining a field.
+ */
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+       int ret, i;
+       struct kprobe_trace_entry field;
+       struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+       ret = trace_define_common_fields(event_call);
+       /* Bail out only on failure; on success go on to the probe fields */
+       if (ret)
+               return ret;
+
+       DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+       DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
+       /* Set argument names as fields */
+       for (i = 0; i < tp->nr_args; i++)
+               DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+       return 0;
+}
+
+/*
+ * Define the fields of a kretprobe event: the common fields, then the
+ * probed function address, the return ip, the argument count and one
+ * unsigned long field per probe argument (named after the argument).
+ *
+ * Returns 0 on success or the first error reported while defining a field.
+ */
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+       int ret, i;
+       struct kretprobe_trace_entry field;
+       struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+       ret = trace_define_common_fields(event_call);
+       /* Bail out only on failure; on success go on to the probe fields */
+       if (ret)
+               return ret;
+
+       DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+       DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+       DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
+       /* Set argument names as fields */
+       for (i = 0; i < tp->nr_args; i++)
+               DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+       return 0;
+}
+
+/*
+ * Emit the "print fmt:" line of an event format description.
+ *
+ * @fmt is the leading part of the format string and @arg the matching
+ * REC-> expression(s); one " NAME=%lx" / ", REC->NAME" pair is appended
+ * per probe argument.  Returns 0 as soon as a write to @s fails,
+ * otherwise the result of the final trace_seq_puts().
+ */
+static int __probe_event_show_format(struct trace_seq *s,
+                                    struct trace_probe *tp, const char *fmt,
+                                    const char *arg)
+{
+       int i;
+
+       /* Show format */
+       if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
+               return 0;
+
+       for (i = 0; i < tp->nr_args; i++)
+               if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
+                       return 0;
+
+       /* Close the format string and start the argument list */
+       if (!trace_seq_printf(s, "\", %s", arg))
+               return 0;
+
+       for (i = 0; i < tp->nr_args; i++)
+               if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
+                       return 0;
+
+       return trace_seq_puts(s, "\n");
+}
+
+/*
+ * Print one "field: TYPE NAME; offset:..; size:..;" line into `s`.  The
+ * offset comes from member @item of the local variable `field`, the size
+ * from sizeof(@type).  Expects `ret`, `s` and `field` in the calling
+ * scope, and makes the calling function return 0 if the write fails.
+ */
+#undef SHOW_FIELD
+#define SHOW_FIELD(type, item, name)                                   \
+       do {                                                            \
+               ret = trace_seq_printf(s, "\tfield: " #type " %s;\t"    \
+                               "offset:%u;\tsize:%u;\n", name,         \
+                               (unsigned int)offsetof(typeof(field), item),\
+                               (unsigned int)sizeof(type));            \
+               if (!ret)                                               \
+                       return 0;                                       \
+       } while (0)
+
+/*
+ * Write the format description of a kprobe event into @s: the ip and
+ * nargs fields, one field per probe argument, then the print fmt line.
+ * Returns 0 if a write to @s fails, otherwise the result of
+ * __probe_event_show_format().
+ */
+static int kprobe_event_show_format(struct ftrace_event_call *call,
+                                   struct trace_seq *s)
+{
+       /* Only used by SHOW_FIELD for offsetof(typeof(field), ...) */
+       struct kprobe_trace_entry field __attribute__((unused));
+       int ret, i;
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
+       SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
+
+       /* Show fields */
+       for (i = 0; i < tp->nr_args; i++)
+               SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
+       trace_seq_puts(s, "\n");
+
+       return __probe_event_show_format(s, tp, "(%lx)",
+                                        "REC->" FIELD_STRING_IP);
+}
+
+/*
+ * Write the format description of a kretprobe event into @s: the func,
+ * ret_ip and nargs fields, one field per probe argument, then the print
+ * fmt line.  Returns 0 if a write to @s fails, otherwise the result of
+ * __probe_event_show_format().
+ */
+static int kretprobe_event_show_format(struct ftrace_event_call *call,
+                                      struct trace_seq *s)
+{
+       /* Only used by SHOW_FIELD for offsetof(typeof(field), ...) */
+       struct kretprobe_trace_entry field __attribute__((unused));
+       int ret, i;
+       struct trace_probe *tp = (struct trace_probe *)call->data;
+
+       SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
+       SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
+       SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
+
+       /* Show fields */
+       for (i = 0; i < tp->nr_args; i++)
+               SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
+       trace_seq_puts(s, "\n");
+
+       return __probe_event_show_format(s, tp, "(%lx <- %lx)",
+                                        "REC->" FIELD_STRING_FUNC
+                                        ", REC->" FIELD_STRING_RETIP);
+}
+
+#ifdef CONFIG_EVENT_PROFILE
+
+/*
+ * Kprobe profile handler
+ *
+ * Builds a struct kprobe_trace_entry for the probe that was hit inside the
+ * per-cpu perf trace buffer and passes it to perf_tp_event().
+ * Always returns 0, including when the event is dropped.
+ */
+static __kprobes int kprobe_profile_func(struct kprobe *kp,
+                                        struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+       struct ftrace_event_call *call = &tp->call;
+       struct kprobe_trace_entry *entry;
+       struct perf_trace_buf *trace_buf;
+       struct trace_entry *ent;
+       int size, __size, i, pc, __cpu;
+       unsigned long irq_flags;
+       char *raw_data;
+
+       pc = preempt_count();
+       /*
+        * Pad the record so that size + sizeof(u32) is a multiple of
+        * sizeof(u64).
+        * NOTE(review): presumably perf prepends a u32 to the raw data --
+        * confirm against perf_tp_event().
+        */
+       __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+       size = ALIGN(__size + sizeof(u32), sizeof(u64));
+       size -= sizeof(u32);
+       if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+                    "profile buffer not large enough"))
+               return 0;
+
+       /*
+        * Protect the non nmi buffer
+        * This also protects the rcu read side
+        */
+       local_irq_save(irq_flags);
+       __cpu = smp_processor_id();
+
+       /* NMI context uses its own buffer set */
+       if (in_nmi())
+               trace_buf = rcu_dereference(perf_trace_buf_nmi);
+       else
+               trace_buf = rcu_dereference(perf_trace_buf);
+
+       /* No buffer published: nothing to record into */
+       if (!trace_buf)
+               goto end;
+
+       trace_buf = per_cpu_ptr(trace_buf, __cpu);
+
+       /* Skip the event if the recursion counter was already non-zero */
+       if (trace_buf->recursion++)
+               goto end_recursion;
+
+       /*
+        * Make recursion update visible before entering perf_tp_event
+        * so that we protect from perf recursions.
+        */
+       barrier();
+
+       raw_data = trace_buf->buf;
+
+       /* Zero dead bytes from alignment to avoid buffer leak to userspace */
+       *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+       entry = (struct kprobe_trace_entry *)raw_data;
+       ent = &entry->ent;
+
+       /* Fill the common header, then the kprobe specific payload */
+       tracing_generic_entry_update(ent, irq_flags, pc);
+       ent->type = call->id;
+       entry->nargs = tp->nr_args;
+       entry->ip = (unsigned long)kp->addr;
+       for (i = 0; i < tp->nr_args; i++)
+               entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+       perf_tp_event(call->id, entry->ip, 1, entry, size);
+
+end_recursion:
+       /* Balances the increment above, on both the taken and skipped path */
+       trace_buf->recursion--;
+end:
+       local_irq_restore(irq_flags);
+
+       return 0;
+}
+
+/*
+ * Kretprobe profile handler
+ *
+ * Builds a struct kretprobe_trace_entry for the return probe that was hit
+ * inside the per-cpu perf trace buffer and passes it to perf_tp_event().
+ * Always returns 0, including when the event is dropped.
+ */
+static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+                                           struct pt_regs *regs)
+{
+       struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+       struct ftrace_event_call *call = &tp->call;
+       struct kretprobe_trace_entry *entry;
+       struct perf_trace_buf *trace_buf;
+       struct trace_entry *ent;
+       int size, __size, i, pc, __cpu;
+       unsigned long irq_flags;
+       char *raw_data;
+
+       pc = preempt_count();
+       /*
+        * Pad the record so that size + sizeof(u32) is a multiple of
+        * sizeof(u64).
+        * NOTE(review): presumably perf prepends a u32 to the raw data --
+        * confirm against perf_tp_event().
+        */
+       __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+       size = ALIGN(__size + sizeof(u32), sizeof(u64));
+       size -= sizeof(u32);
+       if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+                    "profile buffer not large enough"))
+               return 0;
+
+       /*
+        * Protect the non nmi buffer
+        * This also protects the rcu read side
+        */
+       local_irq_save(irq_flags);
+       __cpu = smp_processor_id();
+
+       /* NMI context uses its own buffer set */
+       if (in_nmi())
+               trace_buf = rcu_dereference(perf_trace_buf_nmi);
+       else
+               trace_buf = rcu_dereference(perf_trace_buf);
+
+       /* No buffer published: nothing to record into */
+       if (!trace_buf)
+               goto end;
+
+       trace_buf = per_cpu_ptr(trace_buf, __cpu);
+
+       /* Skip the event if the recursion counter was already non-zero */
+       if (trace_buf->recursion++)
+               goto end_recursion;
+
+       /*
+        * Make recursion update visible before entering perf_tp_event
+        * so that we protect from perf recursions.
+        */
+       barrier();
+
+       raw_data = trace_buf->buf;
+
+       /* Zero dead bytes from alignment to avoid buffer leak to userspace */
+       *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+       entry = (struct kretprobe_trace_entry *)raw_data;
+       ent = &entry->ent;
+
+       /* Fill the common header, then the kretprobe specific payload */
+       tracing_generic_entry_update(ent, irq_flags, pc);
+       ent->type = call->id;
+       entry->nargs = tp->nr_args;
+       entry->func = (unsigned long)tp->rp.kp.addr;
+       entry->ret_ip = (unsigned long)ri->ret_addr;
+       for (i = 0; i < tp->nr_args; i++)
+               entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+       perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
+
+end_recursion:
+       /* Balances the increment above, on both the taken and skipped path */
+       trace_buf->recursion--;
+end:
+       local_irq_restore(irq_flags);
+
+       return 0;
+}
+
+/*
+ * Turn on perf profiling for the probe attached to @call.
+ * Returns the result of enabling the underlying kprobe/kretprobe.
+ */
+static int probe_profile_enable(struct ftrace_event_call *call)
+{
+       struct trace_probe *probe = (struct trace_probe *)call->data;
+       int err;
+
+       /* Set the flag first, then arm the underlying probe */
+       probe->flags |= TP_FLAG_PROFILE;
+
+       if (!probe_is_return(probe))
+               err = enable_kprobe(&probe->rp.kp);
+       else
+               err = enable_kretprobe(&probe->rp);
+
+       return err;
+}
+
+/*
+ * Turn off perf profiling for the probe attached to @call.  The
+ * underlying kprobe/kretprobe is only disabled when TP_FLAG_TRACE is not
+ * set either.
+ */
+static void probe_profile_disable(struct ftrace_event_call *call)
+{
+       struct trace_probe *probe = (struct trace_probe *)call->data;
+
+       probe->flags &= ~TP_FLAG_PROFILE;
+
+       /* TP_FLAG_TRACE still set: leave the probe armed */
+       if (probe->flags & TP_FLAG_TRACE)
+               return;
+
+       if (probe_is_return(probe))
+               disable_kretprobe(&probe->rp);
+       else
+               disable_kprobe(&probe->rp.kp);
+}
+#endif /* CONFIG_EVENT_PROFILE */
+
+
+/*
+ * Common kprobe handler: forward the hit to the trace handler and/or the
+ * profile handler, depending on which flags are set on the probe.
+ */
+static __kprobes
+int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+{
+       struct trace_probe *probe;
+
+       probe = container_of(kp, struct trace_probe, rp.kp);
+
+       if (probe->flags & TP_FLAG_TRACE)
+               kprobe_trace_func(kp, regs);
+
+#ifdef CONFIG_EVENT_PROFILE
+       if (probe->flags & TP_FLAG_PROFILE)
+               kprobe_profile_func(kp, regs);
+#endif /* CONFIG_EVENT_PROFILE */
+
+       /* We don't tweak kernel, so just return 0 */
+       return 0;
+}
+
+/*
+ * Common kretprobe handler: forward the hit to the trace handler and/or
+ * the profile handler, depending on which flags are set on the probe.
+ */
+static __kprobes
+int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+       struct trace_probe *probe;
+
+       probe = container_of(ri->rp, struct trace_probe, rp);
+
+       if (probe->flags & TP_FLAG_TRACE)
+               kretprobe_trace_func(ri, regs);
+
+#ifdef CONFIG_EVENT_PROFILE
+       if (probe->flags & TP_FLAG_PROFILE)
+               kretprobe_profile_func(ri, regs);
+#endif /* CONFIG_EVENT_PROFILE */
+
+       /* We don't tweak kernel, so just return 0 */
+       return 0;
+}
+
+/*
+ * Fill in tp->call and tp->event and register them.
+ *
+ * Returns 0 on success, -ENODEV when register_ftrace_event() yields no
+ * event id, or the error from trace_add_event_call() (in which case the
+ * ftrace event is unregistered again).
+ */
+static int register_probe_event(struct trace_probe *tp)
+{
+       struct ftrace_event_call *call = &tp->call;
+       int ret;
+
+       /* Callbacks that differ between entry and return probes */
+       if (!probe_is_return(tp)) {
+               tp->event.trace = print_kprobe_event;
+               call->show_format = kprobe_event_show_format;
+               call->define_fields = kprobe_event_define_fields;
+       } else {
+               tp->event.trace = print_kretprobe_event;
+               call->show_format = kretprobe_event_show_format;
+               call->define_fields = kretprobe_event_define_fields;
+       }
+
+       /* Common to both probe types */
+       call->raw_init = probe_event_raw_init;
+       call->event = &tp->event;
+
+       call->id = register_ftrace_event(&tp->event);
+       if (!call->id)
+               return -ENODEV;
+
+       call->enabled = 0;
+       call->regfunc = probe_event_enable;
+       call->unregfunc = probe_event_disable;
+#ifdef CONFIG_EVENT_PROFILE
+       atomic_set(&call->profile_count, -1);
+       call->profile_enable = probe_profile_enable;
+       call->profile_disable = probe_profile_disable;
+#endif
+       call->data = tp;
+
+       ret = trace_add_event_call(call);
+       if (!ret)
+               return 0;
+
+       /* Roll back the ftrace event registration done above */
+       pr_info("Failed to register kprobe event: %s\n", call->name);
+       unregister_ftrace_event(&tp->event);
+       return ret;
+}
+
+/* Remove the event call that register_probe_event() added for @tp. */
+static void unregister_probe_event(struct trace_probe *tp)
+{
+       struct ftrace_event_call *call = &tp->call;
+
+       /* tp->event is unregistered in trace_remove_event_call() */
+       trace_remove_event_call(call);
+}
+
+/*
+ * Make a debugfs interface for controlling probe points.
+ *
+ * Creates the "kprobe_events" and "kprobe_profile" files under the tracing
+ * directory.  A failure to create either file is only logged; the
+ * initcall always returns 0.
+ */
+static __init int init_kprobe_trace(void)
+{
+       struct dentry *tracing_dir = tracing_init_dentry();
+
+       if (!tracing_dir)
+               return 0;
+
+       /* Event list interface */
+       if (!debugfs_create_file("kprobe_events", 0644, tracing_dir,
+                                NULL, &kprobe_events_ops))
+               pr_warning("Could not create debugfs "
+                          "'kprobe_events' entry\n");
+
+       /* Profile interface */
+       if (!debugfs_create_file("kprobe_profile", 0444, tracing_dir,
+                                NULL, &kprobe_profile_ops))
+               pr_warning("Could not create debugfs "
+                          "'kprobe_profile' entry\n");
+
+       return 0;
+}
+fs_initcall(init_kprobe_trace);
+
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+/* Probe target for the self test: returns the sum of its six arguments. */
+static int kprobe_trace_selftest_target(int a1, int a2, int a3,
+                                       int a4, int a5, int a6)
+{
+       int sum = a1;
+
+       sum += a2;
+       sum += a3;
+       sum += a4;
+       sum += a5;
+       sum += a6;
+
+       return sum;
+}
+
+/*
+ * Boot-time self test: register one entry probe and one return probe on
+ * kprobe_trace_selftest_target(), call the target once, then remove all
+ * probes again.
+ *
+ * The result line reports "OK" only when every step succeeded; any
+ * failure is counted and reported as "NG" instead of being hidden behind
+ * an unconditional "OK".  The initcall itself always returns 0.
+ */
+static __init int kprobe_trace_self_tests_init(void)
+{
+       int ret, warn = 0;
+       int (*target)(int, int, int, int, int, int);
+
+       /*
+        * NOTE(review): the target is called through a pointer, presumably
+        * so the call is not optimized away or inlined -- confirm.
+        */
+       target = kprobe_trace_selftest_target;
+
+       pr_info("Testing kprobe tracing: ");
+
+       ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
+                                 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
+       if (WARN_ON_ONCE(ret)) {
+               pr_warning("error enabling function entry\n");
+               warn++;
+       }
+
+       ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
+                                 "$retval");
+       if (WARN_ON_ONCE(ret)) {
+               pr_warning("error enabling function return\n");
+               warn++;
+       }
+
+       /* 1 + 2 + 3 + 4 + 5 + 6 == 21; probing must not change the result */
+       ret = target(1, 2, 3, 4, 5, 6);
+       if (WARN_ON_ONCE(ret != 21)) {
+               pr_warning("unexpected result from probed function\n");
+               warn++;
+       }
+
+       cleanup_all_probes();
+
+       if (warn)
+               pr_cont("NG: Some tests are failed. Please check them.\n");
+       else
+               pr_cont("OK\n");
+       return 0;
+}
+
+late_initcall(kprobe_trace_self_tests_init);
+
+#endif
index d00d1a8f1f262280d048dab527f3c8836784bd3c..51213b0aa81b2dc48d0a5a6f84effb5a68da1a7b 100644 (file)
@@ -354,13 +354,13 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 }
 
-int reg_event_syscall_enter(void *ptr)
+int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
        char *name;
 
-       name = (char *)ptr;
+       name = (char *)call->data;
        num = syscall_name_to_nr(name);
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;
@@ -378,12 +378,12 @@ int reg_event_syscall_enter(void *ptr)
        return ret;
 }
 
-void unreg_event_syscall_enter(void *ptr)
+void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
        int num;
        char *name;
 
-       name = (char *)ptr;
+       name = (char *)call->data;
        num = syscall_name_to_nr(name);
        if (num < 0 || num >= NR_syscalls)
                return;
@@ -395,13 +395,13 @@ void unreg_event_syscall_enter(void *ptr)
        mutex_unlock(&syscall_trace_lock);
 }
 
-int reg_event_syscall_exit(void *ptr)
+int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
        char *name;
 
-       name = (char *)ptr;
+       name = call->data;
        num = syscall_name_to_nr(name);
        if (num < 0 || num >= NR_syscalls)
                return -ENOSYS;
@@ -419,12 +419,12 @@ int reg_event_syscall_exit(void *ptr)
        return ret;
 }
 
-void unreg_event_syscall_exit(void *ptr)
+void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
        int num;
        char *name;
 
-       name = (char *)ptr;
+       name = call->data;
        num = syscall_name_to_nr(name);
        if (num < 0 || num >= NR_syscalls)
                return;
@@ -477,6 +477,7 @@ static int sys_prof_refcount_exit;
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
        struct syscall_metadata *sys_data;
+       struct perf_trace_buf *trace_buf;
        struct syscall_trace_enter *rec;
        unsigned long flags;
        char *raw_data;
@@ -507,14 +508,25 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        cpu = smp_processor_id();
 
        if (in_nmi())
-               raw_data = rcu_dereference(trace_profile_buf_nmi);
+               trace_buf = rcu_dereference(perf_trace_buf_nmi);
        else
-               raw_data = rcu_dereference(trace_profile_buf);
+               trace_buf = rcu_dereference(perf_trace_buf);
 
-       if (!raw_data)
+       if (!trace_buf)
                goto end;
 
-       raw_data = per_cpu_ptr(raw_data, cpu);
+       trace_buf = per_cpu_ptr(trace_buf, cpu);
+
+       if (trace_buf->recursion++)
+               goto end_recursion;
+
+       /*
+        * Make recursion update visible before entering perf_tp_event
+        * so that we protect from perf recursions.
+        */
+       barrier();
+
+       raw_data = trace_buf->buf;
 
        /* zero the dead bytes from align to not leak stack to user */
        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -527,6 +539,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
                               (unsigned long *)&rec->args);
        perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
 
+end_recursion:
+       trace_buf->recursion--;
 end:
        local_irq_restore(flags);
 }
@@ -574,6 +588,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_exit *rec;
+       struct perf_trace_buf *trace_buf;
        unsigned long flags;
        int syscall_nr;
        char *raw_data;
@@ -605,14 +620,25 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
        cpu = smp_processor_id();
 
        if (in_nmi())
-               raw_data = rcu_dereference(trace_profile_buf_nmi);
+               trace_buf = rcu_dereference(perf_trace_buf_nmi);
        else
-               raw_data = rcu_dereference(trace_profile_buf);
+               trace_buf = rcu_dereference(perf_trace_buf);
 
-       if (!raw_data)
+       if (!trace_buf)
                goto end;
 
-       raw_data = per_cpu_ptr(raw_data, cpu);
+       trace_buf = per_cpu_ptr(trace_buf, cpu);
+
+       if (trace_buf->recursion++)
+               goto end_recursion;
+
+       /*
+        * Make recursion update visible before entering perf_tp_event
+        * so that we protect from perf recursions.
+        */
+       barrier();
+
+       raw_data = trace_buf->buf;
 
        /* zero the dead bytes from align to not leak stack to user */
        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -626,6 +652,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 
        perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
 
+end_recursion:
+       trace_buf->recursion--;
 end:
        local_irq_restore(flags);
 }
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
new file mode 100644 (file)
index 0000000..9270594
--- /dev/null
@@ -0,0 +1,49 @@
+perf-probe(1)
+=============
+
+NAME
+----
+perf-probe - Define new dynamic tracepoints
+
+SYNOPSIS
+--------
+[verse]
+'perf probe' [options] --add 'PROBE' [--add 'PROBE' ...]
+or
+'perf probe' [options] 'PROBE' ['PROBE' ...]
+
+
+DESCRIPTION
+-----------
+This command defines dynamic tracepoint events, by symbol and registers
+without debuginfo, or by C expressions (C line numbers, C function names,
+and C local variables) with debuginfo.
+
+
+OPTIONS
+-------
+-k::
+--vmlinux=PATH::
+       Specify vmlinux path which has debuginfo (Dwarf binary).
+
+-v::
+--verbose::
+        Be more verbose (show parsed arguments, etc).
+
+-a::
+--add::
+       Define a probe point (see PROBE SYNTAX for detail)
+
+PROBE SYNTAX
+------------
+Probe points are defined by the following syntax.
+
+ "FUNC[+OFFS|:RLN|%return][@SRC]|SRC:ALN [ARG ...]"
+
+'FUNC' specifies a probed function name, and it may have one of the following options: '+OFFS' is the offset from function entry address in bytes, 'RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. In addition, 'SRC' specifies a source file which has that function.
+It is also possible to specify a probe point by the source line number by using 'SRC:ALN' syntax, where 'SRC' is the source file path and 'ALN' is the line number.
+'ARG' specifies the arguments of this probe point. You can use the name of local variable, or kprobe-tracer argument format (e.g. $retval, %ax, etc).
+
+SEE ALSO
+--------
+linkperf:perf-trace[1], linkperf:perf-record[1]
index 46a58a81c9ad5df9fe68f02d5756c629c26a0d8b..3dbb5c5bb8c665bd41db347b5766842b18d4a17b 100644 (file)
@@ -337,6 +337,7 @@ LIB_FILE=libperf.a
 LIB_H += ../../include/linux/perf_event.h
 LIB_H += ../../include/linux/rbtree.h
 LIB_H += ../../include/linux/list.h
+LIB_H += ../../include/linux/stringify.h
 LIB_H += util/include/linux/bitmap.h
 LIB_H += util/include/linux/bitops.h
 LIB_H += util/include/linux/compiler.h
@@ -438,6 +439,7 @@ BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-timechart.o
 BUILTIN_OBJS += builtin-top.o
 BUILTIN_OBJS += builtin-trace.o
+BUILTIN_OBJS += builtin-probe.o
 
 PERFLIBS = $(LIB_FILE)
 
@@ -469,6 +471,10 @@ ifeq ($(uname_S),Darwin)
 endif
 
 ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) > /dev/null 2>&1 && echo y"), y)
+ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) > /dev/null 2>&1 && echo y"), y)
+       msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]);
+endif
+
        ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o /dev/null $(ALL_LDFLAGS) > /dev/null 2>&1 && echo y"), y)
                BASIC_CFLAGS += -DLIBELF_NO_MMAP
        endif
@@ -476,6 +482,15 @@ else
        msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]);
 endif
 
+ifneq ($(shell sh -c "(echo '\#include <libdwarf/dwarf.h>'; echo '\#include <libdwarf/libdwarf.h>'; echo 'int main(void) { Dwarf_Debug dbg; Dwarf_Error err; Dwarf_Ranges *rng; dwarf_init(0, DW_DLC_READ, 0, 0, &dbg, &err); dwarf_get_ranges(dbg, 0, &rng, 0, 0, &err); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -ldwarf -lelf -o /dev/null $(ALL_LDFLAGS) > /dev/null 2>&1 && echo y"), y)
+       msg := $(warning No libdwarf.h found or old libdwarf.h found, disables dwarf support. Please install libdwarf-dev/libdwarf-devel >= 20081231);
+       BASIC_CFLAGS += -DNO_LIBDWARF
+else
+       EXTLIBS += -lelf -ldwarf
+       LIB_H += util/probe-finder.h
+       LIB_OBJS += util/probe-finder.o
+endif
+
 ifdef NO_DEMANGLE
        BASIC_CFLAGS += -DNO_DEMANGLE
 else
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
new file mode 100644 (file)
index 0000000..d78a3d9
--- /dev/null
@@ -0,0 +1,435 @@
+/*
+ * builtin-probe.c
+ *
+ * Builtin probe command: Set up probe events by C expression
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+#define _GNU_SOURCE
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+
+#undef _GNU_SOURCE
+#include "perf.h"
+#include "builtin.h"
+#include "util/util.h"
+#include "util/event.h"
+#include "util/debug.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h" /* For debugfs_path */
+#include "util/probe-finder.h"
+
+/*
+ * Default vmlinux search paths
+ *
+ * Each entry is an snprintf() format string; open_default_vmlinux()
+ * substitutes the running kernel release for the "%s" and tries the
+ * entries in order.
+ */
+#define NR_SEARCH_PATH 3
+const char *default_search_path[NR_SEARCH_PATH] = {
+"/lib/modules/%s/build/vmlinux",               /* Custom build kernel */
+"/usr/lib/debug/lib/modules/%s/vmlinux",       /* Red Hat debuginfo */
+"/boot/vmlinux-debug-%s",                      /* Ubuntu */
+};
+
+#define MAX_PATH_LEN 256
+#define MAX_PROBES 128
+#define MAX_PROBE_ARGS 128
+#define PERFPROBE_GROUP "probe"
+
+/* Session management structure */
+static struct {
+       char *vmlinux;          /* Path given with -k/--vmlinux, if any */
+       char *release;          /* Kernel release, set by open_default_vmlinux() */
+       int need_dwarf;         /* Set when a probe uses a file, a line or a C variable */
+       int nr_probe;           /* Number of entries used in probes[] */
+       struct probe_point probes[MAX_PROBES];  /* Parsed probe definitions */
+} session;
+
+#define semantic_error(msg ...) die("Semantic error :" msg)
+
+/*
+ * Parse a probe point specification into @pp.
+ *
+ * @arg is modified in place: every delimiter found is overwritten with
+ * '\0'.  The pieces that are kept are duplicated with strdup(), so the
+ * caller may free @arg afterwards.  Nothing is returned; any syntax or
+ * semantic error terminates the program.
+ */
+static void parse_probe_point(char *arg, struct probe_point *pp)
+{
+       char *ptr, *tmp;
+       char c, nc = 0;
+       /*
+        * <Syntax>
+        * perf probe SRC:LN
+        * perf probe FUNC[+OFFS|:RLN|%return][@SRC]
+        */
+
+       /*
+        * Cut the leading name off at the first delimiter; nc remembers
+        * which delimiter introduces the token that follows.
+        */
+       ptr = strpbrk(arg, ":+@%");
+       if (ptr) {
+               nc = *ptr;
+               *ptr++ = '\0';
+       }
+
+       /* Check arg is function or file and copy it */
+       /* A name containing '.' is taken to be a file name */
+       if (strchr(arg, '.'))   /* File */
+               pp->file = strdup(arg);
+       else                    /* Function */
+               pp->function = strdup(arg);
+       DIE_IF(pp->file == NULL && pp->function == NULL);
+
+       /* Parse other options */
+       while (ptr) {
+               /*
+                * c is the delimiter that introduced this token, nc the
+                * one that terminates it (and introduces the next one).
+                */
+               arg = ptr;
+               c = nc;
+               ptr = strpbrk(arg, ":+@%");
+               if (ptr) {
+                       nc = *ptr;
+                       *ptr++ = '\0';
+               }
+               switch (c) {
+               case ':':       /* Line number */
+                       pp->line = strtoul(arg, &tmp, 0);
+                       if (*tmp != '\0')
+                               semantic_error("There is non-digit charactor"
+                                               " in line number.");
+                       break;
+               case '+':       /* Byte offset from a symbol */
+                       pp->offset = strtoul(arg, &tmp, 0);
+                       if (*tmp != '\0')
+                               semantic_error("There is non-digit charactor"
+                                               " in offset.");
+                       break;
+               case '@':       /* File name */
+                       if (pp->file)
+                               semantic_error("SRC@SRC is not allowed.");
+                       pp->file = strdup(arg);
+                       DIE_IF(pp->file == NULL);
+                       if (ptr)
+                               semantic_error("@SRC must be the last "
+                                              "option.");
+                       break;
+               case '%':       /* Probe places */
+                       if (strcmp(arg, "return") == 0) {
+                               pp->retprobe = 1;
+                       } else  /* Others not supported yet */
+                               semantic_error("%%%s is not supported.", arg);
+                       break;
+               default:
+                       /*
+                        * NOTE(review): the argument is a non-NULL string
+                        * literal; presumably DIE_IF() treats it as a true
+                        * condition and aborts -- confirm.
+                        */
+                       DIE_IF("Program has a bug.");
+                       break;
+               }
+       }
+
+       /* Exclusion check */
+       if (pp->line && pp->offset)
+               semantic_error("Offset can't be used with line number.");
+       if (!pp->line && pp->file && !pp->function)
+               semantic_error("File always requires line number.");
+       if (pp->offset && !pp->function)
+               semantic_error("Offset requires an entry function.");
+       if (pp->retprobe && !pp->function)
+               semantic_error("Return probe requires an entry function.");
+       if ((pp->offset || pp->line) && pp->retprobe)
+               semantic_error("Offset/Line can't be used with return probe.");
+
+       /* Either of pp->function / pp->file may still be NULL here */
+       pr_debug("symbol:%s file:%s line:%d offset:%d, return:%d\n",
+                pp->function, pp->file, pp->line, pp->offset, pp->retprobe);
+}
+
+/*
+ * Parse an event definition. Note that any error must die.
+ *
+ * @str is a whitespace separated list: the probe point followed by its
+ * arguments.  The result is stored in the next free slot of
+ * session.probes[]; session.need_dwarf is set when the definition needs
+ * debuginfo (file name, line number or a C variable as argument).
+ */
+static void parse_probe_event(const char *str)
+{
+       char *argv[MAX_PROBE_ARGS + 2]; /* Event + probe + args */
+       int argc, i;
+       struct probe_point *pp;
+
+       pr_debug("probe-definition(%d): %s\n", session.nr_probe, str);
+       /*
+        * Check for a free slot before claiming it, so that every one of
+        * the MAX_PROBES entries of session.probes[] can be used.
+        */
+       if (session.nr_probe >= MAX_PROBES)
+               semantic_error("Too many probes");
+       pp = &session.probes[session.nr_probe++];
+
+       /* Separate arguments, similar to argv_split */
+       argc = 0;
+       do {
+               /* Skip separators */
+               while (isspace(*str))
+                       str++;
+
+               /* Add an argument */
+               if (*str != '\0') {
+                       const char *s = str;
+
+                       /* Skip the argument */
+                       while (!isspace(*str) && *str != '\0')
+                               str++;
+
+                       /* Duplicate the argument */
+                       argv[argc] = strndup(s, str - s);
+                       if (argv[argc] == NULL)
+                               die("strndup");
+                       if (++argc == MAX_PROBE_ARGS)
+                               semantic_error("Too many arguments");
+                       /* Report the index the token was actually stored at */
+                       pr_debug("argv[%d]=%s\n", argc - 1, argv[argc - 1]);
+               }
+       } while (*str != '\0');
+       if (!argc)
+               semantic_error("An empty argument.");
+
+       /* Parse probe point */
+       /* parse_probe_point() duplicates what it keeps, so argv[0] can go */
+       parse_probe_point(argv[0], pp);
+       free(argv[0]);
+       if (pp->file || pp->line)
+               session.need_dwarf = 1;
+
+       /* Copy arguments */
+       /* Ownership of the strndup()ed strings moves to pp->args[] */
+       pp->nr_args = argc - 1;
+       if (pp->nr_args > 0) {
+               pp->args = malloc(sizeof(char *) * pp->nr_args);
+               if (!pp->args)
+                       die("malloc");
+               memcpy(pp->args, &argv[1], sizeof(char *) * pp->nr_args);
+       }
+
+       /* Ensure return probe has no C argument */
+       for (i = 0; i < pp->nr_args; i++)
+               if (is_c_varname(pp->args[i])) {
+                       if (pp->retprobe)
+                               semantic_error("You can't specify local"
+                                               " variable for kretprobe");
+                       session.need_dwarf = 1;
+               }
+
+       pr_debug("%d arguments\n", pp->nr_args);
+}
+
+/*
+ * Option callback for -a/--add: parse one probe definition.
+ * A NULL @str is ignored.  Always returns 0.
+ */
+static int opt_add_probe_event(const struct option *opt __used,
+                             const char *str, int unset __used)
+{
+       if (!str)
+               return 0;
+
+       parse_probe_event(str);
+       return 0;
+}
+
+#ifndef NO_LIBDWARF
+/*
+ * Try to open a vmlinux image for the running kernel from the entries of
+ * default_search_path[], in order.
+ *
+ * Returns an open read-only file descriptor on success.  On failure
+ * returns a negative value: -errno when uname() fails, -E2BIG when a
+ * candidate path does not fit in MAX_PATH_LEN, or the result of the last
+ * failed open().  As a side effect session.release is set to the kernel
+ * release string.
+ */
+static int open_default_vmlinux(void)
+{
+       /*
+        * static: session.release keeps pointing into this buffer after
+        * the function has returned, so it must not live on the stack.
+        */
+       static struct utsname uts;
+       char fname[MAX_PATH_LEN];
+       int fd = -1, ret, i;
+
+       ret = uname(&uts);
+       if (ret) {
+               int err = errno;        /* pr_debug() may overwrite errno */
+
+               pr_debug("uname() failed.\n");
+               return -err;
+       }
+       session.release = uts.release;
+       for (i = 0; i < NR_SEARCH_PATH; i++) {
+               ret = snprintf(fname, MAX_PATH_LEN,
+                              default_search_path[i], session.release);
+               if (ret >= MAX_PATH_LEN || ret < 0) {
+                       pr_debug("Filename(%d,%s) is too long.\n", i,
+                               uts.release);
+                       errno = E2BIG;
+                       return -E2BIG;
+               }
+               pr_debug("try to open %s\n", fname);
+               fd = open(fname, O_RDONLY);
+               if (fd >= 0)
+                       break;
+       }
+       return fd;
+}
+#endif
+
+/* NULL-terminated usage lines, passed to parse_options()/usage_with_options() */
+static const char * const probe_usage[] = {
+       "perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
+       "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
+       NULL
+};
+
+/*
+ * Command line options of 'perf probe'.
+ *
+ * The -k/--vmlinux option and the debuginfo related parts of the --add
+ * help text are only compiled in when NO_LIBDWARF is not defined.
+ *
+ * NOTE(review): the help text lists GRP and NAME, which do not appear in
+ * either syntax string of --add -- confirm whether these lines are stale.
+ */
+static const struct option options[] = {
+       OPT_BOOLEAN('v', "verbose", &verbose,
+                   "be more verbose (show parsed arguments, etc)"),
+#ifndef NO_LIBDWARF
+       OPT_STRING('k', "vmlinux", &session.vmlinux, "file",
+               "vmlinux/module pathname"),
+#endif
+       OPT_CALLBACK('a', "add", NULL,
+#ifdef NO_LIBDWARF
+               "FUNC[+OFFS|%return] [ARG ...]",
+#else
+               "FUNC[+OFFS|%return|:RLN][@SRC]|SRC:ALN [ARG ...]",
+#endif
+               "probe point definition, where\n"
+               "\t\tGRP:\tGroup name (optional)\n"
+               "\t\tNAME:\tEvent name\n"
+               "\t\tFUNC:\tFunction name\n"
+               "\t\tOFFS:\tOffset from function entry (in byte)\n"
+               "\t\t%return:\tPut the probe at function return\n"
+#ifdef NO_LIBDWARF
+               "\t\tARG:\tProbe argument (only \n"
+#else
+               "\t\tSRC:\tSource code path\n"
+               "\t\tRLN:\tRelative line number from function entry.\n"
+               "\t\tALN:\tAbsolute line number in file.\n"
+               "\t\tARG:\tProbe argument (local variable name or\n"
+#endif
+               "\t\t\tkprobe-tracer argument format is supported.)\n",
+               opt_add_probe_event),
+       OPT_END()
+};
+
+/*
+ * Write one probe definition @buf to the already opened file @fd and
+ * report it on stdout.
+ *
+ * Dies when nothing was written, on a write error, or when the
+ * definition was only written partially (a truncated definition must not
+ * be reported as added).  Returns the number of bytes written.
+ */
+static int write_new_event(int fd, const char *buf)
+{
+       size_t len = strlen(buf);
+       ssize_t ret;
+
+       ret = write(fd, buf, len);
+       if (ret <= 0 || (size_t)ret != len)
+               die("Failed to create event.");
+
+       printf("Added new event: %s\n", buf);
+
+       return (int)ret;
+}
+
+#define MAX_CMDLEN 256
+
+/*
+ * Build the probe definition "FUNC+OFFS [ARG ...]" for @pp without using
+ * debuginfo and store it in pp->probes[0].
+ *
+ * Returns 1 (the new value of pp->found) on success.  On failure the
+ * buffer is released, pp->probes[0] is reset to NULL and a negative value
+ * is returned: -E2BIG when the definition does not fit in MAX_CMDLEN, or
+ * the negative result of snprintf().  Dies when memory cannot be
+ * allocated.
+ */
+static int synthesize_probe_event(struct probe_point *pp)
+{
+       char *buf;
+       int i, len, ret;
+
+       pp->probes[0] = buf = calloc(MAX_CMDLEN, sizeof(char));
+       if (!buf)
+               die("Failed to allocate memory by calloc.");
+       ret = snprintf(buf, MAX_CMDLEN, "%s+%d", pp->function, pp->offset);
+       if (ret <= 0 || ret >= MAX_CMDLEN)
+               goto error;
+       len = ret;
+
+       for (i = 0; i < pp->nr_args; i++) {
+               ret = snprintf(&buf[len], MAX_CMDLEN - len, " %s",
+                              pp->args[i]);
+               if (ret <= 0 || ret >= MAX_CMDLEN - len)
+                       goto error;
+               len += ret;
+       }
+       pp->found = 1;
+       return pp->found;
+error:
+       free(pp->probes[0]);
+       pp->probes[0] = NULL;   /* do not leave a dangling pointer behind */
+       /* Never report failure as 0: the caller only checks for < 0 */
+       return ret < 0 ? ret : -E2BIG;
+}
+
+/*
+ * Entry point of 'perf probe'.
+ *
+ * 1. Parse the options and the probe definitions.
+ * 2. Unless built with NO_LIBDWARF, look every probe up in the debuginfo
+ *    of the vmlinux image.
+ * 3. Synthesize the definition of every probe that was not found that way
+ *    from its symbol and offset.
+ * 4. Append all definitions to the kprobe_events file below debugfs_path.
+ *
+ * Returns 0 on success; every error terminates the program.
+ *
+ * NOTE(review): pp->function is used to build the event name but may be
+ * NULL for probes given as SRC:LN -- confirm how those should be named.
+ */
+int cmd_probe(int argc, const char **argv, const char *prefix __used)
+{
+       int i, j, fd, ret;
+       struct probe_point *pp;
+       char buf[MAX_CMDLEN];
+
+       argc = parse_options(argc, argv, options, probe_usage,
+                            PARSE_OPT_STOP_AT_NON_OPTION);
+       for (i = 0; i < argc; i++)
+               parse_probe_event(argv[i]);
+
+       if (session.nr_probe == 0)
+               usage_with_options(probe_usage, options);
+
+       if (session.need_dwarf)
+#ifdef NO_LIBDWARF
+               semantic_error("Debuginfo-analysis is not supported");
+#else  /* !NO_LIBDWARF */
+               pr_info("Some probes require debuginfo.\n");
+
+       if (session.vmlinux)
+               fd = open(session.vmlinux, O_RDONLY);
+       else
+               fd = open_default_vmlinux();
+       if (fd < 0) {
+               if (session.need_dwarf)
+                       die("Could not open vmlinux/module file.");
+
+               pr_warning("Could not open vmlinux/module file."
+                          " Try to use symbols.\n");
+               goto end_dwarf;
+       }
+
+       /* Searching probe points */
+       for (j = 0; j < session.nr_probe; j++) {
+               pp = &session.probes[j];
+               if (pp->found)
+                       continue;
+
+               /* Rewind the image; lseek() takes (fd, offset, whence) */
+               lseek(fd, 0, SEEK_SET);
+               ret = find_probepoint(fd, pp);
+               if (ret < 0) {
+                       if (session.need_dwarf)
+                               die("Could not analyze debuginfo.");
+
+                       pr_warning("An error occurred in debuginfo analysis. Try to use symbols.\n");
+                       break;
+               }
+               if (ret == 0)   /* No error but failed to find probe point. */
+                       die("No probe point found.");
+       }
+       close(fd);
+
+end_dwarf:
+#endif /* !NO_LIBDWARF */
+
+       /* Synthesize probes without dwarf */
+       for (j = 0; j < session.nr_probe; j++) {
+               pp = &session.probes[j];
+               if (pp->found)  /* This probe is already found. */
+                       continue;
+
+               ret = synthesize_probe_event(pp);
+               if (ret == -E2BIG)
+                       semantic_error("probe point is too long.");
+               else if (ret < 0)
+                       die("Failed to synthesize a probe point.");
+       }
+
+       /* Setting up probe points */
+       ret = snprintf(buf, MAX_CMDLEN, "%s/../kprobe_events", debugfs_path);
+       if (ret < 0 || ret >= MAX_CMDLEN)
+               die("Path of kprobe_events file is too long.");
+       /*
+        * O_APPEND belongs to the flags; the third argument of open() is
+        * the creation mode and is ignored without O_CREAT.
+        */
+       fd = open(buf, O_WRONLY | O_APPEND);
+       if (fd < 0) {
+               if (errno == ENOENT)
+                       die("kprobe_events file does not exist - please rebuild with CONFIG_KPROBE_TRACER.");
+               else
+                       die("Could not open kprobe_events file: %s",
+                           strerror(errno));
+       }
+       for (j = 0; j < session.nr_probe; j++) {
+               pp = &session.probes[j];
+               if (pp->found == 1) {
+                       ret = snprintf(buf, MAX_CMDLEN, "%c:%s/%s_%x %s\n",
+                               pp->retprobe ? 'r' : 'p', PERFPROBE_GROUP,
+                               pp->function, pp->offset, pp->probes[0]);
+                       /* A truncated command would lose its definition */
+                       if (ret < 0 || ret >= MAX_CMDLEN)
+                               semantic_error("probe point is too long.");
+                       write_new_event(fd, buf);
+               } else
+                       for (i = 0; i < pp->found; i++) {
+                               /* One event per hit: use the i-th definition */
+                               ret = snprintf(buf, MAX_CMDLEN,
+                                       "%c:%s/%s_%x_%d %s\n",
+                                       pp->retprobe ? 'r' : 'p',
+                                       PERFPROBE_GROUP,
+                                       pp->function, pp->offset, i,
+                                       pp->probes[i]);
+                               if (ret < 0 || ret >= MAX_CMDLEN)
+                                       semantic_error("probe point is too long.");
+                               write_new_event(fd, buf);
+                       }
+       }
+       close(fd);
+       return 0;
+}
+
index e97954a0a3d2e23ea10abc08dcdaf8d21f910d1c..9b02d85091fe7b59a955830fefb653278bd3e86f 100644 (file)
@@ -27,5 +27,6 @@ extern int cmd_timechart(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
+extern int cmd_probe(int argc, const char **argv, const char *prefix);
 
 #endif
index d37b16cf18ff83566440e5bffc4aded9e35239ee..d3a6e18e4a5e2fcd9a4c38a1f969c6d622d40e46 100644 (file)
@@ -13,3 +13,4 @@ perf-stat                     mainporcelain common
 perf-timechart                 mainporcelain common
 perf-top                       mainporcelain common
 perf-trace                     mainporcelain common
+perf-probe                     mainporcelain common
index 53359ebb369aca1dd0b4e35be91c35c9e5e82bba..89b82acac7d9d9800f63088a530029e95c1e601b 100644 (file)
@@ -298,6 +298,7 @@ static void handle_internal_command(int argc, const char **argv)
                { "version", cmd_version, 0 },
                { "trace", cmd_trace, 0 },
                { "sched", cmd_sched, 0 },
+               { "probe", cmd_probe, 0 },
        };
        unsigned int i;
        static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
new file mode 100644 (file)
index 0000000..293cdfc
--- /dev/null
@@ -0,0 +1,732 @@
+/*
+ * probe-finder.c : C expression to kprobe event converter
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <ctype.h>
+
+#include "event.h"
+#include "debug.h"
+#include "util.h"
+#include "probe-finder.h"
+
+
+/*
+ * Dwarf_Die Linkage to parent Die
+ *
+ * One link is created for every level visited while walking the DIE tree,
+ * so a callback can climb from the DIE it is given back towards the root.
+ * The link handed to the first level by search_die_from_children() has a
+ * NULL parent.
+ */
+struct die_link {
+       struct die_link *parent;        /* Parent die */
+       Dwarf_Die die;                  /* Current die */
+};
+
+/*
+ * libdwarf handle and error slot shared by every helper in this file.
+ * __dw_debug is set up by dwarf_init() and released by dwarf_finish(),
+ * both called from find_probepoint().
+ */
+static Dwarf_Debug __dw_debug;
+static Dwarf_Error __dw_error;
+
+/*
+ * Generic dwarf analysis helpers
+ */
+
+/*
+ * Register names for 32-bit x86, indexed by the register number that
+ * show_location() decodes from a DWARF location operation.
+ */
+#define X86_32_MAX_REGS 8
+const char *x86_32_regs_table[X86_32_MAX_REGS] = {
+       "%ax",
+       "%cx",
+       "%dx",
+       "%bx",
+       "$stack",       /* Stack address instead of %sp */
+       "%bp",
+       "%si",
+       "%di",
+};
+
+/*
+ * Register names for 64-bit x86, indexed by the register number that
+ * show_location() decodes from a DWARF location operation.
+ * Note that the order differs from the 32-bit table: entry 1 is %dx and
+ * entry 2 is %cx, and %si/%di come before %bp/%sp.
+ */
+#define X86_64_MAX_REGS 16
+const char *x86_64_regs_table[X86_64_MAX_REGS] = {
+       "%ax",
+       "%dx",
+       "%cx",
+       "%bx",
+       "%si",
+       "%di",
+       "%bp",
+       "%sp",
+       "%r8",
+       "%r9",
+       "%r10",
+       "%r11",
+       "%r12",
+       "%r13",
+       "%r14",
+       "%r15",
+};
+
+/*
+ * Pick the register table at build time: the 64-bit table when the tool
+ * itself is compiled for x86_64, the 32-bit table otherwise.
+ * TODO: switching by dwarf address size
+ */
+#ifdef __x86_64__
+#define ARCH_MAX_REGS X86_64_MAX_REGS
+#define arch_regs_table x86_64_regs_table
+#else
+#define ARCH_MAX_REGS X86_32_MAX_REGS
+#define arch_regs_table x86_32_regs_table
+#endif
+
+/*
+ * Return architecture dependent register string (for kprobe-tracer).
+ *
+ * @n: register number, used as an index into arch_regs_table.
+ *
+ * Returns the register name, or NULL when @n is outside the table.
+ */
+static const char *get_arch_regstr(unsigned int n)
+{
+       /* Valid indexes are 0 .. ARCH_MAX_REGS - 1 */
+       return (n < ARCH_MAX_REGS) ? arch_regs_table[n] : NULL;
+}
+
+/*
+ * Compare the tail of two strings.
+ * Return 0 if whole of either string is same as another's tail part.
+ *
+ * The strings are walked backwards from their last character until the
+ * shorter one is exhausted; on the first mismatch the difference of the
+ * two characters is returned.
+ */
+static int strtailcmp(const char *s1, const char *s2)
+{
+       int i1 = strlen(s1);
+       int i2 = strlen(s2);
+
+       /* Go down to index 0 inclusive so the first characters count too */
+       while (--i1 >= 0 && --i2 >= 0) {
+               if (s1[i1] != s2[i2])
+                       return s1[i1] - s2[i2];
+       }
+       return 0;
+}
+
+/*
+ * Look up fname among the source files of a CU.
+ * Returns the 1-based position of the first source file whose tail matches
+ * fname (see strtailcmp()), or 0 when fname is NULL, the source file list
+ * cannot be obtained, or nothing matches.
+ */
+static Dwarf_Unsigned cu_find_fileno(Dwarf_Die cu_die, const char *fname)
+{
+       Dwarf_Unsigned fileno = 0;
+       Dwarf_Signed nr_srcs, idx;
+       char **src_names;
+
+       if (fname == NULL)
+               return 0;
+
+       if (dwarf_srcfiles(cu_die, &src_names, &nr_srcs,
+                          &__dw_error) != DW_DLV_OK)
+               return 0;
+
+       /* Every name is released, but only the first hit is recorded */
+       for (idx = 0; idx < nr_srcs; idx++) {
+               if (!fileno && strtailcmp(src_names[idx], fname) == 0)
+                       fileno = idx + 1;
+               dwarf_dealloc(__dw_debug, src_names[idx], DW_DLA_STRING);
+       }
+       dwarf_dealloc(__dw_debug, src_names, DW_DLA_LIST);
+
+       if (fileno)
+               pr_debug("found fno: %d\n", (int)fileno);
+       return fileno;
+}
+
+/*
+ * Compare the name of a die with tname.
+ * Returns the strcmp() result of tname against the die name, or -1 when
+ * the die has no name. A libdwarf error is fatal.
+ */
+static int die_compare_name(Dwarf_Die dw_die, const char *tname)
+{
+       char *diename;
+       int ret, cmp;
+
+       ret = dwarf_diename(dw_die, &diename, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret != DW_DLV_OK)
+               return -1;
+
+       cmp = strcmp(tname, diename);
+       dwarf_dealloc(__dw_debug, diename, DW_DLA_STRING);
+       return cmp;
+}
+
+/*
+ * Check whether addr lies between the low pc (inclusive) and the high pc
+ * (exclusive) of a subprogram (function) die.
+ * On a hit the distance from the low pc is stored in *offs and 1 is
+ * returned; 0 is returned when the die has no low pc or addr is outside.
+ */
+static int die_within_subprogram(Dwarf_Die sp_die, Dwarf_Addr addr,
+                                Dwarf_Signed *offs)
+{
+       Dwarf_Addr start, end;
+       int ret;
+
+       /* TODO: check ranges */
+       ret = dwarf_lowpc(sp_die, &start, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_NO_ENTRY)
+               return 0;
+
+       ret = dwarf_highpc(sp_die, &end, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       if (addr < start || addr >= end)
+               return 0;
+
+       *offs = addr - start;
+       return 1;
+}
+
+/* Tell whether the die carries a DW_AT_inline attribute */
+static Dwarf_Bool die_inlined_subprogram(Dwarf_Die dw_die)
+{
+       /* TODO: check strictly */
+       Dwarf_Bool has_inline;
+       int ret = dwarf_hasattr(dw_die, DW_AT_inline, &has_inline,
+                               &__dw_error);
+
+       DIE_IF(ret == DW_DLV_ERROR);
+       return has_inline;
+}
+
+/*
+ * Get the offset stored in the DW_AT_abstract_origin attribute of a die.
+ * The attribute must be present and readable with dwarf_formref();
+ * anything else is fatal.
+ */
+static Dwarf_Off die_get_abstract_origin(Dwarf_Die dw_die)
+{
+       Dwarf_Attribute origin;
+       Dwarf_Off origin_offs;
+       int ret;
+
+       ret = dwarf_attr(dw_die, DW_AT_abstract_origin, &origin, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       ret = dwarf_formref(origin, &origin_offs, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       dwarf_dealloc(__dw_debug, origin, DW_DLA_ATTR);
+       return origin_offs;
+}
+
+/*
+ * Get the entry address of a die.
+ * Three sources are tried in order: the DW_AT_entry_pc attribute, the
+ * low pc, and finally the start address of the first entry of the
+ * DW_AT_ranges list. Failing all three is fatal.
+ */
+static Dwarf_Addr die_get_entrypc(Dwarf_Die dw_die)
+{
+       Dwarf_Attribute at;
+       Dwarf_Addr pc;
+       Dwarf_Off range_offs;
+       Dwarf_Ranges *range_list;
+       Dwarf_Signed nr_ranges;
+       int ret;
+
+       /* First choice: explicit entry pc */
+       ret = dwarf_attr(dw_die, DW_AT_entry_pc, &at, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_OK) {
+               ret = dwarf_formaddr(at, &pc, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+               dwarf_dealloc(__dw_debug, at, DW_DLA_ATTR);
+               return pc;
+       }
+
+       /* Second choice: low pc */
+       ret = dwarf_lowpc(dw_die, &pc, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_OK)
+               return pc;
+
+       /* Last resort: beginning of the first range */
+       ret = dwarf_attr(dw_die, DW_AT_ranges, &at, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       ret = dwarf_formref(at, &range_offs, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       ret = dwarf_get_ranges(__dw_debug, range_offs, &range_list,
+                              &nr_ranges, NULL, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       pc = range_list[0].dwr_addr1;
+       dwarf_ranges_dealloc(__dw_debug, range_list, nr_ranges);
+       return pc;
+}
+
+/*
+ * Search a Die from Die tree.
+ * Note: cur_link->die should be deallocated in this function.
+ *
+ * Starting at cur_link->die, die_cb is called on the die, then on its
+ * children (recursively, through a new link whose parent is cur_link),
+ * then on each following sibling in turn.
+ *
+ * Returns the first non-zero value returned by die_cb, or 0 when the
+ * siblings are exhausted without a hit. When die_cb is NULL, 0 is returned
+ * immediately and nothing is visited or deallocated.
+ */
+static int __search_die_tree(struct die_link *cur_link,
+                            int (*die_cb)(struct die_link *, void *),
+                            void *data)
+{
+       Dwarf_Die new_die;
+       struct die_link new_link;
+       int ret;
+
+       if (!die_cb)
+               return 0;
+
+       /* Check current die */
+       while (!(ret = die_cb(cur_link, data))) {
+               /* Check child die */
+               ret = dwarf_child(cur_link->die, &new_die, &__dw_error);
+               DIE_IF(ret == DW_DLV_ERROR);
+               if (ret == DW_DLV_OK) {
+                       new_link.parent = cur_link;
+                       new_link.die = new_die;
+                       /* The recursive call releases the child die */
+                       ret = __search_die_tree(&new_link, die_cb, data);
+                       if (ret)
+                               break;
+               }
+
+               /* Move to next sibling */
+               ret = dwarf_siblingof(__dw_debug, cur_link->die, &new_die,
+                                     &__dw_error);
+               DIE_IF(ret == DW_DLV_ERROR);
+               /* The die just visited is not needed any more */
+               dwarf_dealloc(__dw_debug, cur_link->die, DW_DLA_DIE);
+               cur_link->die = new_die;
+               /* No sibling left: the last die was released just above */
+               if (ret == DW_DLV_NO_ENTRY)
+                       return 0;
+       }
+       /* A callback (here or below) stopped the search: release the die */
+       dwarf_dealloc(__dw_debug, cur_link->die, DW_DLA_DIE);
+       return ret;
+}
+
+/*
+ * Run die_cb over the die tree below parent_die.
+ * Returns whatever __search_die_tree() returns for the first child, or 0
+ * when parent_die has no child at all.
+ */
+static int search_die_from_children(Dwarf_Die parent_die,
+                                   int (*die_cb)(struct die_link *, void *),
+                                   void *data)
+{
+       struct die_link first;
+       int ret;
+
+       first.parent = NULL;
+       ret = dwarf_child(parent_die, &first.die, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret != DW_DLV_OK)
+               return 0;
+
+       return __search_die_tree(&first, die_cb, data);
+}
+
+/*
+ * Find the location description of attr that covers addr.
+ * The first list entry with ld_lopc <= addr < ld_hipc is copied into
+ * *desc; its ld_s array is duplicated with malloc() and has to be
+ * released by the caller with free().
+ * Returns DW_DLV_OK on a hit and DW_DLV_NO_ENTRY otherwise. The list
+ * obtained from libdwarf is always given back before returning.
+ */
+static int attr_get_locdesc(Dwarf_Attribute attr, Dwarf_Locdesc *desc,
+                           Dwarf_Addr addr)
+{
+       Dwarf_Locdesc **locs;
+       Dwarf_Signed nr_locs;
+       int ret, i;
+
+       ret = dwarf_loclist_n(attr, &locs, &nr_locs, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       ret = DW_DLV_NO_ENTRY;
+
+       for (i = 0; i < nr_locs; ++i) {
+               Dwarf_Locdesc *cur = locs[i];
+
+               /* Only the first covering entry is copied out */
+               if (ret != DW_DLV_OK &&
+                   cur->ld_lopc <= addr && cur->ld_hipc > addr) {
+                       memcpy(desc, cur, sizeof(Dwarf_Locdesc));
+                       desc->ld_s =
+                               malloc(sizeof(Dwarf_Loc) * cur->ld_cents);
+                       DIE_IF(desc->ld_s == NULL);
+                       memcpy(desc->ld_s, cur->ld_s,
+                               sizeof(Dwarf_Loc) * cur->ld_cents);
+                       ret = DW_DLV_OK;
+               }
+               /* Each entry goes back to libdwarf, copied or not */
+               dwarf_dealloc(__dw_debug, cur->ld_s, DW_DLA_LOC_BLOCK);
+               dwarf_dealloc(__dw_debug, cur, DW_DLA_LOCDESC);
+       }
+       dwarf_dealloc(__dw_debug, locs, DW_DLA_LIST);
+       return ret;
+}
+
+/*
+ * Get decl_file attribute value (file number).
+ * The die must carry a readable DW_AT_decl_file attribute; a missing
+ * attribute or a failed read is fatal.
+ */
+static Dwarf_Unsigned die_get_decl_file(Dwarf_Die sp_die)
+{
+       Dwarf_Attribute attr;
+       Dwarf_Unsigned fno;
+       int ret;
+
+       ret = dwarf_attr(sp_die, DW_AT_decl_file, &attr, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       /* Keep the result so that the check below tests this very call */
+       ret = dwarf_formudata(attr, &fno, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
+       return fno;
+}
+
+/*
+ * Get decl_line attribute value (line number).
+ * The die must carry a readable DW_AT_decl_line attribute; a missing
+ * attribute or a failed read is fatal.
+ */
+static Dwarf_Unsigned die_get_decl_line(Dwarf_Die sp_die)
+{
+       Dwarf_Attribute attr;
+       Dwarf_Unsigned lno;
+       int ret;
+
+       ret = dwarf_attr(sp_die, DW_AT_decl_line, &attr, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       /* Keep the result so that the check below tests this very call */
+       ret = dwarf_formudata(attr, &lno, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+       dwarf_dealloc(__dw_debug, attr, DW_DLA_ATTR);
+       return lno;
+}
+
+/*
+ * Probe finder related functions
+ */
+
+/*
+ * Show a location.
+ *
+ * Translate one DWARF location operation into a kprobe event argument and
+ * write it to pf->buf (at most pf->len bytes) as " <var>=<reg>" or, when
+ * the value has to be read from memory, " <var>=<+/-offset>(<reg>)".
+ *
+ * Handled operations are DW_OP_reg0..31, DW_OP_breg0..31, DW_OP_regx,
+ * DW_OP_bregx and DW_OP_fbreg; the latter is resolved through the first
+ * operation of the current frame base (pf->fbloc). Any other operation,
+ * an unknown register number or a too small buffer is fatal.
+ */
+static void show_location(Dwarf_Loc *loc, struct probe_finder *pf)
+{
+       Dwarf_Small op;
+       Dwarf_Unsigned regn;
+       Dwarf_Signed offs;
+       int deref = 0, ret;
+       const char *regs;
+
+       op = loc->lr_atom;
+
+       /* If this is based on frame base, set the offset */
+       if (op == DW_OP_fbreg) {
+               deref = 1;
+               offs = (Dwarf_Signed)loc->lr_number;
+               /* Continue decoding with the frame base operation */
+               op = pf->fbloc.ld_s[0].lr_atom;
+               loc = &pf->fbloc.ld_s[0];
+       } else
+               offs = 0;
+
+       if (op >= DW_OP_breg0 && op <= DW_OP_breg31) {
+               regn = op - DW_OP_breg0;
+               offs += (Dwarf_Signed)loc->lr_number;
+               deref = 1;
+       } else if (op >= DW_OP_reg0 && op <= DW_OP_reg31) {
+               regn = op - DW_OP_reg0;
+       } else if (op == DW_OP_bregx) {
+               regn = loc->lr_number;
+               offs += (Dwarf_Signed)loc->lr_number2;
+               deref = 1;
+       } else if (op == DW_OP_regx) {
+               regn = loc->lr_number;
+       } else
+               die("Dwarf_OP %d is not supported.\n", op);
+
+       regs = get_arch_regstr(regn);
+       if (!regs)
+               /* regn is unsigned: print it as such */
+               die("%llu exceeds max register number.\n",
+                   (unsigned long long)regn);
+
+       if (deref)
+               ret = snprintf(pf->buf, pf->len,
+                                " %s=%+lld(%s)", pf->var, offs, regs);
+       else
+               ret = snprintf(pf->buf, pf->len, " %s=%s", pf->var, regs);
+       DIE_IF(ret < 0);
+       DIE_IF(ret >= pf->len);
+}
+
+/*
+ * Show a variable in kprobe event format.
+ * The DW_AT_location of vr_die is resolved for the probe address
+ * (relative to the CU base) and handed to show_location(). A variable
+ * without a location at this address is fatal.
+ */
+static void show_variable(Dwarf_Die vr_die, struct probe_finder *pf)
+{
+       Dwarf_Attribute loc_attr;
+       Dwarf_Locdesc ld;
+
+       if (dwarf_attr(vr_die, DW_AT_location, &loc_attr,
+                      &__dw_error) != DW_DLV_OK ||
+           attr_get_locdesc(loc_attr, &ld,
+                            (pf->addr - pf->cu_base)) != DW_DLV_OK)
+               die("Failed to find the location of %s at this address.\n"
+                   " Perhaps, it has been optimized out.\n", pf->var);
+
+       /* TODO? */
+       DIE_IF(ld.ld_cents != 1);
+       show_location(&ld.ld_s[0], pf);
+       free(ld.ld_s);
+       dwarf_dealloc(__dw_debug, loc_attr, DW_DLA_ATTR);
+}
+
+/*
+ * Die tree callback looking for the variable named pf->var.
+ * Returns 1 (stop searching) after showing a formal parameter or variable
+ * with that name, 0 (keep searching) for every other die.
+ */
+static int variable_callback(struct die_link *dlink, void *data)
+{
+       struct probe_finder *pf = (struct probe_finder *)data;
+       Dwarf_Half tag;
+       int ret;
+
+       ret = dwarf_tag(dlink->die, &tag, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+
+       /* TODO: Support struct members and arrays */
+       if (tag != DW_TAG_formal_parameter && tag != DW_TAG_variable)
+               return 0;
+       if (die_compare_name(dlink->die, pf->var) != 0)
+               return 0;
+
+       show_variable(dlink->die, pf);
+       return 1;
+}
+
+/*
+ * Find a variable in a subprogram die and append it to pf->buf.
+ * An argument that does not look like a C variable name is copied as is;
+ * a C variable name that cannot be found below sp_die is fatal.
+ */
+static void find_variable(Dwarf_Die sp_die, struct probe_finder *pf)
+{
+       int ret;
+
+       if (is_c_varname(pf->var)) {
+               pr_debug("Searching '%s' variable in context.\n", pf->var);
+               /* Local variables and parameters are children of the die */
+               ret = search_die_from_children(sp_die, variable_callback, pf);
+               if (!ret)
+                       die("Failed to find '%s' in this function.\n",
+                           pf->var);
+               return;
+       }
+
+       /* Output raw parameters */
+       ret = snprintf(pf->buf, pf->len, " %s", pf->var);
+       DIE_IF(ret < 0);
+       DIE_IF(ret >= pf->len);
+}
+
+/*
+ * Load into pf->fbloc the frame base location of sp_die that is valid at
+ * the probe address. A die without a usable DW_AT_frame_base is fatal.
+ * Release the result with free_current_frame_base().
+ */
+static void get_current_frame_base(Dwarf_Die sp_die, struct probe_finder *pf)
+{
+       Dwarf_Attribute fb_attr;
+       int ret;
+
+       ret = dwarf_attr(sp_die, DW_AT_frame_base, &fb_attr, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       ret = attr_get_locdesc(fb_attr, &pf->fbloc, (pf->addr - pf->cu_base));
+       DIE_IF(ret != DW_DLV_OK);
+
+       dwarf_dealloc(__dw_debug, fb_attr, DW_DLA_ATTR);
+}
+
+/* Release the frame base location held in pf->fbloc and clear it */
+static void free_current_frame_base(struct probe_finder *pf)
+{
+       Dwarf_Locdesc *fb = &pf->fbloc;
+
+       free(fb->ld_s);
+       memset(fb, 0, sizeof(*fb));
+}
+
+/*
+ * Show a probe point to output buffer.
+ *
+ * @sp_die: subprogram die containing the probe address (pf->addr)
+ * @offs:   offset of the probe address inside that subprogram
+ * @pf:     finder state; pf->pp receives the result
+ *
+ * Builds "<function>+<offs>" (or the raw address when the die has no
+ * name) followed by one entry per requested argument, and stores a copy
+ * in the next free slot of pp->probes[]. pp->function and pp->offset are
+ * filled in when the caller did not give a function name.
+ * Running out of slots, memory or buffer space is fatal.
+ */
+static void show_probepoint(Dwarf_Die sp_die, Dwarf_Signed offs,
+                           struct probe_finder *pf)
+{
+       struct probe_point *pp = pf->pp;
+       char *name;
+       char tmp[MAX_PROBE_BUFFER];
+       int ret, i, len;
+
+       /* pp->probes[] has room for MAX_PROBES entries only */
+       if (pp->found >= MAX_PROBES)
+               die("Too many (> %d) probe points found.\n", MAX_PROBES);
+
+       /* Output name of probe point */
+       ret = dwarf_diename(sp_die, &name, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (ret == DW_DLV_OK) {
+               ret = snprintf(tmp, MAX_PROBE_BUFFER, "%s+%u", name,
+                               (unsigned int)offs);
+               /* Copy the function name if possible */
+               if (!pp->function) {
+                       pp->function = strdup(name);
+                       DIE_IF(pp->function == NULL);
+                       pp->offset = offs;
+               }
+               dwarf_dealloc(__dw_debug, name, DW_DLA_STRING);
+       } else {
+               /* This function has no name. */
+               ret = snprintf(tmp, MAX_PROBE_BUFFER, "0x%llx", pf->addr);
+               if (!pp->function) {
+                       /* TODO: Use _stext */
+                       pp->function = strdup("");
+                       DIE_IF(pp->function == NULL);
+                       pp->offset = (int)pf->addr;
+               }
+       }
+       DIE_IF(ret < 0);
+       DIE_IF(ret >= MAX_PROBE_BUFFER);
+       len = ret;
+       pr_debug("Probe point found: %s\n", tmp);
+
+       /* Find each argument */
+       get_current_frame_base(sp_die, pf);
+       for (i = 0; i < pp->nr_args; i++) {
+               pf->var = pp->args[i];
+               /* Each argument is appended right after the previous one */
+               pf->buf = &tmp[len];
+               pf->len = MAX_PROBE_BUFFER - len;
+               find_variable(sp_die, pf);
+               len += strlen(pf->buf);
+       }
+       free_current_frame_base(pf);
+
+       pp->probes[pp->found] = strdup(tmp);
+       DIE_IF(pp->probes[pp->found] == NULL);
+       pp->found++;
+}
+
+/*
+ * Die tree callback looking for the subprogram that contains pf->addr.
+ * Returns 1 (stop searching) after showing the probe point, 0 otherwise.
+ */
+static int probeaddr_callback(struct die_link *dlink, void *data)
+{
+       struct probe_finder *pf = (struct probe_finder *)data;
+       Dwarf_Signed offs;
+       Dwarf_Half tag;
+       int ret;
+
+       ret = dwarf_tag(dlink->die, &tag, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+
+       if (tag != DW_TAG_subprogram)
+               return 0;
+       /* Check the address is in this subprogram */
+       if (!die_within_subprogram(dlink->die, pf->addr, &offs))
+               return 0;
+
+       show_probepoint(dlink->die, offs, pf);
+       return 1;
+}
+
+/*
+ * Find probe points from a line number.
+ * Every line table entry of the current CU that belongs to file pf->fno
+ * and line pf->lno yields an address, for which the enclosing subprogram
+ * is searched and shown. An address outside of all subprograms is fatal.
+ */
+static void find_by_line(struct probe_finder *pf)
+{
+       Dwarf_Line *linebuf;
+       Dwarf_Signed nr_lines, idx, column;
+       Dwarf_Unsigned cur_line = 0;
+       Dwarf_Unsigned cur_fno;
+       Dwarf_Addr pc;
+       int ret;
+
+       ret = dwarf_srclines(pf->cu_die, &linebuf, &nr_lines, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       for (idx = 0; idx < nr_lines; idx++) {
+               /* Skip entries of other files */
+               ret = dwarf_line_srcfileno(linebuf[idx], &cur_fno,
+                                          &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+               if (cur_fno != pf->fno)
+                       continue;
+
+               /* Skip entries of other lines */
+               ret = dwarf_lineno(linebuf[idx], &cur_line, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+               if (cur_line != pf->lno)
+                       continue;
+
+               ret = dwarf_lineoff(linebuf[idx], &column, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+
+               ret = dwarf_lineaddr(linebuf[idx], &pc, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+               pr_debug("Probe line found: line[%d]:%u,%d addr:0x%llx\n",
+                        (int)idx, (unsigned)cur_line, (int)column, pc);
+               pf->addr = pc;
+
+               /* Search a real subprogram including this line, */
+               ret = search_die_from_children(pf->cu_die,
+                                              probeaddr_callback, pf);
+               if (ret == 0)
+                       die("Probe point is not found in subprograms.\n");
+               /* Continuing, because target line might be inlined. */
+       }
+       dwarf_srclines_dealloc(__dw_debug, linebuf, nr_lines);
+}
+
+/*
+ * Search function from function name.
+ *
+ * Die tree callback used by find_by_func(). For a subprogram die named
+ * pp->function:
+ *  - with pp->line set, the probe is looked up by line, relative to the
+ *    declaration line of the function, and the search stops (returns 1);
+ *  - for an inline definition, its offset is remembered in pf->inl_offs
+ *    and the search goes on (returns 0);
+ *  - otherwise the probe is placed at entry pc + pp->offset and the
+ *    search stops (returns 1).
+ * For an inlined_subroutine die whose abstract origin is the remembered
+ * inline definition, the probe is shown relative to the closest
+ * non-inlined parent subprogram and the search goes on (returns 0).
+ */
+static int probefunc_callback(struct die_link *dlink, void *data)
+{
+       struct probe_finder *pf = (struct probe_finder *)data;
+       struct probe_point *pp = pf->pp;
+       struct die_link *lk;
+       Dwarf_Signed offs;
+       Dwarf_Half tag;
+       int ret;
+
+       ret = dwarf_tag(dlink->die, &tag, &__dw_error);
+       DIE_IF(ret == DW_DLV_ERROR);
+       if (tag == DW_TAG_subprogram) {
+               if (die_compare_name(dlink->die, pp->function) == 0) {
+                       if (pp->line) { /* Function relative line */
+                               pf->fno = die_get_decl_file(dlink->die);
+                               pf->lno = die_get_decl_line(dlink->die)
+                                        + pp->line;
+                               find_by_line(pf);
+                               return 1;
+                       }
+                       if (die_inlined_subprogram(dlink->die)) {
+                               /* Inlined function, save it. */
+                               ret = dwarf_die_CU_offset(dlink->die,
+                                                         &pf->inl_offs,
+                                                         &__dw_error);
+                               DIE_IF(ret != DW_DLV_OK);
+                               pr_debug("inline definition offset %lld\n",
+                                        pf->inl_offs);
+                               return 0;       /* Continue to search */
+                       }
+                       /* Get probe address */
+                       pf->addr = die_get_entrypc(dlink->die);
+                       pf->addr += pp->offset;
+                       /* TODO: Check the address in this function */
+                       show_probepoint(dlink->die, pp->offset, pf);
+                       return 1; /* Exit; no same symbol in this CU. */
+               }
+       } else if (tag == DW_TAG_inlined_subroutine && pf->inl_offs) {
+               if (die_get_abstract_origin(dlink->die) == pf->inl_offs) {
+                       /* Get probe address */
+                       pf->addr = die_get_entrypc(dlink->die);
+                       pf->addr += pp->offset;
+                       pr_debug("found inline addr: 0x%llx\n", pf->addr);
+                       /* Inlined function. Get a real subprogram */
+                       for (lk = dlink->parent; lk != NULL; lk = lk->parent) {
+                               tag = 0;
+                               /* Check the result of this very call */
+                               ret = dwarf_tag(lk->die, &tag, &__dw_error);
+                               DIE_IF(ret == DW_DLV_ERROR);
+                               if (tag == DW_TAG_subprogram &&
+                                   !die_inlined_subprogram(lk->die))
+                                       goto found;
+                       }
+                       die("Failed to find real subprogram.\n");
+found:
+                       /* Get offset from subprogram */
+                       ret = die_within_subprogram(lk->die, pf->addr, &offs);
+                       DIE_IF(!ret);
+                       show_probepoint(lk->die, offs, pf);
+                       /* Continue to search */
+               }
+       }
+       return 0;
+}
+
+/*
+ * Find probe points by function name: run probefunc_callback() over the
+ * die tree below the current CU die. The search result is not inspected
+ * here; hits are recorded in pf->pp by the callback.
+ */
+static void find_by_func(struct probe_finder *pf)
+{
+       search_die_from_children(pf->cu_die, probefunc_callback, pf);
+}
+
+/*
+ * Find a probe point.
+ *
+ * Reads the dwarf information available through fd and walks every CU
+ * (Compilation Unit). CUs that do not include pp->file are skipped when a
+ * file name is given; the others are searched by function name when
+ * pp->function is set and by line number otherwise.
+ *
+ * Returns the number of probe points found (also left in pp->found), or
+ * -ENOENT when no dwarf information can be read from fd.
+ */
+int find_probepoint(int fd, struct probe_point *pp)
+{
+       struct probe_finder pf = {.pp = pp};
+       Dwarf_Unsigned next_cuh = 0;
+       Dwarf_Half addr_size = 0;
+       int cu_number, ret;
+
+       ret = dwarf_init(fd, DW_DLC_READ, 0, 0, &__dw_debug, &__dw_error);
+       if (ret != DW_DLV_OK) {
+               pr_warning("No dwarf info found in the vmlinux - please rebuild with CONFIG_DEBUG_INFO.\n");
+               return -ENOENT;
+       }
+
+       pp->found = 0;
+       for (cu_number = 1; cu_number; cu_number++) {
+               /* Search CU (Compilation Unit) */
+               ret = dwarf_next_cu_header(__dw_debug, NULL, NULL, NULL,
+                       &addr_size, &next_cuh, &__dw_error);
+               DIE_IF(ret == DW_DLV_ERROR);
+               if (ret == DW_DLV_NO_ENTRY)
+                       break;
+
+               /* Get the DIE(Debugging Information Entry) of this CU */
+               ret = dwarf_siblingof(__dw_debug, 0, &pf.cu_die, &__dw_error);
+               DIE_IF(ret != DW_DLV_OK);
+
+               /* Check if target file is included. */
+               if (pp->file)
+                       pf.fno = cu_find_fileno(pf.cu_die, pp->file);
+
+               if (pp->file && !pf.fno) {
+                       /* The target file is not part of this CU */
+                       dwarf_dealloc(__dw_debug, pf.cu_die, DW_DLA_DIE);
+                       continue;
+               }
+
+               /* Save CU base address (for frame_base) */
+               ret = dwarf_lowpc(pf.cu_die, &pf.cu_base, &__dw_error);
+               DIE_IF(ret == DW_DLV_ERROR);
+               if (ret == DW_DLV_NO_ENTRY)
+                       pf.cu_base = 0;
+
+               if (pp->function) {
+                       find_by_func(&pf);
+               } else {
+                       pf.lno = pp->line;
+                       find_by_line(&pf);
+               }
+               dwarf_dealloc(__dw_debug, pf.cu_die, DW_DLA_DIE);
+       }
+       ret = dwarf_finish(__dw_debug, &__dw_error);
+       DIE_IF(ret != DW_DLV_OK);
+
+       return pp->found;
+}
+
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
new file mode 100644 (file)
index 0000000..bdebca6
--- /dev/null
@@ -0,0 +1,57 @@
+#ifndef _PROBE_FINDER_H
+#define _PROBE_FINDER_H
+
+/* Size limits of the probe finder */
+/* NOTE(review): MAX_PATH_LEN has no user in this header - presumably a path buffer size, confirm against callers */
+#define MAX_PATH_LEN 256
+/* Size of the buffer in which one probe definition is built */
+#define MAX_PROBE_BUFFER 1024
+/* Number of entries of probe_point::probes[] */
+#define MAX_PROBES 128
+
+/*
+ * Tell whether name looks like a C variable name.
+ * Only the first character is examined: a letter or an underscore makes
+ * the function return non-zero, anything else (including an empty
+ * string) makes it return 0.
+ */
+static inline int is_c_varname(const char *name)
+{
+       /* TODO */
+       /* <ctype.h> functions require a value in the unsigned char range */
+       return isalpha((unsigned char)name[0]) || name[0] == '_';
+}
+
+/*
+ * Description of one requested probe.
+ * The input fields are read by find_probepoint(); function and offset are
+ * additionally filled in by the finder when no function name was given.
+ * Each probe point found adds one strdup()ed definition to probes[] and
+ * increments found.
+ */
+struct probe_point {
+       /* Inputs */
+       char    *file;          /* File name */
+       int     line;           /* Line number */
+
+       char    *function;      /* Function name */
+       int     offset;         /* Offset bytes */
+
+       int     nr_args;        /* Number of arguments */
+       char    **args;         /* Arguments */
+
+       int     retprobe;       /* Return probe */
+
+       /* Output */
+       int     found;          /* Number of found probe points */
+       char    *probes[MAX_PROBES];    /* Output buffers (will be allocated)*/
+};
+
+#ifndef NO_LIBDWARF
+extern int find_probepoint(int fd, struct probe_point *pp);
+
+#include <libdwarf/dwarf.h>
+#include <libdwarf/libdwarf.h>
+
+/*
+ * Working state of one find_probepoint() run, passed as the data pointer
+ * to the die tree callbacks.
+ */
+struct probe_finder {
+       struct probe_point      *pp;    /* Target probe point */
+
+       /* For function searching */
+       Dwarf_Addr      addr;           /* Address */
+       Dwarf_Unsigned  fno;            /* File number */
+       Dwarf_Unsigned  lno;            /* Line number */
+       Dwarf_Off       inl_offs;       /* Inline offset */
+       Dwarf_Die       cu_die;         /* Current CU */
+
+       /* For variable searching */
+       Dwarf_Addr      cu_base;        /* Current CU base address */
+       Dwarf_Locdesc   fbloc;          /* Location of Current Frame Base */
+       const char      *var;           /* Current variable name */
+       char            *buf;           /* Current output buffer */
+       int             len;            /* Length of output buffer */
+};
+#endif /* NO_LIBDWARF */
+
+#endif /*_PROBE_FINDER_H */
index 7bd5bdaeb2357344318a9966e556b9de4b7fe6a2..f2203a0946bcbb04917b5473f9788ae62dbc149d 100644 (file)
@@ -134,6 +134,15 @@ extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1,
 extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
 extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
 
+#include "../../../include/linux/stringify.h"
+
+/*
+ * DIE_IF(cnd) - terminate through die() when cnd is true.
+ * The message names the source file, the line and the text of the
+ * condition. These are passed as arguments rather than pasted into the
+ * format string, so a '%' in the condition or in the file name is printed
+ * literally.
+ */
+#define DIE_IF(cnd)    \
+       do { if (cnd)   \
+               die(" at (%s:%d): %s\n", __FILE__, __LINE__,            \
+                   __stringify(cnd));                                  \
+       } while (0)
+
+
 extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
 
 extern int prefixcmp(const char *str, const char *prefix);