Merge branch 'auto-ftrace-next' into tracing/for-linus
author Ingo Molnar <mingo@elte.hu>
Mon, 14 Jul 2008 14:11:52 +0000 (16:11 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 14 Jul 2008 14:11:52 +0000 (16:11 +0200)
Conflicts:

arch/x86/kernel/entry_32.S
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/lib/Makefile
include/asm-x86/irqflags.h
kernel/Makefile
kernel/sched.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
100 files changed:
Documentation/tracers/mmiotrace.txt [new file with mode: 0644]
Makefile
arch/arm/Kconfig
arch/arm/boot/compressed/Makefile
arch/arm/kernel/Makefile
arch/arm/kernel/armksyms.c
arch/arm/kernel/entry-common.S
arch/arm/kernel/ftrace.c [new file with mode: 0644]
arch/arm/kernel/kprobes.c
arch/powerpc/Kconfig
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/entry_32.S
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/ftrace.c [new file with mode: 0644]
arch/powerpc/kernel/io.c
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/ppc_ksyms.c
arch/powerpc/kernel/setup_32.c
arch/powerpc/platforms/powermac/Makefile
arch/sparc64/Kconfig
arch/sparc64/Kconfig.debug
arch/sparc64/kernel/Makefile
arch/sparc64/kernel/ftrace.c [new file with mode: 0644]
arch/sparc64/kernel/sparc64_ksyms.c
arch/sparc64/lib/mcount.S
arch/x86/Kconfig
arch/x86/Kconfig.debug
arch/x86/kernel/Makefile
arch/x86/kernel/alternative.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/ftrace.c [new file with mode: 0644]
arch/x86/kernel/i386_ksyms_32.c
arch/x86/kernel/machine_kexec_32.c
arch/x86/kernel/machine_kexec_64.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/vsyscall_64.c
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/lib/Makefile
arch/x86/lib/thunk_32.S [new file with mode: 0644]
arch/x86/lib/thunk_64.S
arch/x86/mm/Makefile
arch/x86/mm/fault.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/ioremap.c
arch/x86/mm/kmmio.c [new file with mode: 0644]
arch/x86/mm/mmio-mod.c [new file with mode: 0644]
arch/x86/mm/pageattr.c
arch/x86/mm/pf_in.c [new file with mode: 0644]
arch/x86/mm/pf_in.h [new file with mode: 0644]
arch/x86/mm/testmmiotrace.c [new file with mode: 0644]
arch/x86/vdso/vclock_gettime.c
arch/x86/vdso/vgetcpu.c
include/asm-arm/ftrace.h [new file with mode: 0644]
include/asm-arm/kprobes.h
include/asm-powerpc/ftrace.h [new file with mode: 0644]
include/asm-powerpc/hw_irq.h
include/asm-sparc64/ftrace.h [new file with mode: 0644]
include/asm-x86/alternative.h
include/asm-x86/ftrace.h [new file with mode: 0644]
include/asm-x86/irqflags.h
include/asm-x86/vsyscall.h
include/linux/ftrace.h [new file with mode: 0644]
include/linux/irqflags.h
include/linux/kprobes.h
include/linux/linkage.h
include/linux/marker.h
include/linux/mmiotrace.h [new file with mode: 0644]
include/linux/preempt.h
include/linux/sched.h
include/linux/writeback.h
kernel/Makefile
kernel/fork.c
kernel/lockdep.c
kernel/marker.c
kernel/printk.c
kernel/sched.c
kernel/semaphore.c
kernel/spinlock.c
kernel/sysctl.c
kernel/trace/Kconfig [new file with mode: 0644]
kernel/trace/Makefile [new file with mode: 0644]
kernel/trace/ftrace.c [new file with mode: 0644]
kernel/trace/trace.c [new file with mode: 0644]
kernel/trace/trace.h [new file with mode: 0644]
kernel/trace/trace_functions.c [new file with mode: 0644]
kernel/trace/trace_irqsoff.c [new file with mode: 0644]
kernel/trace/trace_mmiotrace.c [new file with mode: 0644]
kernel/trace/trace_sched_switch.c [new file with mode: 0644]
kernel/trace/trace_sched_wakeup.c [new file with mode: 0644]
kernel/trace/trace_selftest.c [new file with mode: 0644]
kernel/trace/trace_selftest_dynamic.c [new file with mode: 0644]
kernel/trace/trace_sysprof.c [new file with mode: 0644]
lib/Kconfig.debug
lib/Makefile
lib/smp_processor_id.c
mm/page-writeback.c
scripts/Makefile.lib

diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt
new file mode 100644 (file)
index 0000000..a4afb56
--- /dev/null
@@ -0,0 +1,164 @@
+               In-kernel memory-mapped I/O tracing
+
+
+Home page and links to optional user space tools:
+
+       http://nouveau.freedesktop.org/wiki/MmioTrace
+
+MMIO tracing was originally developed by Intel around 2003 for their Fault
+Injection Test Harness. In Dec 2006 - Jan 2007, using the code from Intel,
+Jeff Muizelaar created a tool for tracing MMIO accesses with the Nouveau
+project in mind. Since then many people have contributed.
+
+Mmiotrace was built for reverse engineering any memory-mapped IO device with
+the Nouveau project as the first real user. Only x86 and x86_64 architectures
+are supported.
+
+Out-of-tree mmiotrace was originally modified for mainline inclusion and
+ftrace framework by Pekka Paalanen <pq@iki.fi>.
+
+
+Preparation
+-----------
+
+Mmiotrace feature is compiled in by the CONFIG_MMIOTRACE option. Tracing is
+disabled by default, so it is safe to have this set to yes. SMP systems are
+supported, but tracing is unreliable and may miss events if more than one CPU
+is on-line, therefore mmiotrace takes all but one CPU off-line during run-time
+activation. You can re-enable CPUs by hand, but you have been warned, there
+is no way to automatically detect if you are losing events due to CPUs racing.
+
+
+Usage Quick Reference
+---------------------
+
+$ mount -t debugfs debugfs /debug
+$ echo mmiotrace > /debug/tracing/current_tracer
+$ cat /debug/tracing/trace_pipe > mydump.txt &
+Start X or whatever.
+$ echo "X is up" > /debug/tracing/marker
+$ echo none > /debug/tracing/current_tracer
+Check for lost events.
+
+
+Usage
+-----
+
+Make sure debugfs is mounted to /debug. If not, mount it (requires root privileges):
+$ mount -t debugfs debugfs /debug
+
+Check that the driver you are about to trace is not loaded.
+
+Activate mmiotrace (requires root privileges):
+$ echo mmiotrace > /debug/tracing/current_tracer
+
+Start storing the trace:
+$ cat /debug/tracing/trace_pipe > mydump.txt &
+The 'cat' process should stay running (sleeping) in the background.
+
+Load the driver you want to trace and use it. Mmiotrace will only catch MMIO
+accesses to areas that are ioremapped while mmiotrace is active.
+
+[Unimplemented feature:]
+During tracing you can place comments (markers) into the trace by
+$ echo "X is up" > /debug/tracing/marker
+This makes it easier to see which part of the (huge) trace corresponds to
+which action. It is recommended to place descriptive markers about what you
+do.
+
+Shut down mmiotrace (requires root privileges):
+$ echo none > /debug/tracing/current_tracer
+The 'cat' process exits. If it does not, kill it by issuing 'fg' command and
+pressing ctrl+c.
+
+Check that mmiotrace did not lose events due to a buffer filling up. Either
+$ grep -i lost mydump.txt
+which tells you exactly how many events were lost, or use
+$ dmesg
+to view your kernel log and look for "mmiotrace has lost events" warning. If
+events were lost, the trace is incomplete. You should enlarge the buffers and
+try again. Buffers are enlarged by first seeing how large the current buffers
+are:
+$ cat /debug/tracing/trace_entries
+gives you a number. Approximately double this number and write it back, for
+instance:
+$ echo 128000 > /debug/tracing/trace_entries
+Then start again from the top.
+
+If you are doing a trace for a driver project, e.g. Nouveau, you should also
+do the following before sending your results:
+$ lspci -vvv > lspci.txt
+$ dmesg > dmesg.txt
+$ tar zcf pciid-nick-mmiotrace.tar.gz mydump.txt lspci.txt dmesg.txt
+and then send the .tar.gz file. The trace compresses considerably. Replace
+"pciid" and "nick" with the PCI ID or model name of your piece of hardware
+under investigation and your nick name.
+
+
+How Mmiotrace Works
+-------------------
+
+Access to hardware IO-memory is gained by mapping addresses from PCI bus by
+calling one of the ioremap_*() functions. Mmiotrace is hooked into the
+__ioremap() function and gets called whenever a mapping is created. Mapping is
+an event that is recorded into the trace log. Note that ISA range mappings
+are not caught, since the mapping always exists and is returned directly.
+
+MMIO accesses are recorded via page faults. Just before __ioremap() returns,
+the mapped pages are marked as not present. Any access to the pages causes a
+fault. The page fault handler calls mmiotrace to handle the fault. Mmiotrace
+marks the page present, sets TF flag to achieve single stepping and exits the
+fault handler. The instruction that faulted is executed and debug trap is
+entered. Here mmiotrace again marks the page as not present. The instruction
+is decoded to get the type of operation (read/write), data width and the value
+read or written. These are stored to the trace log.
+
+Setting the page present in the page fault handler has a race condition on SMP
+machines. During the single stepping other CPUs may run freely on that page
+and events can be missed without notice. Re-enabling other CPUs during
+tracing is discouraged.
+
+
+Trace Log Format
+----------------
+
+The raw log is text and easily filtered with e.g. grep and awk. One record is
+one line in the log. A record starts with a keyword, followed by keyword
+dependent arguments. Arguments are separated by a space, or continue until the
+end of line. The format for version 20070824 is as follows:
+
+Explanation    Keyword Space separated arguments
+---------------------------------------------------------------------------
+
+read event     R       width, timestamp, map id, physical, value, PC, PID
+write event    W       width, timestamp, map id, physical, value, PC, PID
+ioremap event  MAP     timestamp, map id, physical, virtual, length, PC, PID
+iounmap event  UNMAP   timestamp, map id, PC, PID
+marker         MARK    timestamp, text
+version                VERSION the string "20070824"
+info for reader        LSPCI   one line from lspci -v
+PCI address map        PCIDEV  space separated /proc/bus/pci/devices data
+unk. opcode    UNKNOWN timestamp, map id, physical, data, PC, PID
+
+Timestamp is in seconds with decimals. Physical is a PCI bus address, virtual
+is a kernel virtual address. Width is the data width in bytes and value is the
+data value. Map id is an arbitrary id number identifying the mapping that was
+used in an operation. PC is the program counter and PID is process id. PC is
+zero if it is not recorded. PID is always zero as tracing MMIO accesses
+originating in user space memory is not yet supported.
+
+For instance, the following awk filter will pass all 32-bit writes that target
+physical addresses in the range [0xfb73ce40, 0xfb800000[
+
+$ awk '/W 4 / { adr=strtonum($5); if (adr >= 0xfb73ce40 &&
+adr < 0xfb800000) print; }'
+
+
+Tools for Developers
+--------------------
+
+The user space tools include utilities for:
+- replacing numeric addresses and values with hardware register names
+- replaying MMIO logs, i.e., re-executing the recorded writes
+
+
index e3c5eb66ec52dee13127e3b2b83f84c3184bd8be..4ac1d2f71ac3598df817199fc87d573831e848a2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -528,6 +528,10 @@ KBUILD_CFLAGS      += -g
 KBUILD_AFLAGS  += -gdwarf-2
 endif
 
+ifdef CONFIG_FTRACE
+KBUILD_CFLAGS  += -pg
+endif
+
 # We trigger additional mismatches with less inlining
 ifdef CONFIG_DEBUG_SECTION_MISMATCH
 KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
index b786e68914d4a20b12b743ee886b9acdbed623d7..3845e5c8a34f9c873c6eaba0c9cfe171a030fcd6 100644 (file)
@@ -14,6 +14,8 @@ config ARM
        select HAVE_OPROFILE
        select HAVE_KPROBES if (!XIP_KERNEL)
        select HAVE_KRETPROBES if (HAVE_KPROBES)
+       select HAVE_FTRACE if (!XIP_KERNEL)
+       select HAVE_DYNAMIC_FTRACE if (HAVE_FTRACE)
        help
          The ARM series is a line of low-power-consumption RISC chip designs
          licensed by ARM Ltd and targeted at embedded applications and
index de9d9ee50958089bcf8fb7217fd1c48617d6097c..95baac4939e09a3809e427ed82207c5f3c1270c7 100644 (file)
@@ -69,6 +69,12 @@ SEDFLAGS     = s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/
 
 targets       := vmlinux vmlinux.lds piggy.gz piggy.o font.o font.c \
                 head.o misc.o $(OBJS)
+
+ifeq ($(CONFIG_FTRACE),y)
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
+endif
+
 EXTRA_CFLAGS  := -fpic -fno-builtin
 EXTRA_AFLAGS  :=
 
index ad455ff5aebe5379841cede6d11ae29c504b8841..eb9092ca80080cd4431951e46c6f2ce8daf70ab8 100644 (file)
@@ -4,6 +4,10 @@
 
 AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET)
 
+ifdef CONFIG_DYNAMIC_FTRACE
+CFLAGS_REMOVE_ftrace.o = -pg
+endif
+
 # Object file lists.
 
 obj-y          := compat.o entry-armv.o entry-common.o irq.o \
@@ -18,6 +22,7 @@ obj-$(CONFIG_ARTHUR)          += arthur.o
 obj-$(CONFIG_ISA_DMA)          += dma-isa.o
 obj-$(CONFIG_PCI)              += bios32.o isa.o
 obj-$(CONFIG_SMP)              += smp.o
+obj-$(CONFIG_DYNAMIC_FTRACE)   += ftrace.o
 obj-$(CONFIG_KEXEC)            += machine_kexec.o relocate_kernel.o
 obj-$(CONFIG_KPROBES)          += kprobes.o kprobes-decode.o
 obj-$(CONFIG_ATAGS_PROC)       += atags.o
index 688b7b1ee416c54987239740680ffbd29fdeb3b0..cc7b246e9652033d9593006f2821f99aad102fe3 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/io.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
+#include <asm/ftrace.h>
 
 /*
  * libgcc functions - functions that are used internally by the
@@ -181,3 +182,7 @@ EXPORT_SYMBOL(_find_next_bit_be);
 #endif
 
 EXPORT_SYMBOL(copy_page);
+
+#ifdef CONFIG_FTRACE
+EXPORT_SYMBOL(mcount);
+#endif
index 597ed00a08d87e68fa84cb81256179283668d821..84694e88b4289d07d2b952f69a1ff644ca010fb2 100644 (file)
@@ -9,6 +9,7 @@
  */
 
 #include <asm/unistd.h>
+#include <asm/ftrace.h>
 #include <asm/arch/entry-macro.S>
 
 #include "entry-header.S"
@@ -99,6 +100,56 @@ ENTRY(ret_from_fork)
 #undef CALL
 #define CALL(x) .long x
 
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * mcount for CONFIG_DYNAMIC_FTRACE.
+ *
+ * Saves r0-r3 and lr, sets r0 = lr - MCOUNT_INSN_SIZE and issues a "bl" at
+ * the global label mcount_call, which initially targets ftrace_stub.
+ * ftrace_mcount_set() in arch/arm/kernel/ftrace.c rewrites that "bl".
+ */
+ENTRY(mcount)
+       stmdb sp!, {r0-r3, lr}
+       mov r0, lr
+       sub r0, r0, #MCOUNT_INSN_SIZE
+
+       .globl mcount_call
+mcount_call:
+       bl ftrace_stub
+       /* restore r0-r3 and return by loading the saved lr into pc */
+       ldmia sp!, {r0-r3, pc}
+
+/*
+ * ftrace_caller for CONFIG_DYNAMIC_FTRACE.
+ *
+ * Saves r0-r3 and lr, loads r1 from [fp, #-4] and sets
+ * r0 = lr - MCOUNT_INSN_SIZE, then issues a "bl" at the global label
+ * ftrace_call, which initially targets ftrace_stub.
+ * ftrace_update_ftrace_func() in arch/arm/kernel/ftrace.c rewrites that "bl".
+ *
+ * NOTE(review): [fp, #-4] is presumably the saved lr of the instrumented
+ * function's caller; this depends on the frame layout - confirm.
+ */
+ENTRY(ftrace_caller)
+       stmdb sp!, {r0-r3, lr}
+       ldr r1, [fp, #-4]
+       mov r0, lr
+       sub r0, r0, #MCOUNT_INSN_SIZE
+
+       .globl ftrace_call
+ftrace_call:
+       bl ftrace_stub
+       /* restore r0-r3 and return by loading the saved lr into pc */
+       ldmia sp!, {r0-r3, pc}
+
+#else
+
+/*
+ * mcount without CONFIG_DYNAMIC_FTRACE.
+ *
+ * Loads the current value of ftrace_trace_function. If it is ftrace_stub,
+ * returns immediately; otherwise branches to "trace", which calls it with
+ * r0 = lr - MCOUNT_INSN_SIZE and r1 = the word at [fp, #-4].
+ */
+ENTRY(mcount)
+       stmdb sp!, {r0-r3, lr}
+       ldr r0, =ftrace_trace_function
+       ldr r2, [r0]
+       adr r0, ftrace_stub
+       cmp r0, r2
+       bne trace
+       /* nothing to call: restore r0-r3 and return */
+       ldmia sp!, {r0-r3, pc}
+
+trace:
+       ldr r1, [fp, #-4]
+       mov r0, lr
+       sub r0, r0, #MCOUNT_INSN_SIZE
+       /* indirect call: lr is set up so the callee returns to the ldmia below */
+       mov lr, pc
+       mov pc, r2
+       ldmia sp!, {r0-r3, pc}
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+       /* Do-nothing target: returns straight to the caller. */
+       .globl ftrace_stub
+ftrace_stub:
+       mov pc, lr
+
+#endif /* CONFIG_FTRACE */
+
 /*=============================================================================
  * SWI handler
  *-----------------------------------------------------------------------------
diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
new file mode 100644 (file)
index 0000000..76d50e6
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Dynamic function tracing support.
+ *
+ * Copyright (C) 2008 Abhishek Sagar <sagar.abhishek@gmail.com>
+ *
+ * For licencing details, see COPYING.
+ *
+ * Defines low-level handling of mcount calls when the kernel
+ * is compiled with the -pg flag. When using dynamic ftrace, the
+ * mcount call-sites get patched lazily with NOP till they are
+ * enabled. All code mutation routines here take effect atomically.
+ */
+
+#include <linux/ftrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/ftrace.h>
+
+/* Added to the branch address before the displacement is computed */
+#define PC_OFFSET      8
+/* BL instruction with a zero offset field; the offset is OR-ed in */
+#define BL_OPCODE      0xeb000000
+/* The low 24 bits of a BL hold the (word) offset */
+#define BL_OFFSET_MASK 0x00ffffff
+
+/* Holds the instruction most recently built by ftrace_call_replace() */
+static unsigned long bl_insn;
+static const unsigned long NOP = 0xe1a00000; /* mov r0, r0 */
+
+/*
+ * Return a pointer to the no-op instruction ("mov r0, r0") held in NOP.
+ *
+ * NOP is const: callers must only read through the returned pointer.
+ */
+unsigned char *ftrace_nop_replace(void)
+{
+       /* cast to the declared return type (was a signedness mismatch) */
+       return (unsigned char *)&NOP;
+}
+
+/*
+ * Build the "bl addr" instruction as it would be encoded at address pc.
+ *
+ * The encoding is written to the file-scope bl_insn and a pointer to it
+ * is returned, so every call overwrites the previous result. Returns NULL
+ * (after a one-time warning) when addr cannot be reached from pc.
+ */
+unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr)
+{
+       long disp = (long)addr - (long)(pc + PC_OFFSET);
+       int reachable = disp >= -33554432 && disp <= 33554428;
+
+       if (unlikely(!reachable)) {
+               /* Can't generate branches that far (from ARM ARM). Ftrace
+                * doesn't generate branches outside of kernel text.
+                */
+               WARN_ON_ONCE(1);
+               return NULL;
+       }
+
+       /* the offset field holds the displacement in words */
+       bl_insn = BL_OPCODE | ((disp >> 2) & BL_OFFSET_MASK);
+       return (unsigned char *)&bl_insn;
+}
+
+/*
+ * Replace the instruction word at pc with *new_code, provided the word
+ * currently there equals *old_code.
+ *
+ * Returns:
+ *   0 - the word at pc matched *old_code and was replaced, or it already
+ *       matched *new_code and was left alone
+ *   1 - the load or the store at pc faulted (handled through the
+ *       __ex_table/.fixup entries below)
+ *   2 - the word at pc matched neither *old_code nor *new_code
+ *
+ * The instruction cache is flushed only when a store was performed.
+ */
+int ftrace_modify_code(unsigned long pc, unsigned char *old_code,
+                      unsigned char *new_code)
+{
+       unsigned long err = 0, replaced = 0, old, new;
+
+       old = *(unsigned long *)old_code;
+       new = *(unsigned long *)new_code;
+
+       /*
+        * %0 = err, %1 = replaced, %2 = pc, %3 = new, %4 = old.
+        * 1: read the current word; if it equals old, 2: store new.
+        * Otherwise compare it against new and set err = 2 on mismatch.
+        * A fault at 1: or 2: lands at 4:, which sets err = 1.
+        */
+       __asm__ __volatile__ (
+               "1:  ldr    %1, [%2]  \n"
+               "    cmp    %1, %4    \n"
+               "2:  streq  %3, [%2]  \n"
+               "    cmpne  %1, %3    \n"
+               "    movne  %0, #2    \n"
+               "3:\n"
+
+               ".section .fixup, \"ax\"\n"
+               "4:  mov  %0, #1  \n"
+               "    b    3b      \n"
+               ".previous\n"
+
+               ".section __ex_table, \"a\"\n"
+               "    .long 1b, 4b \n"
+               "    .long 2b, 4b \n"
+               ".previous\n"
+
+               : "=r"(err), "=r"(replaced)
+               : "r"(pc), "r"(new), "r"(old), "0"(err), "1"(replaced)
+               : "memory");
+
+       /* replaced == old means the store above actually happened */
+       if (!err && (replaced == old))
+               flush_icache_range(pc, pc + MCOUNT_INSN_SIZE);
+
+       return err;
+}
+
+/*
+ * Retarget the "bl" at the ftrace_call site so that it calls func.
+ *
+ * Returns 0 on success and non-zero on failure: either the result of
+ * ftrace_modify_code(), or 1 when no branch to func can be encoded.
+ */
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+       unsigned long pc = (unsigned long)&ftrace_call;
+       unsigned long old;
+       unsigned char *new;
+
+       /* the instruction currently at the site is the expected old value */
+       memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE);
+
+       new = ftrace_call_replace(pc, (unsigned long)func);
+       if (!new)
+               /* out of BL range: do not hand NULL to ftrace_modify_code() */
+               return 1;
+
+       return ftrace_modify_code(pc, (unsigned char *)&old, new);
+}
+
+/*
+ * Retarget the "bl" at the mcount_call site.
+ *
+ * @data: on entry, points to the address the site should call; on return
+ *        it holds the outcome instead: 0 on success, non-zero on failure
+ *        (the result of ftrace_modify_code(), or 1 when no branch to the
+ *        requested address can be encoded).
+ *
+ * Always returns 0; the outcome is reported through *data only.
+ */
+int ftrace_mcount_set(unsigned long *data)
+{
+       unsigned long pc = (unsigned long)&mcount_call;
+       unsigned long old;
+       unsigned char *new;
+
+       /* the instruction currently at the site is the expected old value */
+       memcpy(&old, &mcount_call, MCOUNT_INSN_SIZE);
+
+       new = ftrace_call_replace(pc, *data);
+       if (!new) {
+               /* out of BL range: do not hand NULL to ftrace_modify_code() */
+               *data = 1;
+               return 0;
+       }
+
+       *data = ftrace_modify_code(pc, (unsigned char *)&old, new);
+       return 0;
+}
+
+/*
+ * Boot-time setup, run from kstop_machine: hands data on to
+ * ftrace_mcount_set(). Always returns 0.
+ */
+int __init ftrace_dyn_arch_init(void *data)
+{
+       unsigned long *target = data;
+
+       ftrace_mcount_set(target);
+
+       return 0;
+}
index 5593dd207216bd0d2d1d8319eb8ab06e2bc13714..5ee39e10c8d18a37eb8004e91b0e5c046bd2eff7 100644 (file)
@@ -274,7 +274,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
  * for kretprobe handlers which should normally be interested in r0 only
  * anyway.
  */
-static void __attribute__((naked)) __kprobes kretprobe_trampoline(void)
+void __naked __kprobes kretprobe_trampoline(void)
 {
        __asm__ __volatile__ (
                "stmdb  sp!, {r0 - r11}         \n\t"
index 3934e2659407b2769aaf81fba5c08a3873ed7fe0..a5e9912e2d3773fdab23a4bc52ec5e147f47e1cf 100644 (file)
@@ -105,11 +105,13 @@ config ARCH_NO_VIRT_TO_BUS
 config PPC
        bool
        default y
+       select HAVE_DYNAMIC_FTRACE
+       select HAVE_FTRACE
        select HAVE_IDE
-       select HAVE_OPROFILE
        select HAVE_KPROBES
        select HAVE_KRETPROBES
        select HAVE_LMB
+       select HAVE_OPROFILE
 
 config EARLY_PRINTK
        bool
index 2346d271fbfdbc5bb8323a588de5124e3237cabf..f3f5e26414322af110730cd9ca14b190509c2574 100644 (file)
@@ -12,6 +12,18 @@ CFLAGS_prom_init.o      += -fPIC
 CFLAGS_btext.o         += -fPIC
 endif
 
+ifdef CONFIG_FTRACE
+# Do not trace early boot code
+CFLAGS_REMOVE_cputable.o = -pg
+CFLAGS_REMOVE_prom_init.o = -pg
+
+ifdef CONFIG_DYNAMIC_FTRACE
+# dynamic ftrace setup.
+CFLAGS_REMOVE_ftrace.o = -pg
+endif
+
+endif
+
 obj-y                          := cputable.o ptrace.o syscalls.o \
                                   irq.o align.o signal_32.o pmc.o vdso.o \
                                   init_task.o process.o systbl.o idle.o \
@@ -78,6 +90,8 @@ obj-$(CONFIG_KEXEC)           += machine_kexec.o crash.o \
 obj-$(CONFIG_AUDIT)            += audit.o
 obj64-$(CONFIG_AUDIT)          += compat_audit.o
 
+obj-$(CONFIG_DYNAMIC_FTRACE)   += ftrace.o
+
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
 ifneq ($(CONFIG_PPC_INDIRECT_IO),y)
index 0c8614d9875ca8b5966c148c7ad447bbcf3a987d..7231a708af0d6ceacde2b0e04b1c2f7110c3aa6e 100644 (file)
@@ -30,6 +30,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
+#include <asm/ftrace.h>
 
 #undef SHOW_SYSCALLS
 #undef SHOW_SYSCALLS_TASK
@@ -1035,3 +1036,129 @@ machine_check_in_rtas:
        /* XXX load up BATs and panic */
 
 #endif /* CONFIG_PPC_RTAS */
+
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * mcount/_mcount for CONFIG_DYNAMIC_FTRACE.
+ *
+ * Builds a 48-byte frame and saves r3-r10, LR (at 44) and CR (at 8), then
+ * issues the "bl" at the global label mcount_call with
+ * r3 = LR - MCOUNT_INSN_SIZE; the "bl" initially targets ftrace_stub.
+ *
+ * On the way out the LR value seen on entry is moved to CTR and used as
+ * the return target (bctr), while LR is reloaded from 52(r1), i.e. the
+ * word 4 bytes above the stack pointer on entry.
+ */
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+       stwu    r1,-48(r1)
+       stw     r3, 12(r1)
+       stw     r4, 16(r1)
+       stw     r5, 20(r1)
+       stw     r6, 24(r1)
+       mflr    r3
+       stw     r7, 28(r1)
+       mfcr    r5
+       stw     r8, 32(r1)
+       stw     r9, 36(r1)
+       stw     r10,40(r1)
+       stw     r3, 44(r1)
+       stw     r5, 8(r1)
+       subi    r3, r3, MCOUNT_INSN_SIZE
+       .globl mcount_call
+mcount_call:
+       bl      ftrace_stub
+       nop
+       /* restore CR and r3-r10, undo the frame and return through CTR */
+       lwz     r6, 8(r1)
+       lwz     r0, 44(r1)
+       lwz     r3, 12(r1)
+       mtctr   r0
+       lwz     r4, 16(r1)
+       mtcr    r6
+       lwz     r5, 20(r1)
+       lwz     r6, 24(r1)
+       lwz     r0, 52(r1)
+       lwz     r7, 28(r1)
+       lwz     r8, 32(r1)
+       mtlr    r0
+       lwz     r9, 36(r1)
+       lwz     r10,40(r1)
+       addi    r1, r1, 48
+       bctr
+
+/*
+ * ftrace_caller for CONFIG_DYNAMIC_FTRACE.
+ *
+ * Same save/restore sequence as _mcount, but additionally loads r4 from
+ * 52(r1) (the word 4 bytes above the stack pointer on entry) before
+ * issuing the "bl" at the global label ftrace_call with
+ * r3 = LR - MCOUNT_INSN_SIZE; the "bl" initially targets ftrace_stub.
+ */
+_GLOBAL(ftrace_caller)
+       /* Based off of objdump output from glibc */
+       stwu    r1,-48(r1)
+       stw     r3, 12(r1)
+       stw     r4, 16(r1)
+       stw     r5, 20(r1)
+       stw     r6, 24(r1)
+       mflr    r3
+       lwz     r4, 52(r1)
+       mfcr    r5
+       stw     r7, 28(r1)
+       stw     r8, 32(r1)
+       stw     r9, 36(r1)
+       stw     r10,40(r1)
+       stw     r3, 44(r1)
+       stw     r5, 8(r1)
+       subi    r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+       bl      ftrace_stub
+       nop
+       /* restore CR and r3-r10, undo the frame and return through CTR */
+       lwz     r6, 8(r1)
+       lwz     r0, 44(r1)
+       lwz     r3, 12(r1)
+       mtctr   r0
+       lwz     r4, 16(r1)
+       mtcr    r6
+       lwz     r5, 20(r1)
+       lwz     r6, 24(r1)
+       lwz     r0, 52(r1)
+       lwz     r7, 28(r1)
+       lwz     r8, 32(r1)
+       mtlr    r0
+       lwz     r9, 36(r1)
+       lwz     r10,40(r1)
+       addi    r1, r1, 48
+       bctr
+#else
+/*
+ * mcount/_mcount without CONFIG_DYNAMIC_FTRACE.
+ *
+ * Builds a 48-byte frame, saves r3-r10, LR (at 44) and CR (at 8), then
+ * calls the function whose address is stored in ftrace_trace_function
+ * with r3 = LR - MCOUNT_INSN_SIZE and r4 = the word at 52(r1).
+ * Registers are restored afterwards and control returns through CTR to
+ * the LR value seen on entry.
+ */
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+       stwu    r1,-48(r1)
+       stw     r3, 12(r1)
+       stw     r4, 16(r1)
+       stw     r5, 20(r1)
+       stw     r6, 24(r1)
+       mflr    r3
+       lwz     r4, 52(r1)
+       mfcr    r5
+       stw     r7, 28(r1)
+       stw     r8, 32(r1)
+       stw     r9, 36(r1)
+       stw     r10,40(r1)
+       stw     r3, 44(r1)
+       stw     r5, 8(r1)
+
+       subi    r3, r3, MCOUNT_INSN_SIZE
+       /* indirect call through the current ftrace_trace_function value */
+       LOAD_REG_ADDR(r5, ftrace_trace_function)
+       lwz     r5,0(r5)
+
+       mtctr   r5
+       bctrl
+
+       nop
+
+       /* restore CR and r3-r10, undo the frame and return through CTR */
+       lwz     r6, 8(r1)
+       lwz     r0, 44(r1)
+       lwz     r3, 12(r1)
+       mtctr   r0
+       lwz     r4, 16(r1)
+       mtcr    r6
+       lwz     r5, 20(r1)
+       lwz     r6, 24(r1)
+       lwz     r0, 52(r1)
+       lwz     r7, 28(r1)
+       lwz     r8, 32(r1)
+       mtlr    r0
+       lwz     r9, 36(r1)
+       lwz     r10,40(r1)
+       addi    r1, r1, 48
+       bctr
+#endif
+
+/* Do-nothing target: returns straight to the caller. */
+_GLOBAL(ftrace_stub)
+       blr
+
+#endif /* CONFIG_FTRACE */
index c0db5b769e55ee601e0d11e428cb6acc2c90d4bc..2f511a969d2cfb9e9db66f12aa41a293ce32e026 100644 (file)
@@ -31,6 +31,7 @@
 #include <asm/bug.h>
 #include <asm/ptrace.h>
 #include <asm/irqflags.h>
+#include <asm/ftrace.h>
 
 /*
  * System calls.
@@ -870,3 +871,67 @@ _GLOBAL(enter_prom)
        ld      r0,16(r1)
        mtlr    r0
         blr
+
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * mcount/_mcount for CONFIG_DYNAMIC_FTRACE.
+ *
+ * Reads LR into r3, builds a 112-byte frame and stores LR at 128(r1)
+ * (16 bytes above the stack pointer on entry), then issues the "bl" at
+ * the global label mcount_call with r3 = LR - MCOUNT_INSN_SIZE; the "bl"
+ * initially targets ftrace_stub. Afterwards LR is reloaded, the frame is
+ * removed and the routine returns.
+ */
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+       /* Taken from output of objdump from lib64/glibc */
+       mflr    r3
+       stdu    r1, -112(r1)
+       std     r3, 128(r1)
+       subi    r3, r3, MCOUNT_INSN_SIZE
+       .globl mcount_call
+mcount_call:
+       bl      ftrace_stub
+       nop
+       ld      r0, 128(r1)
+       mtlr    r0
+       addi    r1, r1, 112
+       blr
+
+/*
+ * ftrace_caller for CONFIG_DYNAMIC_FTRACE.
+ *
+ * Like _mcount, but also follows the back chain (r11 = 0(r1) on entry)
+ * and loads r4 from 16(r11) before issuing the "bl" at the global label
+ * ftrace_call with r3 = LR - MCOUNT_INSN_SIZE; the "bl" initially targets
+ * ftrace_stub.
+ *
+ * The epilogue falls through into ftrace_stub, whose single "blr" serves
+ * both as this routine's return and as the do-nothing target.
+ */
+_GLOBAL(ftrace_caller)
+       /* Taken from output of objdump from lib64/glibc */
+       mflr    r3
+       ld      r11, 0(r1)
+       stdu    r1, -112(r1)
+       std     r3, 128(r1)
+       ld      r4, 16(r11)
+       subi    r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+       bl      ftrace_stub
+       nop
+       ld      r0, 128(r1)
+       mtlr    r0
+       addi    r1, r1, 112
+_GLOBAL(ftrace_stub)
+       blr
+#else
+/*
+ * mcount/_mcount without CONFIG_DYNAMIC_FTRACE.
+ *
+ * mcount returns immediately. _mcount builds a 112-byte frame, stores LR
+ * at 128(r1) and calls through ftrace_trace_function with
+ * r3 = LR - MCOUNT_INSN_SIZE and r4 = the word at 16(r11), where r11 is
+ * the back chain read from 0(r1) on entry.
+ *
+ * The epilogue falls through into ftrace_stub, whose single "blr" serves
+ * both as _mcount's return and as the do-nothing target.
+ */
+_GLOBAL(mcount)
+       blr
+
+_GLOBAL(_mcount)
+       /* Taken from output of objdump from lib64/glibc */
+       mflr    r3
+       ld      r11, 0(r1)
+       stdu    r1, -112(r1)
+       std     r3, 128(r1)
+       ld      r4, 16(r11)
+
+       subi    r3, r3, MCOUNT_INSN_SIZE
+       /*
+        * Two loads: first the value of ftrace_trace_function, then the
+        * word that value points to, which is what gets called.
+        */
+       LOAD_REG_ADDR(r5,ftrace_trace_function)
+       ld      r5,0(r5)
+       ld      r5,0(r5)
+       mtctr   r5
+       bctrl
+
+       nop
+       ld      r0, 128(r1)
+       mtlr    r0
+       addi    r1, r1, 112
+_GLOBAL(ftrace_stub)
+       blr
+
+#endif
+#endif
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
new file mode 100644 (file)
index 0000000..3855ceb
--- /dev/null
@@ -0,0 +1,154 @@
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks goes out to P.A. Semi, Inc for supplying me with a PPC64 box.
+ *
+ */
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/cacheflush.h>
+#include <asm/ftrace.h>
+
+
+/* PowerPC no-op instruction ("ori 0,0,0"); returned by ftrace_nop_replace() */
+static unsigned int ftrace_nop = 0x60000000;
+
+/*
+ * GET_ADDR(addr): turn a function "address" into the address to branch to.
+ * The argument and the expansion are parenthesized so the macro is safe
+ * inside larger expressions.
+ */
+#ifdef CONFIG_PPC32
+# define GET_ADDR(addr) (addr)
+#else
+/* PowerPC64's functions are data that points to the functions */
+# define GET_ADDR(addr) (*(unsigned long *)(addr))
+#endif
+
+
+/* Distance from ip to addr, truncated to 32 bits. */
+static unsigned int notrace ftrace_calc_offset(long ip, long addr)
+{
+       long delta = addr - ip;
+
+       return (int)delta;
+}
+
+/* Return a pointer to the no-op instruction held in ftrace_nop. */
+notrace unsigned char *ftrace_nop_replace(void)
+{
+       /* cast to the declared return type (was a signedness mismatch) */
+       return (unsigned char *)&ftrace_nop;
+}
+
+/*
+ * Build the "bl addr" instruction as it would be encoded at address ip.
+ *
+ * Only the instruction word is produced; no code is modified here (which
+ * is why create_function_call is not used). The word lives in a static
+ * buffer, so every call overwrites the previous result.
+ */
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+       static unsigned int op;
+       unsigned int offset;
+
+       addr = GET_ADDR(addr);
+
+       /* keep only the bits that form the branch offset field */
+       offset = ftrace_calc_offset(ip, addr) & 0x03fffffc;
+
+       /* Set to "bl addr" */
+       op = 0x48000001 | offset;
+
+       /*
+        * No locking needed, this must be called via kstop_machine
+        * which in essence is like running on a uniprocessor machine.
+        */
+       return (unsigned char *)&op;
+}
+
+/*
+ * Assembler directives used for the __ex_table entry emitted by
+ * ftrace_modify_code(): alignment and the pointer-sized data directive
+ * for 64-bit and 32-bit builds respectively.
+ */
+#ifdef CONFIG_PPC64
+# define _ASM_ALIGN    " .align 3 "
+# define _ASM_PTR      " .llong "
+#else
+# define _ASM_ALIGN    " .align 2 "
+# define _ASM_PTR      " .long "
+#endif
+
+/*
+ * Replace the instruction word at ip with *new_code, provided the word
+ * currently there equals *old_code.
+ *
+ * Returns:
+ *   0 - the word at ip matched *old_code and was replaced, or it already
+ *       matched *new_code and was left alone
+ *   1 - reading the word at ip faulted
+ *   2 - the word at ip matched neither *old_code nor *new_code
+ */
+notrace int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+                  unsigned char *new_code)
+{
+       unsigned replaced;
+       unsigned old = *(unsigned *)old_code;
+       unsigned new = *(unsigned *)new_code;
+       int faulted = 0;
+
+       /*
+        * Note: Due to modules and __init, code can
+        *  disappear and change, we need to protect against faulting
+        *  as well as code changing.
+        *
+        * No real locking needed, this code is run through
+        * kstop_machine.
+        *
+        * Constraints: "replaced" is written by the first instruction
+        * while the inputs are still needed, hence the earlyclobber
+        * ("=&r"). "ip" is used as a base register, hence "b" so that r0
+        * (which reads as zero in that position) is never picked.
+        */
+       asm volatile (
+               "1: lwz         %1, 0(%2)\n"
+               "   cmpw        %1, %5\n"
+               "   bne         2f\n"
+               "   stwu        %3, 0(%2)\n"
+               "2:\n"
+               ".section .fixup, \"ax\"\n"
+               "3:     li %0, 1\n"
+               "       b 2b\n"
+               ".previous\n"
+               ".section __ex_table,\"a\"\n"
+               _ASM_ALIGN "\n"
+               _ASM_PTR "1b, 3b\n"
+               ".previous"
+               : "=r"(faulted), "=&r"(replaced)
+               : "b"(ip), "r"(new),
+                 "0"(faulted), "r"(old)
+               : "memory");
+
+       /*
+        * "replaced" is only meaningful when the load succeeded; do not
+        * let a stale register value turn a fault (1) into a mismatch (2).
+        */
+       if (!faulted && replaced != old && replaced != new)
+               faulted = 2;
+
+       if (!faulted)
+               flush_icache_range(ip, ip + 8);
+
+       return faulted;
+}
+
+/*
+ * Retarget the "bl" at the ftrace_call site so that it calls func.
+ * Returns the result of ftrace_modify_code().
+ */
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+       unsigned char current_insn[MCOUNT_INSN_SIZE];
+       unsigned long site = (unsigned long)(&ftrace_call);
+       unsigned char *branch;
+
+       /* the instruction currently at the site is the expected old value */
+       memcpy(current_insn, &ftrace_call, MCOUNT_INSN_SIZE);
+       branch = ftrace_call_replace(site, (unsigned long)func);
+
+       return ftrace_modify_code(site, current_insn, branch);
+}
+
+/*
+ * Retarget the "bl" at the mcount_call site.
+ *
+ * @data: on entry, points to the address the site should call (the ip
+ *        recorder function); on return it holds the result of
+ *        ftrace_modify_code() instead.
+ *
+ * Always returns 0; the outcome is reported through *data only.
+ */
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+       unsigned char current_insn[MCOUNT_INSN_SIZE];
+       unsigned long site = (long)(&mcount_call);
+       unsigned char *branch;
+
+       /* the instruction currently at the site is the expected old value */
+       memcpy(current_insn, &mcount_call, MCOUNT_INSN_SIZE);
+       branch = ftrace_call_replace(site, *data);
+       *data = ftrace_modify_code(site, current_insn, branch);
+
+       return 0;
+}
+
+/*
+ * Boot-time setup, running in kstop_machine: hands data on to
+ * ftrace_mcount_set(). Always returns 0.
+ */
+int __init ftrace_dyn_arch_init(void *data)
+{
+       unsigned long *target = data;
+
+       ftrace_mcount_set(target);
+       return 0;
+}
+
index e31aca9208eba4a7cc711f52fdb5c755829fe0dc..1882bf419fa6f294db6adf46e15d2a428493c414 100644 (file)
@@ -120,7 +120,8 @@ EXPORT_SYMBOL(_outsl_ns);
 
 #define IO_CHECK_ALIGN(v,a) ((((unsigned long)(v)) & ((a) - 1)) == 0)
 
-void _memset_io(volatile void __iomem *addr, int c, unsigned long n)
+notrace void
+_memset_io(volatile void __iomem *addr, int c, unsigned long n)
 {
        void *p = (void __force *)addr;
        u32 lc = c;
index bcc249d90c4deee52b8d6af1d711dc493e8fb7ad..dcc946e670991c7f972141ee162b814789fd4435 100644 (file)
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_desc);
 
 int distribute_irqs = 1;
 
-static inline unsigned long get_hard_enabled(void)
+static inline notrace unsigned long get_hard_enabled(void)
 {
        unsigned long enabled;
 
@@ -108,13 +108,13 @@ static inline unsigned long get_hard_enabled(void)
        return enabled;
 }
 
-static inline void set_soft_enabled(unsigned long enable)
+static inline notrace void set_soft_enabled(unsigned long enable)
 {
        __asm__ __volatile__("stb %0,%1(13)"
        : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
-void raw_local_irq_restore(unsigned long en)
+notrace void raw_local_irq_restore(unsigned long en)
 {
        /*
         * get_paca()->soft_enabled = en;
index d3ac631cbd2628946ac32985d9bcb90db82339a6..a8d02506468aeb5762827cdd080f70d17032f9d5 100644 (file)
@@ -42,6 +42,7 @@
 #include <asm/div64.h>
 #include <asm/signal.h>
 #include <asm/dcr.h>
+#include <asm/ftrace.h>
 
 #ifdef CONFIG_PPC32
 extern void transfer_to_handler(void);
@@ -67,6 +68,10 @@ EXPORT_SYMBOL(single_step_exception);
 EXPORT_SYMBOL(sys_sigreturn);
 #endif
 
+#ifdef CONFIG_FTRACE
+EXPORT_SYMBOL(_mcount);
+#endif
+
 EXPORT_SYMBOL(strcpy);
 EXPORT_SYMBOL(strncpy);
 EXPORT_SYMBOL(strcat);
index 5112a4aa801d7d6843c2202c7815147831a74d2e..19e8fcb9cea890ae8bd937200acea9e9284c4770 100644 (file)
@@ -81,7 +81,7 @@ int ucache_bsize;
  * from the address that it was linked at, so we must use RELOC/PTRRELOC
  * to access static data (including strings).  -- paulus
  */
-unsigned long __init early_init(unsigned long dt_ptr)
+notrace unsigned long __init early_init(unsigned long dt_ptr)
 {
        unsigned long offset = reloc_offset();
        struct cpu_spec *spec;
@@ -111,7 +111,7 @@ unsigned long __init early_init(unsigned long dt_ptr)
  * This is called very early on the boot process, after a minimal
  * MMU environment has been set up but before MMU_init is called.
  */
-void __init machine_init(unsigned long dt_ptr, unsigned long phys)
+notrace void __init machine_init(unsigned long dt_ptr, unsigned long phys)
 {
        /* Enable early debugging if any specified (see udbg.h) */
        udbg_early_init();
@@ -133,7 +133,7 @@ void __init machine_init(unsigned long dt_ptr, unsigned long phys)
 
 #ifdef CONFIG_BOOKE_WDT
 /* Checks wdt=x and wdt_period=xx command-line option */
-int __init early_parse_wdt(char *p)
+notrace int __init early_parse_wdt(char *p)
 {
        if (p && strncmp(p, "0", 1) != 0)
               booke_wdt_enabled = 1;
index 4d72c8f721598d77c60c68ec74799fd75920e931..89774177b209e25465ae9d5648166a1c2ff72840 100644 (file)
@@ -1,5 +1,10 @@
 CFLAGS_bootx_init.o            += -fPIC
 
+ifdef CONFIG_FTRACE
+# Do not trace early boot code
+CFLAGS_REMOVE_bootx_init.o = -pg
+endif
+
 obj-y                          += pic.o setup.o time.o feature.o pci.o \
                                   sleep.o low_i2c.o cache.o pfunc_core.o \
                                   pfunc_base.o
index eb36f3b746b8ad280c7a26a07de046af25c01d3c..fca9246470b11e3beb0be6e6fe1ba65c1e7a183b 100644 (file)
@@ -11,6 +11,8 @@ config SPARC
 config SPARC64
        bool
        default y
+       select HAVE_DYNAMIC_FTRACE
+       select HAVE_FTRACE
        select HAVE_IDE
        select HAVE_LMB
        select HAVE_ARCH_KGDB
index 6a4d28a4076d0df87e637a3292a21fefcd8bd07a..d6d32d178fc8af2533939225caf2b7e6d8fef2b8 100644 (file)
@@ -33,7 +33,7 @@ config DEBUG_PAGEALLOC
 
 config MCOUNT
        bool
-       depends on STACK_DEBUG
+       depends on STACK_DEBUG || FTRACE
        default y
 
 config FRAME_POINTER
index ec4f5ebb1ca669e72048317fe623c12d30b281d2..418b5782096ec709eda315ceb79f5575d50059a6 100644 (file)
@@ -14,6 +14,7 @@ obj-y         := process.o setup.o cpu.o idprom.o \
                   power.o sbus.o sparc64_ksyms.o chmc.o \
                   visemul.o prom.o of_device.o hvapi.o sstate.o mdesc.o
 
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-$(CONFIG_PCI)       += ebus.o pci_common.o \
                            pci_psycho.o pci_sabre.o pci_schizo.o \
diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c
new file mode 100644 (file)
index 0000000..4298d0a
--- /dev/null
@@ -0,0 +1,94 @@
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/ftrace.h>
+
+static const u32 ftrace_nop = 0x01000000;
+
+notrace unsigned char *ftrace_nop_replace(void)
+{
+       return (char *)&ftrace_nop;
+}
+
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+       static u32 call;
+       s32 off;
+
+       off = ((s32)addr - (s32)ip);
+       call = 0x40000000 | ((u32)off >> 2);
+
+       return (unsigned char *) &call;
+}
+
+notrace int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+                  unsigned char *new_code)
+{
+       u32 old = *(u32 *)old_code;
+       u32 new = *(u32 *)new_code;
+       u32 replaced;
+       int faulted;
+
+       __asm__ __volatile__(
+       "1:     cas     [%[ip]], %[old], %[new]\n"
+       "       flush   %[ip]\n"
+       "       mov     0, %[faulted]\n"
+       "2:\n"
+       "       .section .fixup,#alloc,#execinstr\n"
+       "       .align  4\n"
+       "3:     sethi   %%hi(2b), %[faulted]\n"
+       "       jmpl    %[faulted] + %%lo(2b), %%g0\n"
+       "        mov    1, %[faulted]\n"
+       "       .previous\n"
+       "       .section __ex_table,\"a\"\n"
+       "       .align  4\n"
+       "       .word   1b, 3b\n"
+       "       .previous\n"
+       : "=r" (replaced), [faulted] "=r" (faulted)
+       : [new] "0" (new), [old] "r" (old), [ip] "r" (ip)
+       : "memory");
+
+       if (replaced != old && replaced != new)
+               faulted = 2;
+
+       return faulted;
+}
+
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+       unsigned long ip = (unsigned long)(&ftrace_call);
+       unsigned char old[MCOUNT_INSN_SIZE], *new;
+
+       memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
+       new = ftrace_call_replace(ip, (unsigned long)func);
+       return ftrace_modify_code(ip, old, new);
+}
+
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+       unsigned long ip = (long)(&mcount_call);
+       unsigned long *addr = data;
+       unsigned char old[MCOUNT_INSN_SIZE], *new;
+
+       /*
+        * Replace the mcount stub with a pointer to the
+        * ip recorder function.
+        */
+       memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
+       new = ftrace_call_replace(ip, *addr);
+       *addr = ftrace_modify_code(ip, old, new);
+
+       return 0;
+}
+
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+       ftrace_mcount_set(data);
+       return 0;
+}
index 8ac0b99f2c5563bad603edd5628794f1bdf5a898..49d3ea50c24797f22532aa4a7b6f42605ee74ce3 100644 (file)
@@ -53,6 +53,7 @@
 #include <asm/ns87303.h>
 #include <asm/timer.h>
 #include <asm/cpudata.h>
+#include <asm/ftrace.h>
 
 struct poll {
        int fd;
@@ -111,8 +112,7 @@ EXPORT_SYMBOL(__write_trylock);
 EXPORT_SYMBOL(smp_call_function);
 #endif /* CONFIG_SMP */
 
-#if defined(CONFIG_MCOUNT)
-extern void _mcount(void);
+#ifdef CONFIG_MCOUNT
 EXPORT_SYMBOL(_mcount);
 #endif
 
index 9e4534b485c7a578af04517adff245a413f7bfb9..7735a7a60533fa225256ad0bd2d94a00be26033a 100644 (file)
@@ -28,10 +28,13 @@ ovstack:
        .skip           OVSTACKSIZE
 #endif
        .text
-       .align 32
-       .globl mcount, _mcount
-mcount:
+       .align          32
+       .globl          _mcount
+       .type           _mcount,#function
+       .globl          mcount
+       .type           mcount,#function
 _mcount:
+mcount:
 #ifdef CONFIG_STACK_DEBUG
        /*
         * Check whether %sp is dangerously low.
@@ -55,6 +58,53 @@ _mcount:
         or             %g3, %lo(panicstring), %o0
        call            prom_halt
         nop
+1:
+#endif
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+       mov             %o7, %o0
+       .globl          mcount_call
+mcount_call:
+       call            ftrace_stub
+        mov            %o0, %o7
+#else
+       sethi           %hi(ftrace_trace_function), %g1
+       sethi           %hi(ftrace_stub), %g2
+       ldx             [%g1 + %lo(ftrace_trace_function)], %g1
+       or              %g2, %lo(ftrace_stub), %g2
+       cmp             %g1, %g2
+       be,pn           %icc, 1f
+        mov            %i7, %o1
+       jmpl            %g1, %g0
+        mov            %o7, %o0
+       /* not reached */
+1:
 #endif
-1:     retl
+#endif
+       retl
         nop
+       .size           _mcount,.-_mcount
+       .size           mcount,.-mcount
+
+#ifdef CONFIG_FTRACE
+       .globl          ftrace_stub
+       .type           ftrace_stub,#function
+ftrace_stub:
+       retl
+        nop
+       .size           ftrace_stub,.-ftrace_stub
+#ifdef CONFIG_DYNAMIC_FTRACE
+       .globl          ftrace_caller
+       .type           ftrace_caller,#function
+ftrace_caller:
+       mov             %i7, %o1
+       mov             %o7, %o0
+       .globl          ftrace_call
+ftrace_call:
+       call            ftrace_stub
+        mov            %o0, %o7
+       retl
+        nop
+       .size           ftrace_caller,.-ftrace_caller
+#endif
+#endif
index 2cfccc987a2605e81f7017f4e055e09ddf8e3da3..6958d6bcaf704c8cc0c9d5af066e4787353cbb5b 100644 (file)
@@ -23,6 +23,8 @@ config X86
        select HAVE_OPROFILE
        select HAVE_KPROBES
        select HAVE_KRETPROBES
+       select HAVE_DYNAMIC_FTRACE
+       select HAVE_FTRACE
        select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
        select HAVE_ARCH_KGDB if !X86_VOYAGER
 
index acc0271920f2e3d80be6a50bb882066727e297e5..5236621350bc9169d2be181e3de37d56c1a4a08b 100644 (file)
@@ -171,6 +171,34 @@ config IOMMU_LEAK
          Add a simple leak tracer to the IOMMU code. This is useful when you
          are debugging a buggy device driver that leaks IOMMU mappings.
 
+config MMIOTRACE_HOOKS
+       bool
+
+config MMIOTRACE
+       bool "Memory mapped IO tracing"
+       depends on DEBUG_KERNEL && PCI
+       select TRACING
+       select MMIOTRACE_HOOKS
+       default y
+       help
+         Mmiotrace traces Memory Mapped I/O access and is meant for
+         debugging and reverse engineering. It is called from the ioremap
+         implementation and works via page faults. Tracing is disabled by
+         default and can be enabled at run-time.
+
+         See Documentation/tracers/mmiotrace.txt.
+         If you are not helping to develop drivers, say N.
+
+config MMIOTRACE_TEST
+       tristate "Test module for mmiotrace"
+       depends on MMIOTRACE && m
+       help
+         This is a dumb module for testing mmiotrace. It is very dangerous
+         as it will write garbage to IO memory starting at a given address.
+         However, it should be safe to use on e.g. an unused portion of VRAM.
+
+         Say N, unless you absolutely know what you are doing.
+
 #
 # IO delay types:
 #
index 55ff016e9f694f61d5dc143675797860a4f4de3d..5112c84f542164e37a74ed9785451d127cd8da95 100644 (file)
@@ -6,6 +6,13 @@ extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
+ifdef CONFIG_FTRACE
+# Do not profile debug utilities
+CFLAGS_REMOVE_tsc_64.o = -pg
+CFLAGS_REMOVE_tsc_32.o = -pg
+CFLAGS_REMOVE_rtc.o = -pg
+endif
+
 #
 # vsyscalls (which work on the user stack) should have
 # no stack-protector checks:
@@ -57,6 +64,7 @@ obj-$(CONFIG_X86_MPPARSE)     += mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)   += apic_$(BITS).o nmi.o
 obj-$(CONFIG_X86_IO_APIC)      += io_apic_$(BITS).o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE)   += ftrace.o
 obj-$(CONFIG_KEXEC)            += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)            += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)       += crash_dump_$(BITS).o
index 65c7857a90ddfc6ff084c6817baba045ced0ad71..2763cb37b553e5780c9dc080311022e3b4339092 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/kprobes.h>
 #include <linux/mm.h>
@@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
 #ifdef CONFIG_X86_64
 
 extern char __vsyscall_0;
-static inline const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
        return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
               boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
@@ -162,7 +162,7 @@ static const struct nop {
        { -1, NULL }
 };
 
-static const unsigned char*const * find_nop_table(void)
+const unsigned char *const *find_nop_table(void)
 {
        const unsigned char *const *noptable = intel_nops;
        int i;
@@ -279,7 +279,7 @@ struct smp_alt_module {
        struct list_head next;
 };
 static LIST_HEAD(smp_alt_modules);
-static DEFINE_SPINLOCK(smp_alt);
+static DEFINE_MUTEX(smp_alt);
 static int smp_mode = 1;       /* protected by smp_alt */
 
 void alternatives_smp_module_add(struct module *mod, char *name,
@@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name,
                __func__, smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);
 
-       spin_lock(&smp_alt);
+       mutex_lock(&smp_alt);
        list_add_tail(&smp->next, &smp_alt_modules);
        if (boot_cpu_has(X86_FEATURE_UP))
                alternatives_smp_unlock(smp->locks, smp->locks_end,
                                        smp->text, smp->text_end);
-       spin_unlock(&smp_alt);
+       mutex_unlock(&smp_alt);
 }
 
 void alternatives_smp_module_del(struct module *mod)
@@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod)
        if (smp_alt_once || noreplace_smp)
                return;
 
-       spin_lock(&smp_alt);
+       mutex_lock(&smp_alt);
        list_for_each_entry(item, &smp_alt_modules, next) {
                if (mod != item->mod)
                        continue;
                list_del(&item->next);
-               spin_unlock(&smp_alt);
+               mutex_unlock(&smp_alt);
                DPRINTK("%s: %s\n", __func__, item->name);
                kfree(item);
                return;
        }
-       spin_unlock(&smp_alt);
+       mutex_unlock(&smp_alt);
 }
 
 void alternatives_smp_switch(int smp)
@@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp)
                return;
        BUG_ON(!smp && (num_online_cpus() > 1));
 
-       spin_lock(&smp_alt);
+       mutex_lock(&smp_alt);
 
        /*
         * Avoid unnecessary switches because it forces JIT based VMs to
@@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp)
                                                mod->text, mod->text_end);
        }
        smp_mode = smp;
-       spin_unlock(&smp_alt);
+       mutex_unlock(&smp_alt);
 }
 
 #endif
index cfe28a715434762352df73207da0fe422c76e713..6bc07f0f1202eeb1eaac0b55064acad7f355371a 100644 (file)
@@ -51,6 +51,7 @@
 #include <asm/percpu.h>
 #include <asm/dwarf2.h>
 #include <asm/processor-flags.h>
+#include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
 
 /*
@@ -1111,6 +1112,77 @@ ENDPROC(xen_failsafe_callback)
 
 #endif /* CONFIG_XEN */
 
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %eax
+       subl $MCOUNT_INSN_SIZE, %eax
+
+.globl mcount_call
+mcount_call:
+       call ftrace_stub
+
+       popl %edx
+       popl %ecx
+       popl %eax
+
+       ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %eax
+       movl 0x4(%ebp), %edx
+       subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+       call ftrace_stub
+
+       popl %edx
+       popl %ecx
+       popl %eax
+
+.globl ftrace_stub
+ftrace_stub:
+       ret
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+       cmpl $ftrace_stub, ftrace_trace_function
+       jnz trace
+.globl ftrace_stub
+ftrace_stub:
+       ret
+
+       /* taken from glibc */
+trace:
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %eax
+       movl 0x4(%ebp), %edx
+       subl $MCOUNT_INSN_SIZE, %eax
+
+       call *ftrace_trace_function
+
+       popl %edx
+       popl %ecx
+       popl %eax
+
+       jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
 
index bb4e22f4892fd807cf4dfaee0a8ae41aefb9111b..ba41bf42748d7657548b8e39191d9496f86a9480 100644 (file)
 #include <asm/page.h>
 #include <asm/irqflags.h>
 #include <asm/paravirt.h>
+#include <asm/ftrace.h>
 
        .code64
 
+#ifdef CONFIG_FTRACE
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+
+       subq $0x38, %rsp
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+
+       movq 0x38(%rsp), %rdi
+       subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl mcount_call
+mcount_call:
+       call ftrace_stub
+
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $0x38, %rsp
+
+       retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+
+       /* taken from glibc */
+       subq $0x38, %rsp
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+
+       movq 0x38(%rsp), %rdi
+       movq 8(%rbp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rdi
+
+.globl ftrace_call
+ftrace_call:
+       call ftrace_stub
+
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $0x38, %rsp
+
+.globl ftrace_stub
+ftrace_stub:
+       retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+ENTRY(mcount)
+       cmpq $ftrace_stub, ftrace_trace_function
+       jnz trace
+.globl ftrace_stub
+ftrace_stub:
+       retq
+
+trace:
+       /* taken from glibc */
+       subq $0x38, %rsp
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+
+       movq 0x38(%rsp), %rdi
+       movq 8(%rbp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rdi
+
+       call   *ftrace_trace_function
+
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $0x38, %rsp
+
+       jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FTRACE */
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif 
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
new file mode 100644 (file)
index 0000000..ab115cd
--- /dev/null
@@ -0,0 +1,141 @@
+/*
+ * Code for replacing ftrace calls with jumps.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ * Thanks go to Ingo Molnar, for suggesting the idea.
+ * Mathieu Desnoyers, for suggesting postponing the modifications.
+ * Arjan van de Ven, for keeping me straight, and explaining to me
+ * the dangers of modifying code on the run.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/ftrace.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/alternative.h>
+#include <asm/ftrace.h>
+
+
+/* Long is fine, even if it is only 4 bytes ;-) */
+static long *ftrace_nop;
+
+union ftrace_code_union {
+       char code[MCOUNT_INSN_SIZE];
+       struct {
+               char e8;
+               int offset;
+       } __attribute__((packed));
+};
+
+
+static int notrace ftrace_calc_offset(long ip, long addr)
+{
+       return (int)(addr - ip);
+}
+
+notrace unsigned char *ftrace_nop_replace(void)
+{
+       return (char *)ftrace_nop;
+}
+
+notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+       static union ftrace_code_union calc;
+
+       calc.e8         = 0xe8;
+       calc.offset     = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
+
+       /*
+        * No locking needed, this must be called via kstop_machine
+        * which in essence is like running on a uniprocessor machine.
+        */
+       return calc.code;
+}
+
+notrace int
+ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+                  unsigned char *new_code)
+{
+       unsigned replaced;
+       unsigned old = *(unsigned *)old_code; /* 4 bytes */
+       unsigned new = *(unsigned *)new_code; /* 4 bytes */
+       unsigned char newch = new_code[4];
+       int faulted = 0;
+
+       /*
+        * Note: Due to modules and __init, code can
+        *  disappear and change, we need to protect against faulting
+        *  as well as code changing.
+        *
+        * No real locking needed, this code is run through
+        * kstop_machine.
+        */
+       asm volatile (
+               "1: lock\n"
+               "   cmpxchg %3, (%2)\n"
+               "   jnz 2f\n"
+               "   movb %b4, 4(%2)\n"
+               "2:\n"
+               ".section .fixup, \"ax\"\n"
+               "3:     movl $1, %0\n"
+               "       jmp 2b\n"
+               ".previous\n"
+               _ASM_EXTABLE(1b, 3b)
+               : "=r"(faulted), "=a"(replaced)
+               : "r"(ip), "r"(new), "c"(newch),
+                 "0"(faulted), "a"(old)
+               : "memory");
+       sync_core();
+
+       if (replaced != old && replaced != new)
+               faulted = 2;
+
+       return faulted;
+}
+
+notrace int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+       unsigned long ip = (unsigned long)(&ftrace_call);
+       unsigned char old[MCOUNT_INSN_SIZE], *new;
+       int ret;
+
+       memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
+       new = ftrace_call_replace(ip, (unsigned long)func);
+       ret = ftrace_modify_code(ip, old, new);
+
+       return ret;
+}
+
+notrace int ftrace_mcount_set(unsigned long *data)
+{
+       unsigned long ip = (long)(&mcount_call);
+       unsigned long *addr = data;
+       unsigned char old[MCOUNT_INSN_SIZE], *new;
+
+       /*
+        * Replace the mcount stub with a pointer to the
+        * ip recorder function.
+        */
+       memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
+       new = ftrace_call_replace(ip, *addr);
+       *addr = ftrace_modify_code(ip, old, new);
+
+       return 0;
+}
+
+int __init ftrace_dyn_arch_init(void *data)
+{
+       const unsigned char *const *noptable = find_nop_table();
+
+       /* This is running in kstop_machine */
+
+       ftrace_mcount_set(data);
+
+       ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
+
+       return 0;
+}
index deb43785e923d09a8ba7215222e84363ad682742..dd7ebee446afaab244041868b67d89833b8d00c5 100644 (file)
@@ -1,7 +1,14 @@
 #include <linux/module.h>
+
 #include <asm/checksum.h>
-#include <asm/desc.h>
 #include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/ftrace.h>
+
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
 
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_generic);
index f4960171bc66625ebac582a172990b0531031fcb..8864230d55afd11476e96e13429c4a1ba03ce8cd 100644 (file)
@@ -11,6 +11,8 @@
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/numa.h>
+#include <linux/ftrace.h>
+
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
@@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
        unsigned long page_list[PAGES_NR];
        void *control_page;
 
+       tracer_disable();
+
        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
 
index 7830dc4a8380d4f2631beef1df15f183cd69b721..9dd9262693a330ae8aa81f386af90b0defb8c904 100644 (file)
@@ -11,6 +11,8 @@
 #include <linux/string.h>
 #include <linux/reboot.h>
 #include <linux/numa.h>
+#include <linux/ftrace.h>
+
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
        unsigned long page_list[PAGES_NR];
        void *control_page;
 
+       tracer_disable();
+
        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
 
index 9a139f6c9df30fa2c304e96da89b99fd62b78a8d..0c3927accb0054b71c7de9eb828a93559232737e 100644 (file)
@@ -142,7 +142,10 @@ void cpu_idle(void)
 
                        local_irq_disable();
                        __get_cpu_var(irq_stat).idle_timestamp = jiffies;
+                       /* Don't trace irqs off for idle */
+                       stop_critical_timings();
                        pm_idle();
+                       start_critical_timings();
                }
                tick_nohz_restart_sched_tick();
                preempt_enable_no_resched();
index db5eb963e4df2e7d573012f96bc3696eff6b52ca..a8e53626ac9aaf5fc8290908aaf42552556a1b11 100644 (file)
@@ -134,7 +134,10 @@ void cpu_idle(void)
                         */
                        local_irq_disable();
                        enter_idle();
+                       /* Don't trace irqs off for idle */
+                       stop_critical_timings();
                        pm_idle();
+                       start_critical_timings();
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
                           loops can be woken up without interrupt. */
index c87cbd84c3e521ca92a567c751bc852a74239122..e50740d32314e2608e13da780fa2c704f2172df4 100644 (file)
@@ -42,7 +42,8 @@
 #include <asm/topology.h>
 #include <asm/vgtod.h>
 
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) \
+               __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
 #define __syscall_clobber "r11","cx","memory"
 
 /*
index 2f306a8268973bb531402dc7b8aa8cb14ebbd22a..b545f371b5f542243de7c9962ce551fb10647f3a 100644 (file)
@@ -2,13 +2,20 @@
    All C exports should go in the respective C files. */
 
 #include <linux/module.h>
-#include <net/checksum.h>
 #include <linux/smp.h>
 
+#include <net/checksum.h>
+
 #include <asm/processor.h>
-#include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/ftrace.h>
+
+#ifdef CONFIG_FTRACE
+/* mcount is defined in assembly */
+EXPORT_SYMBOL(mcount);
+#endif
 
 EXPORT_SYMBOL(kernel_thread);
 
index 83226e0a7ce4e331aa1aac3277b1d6301bf025f8..aa3fa4119424205ba297e1af4ce4100ba5514baf 100644 (file)
@@ -5,6 +5,7 @@
 obj-$(CONFIG_SMP) := msr-on-cpu.o
 
 lib-y := delay.o
+lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
 
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
new file mode 100644 (file)
index 0000000..650b11e
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
+ * Copyright 2008 by Steven Rostedt, Red Hat, Inc
+ *  (inspired by Andi Kleen's thunk_64.S)
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+
+       #include <linux/linkage.h>
+
+#define ARCH_TRACE_IRQS_ON                     \
+       pushl %eax;                             \
+       pushl %ecx;                             \
+       pushl %edx;                             \
+       call trace_hardirqs_on;                 \
+       popl %edx;                              \
+       popl %ecx;                              \
+       popl %eax;
+
+#define ARCH_TRACE_IRQS_OFF                    \
+       pushl %eax;                             \
+       pushl %ecx;                             \
+       pushl %edx;                             \
+       call trace_hardirqs_off;                \
+       popl %edx;                              \
+       popl %ecx;                              \
+       popl %eax;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+       /* put return address in eax (arg1) */
+       .macro thunk_ra name,func
+       .globl \name
+\name:
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       /* Place EIP in the arg1 */
+       movl 3*4(%esp), %eax
+       call \func
+       popl %edx
+       popl %ecx
+       popl %eax
+       ret
+       .endm
+
+       thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+       thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#endif
index e009251d4e9f3afa3874e2ab48bd2d6cd28a83ae..bf9a7d5a54288763b35dc355790c00aaa43b637e 100644 (file)
@@ -2,6 +2,7 @@
  * Save registers before calling assembly functions. This avoids
  * disturbance of register allocation in some inline assembly constructs.
  * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
 
 #endif 
        
 #ifdef CONFIG_TRACE_IRQFLAGS
-       thunk trace_hardirqs_on_thunk,trace_hardirqs_on
-       thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+       /* put return address in rdi (arg1) */
+       .macro thunk_ra name,func
+       .globl \name
+\name:
+       CFI_STARTPROC
+       SAVE_ARGS
+       /* SAVE_ARGS pushes 9 elements */
+       /* the next element would be the rip */
+       movq 9*8(%rsp), %rdi
+       call \func
+       jmp  restore
+       CFI_ENDPROC
+       .endm
+
+       thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+       thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
index c107641cd39bfadb4969b6b86183fe666034616e..9873716e9f764bcd7c5bc0f369a0269efdffb35a 100644 (file)
@@ -8,6 +8,11 @@ obj-$(CONFIG_X86_PTDUMP)       += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)          += highmem_32.o
 
+obj-$(CONFIG_MMIOTRACE_HOOKS)  += kmmio.o
+obj-$(CONFIG_MMIOTRACE)                += mmiotrace.o
+mmiotrace-y                    := pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST)   += testmmiotrace.o
+
 ifeq ($(CONFIG_X86_32),y)
 obj-$(CONFIG_NUMA)             += discontig_32.o
 else
index d0f5fce77d95b9f8afa7c0879a3e211b937365b8..455f3fe67b42412c8acda3eb63e33678ebec1845 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/mmiotrace.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #define PF_RSVD                (1<<3)
 #define PF_INSTR       (1<<4)
 
+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+#ifdef CONFIG_MMIOTRACE_HOOKS
+       if (unlikely(is_kmmio_active()))
+               if (kmmio_handler(regs, addr) == 1)
+                       return -1;
+#endif
+       return 0;
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 #ifdef CONFIG_KPROBES
@@ -598,6 +609,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
        if (notify_page_fault(regs))
                return;
+       if (unlikely(kmmio_fault(regs, address)))
+               return;
 
        /*
         * We fault-in kernel-space virtual memory on-demand. The
index 029e8cffca9e11cf1794b0cad1c74029eaefa815..9689a5138e6472e33c6d0862b3ae56194ffcedb4 100644 (file)
@@ -1035,6 +1035,8 @@ void mark_rodata_ro(void)
        unsigned long start = PFN_ALIGN(_text);
        unsigned long size = PFN_ALIGN(_etext) - start;
 
+#ifndef CONFIG_DYNAMIC_FTRACE
+       /* Dynamic tracing modifies the kernel text section */
        set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
        printk(KERN_INFO "Write protecting the kernel text: %luk\n",
                size >> 10);
@@ -1047,6 +1049,8 @@ void mark_rodata_ro(void)
        printk(KERN_INFO "Testing CPA: write protecting again\n");
        set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
 #endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
        start += size;
        size = (unsigned long)__end_rodata - start;
        set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
index a25cc6fa2207262131dcec567f3b43ce861d0f3f..27de2435e0080f419e3c5ac7386f710b993a93cd 100644 (file)
@@ -991,6 +991,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
 void mark_rodata_ro(void)
 {
        unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+       unsigned long rodata_start =
+               ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+       /* Dynamic tracing modifies the kernel text section */
+       start = rodata_start;
+#endif
 
        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
               (end - start) >> 10);
@@ -1000,8 +1007,7 @@ void mark_rodata_ro(void)
         * The rodata section (but not the kernel text!) should also be
         * not-executable.
         */
-       start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
-       set_memory_nx(start, (end - start) >> PAGE_SHIFT);
+       set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
 
        rodata_test();
 
index 115f13ee40c9156fc1f954d5181b415e97c78ed3..24c1d3c30186c893c400eff0942b857207a833fa 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/mmiotrace.h>
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
@@ -122,10 +123,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 {
        unsigned long pfn, offset, vaddr;
        resource_size_t last_addr;
+       const resource_size_t unaligned_phys_addr = phys_addr;
+       const unsigned long unaligned_size = size;
        struct vm_struct *area;
        unsigned long new_prot_val;
        pgprot_t prot;
        int retval;
+       void __iomem *ret_addr;
 
        /* Don't allow wraparound or zero size */
        last_addr = phys_addr + size - 1;
@@ -233,7 +237,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
                return NULL;
        }
 
-       return (void __iomem *) (vaddr + offset);
+       ret_addr = (void __iomem *) (vaddr + offset);
+       mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+
+       return ret_addr;
 }
 
 /**
@@ -348,6 +355,8 @@ void iounmap(volatile void __iomem *addr)
        addr = (volatile void __iomem *)
                (PAGE_MASK & (unsigned long __force)addr);
 
+       mmiotrace_iounmap(addr);
+
        /* Use the vm area unlocked, assuming the caller
           ensures there isn't another iounmap for the same address
           in parallel. Reuse of the virtual address is prevented by
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644 (file)
index 0000000..93d8203
--- /dev/null
@@ -0,0 +1,510 @@
+/* Support for MMIO probes.
+ * Benefits from much of the kprobes code
+ * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
+ *     2007 Alexander Eichner
+ *     2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <linux/errno.h>
+#include <asm/debugreg.h>
+#include <linux/mmiotrace.h>
+
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+
+/* Bookkeeping for one page that kmmio arms (marks not-present) for probing. */
+struct kmmio_fault_page {
+       struct list_head list; /* link in a kmmio_page_table hash bucket */
+       struct kmmio_fault_page *release_next; /* next on a pending release list */
+       unsigned long page; /* location of the fault page */
+
+       /*
+        * Number of times this page has been registered as a part
+        * of a probe. If zero, page is disarmed and this may be freed.
+        * Used only by writers (RCU).
+        */
+       int count;
+};
+
+/* Carrier handed to call_rcu() with the fault pages awaiting removal/free. */
+struct kmmio_delayed_release {
+       struct rcu_head rcu; /* callbacks recover this struct from the head */
+       struct kmmio_fault_page *release_list; /* chain linked via release_next */
+};
+
+/* Per-CPU state carried from kmmio_handler() to post_kmmio_handler(). */
+struct kmmio_context {
+       struct kmmio_fault_page *fpage; /* page disarmed for the single step */
+       struct kmmio_probe *probe; /* probe found for addr, NULL if none */
+       unsigned long saved_flags; /* TF and IF bits of regs->flags at the fault */
+       unsigned long addr; /* faulting address being stepped over */
+       int active; /* non-zero while a probe hit is in flight on this CPU */
+};
+
+static DEFINE_SPINLOCK(kmmio_lock);
+
+/* Protected by kmmio_lock */
+unsigned int kmmio_count;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+static LIST_HEAD(kmmio_probes);
+
+/* Return the kmmio_page_table bucket that @page hashes to. */
+static struct list_head *kmmio_page_list(unsigned long page)
+{
+       const unsigned long bucket = hash_long(page, KMMIO_PAGE_HASH_BITS);
+
+       return &kmmio_page_table[bucket];
+}
+
+/* Accessed per-cpu */
+static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
+
+/*
+ * this is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/*
+ * Get the kmmio probe covering @addr, or NULL if no registered probe
+ * contains it. A probe covers the half-open range
+ * [p->addr, p->addr + p->len). You must be holding RCU read lock.
+ */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+       struct kmmio_probe *p;
+
+       list_for_each_entry_rcu(p, &kmmio_probes, list) {
+               /*
+                * The first byte past the mapping (p->addr + p->len) does
+                * not belong to the probe. Comparing the offset against the
+                * length also stays correct if p->addr + p->len would wrap.
+                */
+               if (addr >= p->addr && addr - p->addr < p->len)
+                       return p;
+       }
+       return NULL;
+}
+
+/*
+ * Find the kmmio_fault_page record for the page containing @page, or NULL
+ * if that page is not tracked. You must be holding RCU read lock.
+ */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+{
+       const unsigned long aligned = page & PAGE_MASK;
+       struct list_head *bucket = kmmio_page_list(aligned);
+       struct kmmio_fault_page *entry;
+
+       list_for_each_entry_rcu(entry, bucket, list) {
+               if (entry->page == aligned)
+                       return entry;
+       }
+       return NULL;
+}
+
+/*
+ * Set or clear _PAGE_PRESENT in the kernel mapping of @addr, then call
+ * __flush_tlb_one() for that address. The mapping level that was found is
+ * reported through @pglevel when it is non-NULL. Only 4K and 2M mappings
+ * are handled; anything else is logged and left untouched.
+ */
+static void set_page_present(unsigned long addr, bool present,
+                                                       unsigned int *pglevel)
+{
+       unsigned int level;
+       pte_t *pte = lookup_address(addr, &level);
+
+       if (!pte) {
+               pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+               return;
+       }
+
+       if (pglevel)
+               *pglevel = level;
+
+       if (level == PG_LEVEL_2M) {
+               /* Large page: the entry returned is used as a pmd. */
+               pmd_t *pmd = (pmd_t *)pte;
+               pmdval_t pmdval = pmd_val(*pmd);
+
+               if (present)
+                       pmdval |= _PAGE_PRESENT;
+               else
+                       pmdval &= ~_PAGE_PRESENT;
+               set_pmd(pmd, __pmd(pmdval));
+       } else if (level == PG_LEVEL_4K) {
+               pteval_t pteval = pte_val(*pte);
+
+               if (present)
+                       pteval |= _PAGE_PRESENT;
+               else
+                       pteval &= ~_PAGE_PRESENT;
+               set_pte_atomic(pte, __pte(pteval));
+       } else {
+               pr_err("kmmio: unexpected page level 0x%x.\n", level);
+               return;
+       }
+
+       __flush_tlb_one(addr);
+}
+
+/*
+ * Arm a page: clear the present bit of the page containing @page so that
+ * any access to it triggers a fault. @pglevel is passed through to
+ * set_page_present().
+ */
+static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+{
+       const unsigned long aligned = page & PAGE_MASK;
+
+       set_page_present(aligned, false, pglevel);
+}
+
+/*
+ * Disarm a page: set the present bit of the page containing @page again.
+ * @pglevel is passed through to set_page_present().
+ */
+static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+{
+       const unsigned long aligned = page & PAGE_MASK;
+
+       set_page_present(aligned, true, pglevel);
+}
+
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefetching may
+ * trigger a page fault. We may be in the middle of process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate
+ * and they remain disabled throughout this function.
+ *
+ * Returns 1 when the fault was consumed by kmmio and 0 when it was not.
+ * On the normal "handled" path the RCU read lock and the preemption
+ * disable taken here are NOT dropped; post_kmmio_handler() releases them.
+ */
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+       struct kmmio_context *ctx;
+       struct kmmio_fault_page *faultpage;
+       int ret = 0; /* default to fault not handled */
+
+       /*
+        * Preemption is now disabled to prevent process switch during
+        * single stepping. We can only handle one active kmmio trace
+        * per cpu, so ensure that we finish it before something else
+        * gets to run. We also hold the RCU read lock over single
+        * stepping to avoid looking up the probe and kmmio_fault_page
+        * again.
+        */
+       preempt_disable();
+       rcu_read_lock();
+
+       faultpage = get_kmmio_fault_page(addr);
+       if (!faultpage) {
+               /*
+                * Either this page fault is not caused by kmmio, or
+                * another CPU just pulled the kmmio probe from under
+                * our feet. The latter case should not be possible.
+                */
+               goto no_kmmio;
+       }
+
+       ctx = &get_cpu_var(kmmio_ctx);
+       if (ctx->active) {
+               disarm_kmmio_fault_page(faultpage->page, NULL);
+               if (addr == ctx->addr) {
+                       /*
+                        * On SMP we sometimes get recursive probe hits on the
+                        * same address. Context is already saved, fall out.
+                        */
+                       pr_debug("kmmio: duplicate probe hit on CPU %d, for "
+                                               "address 0x%08lx.\n",
+                                               smp_processor_id(), addr);
+                       ret = 1;
+                       goto no_kmmio_ctx;
+               }
+               /*
+                * Prevent overwriting already in-flight context.
+                * This should not happen, let's hope disarming at least
+                * prevents a panic.
+                */
+               pr_emerg("kmmio: recursive probe hit on CPU %d, "
+                                       "for address 0x%08lx. Ignoring.\n",
+                                       smp_processor_id(), addr);
+               pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
+                                       ctx->addr);
+               goto no_kmmio_ctx;
+       }
+       ctx->active++;
+
+       /* Save everything post_kmmio_handler() needs to finish this hit. */
+       ctx->fpage = faultpage;
+       ctx->probe = get_kmmio_probe(addr);
+       ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
+       ctx->addr = addr;
+
+       if (ctx->probe && ctx->probe->pre_handler)
+               ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+       /*
+        * Enable single-stepping and disable interrupts for the faulting
+        * context. Local interrupts must not get enabled during stepping.
+        */
+       regs->flags |= X86_EFLAGS_TF;
+       regs->flags &= ~X86_EFLAGS_IF;
+
+       /* Now we set present bit in PTE and single step. */
+       disarm_kmmio_fault_page(ctx->fpage->page, NULL);
+
+       /*
+        * If another cpu accesses the same page while we are stepping,
+        * the access will not be caught. It will simply succeed and the
+        * only downside is we lose the event. If this becomes a problem,
+        * the user should drop to single cpu before tracing.
+        */
+
+       /* RCU read lock and preempt count stay held across the step. */
+       put_cpu_var(kmmio_ctx);
+       return 1; /* fault handled */
+
+no_kmmio_ctx:
+       put_cpu_var(kmmio_ctx);
+no_kmmio:
+       rcu_read_unlock();
+       preempt_enable_no_resched();
+       return ret;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate
+ * and they remain disabled throughout this function.
+ * This must always get called as the pair to kmmio_handler().
+ *
+ * Returns 1 when the debug trap belonged to a kmmio single step and should
+ * not be processed further, 0 otherwise.
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+       int ret = 0;
+       struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
+
+       if (!ctx->active) {
+               pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+                                                       smp_processor_id());
+               goto out;
+       }
+
+       if (ctx->probe && ctx->probe->post_handler)
+               ctx->probe->post_handler(ctx->probe, condition, regs);
+
+       /* The stepped instruction is done: make the page fault again. */
+       arm_kmmio_fault_page(ctx->fpage->page, NULL);
+
+       /* Restore the TF/IF bits that kmmio_handler() saved and overrode. */
+       regs->flags &= ~X86_EFLAGS_TF;
+       regs->flags |= ctx->saved_flags;
+
+       /* These were acquired in kmmio_handler(). */
+       ctx->active--;
+       BUG_ON(ctx->active);
+       rcu_read_unlock();
+       preempt_enable_no_resched();
+
+       /*
+        * if somebody else is singlestepping across a probe point, flags
+        * will have TF set, in which case, continue the remaining processing
+        * of do_debug, as if this is not a probe hit.
+        */
+       if (!(regs->flags & X86_EFLAGS_TF))
+               ret = 1;
+out:
+       put_cpu_var(kmmio_ctx);
+       return ret;
+}
+
+/*
+ * Take one reference on the fault page containing @page, creating and
+ * arming its record on first use. Returns 0 on success and -1 when the
+ * record cannot be allocated. You must be holding kmmio_lock.
+ */
+static int add_kmmio_fault_page(unsigned long page)
+{
+       struct kmmio_fault_page *fpage;
+
+       page &= PAGE_MASK;
+       fpage = get_kmmio_fault_page(page);
+       if (!fpage) {
+               /* First user of this page: allocate, publish, then arm. */
+               fpage = kmalloc(sizeof(*fpage), GFP_ATOMIC);
+               if (!fpage)
+                       return -1;
+
+               fpage->count = 1;
+               fpage->page = page;
+               list_add_rcu(&fpage->list, kmmio_page_list(fpage->page));
+
+               arm_kmmio_fault_page(fpage->page, NULL);
+
+               return 0;
+       }
+
+       /* A record waiting for release has been disarmed; arm it again. */
+       if (fpage->count == 0)
+               arm_kmmio_fault_page(fpage->page, NULL);
+       fpage->count++;
+       return 0;
+}
+
+/*
+ * Drop one reference on the fault page containing @page. When the last
+ * reference goes away the page is disarmed and pushed onto the front of
+ * *@release_list. You must be holding kmmio_lock.
+ */
+static void release_kmmio_fault_page(unsigned long page,
+                               struct kmmio_fault_page **release_list)
+{
+       struct kmmio_fault_page *fpage;
+
+       fpage = get_kmmio_fault_page(page & PAGE_MASK);
+       if (!fpage)
+               return;
+
+       fpage->count--;
+       BUG_ON(fpage->count < 0);
+       if (fpage->count)
+               return;
+
+       disarm_kmmio_fault_page(fpage->page, NULL);
+       fpage->release_next = *release_list;
+       *release_list = fpage;
+}
+
+/*
+ * With page-unaligned ioremaps, one or two armed pages may contain
+ * addresses from outside the intended mapping. Events for these addresses
+ * are currently silently dropped. The events may result only from programming
+ * mistakes by accessing addresses before the beginning or past the end of a
+ * mapping.
+ */
+/*
+ * Register probe @p and arm every page its range touches.
+ * Returns 0 on success, or -EEXIST when p->addr already falls inside a
+ * registered probe. Failures to arm individual pages are only logged.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+       /* Length counted from the start of the first page of the range. */
+       const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+       unsigned long offset;
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&kmmio_lock, flags);
+       if (get_kmmio_probe(p->addr)) {
+               ret = -EEXIST;
+       } else {
+               kmmio_count++;
+               list_add_rcu(&p->list, &kmmio_probes);
+               for (offset = 0; offset < size_lim; offset += PAGE_SIZE) {
+                       if (add_kmmio_fault_page(p->addr + offset))
+                               pr_err("kmmio: Unable to set page fault.\n");
+               }
+       }
+       spin_unlock_irqrestore(&kmmio_lock, flags);
+       /*
+        * XXX: What should I do here?
+        * Here was a call to global_flush_tlb(), but it does not exist
+        * anymore. It seems it's not needed after all.
+        */
+       return ret;
+}
+EXPORT_SYMBOL(register_kmmio_probe);
+
+/*
+ * RCU callback: free every kmmio_fault_page left on the release list of
+ * the kmmio_delayed_release that embeds @head, then the carrier itself.
+ */
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+       struct kmmio_delayed_release *dr =
+               container_of(head, struct kmmio_delayed_release, rcu);
+       struct kmmio_fault_page *fpage;
+       struct kmmio_fault_page *next;
+
+       for (fpage = dr->release_list; fpage; fpage = next) {
+               /* Fetch the successor before fpage is freed. */
+               next = fpage->release_next;
+               BUG_ON(fpage->count);
+               kfree(fpage);
+       }
+       kfree(dr);
+}
+
+/*
+ * RCU callback, step 2 of unregistering: take the released pages out of
+ * kmmio_page_table and queue the final free.
+ *
+ * A page on the release list may have been registered again in the
+ * meantime (count != 0). Such a page stays in kmmio_page_table and is
+ * unlinked from the release list so that rcu_free_kmmio_fault_pages()
+ * never sees it.
+ */
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+       struct kmmio_delayed_release *dr = container_of(
+                                               head,
+                                               struct kmmio_delayed_release,
+                                               rcu);
+       struct kmmio_fault_page *p = dr->release_list;
+       struct kmmio_fault_page **prevp = &dr->release_list;
+       unsigned long flags;
+
+       spin_lock_irqsave(&kmmio_lock, flags);
+       while (p) {
+               if (!p->count) {
+                       list_del_rcu(&p->list);
+                       /* p stays on the release list: link through it. */
+                       prevp = &p->release_next;
+               } else {
+                       /*
+                        * Unlink p. prevp must keep pointing at the last
+                        * entry still on the list; advancing it to the
+                        * unlinked p would make a following unlink a no-op.
+                        */
+                       *prevp = p->release_next;
+               }
+               p = *prevp;
+       }
+       spin_unlock_irqrestore(&kmmio_lock, flags);
+       /* This is the real RCU destroy call. */
+       call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore. Only after that
+ * you may actually release your struct kmmio_probe.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ *    Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ *    Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ *    Actually free the kmmio_fault_page structs as with RCU.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+       unsigned long flags;
+       unsigned long size = 0;
+       const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+       struct kmmio_fault_page *release_list = NULL;
+       struct kmmio_delayed_release *drelease;
+
+       spin_lock_irqsave(&kmmio_lock, flags);
+       while (size < size_lim) {
+               release_kmmio_fault_page(p->addr + size, &release_list);
+               size += PAGE_SIZE;
+       }
+       list_del_rcu(&p->list);
+       kmmio_count--;
+       spin_unlock_irqrestore(&kmmio_lock, flags);
+
+       /*
+        * No page dropped to zero users, so there is nothing to remove or
+        * free later. Skip the atomic allocation and the RCU callbacks.
+        */
+       if (!release_list)
+               return;
+
+       drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+       if (!drelease) {
+               pr_crit("kmmio: leaking kmmio_fault_page objects.\n");
+               return;
+       }
+       drelease->release_list = release_list;
+
+       /*
+        * This is not really RCU here. We have just disarmed a set of
+        * pages so that they cannot trigger page faults anymore. However,
+        * we cannot remove the pages from kmmio_page_table,
+        * because a probe hit might be in flight on another CPU. The
+        * pages are collected into a list, and they will be removed from
+        * kmmio_page_table when it is certain that no probe hit related to
+        * these pages can be in flight. RCU grace period sounds like a
+        * good choice.
+        *
+        * If we removed the pages too early, kmmio page fault handler might
+        * not find the respective kmmio_fault_page and determine it's not
+        * a kmmio fault, when it actually is. This would lead to madness.
+        */
+       call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
+}
+EXPORT_SYMBOL(unregister_kmmio_probe);
+
+/*
+ * Die-chain callback: on a single-step debug event, let
+ * post_kmmio_handler() finish a pending probe hit. Returns NOTIFY_STOP
+ * when that handler reports the trap as consumed, NOTIFY_DONE otherwise.
+ */
+static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
+                                                               void *args)
+{
+       struct die_args *die = args;
+
+       if (val != DIE_DEBUG)
+               return NOTIFY_DONE;
+       if (!(die->err & DR_STEP))
+               return NOTIFY_DONE;
+       if (post_kmmio_handler(die->err, die->regs) != 1)
+               return NOTIFY_DONE;
+
+       return NOTIFY_STOP;
+}
+
+/* Notifier block registered on the die chain by init_kmmio(). */
+static struct notifier_block nb_die = {
+       .notifier_call = kmmio_die_notifier,
+};
+
+/*
+ * Initialise every kmmio_page_table bucket and register the die notifier.
+ * Returns the result of register_die_notifier().
+ */
+static int __init init_kmmio(void)
+{
+       int bucket;
+
+       for (bucket = 0; bucket < KMMIO_PAGE_TABLE_SIZE; bucket++)
+               INIT_LIST_HEAD(&kmmio_page_table[bucket]);
+
+       return register_die_notifier(&nb_die);
+}
+fs_initcall(init_kmmio); /* should be before device_initcall() */
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644 (file)
index 0000000..e7397e1
--- /dev/null
@@ -0,0 +1,515 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *               Jeff Muizelaar, 2006, 2007
+ *               Pekka Paalanen, 2008 <pq@iki.fi>
+ *
+ * Derived from the read-mod example from relay-examples by Tom Zanussi.
+ */
+#define DEBUG 1
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/version.h>
+#include <linux/kallsyms.h>
+#include <asm/pgtable.h>
+#include <linux/mmiotrace.h>
+#include <asm/e820.h> /* for ISA_START_ADDRESS */
+#include <asm/atomic.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include "pf_in.h"
+
+#define NAME "mmiotrace: "
+
+/* Per-CPU note of the fault being traced; filled by pre(), read by post(). */
+struct trap_reason {
+       unsigned long addr; /* faulting address passed to pre() */
+       unsigned long ip; /* instruction pointer at the fault */
+       enum reason_type type; /* result of get_ins_type() for that ip */
+       int active_traces; /* raised in pre(), dropped in post() */
+};
+
+/* One traced ioremap() mapping, kept on trace_list until it is unmapped. */
+struct remap_trace {
+       struct list_head list; /* link in trace_list */
+       struct kmmio_probe probe; /* probe covering the mapped range */
+       resource_size_t phys; /* offset handed to mmiotrace_ioremap() */
+       unsigned long id; /* map id reported in the trace records */
+};
+
+/* Accessed per-cpu. */
+static DEFINE_PER_CPU(struct trap_reason, pf_reason);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
+
+#if 0 /* XXX: no way to gather this info anymore */
+/* Access to this is not per-cpu. */
+static DEFINE_PER_CPU(atomic_t, dropped);
+#endif
+
+static struct dentry *marker_file;
+
+static DEFINE_MUTEX(mmiotrace_mutex);
+static DEFINE_SPINLOCK(trace_lock);
+static atomic_t mmiotrace_enabled;
+static LIST_HEAD(trace_list);          /* struct remap_trace */
+
+/*
+ * Locking in this file:
+ * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
+ * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
+ *   and trace_lock.
+ * - Routines depending on is_enabled() must take trace_lock.
+ * - trace_list users must hold trace_lock.
+ * - is_enabled() guarantees that mmio_trace_record is allowed.
+ * - pre/post callbacks assume the effect of is_enabled() being true.
+ */
+
+/* module parameters */
+static unsigned long   filter_offset;
+static int             nommiotrace;
+static int             trace_pc;
+
+module_param(filter_offset, ulong, 0);
+module_param(nommiotrace, bool, 0);
+module_param(trace_pc, bool, 0);
+
+MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
+MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
+MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
+
+/* True while the mmiotrace_enabled counter is non-zero. */
+static bool is_enabled(void)
+{
+       return atomic_read(&mmiotrace_enabled) != 0;
+}
+
+#if 0 /* XXX: needs rewrite */
+/*
+ * Write callback for the debugfs entry:
+ * Read a marker and write it to the mmio trace log
+ *
+ * Currently compiled out. At most 65535 bytes of the user buffer are
+ * copied behind a struct mm_io_header. With the inner "#if 0" in place
+ * the relay_write() branch is disabled as well, so a call that gets past
+ * the allocation and the copy always ends with -EINVAL.
+ */
+static ssize_t write_marker(struct file *file, const char __user *buffer,
+                                               size_t count, loff_t *ppos)
+{
+       char *event = NULL;
+       struct mm_io_header *headp;
+       ssize_t len = (count > 65535) ? 65535 : count;
+
+       event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
+       if (!event)
+               return -ENOMEM;
+
+       headp = (struct mm_io_header *)event;
+       headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
+       headp->data_len = len;
+
+       if (copy_from_user(event + sizeof(*headp), buffer, len)) {
+               kfree(event);
+               return -EFAULT;
+       }
+
+       spin_lock_irq(&trace_lock);
+#if 0 /* XXX: convert this to use tracing */
+       if (is_enabled())
+               relay_write(chan, event, sizeof(*headp) + len);
+       else
+#endif
+               len = -EINVAL;
+       spin_unlock_irq(&trace_lock);
+       kfree(event);
+       return len;
+}
+#endif
+
+/*
+ * Log the page table entry that maps @address together with its present
+ * bit. A missing entry is reported and ignored; a PG_LEVEL_2M mapping is
+ * treated as fatal (BUG).
+ */
+static void print_pte(unsigned long address)
+{
+       unsigned int level;
+       unsigned long long raw;
+       pte_t *pte = lookup_address(address, &level);
+
+       if (!pte) {
+               pr_err(NAME "Error in %s: no pte for page 0x%08lx\n",
+                                                       __func__, address);
+               return;
+       }
+
+       if (level == PG_LEVEL_2M) {
+               pr_emerg(NAME "4MB pages are not currently supported: "
+                                                       "0x%08lx\n", address);
+               BUG();
+       }
+
+       raw = (unsigned long long)pte_val(*pte);
+       pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address,
+               raw, raw & _PAGE_PRESENT);
+}
+
+/*
+ * The pre/post handler pairs were called in an unmatched order: dump the
+ * current and the previously recorded fault (addresses, pte, instruction
+ * pointers, registers) and then BUG().
+ */
+static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
+{
+       const struct trap_reason *last = &get_cpu_var(pf_reason);
+
+       pr_emerg(NAME "unexpected fault for address: 0x%08lx, "
+                                       "last fault for address: 0x%08lx\n",
+                                       addr, last->addr);
+       print_pte(addr);
+       print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
+       print_symbol(KERN_EMERG "last faulting IP was at %s\n", last->ip);
+
+       /* Register dump; the layout depends on the word size. */
+#ifdef __i386__
+       pr_emerg("eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
+                       regs->ax, regs->bx, regs->cx, regs->dx);
+       pr_emerg("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
+                       regs->si, regs->di, regs->bp, regs->sp);
+#else
+       pr_emerg("rax: %016lx   rcx: %016lx   rdx: %016lx\n",
+                                       regs->ax, regs->cx, regs->dx);
+       pr_emerg("rsi: %016lx   rdi: %016lx   rbp: %016lx   rsp: %016lx\n",
+                               regs->si, regs->di, regs->bp, regs->sp);
+#endif
+
+       put_cpu_var(pf_reason);
+       BUG();
+}
+
+/*
+ * kmmio pre-handler, called from kmmio_handler() before the faulting
+ * instruction is single-stepped. Classifies the instruction and fills this
+ * CPU's trap_reason and mmiotrace_rw records; post() emits the record.
+ */
+static void pre(struct kmmio_probe *p, struct pt_regs *regs,
+                                               unsigned long addr)
+{
+       struct trap_reason *reason = &get_cpu_var(pf_reason);
+       struct mmiotrace_rw *rw = &get_cpu_var(cpu_trace);
+       const unsigned long insn_addr = instruction_pointer(regs);
+       const enum reason_type type = get_ins_type(insn_addr);
+       struct remap_trace *trace = p->private;
+
+       /* it doesn't make sense to have more than one active trace per cpu */
+       if (reason->active_traces)
+               die_kmmio_nesting_error(regs, addr);
+       else
+               reason->active_traces++;
+
+       reason->type = type;
+       reason->addr = addr;
+       reason->ip = insn_addr;
+
+       rw->phys = addr - trace->probe.addr + trace->phys;
+       rw->map_id = trace->id;
+
+       /*
+        * Only record the program counter when requested.
+        * It may taint clean-room reverse engineering.
+        */
+       rw->pc = trace_pc ? insn_addr : 0;
+
+       /*
+        * XXX: the timestamp recorded will be *after* the tracing has been
+        * done, not at the time we hit the instruction. SMP implications
+        * on event ordering?
+        */
+
+       switch (type) {
+       case REG_READ:
+               /* The value read is only known afterwards; see post(). */
+               rw->opcode = MMIO_READ;
+               rw->width = get_ins_mem_width(insn_addr);
+               break;
+       case REG_WRITE:
+               rw->opcode = MMIO_WRITE;
+               rw->width = get_ins_mem_width(insn_addr);
+               rw->value = get_ins_reg_val(insn_addr, regs);
+               break;
+       case IMM_WRITE:
+               rw->opcode = MMIO_WRITE;
+               rw->width = get_ins_mem_width(insn_addr);
+               rw->value = get_ins_imm_val(insn_addr);
+               break;
+       default:
+               {
+                       /* Unknown instruction: log its first three bytes. */
+                       unsigned char *opc = (unsigned char *)insn_addr;
+
+                       rw->opcode = MMIO_UNKNOWN_OP;
+                       rw->width = 0;
+                       rw->value = opc[0] << 16 | opc[1] << 8 | opc[2];
+               }
+       }
+       put_cpu_var(cpu_trace);
+       put_cpu_var(pf_reason);
+}
+
+/*
+ * kmmio post-handler, called after the single step. For register reads
+ * the value is fetched now, then the record prepared by pre() is handed
+ * to mmio_trace_rw().
+ */
+static void post(struct kmmio_probe *p, unsigned long condition,
+                                                       struct pt_regs *regs)
+{
+       struct trap_reason *reason = &get_cpu_var(pf_reason);
+       struct mmiotrace_rw *rw = &get_cpu_var(cpu_trace);
+
+       /* this should always return the active_trace count to 0 */
+       reason->active_traces--;
+       if (reason->active_traces != 0) {
+               pr_emerg(NAME "unexpected post handler");
+               BUG();
+       }
+
+       if (reason->type == REG_READ)
+               rw->value = get_ins_reg_val(reason->ip, regs);
+
+       mmio_trace_rw(rw);
+       put_cpu_var(cpu_trace);
+       put_cpu_var(pf_reason);
+}
+
+/*
+ * Record a new ioremap() mapping: log it via mmio_trace_mapping(), put a
+ * remap_trace on trace_list and, unless nommiotrace is set, register a
+ * kmmio probe for the mapped range.
+ *
+ * The remap_trace is allocated before trace_lock is taken, because
+ * GFP_KERNEL may sleep. If tracing turns out to be disabled once the lock
+ * is held, the allocation is freed again instead of being leaked.
+ */
+static void ioremap_trace_core(resource_size_t offset, unsigned long size,
+                                                       void __iomem *addr)
+{
+       static atomic_t next_id;
+       struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
+       /* These are page-unaligned. */
+       struct mmiotrace_map map = {
+               .phys = offset,
+               .virt = (unsigned long)addr,
+               .len = size,
+               .opcode = MMIO_PROBE
+       };
+
+       if (!trace) {
+               pr_err(NAME "kmalloc failed in ioremap\n");
+               return;
+       }
+
+       *trace = (struct remap_trace) {
+               .probe = {
+                       .addr = (unsigned long)addr,
+                       .len = size,
+                       .pre_handler = pre,
+                       .post_handler = post,
+                       .private = trace
+               },
+               .phys = offset,
+               .id = atomic_inc_return(&next_id)
+       };
+       map.map_id = trace->id;
+
+       spin_lock_irq(&trace_lock);
+       if (!is_enabled()) {
+               /* Nothing will ever reference trace: do not leak it. */
+               kfree(trace);
+               goto not_enabled;
+       }
+
+       mmio_trace_mapping(&map);
+       list_add_tail(&trace->list, &trace_list);
+       if (!nommiotrace)
+               register_kmmio_probe(&trace->probe);
+
+not_enabled:
+       spin_unlock_irq(&trace_lock);
+}
+
+/*
+ * ioremap() hook: start tracing the new mapping when mmiotrace is enabled
+ * and the mapping passes the filter_offset module parameter.
+ */
+void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+                                               void __iomem *addr)
+{
+       /* Cheap unlocked test; recheck and proper locking in *_core() */
+       if (!is_enabled())
+               return;
+
+       pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n",
+                               (unsigned long long)offset, size, addr);
+
+       /* A non-zero filter_offset restricts tracing to that one offset. */
+       if (!filter_offset || offset == filter_offset)
+               ioremap_trace_core(offset, size, addr);
+}
+
+/*
+ * Stop tracing the mapping that starts at @addr: unregister its probe
+ * (unless nommiotrace is set), take it off trace_list, log an
+ * MMIO_UNPROBE record and free the remap_trace after an RCU grace period.
+ * An unknown @addr is logged with map id -1.
+ */
+static void iounmap_trace_core(volatile void __iomem *addr)
+{
+       struct mmiotrace_map map = {
+               .phys = 0,
+               .virt = (unsigned long)addr,
+               .len = 0,
+               .opcode = MMIO_UNPROBE
+       };
+       struct remap_trace *found = NULL;
+       struct remap_trace *cur;
+       struct remap_trace *next;
+
+       pr_debug(NAME "Unmapping %p.\n", addr);
+
+       spin_lock_irq(&trace_lock);
+       if (is_enabled()) {
+               list_for_each_entry_safe(cur, next, &trace_list, list) {
+                       if ((unsigned long)addr != cur->probe.addr)
+                               continue;
+
+                       if (!nommiotrace)
+                               unregister_kmmio_probe(&cur->probe);
+                       list_del(&cur->list);
+                       found = cur;
+                       break;
+               }
+               map.map_id = (found) ? found->id : -1;
+               mmio_trace_mapping(&map);
+       }
+       spin_unlock_irq(&trace_lock);
+
+       if (found) {
+               synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+               kfree(found);
+       }
+}
+
+/* iounmap() hook: forward to iounmap_trace_core() while tracing is enabled. */
+void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+       might_sleep();
+
+       /* Unlocked test; recheck and proper locking in *_core() */
+       if (!is_enabled())
+               return;
+
+       iounmap_trace_core(addr);
+}
+
+/*
+ * Drop every mapping still on trace_list: unregister the probes, wait for
+ * an RCU grace period, then free the entries.
+ */
+static void clear_trace_list(void)
+{
+       struct remap_trace *cur;
+       struct remap_trace *next;
+
+       /*
+        * No locking required, because the caller ensures we are in a
+        * critical section via mutex, and is_enabled() is false,
+        * i.e. nothing can traverse or modify this list.
+        * Caller also ensures is_enabled() cannot change.
+        */
+       list_for_each_entry(cur, &trace_list, list) {
+               pr_notice(NAME "purging non-iounmapped "
+                                       "trace @0x%08lx, size 0x%lx.\n",
+                                       cur->probe.addr, cur->probe.len);
+               if (nommiotrace)
+                       continue;
+               unregister_kmmio_probe(&cur->probe);
+       }
+       synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+
+       /* Entries are unlinked while iterating, hence the _safe walk. */
+       list_for_each_entry_safe(cur, next, &trace_list, list) {
+               list_del(&cur->list);
+               kfree(cur);
+       }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static cpumask_t downed_cpus;
+
+/*
+ * Take every online CPU except the first one down and remember them in
+ * downed_cpus. Failures are logged, and a warning is printed if more than
+ * one CPU is still online afterwards.
+ */
+static void enter_uniprocessor(void)
+{
+       int cpu;
+
+       /* Snapshot the online map while it cannot change. */
+       get_online_cpus();
+       downed_cpus = cpu_online_map;
+       cpu_clear(first_cpu(cpu_online_map), downed_cpus);
+       if (num_online_cpus() > 1)
+               pr_notice(NAME "Disabling non-boot CPUs...\n");
+       put_online_cpus();
+
+       for_each_cpu_mask(cpu, downed_cpus) {
+               int err = cpu_down(cpu);
+
+               if (err)
+                       pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err);
+               else
+                       pr_info(NAME "CPU%d is down.\n", cpu);
+       }
+
+       if (num_online_cpus() > 1)
+               pr_warning(NAME "multiple CPUs still online, "
+                                               "may miss events.\n");
+}
+
+/* Bring every CPU recorded in downed_cpus back up, logging each result. */
+static void leave_uniprocessor(void)
+{
+       int cpu;
+
+       if (!cpus_weight(downed_cpus))
+               return;
+
+       pr_notice(NAME "Re-enabling CPUs...\n");
+       for_each_cpu_mask(cpu, downed_cpus) {
+               int err = cpu_up(cpu);
+
+               if (err)
+                       pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err);
+               else
+                       pr_info(NAME "enabled CPU%d.\n", cpu);
+       }
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+/* Without CPU hotplug, only warn when more than one CPU is online. */
+static void enter_uniprocessor(void)
+{
+       if (num_online_cpus() <= 1)
+               return;
+
+       pr_warning(NAME "multiple CPUs are online, may miss events. "
+               "Suggest booting with maxcpus=1 kernel argument.\n");
+}
+
+/* Without CPU hotplug, no CPU was taken down, so there is nothing to undo. */
+static void leave_uniprocessor(void)
+{
+       /* intentionally empty */
+}
+#endif
+
+#if 0 /* XXX: out of order */
+static struct file_operations fops_marker = {
+       .owner =        THIS_MODULE,
+       .write =        write_marker
+};
+#endif
+
+/*
+ * Switch MMIO tracing on. Serialised by mmiotrace_mutex; does nothing if
+ * tracing is already enabled.
+ */
+void enable_mmiotrace(void)
+{
+       mutex_lock(&mmiotrace_mutex);
+       if (!is_enabled()) {
+#if 0 /* XXX: tracing does not support text entries */
+               marker_file = debugfs_create_file("marker", 0660, dir, NULL,
+                                                               &fops_marker);
+               if (!marker_file)
+                       pr_err(NAME "marker file creation failed.\n");
+#endif
+
+               if (nommiotrace)
+                       pr_info(NAME "MMIO tracing disabled.\n");
+               enter_uniprocessor();
+
+               /* The flag may only change with trace_lock held. */
+               spin_lock_irq(&trace_lock);
+               atomic_inc(&mmiotrace_enabled);
+               spin_unlock_irq(&trace_lock);
+               pr_info(NAME "enabled.\n");
+       }
+       mutex_unlock(&mmiotrace_mutex);
+}
+
+/*
+ * Switch MMIO tracing off: clear the enabled flag, purge all remaining
+ * traces and undo enter_uniprocessor(). Serialised by mmiotrace_mutex;
+ * does nothing if tracing is not enabled.
+ */
+void disable_mmiotrace(void)
+{
+       mutex_lock(&mmiotrace_mutex);
+       if (is_enabled()) {
+               /* The flag may only change with trace_lock held. */
+               spin_lock_irq(&trace_lock);
+               atomic_dec(&mmiotrace_enabled);
+               BUG_ON(is_enabled());
+               spin_unlock_irq(&trace_lock);
+
+               clear_trace_list(); /* guarantees: no more kmmio callbacks */
+               leave_uniprocessor();
+               if (marker_file) {
+                       debugfs_remove(marker_file);
+                       marker_file = NULL;
+               }
+
+               pr_info(NAME "disabled.\n");
+       }
+       mutex_unlock(&mmiotrace_mutex);
+}
index fb6f2ab40dda092f5029b1e29542fb2d9accb6f7..47f4e2e4a0968ca848c2d16358d6ae7e43a6f565 100644 (file)
@@ -262,6 +262,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
 
        return pte_offset_kernel(pmd, address);
 }
+EXPORT_SYMBOL_GPL(lookup_address);
 
 /*
  * Set the new pmd in all the pgds we know about:
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644 (file)
index 0000000..efa1911
--- /dev/null
@@ -0,0 +1,489 @@
+/*
+ *  Fault Injection Test harness (FI)
+ *  Copyright (C) Intel Corp.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ *  USA.
+ *
+ */
+
+/*  Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
+ *  Copyright by Intel Corp., 2002
+ *  Louis Zhuang (louis.zhuang@intel.com)
+ *
+ *  Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
+ */
+
+#include <linux/module.h>
+#include <linux/ptrace.h> /* struct pt_regs */
+#include "pf_in.h"
+
+/*
+ * Opcode lookup tables used by the instruction decoder in this file.
+ *
+ * Two-byte opcodes (0x0F escape) are stored the way get_opcode() produces
+ * them: escape byte in the low 8 bits, second opcode byte in bits 8-15, so
+ * the byte sequence 0x0F 0xB6 appears as 0xB60F.
+ *
+ *   prefix_codes  bytes skip_prefix() treats as instruction prefixes
+ *   reg_rop       opcodes classified as REG_READ
+ *   reg_wop       opcodes classified as REG_WRITE
+ *   imm_wop       opcodes classified as IMM_WRITE
+ *   rw8, rw32     register operand width classes, see get_ins_reg_width()
+ *   mw8 .. mw64   memory operand width classes, see get_ins_mem_width()
+ *
+ * The tables are only ever read, hence const.
+ */
+#ifdef __i386__
+/* IA32 Manual 3, 2-1 */
+/* 0x2E and 0x3E are listed twice; the duplicates are harmless for lookup. */
+static const unsigned char prefix_codes[] = {
+       0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
+       0x65, 0x2E, 0x3E, 0x66, 0x67
+};
+/* IA32 Manual 3, 3-432*/
+static const unsigned int reg_rop[] = {
+       0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static const unsigned int reg_wop[] = { 0x88, 0x89 };
+static const unsigned int imm_wop[] = { 0xC6, 0xC7 };
+/* IA32 Manual 3, 3-432*/
+static const unsigned int rw8[] = { 0x88, 0x8A, 0xC6 };
+static const unsigned int rw32[] = {
+       0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static const unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F };
+static const unsigned int mw16[] = { 0xB70F, 0xBF0F };
+static const unsigned int mw32[] = { 0x89, 0x8B, 0xC7 };
+/* no 64 bit accesses on i386 */
+static const unsigned int mw64[] = {};
+#else /* not __i386__ */
+static const unsigned char prefix_codes[] = {
+       0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
+       0xF0, 0xF3, 0xF2,
+       /* REX Prefixes */
+       0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+       0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
+};
+/* AMD64 Manual 3, Appendix A*/
+static const unsigned int reg_rop[] = {
+       0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static const unsigned int reg_wop[] = { 0x88, 0x89 };
+static const unsigned int imm_wop[] = { 0xC6, 0xC7 };
+static const unsigned int rw8[] = { 0xC6, 0x88, 0x8A };
+static const unsigned int rw32[] = {
+       0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+/* 8 bit only */
+static const unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F };
+/* 16 bit only */
+static const unsigned int mw16[] = { 0xB70F, 0xBF0F };
+/* 16 or 32 bit */
+static const unsigned int mw32[] = { 0xC7 };
+/* 16, 32 or 64 bit */
+static const unsigned int mw64[] = { 0x89, 0x8B };
+#endif /* not __i386__ */
+
+/*
+ * Step over every leading prefix byte of the instruction at @addr.
+ *
+ * A byte counts as a prefix when it is listed in prefix_codes[]. While
+ * scanning, these flags are collected (all of them start out as 0):
+ *   *@shorted   set when a 0x66 prefix is seen
+ *   *@enlarged  amd64 only: set for a prefix with (byte & 0xf8) == 0x48
+ *   *@rexr      amd64 only: set for a prefix with (byte & 0xf4) == 0x44
+ *
+ * Returns the number of prefix bytes, i.e. the offset of the opcode.
+ */
+static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged,
+                                                               int *rexr)
+{
+       unsigned char *cur = addr;
+       int idx;
+
+       *shorted = 0;
+       *enlarged = 0;
+       *rexr = 0;
+
+       for (;;) {
+               /* look the current byte up in the prefix table */
+               for (idx = 0; idx < ARRAY_SIZE(prefix_codes); idx++)
+                       if (prefix_codes[idx] == *cur)
+                               break;
+
+               /* not a prefix: this is where the opcode starts */
+               if (idx == ARRAY_SIZE(prefix_codes))
+                       break;
+
+               if (*cur == 0x66)
+                       *shorted = 1;
+#ifdef __amd64__
+               if ((*cur & 0xf8) == 0x48)
+                       *enlarged = 1;
+               if ((*cur & 0xf4) == 0x44)
+                       *rexr = 1;
+#endif
+               cur++;
+       }
+
+       return cur - addr;
+}
+
+/*
+ * Read the opcode found at @addr into *@opcode.
+ *
+ * A leading 0x0F escape byte means a two-byte opcode. It is stored with the
+ * escape byte in the low 8 bits and the second byte in bits 8-15, which is
+ * the form used by the opcode tables (0x0F 0xB6 becomes 0xB60F).
+ *
+ * Returns the number of opcode bytes consumed (1 or 2).
+ */
+static int get_opcode(unsigned char *addr, unsigned int *opcode)
+{
+       if (*addr == 0x0F) {
+               /*
+                * Assemble the value byte by byte instead of loading an
+                * unsigned short through a cast pointer: same result on
+                * little-endian x86, without the unaligned type-punned load.
+                */
+               *opcode = addr[0] | ((unsigned int)addr[1] << 8);
+               return 2;
+       }
+
+       *opcode = *addr;
+       return 1;
+}
+
+/* Return non-zero if @opcode is among the first @count entries of @table. */
+static int opcode_in_table(unsigned int opcode, const unsigned int *table,
+                                                               int count)
+{
+       int idx;
+
+       for (idx = 0; idx < count; idx++)
+               if (table[idx] == opcode)
+                       return 1;
+
+       return 0;
+}
+
+/*
+ * Classify the instruction at @ins_addr by its opcode (prefixes are
+ * skipped first):
+ *   REG_READ   opcode is listed in reg_rop[]
+ *   REG_WRITE  opcode is listed in reg_wop[]
+ *   IMM_WRITE  opcode is listed in imm_wop[]
+ *   OTHERS     anything else
+ */
+enum reason_type get_ins_type(unsigned long ins_addr)
+{
+       unsigned char *insn = (unsigned char *)ins_addr;
+       int shorted, enlarged, rexr;
+       unsigned int opcode;
+
+       insn += skip_prefix(insn, &shorted, &enlarged, &rexr);
+       get_opcode(insn, &opcode);
+
+       if (opcode_in_table(opcode, reg_rop, ARRAY_SIZE(reg_rop)))
+               return REG_READ;
+       if (opcode_in_table(opcode, reg_wop, ARRAY_SIZE(reg_wop)))
+               return REG_WRITE;
+       if (opcode_in_table(opcode, imm_wop, ARRAY_SIZE(imm_wop)))
+               return IMM_WRITE;
+
+       return OTHERS;
+}
+
+/*
+ * Return the width in bytes of the register/immediate operand of the
+ * instruction at @ins_addr:
+ *   1        for opcodes listed in rw8[]
+ *   2, 8, 4  for opcodes listed in rw32[]: 2 when a 0x66 prefix was seen,
+ *            otherwise 8 when skip_prefix() reported "enlarged", else 4
+ *   0        (after logging an error) for any other opcode
+ */
+static unsigned int get_ins_reg_width(unsigned long ins_addr)
+{
+       unsigned char *insn = (unsigned char *)ins_addr;
+       int shorted, enlarged, rexr;
+       unsigned int opcode;
+       int idx;
+
+       insn += skip_prefix(insn, &shorted, &enlarged, &rexr);
+       get_opcode(insn, &opcode);
+
+       for (idx = 0; idx < ARRAY_SIZE(rw8); idx++) {
+               if (opcode == rw8[idx])
+                       return 1;
+       }
+
+       for (idx = 0; idx < ARRAY_SIZE(rw32); idx++) {
+               if (opcode != rw32[idx])
+                       continue;
+               if (shorted)
+                       return 2;
+               if (enlarged)
+                       return 8;
+               return 4;
+       }
+
+       printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
+       return 0;
+}
+
+/*
+ * Return the width in bytes of the memory operand of the instruction at
+ * @ins_addr, or 0 (after logging an error) when the opcode is in none of
+ * the mw8/mw16/mw32/mw64 tables.
+ *
+ * mw8 and mw16 opcodes have a fixed memory width. For the remaining
+ * opcodes the width follows the operand size: 2 with a 0x66 prefix,
+ * otherwise 8 when skip_prefix() reported "enlarged", else 4.
+ */
+unsigned int get_ins_mem_width(unsigned long ins_addr)
+{
+       unsigned int opcode;
+       unsigned char *p;
+       int i, shorted, enlarged, rexr;
+
+       p = (unsigned char *)ins_addr;
+       p += skip_prefix(p, &shorted, &enlarged, &rexr);
+       p += get_opcode(p, &opcode);
+
+       for (i = 0; i < ARRAY_SIZE(mw8); i++)
+               if (mw8[i] == opcode)
+                       return 1;
+
+       for (i = 0; i < ARRAY_SIZE(mw16); i++)
+               if (mw16[i] == opcode)
+                       return 2;
+
+       /*
+        * The operand size applies to these opcodes as well: with REX.W set
+        * (enlarged) 0xC7 stores its sign-extended imm32 as a 64 bit value,
+        * so the access is 8 bytes wide, not 4. enlarged is never set on
+        * i386, so nothing changes there.
+        */
+       for (i = 0; i < ARRAY_SIZE(mw32); i++)
+               if (mw32[i] == opcode)
+                       return shorted ? 2 : (enlarged ? 8 : 4);
+
+       for (i = 0; i < ARRAY_SIZE(mw64); i++)
+               if (mw64[i] == opcode)
+                       return shorted ? 2 : (enlarged ? 8 : 4);
+
+       printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
+       return 0;
+}
+
+/*
+ * Define register ident in mod/rm byte.
+ * Note: these are NOT the same as in ptrace-abi.h.
+ *
+ * Values 0-7 appear twice: once under their 8 bit names (used by
+ * get_reg_w8()) and once under their full-width names (used by
+ * get_reg_w32()). The amd64-only values 8-15 are produced by
+ * get_ins_reg_val() OR-ing (rexr << 3) into the 3 bit field.
+ */
+enum {
+       /* names used for 8 bit operands */
+       arg_AL = 0,
+       arg_CL = 1,
+       arg_DL = 2,
+       arg_BL = 3,
+       arg_AH = 4,
+       arg_CH = 5,
+       arg_DH = 6,
+       arg_BH = 7,
+
+       /* names used for wider operands */
+       arg_AX = 0,
+       arg_CX = 1,
+       arg_DX = 2,
+       arg_BX = 3,
+       arg_SP = 4,
+       arg_BP = 5,
+       arg_SI = 6,
+       arg_DI = 7,
+#ifdef __amd64__
+       arg_R8  = 8,
+       arg_R9  = 9,
+       arg_R10 = 10,
+       arg_R11 = 11,
+       arg_R12 = 12,
+       arg_R13 = 13,
+       arg_R14 = 14,
+       arg_R15 = 15
+#endif
+};
+
+/*
+ * Return a pointer to the byte inside @regs that backs 8 bit register
+ * number @no (arg_AL ... arg_BH, plus arg_R8 ... arg_R15 on amd64), or NULL
+ * after logging an error when @no is not handled.
+ *
+ * AL/BL/CL/DL and R8-R15 map to the first byte of the saved register,
+ * AH/BH/CH/DH to the byte after it.
+ *
+ * NOTE(review): when a REX prefix is present, encodings 4-7 architecturally
+ * name SPL/BPL/SIL/DIL rather than AH/CH/DH/BH; this lookup is not told
+ * about the prefix - confirm against the Intel SDM and the caller.
+ */
+static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
+{
+       unsigned char *rv = NULL;
+
+       switch (no) {
+       case arg_AL:
+               rv = (unsigned char *)&regs->ax;
+               break;
+       case arg_BL:
+               rv = (unsigned char *)&regs->bx;
+               break;
+       case arg_CL:
+               rv = (unsigned char *)&regs->cx;
+               break;
+       case arg_DL:
+               rv = (unsigned char *)&regs->dx;
+               break;
+       /* high-byte registers: second byte of the saved register */
+       case arg_AH:
+               rv = 1 + (unsigned char *)&regs->ax;
+               break;
+       case arg_BH:
+               rv = 1 + (unsigned char *)&regs->bx;
+               break;
+       case arg_CH:
+               rv = 1 + (unsigned char *)&regs->cx;
+               break;
+       case arg_DH:
+               rv = 1 + (unsigned char *)&regs->dx;
+               break;
+#ifdef __amd64__
+       case arg_R8:
+               rv = (unsigned char *)&regs->r8;
+               break;
+       case arg_R9:
+               rv = (unsigned char *)&regs->r9;
+               break;
+       case arg_R10:
+               rv = (unsigned char *)&regs->r10;
+               break;
+       case arg_R11:
+               rv = (unsigned char *)&regs->r11;
+               break;
+       case arg_R12:
+               rv = (unsigned char *)&regs->r12;
+               break;
+       case arg_R13:
+               rv = (unsigned char *)&regs->r13;
+               break;
+       case arg_R14:
+               rv = (unsigned char *)&regs->r14;
+               break;
+       case arg_R15:
+               rv = (unsigned char *)&regs->r15;
+               break;
+#endif
+       default:
+               printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+               break;
+       }
+       return rv;
+}
+
+/*
+ * Return a pointer to the saved copy in @regs of general purpose register
+ * number @no (arg_AX ... arg_DI, plus arg_R8 ... arg_R15 on amd64), or NULL
+ * after logging an error when @no is not handled.
+ *
+ * Despite the name, the pointer refers to the full unsigned long slot;
+ * get_ins_reg_val() casts it down to read 16 or 32 bit values.
+ */
+static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
+{
+       unsigned long *rv = NULL;
+
+       switch (no) {
+       case arg_AX:
+               rv = &regs->ax;
+               break;
+       case arg_BX:
+               rv = &regs->bx;
+               break;
+       case arg_CX:
+               rv = &regs->cx;
+               break;
+       case arg_DX:
+               rv = &regs->dx;
+               break;
+       case arg_SP:
+               rv = &regs->sp;
+               break;
+       case arg_BP:
+               rv = &regs->bp;
+               break;
+       case arg_SI:
+               rv = &regs->si;
+               break;
+       case arg_DI:
+               rv = &regs->di;
+               break;
+#ifdef __amd64__
+       case arg_R8:
+               rv = &regs->r8;
+               break;
+       case arg_R9:
+               rv = &regs->r9;
+               break;
+       case arg_R10:
+               rv = &regs->r10;
+               break;
+       case arg_R11:
+               rv = &regs->r11;
+               break;
+       case arg_R12:
+               rv = &regs->r12;
+               break;
+       case arg_R13:
+               rv = &regs->r13;
+               break;
+       case arg_R14:
+               rv = &regs->r14;
+               break;
+       case arg_R15:
+               rv = &regs->r15;
+               break;
+#endif
+       default:
+               /* rv stays NULL for an unknown register number */
+               printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+       }
+
+       return rv;
+}
+
+/*
+ * Return the current value of the register operand (the ModRM reg field)
+ * of the instruction at @ins_addr, read from the saved registers in @regs.
+ *
+ * Only opcodes listed in reg_rop[] or reg_wop[] are accepted. The value is
+ * read with the operand width reported by get_ins_reg_width(). On any
+ * error a message is logged and 0 is returned.
+ */
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
+{
+       unsigned int opcode;
+       unsigned char mod_rm;
+       int reg;
+       int width;
+       unsigned char *p;
+       unsigned char *reg8;
+       unsigned long *regw;
+       int i, shorted, enlarged, rexr;
+
+       p = (unsigned char *)ins_addr;
+       p += skip_prefix(p, &shorted, &enlarged, &rexr);
+       p += get_opcode(p, &opcode);
+       for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
+               if (reg_rop[i] == opcode)
+                       goto do_work;
+
+       for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
+               if (reg_wop[i] == opcode)
+                       goto do_work;
+
+       printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
+                                                       "0x%02x\n", opcode);
+       goto err;
+
+do_work:
+       mod_rm = *p;
+       /* reg field is ModRM bits 5:3; rexr (amd64 only) supplies bit 3 */
+       reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);
+       width = get_ins_reg_width(ins_addr);
+       switch (width) {
+       case 1:
+               reg8 = get_reg_w8(reg, regs);
+               if (!reg8)
+                       goto err; /* lookup already logged the problem */
+               return *reg8;
+
+       case 2:
+       case 4:
+#ifdef __amd64__
+       case 8:
+#endif
+               regw = get_reg_w32(reg, regs);
+               if (!regw)
+                       goto err; /* lookup already logged the problem */
+               if (width == 2)
+                       return *(unsigned short *)regw;
+               if (width == 4)
+                       return *(unsigned int *)regw;
+               return *regw;
+
+       default:
+               /* report the offending width, not the register number */
+               printk(KERN_ERR "mmiotrace: Error width# %d\n", width);
+       }
+
+err:
+       return 0;
+}
+
+/*
+ * Return the immediate operand of the instruction at @ins_addr.
+ *
+ * Only opcodes listed in imm_wop[] (0xC6, 0xC7) are accepted. The ModRM
+ * byte and the SIB/displacement bytes it announces are stepped over to
+ * reach the immediate. Logs an error and returns 0 when the instruction is
+ * not an immediate store to memory or has an unexpected operand width.
+ *
+ * As before, an address size prefix is not taken into account.
+ */
+unsigned long get_ins_imm_val(unsigned long ins_addr)
+{
+       unsigned int opcode;
+       unsigned char mod_rm;
+       unsigned char mod;
+       unsigned char rm;
+       unsigned char *p;
+       int i, shorted, enlarged, rexr;
+
+       p = (unsigned char *)ins_addr;
+       p += skip_prefix(p, &shorted, &enlarged, &rexr);
+       p += get_opcode(p, &opcode);
+       for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
+               if (imm_wop[i] == opcode)
+                       goto do_work;
+
+       printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
+                                                       "0x%02x\n", opcode);
+       goto err;
+
+do_work:
+       mod_rm = *p;
+       mod = mod_rm >> 6;
+       rm = mod_rm & 0x7;
+       p++;
+
+       if (mod == 3) {
+               /* register destination: no memory operand to decode */
+               printk(KERN_ERR "mmiotrace: not a memory access instruction "
+                                               "at 0x%lx, rm_mod=0x%02x\n",
+                                               ins_addr, mod_rm);
+               goto err;
+       }
+
+       /*
+        * r/m == 4 announces a SIB byte (IA32 Manual 3, Table 2-2). With
+        * mod == 0, a SIB base field of 5 means a 32 bit displacement
+        * follows the SIB byte (Table 2-3).
+        */
+       if (rm == 0x4) {
+               if (mod == 0 && (*p & 0x7) == 0x5)
+                       p += 4;
+               p++;
+       }
+
+       switch (mod) {
+       case 0:
+               /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2)  */
+               /* AMD64: XXX Check for address size prefix? */
+               if (rm == 0x5)
+                       p += 4;
+               break;
+
+       case 1:
+               p += 1;
+               break;
+
+       case 2:
+               p += 4;
+               break;
+
+       default:
+               break;
+       }
+
+       switch (get_ins_reg_width(ins_addr)) {
+       case 1:
+               return *(unsigned char *)p;
+
+       case 2:
+               return *(unsigned short *)p;
+
+       case 4:
+               return *(unsigned int *)p;
+
+#ifdef __amd64__
+       case 8:
+               /*
+                * Even with REX.W the instruction only carries a 32 bit
+                * immediate, which the CPU sign-extends to 64 bits; there
+                * are no 8 immediate bytes to read.
+                */
+               return (unsigned long)(long)*(int *)p;
+#endif
+
+       default:
+               printk(KERN_ERR "mmiotrace: Error: width.\n");
+       }
+
+err:
+       return 0;
+}
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644 (file)
index 0000000..e05341a
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ *  Fault Injection Test harness (FI)
+ *  Copyright (C) Intel Corp.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ *  USA.
+ *
+ */
+
+#ifndef __PF_H_
+#define __PF_H_
+
+/*
+ * Forward declaration, so that the prototypes below do not depend on the
+ * includer having pulled in the definition of struct pt_regs first.
+ */
+struct pt_regs;
+
+/* Classification of an access, as returned by get_ins_type(). */
+enum reason_type {
+       NOT_ME, /* page fault is not in regions */
+       NOTHING,        /* access others point in regions */
+       REG_READ,       /* read from addr to reg */
+       REG_WRITE,      /* write from reg to addr */
+       IMM_WRITE,      /* write from imm to addr */
+       OTHERS  /* Other instructions can not intercept */
+};
+
+/* All of these take the address of the instruction to decode. */
+enum reason_type get_ins_type(unsigned long ins_addr);
+unsigned int get_ins_mem_width(unsigned long ins_addr);
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
+unsigned long get_ins_imm_val(unsigned long ins_addr);
+
+#endif /* __PF_H_ */
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644 (file)
index 0000000..d877c5b
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * Written by Pekka Paalanen, 2008 <pq@iki.fi>
+ */
+#include <linux/module.h>
+#include <linux/io.h>
+
+#define MODULE_NAME "testmmiotrace"
+
+static unsigned long mmio_address;
+module_param(mmio_address, ulong, 0);
+MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
+
+/*
+ * Write a fixed pattern through the mapping at @p, in three passes:
+ * bytes at offsets 0..255, 16 bit words from 1 kB up to 5 kB, and 32 bit
+ * words from 5 kB up to 16 kB. The value written depends only on the
+ * offset.
+ */
+static void do_write_test(void __iomem *p)
+{
+       unsigned int off;
+
+       off = 0;
+       while (off < 256) {
+               iowrite8(off, p + off);
+               off++;
+       }
+
+       off = 1024;
+       while (off < (5 * 1024)) {
+               iowrite16(off * 12 + 7, p + off);
+               off += 2;
+       }
+
+       off = 5 * 1024;
+       while (off < (16 * 1024)) {
+               iowrite32(off * 212371 + 13, p + off);
+               off += 4;
+       }
+}
+
+/*
+ * Read back through the mapping at @p using the same three passes as
+ * do_write_test(): bytes at offsets 0..255, 16 bit words from 1 kB up to
+ * 5 kB, and 32 bit words from 5 kB up to 16 kB. The values are discarded.
+ */
+static void do_read_test(void __iomem *p)
+{
+       unsigned int off;
+
+       off = 0;
+       while (off < 256) {
+               ioread8(p + off);
+               off++;
+       }
+
+       off = 1024;
+       while (off < (5 * 1024)) {
+               ioread16(p + off);
+               off += 2;
+       }
+
+       off = 5 * 1024;
+       while (off < (16 * 1024)) {
+               ioread32(p + off);
+               off += 4;
+       }
+}
+
+/*
+ * Map 0x4000 bytes (16 kB) at mmio_address uncached, run the write test
+ * followed by the read test, and unmap again. Only logs an error when the
+ * mapping cannot be set up.
+ */
+static void do_test(void)
+{
+       void __iomem *base;
+
+       base = ioremap_nocache(mmio_address, 0x4000);
+       if (base) {
+               do_write_test(base);
+               do_read_test(base);
+               iounmap(base);
+               return;
+       }
+
+       pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
+}
+
+/*
+ * Module entry point. Refuses to load with -ENXIO unless the mmio_address
+ * parameter was given; otherwise warns and runs the test once, then
+ * returns 0.
+ */
+static int __init init(void)
+{
+       if (mmio_address != 0) {
+               pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
+                                       "in PCI address space, and writing "
+                                       "rubbish in there.\n", mmio_address);
+               do_test();
+               return 0;
+       }
+
+       pr_err(MODULE_NAME ": you have to use the module argument "
+                                               "mmio_address.\n");
+       pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
+                       " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
+       return -ENXIO;
+}
+
+/* Module exit: nothing to release here, only emits a debug message. */
+static void __exit cleanup(void)
+{
+       pr_debug(MODULE_NAME ": unloaded.\n");
+}
+
+module_init(init);
+module_exit(cleanup);
+MODULE_LICENSE("GPL");
index efa2ba7c600567ea95c29202eee2d87b892cb60d..1ef0f90813d626ed6be436b93d3d5b6550dbb392 100644 (file)
@@ -23,7 +23,7 @@
 
 #define gtod vdso_vsyscall_gtod_data
 
-static long vdso_fallback_gettime(long clock, struct timespec *ts)
+notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
        long ret;
        asm("syscall" : "=a" (ret) :
@@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts)
        return ret;
 }
 
-static inline long vgetns(void)
+notrace static inline long vgetns(void)
 {
        long v;
        cycles_t (*vread)(void);
@@ -40,7 +40,7 @@ static inline long vgetns(void)
        return (v * gtod->clock.mult) >> gtod->clock.shift;
 }
 
-static noinline int do_realtime(struct timespec *ts)
+notrace static noinline int do_realtime(struct timespec *ts)
 {
        unsigned long seq, ns;
        do {
@@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts)
 }
 
 /* Copy of the version in kernel/time.c which we cannot directly access */
-static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
+notrace static void
+vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
 {
        while (nsec >= NSEC_PER_SEC) {
                nsec -= NSEC_PER_SEC;
@@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
        ts->tv_nsec = nsec;
 }
 
-static noinline int do_monotonic(struct timespec *ts)
+notrace static noinline int do_monotonic(struct timespec *ts)
 {
        unsigned long seq, ns, secs;
        do {
@@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts)
        return 0;
 }
 
-int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
        if (likely(gtod->sysctl_enabled && gtod->clock.vread))
                switch (clock) {
@@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 int clock_gettime(clockid_t, struct timespec *)
        __attribute__((weak, alias("__vdso_clock_gettime")));
 
-int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
        long ret;
        if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
index c8097f17f8a978a5d956d74bbd8e6150fefb9ddb..9fbc6b20026b5ac4f23f9704e2cac9b1daaaf3a6 100644 (file)
@@ -13,7 +13,8 @@
 #include <asm/vgtod.h>
 #include "vextern.h"
 
-long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
        unsigned int p;
 
diff --git a/include/asm-arm/ftrace.h b/include/asm-arm/ftrace.h
new file mode 100644 (file)
index 0000000..584ef9a
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef _ASM_ARM_FTRACE
+#define _ASM_ARM_FTRACE
+
+#ifdef CONFIG_FTRACE
+/* Address of the mcount entry point, as a long. */
+#define MCOUNT_ADDR            ((long)(mcount))
+#define MCOUNT_INSN_SIZE       4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+/* Declaration for C code only; mcount is defined outside this header. */
+extern void mcount(void);
+#endif
+
+#endif /* CONFIG_FTRACE */
+
+#endif /* _ASM_ARM_FTRACE */
index c042194d3ab55fc19be196d9db37640d260f224b..b1a37876942deb793252e35f7b826f77fec0d9a1 100644 (file)
@@ -59,6 +59,7 @@ struct kprobe_ctlblk {
 };
 
 void arch_remove_kprobe(struct kprobe *);
+void kretprobe_trampoline(void);
 
 int kprobe_trap_handler(struct pt_regs *regs, unsigned int instr);
 int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr);
diff --git a/include/asm-powerpc/ftrace.h b/include/asm-powerpc/ftrace.h
new file mode 100644 (file)
index 0000000..de92132
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef _ASM_POWERPC_FTRACE
+#define _ASM_POWERPC_FTRACE
+
+#ifdef CONFIG_FTRACE
+/* Address of the _mcount entry point, as a long. */
+#define MCOUNT_ADDR            ((long)(_mcount))
+#define MCOUNT_INSN_SIZE       4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+/* Declaration for C code only; _mcount is defined outside this header. */
+extern void _mcount(void);
+#endif
+
+#endif /* CONFIG_FTRACE */
+
+#endif /* _ASM_POWERPC_FTRACE */
index ad8c9f7fd0e359f8edb08db45e645b4632b9c5c8..f75a5fc64d2e64c8eb117c3faacb0c35f804b919 100644 (file)
@@ -59,6 +59,11 @@ extern void iseries_handle_interrupts(void);
                get_paca()->hard_enabled = 0;   \
        } while(0)
 
+/* Non-zero when the saved @flags value is 0, which this variant treats as IRQs off. */
+static inline int irqs_disabled_flags(unsigned long flags)
+{
+       return flags == 0;
+}
+
 #else
 
 #if defined(CONFIG_BOOKE)
@@ -113,6 +118,11 @@ static inline void local_irq_save_ptr(unsigned long *flags)
 #define hard_irq_enable()      local_irq_enable()
 #define hard_irq_disable()     local_irq_disable()
 
+/* Non-zero when the MSR_EE bit is clear in the saved @flags value. */
+static inline int irqs_disabled_flags(unsigned long flags)
+{
+       return (flags & MSR_EE) == 0;
+}
+
 #endif /* CONFIG_PPC64 */
 
 /*
diff --git a/include/asm-sparc64/ftrace.h b/include/asm-sparc64/ftrace.h
new file mode 100644 (file)
index 0000000..d27716c
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef _ASM_SPARC64_FTRACE
+#define _ASM_SPARC64_FTRACE
+
+/*
+ * NOTE(review): keyed on CONFIG_MCOUNT, whereas the arm, powerpc and x86
+ * ftrace.h headers use CONFIG_FTRACE - confirm this is intentional.
+ */
+#ifdef CONFIG_MCOUNT
+/* Address of the _mcount entry point, as a long. */
+#define MCOUNT_ADDR            ((long)(_mcount))
+#define MCOUNT_INSN_SIZE       4 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+/* Declaration for C code only; _mcount is defined outside this header. */
+extern void _mcount(void);
+#endif
+
+#endif /* CONFIG_MCOUNT */
+
+#endif /* _ASM_SPARC64_FTRACE */
index 1f6a9ca1012607de987efdf28d2ce19db3c423e5..f6aa18eadf71717d9e86c53ac3719776fa035969 100644 (file)
@@ -72,6 +72,8 @@ static inline void alternatives_smp_module_del(struct module *mod) {}
 static inline void alternatives_smp_switch(int smp) {}
 #endif /* CONFIG_SMP */
 
+const unsigned char *const *find_nop_table(void);
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h
new file mode 100644 (file)
index 0000000..c184441
--- /dev/null
@@ -0,0 +1,14 @@
+/*
+ * The guard must define the same macro it tests; otherwise a second
+ * inclusion is not suppressed and an unrelated guard macro gets defined.
+ */
+#ifndef _ASM_X86_FTRACE
+#define _ASM_X86_FTRACE
+
+#ifdef CONFIG_FTRACE
+/* Address of the mcount entry point, as a long. */
+#define MCOUNT_ADDR            ((long)(mcount))
+#define MCOUNT_INSN_SIZE       5 /* sizeof mcount call */
+
+#ifndef __ASSEMBLY__
+/* Declaration for C code only; mcount is defined outside this header. */
+extern void mcount(void);
+#endif
+
+#endif /* CONFIG_FTRACE */
+
+#endif /* _ASM_X86_FTRACE */
index 17e7a1701c97be5c6a734e25e2add35347bb96b5..424acb48cd61baf681f7d71fb6b4ad56e1cfc3ee 100644 (file)
@@ -190,8 +190,6 @@ static inline void trace_hardirqs_fixup(void)
 #else
 
 #ifdef CONFIG_X86_64
-#define ARCH_TRACE_IRQS_ON             call trace_hardirqs_on_thunk
-#define ARCH_TRACE_IRQS_OFF            call trace_hardirqs_off_thunk
 #define ARCH_LOCKDEP_SYS_EXIT          call lockdep_sys_exit_thunk
 #define ARCH_LOCKDEP_SYS_EXIT_IRQ      \
        TRACE_IRQS_ON; \
@@ -203,24 +201,6 @@ static inline void trace_hardirqs_fixup(void)
        TRACE_IRQS_OFF;
 
 #else
-#define ARCH_TRACE_IRQS_ON                     \
-       pushl %eax;                             \
-       pushl %ecx;                             \
-       pushl %edx;                             \
-       call trace_hardirqs_on;                 \
-       popl %edx;                              \
-       popl %ecx;                              \
-       popl %eax;
-
-#define ARCH_TRACE_IRQS_OFF                    \
-       pushl %eax;                             \
-       pushl %ecx;                             \
-       pushl %edx;                             \
-       call trace_hardirqs_off;                \
-       popl %edx;                              \
-       popl %ecx;                              \
-       popl %eax;
-
 #define ARCH_LOCKDEP_SYS_EXIT                  \
        pushl %eax;                             \
        pushl %ecx;                             \
@@ -234,8 +214,8 @@ static inline void trace_hardirqs_fixup(void)
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-#  define TRACE_IRQS_ON                ARCH_TRACE_IRQS_ON
-#  define TRACE_IRQS_OFF       ARCH_TRACE_IRQS_OFF
+#  define TRACE_IRQS_ON                call trace_hardirqs_on_thunk;
+#  define TRACE_IRQS_OFF       call trace_hardirqs_off_thunk;
 #else
 #  define TRACE_IRQS_ON
 #  define TRACE_IRQS_OFF
index 17b3700949bfe3e90ef495fd8e09c60f7450c129..6b66ff905af0408ee9ff586c4fe49dfc001c9af1 100644 (file)
@@ -24,7 +24,8 @@ enum vsyscall_num {
        ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
 #define __section_vsyscall_clock __attribute__ \
        ((unused, __section__ (".vsyscall_clock"),aligned(16)))
-#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
+#define __vsyscall_fn \
+       __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
 
 #define VGETCPU_RDTSCP 1
 #define VGETCPU_LSL    2
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
new file mode 100644 (file)
index 0000000..f368d04
--- /dev/null
@@ -0,0 +1,144 @@
+#ifndef _LINUX_FTRACE_H
+#define _LINUX_FTRACE_H
+
+#ifdef CONFIG_FTRACE
+
+#include <linux/linkage.h>
+#include <linux/fs.h>
+
+extern int ftrace_enabled;
+extern int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+                    struct file *filp, void __user *buffer, size_t *lenp,
+                    loff_t *ppos);
+
+/* Signature of a trace callback: receives @ip and @parent_ip. */
+typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
+
+/* One registered callback; @next chains the registered entries. */
+struct ftrace_ops {
+       ftrace_func_t     func;
+       struct ftrace_ops *next;
+};
+
+/*
+ * The ftrace_ops must be a static and should also
+ * be read_mostly.  These functions do modify read_mostly variables
+ * so use them sparingly. Never free an ftrace_op or modify the
+ * next pointer after it has been registered. Even after unregistering
+ * it, the next pointer may still be used internally.
+ */
+int register_ftrace_function(struct ftrace_ops *ops);
+int unregister_ftrace_function(struct ftrace_ops *ops);
+void clear_ftrace_function(void);
+
+extern void ftrace_stub(unsigned long a0, unsigned long a1);
+
+#else /* !CONFIG_FTRACE */
+/*
+ * Stubs for builds without CONFIG_FTRACE. register/unregister evaluate to
+ * 0 so they remain usable where the int result of the real functions is
+ * consumed, and clear_ftrace_function() takes no argument, like the real
+ * prototype.
+ */
+# define register_ftrace_function(ops) ({ 0; })
+# define unregister_ftrace_function(ops) ({ 0; })
+# define clear_ftrace_function() do { } while (0)
+#endif /* CONFIG_FTRACE */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+# define FTRACE_HASHBITS       10
+# define FTRACE_HASHSIZE       (1<<FTRACE_HASHBITS)
+
+/*
+ * Single-bit flags, each a distinct power of two.
+ * NOTE(review): presumably kept in dyn_ftrace::flags - confirm against the
+ * users of these constants.
+ */
+enum {
+       FTRACE_FL_FREE          = (1 << 0),
+       FTRACE_FL_FAILED        = (1 << 1),
+       FTRACE_FL_FILTER        = (1 << 2),
+       FTRACE_FL_ENABLED       = (1 << 3),
+       FTRACE_FL_NOTRACE       = (1 << 4),
+       FTRACE_FL_CONVERTED     = (1 << 5),
+       FTRACE_FL_FROZEN        = (1 << 6),
+};
+
+/* Per call-site record; @node allows linking it into an hlist. */
+struct dyn_ftrace {
+       struct hlist_node node;
+       unsigned long     ip; /* address of mcount call-site */
+       unsigned long     flags;
+};
+
+int ftrace_force_update(void);
+void ftrace_set_filter(unsigned char *buf, int len, int reset);
+
+/* defined in arch */
+extern int ftrace_ip_converted(unsigned long ip);
+extern unsigned char *ftrace_nop_replace(void);
+extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
+extern int ftrace_dyn_arch_init(void *data);
+extern int ftrace_mcount_set(unsigned long *data);
+extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
+                             unsigned char *new_code);
+extern int ftrace_update_ftrace_func(ftrace_func_t func);
+extern void ftrace_caller(void);
+extern void ftrace_call(void);
+extern void mcount_call(void);
+
+extern int skip_trace(unsigned long ip);
+
+void ftrace_disable_daemon(void);
+void ftrace_enable_daemon(void);
+
+#else
+# define skip_trace(ip)                                ({ 0; })
+# define ftrace_force_update()                 ({ 0; })
+# define ftrace_set_filter(buf, len, reset)    do { } while (0)
+# define ftrace_disable_daemon()               do { } while (0)
+# define ftrace_enable_daemon()                        do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/* totally disable ftrace - can not re-enable after this */
+void ftrace_kill(void);
+void ftrace_kill_atomic(void);
+
+/*
+ * Clear ftrace_enabled when CONFIG_FTRACE is set; compiles to an empty
+ * function otherwise.
+ */
+static inline void tracer_disable(void)
+{
+#ifdef CONFIG_FTRACE
+       ftrace_enabled = 0;
+#endif
+}
+
+/*
+ * CALLER_ADDRn is __builtin_return_address(n) as an unsigned long. Levels
+ * 1-6 are only evaluated when CONFIG_FRAME_POINTER is set; without it they
+ * are the constant 0UL and only level 0 is real.
+ */
+#ifdef CONFIG_FRAME_POINTER
+/* TODO: need to fix this for ARM */
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
+# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4))
+# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5))
+# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6))
+#else
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 0UL
+# define CALLER_ADDR2 0UL
+# define CALLER_ADDR3 0UL
+# define CALLER_ADDR4 0UL
+# define CALLER_ADDR5 0UL
+# define CALLER_ADDR6 0UL
+#endif
+
+/*
+ * Hooks of the irqs-off and preempt-off tracers. Each pair is a real
+ * function when its tracer is configured in and an empty statement
+ * otherwise, so callers need no #ifdef.
+ */
+#ifdef CONFIG_IRQSOFF_TRACER
+  extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
+  extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
+#else
+# define time_hardirqs_on(a0, a1)              do { } while (0)
+# define time_hardirqs_off(a0, a1)             do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+  extern void trace_preempt_on(unsigned long a0, unsigned long a1);
+  extern void trace_preempt_off(unsigned long a0, unsigned long a1);
+#else
+# define trace_preempt_on(a0, a1)              do { } while (0)
+# define trace_preempt_off(a0, a1)             do { } while (0)
+#endif
+
+/*
+ * ftrace_special() is an external function when CONFIG_TRACING is set and
+ * an empty inline stub otherwise.
+ */
+#ifdef CONFIG_TRACING
+extern void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
+#else
+static inline void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
+#endif
+
+#endif /* _LINUX_FTRACE_H */
index e600c4e9b8c5b179eebdcaea3769f5bcba49f18a..2b1c2e58566ea04460370fef4a4b686794ab512d 100644 (file)
 #define _LINUX_TRACE_IRQFLAGS_H
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-  extern void trace_hardirqs_on(void);
-  extern void trace_hardirqs_off(void);
   extern void trace_softirqs_on(unsigned long ip);
   extern void trace_softirqs_off(unsigned long ip);
+  extern void trace_hardirqs_on(void);
+  extern void trace_hardirqs_off(void);
 # define trace_hardirq_context(p)      ((p)->hardirq_context)
 # define trace_softirq_context(p)      ((p)->softirq_context)
 # define trace_hardirqs_enabled(p)     ((p)->hardirqs_enabled)
 # define INIT_TRACE_IRQFLAGS
 #endif
 
+#if defined(CONFIG_IRQSOFF_TRACER) || \
+       defined(CONFIG_PREEMPT_TRACER)
+ extern void stop_critical_timings(void);
+ extern void start_critical_timings(void);
+#else
+# define stop_critical_timings() do { } while (0)
+# define start_critical_timings() do { } while (0)
+#endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 
 #include <asm/irqflags.h>
index 1036631ff4fac552ffd850e24f7e21b939fe32ad..04a3556bdea6b87e654db168240139dc887decb4 100644 (file)
@@ -259,6 +259,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 struct jprobe;
 struct kretprobe;
 
+static inline struct kprobe *get_kprobe(void *addr)
+{
+       return NULL;    /* stub: never reports a probe at addr (presumably the !CONFIG_KPROBES variant - confirm) */
+}
 static inline struct kprobe *kprobe_running(void)
 {
        return NULL;
index 9fd1f859021b5018baffdb440261379dab4a55b3..56ba37394656c7f211eaebdadf5481eda844acca 100644 (file)
@@ -4,6 +4,8 @@
 #include <linux/compiler.h>
 #include <asm/linkage.h>
 
+#define notrace __attribute__((no_instrument_function))        /* ask gcc not to emit profiling/instrumentation calls for the function */
+
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else
index 430f6adf9762d175096246d92dfe999f6b6bba9c..1290653f924181333f6699d37b161238370b1515 100644 (file)
@@ -44,8 +44,8 @@ struct marker {
                                 */
        char state;             /* Marker state. */
        char ptype;             /* probe type : 0 : single, 1 : multi */
-       void (*call)(const struct marker *mdata,        /* Probe wrapper */
-               void *call_private, const char *fmt, ...);
+                               /* Probe wrapper */
+       void (*call)(const struct marker *mdata, void *call_private, ...);
        struct marker_probe_closure single;
        struct marker_probe_closure *multi;
 } __attribute__((aligned(8)));
@@ -58,8 +58,12 @@ struct marker {
  * Make sure the alignment of the structure in the __markers section will
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
+ *
+ * The "generic" argument controls which marker enabling mechanism must be used.
+ * If generic is true, a variable read is used.
+ * If generic is false, immediate values are used.
  */
-#define __trace_mark(name, call_private, format, args...)              \
+#define __trace_mark(generic, name, call_private, format, args...)     \
        do {                                                            \
                static const char __mstrtab_##name[]                    \
                __attribute__((section("__markers_strings")))           \
@@ -72,15 +76,14 @@ struct marker {
                __mark_check_format(format, ## args);                   \
                if (unlikely(__mark_##name.state)) {                    \
                        (*__mark_##name.call)                           \
-                               (&__mark_##name, call_private,          \
-                               format, ## args);                       \
+                               (&__mark_##name, call_private, ## args);\
                }                                                       \
        } while (0)
 
 extern void marker_update_probe_range(struct marker *begin,
        struct marker *end);
 #else /* !CONFIG_MARKERS */
-#define __trace_mark(name, call_private, format, args...) \
+#define __trace_mark(generic, name, call_private, format, args...) \
                __mark_check_format(format, ## args)
 static inline void marker_update_probe_range(struct marker *begin,
        struct marker *end)
@@ -88,15 +91,30 @@ static inline void marker_update_probe_range(struct marker *begin,
 #endif /* CONFIG_MARKERS */
 
 /**
- * trace_mark - Marker
+ * trace_mark - Marker using code patching
  * @name: marker name, not quoted.
  * @format: format string
  * @args...: variable argument list
  *
- * Places a marker.
+ * Places a marker using optimized code patching technique (imv_read())
+ * to be enabled when immediate values are present.
  */
 #define trace_mark(name, format, args...) \
-       __trace_mark(name, NULL, format, ## args)
+       __trace_mark(0, name, NULL, format, ## args)
+
+/**
+ * _trace_mark - Marker using variable read
+ * @name: marker name, not quoted.
+ * @format: format string
+ * @args...: variable argument list
+ *
+ * Places a marker using a standard memory read (_imv_read()) to be
+ * enabled. Should be used for markers in code paths where instruction
+ * modification based enabling is not welcome. (__init and __exit functions,
+ * lockdep, some traps, printk).
+ */
+#define _trace_mark(name, format, args...) \
+       __trace_mark(1, name, NULL, format, ## args)
 
 /**
  * MARK_NOARGS - Format string for a marker with no argument.
@@ -117,9 +135,9 @@ static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...)
 extern marker_probe_func __mark_empty_function;
 
 extern void marker_probe_cb(const struct marker *mdata,
-       void *call_private, const char *fmt, ...);
+       void *call_private, ...);
 extern void marker_probe_cb_noarg(const struct marker *mdata,
-       void *call_private, const char *fmt, ...);
+       void *call_private, ...);
 
 /*
  * Connect a probe to a marker.
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
new file mode 100644 (file)
index 0000000..61d19e1
--- /dev/null
@@ -0,0 +1,85 @@
+#ifndef MMIOTRACE_H
+#define MMIOTRACE_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+struct kmmio_probe;
+struct pt_regs;
+
+typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *,
+                               struct pt_regs *, unsigned long addr);
+typedef void (*kmmio_post_handler_t)(struct kmmio_probe *,
+                               unsigned long condition, struct pt_regs *);
+
+struct kmmio_probe {
+       struct list_head list; /* kmmio internal list */
+       unsigned long addr; /* start location of the probe point */
+       unsigned long len; /* length of the probe region */
+       kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */
+       kmmio_post_handler_t post_handler; /* Called after addr is executed */
+       void *private; /* opaque pointer, not used in this header (presumably owner data - confirm) */
+};
+
+/* Returns kmmio_count; nonzero presumably means at least one kmmio probe is in use (confirm). */
+static inline int is_kmmio_active(void)
+{
+       extern unsigned int kmmio_count;        /* defined outside this header */
+       return kmmio_count;
+}
+
+extern int register_kmmio_probe(struct kmmio_probe *p);
+extern void unregister_kmmio_probe(struct kmmio_probe *p);
+
+/* Called from page fault handler. */
+extern int kmmio_handler(struct pt_regs *regs, unsigned long addr);
+
+/* Called from ioremap.c */
+#ifdef CONFIG_MMIOTRACE
+extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+                                                       void __iomem *addr);
+extern void mmiotrace_iounmap(volatile void __iomem *addr);
+#else  /* !CONFIG_MMIOTRACE: the hooks are empty inline stubs */
+static inline void mmiotrace_ioremap(resource_size_t offset,
+                                       unsigned long size, void __iomem *addr)
+{
+}
+
+static inline void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+}
+#endif /* CONFIG_MMIOTRACE */
+
+enum mm_io_opcode {
+       MMIO_READ = 0x1,     /* struct mmiotrace_rw */
+       MMIO_WRITE = 0x2,    /* struct mmiotrace_rw */
+       MMIO_PROBE = 0x3,    /* struct mmiotrace_map */
+       MMIO_UNPROBE = 0x4,  /* struct mmiotrace_map */
+       MMIO_MARKER = 0x5,   /* raw char data */
+       MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */
+};
+
+struct mmiotrace_rw {
+       resource_size_t phys;   /* PCI address of register */
+       unsigned long value;    /* presumably the data read or written - confirm */
+       unsigned long pc;       /* optional program counter */
+       int map_id;             /* mapping identifier (struct mmiotrace_map has the same field) */
+       unsigned char opcode;   /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */
+       unsigned char width;    /* size of register access in bytes */
+};
+
+struct mmiotrace_map {
+       resource_size_t phys;   /* base address in PCI space */
+       unsigned long virt;     /* base virtual address */
+       unsigned long len;      /* mapping size */
+       int map_id;             /* mapping identifier (struct mmiotrace_rw has the same field) */
+       unsigned char opcode;   /* MMIO_PROBE or MMIO_UNPROBE */
+};
+
+/* in kernel/trace/trace_mmiotrace.c */
+extern void enable_mmiotrace(void);
+extern void disable_mmiotrace(void);
+extern void mmio_trace_rw(struct mmiotrace_rw *rw);
+extern void mmio_trace_mapping(struct mmiotrace_map *map);
+
+#endif /* MMIOTRACE_H */
index 23f0c54175cdde74280bf89782a4c0743a0ef58e..72b1a10a59b6c178bf192f9d06c3c6c38a33e0c4 100644 (file)
@@ -10,7 +10,7 @@
 #include <linux/linkage.h>
 #include <linux/list.h>
 
-#ifdef CONFIG_DEBUG_PREEMPT
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
   extern void add_preempt_count(int val);
   extern void sub_preempt_count(int val);
 #else
@@ -52,6 +52,34 @@ do { \
        preempt_check_resched(); \
 } while (0)
 
+/*
+ * For debugging and tracer internals only!
+ *
+ * The _notrace variants below modify preempt_count() directly instead of
+ * calling add_preempt_count()/sub_preempt_count(), so none of the checks
+ * or tracer hooks in those functions run.
+ */
+#define add_preempt_count_notrace(val)                 \
+       do { preempt_count() += (val); } while (0)
+#define sub_preempt_count_notrace(val)                 \
+       do { preempt_count() -= (val); } while (0)
+#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
+#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
+
+#define preempt_disable_notrace() \
+do { \
+       inc_preempt_count_notrace(); \
+       barrier(); \
+} while (0)
+
+#define preempt_enable_no_resched_notrace() \
+do { \
+       barrier(); \
+       dec_preempt_count_notrace(); \
+} while (0)
+
+/*
+ * preempt_check_resched is OK to trace: only the count update is
+ * untraced, the reschedule check is the ordinary one.
+ */
+#define preempt_enable_notrace() \
+do { \
+       preempt_enable_no_resched_notrace(); \
+       barrier(); \
+       preempt_check_resched(); \
+} while (0)
+
 #else
 
 #define preempt_disable()              do { } while (0)
@@ -59,6 +87,10 @@ do { \
 #define preempt_enable()               do { } while (0)
 #define preempt_check_resched()                do { } while (0)
 
+#define preempt_disable_notrace()              do { } while (0)
+#define preempt_enable_no_resched_notrace()    do { } while (0)
+#define preempt_enable_notrace()               do { } while (0)
+
 #endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
index f6cd60f2de63ba70cc127fcd1f05ebf4efdd7c45..5d1af10b90c3fabe104ef21f82c5c4aaa5899e92 100644 (file)
@@ -245,6 +245,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 extern void init_idle_bootup_task(struct task_struct *idle);
 
+extern int runqueue_is_locked(void);
+
 extern cpumask_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
@@ -2132,6 +2134,18 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif
 
+#ifdef CONFIG_TRACING  /* real implementation lives outside this header */
+extern void
+__trace_special(void *__tr, void *__data,
+               unsigned long arg1, unsigned long arg2, unsigned long arg3);
+#else  /* !CONFIG_TRACING: empty inline stub, so callers need no #ifdef */
+static inline void
+__trace_special(void *__tr, void *__data,
+               unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+}
+#endif /* CONFIG_TRACING */
+
 extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
@@ -2226,6 +2240,8 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 }
 #endif /* CONFIG_MM_OWNER */
 
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
 #endif /* __KERNEL__ */
 
 #endif
index f462439cc2886c56f7c1940f4100ba726d4068c4..bd91987c065fcd1f923a5cd05d6f781c3f685aab 100644 (file)
@@ -105,6 +105,8 @@ extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
 
+extern unsigned long determine_dirtyable_memory(void);
+
 extern int dirty_ratio_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
                loff_t *ppos);
index 6c55301112e064cf7b95795c899897739a6cfe14..f6328e16dfdde5749b05279e503f2172a2831544 100644 (file)
@@ -11,6 +11,18 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
+CFLAGS_REMOVE_sched.o = -mno-spe
+
+ifdef CONFIG_FTRACE
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
+endif
+
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -69,6 +81,8 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
index 19908b26cf80494a0db8f3410384521081293a32..d66d676dc36205bb04e24f6f3b31ae16ce092f97 100644 (file)
@@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
        rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP)
        DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
index 81a4e4a3f087adfc650eb6baf2c05147f209e062..65548eff029e4d61360d5a29626bce3f64ffaae1 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/irqflags.h>
 #include <linux/utsname.h>
 #include <linux/hash.h>
+#include <linux/ftrace.h>
 
 #include <asm/sections.h>
 
@@ -81,6 +82,8 @@ static int graph_lock(void)
                __raw_spin_unlock(&lockdep_lock);
                return 0;
        }
+       /* prevent any recursions within lockdep from causing deadlocks */
+       current->lockdep_recursion++;
        return 1;
 }
 
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
        if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
                return DEBUG_LOCKS_WARN_ON(1);
 
+       current->lockdep_recursion--;
        __raw_spin_unlock(&lockdep_lock);
        return 0;
 }
@@ -982,7 +986,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
        return 1;
 }
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 /*
  * Forwards and backwards subgraph searching, for the purposes of
  * proving that two subgraphs can be connected by a new dependency
@@ -1680,7 +1684,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
                     enum lock_usage_bit new_bit);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
 
 /*
  * print irq inversion bug:
@@ -2013,11 +2017,13 @@ void early_boot_irqs_on(void)
 /*
  * Hardirqs will be enabled:
  */
-void trace_hardirqs_on(void)
+void trace_hardirqs_on_caller(unsigned long a0)
 {
        struct task_struct *curr = current;
        unsigned long ip;
 
+       time_hardirqs_on(CALLER_ADDR0, a0);
+
        if (unlikely(!debug_locks || current->lockdep_recursion))
                return;
 
@@ -2055,16 +2061,23 @@ void trace_hardirqs_on(void)
        curr->hardirq_enable_event = ++curr->irq_events;
        debug_atomic_inc(&hardirqs_on_events);
 }
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
+void trace_hardirqs_on(void)
+{
+       trace_hardirqs_on_caller(CALLER_ADDR0); /* thin wrapper: forwards CALLER_ADDR0 to the _caller variant */
+}
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 /*
  * Hardirqs were disabled:
  */
-void trace_hardirqs_off(void)
+void trace_hardirqs_off_caller(unsigned long a0)
 {
        struct task_struct *curr = current;
 
+       time_hardirqs_off(CALLER_ADDR0, a0);
+
        if (unlikely(!debug_locks || current->lockdep_recursion))
                return;
 
@@ -2082,7 +2095,12 @@ void trace_hardirqs_off(void)
        } else
                debug_atomic_inc(&redundant_hardirqs_off);
 }
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
+void trace_hardirqs_off(void)
+{
+       trace_hardirqs_off_caller(CALLER_ADDR0);        /* thin wrapper: forwards CALLER_ADDR0 to the _caller variant */
+}
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 /*
@@ -2246,7 +2264,7 @@ static inline int separate_irq_context(struct task_struct *curr,
  * Mark a lock with a usage bit, and validate the state transition:
  */
 static int mark_lock(struct task_struct *curr, struct held_lock *this,
-                    enum lock_usage_bit new_bit)
+                            enum lock_usage_bit new_bit)
 {
        unsigned int new_mask = 1 << new_bit, ret = 1;
 
@@ -2686,7 +2704,7 @@ static void check_flags(unsigned long flags)
  * and also avoid lockdep recursion:
  */
 void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
-                 int trylock, int read, int check, unsigned long ip)
+                         int trylock, int read, int check, unsigned long ip)
 {
        unsigned long flags;
 
@@ -2708,7 +2726,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+void lock_release(struct lockdep_map *lock, int nested,
+                         unsigned long ip)
 {
        unsigned long flags;
 
index b5a9fe1d50d5ce1c480ca7ca1777e55e6deb485e..1abfb923b761f46b3266ec471924ade4fc593ef2 100644 (file)
@@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex);
 struct marker_entry {
        struct hlist_node hlist;
        char *format;
-       void (*call)(const struct marker *mdata,        /* Probe wrapper */
-               void *call_private, const char *fmt, ...);
+                       /* Probe wrapper */
+       void (*call)(const struct marker *mdata, void *call_private, ...);
        struct marker_probe_closure single;
        struct marker_probe_closure *multi;
        int refcount;   /* Number of times armed. 0 if disarmed. */
@@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
  * marker_probe_cb Callback that prepares the variable argument list for probes.
  * @mdata: pointer of type struct marker
  * @call_private: caller site private data
- * @fmt: format string
  * @...:  Variable argument list.
  *
  * Since we do not use "typical" pointer based RCU in the 1 argument case, we
  * need to put a full smp_rmb() in this branch. This is why we do not use
  * rcu_dereference() for the pointer read.
  */
-void marker_probe_cb(const struct marker *mdata, void *call_private,
-       const char *fmt, ...)
+void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
 {
        va_list args;
        char ptype;
@@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
                /* Must read the ptr before private data. They are not data
                 * dependant, so we put an explicit smp_rmb() here. */
                smp_rmb();
-               va_start(args, fmt);
-               func(mdata->single.probe_private, call_private, fmt, &args);
+               va_start(args, call_private);
+               func(mdata->single.probe_private, call_private, mdata->format,
+                       &args);
                va_end(args);
        } else {
                struct marker_probe_closure *multi;
@@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
                smp_read_barrier_depends();
                multi = mdata->multi;
                for (i = 0; multi[i].func; i++) {
-                       va_start(args, fmt);
-                       multi[i].func(multi[i].probe_private, call_private, fmt,
-                               &args);
+                       va_start(args, call_private);
+                       multi[i].func(multi[i].probe_private, call_private,
+                               mdata->format, &args);
                        va_end(args);
                }
        }
@@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
  * marker_probe_cb_noarg Callback that does not prepare the variable argument list.
  * @mdata: pointer of type struct marker
  * @call_private: caller site private data
- * @fmt: format string
  * @...:  Variable argument list.
  *
  * Should be connected to markers "MARK_NOARGS".
  */
-void marker_probe_cb_noarg(const struct marker *mdata,
-       void *call_private, const char *fmt, ...)
+void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
 {
        va_list args;   /* not initialized */
        char ptype;
@@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
                /* Must read the ptr before private data. They are not data
                 * dependant, so we put an explicit smp_rmb() here. */
                smp_rmb();
-               func(mdata->single.probe_private, call_private, fmt, &args);
+               func(mdata->single.probe_private, call_private, mdata->format,
+                       &args);
        } else {
                struct marker_probe_closure *multi;
                int i;
@@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
                smp_read_barrier_depends();
                multi = mdata->multi;
                for (i = 0; multi[i].func; i++)
-                       multi[i].func(multi[i].probe_private, call_private, fmt,
-                               &args);
+                       multi[i].func(multi[i].probe_private, call_private,
+                               mdata->format, &args);
        }
        preempt_enable();
 }
index 625d240d7ada4633de19e99659d481af68b59913..5d81a11321fd72d4c956b677110589acf51379fd 100644 (file)
@@ -1046,7 +1046,9 @@ void release_console_sem(void)
                _log_end = log_end;
                con_start = log_end;            /* Flush */
                spin_unlock(&logbuf_lock);
+               stop_critical_timings();        /* don't trace print latency */
                call_console_drivers(_con_start, _log_end);
+               start_critical_timings();
                local_irq_restore(flags);
        }
        console_locked = 0;
index 591d5e7f757ad7438e9696bd3735f4b098dfda92..c74b0d23c7525c1db91d73c5ebab56931bccae4a 100644 (file)
@@ -70,6 +70,7 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
+#include <linux/ftrace.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -645,6 +646,24 @@ static inline void update_rq_clock(struct rq *rq)
 # define const_debug static const
 #endif
 
+/**
+ * runqueue_is_locked
+ *
+ * Returns true if the current cpu runqueue is locked.
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ *
+ * The result is only a snapshot: this function does not take rq->lock
+ * itself, so the lock state may change as soon as it has been sampled.
+ */
+int runqueue_is_locked(void)
+{
+       int cpu = get_cpu();            /* paired with put_cpu() below */
+       struct rq *rq = cpu_rq(cpu);
+       int ret;
+
+       ret = spin_is_locked(&rq->lock);        /* sample only, lock is not acquired */
+       put_cpu();
+       return ret;
+}
+
 /*
  * Debugging: various feature bits
  */
@@ -2318,6 +2337,9 @@ out_activate:
        success = 1;
 
 out_running:
+       trace_mark(kernel_sched_wakeup,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
        check_preempt_curr(rq, p);
 
        p->state = TASK_RUNNING;
@@ -2450,6 +2472,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
+       trace_mark(kernel_sched_wakeup_new,
+               "pid %d state %ld ## rq %p task %p rq->curr %p",
+               p->pid, p->state, rq, p, rq->curr);
        check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@ -2622,6 +2647,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm, *oldmm;
 
        prepare_task_switch(rq, prev, next);
+       trace_mark(kernel_sched_schedule,
+               "prev_pid %d next_pid %d prev_state %ld "
+               "## rq %p prev %p next %p",
+               prev->pid, next->pid, prev->state,
+               rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@ -4221,26 +4251,44 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
+
+static inline unsigned long get_parent_ip(unsigned long addr)
+{
+       if (in_lock_functions(addr)) {          /* addr lies inside a lock function */
+               addr = CALLER_ADDR2;            /* presumably the return address two frames up - confirm */
+               if (in_lock_functions(addr))    /* still inside a lock function */
+                       addr = CALLER_ADDR3;    /* fall back one frame further; not checked again */
+       }
+       return addr;
+}
 
 void __kprobes add_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT    /* sanity checks are compiled only for CONFIG_DEBUG_PREEMPT */
        /*
         * Underflow?
         */
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
+#endif
        preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
         */
        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                PREEMPT_MASK - 10);
+#endif
+       if (preempt_count() == val)     /* the count was 0 before this add */
+               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
 
 void __kprobes sub_preempt_count(int val)
 {
+#ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Underflow?
         */
@@ -4252,7 +4300,10 @@ void __kprobes sub_preempt_count(int val)
        if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
                        !(preempt_count() & PREEMPT_MASK)))
                return;
+#endif
 
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
        preempt_count() -= val;
 }
 EXPORT_SYMBOL(sub_preempt_count);
@@ -5566,7 +5617,7 @@ out_unlock:
        return retval;
 }
 
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 
 void sched_show_task(struct task_struct *p)
 {
index 5c2942e768cdd44371ae305d5870abfe7428732c..aaaeae8244e77aa7dffd26ff935e7e3260c58a97 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/sched.h>
 #include <linux/semaphore.h>
 #include <linux/spinlock.h>
+#include <linux/ftrace.h>
 
 static noinline void __down(struct semaphore *sem);
 static noinline int __down_interruptible(struct semaphore *sem);
index ae28c82451237a7ee0b0d8653701e7eed5437c6b..a1fb54c93cdd2381f23573748852a4a105e8ddd3 100644 (file)
@@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 }
 EXPORT_SYMBOL(_spin_trylock_bh);
 
-int in_lock_functions(unsigned long addr)
+notrace int in_lock_functions(unsigned long addr)
 {
        /* Linker adds these: start and end of __lockfunc functions */
        extern char __lock_text_start[], __lock_text_end[];
index fe8cdc80ff028ac5542e0a342c5a2c9c5a823833..18943985ddee42f4163e80d4537b991886f0dad1 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/acpi.h>
 #include <linux/reboot.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -463,6 +464,16 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+#ifdef CONFIG_FTRACE
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "ftrace_enabled",
+               .data           = &ftrace_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &ftrace_enable_sysctl,
+       },
+#endif
 #ifdef CONFIG_KMOD
        {
                .ctl_name       = KERN_MODPROBE,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644 (file)
index 0000000..263e9e6
--- /dev/null
@@ -0,0 +1,135 @@
+#
+# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+#
+config HAVE_FTRACE
+       bool
+
+config HAVE_DYNAMIC_FTRACE
+       bool
+
+config TRACER_MAX_TRACE
+       bool
+
+config TRACING
+       bool
+       select DEBUG_FS
+       select STACKTRACE
+
+config FTRACE
+       bool "Kernel Function Tracer"
+       depends on HAVE_FTRACE
+       select FRAME_POINTER
+       select TRACING
+       select CONTEXT_SWITCH_TRACER
+       help
+         Enable the kernel to trace every kernel function. This is done
+         by using a compiler feature to insert a small, 5-byte No-Operation
+         instruction to the beginning of every kernel function, which NOP
+         sequence is then dynamically patched into a tracer call when
+         tracing is enabled by the administrator. If it's runtime disabled
+         (the bootup default), then the overhead of the instructions is very
+         small and not measurable even in micro-benchmarks.
+
+config IRQSOFF_TRACER
+       bool "Interrupts-off Latency Tracer"
+       default n
+       depends on TRACE_IRQFLAGS_SUPPORT
+       depends on GENERIC_TIME
+       depends on HAVE_FTRACE
+       select TRACE_IRQFLAGS
+       select TRACING
+       select TRACER_MAX_TRACE
+       help
+         This option measures the time spent in irqs-off critical
+         sections, with microsecond accuracy.
+
+         The default measurement method is a maximum search, which is
+         disabled by default and can be runtime (re-)started
+         via:
+
+             echo 0 > /debugfs/tracing/tracing_max_latency
+
+         (Note that kernel size and overhead increases with this option
+         enabled. This option and the preempt-off timing option can be
+         used together or separately.)
+
+config PREEMPT_TRACER
+       bool "Preemption-off Latency Tracer"
+       default n
+       depends on GENERIC_TIME
+       depends on PREEMPT
+       depends on HAVE_FTRACE
+       select TRACING
+       select TRACER_MAX_TRACE
+       help
+         This option measures the time spent in preemption off critical
+         sections, with microsecond accuracy.
+
+         The default measurement method is a maximum search, which is
+         disabled by default and can be runtime (re-)started
+         via:
+
+             echo 0 > /debugfs/tracing/tracing_max_latency
+
+         (Note that kernel size and overhead increases with this option
+         enabled. This option and the irqs-off timing option can be
+         used together or separately.)
+
+config SYSPROF_TRACER
+       bool "Sysprof Tracer"
+       depends on X86
+       select TRACING
+       help
+         This tracer provides the trace needed by the 'Sysprof' userspace
+         tool.
+
+config SCHED_TRACER
+       bool "Scheduling Latency Tracer"
+       depends on HAVE_FTRACE
+       select TRACING
+       select CONTEXT_SWITCH_TRACER
+       select TRACER_MAX_TRACE
+       help
+         This tracer tracks the latency of the highest priority task
+         to be scheduled in, starting from the point it has woken up.
+
+config CONTEXT_SWITCH_TRACER
+       bool "Trace process context switches"
+       depends on HAVE_FTRACE
+       select TRACING
+       select MARKERS
+       help
+         This tracer gets called from the context switch and records
+         all switching of tasks.
+
+config DYNAMIC_FTRACE
+       bool "enable/disable ftrace tracepoints dynamically"
+       depends on FTRACE
+       depends on HAVE_DYNAMIC_FTRACE
+       default y
+       help
+         This option will modify all the calls to ftrace dynamically
+         (will patch them out of the binary image and replace them
+         with a No-Op instruction) as they are called. A table is
+         created to dynamically enable them again.
+
+         This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
+         has native performance as long as no tracing is active.
+
+         The changes to the code are done by a kernel thread that
+         wakes up once a second and checks to see if any ftrace calls
+         were made. If so, it runs stop_machine (stops all CPUs)
+         and modifies the code to jump over the call to ftrace.
+
+config FTRACE_SELFTEST
+       bool
+
+config FTRACE_STARTUP_TEST
+       bool "Perform a startup test on ftrace"
+       depends on TRACING
+       select FTRACE_SELFTEST
+       help
+         This option performs a series of startup tests on ftrace. On bootup
+         a series of tests are made to verify that the tracer is
+         functioning properly. It will do tests on all the configured
+         tracers of ftrace.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644 (file)
index 0000000..71d17de
--- /dev/null
@@ -0,0 +1,24 @@
+
+# Do not instrument the tracer itself:
+
+ifdef CONFIG_FTRACE
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+
+# selftest needs instrumentation
+CFLAGS_trace_selftest_dynamic.o = -pg
+obj-y += trace_selftest_dynamic.o
+endif
+
+obj-$(CONFIG_FTRACE) += libftrace.o
+
+obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
+obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
+obj-$(CONFIG_FTRACE) += trace_functions.o
+obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
+obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
+obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
+
+libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644 (file)
index 0000000..4231a3d
--- /dev/null
@@ -0,0 +1,1727 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ *   Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/stop_machine.h>
+#include <linux/clocksource.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/hardirq.h>
+#include <linux/kthread.h>
+#include <linux/uaccess.h>
+#include <linux/kprobes.h>
+#include <linux/ftrace.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+
+#include <asm/ftrace.h>
+
+#include "trace.h"
+
+/* ftrace_enabled is a method to turn ftrace on or off */
+int ftrace_enabled __read_mostly;
+static int last_ftrace_enabled;
+
+/*
+ * ftrace_disabled is set when an anomaly is discovered.
+ * ftrace_disabled is much stronger than ftrace_enabled.
+ */
+static int ftrace_disabled __read_mostly;
+
+static DEFINE_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_sysctl_lock);
+
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+       .func = ftrace_stub,
+};
+
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+
+/*
+ * Call every registered ftrace_ops callback in turn.
+ *
+ * Walks the singly linked ftrace_list until the ftrace_list_end
+ * sentinel is reached, invoking op->func(ip, parent_ip) on each entry.
+ * No lock is taken here; the read barriers below are the reader side
+ * of the smp_wmb() done in __register_ftrace_function().
+ */
+static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+       struct ftrace_ops *op = ftrace_list;
+
+       /* in case someone actually ports this to alpha! */
+       read_barrier_depends();
+
+       while (op != &ftrace_list_end) {
+               /* silly alpha */
+               read_barrier_depends();
+               op->func(ip, parent_ip);
+               op = op->next;
+       };
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This points ftrace_trace_function back at ftrace_stub, which in
+ * essence stops tracing.  Nothing here waits for other CPUs, so
+ * there may be some lag before the change takes effect everywhere.
+ */
+void clear_ftrace_function(void)
+{
+       ftrace_trace_function = ftrace_stub;
+}
+
+/*
+ * Add @ops to the head of ftrace_list and, if ftrace_enabled is set,
+ * update ftrace_trace_function to match the new list.
+ *
+ * ftrace_lock is held for the whole operation.  Always returns 0.
+ */
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+       /* Should never be called by interrupts */
+       spin_lock(&ftrace_lock);
+
+       ops->next = ftrace_list;
+       /*
+        * We are entering ops into the ftrace_list but another
+        * CPU might be walking that list. We need to make sure
+        * the ops->next pointer is valid before another CPU sees
+        * the ops pointer included into the ftrace_list.
+        */
+       smp_wmb();
+       ftrace_list = ops;
+
+       if (ftrace_enabled) {
+               /*
+                * For one func, simply call it directly.
+                * For more than one func, call the chain.
+                */
+               if (ops->next == &ftrace_list_end)
+                       ftrace_trace_function = ops->func;
+               else
+                       ftrace_trace_function = ftrace_list_func;
+       }
+
+       spin_unlock(&ftrace_lock);
+
+       return 0;
+}
+
+/*
+ * Remove @ops from ftrace_list.
+ *
+ * ftrace_lock is held for the whole operation.  Returns 0 on success,
+ * or -1 if @ops was not found on the list.
+ */
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
+{
+       struct ftrace_ops **p;
+       int ret = 0;
+
+       spin_lock(&ftrace_lock);
+
+       /*
+        * If we are removing the last function, then simply point
+        * to the ftrace_stub.
+        */
+       if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+               ftrace_trace_function = ftrace_stub;
+               ftrace_list = &ftrace_list_end;
+               goto out;
+       }
+
+       /* find the link that points at ops, or stop at the sentinel */
+       for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+               if (*p == ops)
+                       break;
+
+       if (*p != ops) {
+               ret = -1;
+               goto out;
+       }
+
+       /* unlink ops from the list */
+       *p = (*p)->next;
+
+       if (ftrace_enabled) {
+               /* If we only have one func left, then call that directly */
+               if (ftrace_list == &ftrace_list_end ||
+                   ftrace_list->next == &ftrace_list_end)
+                       ftrace_trace_function = ftrace_list->func;
+       }
+
+ out:
+       spin_unlock(&ftrace_lock);
+
+       return ret;
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+static struct task_struct *ftraced_task;
+
+enum {
+       FTRACE_ENABLE_CALLS             = (1 << 0),
+       FTRACE_DISABLE_CALLS            = (1 << 1),
+       FTRACE_UPDATE_TRACE_FUNC        = (1 << 2),
+       FTRACE_ENABLE_MCOUNT            = (1 << 3),
+       FTRACE_DISABLE_MCOUNT           = (1 << 4),
+};
+
+static int ftrace_filtered;
+static int tracing_on;
+static int frozen_record_count;
+
+static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
+
+static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
+
+static DEFINE_SPINLOCK(ftrace_shutdown_lock);
+static DEFINE_MUTEX(ftraced_lock);
+static DEFINE_MUTEX(ftrace_regex_lock);
+
+/*
+ * One page worth of dyn_ftrace records, chained into a list.
+ *
+ * @next:    following page in the chain, or NULL
+ * @index:   number of entries of @records handed out so far
+ * @records: the records themselves; ENTRIES_PER_PAGE of them fit in
+ *           what is left of the page after this header
+ */
+struct ftrace_page {
+       struct ftrace_page      *next;
+       unsigned long           index;
+       struct dyn_ftrace       records[];
+};
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT             10000
+
+static struct ftrace_page      *ftrace_pages_start;
+static struct ftrace_page      *ftrace_pages;
+
+static int ftraced_trigger;
+static int ftraced_suspend;
+static int ftraced_stop;
+
+static int ftrace_record_suspend;
+
+static struct dyn_ftrace *ftrace_free_records;
+
+
+#ifdef CONFIG_KPROBES
+/* Set FTRACE_FL_FROZEN on @rec and count it, unless it is already set. */
+static inline void freeze_record(struct dyn_ftrace *rec)
+{
+       if (!(rec->flags & FTRACE_FL_FROZEN)) {
+               rec->flags |= FTRACE_FL_FROZEN;
+               frozen_record_count++;
+       }
+}
+
+/* Clear FTRACE_FL_FROZEN on @rec and uncount it, if it was set. */
+static inline void unfreeze_record(struct dyn_ftrace *rec)
+{
+       if (rec->flags & FTRACE_FL_FROZEN) {
+               rec->flags &= ~FTRACE_FL_FROZEN;
+               frozen_record_count--;
+       }
+}
+
+/* Non-zero when @rec has FTRACE_FL_FROZEN set. */
+static inline int record_frozen(struct dyn_ftrace *rec)
+{
+       return rec->flags & FTRACE_FL_FROZEN;
+}
+#else
+/* Without CONFIG_KPROBES these are no-ops that evaluate to 0. */
+# define freeze_record(rec)                    ({ 0; })
+# define unfreeze_record(rec)                  ({ 0; })
+# define record_frozen(rec)                    ({ 0; })
+#endif /* CONFIG_KPROBES */
+
+/*
+ * Decide whether the call site at @ip must be skipped.
+ *
+ * Returns 1 only when the first hash record matching @ip is frozen
+ * and, in addition, has failed, has not been converted, tracing is
+ * off, or the active filter excludes it.  Returns 0 in every other
+ * case, including when no record is frozen at all.
+ */
+int skip_trace(unsigned long ip)
+{
+       struct hlist_head *bucket;
+       struct hlist_node *pos;
+       struct dyn_ftrace *rec;
+       unsigned long filter_bits;
+
+       /* fast path: nothing is frozen, so nothing can be skipped */
+       if (!frozen_record_count)
+               return 0;
+
+       bucket = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
+       hlist_for_each_entry_rcu(rec, pos, bucket, node) {
+               if (rec->ip != ip)
+                       continue;
+
+               /* only the first record matching ip is examined */
+               if (!record_frozen(rec))
+                       break;
+
+               if ((rec->flags & FTRACE_FL_FAILED) ||
+                   !(rec->flags & FTRACE_FL_CONVERTED))
+                       return 1;
+
+               if (!tracing_on || !ftrace_enabled)
+                       return 1;
+
+               if (ftrace_filtered) {
+                       filter_bits = rec->flags &
+                               (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE);
+                       if (!filter_bits || (filter_bits & FTRACE_FL_NOTRACE))
+                               return 1;
+               }
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * Return 1 if a record for @ip is present in hash bucket @key,
+ * 0 otherwise.
+ */
+static inline int
+ftrace_ip_in_hash(unsigned long ip, unsigned long key)
+{
+       struct dyn_ftrace *rec;
+       struct hlist_node *pos;
+
+       hlist_for_each_entry_rcu(rec, pos, &ftrace_hash[key], node) {
+               if (rec->ip == ip)
+                       return 1;
+       }
+
+       return 0;
+}
+
+/* Insert @node at the head of hash bucket @key (RCU-safe insertion). */
+static inline void
+ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
+{
+       hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
+}
+
+/*
+ * Unlink @node from whatever hash bucket it is on.
+ * called from kstop_machine, hence the plain (non-RCU) hlist_del().
+ */
+static inline void ftrace_del_hash(struct dyn_ftrace *node)
+{
+       hlist_del(&node->node);
+}
+
+/*
+ * Push @rec onto the ftrace_free_records list.
+ *
+ * The list is threaded through rec->ip: the previous list head is
+ * stored there, and FTRACE_FL_FREE is set to mark the record as free.
+ */
+static void ftrace_free_rec(struct dyn_ftrace *rec)
+{
+       /* no locking, only called from kstop_machine */
+
+       rec->ip = (unsigned long)ftrace_free_records;
+       ftrace_free_records = rec;
+       rec->flags |= FTRACE_FL_FREE;
+}
+
+/*
+ * Hand out a dyn_ftrace record.
+ *
+ * A record from ftrace_free_records is reused first (and zeroed before
+ * it is returned); otherwise the next unused slot of the current
+ * ftrace_pages page is taken, moving to the following page when the
+ * current one is full.
+ *
+ * Returns NULL when no slot is available, or when the free list head
+ * lacks FTRACE_FL_FREE, in which case ftrace is disabled as well.
+ *
+ * @ip is not used in this function; the caller sets ->ip itself.
+ */
+static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
+{
+       struct dyn_ftrace *rec;
+
+       /* First check for freed records */
+       if (ftrace_free_records) {
+               rec = ftrace_free_records;
+
+               /* a record on the free list without FL_FREE is an anomaly */
+               if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
+                       WARN_ON_ONCE(1);
+                       ftrace_free_records = NULL;
+                       ftrace_disabled = 1;
+                       ftrace_enabled = 0;
+                       return NULL;
+               }
+
+               /* ->ip of a free record holds the next free record */
+               ftrace_free_records = (void *)rec->ip;
+               memset(rec, 0, sizeof(*rec));
+               return rec;
+       }
+
+       if (ftrace_pages->index == ENTRIES_PER_PAGE) {
+               if (!ftrace_pages->next)
+                       return NULL;
+               ftrace_pages = ftrace_pages->next;
+       }
+
+       return &ftrace_pages->records[ftrace_pages->index++];
+}
+
+/*
+ * Record @ip in ftrace_hash the first time it is seen.
+ *
+ * Nothing is recorded when ftrace is not enabled or has been disabled,
+ * when this function is re-entered on the same CPU, when recording is
+ * suspended, or when @ip is already in the hash.  Adding a new record
+ * sets ftraced_trigger.
+ */
+static void
+ftrace_record_ip(unsigned long ip)
+{
+       struct dyn_ftrace *node;
+       unsigned long flags;
+       unsigned long key;
+       int resched;
+       int cpu;
+
+       if (!ftrace_enabled || ftrace_disabled)
+               return;
+
+       /* remember this so that re-enabling preemption does not reschedule */
+       resched = need_resched();
+       preempt_disable_notrace();
+
+       /*
+        * We simply need to protect against recursion.
+        * Use the raw version of smp_processor_id and not
+        * __get_cpu_var which can call debug hooks that can
+        * cause a recursive crash here.
+        */
+       cpu = raw_smp_processor_id();
+       per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
+       if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
+               goto out;
+
+       if (unlikely(ftrace_record_suspend))
+               goto out;
+
+       key = hash_long(ip, FTRACE_HASHBITS);
+
+       WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
+
+       /* lockless check first; repeated below under the lock */
+       if (ftrace_ip_in_hash(ip, key))
+               goto out;
+
+       spin_lock_irqsave(&ftrace_shutdown_lock, flags);
+
+       /* This ip may have hit the hash before the lock */
+       if (ftrace_ip_in_hash(ip, key))
+               goto out_unlock;
+
+       node = ftrace_alloc_dyn_node(ip);
+       if (!node)
+               goto out_unlock;
+
+       node->ip = ip;
+
+       ftrace_add_hash(node, key);
+
+       ftraced_trigger = 1;
+
+ out_unlock:
+       spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
+ out:
+       per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
+
+       /* prevent recursion with scheduler */
+       if (resched)
+               preempt_enable_no_resched_notrace();
+       else
+               preempt_enable_notrace();
+}
+
+#define FTRACE_ADDR ((long)(ftrace_caller))
+
+/*
+ * Switch the call site described by @rec between a NOP and a call to
+ * FTRACE_ADDR, updating FTRACE_FL_ENABLED in rec->flags to match.
+ *
+ * @old/@new are the instruction bytes expected at / to be written to
+ * the site; the caller supplies one of them and the other is filled
+ * in here from ftrace_call_replace().  @enable is non-zero when
+ * tracing is being turned on.
+ *
+ * Returns 0 when the site needs no change, otherwise whatever
+ * ftrace_modify_code() returns.
+ */
+static int
+__ftrace_replace_code(struct dyn_ftrace *rec,
+                     unsigned char *old, unsigned char *new, int enable)
+{
+       unsigned long ip, fl;
+
+       ip = rec->ip;
+
+       if (ftrace_filtered && enable) {
+               /*
+                * If filtering is on:
+                *
+                * If this record is set to be filtered and
+                * is enabled then do nothing.
+                *
+                * If this record is set to be filtered and
+                * it is not enabled, enable it.
+                *
+                * If this record is not set to be filtered
+                * and it is not enabled do nothing.
+                *
+                * If this record is set not to trace then
+                * do nothing.
+                *
+                * If this record is set not to trace and
+                * it is enabled then disable it.
+                *
+                * If this record is not set to be filtered and
+                * it is enabled, disable it.
+                */
+
+               fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
+                                  FTRACE_FL_ENABLED);
+
+               if ((fl ==  (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
+                   (fl ==  (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
+                   !fl || (fl == FTRACE_FL_NOTRACE))
+                       return 0;
+
+               /*
+                * If it is enabled disable it,
+                * otherwise enable it!
+                */
+               if (fl & FTRACE_FL_ENABLED) {
+                       /* swap new and old */
+                       new = old;
+                       old = ftrace_call_replace(ip, FTRACE_ADDR);
+                       rec->flags &= ~FTRACE_FL_ENABLED;
+               } else {
+                       new = ftrace_call_replace(ip, FTRACE_ADDR);
+                       rec->flags |= FTRACE_FL_ENABLED;
+               }
+       } else {
+
+               if (enable) {
+                       /*
+                        * If this record is set not to trace and is
+                        * not enabled, do nothing.
+                        */
+                       fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
+                       if (fl == FTRACE_FL_NOTRACE)
+                               return 0;
+
+                       new = ftrace_call_replace(ip, FTRACE_ADDR);
+               } else
+                       old = ftrace_call_replace(ip, FTRACE_ADDR);
+
+               /* nothing to patch if the site is already in the wanted state */
+               if (enable) {
+                       if (rec->flags & FTRACE_FL_ENABLED)
+                               return 0;
+                       rec->flags |= FTRACE_FL_ENABLED;
+               } else {
+                       if (!(rec->flags & FTRACE_FL_ENABLED))
+                               return 0;
+                       rec->flags &= ~FTRACE_FL_ENABLED;
+               }
+       }
+
+       return ftrace_modify_code(ip, old, new);
+}
+
+/*
+ * Apply __ftrace_replace_code() to every record in every ftrace page.
+ *
+ * @enable non-zero turns call sites on, zero turns them off.  Records
+ * already marked failed are skipped, and records whose ip has a kprobe
+ * on it are frozen and left alone.
+ */
+static void ftrace_replace_code(int enable)
+{
+       int i, failed;
+       unsigned char *new = NULL, *old = NULL;
+       struct dyn_ftrace *rec;
+       struct ftrace_page *pg;
+
+       /* the NOP is what we expect to find when enabling, and what we write when disabling */
+       if (enable)
+               old = ftrace_nop_replace();
+       else
+               new = ftrace_nop_replace();
+
+       for (pg = ftrace_pages_start; pg; pg = pg->next) {
+               for (i = 0; i < pg->index; i++) {
+                       rec = &pg->records[i];
+
+                       /* don't modify code that has already faulted */
+                       if (rec->flags & FTRACE_FL_FAILED)
+                               continue;
+
+                       /* ignore updates to this record's mcount site */
+                       if (get_kprobe((void *)rec->ip)) {
+                               freeze_record(rec);
+                               continue;
+                       } else {
+                               unfreeze_record(rec);
+                       }
+
+                       failed = __ftrace_replace_code(rec, old, new, enable);
+                       if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+                               rec->flags |= FTRACE_FL_FAILED;
+                               /* drop the record while booting or when ip is outside core kernel text */
+                               if ((system_state == SYSTEM_BOOTING) ||
+                                   !core_kernel_text(rec->ip)) {
+                                       ftrace_del_hash(rec);
+                                       ftrace_free_rec(rec);
+                               }
+                       }
+               }
+       }
+}
+
+/*
+ * Make sure a spare page is chained after the current ftrace_pages
+ * page.  Nothing is done if one is already there.  If the allocation
+ * fails, ->next is simply left NULL.
+ */
+static void ftrace_shutdown_replenish(void)
+{
+       if (ftrace_pages->next)
+               return;
+
+       /* allocate another page */
+       ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
+}
+
+/*
+ * Replace the call to MCOUNT_ADDR at rec->ip with a NOP.
+ *
+ * Returns 1 on success.  On failure the record is flagged with
+ * FTRACE_FL_FAILED and 0 is returned.
+ */
+static int
+ftrace_code_disable(struct dyn_ftrace *rec)
+{
+       unsigned long ip = rec->ip;
+       unsigned char *nop_insn = ftrace_nop_replace();
+       unsigned char *call_insn = ftrace_call_replace(ip, MCOUNT_ADDR);
+
+       if (ftrace_modify_code(ip, call_insn, nop_insn)) {
+               rec->flags |= FTRACE_FL_FAILED;
+               return 0;
+       }
+
+       return 1;
+}
+
+static int __ftrace_update_code(void *ignore);
+
+/*
+ * Carry out the FTRACE_* commands in the int bitmask that @data points
+ * to.  Run through stop_machine_run() by ftrace_run_update_code().
+ * Always returns 0.
+ */
+static int __ftrace_modify_code(void *data)
+{
+       unsigned long addr;
+       int *command = data;
+
+       if (*command & FTRACE_ENABLE_CALLS) {
+               /*
+                * Update any recorded ips now that we have the
+                * machine stopped
+                */
+               __ftrace_update_code(NULL);
+               ftrace_replace_code(1);
+               tracing_on = 1;
+       } else if (*command & FTRACE_DISABLE_CALLS) {
+               ftrace_replace_code(0);
+               tracing_on = 0;
+       }
+
+       if (*command & FTRACE_UPDATE_TRACE_FUNC)
+               ftrace_update_ftrace_func(ftrace_trace_function);
+
+       /* point mcount at the recorder, or back at the stub */
+       if (*command & FTRACE_ENABLE_MCOUNT) {
+               addr = (unsigned long)ftrace_record_ip;
+               ftrace_mcount_set(&addr);
+       } else if (*command & FTRACE_DISABLE_MCOUNT) {
+               addr = (unsigned long)ftrace_stub;
+               ftrace_mcount_set(&addr);
+       }
+
+       return 0;
+}
+
+/* Run __ftrace_modify_code() on @command via stop_machine_run(). */
+static void ftrace_run_update_code(int command)
+{
+       stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
+}
+
+/*
+ * Set ftraced_stop under ftraced_lock, then call
+ * ftrace_force_update().
+ */
+void ftrace_disable_daemon(void)
+{
+       /* Stop the daemon from calling kstop_machine */
+       mutex_lock(&ftraced_lock);
+       ftraced_stop = 1;
+       mutex_unlock(&ftraced_lock);
+
+       ftrace_force_update();
+}
+
+/*
+ * Clear ftraced_stop under ftraced_lock, then call
+ * ftrace_force_update().
+ */
+void ftrace_enable_daemon(void)
+{
+       mutex_lock(&ftraced_lock);
+       ftraced_stop = 0;
+       mutex_unlock(&ftraced_lock);
+
+       ftrace_force_update();
+}
+
+static ftrace_func_t saved_ftrace_func;
+
+/*
+ * Take a reference on dynamic tracing.
+ *
+ * Increments ftraced_suspend; the 0 -> 1 transition requests
+ * FTRACE_ENABLE_CALLS.  If ftrace_trace_function differs from the
+ * value last saved in saved_ftrace_func, FTRACE_UPDATE_TRACE_FUNC is
+ * requested too.  The commands are only run when ftrace_enabled is
+ * set; the counter is updated either way.
+ */
+static void ftrace_startup(void)
+{
+       int command = 0;
+
+       if (unlikely(ftrace_disabled))
+               return;
+
+       mutex_lock(&ftraced_lock);
+       ftraced_suspend++;
+       if (ftraced_suspend == 1)
+               command |= FTRACE_ENABLE_CALLS;
+
+       if (saved_ftrace_func != ftrace_trace_function) {
+               saved_ftrace_func = ftrace_trace_function;
+               command |= FTRACE_UPDATE_TRACE_FUNC;
+       }
+
+       if (!command || !ftrace_enabled)
+               goto out;
+
+       ftrace_run_update_code(command);
+ out:
+       mutex_unlock(&ftraced_lock);
+}
+
+/*
+ * Drop a reference on dynamic tracing.
+ *
+ * Decrements ftraced_suspend; reaching 0 requests
+ * FTRACE_DISABLE_CALLS.  If ftrace_trace_function differs from the
+ * value last saved in saved_ftrace_func, FTRACE_UPDATE_TRACE_FUNC is
+ * requested too.  The commands are only run when ftrace_enabled is
+ * set; the counter is updated either way.
+ */
+static void ftrace_shutdown(void)
+{
+       int command = 0;
+
+       if (unlikely(ftrace_disabled))
+               return;
+
+       mutex_lock(&ftraced_lock);
+       ftraced_suspend--;
+       if (!ftraced_suspend)
+               command |= FTRACE_DISABLE_CALLS;
+
+       if (saved_ftrace_func != ftrace_trace_function) {
+               saved_ftrace_func = ftrace_trace_function;
+               command |= FTRACE_UPDATE_TRACE_FUNC;
+       }
+
+       if (!command || !ftrace_enabled)
+               goto out;
+
+       ftrace_run_update_code(command);
+ out:
+       mutex_unlock(&ftraced_lock);
+}
+
+/*
+ * Run FTRACE_ENABLE_MCOUNT, plus FTRACE_ENABLE_CALLS when
+ * ftraced_suspend is non-zero.  saved_ftrace_func is cleared so that
+ * the next ftrace_startup()/ftrace_shutdown() sees a changed trace
+ * function.
+ */
+static void ftrace_startup_sysctl(void)
+{
+       int command = FTRACE_ENABLE_MCOUNT;
+
+       if (unlikely(ftrace_disabled))
+               return;
+
+       mutex_lock(&ftraced_lock);
+       /* Force update next time */
+       saved_ftrace_func = NULL;
+       /* ftraced_suspend is true if we want ftrace running */
+       if (ftraced_suspend)
+               command |= FTRACE_ENABLE_CALLS;
+
+       ftrace_run_update_code(command);
+       mutex_unlock(&ftraced_lock);
+}
+
+/*
+ * Run FTRACE_DISABLE_MCOUNT, plus FTRACE_DISABLE_CALLS when
+ * ftraced_suspend is non-zero.  ftraced_suspend itself is left
+ * untouched.
+ */
+static void ftrace_shutdown_sysctl(void)
+{
+       int command = FTRACE_DISABLE_MCOUNT;
+
+       if (unlikely(ftrace_disabled))
+               return;
+
+       mutex_lock(&ftraced_lock);
+       /* ftraced_suspend is true if ftrace is running */
+       if (ftraced_suspend)
+               command |= FTRACE_DISABLE_CALLS;
+
+       ftrace_run_update_code(command);
+       mutex_unlock(&ftraced_lock);
+}
+
+static cycle_t         ftrace_update_time;
+static unsigned long   ftrace_update_cnt;
+unsigned long          ftrace_update_tot_cnt;
+
+/*
+ * Convert every newly recorded call site in ftrace_hash into a NOP.
+ *
+ * Recording is suspended and ftrace_enabled is cleared for the
+ * duration, then both are restored.  Updates ftrace_update_cnt,
+ * ftrace_update_tot_cnt and ftrace_update_time, and clears
+ * ftraced_trigger.  @ignore is unused.  Always returns 0.
+ */
+static int __ftrace_update_code(void *ignore)
+{
+       int i, save_ftrace_enabled;
+       cycle_t start, stop;
+       struct dyn_ftrace *p;
+       struct hlist_node *t, *n;
+       struct hlist_head *head, temp_list;
+
+       /* Don't be recording funcs now */
+       ftrace_record_suspend++;
+       save_ftrace_enabled = ftrace_enabled;
+       ftrace_enabled = 0;
+
+       start = ftrace_now(raw_smp_processor_id());
+       ftrace_update_cnt = 0;
+
+       /* No locks needed, the machine is stopped! */
+       for (i = 0; i < FTRACE_HASHSIZE; i++) {
+               INIT_HLIST_HEAD(&temp_list);
+               head = &ftrace_hash[i];
+
+               /* all CPUS are stopped, we are safe to modify code */
+               hlist_for_each_entry_safe(p, t, n, head, node) {
+                       /* Skip over failed records which have not been
+                        * freed. */
+                       if (p->flags & FTRACE_FL_FAILED)
+                               continue;
+
+                       /* Unconverted records are always at the head of the
+                        * hash bucket. Once we encounter a converted record,
+                        * simply skip over to the next bucket. Saves ftraced
+                        * some processor cycles (ftrace does its bid for
+                        * global warming :-p ). */
+                       if (p->flags & (FTRACE_FL_CONVERTED))
+                               break;
+
+                       /* Ignore updates to this record's mcount site.
+                        * Reintroduce this record at the head of this
+                        * bucket to attempt to "convert" it again if
+                        * the kprobe on it is unregistered before the
+                        * next run. */
+                       if (get_kprobe((void *)p->ip)) {
+                               ftrace_del_hash(p);
+                               INIT_HLIST_NODE(&p->node);
+                               hlist_add_head(&p->node, &temp_list);
+                               freeze_record(p);
+                               continue;
+                       } else {
+                               unfreeze_record(p);
+                       }
+
+                       /* convert record (i.e, patch mcount-call with NOP) */
+                       if (ftrace_code_disable(p)) {
+                               p->flags |= FTRACE_FL_CONVERTED;
+                               ftrace_update_cnt++;
+                       } else {
+                               /* drop the record while booting or when ip is outside core kernel text */
+                               if ((system_state == SYSTEM_BOOTING) ||
+                                   !core_kernel_text(p->ip)) {
+                                       ftrace_del_hash(p);
+                                       ftrace_free_rec(p);
+                               }
+                       }
+               }
+
+               /* put the kprobe'd records set aside above back at the bucket head */
+               hlist_for_each_entry_safe(p, t, n, &temp_list, node) {
+                       hlist_del(&p->node);
+                       INIT_HLIST_NODE(&p->node);
+                       hlist_add_head(&p->node, head);
+               }
+       }
+
+       stop = ftrace_now(raw_smp_processor_id());
+       ftrace_update_time = stop - start;
+       ftrace_update_tot_cnt += ftrace_update_cnt;
+       ftraced_trigger = 0;
+
+       ftrace_enabled = save_ftrace_enabled;
+       ftrace_record_suspend--;
+
+       return 0;
+}
+
+/*
+ * Run __ftrace_update_code() under stop_machine_run() if there is
+ * anything to do.
+ *
+ * Returns 1 when the update was run, 0 when it was skipped because
+ * ftrace is disabled, not enabled, or nothing new was recorded.
+ */
+static int ftrace_update_code(void)
+{
+       if (unlikely(ftrace_disabled))
+               return 0;
+
+       if (!ftrace_enabled || !ftraced_trigger)
+               return 0;
+
+       stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
+
+       return 1;
+}
+
+/*
+ * Kernel thread body: once a second, convert newly recorded call
+ * sites and top up the spare record page.
+ *
+ * The update is skipped while ftraced_suspend or ftraced_stop is set.
+ * If more than 100000 sites have been converted in total, the event
+ * is reported and ftrace is disabled.  @ignore is unused.  Returns 0
+ * once kthread_should_stop() is true.
+ */
+static int ftraced(void *ignore)
+{
+       unsigned long usecs;
+       unsigned long total;
+
+       while (!kthread_should_stop()) {
+
+               set_current_state(TASK_INTERRUPTIBLE);
+
+               /* check once a second */
+               schedule_timeout(HZ);
+
+               if (unlikely(ftrace_disabled))
+                       continue;
+
+               mutex_lock(&ftrace_sysctl_lock);
+               mutex_lock(&ftraced_lock);
+               if (!ftraced_suspend && !ftraced_stop &&
+                   ftrace_update_code()) {
+                       usecs = nsecs_to_usecs(ftrace_update_time);
+                       if (ftrace_update_tot_cnt > 100000) {
+                               /*
+                                * Save the total before resetting it, so
+                                * that the message reports the real count
+                                * rather than 0.
+                                */
+                               total = ftrace_update_tot_cnt;
+                               ftrace_update_tot_cnt = 0;
+                               pr_info("hm, dftrace overflow: %lu change%s"
+                                       " (%lu total) in %lu usec%s\n",
+                                       ftrace_update_cnt,
+                                       ftrace_update_cnt != 1 ? "s" : "",
+                                       total,
+                                       usecs, usecs != 1 ? "s" : "");
+                               ftrace_disabled = 1;
+                               WARN_ON_ONCE(1);
+                       }
+               }
+               mutex_unlock(&ftraced_lock);
+               mutex_unlock(&ftrace_sysctl_lock);
+
+               ftrace_shutdown_replenish();
+       }
+       __set_current_state(TASK_RUNNING);
+       return 0;
+}
+
+/*
+ * Allocate the initial chain of ftrace pages.
+ *
+ * One zeroed page becomes both ftrace_pages_start and ftrace_pages;
+ * up to NR_TO_INIT / ENTRIES_PER_PAGE further pages are then chained
+ * behind it.  Returns -1 if the first page cannot be allocated and 0
+ * otherwise, even if some of the later allocations failed.
+ */
+static int __init ftrace_dyn_table_alloc(void)
+{
+       struct ftrace_page *pg;
+       int cnt;
+       int i;
+
+       /* allocate the first page */
+       ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!ftrace_pages_start)
+               return -1;
+
+       /*
+        * Allocate a few more pages.
+        *
+        * TODO: have some parser search vmlinux before
+        *   final linking to find all calls to ftrace.
+        *   Then we can:
+        *    a) know how many pages to allocate.
+        *     and/or
+        *    b) set up the table then.
+        *
+        *  The dynamic code is still necessary for
+        *  modules.
+        */
+
+       pg = ftrace_pages = ftrace_pages_start;
+
+       cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
+
+       for (i = 0; i < cnt; i++) {
+               pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+
+               /* If we fail, we'll try later anyway */
+               if (!pg->next)
+                       break;
+
+               pg = pg->next;
+       }
+
+       return 0;
+}
+
+enum {
+       FTRACE_ITER_FILTER      = (1 << 0),
+       FTRACE_ITER_CONT        = (1 << 1),
+       FTRACE_ITER_NOTRACE     = (1 << 2),
+       FTRACE_ITER_FAILURES    = (1 << 3),
+};
+
+#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
+
+/*
+ * Per-open state for the seq_file views over the ftrace records.
+ *
+ * @pos:   sequence position last handed out by t_next()
+ * @pg:    ftrace page currently being walked
+ * @idx:   index of the next record to look at within @pg
+ * @flags: FTRACE_ITER_* bits selecting which records are shown
+ *
+ * NOTE(review): @buffer, @buffer_idx and @filtered are not used by the
+ * read-side code here; presumably they hold state for parsing written
+ * filter strings -- confirm against the write handler.
+ */
+struct ftrace_iterator {
+       loff_t                  pos;
+       struct ftrace_page      *pg;
+       unsigned                idx;
+       unsigned                flags;
+       unsigned char           buffer[FTRACE_BUFF_MAX+1];
+       unsigned                buffer_idx;
+       unsigned                filtered;
+};
+
+/*
+ * seq_file ->next: advance to the next record accepted by iter->flags.
+ *
+ * Increments *pos and stores the new value in iter->pos.  Returns the
+ * record, or NULL once the last page has been exhausted.  @v is not
+ * used.
+ */
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct ftrace_iterator *iter = m->private;
+       struct dyn_ftrace *rec = NULL;
+
+       (*pos)++;
+
+ retry:
+       if (iter->idx >= iter->pg->index) {
+               /* current page is used up; move on if there is another */
+               if (iter->pg->next) {
+                       iter->pg = iter->pg->next;
+                       iter->idx = 0;
+                       goto retry;
+               }
+       } else {
+               rec = &iter->pg->records[iter->idx++];
+               /*
+                * Skip this record when:
+                *  - failures are not being listed and it has failed, or
+                *  - failures are being listed and it has not failed
+                *    or has been freed, or
+                *  - the filter list is shown and it is not filtered, or
+                *  - the notrace list is shown and it is not notrace.
+                */
+               if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
+                    (rec->flags & FTRACE_FL_FAILED)) ||
+
+                   ((iter->flags & FTRACE_ITER_FAILURES) &&
+                    (!(rec->flags & FTRACE_FL_FAILED) ||
+                     (rec->flags & FTRACE_FL_FREE))) ||
+
+                   ((iter->flags & FTRACE_ITER_FILTER) &&
+                    !(rec->flags & FTRACE_FL_FILTER)) ||
+
+                   ((iter->flags & FTRACE_ITER_NOTRACE) &&
+                    !(rec->flags & FTRACE_FL_NOTRACE))) {
+                       rec = NULL;
+                       goto retry;
+               }
+       }
+
+       iter->pos = *pos;
+
+       return rec;
+}
+
+/*
+ * seq_file ->start: return the record to show for position *pos.
+ *
+ * When *pos matches iter->pos a single t_next() step is taken.
+ * Otherwise t_next() is called repeatedly with a local counter that
+ * starts at -1 until the counter reaches *pos or the records run out.
+ * *pos itself is never modified here.
+ *
+ * NOTE(review): iter->pg and iter->idx are not rewound before that
+ * replay, so it continues from wherever the iterator currently is --
+ * confirm this is what is wanted for backward seeks.
+ */
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+       struct ftrace_iterator *iter = m->private;
+       void *p = NULL;
+       loff_t l = -1;
+
+       if (*pos != iter->pos) {
+               for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
+                       ;
+       } else {
+               l = *pos;
+               p = t_next(m, p, &l);
+       }
+
+       return p;
+}
+
+/* seq_file ->stop: nothing is acquired in t_start(), so nothing to do. */
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+/*
+ * seq_file ->show: print the symbol that kallsyms_lookup() finds for
+ * the record's ip, followed by a newline.  A NULL record prints
+ * nothing.  Always returns 0.
+ */
+static int t_show(struct seq_file *m, void *v)
+{
+       struct dyn_ftrace *rec = v;
+       char str[KSYM_SYMBOL_LEN];
+
+       if (!rec)
+               return 0;
+
+       kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+       seq_printf(m, "%s\n", str);
+
+       return 0;
+}
+
+/* seq_file operations shared by the ftrace record listing files. */
+static struct seq_operations show_ftrace_seq_ops = {
+       .start = t_start,
+       .next = t_next,
+       .stop = t_stop,
+       .show = t_show,
+};
+
+/*
+ * Open handler: allocate an iterator positioned at the first ftrace
+ * page and attach it to a new seq_file.
+ *
+ * Returns 0 on success, -ENODEV when ftrace is disabled, -ENOMEM when
+ * the iterator cannot be allocated, or the error from seq_open().
+ */
+static int
+ftrace_avail_open(struct inode *inode, struct file *file)
+{
+       struct ftrace_iterator *iter;
+       struct seq_file *seq;
+       int err;
+
+       if (unlikely(ftrace_disabled))
+               return -ENODEV;
+
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter)
+               return -ENOMEM;
+
+       iter->pg = ftrace_pages_start;
+       iter->pos = -1;
+
+       err = seq_open(file, &show_ftrace_seq_ops);
+       if (err) {
+               /* the seq_file never took the iterator; drop it here */
+               kfree(iter);
+               return err;
+       }
+
+       seq = file->private_data;
+       seq->private = iter;
+
+       return err;
+}
+
+/*
+ * Release handler: release the seq_file and free the iterator stored
+ * in its ->private.  Always returns 0.
+ */
+int ftrace_avail_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *m = (struct seq_file *)file->private_data;
+       /* m->private is read here, before seq_release() is called */
+       struct ftrace_iterator *iter = m->private;
+
+       seq_release(inode, file);
+       kfree(iter);
+
+       return 0;
+}
+
+/*
+ * Open handler for the failures listing: same as ftrace_avail_open(),
+ * but the iterator is restricted to failed records.
+ *
+ * Returns whatever ftrace_avail_open() returns.
+ */
+static int
+ftrace_failures_open(struct inode *inode, struct file *file)
+{
+       struct ftrace_iterator *iter;
+       struct seq_file *seq;
+       int err;
+
+       err = ftrace_avail_open(inode, file);
+       if (err)
+               return err;
+
+       seq = file->private_data;
+       iter = seq->private;
+       iter->flags = FTRACE_ITER_FAILURES;
+
+       return err;
+}
+
+
+/*
+ * Clear one selection flag from every record that has not failed.
+ *
+ * With @enable non-zero FTRACE_FL_FILTER is cleared and
+ * ftrace_filtered is reset to 0; with @enable zero FTRACE_FL_NOTRACE
+ * is cleared instead.
+ */
+static void ftrace_filter_reset(int enable)
+{
+       unsigned long clear_mask;
+       struct ftrace_page *page;
+       unsigned slot;
+
+       clear_mask = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+
+       /* keep kstop machine from running */
+       preempt_disable();
+       if (enable)
+               ftrace_filtered = 0;
+
+       for (page = ftrace_pages_start; page; page = page->next) {
+               for (slot = 0; slot < page->index; slot++) {
+                       struct dyn_ftrace *rec = &page->records[slot];
+
+                       if (!(rec->flags & FTRACE_FL_FAILED))
+                               rec->flags &= ~clear_mask;
+               }
+       }
+       preempt_enable();
+}
+
+/*
+ * Common open handler for the filter (enable != 0) and notrace
+ * (enable == 0) files.
+ *
+ * A writer that did not pass O_APPEND resets the corresponding flags
+ * first.  Readers get a seq_file whose private field holds the
+ * iterator; write-only opens store the iterator directly in
+ * file->private_data.  Returns 0, -ENODEV, -ENOMEM or the seq_open()
+ * error.
+ */
+static int
+ftrace_regex_open(struct inode *inode, struct file *file, int enable)
+{
+       struct ftrace_iterator *iter;
+       struct seq_file *seq;
+       int ret = 0;
+
+       if (unlikely(ftrace_disabled))
+               return -ENODEV;
+
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter)
+               return -ENOMEM;
+
+       mutex_lock(&ftrace_regex_lock);
+
+       if ((file->f_mode & FMODE_WRITE) && !(file->f_flags & O_APPEND))
+               ftrace_filter_reset(enable);
+
+       if (!(file->f_mode & FMODE_READ)) {
+               /* write-only: the file owns the iterator directly */
+               file->private_data = iter;
+               goto out;
+       }
+
+       iter->pg = ftrace_pages_start;
+       iter->pos = -1;
+       if (enable)
+               iter->flags = FTRACE_ITER_FILTER;
+       else
+               iter->flags = FTRACE_ITER_NOTRACE;
+
+       ret = seq_open(file, &show_ftrace_seq_ops);
+       if (ret) {
+               kfree(iter);
+               goto out;
+       }
+
+       seq = file->private_data;
+       seq->private = iter;
+ out:
+       mutex_unlock(&ftrace_regex_lock);
+
+       return ret;
+}
+
+/* Open handler of the filter file: ftrace_regex_open() in enable mode. */
+static int
+ftrace_filter_open(struct inode *inode, struct file *file)
+{
+       const int enable = 1;
+
+       return ftrace_regex_open(inode, file, enable);
+}
+
+/* Open handler of the notrace file: ftrace_regex_open() in notrace mode. */
+static int
+ftrace_notrace_open(struct inode *inode, struct file *file)
+{
+       const int enable = 0;
+
+       return ftrace_regex_open(inode, file, enable);
+}
+
+/*
+ * Read handler for the filter/notrace files.  Only files opened for
+ * reading have a seq_file behind them; anything else gets -EPERM.
+ */
+static ssize_t
+ftrace_regex_read(struct file *file, char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+       if (!(file->f_mode & FMODE_READ))
+               return -EPERM;
+
+       return seq_read(file, ubuf, cnt, ppos);
+}
+
+/*
+ * Seek handler for the filter/notrace files.  Readable files defer to
+ * seq_lseek(); write-only files have their position pinned to 1.
+ */
+static loff_t
+ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
+{
+       if (file->f_mode & FMODE_READ)
+               return seq_lseek(file, offset, origin);
+
+       file->f_pos = 1;
+
+       return 1;
+}
+
+/* Pattern shapes that ftrace_match() derives from the position of '*'. */
+enum {
+       MATCH_FULL,             /* no '*': compared with strcmp() */
+       MATCH_FRONT_ONLY,       /* '*' after the text: leading part compared */
+       MATCH_MIDDLE_ONLY,      /* '*' on both sides: searched with strstr() */
+       MATCH_END_ONLY,         /* leading '*': text must end the symbol */
+};
+
+/*
+ * ftrace_match - flag every record whose symbol name matches a pattern
+ * @buff: pattern text; modified in place (a non-leading '*' is
+ *        overwritten with a NUL byte)
+ * @len: number of pattern bytes in @buff
+ * @enable: non-zero sets FTRACE_FL_FILTER, zero sets FTRACE_FL_NOTRACE
+ *
+ * Accepted forms are "name" (exact), "name*" (prefix), "*name" (suffix)
+ * and "*name*" (substring).  Records marked FTRACE_FL_FAILED are
+ * skipped.  With @enable set, ftrace_filtered is set to 1 even when
+ * nothing matches.
+ */
+static void
+ftrace_match(unsigned char *buff, int len, int enable)
+{
+       char str[KSYM_SYMBOL_LEN];
+       char *search = NULL;
+       struct ftrace_page *pg;
+       struct dyn_ftrace *rec;
+       int type = MATCH_FULL;
+       unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+       unsigned i, match = 0, search_len = 0;
+
+       for (i = 0; i < len; i++) {
+               if (buff[i] == '*') {
+                       if (!i) {
+                               search = (char *)buff + i + 1;
+                               type = MATCH_END_ONLY;
+                               search_len = len - (i + 1);
+                       } else {
+                               if (type == MATCH_END_ONLY) {
+                                       type = MATCH_MIDDLE_ONLY;
+                               } else {
+                                       match = i;
+                                       type = MATCH_FRONT_ONLY;
+                               }
+                               buff[i] = 0;
+                               break;
+                       }
+               }
+       }
+
+       /* keep kstop machine from running */
+       preempt_disable();
+       if (enable)
+               ftrace_filtered = 1;
+       pg = ftrace_pages_start;
+       while (pg) {
+               for (i = 0; i < pg->index; i++) {
+                       int matched = 0;
+                       size_t str_len;
+
+                       rec = &pg->records[i];
+                       if (rec->flags & FTRACE_FL_FAILED)
+                               continue;
+                       kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+                       switch (type) {
+                       case MATCH_FULL:
+                               if (strcmp(str, (char *)buff) == 0)
+                                       matched = 1;
+                               break;
+                       case MATCH_FRONT_ONLY:
+                               /*
+                                * strncmp() stops at the end of a symbol
+                                * that is shorter than the prefix, so it
+                                * never reads past the NUL in str.
+                                */
+                               if (strncmp(str, (char *)buff, match) == 0)
+                                       matched = 1;
+                               break;
+                       case MATCH_MIDDLE_ONLY:
+                               if (strstr(str, search))
+                                       matched = 1;
+                               break;
+                       case MATCH_END_ONLY:
+                               /*
+                                * Compare the tail directly: the first
+                                * strstr() hit may be an earlier
+                                * occurrence of the suffix.
+                                */
+                               str_len = strlen(str);
+                               if (str_len >= search_len &&
+                                   strcmp(str + str_len - search_len,
+                                          search) == 0)
+                                       matched = 1;
+                               break;
+                       }
+                       if (matched)
+                               rec->flags |= flag;
+               }
+               pg = pg->next;
+       }
+       preempt_enable();
+}
+
+/*
+ * ftrace_regex_write - parse one white-space delimited pattern
+ * @file: file being written; locates the ftrace_iterator
+ * @ubuf: user buffer holding the text
+ * @cnt: number of bytes available in @ubuf
+ * @ppos: file position; advanced by the number of bytes consumed
+ * @enable: non-zero for the filter file, zero for the notrace file
+ *
+ * At most one token is consumed per call.  A token terminated by white
+ * space is handed to ftrace_match().  A token cut off by the end of the
+ * buffer is kept in iter->buffer and FTRACE_ITER_CONT is set so that the
+ * next write (or the release handler) can finish it.
+ *
+ * Returns the number of bytes consumed, 0 for an empty write, -EINVAL
+ * when the token does not fit, or the get_user() error.
+ */
+static ssize_t
+ftrace_regex_write(struct file *file, const char __user *ubuf,
+                  size_t cnt, loff_t *ppos, int enable)
+{
+       struct ftrace_iterator *iter;
+       char ch;
+       size_t read = 0;
+       ssize_t ret;
+
+       if (!cnt)
+               return 0;
+
+       mutex_lock(&ftrace_regex_lock);
+
+       if (file->f_mode & FMODE_READ) {
+               struct seq_file *m = file->private_data;
+               iter = m->private;
+       } else
+               iter = file->private_data;
+
+       if (!*ppos) {
+               iter->flags &= ~FTRACE_ITER_CONT;
+               iter->buffer_idx = 0;
+       }
+
+       ret = get_user(ch, ubuf++);
+       if (ret)
+               goto out;
+       read++;
+       cnt--;
+
+       /* only a fresh token skips leading white space */
+       if (!(iter->flags & FTRACE_ITER_CONT)) {
+               /* skip white space */
+               while (cnt && isspace(ch)) {
+                       ret = get_user(ch, ubuf++);
+                       if (ret)
+                               goto out;
+                       read++;
+                       cnt--;
+               }
+
+               if (isspace(ch)) {
+                       *ppos += read;
+                       ret = read;
+                       goto out;
+               }
+
+               iter->buffer_idx = 0;
+       }
+
+       while (cnt && !isspace(ch)) {
+               if (iter->buffer_idx < FTRACE_BUFF_MAX)
+                       iter->buffer[iter->buffer_idx++] = ch;
+               else {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ret = get_user(ch, ubuf++);
+               if (ret)
+                       goto out;
+               read++;
+               cnt--;
+       }
+
+       if (isspace(ch)) {
+               iter->filtered++;
+               iter->buffer[iter->buffer_idx] = 0;
+               ftrace_match(iter->buffer, iter->buffer_idx, enable);
+               iter->buffer_idx = 0;
+               /* token complete: the next write starts a new one */
+               iter->flags &= ~FTRACE_ITER_CONT;
+       } else {
+               /*
+                * The input ran out in the middle of a token.  The byte
+                * read last has not been stored by the loop above, so
+                * keep it before waiting for the continuation.
+                */
+               if (iter->buffer_idx >= FTRACE_BUFF_MAX) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               iter->buffer[iter->buffer_idx++] = ch;
+               iter->flags |= FTRACE_ITER_CONT;
+       }
+
+       /* the VFS writes *ppos back to the file position, not f_pos */
+       *ppos += read;
+
+       ret = read;
+ out:
+       mutex_unlock(&ftrace_regex_lock);
+
+       return ret;
+}
+
+/* Write handler of the filter file: ftrace_regex_write() in enable mode. */
+static ssize_t
+ftrace_filter_write(struct file *file, const char __user *ubuf,
+                   size_t cnt, loff_t *ppos)
+{
+       const int enable = 1;
+
+       return ftrace_regex_write(file, ubuf, cnt, ppos, enable);
+}
+
+/* Write handler of the notrace file: ftrace_regex_write() in notrace mode. */
+static ssize_t
+ftrace_notrace_write(struct file *file, const char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       const int enable = 0;
+
+       return ftrace_regex_write(file, ubuf, cnt, ppos, enable);
+}
+
+/*
+ * Shared body of ftrace_set_filter() and ftrace_set_notrace(): under
+ * ftrace_regex_lock, optionally reset the selected flag on all records
+ * and then apply the pattern in @buf (if any).  Does nothing when
+ * ftrace is disabled.
+ */
+static void
+ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+{
+       if (unlikely(ftrace_disabled))
+               return;
+
+       mutex_lock(&ftrace_regex_lock);
+
+       if (reset) {
+               ftrace_filter_reset(enable);
+       }
+
+       if (buf) {
+               ftrace_match(buf, len, enable);
+       }
+
+       mutex_unlock(&ftrace_regex_lock);
+}
+
+/**
+ * ftrace_set_filter - select functions to trace
+ * @buf - pattern text naming the functions; may be NULL.
+ * @len - number of bytes in @buf.
+ * @reset - non zero to drop every existing filter first.
+ *
+ * Functions matching the filter are the ones enabled when tracing is
+ * turned on. A NULL @buf together with a non zero @reset clears the
+ * filter so that all functions are enabled for tracing.
+ */
+void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+       const int enable = 1;
+
+       ftrace_set_regex(buf, len, reset, enable);
+}
+
+/**
+ * ftrace_set_notrace - select functions to exclude from tracing
+ * @buf - pattern text naming the functions; may be NULL.
+ * @len - number of bytes in @buf.
+ * @reset - non zero to drop every existing notrace entry first.
+ *
+ * Functions matching the notrace filter are not enabled when tracing
+ * is turned on. A NULL @buf together with a non zero @reset clears the
+ * notrace filter so that all functions are enabled for tracing.
+ */
+void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+{
+       const int enable = 0;
+
+       ftrace_set_regex(buf, len, reset, enable);
+}
+
+/*
+ * Common release handler for the filter (enable != 0) and notrace
+ * (enable == 0) files.
+ *
+ * Flushes a token still sitting in the iterator buffer through
+ * ftrace_match(), re-runs the call-site update when this file applied
+ * at least one pattern, and frees the iterator.  Always returns 0.
+ */
+static int
+ftrace_regex_release(struct inode *inode, struct file *file, int enable)
+{
+       struct seq_file *m = (struct seq_file *)file->private_data;
+       struct ftrace_iterator *iter;
+
+       mutex_lock(&ftrace_regex_lock);
+       if (file->f_mode & FMODE_READ) {
+               /* take the iterator out before the seq_file is released */
+               iter = m->private;
+
+               seq_release(inode, file);
+       } else
+               /* write-only open: private_data is the iterator itself */
+               iter = file->private_data;
+
+       /* a final token without trailing white space is applied here */
+       if (iter->buffer_idx) {
+               iter->filtered++;
+               iter->buffer[iter->buffer_idx] = 0;
+               ftrace_match(iter->buffer, iter->buffer_idx, enable);
+       }
+
+       /* lock order: ftrace_regex_lock, ftrace_sysctl_lock, ftraced_lock */
+       mutex_lock(&ftrace_sysctl_lock);
+       mutex_lock(&ftraced_lock);
+       if (iter->filtered && ftraced_suspend && ftrace_enabled)
+               ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+       mutex_unlock(&ftraced_lock);
+       mutex_unlock(&ftrace_sysctl_lock);
+
+       kfree(iter);
+       mutex_unlock(&ftrace_regex_lock);
+       return 0;
+}
+
+/* Release handler of the filter file: ftrace_regex_release() in enable mode. */
+static int
+ftrace_filter_release(struct inode *inode, struct file *file)
+{
+       const int enable = 1;
+
+       return ftrace_regex_release(inode, file, enable);
+}
+
+/* Release handler of the notrace file: ftrace_regex_release() in notrace mode. */
+static int
+ftrace_notrace_release(struct inode *inode, struct file *file)
+{
+       const int enable = 0;
+
+       return ftrace_regex_release(inode, file, enable);
+}
+
+/*
+ * Read handler of "ftraced_enabled": reports "disabled\n" when
+ * ftraced_stop is set and "enabled\n" otherwise.
+ */
+static ssize_t
+ftraced_read(struct file *filp, char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       char *state;
+
+       /* don't worry about races */
+       if (ftraced_stop)
+               state = "disabled\n";
+       else
+               state = "enabled\n";
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, state, strlen(state));
+}
+
+/*
+ * Write handler of "ftraced_enabled".
+ *
+ * Accepts text starting with "enable" or "disable", or a decimal
+ * number (non-zero enables, zero disables), and calls
+ * ftrace_enable_daemon() or ftrace_disable_daemon() accordingly.
+ *
+ * Returns @cnt on success, -EINVAL when the input is 64 bytes or
+ * longer, -EFAULT when the user buffer cannot be read, or the
+ * strict_strtoul() error for unparsable input.
+ */
+static ssize_t
+ftraced_write(struct file *filp, const char __user *ubuf,
+                     size_t cnt, loff_t *ppos)
+{
+       char buf[64];
+       unsigned long val;
+       int ret;
+
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(buf, ubuf, cnt))
+               return -EFAULT;
+
+       /*
+        * Terminate before any string function looks at the buffer, so
+        * that short input is never compared against stale stack bytes.
+        */
+       buf[cnt] = 0;
+
+       if (strncmp(buf, "enable", 6) == 0)
+               val = 1;
+       else if (strncmp(buf, "disable", 7) == 0)
+               val = 0;
+       else {
+               ret = strict_strtoul(buf, 10, &val);
+               if (ret < 0)
+                       return ret;
+
+               val = !!val;
+       }
+
+       if (val)
+               ftrace_enable_daemon();
+       else
+               ftrace_disable_daemon();
+
+       /* the VFS writes *ppos back to the file position, not f_pos */
+       *ppos += cnt;
+
+       return cnt;
+}
+
+/* Read-only seq_file listing; registered as "available_filter_functions". */
+static struct file_operations ftrace_avail_fops = {
+       .open = ftrace_avail_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = ftrace_avail_release,
+};
+
+/* Same listing with FTRACE_ITER_FAILURES set; registered as "failures". */
+static struct file_operations ftrace_failures_fops = {
+       .open = ftrace_failures_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = ftrace_avail_release,
+};
+
+/* Read/write pattern file; registered as "set_ftrace_filter". */
+static struct file_operations ftrace_filter_fops = {
+       .open = ftrace_filter_open,
+       .read = ftrace_regex_read,
+       .write = ftrace_filter_write,
+       .llseek = ftrace_regex_lseek,
+       .release = ftrace_filter_release,
+};
+
+/* Read/write pattern file; registered as "set_ftrace_notrace". */
+static struct file_operations ftrace_notrace_fops = {
+       .open = ftrace_notrace_open,
+       .read = ftrace_regex_read,
+       .write = ftrace_notrace_write,
+       .llseek = ftrace_regex_lseek,
+       .release = ftrace_notrace_release,
+};
+
+/* Daemon on/off switch; registered as "ftraced_enabled". */
+static struct file_operations ftraced_fops = {
+       .open = tracing_open_generic,
+       .read = ftraced_read,
+       .write = ftraced_write,
+};
+
+/**
+ * ftrace_force_update - force an update to all recording ftrace functions
+ *
+ * Returns 0 when there was nothing to do or the update succeeded,
+ * -ENODEV when ftrace is disabled and -EBUSY when ftrace_update_code()
+ * reported failure.
+ */
+int ftrace_force_update(void)
+{
+       int ret = 0;
+
+       if (unlikely(ftrace_disabled))
+               return -ENODEV;
+
+       mutex_lock(&ftrace_sysctl_lock);
+       mutex_lock(&ftraced_lock);
+
+       /* Without ftraced_trigger there is nothing to update. */
+       if (ftraced_trigger) {
+               if (!ftrace_update_code())
+                       ret = -EBUSY;
+       }
+
+       mutex_unlock(&ftraced_lock);
+       mutex_unlock(&ftrace_sysctl_lock);
+
+       return ret;
+}
+
+/*
+ * Disable all call sites, mark the daemon as suspended (-1) and stop
+ * the ftraced thread if one was running.  The thread is stopped only
+ * after ftraced_lock has been dropped.
+ */
+static void ftrace_force_shutdown(void)
+{
+       struct task_struct *daemon;
+
+       mutex_lock(&ftraced_lock);
+       daemon = ftraced_task;
+       ftraced_task = NULL;
+       ftraced_suspend = -1;
+       ftrace_run_update_code(FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC);
+       mutex_unlock(&ftraced_lock);
+
+       if (!daemon)
+               return;
+
+       kthread_stop(daemon);
+}
+
+/*
+ * Create the dynamic-ftrace control files below the tracing debugfs
+ * directory.  A file that cannot be created only produces a warning;
+ * the function always returns 0.
+ */
+static __init int ftrace_init_debugfs(void)
+{
+       struct dentry *d_tracer;
+       struct dentry *entry;
+
+       d_tracer = tracing_init_dentry();
+
+       /* read-only listings */
+       entry = debugfs_create_file("available_filter_functions", 0444,
+                                   d_tracer, NULL, &ftrace_avail_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'available_filter_functions' entry\n");
+
+       entry = debugfs_create_file("failures", 0444,
+                                   d_tracer, NULL, &ftrace_failures_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs 'failures' entry\n");
+
+       /* writable pattern files */
+       entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
+                                   NULL, &ftrace_filter_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'set_ftrace_filter' entry\n");
+
+       entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+                                   NULL, &ftrace_notrace_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'set_ftrace_notrace' entry\n");
+
+       /* daemon on/off switch */
+       entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
+                                   NULL, &ftraced_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'ftraced_enabled' entry\n");
+       return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
+
+/*
+ * Boot-time setup of dynamic ftrace.
+ *
+ * Runs the architecture init under stop_machine, allocates the record
+ * table and starts the "ftraced" thread.  On any failure ftrace is
+ * marked disabled and the error is returned; on success ftrace_enabled
+ * is set and 0 is returned.
+ */
+static int __init ftrace_dynamic_init(void)
+{
+       struct task_struct *p;
+       unsigned long addr;
+       int ret;
+
+       addr = (unsigned long)ftrace_record_ip;
+
+       stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
+
+       /* ftrace_dyn_arch_init places the return code in addr */
+       if (addr) {
+               ret = (int)addr;
+               goto failed;
+       }
+
+       ret = ftrace_dyn_table_alloc();
+       if (ret)
+               goto failed;
+
+       p = kthread_run(ftraced, NULL, "ftraced");
+       if (IS_ERR(p)) {
+               /* report the real errno instead of a bare -1 */
+               ret = PTR_ERR(p);
+               goto failed;
+       }
+
+       last_ftrace_enabled = ftrace_enabled = 1;
+       ftraced_task = p;
+
+       return 0;
+
+ failed:
+       ftrace_disabled = 1;
+       return ret;
+}
+
+core_initcall(ftrace_dynamic_init);
+#else
+# define ftrace_startup()              do { } while (0)
+# define ftrace_shutdown()             do { } while (0)
+# define ftrace_startup_sysctl()       do { } while (0)
+# define ftrace_shutdown_sysctl()      do { } while (0)
+# define ftrace_force_shutdown()       do { } while (0)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/**
+ * ftrace_kill_atomic - kill ftrace from critical sections
+ *
+ * This function should be used by panic code. It stops ftrace
+ * but in a not so nice way: no lock is taken and the daemon thread
+ * is not stopped. If you need to simply kill ftrace
+ * from a non-atomic section, use ftrace_kill.
+ */
+void ftrace_kill_atomic(void)
+{
+       ftrace_disabled = 1;
+       ftrace_enabled = 0;
+#ifdef CONFIG_DYNAMIC_FTRACE
+       /* same marker that ftrace_force_shutdown() stores */
+       ftraced_suspend = -1;
+#endif
+       clear_ftrace_function();
+}
+
+/**
+ * ftrace_kill - totally shutdown ftrace
+ *
+ * This is a safety measure, used when something went wrong. If
+ * something was detected that seems wrong, calling this function
+ * will keep ftrace from doing any more modifications and updates.
+ */
+void ftrace_kill(void)
+{
+       mutex_lock(&ftrace_sysctl_lock);
+       ftrace_disabled = 1;
+       ftrace_enabled = 0;
+
+       clear_ftrace_function();
+       mutex_unlock(&ftrace_sysctl_lock);
+
+       /* Try to totally disable ftrace */
+       /* (expands to nothing without CONFIG_DYNAMIC_FTRACE) */
+       ftrace_force_shutdown();
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Adds @ops to the set of callbacks invoked from every traced kernel
+ * function and starts tracing.
+ *
+ * Returns -1 when ftrace is disabled, otherwise the result of the
+ * internal registration.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+       int err;
+
+       if (unlikely(ftrace_disabled))
+               return -1;
+
+       mutex_lock(&ftrace_sysctl_lock);
+
+       err = __register_ftrace_function(ops);
+       ftrace_startup();
+
+       mutex_unlock(&ftrace_sysctl_lock);
+
+       return err;
+}
+
+/**
+ * unregister_ftrace_function - unregister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Removes a callback that was previously added with
+ * register_ftrace_function() and returns the result of the internal
+ * unregistration.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+       int err;
+
+       mutex_lock(&ftrace_sysctl_lock);
+
+       err = __unregister_ftrace_function(ops);
+       ftrace_shutdown();
+
+       mutex_unlock(&ftrace_sysctl_lock);
+
+       return err;
+}
+
+/*
+ * sysctl handler for the ftrace_enabled switch.
+ *
+ * Lets proc_dointvec() read or update the value and, when a write
+ * actually changed it, starts or stops function tracing to match.
+ * Returns -ENODEV when ftrace is disabled, otherwise the
+ * proc_dointvec() result.
+ */
+int
+ftrace_enable_sysctl(struct ctl_table *table, int write,
+                    struct file *file, void __user *buffer, size_t *lenp,
+                    loff_t *ppos)
+{
+       int ret;
+
+       if (unlikely(ftrace_disabled))
+               return -ENODEV;
+
+       mutex_lock(&ftrace_sysctl_lock);
+
+       ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
+       if (ret)
+               goto out;
+       if (!write)
+               goto out;
+       if (last_ftrace_enabled == ftrace_enabled)
+               goto out;
+
+       last_ftrace_enabled = ftrace_enabled;
+
+       if (!ftrace_enabled) {
+               /* stopping ftrace calls (just send to ftrace_stub) */
+               ftrace_trace_function = ftrace_stub;
+
+               ftrace_shutdown_sysctl();
+               goto out;
+       }
+
+       ftrace_startup_sysctl();
+
+       /* we are starting ftrace again */
+       if (ftrace_list == &ftrace_list_end)
+               goto out;
+
+       /* a single registered callback is called directly */
+       if (ftrace_list->next == &ftrace_list_end)
+               ftrace_trace_function = ftrace_list->func;
+       else
+               ftrace_trace_function = ftrace_list_func;
+
+ out:
+       mutex_unlock(&ftrace_sysctl_lock);
+       return ret;
+}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644 (file)
index 0000000..868e121
--- /dev/null
@@ -0,0 +1,3161 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally taken from the RT patch by:
+ *    Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/utsrelease.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pagemap.h>
+#include <linux/hardirq.h>
+#include <linux/linkage.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/kprobes.h>
+#include <linux/writeback.h>
+
+#include <linux/stacktrace.h>
+
+#include "trace.h"
+
+/* starts at the largest value; stored as saved_latency by __update_max_tr() */
+unsigned long __read_mostly    tracing_max_latency = (cycle_t)ULONG_MAX;
+/* NOTE(review): presumably a latency threshold - confirm against users */
+unsigned long __read_mostly    tracing_thresh;
+
+static unsigned long __read_mostly     tracing_nr_buffers;
+/* set of CPUs visited by for_each_tracing_cpu() */
+static cpumask_t __read_mostly         tracing_buffer_mask;
+
+#define for_each_tracing_cpu(cpu)      \
+       for_each_cpu_mask(cpu, tracing_buffer_mask)
+
+static int trace_alloc_page(void);
+static int trace_free_page(void);
+
+/* starts out set; also set by CHECK_COND() when buffer corruption is seen */
+static int tracing_disabled = 1;
+
+static unsigned long tracing_pages_allocated;
+
+/* Convert nanoseconds to microseconds, rounding to the nearest value. */
+long
+ns2usecs(cycle_t nsec)
+{
+       /* adding half a microsecond turns the truncating divide into rounding */
+       nsec += 500;
+       do_div(nsec, 1000);
+
+       return nsec;
+}
+
+/* Timestamp source of the tracer: the cpu_clock() reading for @cpu. */
+cycle_t ftrace_now(int cpu)
+{
+       cycle_t now = cpu_clock(cpu);
+
+       return now;
+}
+
+/*
+ * The global_trace is the descriptor that holds the tracing
+ * buffers for the live tracing. For each CPU, it contains
+ * a link list of pages that will store trace entries. The
+ * page descriptor of the pages in the memory is used to hold
+ * the link list by linking the lru item in the page descriptor
+ * to each of the pages in the buffer per CPU.
+ *
+ * For each active CPU there is a data field that holds the
+ * pages for the buffer for that CPU. Each CPU has the same number
+ * of pages allocated for its buffer.
+ */
+static struct trace_array      global_trace;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+
+/*
+ * The max_tr is used to snapshot the global_trace when a maximum
+ * latency is reached. Some tracers will use this to store a maximum
+ * trace while it continues examining live traces.
+ *
+ * The buffers for the max_tr are set up the same as the global_trace.
+ * When a snapshot is taken, the link list of the max_tr is swapped
+ * with the link list of the global_trace and the buffers are reset for
+ * the global_trace so the tracing can continue.
+ */
+static struct trace_array      max_tr;
+
+static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
+
+/* tracer_enabled is used to toggle activation of a tracer */
+static int                     tracer_enabled = 1;
+
+/* function tracing enabled */
+int                            ftrace_function_enabled;
+
+/*
+ * trace_nr_entries is the number of entries that is allocated
+ * for a buffer. Note, the number of entries is always rounded
+ * to ENTRIES_PER_PAGE.
+ */
+static unsigned long           trace_nr_entries = 65536UL;
+
+/* trace_types holds a link list of available tracers. */
+static struct tracer           *trace_types __read_mostly;
+
+/* current_trace points to the tracer that is currently active */
+static struct tracer           *current_trace __read_mostly;
+
+/*
+ * max_tracer_type_len is used to simplify the allocating of
+ * buffers to read userspace tracer names. We keep track of
+ * the longest tracer name registered.
+ */
+static int                     max_tracer_type_len;
+
+/*
+ * trace_types_lock is used to protect the trace_types list.
+ * This lock is also used to keep user access serialized.
+ * Accesses from userspace will grab this lock while userspace
+ * activities happen inside the kernel.
+ */
+static DEFINE_MUTEX(trace_types_lock);
+
+/* trace_wait is a waitqueue for tasks blocked on trace_poll */
+static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
+
+/* trace_flags holds iter_ctrl options */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+
+/*
+ * init callback of the "none" tracer: switches function tracing off,
+ * resets the per-CPU buffers of online CPUs when tr->ctrl is set and
+ * clears tracer_enabled.
+ */
+static notrace void no_trace_init(struct trace_array *tr)
+{
+       int cpu;
+
+       ftrace_function_enabled = 0;
+
+       if (tr->ctrl) {
+               for_each_online_cpu(cpu)
+                       tracing_reset(tr->data[cpu]);
+       }
+
+       tracer_enabled = 0;
+}
+
+/*
+ * Dummy tracer used to disable tracing.  It is named "none" and only
+ * provides an init callback, no_trace_init().
+ */
+static struct tracer no_tracer __read_mostly = {
+       .name           = "none",
+       .init           = no_trace_init
+};
+
+
+/**
+ * trace_wake_up - wake up tasks waiting for trace input
+ *
+ * Wakes up any task that is blocked on the trace_wait queue. This is
+ * used together with trace_poll for tasks polling the trace. Nothing
+ * is woken while TRACE_ITER_BLOCK is set in trace_flags or while the
+ * runqueue is reported as locked.
+ */
+void trace_wake_up(void)
+{
+       if (trace_flags & TRACE_ITER_BLOCK)
+               return;
+
+       /*
+        * The runqueue_is_locked() can fail, but this is the best we
+        * have for now:
+        */
+       if (runqueue_is_locked())
+               return;
+
+       wake_up(&trace_wait);
+}
+
+/* number of whole struct trace_entry records that fit in one page */
+#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
+
+/*
+ * Handler of the "trace_entries=" boot parameter.  Stores the parsed
+ * value in trace_nr_entries and returns 1; returns 0 and leaves the
+ * default untouched when the string is missing, unparsable or zero.
+ */
+static int __init set_nr_entries(char *str)
+{
+       unsigned long nr_entries;
+
+       if (!str)
+               return 0;
+
+       /* nr_entries can not be zero */
+       if (strict_strtoul(str, 0, &nr_entries) < 0 || !nr_entries)
+               return 0;
+
+       trace_nr_entries = nr_entries;
+
+       return 1;
+}
+__setup("trace_entries=", set_nr_entries);
+
+/* Convert nanoseconds to microseconds, truncating toward zero. */
+unsigned long nsecs_to_usecs(unsigned long nsecs)
+{
+       unsigned long usecs = nsecs / 1000;
+
+       return usecs;
+}
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. Each state is a separate bit:
+ *  IRQS_OFF     - interrupts were disabled
+ *  NEED_RESCHED - reschedule is requested
+ *  HARDIRQ      - inside an interrupt handler
+ *  SOFTIRQ      - inside a softirq handler
+ */
+enum trace_flag_type {
+       TRACE_FLAG_IRQS_OFF             = 0x01,
+       TRACE_FLAG_NEED_RESCHED         = 0x02,
+       TRACE_FLAG_HARDIRQ              = 0x04,
+       TRACE_FLAG_SOFTIRQ              = 0x08,
+};
+
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+       (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
+/*
+ * Option names, terminated by a NULL entry.
+ * These must match the bit positions in trace_iterator_flags.
+ */
+static const char *trace_options[] = {
+       "print-parent",
+       "sym-offset",
+       "sym-addr",
+       "verbose",
+       "raw",
+       "hex",
+       "bin",
+       "block",
+       "stacktrace",
+       "sched-tree",
+       NULL
+};
+
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a raw_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ */
+static raw_spinlock_t ftrace_max_lock =
+       (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /debugfs/tracing/latency_trace)
+ *
+ * Only the bookkeeping is recorded here: the CPU, the start time
+ * taken from @tr's per-CPU data, the current tracing_max_latency and
+ * the identity and scheduling attributes of @tsk.
+ */
+static void
+__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+       struct trace_array_cpu *data = tr->data[cpu];
+
+       max_tr.cpu = cpu;
+       max_tr.time_start = data->preempt_timestamp;
+
+       /* from here on data refers to the max_tr side */
+       data = max_tr.data[cpu];
+       data->saved_latency = tracing_max_latency;
+
+       memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+       data->pid = tsk->pid;
+       data->uid = tsk->uid;
+       data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
+       data->policy = tsk->policy;
+       data->rt_priority = tsk->rt_priority;
+
+       /* record this tasks comm */
+       /* NOTE(review): passes current, not tsk - confirm this is intended */
+       tracing_record_cmdline(current);
+}
+
+#define CHECK_COND(cond)                       \
+       if (unlikely(cond)) {                   \
+               tracing_disabled = 1;           \
+               WARN_ON(1);                     \
+               return -1;                      \
+       }
+
+/**
+ * check_pages - integrity check of trace buffers
+ * @data: per-CPU trace descriptor whose page list is verified
+ *
+ * As a safety measure, verify that the list head and every page on the
+ * list are still consistently linked with their neighbours. Returns 0
+ * when the list is intact and -1 (with tracing disabled) otherwise.
+ */
+int check_pages(struct trace_array_cpu *data)
+{
+       struct page *pg, *next;
+
+       /* the list head itself */
+       CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
+       CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
+
+       /* every page hanging off it */
+       list_for_each_entry_safe(pg, next, &data->trace_pages, lru) {
+               CHECK_COND(pg->lru.next->prev != &pg->lru);
+               CHECK_COND(pg->lru.prev->next != &pg->lru);
+       }
+
+       return 0;
+}
+
+/**
+ * head_page - page address of the first page in per_cpu buffer.
+ * @data: per-CPU trace descriptor
+ *
+ * Returns the address of the first page on @data's page list, or NULL
+ * when the list is empty. A first entry that turns out to be the list
+ * head itself is treated as corruption and triggers a BUG.
+ */
+void *head_page(struct trace_array_cpu *data)
+{
+       struct list_head *first;
+       struct page *page;
+
+       if (list_empty(&data->trace_pages))
+               return NULL;
+
+       first = data->trace_pages.next;
+       page = list_entry(first, struct page, lru);
+       BUG_ON(&page->lru == &data->trace_pages);
+
+       return page_address(page);
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ *
+ * Returns 0 when the buffer is full or the formatted text does not fit
+ * (nothing is added in that case). On success the return value is the
+ * space that was free before the call, not the number of characters
+ * written; it is non-zero.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+       /* free space, keeping the last byte of the page in reserve */
+       int len = (PAGE_SIZE - 1) - s->len;
+       va_list ap;
+       int ret;
+
+       if (!len)
+               return 0;
+
+       va_start(ap, fmt);
+       ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+       va_end(ap);
+
+       /* If we can't write it all, don't bother writing anything */
+       if (ret >= len)
+               return 0;
+
+       s->len += ret;
+
+       return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * Appends @str, without its terminating NUL, to the buffer of @s for
+ * later retrieval by a sequencer or other mechanism. Returns the
+ * number of bytes added, or 0 when the string does not fit (in which
+ * case nothing is added).
+ */
+static int
+trace_seq_puts(struct trace_seq *s, const char *str)
+{
+       int n = strlen(str);
+
+       if (n <= ((PAGE_SIZE - 1) - s->len)) {
+               memcpy(s->buffer + s->len, str, n);
+               s->len += n;
+               return n;
+       }
+
+       return 0;
+}
+
+/*
+ * Append the single byte @c to the buffer of @s.  Returns 1 when the
+ * byte was stored and 0 when the buffer is already full.
+ */
+static int
+trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+       if (s->len < (PAGE_SIZE - 1)) {
+               s->buffer[s->len++] = c;
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Append @len raw bytes from @mem to the buffer of @s.  Returns @len on
+ * success and 0 when the bytes do not fit (nothing is added then).
+ */
+static int
+trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+       if (len <= ((PAGE_SIZE - 1) - s->len)) {
+               memcpy(s->buffer + s->len, mem, len);
+               s->len += len;
+               return len;
+       }
+
+       return 0;
+}
+
+/* scratch size: two hex digits for each of up to 8 bytes, plus a space */
+#define HEX_CHARS 17
+static const char hex2asc[] = "0123456789abcdef";
+
+/*
+ * trace_seq_putmem_hex - append a value as hex digits followed by a space
+ * @s: trace sequence descriptor
+ * @mem: start of the value
+ * @len: size of the value in bytes; at most (HEX_CHARS - 1) / 2
+ *
+ * The bytes are emitted most significant first (the walk direction
+ * depends on __BIG_ENDIAN) and each byte as high nibble then low
+ * nibble, so the text reads like the number written in hex.
+ *
+ * Returns the number of characters added, or 0 when they do not fit.
+ */
+static int
+trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+       unsigned char hex[HEX_CHARS];
+       unsigned char *data = mem;
+       unsigned char byte;
+       int i, j;
+
+       /* two digits per byte plus the trailing space must fit in hex[] */
+       BUG_ON(len * 2 + 1 > HEX_CHARS);
+
+#ifdef __BIG_ENDIAN
+       for (i = 0, j = 0; i < len; i++) {
+#else
+       for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+               byte = data[i];
+
+               /* high nibble first, otherwise 0x12 would print as "21" */
+               hex[j++] = hex2asc[byte >> 4];
+               hex[j++] = hex2asc[byte & 0x0f];
+       }
+       hex[j++] = ' ';
+
+       return trace_seq_putmem(s, hex, j);
+}
+
+/* Empty the sequence buffer: no stored bytes and nothing read yet. */
+static void
+trace_seq_reset(struct trace_seq *s)
+{
+       s->readpos = 0;
+       s->len = 0;
+}
+
+/*
+ * trace_seq_to_user - copy unread sequence data to user space
+ * @s: trace sequence descriptor
+ * @ubuf: destination user buffer
+ * @cnt: capacity of @ubuf in bytes
+ *
+ * Copies at most @cnt bytes starting at the read position of @s and
+ * advances the read position by the amount actually copied, so that a
+ * short read can be continued by the next call.
+ *
+ * Returns the number of bytes copied, -EBUSY when there is nothing left
+ * to read, or -EFAULT when the user buffer cannot be written.
+ */
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+{
+       int len;
+       int ret;
+
+       if (s->len <= s->readpos)
+               return -EBUSY;
+
+       len = s->len - s->readpos;
+       if (cnt > len)
+               cnt = len;
+       ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+       if (ret)
+               return -EFAULT;
+
+       /* advance by what was handed out, not by everything pending */
+       s->readpos += cnt;
+       return cnt;
+}
+
+/*
+ * Flush the text collected in @s into the seq_file @m and reset @s.
+ *
+ * The buffer is NUL-terminated at its current length (clamped to
+ * PAGE_SIZE - 1) before it is handed to seq_puts().
+ */
+static void
+trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+       int end = s->len;
+
+       if (s->len >= PAGE_SIZE)
+               end = PAGE_SIZE - 1;
+
+       s->buffer[end] = 0;
+       seq_puts(m, s->buffer);
+
+       trace_seq_reset(s);
+}
+
+/*
+ * Flip the trace buffers between two per-cpu trace descriptors.
+ * This is usually done between the buffers of the global_trace and
+ * those of the max_tr, to record a snapshot of the current trace.
+ *
+ * The page lists of @tr1 and @tr2 are exchanged. The bookkeeping
+ * fields, from trace_head_idx up to the end of the structure, are
+ * copied one way only: from @tr2 into @tr1.
+ *
+ * The ftrace_max_lock must be held.
+ */
+static void
+flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
+{
+       struct list_head flip_pages;
+
+       INIT_LIST_HEAD(&flip_pages);
+
+       /* tr1 takes over tr2's indices and statistics (not a swap) */
+       memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
+               sizeof(struct trace_array_cpu) -
+               offsetof(struct trace_array_cpu, trace_head_idx));
+
+       check_pages(tr1);
+       check_pages(tr2);
+       /* three-way exchange of the page lists through a temporary head */
+       list_splice_init(&tr1->trace_pages, &flip_pages);
+       list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
+       list_splice_init(&flip_pages, &tr2->trace_pages);
+       BUG_ON(!list_empty(&flip_pages));
+       check_pages(tr1);
+       check_pages(tr2);
+}
+
+/**
+ * update_max_tr - snapshot all trace buffers from global_trace to max_tr
+ * @tr: tracer
+ * @tsk: the task with the latency
+ * @cpu: The cpu that initiated the trace.
+ *
+ * Flip the buffers between the @tr and the max_tr and record information
+ * about which task was the cause of this latency.
+ *
+ * Expected to be called with interrupts disabled; a one-time warning is
+ * printed otherwise. Takes ftrace_max_lock for the duration of the flip.
+ */
+void
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+       struct trace_array_cpu *data;
+       int i;
+
+       WARN_ON_ONCE(!irqs_disabled());
+       __raw_spin_lock(&ftrace_max_lock);
+       /* clear out all the previous traces */
+       for_each_tracing_cpu(i) {
+               data = tr->data[i];
+               flip_trace(max_tr.data[i], data);
+               /* @tr now owns the old max pages: start it from scratch */
+               tracing_reset(data);
+       }
+
+       __update_max_tr(tr, tsk, cpu);
+       __raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * update_max_tr_single - only copy one trace over, and reset the rest
+ * @tr: tracer
+ * @tsk: task with the latency
+ * @cpu: the cpu of the buffer to copy.
+ *
+ * Flip the trace of a single CPU buffer between the @tr and the max_tr.
+ * All max_tr buffers are reset first, so after the call only the buffer
+ * of @cpu carries data in max_tr.
+ *
+ * Expected to be called with interrupts disabled; a one-time warning is
+ * printed otherwise. Takes ftrace_max_lock for the duration of the flip.
+ */
+void
+update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
+{
+       struct trace_array_cpu *data = tr->data[cpu];
+       int i;
+
+       WARN_ON_ONCE(!irqs_disabled());
+       __raw_spin_lock(&ftrace_max_lock);
+       /* drop whatever snapshot max_tr held before, on every cpu */
+       for_each_tracing_cpu(i)
+               tracing_reset(max_tr.data[i]);
+
+       flip_trace(max_tr.data[cpu], data);
+       /* @tr got the (just reset) max pages in exchange: restart it */
+       tracing_reset(data);
+
+       __update_max_tr(tr, tsk, cpu);
+       __raw_spin_unlock(&ftrace_max_lock);
+}
+
+/**
+ * register_tracer - register a tracer with the ftrace system.
+ * @type: the plugin for the tracer
+ *
+ * Register a new plugin tracer. The tracer must have a name, and that
+ * name must not already be registered. With CONFIG_FTRACE_STARTUP_TEST
+ * a tracer that provides a selftest is only registered if the selftest
+ * succeeds.
+ *
+ * Returns 0 on success, -1 for a missing or duplicate name, or the
+ * non-zero result of a failed selftest.
+ */
+int register_tracer(struct tracer *type)
+{
+       struct tracer *t;
+       int len;
+       int ret = 0;
+
+       if (!type->name) {
+               pr_info("Tracer must have a name\n");
+               return -1;
+       }
+
+       mutex_lock(&trace_types_lock);
+       /* refuse a second tracer with the same name */
+       for (t = trace_types; t; t = t->next) {
+               if (strcmp(type->name, t->name) == 0) {
+                       /* already found */
+                       pr_info("Trace %s already registered\n",
+                               type->name);
+                       ret = -1;
+                       goto out;
+               }
+       }
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+       if (type->selftest) {
+               struct tracer *saved_tracer = current_trace;
+               struct trace_array_cpu *data;
+               struct trace_array *tr = &global_trace;
+               int saved_ctrl = tr->ctrl;
+               int i;
+               /*
+                * Run a selftest on this tracer.
+                * Here we reset the trace buffer, and set the current
+                * tracer to be this tracer. The tracer can then run some
+                * internal tracing to verify that everything is in order.
+                * If we fail, we do not register this tracer.
+                */
+               for_each_tracing_cpu(i) {
+                       data = tr->data[i];
+                       /* skip cpus that have no trace pages */
+                       if (!head_page(data))
+                               continue;
+                       tracing_reset(data);
+               }
+               current_trace = type;
+               tr->ctrl = 0;
+               /* the test is responsible for initializing and enabling */
+               pr_info("Testing tracer %s: ", type->name);
+               ret = type->selftest(type, tr);
+               /* the test is responsible for resetting too */
+               current_trace = saved_tracer;
+               tr->ctrl = saved_ctrl;
+               if (ret) {
+                       printk(KERN_CONT "FAILED!\n");
+                       goto out;
+               }
+               /* Only reset on passing, to avoid touching corrupted buffers */
+               for_each_tracing_cpu(i) {
+                       data = tr->data[i];
+                       if (!head_page(data))
+                               continue;
+                       tracing_reset(data);
+               }
+               printk(KERN_CONT "PASSED\n");
+       }
+#endif
+
+       /* push onto the head of the list and track the longest name */
+       type->next = trace_types;
+       trace_types = type;
+       len = strlen(type->name);
+       if (len > max_tracer_type_len)
+               max_tracer_type_len = len;
+
+ out:
+       mutex_unlock(&trace_types_lock);
+
+       return ret;
+}
+
+/*
+ * Remove the tracer @type from the list of registered tracers.
+ *
+ * An unknown tracer is only reported, nothing else happens. When the
+ * removed tracer carried a name of the current maximum length, the
+ * maximum is recomputed from the tracers that remain.
+ */
+void unregister_tracer(struct tracer *type)
+{
+       struct tracer **link;
+       struct tracer *cur;
+       int namelen;
+
+       mutex_lock(&trace_types_lock);
+
+       /* find the link that points at @type */
+       link = &trace_types;
+       while (*link && *link != type)
+               link = &(*link)->next;
+
+       if (!*link) {
+               pr_info("Trace %s not registered\n", type->name);
+               goto unlock;
+       }
+
+       /* unlink it */
+       *link = (*link)->next;
+
+       if (strlen(type->name) == max_tracer_type_len) {
+               /* the longest name may be gone: rescan what is left */
+               max_tracer_type_len = 0;
+               for (cur = trace_types; cur; cur = cur->next) {
+                       namelen = strlen(cur->name);
+                       if (namelen > max_tracer_type_len)
+                               max_tracer_type_len = namelen;
+               }
+       }
+
+ unlock:
+       mutex_unlock(&trace_types_lock);
+}
+
+/*
+ * Put the per-cpu buffer @data back into its empty state: counters are
+ * cleared and both the head and the tail point at the first slot of the
+ * first page.
+ */
+void tracing_reset(struct trace_array_cpu *data)
+{
+       data->trace_idx = 0;
+       data->overrun = 0;
+
+       data->trace_tail = head_page(data);
+       data->trace_head = data->trace_tail;
+
+       data->trace_tail_idx = 0;
+       data->trace_head_idx = 0;
+}
+
+/* number of command names (comms) that are remembered at a time */
+#define SAVED_CMDLINES 128
+/* pid -> slot in saved_cmdlines[]; values >= SAVED_CMDLINES mean "none" */
+static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
+/* slot -> pid; consulted when a slot is about to be recycled */
+static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
+/* the remembered comms, one per slot */
+static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
+/* slot that was handed out most recently */
+static int cmdline_idx;
+/* serializes updates of the maps and of saved_cmdlines[] */
+static DEFINE_SPINLOCK(trace_cmdline_lock);
+
+/* non-zero while recording of comms is temporarily disabled */
+atomic_t trace_record_cmdline_disabled __read_mostly;
+
+/*
+ * Start with an empty comm cache: every entry of both maps is filled
+ * with all-ones bytes and slot allocation restarts at index 0.
+ */
+static void trace_init_cmdlines(void)
+{
+       cmdline_idx = 0;
+       memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+       memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
+}
+
+void trace_stop_cmdline_recording(void);
+
+/*
+ * Remember the comm of @tsk so that trace output can later show a
+ * command name next to its pid.
+ *
+ * pid 0 and pids above PID_MAX_DEFAULT are not recorded. A pid that
+ * already owns a slot has its comm refreshed in place; otherwise the
+ * next slot is recycled round-robin. The update is best effort: it is
+ * skipped when the lock is contended.
+ */
+static void trace_save_cmdline(struct task_struct *tsk)
+{
+       unsigned map;
+       unsigned idx;
+
+       if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+               return;
+
+       /*
+        * It's not the end of the world if we don't get
+        * the lock, but we also don't want to spin
+        * nor do we want to disable interrupts,
+        * so if we miss here, then better luck next time.
+        */
+       if (!spin_trylock(&trace_cmdline_lock))
+               return;
+
+       idx = map_pid_to_cmdline[tsk->pid];
+       if (idx >= SAVED_CMDLINES) {
+               idx = (cmdline_idx + 1) % SAVED_CMDLINES;
+
+               /*
+                * The slot is about to be overwritten. If another pid
+                * still maps to it, drop that mapping, otherwise the
+                * old pid would be shown with the new comm.
+                */
+               map = map_cmdline_to_pid[idx];
+               if (map <= PID_MAX_DEFAULT)
+                       map_pid_to_cmdline[map] = (unsigned)-1;
+
+               /* record the new owner so the check above can find it */
+               map_cmdline_to_pid[idx] = tsk->pid;
+               map_pid_to_cmdline[tsk->pid] = idx;
+
+               cmdline_idx = idx;
+       }
+
+       memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
+
+       spin_unlock(&trace_cmdline_lock);
+}
+
+/*
+ * Look up the remembered comm for @pid.
+ *
+ * Returns "<idle>" for pid 0, the saved comm when one is cached, and
+ * "<...>" when the pid is outside the range covered by the map or has
+ * no cached comm.
+ */
+static char *trace_find_cmdline(int pid)
+{
+       char *cmdline = "<...>";
+       unsigned map;
+
+       if (!pid)
+               return "<idle>";
+
+       /* a negative pid must never be used as an array index */
+       if (pid < 0 || pid > PID_MAX_DEFAULT)
+               goto out;
+
+       map = map_pid_to_cmdline[pid];
+       if (map >= SAVED_CMDLINES)
+               goto out;
+
+       cmdline = saved_cmdlines[map];
+
+ out:
+       return cmdline;
+}
+
+/*
+ * Record the comm of @tsk, unless recording is currently disabled
+ * through trace_record_cmdline_disabled.
+ */
+void tracing_record_cmdline(struct task_struct *tsk)
+{
+       if (!atomic_read(&trace_record_cmdline_disabled))
+               trace_save_cmdline(tsk);
+}
+
+/*
+ * Return the list node that follows @next in the page list of @data,
+ * wrapping around at the end. The list head itself is never returned.
+ */
+static inline struct list_head *
+trace_next_list(struct trace_array_cpu *data, struct list_head *next)
+{
+       struct list_head *pos = next->next;
+
+       /*
+        * Roundrobin - the list head is only an anchor and not a real
+        * page, so step over it:
+        */
+       if (unlikely(pos == &data->trace_pages))
+               pos = pos->next;
+       BUG_ON(pos == &data->trace_pages);
+
+       return pos;
+}
+
+/*
+ * Given the address @addr inside one of the trace pages of @data,
+ * return the address of the page that follows it in the page list.
+ */
+static inline void *
+trace_next_page(struct trace_array_cpu *data, void *addr)
+{
+       struct page *cur = virt_to_page(addr);
+       struct list_head *node = trace_next_list(data, &cur->lru);
+       struct page *following = list_entry(node, struct page, lru);
+
+       return page_address(following);
+}
+
+/*
+ * Reserve the next entry slot in the per-cpu buffer @data and return it.
+ *
+ * The head advances by one slot, moving on to the next page when the
+ * current one is full. If the advanced head lands on the tail, the
+ * buffer is full: the tail is pushed forward by one slot and the
+ * overrun counter is incremented. @tr is not used here.
+ */
+static inline struct trace_entry *
+tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
+{
+       unsigned long idx, idx_next;
+       struct trace_entry *entry;
+
+       /* counts every entry handed out, overwritten ones included */
+       data->trace_idx++;
+       idx = data->trace_head_idx;
+       idx_next = idx + 1;
+
+       BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
+
+       /* the slot handed out is the one the head currently points at */
+       entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
+
+       if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
+               data->trace_head = trace_next_page(data, data->trace_head);
+               idx_next = 0;
+       }
+
+       if (data->trace_head == data->trace_tail &&
+           idx_next == data->trace_tail_idx) {
+               /* overrun */
+               data->overrun++;
+               data->trace_tail_idx++;
+               if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
+                       data->trace_tail =
+                               trace_next_page(data, data->trace_tail);
+                       data->trace_tail_idx = 0;
+               }
+       }
+
+       data->trace_head_idx = idx_next;
+
+       return entry;
+}
+
+/*
+ * Fill in the fields that every trace entry carries: the low byte of
+ * the preempt count, the pid of the current task, a timestamp and the
+ * context flags (irqs off according to @flags, hardirq, softirq,
+ * need-resched).
+ */
+static inline void
+tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
+{
+       struct task_struct *tsk = current;
+       unsigned long pc = preempt_count();
+       int bits = 0;
+
+       entry->preempt_count    = pc & 0xff;
+       entry->pid              = (tsk) ? tsk->pid : 0;
+       entry->t                = ftrace_now(raw_smp_processor_id());
+
+       if (irqs_disabled_flags(flags))
+               bits |= TRACE_FLAG_IRQS_OFF;
+       if (pc & HARDIRQ_MASK)
+               bits |= TRACE_FLAG_HARDIRQ;
+       if (pc & SOFTIRQ_MASK)
+               bits |= TRACE_FLAG_SOFTIRQ;
+       if (need_resched())
+               bits |= TRACE_FLAG_NEED_RESCHED;
+
+       entry->flags = bits;
+}
+
+/*
+ * Record a TRACE_FN entry for the function at @ip, called from
+ * @parent_ip, in the per-cpu buffer @data.
+ *
+ * @flags is the irq state that gets recorded in the entry. It is
+ * distinct from the local irq_flags, which only protect the buffer
+ * update itself.
+ */
+void
+trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+              unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+       struct trace_entry *entry;
+       unsigned long irq_flags;
+
+       /* interrupts off first, then the per-cpu buffer lock */
+       raw_local_irq_save(irq_flags);
+       __raw_spin_lock(&data->lock);
+       entry                   = tracing_get_trace_entry(tr, data);
+       tracing_generic_entry_update(entry, flags);
+       entry->type             = TRACE_FN;
+       entry->fn.ip            = ip;
+       entry->fn.parent_ip     = parent_ip;
+       __raw_spin_unlock(&data->lock);
+       raw_local_irq_restore(irq_flags);
+}
+
+/*
+ * Record a function entry like trace_function() does, but only while
+ * the per-cpu buffer @data is not disabled.
+ */
+void
+ftrace(struct trace_array *tr, struct trace_array_cpu *data,
+       unsigned long ip, unsigned long parent_ip, unsigned long flags)
+{
+       if (unlikely(atomic_read(&data->disabled)))
+               return;
+
+       trace_function(tr, data, ip, parent_ip, flags);
+}
+
+#ifdef CONFIG_MMIOTRACE
+/*
+ * Record a TRACE_MMIO_RW entry holding a copy of @rw in the per-cpu
+ * buffer @data, then call trace_wake_up().
+ */
+void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
+                                               struct mmiotrace_rw *rw)
+{
+       struct trace_entry *entry;
+       unsigned long irq_flags;
+
+       raw_local_irq_save(irq_flags);
+       __raw_spin_lock(&data->lock);
+
+       entry                   = tracing_get_trace_entry(tr, data);
+       /* the irq state is not recorded for mmio entries: flags are 0 */
+       tracing_generic_entry_update(entry, 0);
+       entry->type             = TRACE_MMIO_RW;
+       entry->mmiorw           = *rw;
+
+       __raw_spin_unlock(&data->lock);
+       raw_local_irq_restore(irq_flags);
+
+       /* only after the lock is dropped and irqs are restored */
+       trace_wake_up();
+}
+
+/*
+ * Record a TRACE_MMIO_MAP entry holding a copy of @map in the per-cpu
+ * buffer @data, then call trace_wake_up().
+ */
+void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
+                                               struct mmiotrace_map *map)
+{
+       struct trace_entry *entry;
+       unsigned long irq_flags;
+
+       raw_local_irq_save(irq_flags);
+       __raw_spin_lock(&data->lock);
+
+       entry                   = tracing_get_trace_entry(tr, data);
+       /* the irq state is not recorded for mmio entries: flags are 0 */
+       tracing_generic_entry_update(entry, 0);
+       entry->type             = TRACE_MMIO_MAP;
+       entry->mmiomap          = *map;
+
+       __raw_spin_unlock(&data->lock);
+       raw_local_irq_restore(irq_flags);
+
+       /* only after the lock is dropped and irqs are restored */
+       trace_wake_up();
+}
+#endif
+
+/*
+ * Record a TRACE_STACK entry with the current call chain in @data,
+ * skipping the innermost @skip frames. Does nothing unless the
+ * TRACE_ITER_STACKTRACE option is set in trace_flags.
+ *
+ * No locking is done here; the callers in this file invoke it with
+ * data->lock held.
+ */
+void __trace_stack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags,
+                  int skip)
+{
+       struct stack_trace trace;
+       struct trace_entry *entry;
+
+       if (trace_flags & TRACE_ITER_STACKTRACE) {
+               entry = tracing_get_trace_entry(tr, data);
+               tracing_generic_entry_update(entry, flags);
+               entry->type = TRACE_STACK;
+
+               /* unused caller slots must read as zero */
+               memset(&entry->stack, 0, sizeof(entry->stack));
+
+               trace.entries = entry->stack.caller;
+               trace.max_entries = FTRACE_STACK_ENTRIES;
+               trace.nr_entries = 0;
+               trace.skip = skip;
+
+               save_stack_trace(&trace);
+       }
+}
+
+/*
+ * Record a TRACE_SPECIAL entry carrying the three free-form values
+ * @arg1, @arg2 and @arg3, optionally followed by a stack trace entry,
+ * then call trace_wake_up().
+ *
+ * @__tr and @__data are passed as void pointers; they are used as a
+ * struct trace_array and a struct trace_array_cpu respectively.
+ */
+void
+__trace_special(void *__tr, void *__data,
+               unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+       struct trace_array_cpu *data = __data;
+       struct trace_array *tr = __tr;
+       struct trace_entry *entry;
+       unsigned long irq_flags;
+
+       raw_local_irq_save(irq_flags);
+       __raw_spin_lock(&data->lock);
+       entry                   = tracing_get_trace_entry(tr, data);
+       tracing_generic_entry_update(entry, 0);
+       entry->type             = TRACE_SPECIAL;
+       entry->special.arg1     = arg1;
+       entry->special.arg2     = arg2;
+       entry->special.arg3     = arg3;
+       /* the stack entry records the irq state saved just above */
+       __trace_stack(tr, data, irq_flags, 4);
+       __raw_spin_unlock(&data->lock);
+       raw_local_irq_restore(irq_flags);
+
+       trace_wake_up();
+}
+
+/*
+ * Record a TRACE_CTX entry for a context switch from @prev to @next:
+ * pid, priority and state of both tasks are stored. A stack trace
+ * entry follows when stack tracing is enabled.
+ *
+ * @flags is the irq state recorded in the entries. It is distinct from
+ * the local irq_flags, which only protect the buffer update.
+ */
+void
+tracing_sched_switch_trace(struct trace_array *tr,
+                          struct trace_array_cpu *data,
+                          struct task_struct *prev,
+                          struct task_struct *next,
+                          unsigned long flags)
+{
+       struct trace_entry *entry;
+       unsigned long irq_flags;
+
+       raw_local_irq_save(irq_flags);
+       __raw_spin_lock(&data->lock);
+       entry                   = tracing_get_trace_entry(tr, data);
+       tracing_generic_entry_update(entry, flags);
+       entry->type             = TRACE_CTX;
+       entry->ctx.prev_pid     = prev->pid;
+       entry->ctx.prev_prio    = prev->prio;
+       entry->ctx.prev_state   = prev->state;
+       entry->ctx.next_pid     = next->pid;
+       entry->ctx.next_prio    = next->prio;
+       entry->ctx.next_state   = next->state;
+       __trace_stack(tr, data, flags, 5);
+       __raw_spin_unlock(&data->lock);
+       raw_local_irq_restore(irq_flags);
+}
+
+/*
+ * Record a TRACE_WAKE entry for @curr waking up @wakee, then call
+ * trace_wake_up().
+ *
+ * The entry reuses the context switch layout: the prev_* fields
+ * describe @curr (the waker) and the next_* fields describe @wakee.
+ * A stack trace entry follows when stack tracing is enabled.
+ */
+void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+                          struct trace_array_cpu *data,
+                          struct task_struct *wakee,
+                          struct task_struct *curr,
+                          unsigned long flags)
+{
+       struct trace_entry *entry;
+       unsigned long irq_flags;
+
+       raw_local_irq_save(irq_flags);
+       __raw_spin_lock(&data->lock);
+       entry                   = tracing_get_trace_entry(tr, data);
+       tracing_generic_entry_update(entry, flags);
+       entry->type             = TRACE_WAKE;
+       entry->ctx.prev_pid     = curr->pid;
+       entry->ctx.prev_prio    = curr->prio;
+       entry->ctx.prev_state   = curr->state;
+       entry->ctx.next_pid     = wakee->pid;
+       entry->ctx.next_prio    = wakee->prio;
+       entry->ctx.next_state   = wakee->state;
+       __trace_stack(tr, data, flags, 6);
+       __raw_spin_unlock(&data->lock);
+       raw_local_irq_restore(irq_flags);
+
+       trace_wake_up();
+}
+
+/*
+ * Record three free-form values in the global trace buffer of the
+ * current cpu.
+ *
+ * Nothing is recorded while tracing is disabled, while no tracer is
+ * selected, while the global trace is switched off, or when this cpu
+ * is already inside the tracer (the per-cpu disabled count is used as
+ * a recursion guard).
+ */
+void
+ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
+{
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *cpu_data;
+       unsigned long irqflags;
+
+       if (tracing_disabled)
+               return;
+       if (current_trace == &no_tracer)
+               return;
+       if (!tr->ctrl)
+               return;
+
+       local_irq_save(irqflags);
+       cpu_data = tr->data[raw_smp_processor_id()];
+
+       /* only the outermost caller on this cpu gets to record */
+       if (likely(atomic_inc_return(&cpu_data->disabled) == 1))
+               __trace_special(tr, cpu_data, arg1, arg2, arg3);
+
+       atomic_dec(&cpu_data->disabled);
+       local_irq_restore(irqflags);
+}
+
+#ifdef CONFIG_FTRACE
+/*
+ * Callback installed through trace_ops: record a function entry for
+ * @ip, called from @parent_ip, in the global trace buffer of the
+ * current cpu.
+ *
+ * Nothing is recorded while ftrace_function_enabled is clear, when
+ * skip_trace() rejects @ip, or when this cpu is already inside the
+ * tracer (the per-cpu disabled count is used as a recursion guard).
+ */
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *cpu_data;
+       unsigned long irqflags;
+
+       if (unlikely(!ftrace_function_enabled))
+               return;
+
+       if (skip_trace(ip))
+               return;
+
+       local_irq_save(irqflags);
+       cpu_data = tr->data[raw_smp_processor_id()];
+
+       /* only the outermost caller on this cpu gets to record */
+       if (likely(atomic_inc_return(&cpu_data->disabled) == 1))
+               trace_function(tr, cpu_data, ip, parent_ip, irqflags);
+
+       atomic_dec(&cpu_data->disabled);
+       local_irq_restore(irqflags);
+}
+
+/* the ftrace_ops through which function_trace_call() gets (un)registered */
+static struct ftrace_ops trace_ops __read_mostly =
+{
+       .func = function_trace_call,
+};
+
+/*
+ * Register the function trace callback. Recording stays switched off
+ * while the callback is being registered and is only switched on
+ * afterwards if tracer_enabled is set.
+ */
+void tracing_start_function_trace(void)
+{
+       ftrace_function_enabled = 0;
+       register_ftrace_function(&trace_ops);
+       if (tracer_enabled)
+               ftrace_function_enabled = 1;
+}
+
+/*
+ * Switch recording off first, then unregister the function trace
+ * callback.
+ */
+void tracing_stop_function_trace(void)
+{
+       ftrace_function_enabled = 0;
+       unregister_ftrace_function(&trace_ops);
+}
+#endif
+
+/* flag values describing how a trace file is to be formatted */
+enum trace_file_type {
+       /* presumably selects the latency output format - TODO confirm */
+       TRACE_FILE_LAT_FMT      = 1,
+};
+
+/*
+ * Return the entry the iterator @iter would read next from the buffer
+ * @data of @cpu, or NULL when that cpu has nothing left to read.
+ *
+ * On the first use for a cpu the iterator position is initialized from
+ * the tail of the buffer.
+ */
+static struct trace_entry *
+trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
+               struct trace_iterator *iter, int cpu)
+{
+       struct trace_entry *entries;
+       struct page *pg;
+
+       /* read as much as the buffer can hold, or as was ever written */
+       if (iter->next_idx[cpu] >= tr->entries)
+               return NULL;
+       if (iter->next_idx[cpu] >= data->trace_idx)
+               return NULL;
+
+       /* head == tail: the buffer is empty */
+       if (data->trace_head == data->trace_tail &&
+           data->trace_head_idx == data->trace_tail_idx)
+               return NULL;
+
+       if (!iter->next_page[cpu]) {
+               /* Initialize the iterator for this cpu trace buffer */
+               WARN_ON(!data->trace_tail);
+               pg = virt_to_page(data->trace_tail);
+               iter->next_page[cpu] = &pg->lru;
+               iter->next_page_idx[cpu] = data->trace_tail_idx;
+       }
+
+       pg = list_entry(iter->next_page[cpu], struct page, lru);
+       BUG_ON(&data->trace_pages == &pg->lru);
+
+       entries = page_address(pg);
+
+       WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
+       return entries + iter->next_page_idx[cpu];
+}
+
+/*
+ * Look at the next unread entry of every traced cpu and return the one
+ * with the smallest timestamp, without advancing the iterator.
+ *
+ * When @ent_cpu is not NULL, the cpu the entry belongs to is stored
+ * there (-1 if there is no entry). Returns NULL when all buffers are
+ * exhausted.
+ */
+static struct trace_entry *
+find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+{
+       struct trace_array *tr = iter->tr;
+       struct trace_entry *oldest = NULL;
+       struct trace_entry *cand;
+       int oldest_cpu = -1;
+       int cpu;
+
+       for_each_tracing_cpu(cpu) {
+               if (!head_page(tr->data[cpu]))
+                       continue;
+
+               cand = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+               if (!cand)
+                       continue;
+
+               /* keep the current pick unless the candidate is older */
+               if (oldest && cand->t >= oldest->t)
+                       continue;
+
+               oldest = cand;
+               oldest_cpu = cpu;
+       }
+
+       if (ent_cpu)
+               *ent_cpu = oldest_cpu;
+
+       return oldest;
+}
+
+/*
+ * Advance the iterator past the entry it currently points at on
+ * iter->cpu, moving on to the next page of that cpu's buffer when the
+ * end of the current page is reached.
+ */
+static void trace_iterator_increment(struct trace_iterator *iter)
+{
+       int cpu = iter->cpu;
+
+       iter->idx++;
+       iter->next_idx[cpu]++;
+
+       if (++iter->next_page_idx[cpu] < ENTRIES_PER_PAGE)
+               return;
+
+       /* ran off the end of the page: continue on the following one */
+       iter->next_page_idx[cpu] = 0;
+       iter->next_page[cpu] = trace_next_list(iter->tr->data[cpu],
+                                              iter->next_page[cpu]);
+}
+
+/*
+ * Drop the oldest entry from the buffer of iter->cpu by advancing its
+ * tail by one slot.
+ */
+static void trace_consume(struct trace_iterator *iter)
+{
+       struct trace_array_cpu *buf = iter->tr->data[iter->cpu];
+
+       if (++buf->trace_tail_idx >= ENTRIES_PER_PAGE) {
+               buf->trace_tail = trace_next_page(buf, buf->trace_tail);
+               buf->trace_tail_idx = 0;
+       }
+
+       /* Check if we empty it, then reset the index */
+       if (buf->trace_head != buf->trace_tail)
+               return;
+       if (buf->trace_head_idx != buf->trace_tail_idx)
+               return;
+
+       buf->trace_idx = 0;
+}
+
+/*
+ * Make the oldest unread entry the current entry of @iter and step
+ * the iterator past it. The previous entry and cpu are remembered in
+ * prev_ent and prev_cpu.
+ *
+ * Returns @iter when there was an entry, NULL at the end of the trace.
+ */
+static void *find_next_entry_inc(struct trace_iterator *iter)
+{
+       int cpu = -1;
+       struct trace_entry *ent = find_next_entry(iter, &cpu);
+
+       iter->prev_ent = iter->ent;
+       iter->prev_cpu = iter->cpu;
+
+       iter->ent = ent;
+       iter->cpu = cpu;
+
+       if (!ent)
+               return NULL;
+
+       trace_iterator_increment(iter);
+       return iter;
+}
+
+/*
+ * seq_file ->next() callback: move the iterator to the entry at the
+ * old value of *@pos and bump *@pos by one.
+ *
+ * @v is not used. The value returned is the iterator itself, not the
+ * entry, or NULL when the end of the trace is reached or the requested
+ * position lies behind the iterator.
+ */
+static void *s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct trace_iterator *iter = m->private;
+       void *last_ent = iter->ent;
+       int i = (int)*pos;
+       void *ent;
+
+       (*pos)++;
+
+       /* can't go backwards */
+       if (iter->idx > i)
+               return NULL;
+
+       /* idx < 0: freshly reset iterator, fetch the very first entry */
+       if (iter->idx < 0)
+               ent = find_next_entry_inc(iter);
+       else
+               ent = iter;
+
+       /* skip forward until the requested position is reached */
+       while (ent && iter->idx < i)
+               ent = find_next_entry_inc(iter);
+
+       iter->pos = *pos;
+
+       /* just ran off the end: emit the trailer once */
+       if (last_ent && !ent)
+               seq_puts(m, "\n\nvim:ft=help\n");
+
+       return ent;
+}
+
+/*
+ * seq_file ->start() callback: position the iterator at *@pos.
+ *
+ * Takes trace_types_lock and disables comm recording; both are undone
+ * in s_stop(). Returns the iterator, or NULL when the current tracer is
+ * not the one the file was opened with or *@pos is past the end.
+ */
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+       struct trace_iterator *iter = m->private;
+       void *p = NULL;
+       loff_t l = 0;
+       int i;
+
+       mutex_lock(&trace_types_lock);
+
+       /*
+        * NOTE(review): this path drops trace_types_lock itself, while
+        * s_stop() unlocks the mutex and decrements
+        * trace_record_cmdline_disabled unconditionally. If ->stop() is
+        * invoked after a NULL return from ->start(), both end up
+        * unbalanced - confirm against the seq_file core.
+        */
+       if (!current_trace || current_trace != iter->trace) {
+               mutex_unlock(&trace_types_lock);
+               return NULL;
+       }
+
+       atomic_inc(&trace_record_cmdline_disabled);
+
+       /* let the tracer grab locks here if needed */
+       if (current_trace->start)
+               current_trace->start(iter);
+
+       if (*pos != iter->pos) {
+               /* not a sequential read: rewind and walk up to *pos */
+               iter->ent = NULL;
+               iter->cpu = 0;
+               iter->idx = -1;
+               iter->prev_ent = NULL;
+               iter->prev_cpu = -1;
+
+               for_each_tracing_cpu(i) {
+                       iter->next_idx[i] = 0;
+                       iter->next_page[i] = NULL;
+               }
+
+               for (p = iter; p && l < *pos; p = s_next(m, p, &l))
+                       ;
+
+       } else {
+               /* continue where the previous read stopped */
+               l = *pos - 1;
+               p = s_next(m, p, &l);
+       }
+
+       return p;
+}
+
+/*
+ * seq_file ->stop() callback: re-enable comm recording, let the tracer
+ * release what its ->start() hook took, and drop trace_types_lock.
+ * @p is not used.
+ */
+static void s_stop(struct seq_file *m, void *p)
+{
+       struct trace_iterator *iter = m->private;
+
+       atomic_dec(&trace_record_cmdline_disabled);
+
+       /* let the tracer release locks here if needed */
+       if (current_trace && current_trace == iter->trace && iter->trace->stop)
+               iter->trace->stop(iter);
+
+       mutex_unlock(&trace_types_lock);
+}
+
+/* printed in place of a symbol when the address is the kretprobe trampoline */
+#define KRETPROBE_MSG "[unknown/kretprobe'd]"
+
+/*
+ * Return non-zero when @addr is the address of kretprobe_trampoline.
+ * Without CONFIG_KRETPROBES this is always 0.
+ */
+#ifdef CONFIG_KRETPROBES
+static inline int kretprobed(unsigned long addr)
+{
+       return addr == (unsigned long)kretprobe_trampoline;
+}
+#else
+static inline int kretprobed(unsigned long addr)
+{
+       return 0;
+}
+#endif /* CONFIG_KRETPROBES */
+
+/*
+ * Append the symbol name looked up for @address to @s, formatted with
+ * @fmt (which is given the name as its only string argument).
+ *
+ * Returns the result of trace_seq_printf(). Without CONFIG_KALLSYMS
+ * nothing is printed and 1 is returned.
+ */
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+       char str[KSYM_SYMBOL_LEN];
+
+       kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+       return trace_seq_printf(s, fmt, str);
+#endif
+       /* only reached when CONFIG_KALLSYMS is not set */
+       return 1;
+}
+
+/*
+ * Append the text sprint_symbol() produces for @address to @s,
+ * formatted with @fmt (which is given that text as its only string
+ * argument).
+ *
+ * Returns the result of trace_seq_printf(). Without CONFIG_KALLSYMS
+ * nothing is printed and 1 is returned.
+ */
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+                    unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+       char str[KSYM_SYMBOL_LEN];
+
+       sprint_symbol(str, address);
+       return trace_seq_printf(s, fmt, str);
+#endif
+       /* only reached when CONFIG_KALLSYMS is not set */
+       return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+/*
+ * Append a printable form of the instruction pointer @ip to @s.
+ *
+ * A zero @ip is printed as "0". Otherwise the symbol is printed, with
+ * its offset if TRACE_ITER_SYM_OFFSET is set in @sym_flags, and it is
+ * followed by the raw address if TRACE_ITER_SYM_ADDR is set.
+ *
+ * Returns 0 when the output did not fit in @s, non-zero otherwise.
+ */
+static int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+       int written;
+
+       if (!ip)
+               return trace_seq_printf(s, "0");
+
+       written = (sym_flags & TRACE_ITER_SYM_OFFSET) ?
+               seq_print_sym_offset(s, "%s", ip) :
+               seq_print_sym_short(s, "%s", ip);
+       if (!written)
+               return 0;
+
+       if (!(sym_flags & TRACE_ITER_SYM_ADDR))
+               return written;
+
+       return trace_seq_printf(s, " <" IP_FMT ">", ip);
+}
+
+/*
+ * Print the column legend of the latency trace format to @m. Every
+ * line starts with '#'.
+ */
+static void print_lat_help_header(struct seq_file *m)
+{
+       seq_puts(m, "#                _------=> CPU#            \n");
+       seq_puts(m, "#               / _-----=> irqs-off        \n");
+       seq_puts(m, "#              | / _----=> need-resched    \n");
+       seq_puts(m, "#              || / _---=> hardirq/softirq \n");
+       seq_puts(m, "#              ||| / _--=> preempt-depth   \n");
+       seq_puts(m, "#              |||| /                      \n");
+       seq_puts(m, "#              |||||     delay             \n");
+       seq_puts(m, "#  cmd     pid ||||| time  |   caller      \n");
+       seq_puts(m, "#     \\   /    |||||   \\   |   /           \n");
+}
+
+/*
+ * Print the two-line column header of the function trace format to @m.
+ * Both lines start with '#'.
+ */
+static void print_func_help_header(struct seq_file *m)
+{
+       seq_puts(m, "#           TASK-PID   CPU#    TIMESTAMP  FUNCTION\n");
+       seq_puts(m, "#              | |      |          |         |\n");
+}
+
+
+/*
+ * Print the summary header of a latency trace to @m: tracer name and
+ * kernel release, the recorded latency with entry counts, the task
+ * that was recorded with the latency and, when known, where the
+ * critical section started and ended.
+ */
+static void
+print_trace_header(struct seq_file *m, struct trace_iterator *iter)
+{
+       unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+       struct trace_array *tr = iter->tr;
+       struct trace_array_cpu *data = tr->data[tr->cpu];
+       struct tracer *type = current_trace;
+       unsigned long total   = 0;
+       unsigned long entries = 0;
+       int cpu;
+       const char *name = "preemption";
+
+       if (type)
+               name = type->name;
+
+       /*
+        * total: entries ever written; entries: how many of them can
+        * still be in the buffers (capped at tr->entries per cpu).
+        */
+       for_each_tracing_cpu(cpu) {
+               if (head_page(tr->data[cpu])) {
+                       total += tr->data[cpu]->trace_idx;
+                       if (tr->data[cpu]->trace_idx > tr->entries)
+                               entries += tr->entries;
+                       else
+                               entries += tr->data[cpu]->trace_idx;
+               }
+       }
+
+       seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+                  name, UTS_RELEASE);
+       seq_puts(m, "-----------------------------------"
+                "---------------------------------\n");
+       seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+                  " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+                  nsecs_to_usecs(data->saved_latency),
+                  entries,
+                  total,
+                  tr->cpu,
+                  /* M: names the preemption model the kernel was built with */
+#if defined(CONFIG_PREEMPT_NONE)
+                  "server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+                  "desktop",
+#elif defined(CONFIG_PREEMPT)
+                  "preempt",
+#else
+                  "unknown",
+#endif
+                  /* These are reserved for later use */
+                  0, 0, 0, 0);
+#ifdef CONFIG_SMP
+       seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+       seq_puts(m, ")\n");
+#endif
+       seq_puts(m, "    -----------------\n");
+       seq_printf(m, "    | task: %.16s-%d "
+                  "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
+                  data->comm, data->pid, data->uid, data->nice,
+                  data->policy, data->rt_priority);
+       seq_puts(m, "    -----------------\n");
+
+       if (data->critical_start) {
+               /* symbols are formatted into iter->seq, then flushed to m */
+               seq_puts(m, " => started at: ");
+               seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
+               trace_print_seq(m, &iter->seq);
+               seq_puts(m, "\n => ended at:   ");
+               seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
+               trace_print_seq(m, &iter->seq);
+               seq_puts(m, "\n");
+       }
+
+       seq_puts(m, "\n");
+}
+
+/*
+ * Append the leading columns of a latency format line to @s: comm and
+ * pid, the cpu number, and one character each for irqs-off ('d'),
+ * need-resched ('N') and the interrupt context ('H' for hardirq and
+ * softirq together, 'h' for hardirq, 's' for softirq), followed by the
+ * preempt count in hex. A '.' stands for "not set" or zero.
+ */
+static void
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+       char *comm = trace_find_cmdline(entry->pid);
+       int in_hardirq, in_softirq;
+       unsigned char ctx;
+
+       trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+       trace_seq_printf(s, "%d", cpu);
+       trace_seq_printf(s, "%c%c",
+                       (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+                       ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+       in_hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+       in_softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+
+       /* pick the single character that describes the irq context */
+       if (in_hardirq && in_softirq)
+               ctx = 'H';
+       else if (in_hardirq)
+               ctx = 'h';
+       else if (in_softirq)
+               ctx = 's';
+       else
+               ctx = '.';
+       trace_seq_putc(s, ctx);
+
+       if (!entry->preempt_count)
+               trace_seq_puts(s, ".");
+       else
+               trace_seq_printf(s, "%x", entry->preempt_count);
+}
+
+/* delays above this many usecs are flagged with '!' in latency output */
+unsigned long preempt_mark_thresh = 100;
+
+/*
+ * Append the timestamp column of a latency format line to @s: the
+ * absolute time @abs_usecs, followed by a marker derived from the
+ * delay @rel_usecs: '!' above preempt_mark_thresh, '+' above 1, and
+ * a blank otherwise.
+ */
+static void
+lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
+                   unsigned long rel_usecs)
+{
+       const char *marker;
+
+       trace_seq_printf(s, " %4lldus", abs_usecs);
+
+       if (rel_usecs > preempt_mark_thresh)
+               marker = "!: ";
+       else if (rel_usecs > 1)
+               marker = "+: ";
+       else
+               marker = " : ";
+
+       trace_seq_puts(s, marker);
+}
+
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+/*
+ * Print the current entry (iter->ent) in latency-trace format.
+ *
+ * @iter:      iterator whose ->ent is printed into iter->seq
+ * @trace_idx: index of the entry, shown only in verbose mode
+ * @cpu:       CPU number to print for the entry
+ *
+ * The relative time is the distance to the next entry as reported by
+ * find_next_entry(); when there is no next entry it is zero.
+ * Always returns 1.
+ */
+static int
+print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
+{
+       struct trace_seq *s = &iter->seq;
+       unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+       struct trace_entry *next_entry = find_next_entry(iter, NULL);
+       unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+       struct trace_entry *entry = iter->ent;
+       unsigned long abs_usecs;
+       unsigned long rel_usecs;
+       char *comm;
+       int S, T;
+       int i;
+       unsigned state;
+
+       if (!next_entry)
+               next_entry = entry;
+       rel_usecs = ns2usecs(next_entry->t - entry->t);
+       abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
+
+       if (verbose) {
+               comm = trace_find_cmdline(entry->pid);
+               /*
+                * The timestamp is cast explicitly so the conversion
+                * matches the argument width on every architecture, and
+                * the unsigned usec values use unsigned conversions.
+                */
+               trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08llx]"
+                                " %lu.%03lums (+%lu.%03lums): ",
+                                comm,
+                                entry->pid, cpu, entry->flags,
+                                entry->preempt_count, trace_idx,
+                                (unsigned long long)ns2usecs(entry->t),
+                                abs_usecs/1000,
+                                abs_usecs % 1000, rel_usecs/1000,
+                                rel_usecs % 1000);
+       } else {
+               lat_print_generic(s, entry, cpu);
+               lat_print_timestamp(s, abs_usecs, rel_usecs);
+       }
+       switch (entry->type) {
+       case TRACE_FN:
+               seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+               trace_seq_puts(s, " (");
+               if (kretprobed(entry->fn.parent_ip))
+                       trace_seq_puts(s, KRETPROBE_MSG);
+               else
+                       seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
+               trace_seq_puts(s, ")\n");
+               break;
+       case TRACE_CTX:
+       case TRACE_WAKE:
+               /*
+                * Task states are bit flags: translate the lowest set
+                * bit into an index (0 means no bit set) and keep the
+                * index off the string's NUL terminator.  Both states
+                * are decoded the same way.
+                */
+               state = entry->ctx.next_state ? __ffs(entry->ctx.next_state) + 1 : 0;
+               T = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+
+               state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+               S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+               comm = trace_find_cmdline(entry->ctx.next_pid);
+               trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
+                                entry->ctx.prev_pid,
+                                entry->ctx.prev_prio,
+                                S, entry->type == TRACE_CTX ? "==>" : "  +",
+                                entry->ctx.next_pid,
+                                entry->ctx.next_prio,
+                                T, comm);
+               break;
+       case TRACE_SPECIAL:
+               trace_seq_printf(s, "# %ld %ld %ld\n",
+                                entry->special.arg1,
+                                entry->special.arg2,
+                                entry->special.arg3);
+               break;
+       case TRACE_STACK:
+               for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+                       if (i)
+                               trace_seq_puts(s, " <= ");
+                       seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
+               }
+               trace_seq_puts(s, "\n");
+               break;
+       default:
+               trace_seq_printf(s, "Unknown type %d\n", entry->type);
+       }
+       return 1;
+}
+
+/*
+ * Print the current entry (iter->ent) in the default trace format:
+ * "comm-pid [cpu] secs.usecs: " followed by type-specific text.
+ *
+ * Returns 1 on success and 0 as soon as a write into iter->seq
+ * reports failure.
+ */
+static int print_trace_fmt(struct trace_iterator *iter)
+{
+       struct trace_seq *s = &iter->seq;
+       unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+       struct trace_entry *entry;
+       unsigned long usec_rem;
+       unsigned long long t;
+       unsigned long secs;
+       unsigned int state;
+       char *comm;
+       int ret;
+       int S, T;
+       int i;
+
+       entry = iter->ent;
+
+       comm = trace_find_cmdline(iter->ent->pid);
+
+       /* split the microsecond timestamp into seconds and remainder */
+       t = ns2usecs(entry->t);
+       usec_rem = do_div(t, 1000000ULL);
+       secs = (unsigned long)t;
+
+       ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+       if (!ret)
+               return 0;
+       ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
+       if (!ret)
+               return 0;
+       ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
+       if (!ret)
+               return 0;
+
+       switch (entry->type) {
+       case TRACE_FN:
+               ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
+               if (!ret)
+                       return 0;
+               if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
+                                               entry->fn.parent_ip) {
+                       ret = trace_seq_printf(s, " <-");
+                       if (!ret)
+                               return 0;
+                       if (kretprobed(entry->fn.parent_ip))
+                               ret = trace_seq_puts(s, KRETPROBE_MSG);
+                       else
+                               ret = seq_print_ip_sym(s, entry->fn.parent_ip,
+                                                      sym_flags);
+                       if (!ret)
+                               return 0;
+               }
+               ret = trace_seq_printf(s, "\n");
+               if (!ret)
+                       return 0;
+               break;
+       case TRACE_CTX:
+       case TRACE_WAKE:
+               /*
+                * Task states are bit flags, not array indexes: use the
+                * lowest set bit (0 means no bit set) and keep the index
+                * off the string's NUL terminator, as print_lat_fmt()
+                * does for the previous state.
+                */
+               state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+               S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+               state = entry->ctx.next_state ? __ffs(entry->ctx.next_state) + 1 : 0;
+               T = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+               ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
+                                      entry->ctx.prev_pid,
+                                      entry->ctx.prev_prio,
+                                      S,
+                                      entry->type == TRACE_CTX ? "==>" : "  +",
+                                      entry->ctx.next_pid,
+                                      entry->ctx.next_prio,
+                                      T);
+               if (!ret)
+                       return 0;
+               break;
+       case TRACE_SPECIAL:
+               ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+                                entry->special.arg1,
+                                entry->special.arg2,
+                                entry->special.arg3);
+               if (!ret)
+                       return 0;
+               break;
+       case TRACE_STACK:
+               for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+                       if (i) {
+                               ret = trace_seq_puts(s, " <= ");
+                               if (!ret)
+                                       return 0;
+                       }
+                       ret = seq_print_ip_sym(s, entry->stack.caller[i],
+                                              sym_flags);
+                       if (!ret)
+                               return 0;
+               }
+               ret = trace_seq_puts(s, "\n");
+               if (!ret)
+                       return 0;
+               break;
+       }
+       return 1;
+}
+
+/*
+ * Print the current entry (iter->ent) as space-separated raw numbers:
+ * "pid cpu timestamp" followed by type-specific fields.
+ *
+ * Returns 1 on success and 0 when a write into iter->seq fails.
+ */
+static int print_raw_fmt(struct trace_iterator *iter)
+{
+       struct trace_seq *s = &iter->seq;
+       struct trace_entry *entry;
+       unsigned int state;
+       int ret;
+       int S, T;
+
+       entry = iter->ent;
+
+       ret = trace_seq_printf(s, "%d %d %llu ",
+               entry->pid, iter->cpu, entry->t);
+       if (!ret)
+               return 0;
+
+       switch (entry->type) {
+       case TRACE_FN:
+               /*
+                * These are code addresses; print them at full width
+                * instead of truncating them to an unsigned int.
+                */
+               ret = trace_seq_printf(s, "%lx %lx\n",
+                                       (unsigned long)entry->fn.ip,
+                                       (unsigned long)entry->fn.parent_ip);
+               if (!ret)
+                       return 0;
+               break;
+       case TRACE_CTX:
+       case TRACE_WAKE:
+               /*
+                * Task states are bit flags, not array indexes: use the
+                * lowest set bit (0 means no bit set) and keep the index
+                * off the string's NUL terminator.
+                */
+               state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
+               S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+               state = entry->ctx.next_state ? __ffs(entry->ctx.next_state) + 1 : 0;
+               T = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+               /* wakeups are flagged with '+' in place of the previous state */
+               if (entry->type == TRACE_WAKE)
+                       S = '+';
+               ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+                                      entry->ctx.prev_pid,
+                                      entry->ctx.prev_prio,
+                                      S,
+                                      entry->ctx.next_pid,
+                                      entry->ctx.next_prio,
+                                      T);
+               if (!ret)
+                       return 0;
+               break;
+       case TRACE_SPECIAL:
+       case TRACE_STACK:
+               ret = trace_seq_printf(s, "# %ld %ld %ld\n",
+                                entry->special.arg1,
+                                entry->special.arg2,
+                                entry->special.arg3);
+               if (!ret)
+                       return 0;
+               break;
+       }
+       return 1;
+}
+
+#define SEQ_PUT_FIELD_RET(s, x)                                \
+do {                                                   \
+       if (!trace_seq_putmem(s, &(x), sizeof(x)))      \
+               return 0;                               \
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x)                    \
+do {                                                   \
+       if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))  \
+               return 0;                               \
+} while (0)
+
+static int print_hex_fmt(struct trace_iterator *iter)
+{
+       struct trace_seq *s = &iter->seq;
+       unsigned char newline = '\n';
+       struct trace_entry *entry;
+       int S, T;
+
+       entry = iter->ent;
+
+       SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+       SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+       SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+
+       switch (entry->type) {
+       case TRACE_FN:
+               SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
+               SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+               break;
+       case TRACE_CTX:
+       case TRACE_WAKE:
+               S = entry->ctx.prev_state < sizeof(state_to_char) ?
+                       state_to_char[entry->ctx.prev_state] : 'X';
+               T = entry->ctx.next_state < sizeof(state_to_char) ?
+                       state_to_char[entry->ctx.next_state] : 'X';
+               if (entry->type == TRACE_WAKE)
+                       S = '+';
+               SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
+               SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
+               SEQ_PUT_HEX_FIELD_RET(s, S);
+               SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
+               SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
+               SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
+               SEQ_PUT_HEX_FIELD_RET(s, T);
+               break;
+       case TRACE_SPECIAL:
+       case TRACE_STACK:
+               SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
+               SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
+               SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
+               break;
+       }
+       SEQ_PUT_FIELD_RET(s, newline);
+
+       return 1;
+}
+
+/*
+ * Emit the current entry (iter->ent) as raw binary fields: pid, cpu
+ * and timestamp, followed by the type-specific fields.
+ *
+ * Returns 1 on success and 0 when a write into iter->seq fails.
+ */
+static int print_bin_fmt(struct trace_iterator *iter)
+{
+       struct trace_seq *s = &iter->seq;
+       struct trace_entry *entry;
+
+       entry = iter->ent;
+
+       SEQ_PUT_FIELD_RET(s, entry->pid);
+       SEQ_PUT_FIELD_RET(s, entry->cpu);
+       SEQ_PUT_FIELD_RET(s, entry->t);
+
+       switch (entry->type) {
+       case TRACE_FN:
+               SEQ_PUT_FIELD_RET(s, entry->fn.ip);
+               SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
+               break;
+       case TRACE_CTX:
+       case TRACE_WAKE:
+               /*
+                * Wakeup entries carry the same ctx fields; the other
+                * formatters handle both types together as well.
+                */
+               SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
+               SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
+               SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
+               SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
+               SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
+               SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+               break;
+       case TRACE_SPECIAL:
+       case TRACE_STACK:
+               SEQ_PUT_FIELD_RET(s, entry->special.arg1);
+               SEQ_PUT_FIELD_RET(s, entry->special.arg2);
+               SEQ_PUT_FIELD_RET(s, entry->special.arg3);
+               break;
+       }
+       return 1;
+}
+
+/*
+ * Report whether every per-CPU buffer of the iterator's trace array is
+ * empty.  Returns 0 as soon as one CPU has a head page, a non-zero
+ * trace_idx and differing head/tail positions; returns 1 otherwise.
+ */
+static int trace_empty(struct trace_iterator *iter)
+{
+       int cpu;
+
+       for_each_tracing_cpu(cpu) {
+               struct trace_array_cpu *data = iter->tr->data[cpu];
+
+               if (!head_page(data) || !data->trace_idx)
+                       continue;
+
+               /* head and tail at the same spot: nothing to read here */
+               if (data->trace_tail == data->trace_head &&
+                   data->trace_tail_idx == data->trace_head_idx)
+                       continue;
+
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * Format the current entry into iter->seq.  A tracer-supplied
+ * print_line callback takes precedence; otherwise the output format is
+ * chosen from the bin, hex and raw trace flags (in that order), then
+ * the latency-format file flag, and finally the default format.
+ * Returns whatever the selected formatter returns.
+ */
+static int print_trace_line(struct trace_iterator *iter)
+{
+       int ret;
+
+       if (iter->trace && iter->trace->print_line)
+               ret = iter->trace->print_line(iter);
+       else if (trace_flags & TRACE_ITER_BIN)
+               ret = print_bin_fmt(iter);
+       else if (trace_flags & TRACE_ITER_HEX)
+               ret = print_hex_fmt(iter);
+       else if (trace_flags & TRACE_ITER_RAW)
+               ret = print_raw_fmt(iter);
+       else if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+               ret = print_lat_fmt(iter, iter->idx, iter->cpu);
+       else
+               ret = print_trace_fmt(iter);
+
+       return ret;
+}
+
+/*
+ * seq_file ->show callback.  With no current entry (iter->ent == NULL)
+ * it prints the file headers; otherwise it formats the current entry
+ * and flushes iter->seq into the seq_file.  Always returns 0.
+ */
+static int s_show(struct seq_file *m, void *v)
+{
+       struct trace_iterator *iter = v;
+
+       if (iter->ent == NULL) {
+               /*
+                * iter->trace is a snapshot of current_trace, which the
+                * open path treats as possibly NULL, so check it before
+                * reading the tracer name.
+                */
+               if (iter->tr && iter->trace) {
+                       seq_printf(m, "# tracer: %s\n", iter->trace->name);
+                       seq_puts(m, "#\n");
+               }
+               if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+                       /* print nothing if the buffers are empty */
+                       if (trace_empty(iter))
+                               return 0;
+                       print_trace_header(m, iter);
+                       if (!(trace_flags & TRACE_ITER_VERBOSE))
+                               print_lat_help_header(m);
+               } else {
+                       if (!(trace_flags & TRACE_ITER_VERBOSE))
+                               print_func_help_header(m);
+               }
+       } else {
+               print_trace_line(iter);
+               trace_print_seq(m, &iter->seq);
+       }
+
+       return 0;
+}
+
+static struct seq_operations tracer_seq_ops = {        /* passed to seq_open() by __tracing_open() */
+       .start          = s_start,
+       .next           = s_next,
+       .stop           = s_stop,
+       .show           = s_show,       /* prints the headers or one trace entry */
+};
+
+/*
+ * Common open path for the seq_file based trace files.
+ *
+ * Allocates an iterator, binds it to either max_tr (when the current
+ * tracer sets print_max) or the trace array stored in the inode, and
+ * attaches it to a freshly opened seq_file.  Tracing is switched off
+ * while the file is open if the trace array's ctrl is set.
+ *
+ * Returns the iterator, or NULL on failure; *ret receives 0 or the
+ * negative error code.
+ */
+static struct trace_iterator *
+__tracing_open(struct inode *inode, struct file *file, int *ret)
+{
+       struct trace_iterator *iter;
+       struct seq_file *seq;
+
+       if (tracing_disabled) {
+               *ret = -ENODEV;
+               return NULL;
+       }
+
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (iter == NULL) {
+               *ret = -ENOMEM;
+               return NULL;
+       }
+
+       mutex_lock(&trace_types_lock);
+
+       if (current_trace && current_trace->print_max)
+               iter->tr = &max_tr;
+       else
+               iter->tr = inode->i_private;
+       iter->trace = current_trace;
+       iter->pos = -1;
+
+       /* TODO stop tracer */
+       *ret = seq_open(file, &tracer_seq_ops);
+       if (*ret) {
+               kfree(iter);
+               iter = NULL;
+               goto unlock;
+       }
+
+       seq = file->private_data;
+       seq->private = iter;
+
+       /* stop the trace while dumping */
+       if (iter->tr->ctrl) {
+               tracer_enabled = 0;
+               ftrace_function_enabled = 0;
+       }
+
+       if (iter->trace && iter->trace->open)
+               iter->trace->open(iter);
+
+ unlock:
+       mutex_unlock(&trace_types_lock);
+       return iter;
+}
+
+/*
+ * Generic open: hand the inode's private pointer to the file so the
+ * read/write handlers can find it.  Fails with -ENODEV when tracing is
+ * disabled.
+ */
+int tracing_open_generic(struct inode *inode, struct file *filp)
+{
+       if (!tracing_disabled) {
+               filp->private_data = inode->i_private;
+               return 0;
+       }
+
+       return -ENODEV;
+}
+
+/*
+ * Release a file opened through __tracing_open(): run the tracer's
+ * close hook, re-enable tracing if the trace array's ctrl is set, then
+ * tear down the seq_file and free the iterator.  Always returns 0.
+ */
+int tracing_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+       struct trace_iterator *iter = seq->private;
+
+       mutex_lock(&trace_types_lock);
+
+       if (iter->trace && iter->trace->close)
+               iter->trace->close(iter);
+
+       if (iter->tr->ctrl) {
+               /* tracing was on before the open path stopped it */
+               tracer_enabled = 1;
+               /* enabling function tracing is safe even when unused */
+               ftrace_function_enabled = 1;
+       }
+
+       mutex_unlock(&trace_types_lock);
+
+       seq_release(inode, file);
+       kfree(iter);
+
+       return 0;
+}
+
+/* Open handler for the default-format trace file. */
+static int tracing_open(struct inode *inode, struct file *file)
+{
+       int err;
+
+       /* only the status matters here; the iterator lives in the seq_file */
+       __tracing_open(inode, file, &err);
+
+       return err;
+}
+
+/*
+ * Open handler for the latency-format trace file: same as
+ * tracing_open(), but marks the iterator with TRACE_FILE_LAT_FMT.
+ */
+static int tracing_lt_open(struct inode *inode, struct file *file)
+{
+       struct trace_iterator *iter;
+       int err;
+
+       iter = __tracing_open(inode, file, &err);
+       if (err)
+               return err;
+
+       iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
+       return 0;
+}
+
+
+/*
+ * seq_file ->next callback for the tracer list: bump the position and
+ * step the cursor kept in m->private to the following tracer.
+ * Returns the new cursor, or NULL at the end of the list.
+ */
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct tracer *cur = m->private;
+       struct tracer *nxt = cur ? cur->next : NULL;
+
+       (*pos)++;
+       m->private = nxt;
+
+       return nxt;
+}
+
+/*
+ * seq_file ->start callback for the tracer list.  Takes
+ * trace_types_lock (dropped again in t_stop()) and returns the tracer
+ * at position *pos, or NULL when *pos is past the end.
+ */
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+       struct tracer *t;
+       loff_t l = 0;
+
+       mutex_lock(&trace_types_lock);
+
+       /*
+        * Always restart from the head of the list.  m->private is the
+        * cursor advanced by t_next(), so after an earlier walk it no
+        * longer points at the first tracer and cannot be used as the
+        * starting point for position 0.
+        */
+       t = trace_types;
+       m->private = t;
+       for (; t && l < *pos; t = t_next(m, t, &l))
+               ;
+
+       return t;
+}
+
+/* seq_file ->stop callback: drop the lock taken in t_start(). */
+static void t_stop(struct seq_file *m, void *v)
+{
+       mutex_unlock(&trace_types_lock);
+}
+
+/*
+ * seq_file ->show callback: print one tracer name, followed by a space
+ * when more tracers follow and by a newline after the last one.
+ */
+static int t_show(struct seq_file *m, void *v)
+{
+       struct tracer *t = v;
+
+       if (t == NULL)
+               return 0;
+
+       seq_printf(m, "%s", t->name);
+       seq_putc(m, t->next ? ' ' : '\n');
+
+       return 0;
+}
+
+static struct seq_operations show_traces_seq_ops = {   /* walks the trace_types list; see show_traces_open() */
+       .start          = t_start,      /* takes trace_types_lock */
+       .next           = t_next,
+       .stop           = t_stop,       /* releases trace_types_lock */
+       .show           = t_show,
+};
+
+/*
+ * Open handler for the tracer list file: set up a seq_file whose
+ * cursor (m->private) starts at the head of trace_types.
+ * Returns 0, -ENODEV when tracing is disabled, or the seq_open() error.
+ */
+static int show_traces_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq;
+       int err;
+
+       if (tracing_disabled)
+               return -ENODEV;
+
+       err = seq_open(file, &show_traces_seq_ops);
+       if (err)
+               return err;
+
+       seq = file->private_data;
+       seq->private = trace_types;
+
+       return err;
+}
+
+static struct file_operations tracing_fops = { /* seq_file-backed trace output, default format */
+       .open           = tracing_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = tracing_release,      /* frees the iterator allocated at open */
+};
+
+static struct file_operations tracing_lt_fops = {      /* same, but the open sets TRACE_FILE_LAT_FMT */
+       .open           = tracing_lt_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = tracing_release,
+};
+
+static struct file_operations show_traces_fops = {     /* lists the tracers on trace_types */
+       .open           = show_traces_open,
+       .read           = seq_read,
+       .release        = seq_release,
+};
+
+/*
+ * Only trace on a CPU if its bit is set in this mask:
+ */
+static cpumask_t tracing_cpumask = CPU_MASK_ALL;
+
+/*
+ * When the tracing cpumask file is written, the user's mask is parsed
+ * into this staging copy first; it is installed into tracing_cpumask
+ * once the per-CPU disabled counters have been adjusted:
+ */
+static cpumask_t tracing_cpumask_new;
+
+/*
+ * The tracer itself will not take this lock, but still we want
+ * to provide a consistent cpumask to user-space.  Both the read and
+ * the write handler hold it for their whole operation:
+ */
+static DEFINE_MUTEX(tracing_cpumask_update_lock);
+
+/*
+ * Temporary storage for the character representation of the
+ * CPU bitmask, plus one byte for the newline and one for the
+ * terminating NUL:
+ */
+static char mask_str[NR_CPUS + 2];
+
+/*
+ * Read handler for the tracing cpumask file: format tracing_cpumask
+ * followed by a newline and copy it to user space.
+ *
+ * Returns the number of bytes copied, -EINVAL when the user buffer
+ * cannot hold the mask plus newline, or the error from
+ * simple_read_from_buffer().
+ */
+static ssize_t
+tracing_cpumask_read(struct file *filp, char __user *ubuf,
+                    size_t count, loff_t *ppos)
+{
+       ssize_t ret;
+       int len;
+
+       mutex_lock(&tracing_cpumask_update_lock);
+
+       /*
+        * Bound the formatting by the kernel buffer, not by the size of
+        * the user buffer, and keep one byte back for the newline.
+        */
+       len = cpumask_scnprintf(mask_str, sizeof(mask_str) - 1,
+                               tracing_cpumask);
+       if (count < (size_t)len + 2) {
+               ret = -EINVAL;
+               goto out_err;
+       }
+       len += sprintf(mask_str + len, "\n");
+
+       /* hand out only what was formatted, not the whole buffer */
+       ret = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
+
+out_err:
+       mutex_unlock(&tracing_cpumask_update_lock);
+
+       return ret;
+}
+
+/*
+ * Write handler for the tracing cpumask file: parse the user's mask
+ * and install it.  For every CPU whose bit changes, the per-CPU
+ * disabled counter of global_trace is raised (bit cleared) or lowered
+ * (bit set) with interrupts off and ftrace_max_lock held.
+ *
+ * Returns count on success or the parse error.
+ */
+static ssize_t
+tracing_cpumask_write(struct file *filp, const char __user *ubuf,
+                     size_t count, loff_t *ppos)
+{
+       int cpu;
+       int err;
+
+       mutex_lock(&tracing_cpumask_update_lock);
+
+       err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+       if (err) {
+               mutex_unlock(&tracing_cpumask_update_lock);
+               return err;
+       }
+
+       raw_local_irq_disable();
+       __raw_spin_lock(&ftrace_max_lock);
+       for_each_tracing_cpu(cpu) {
+               int was_set = cpu_isset(cpu, tracing_cpumask);
+               int will_be_set = cpu_isset(cpu, tracing_cpumask_new);
+
+               /* only CPUs whose bit flips need their counter touched */
+               if (was_set && !will_be_set)
+                       atomic_inc(&global_trace.data[cpu]->disabled);
+               else if (!was_set && will_be_set)
+                       atomic_dec(&global_trace.data[cpu]->disabled);
+       }
+       __raw_spin_unlock(&ftrace_max_lock);
+       raw_local_irq_enable();
+
+       tracing_cpumask = tracing_cpumask_new;
+
+       mutex_unlock(&tracing_cpumask_update_lock);
+
+       return count;
+}
+
+static struct file_operations tracing_cpumask_fops = { /* read/write access to tracing_cpumask */
+       .open           = tracing_open_generic,
+       .read           = tracing_cpumask_read,
+       .write          = tracing_cpumask_write,
+};
+
+/*
+ * Read handler for the iterator options file: list every entry of
+ * trace_options, prefixed with "no" when its bit in trace_flags is
+ * clear, separated by spaces and terminated by a newline.
+ *
+ * Returns the simple_read_from_buffer() result, or -ENOMEM.
+ */
+static ssize_t
+tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+       int max_len = 0;
+       int used = 0;
+       char *buf;
+       int i;
+
+       /* calculate max size: each name plus "no" and a space */
+       for (i = 0; trace_options[i]; i++)
+               max_len += strlen(trace_options[i]) + 3;
+
+       /* +2 for \n and \0 */
+       buf = kmalloc(max_len + 2, GFP_KERNEL);
+       if (buf == NULL)
+               return -ENOMEM;
+
+       for (i = 0; trace_options[i]; i++) {
+               if (!(trace_flags & (1 << i)))
+                       used += sprintf(buf + used, "no%s ", trace_options[i]);
+               else
+                       used += sprintf(buf + used, "%s ", trace_options[i]);
+       }
+
+       used += sprintf(buf + used, "\n");
+       WARN_ON(used >= max_len + 2);
+
+       used = simple_read_from_buffer(ubuf, cnt, ppos, buf, used);
+
+       kfree(buf);
+
+       return used;
+}
+
+/*
+ * Write handler for the iterator options file: set the trace_flags bit
+ * of the option whose name the input starts with, or clear it when the
+ * input starts with "no".
+ *
+ * Returns cnt on success, -EINVAL when the input is too long or no
+ * option matches, or -EFAULT when the user buffer cannot be read.
+ */
+static ssize_t
+tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+{
+       char buf[64];
+       char *name = buf;
+       int clear = 0;
+       int i;
+
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       if (!strncmp(buf, "no", 2)) {
+               clear = 1;
+               name += 2;
+       }
+
+       for (i = 0; trace_options[i]; i++) {
+               /* compare only the option's own length */
+               if (strncmp(name, trace_options[i],
+                           strlen(trace_options[i])) != 0)
+                       continue;
+
+               if (clear)
+                       trace_flags &= ~(1 << i);
+               else
+                       trace_flags |= (1 << i);
+               break;
+       }
+
+       /* the loop ran off the end of the table: nothing matched */
+       if (!trace_options[i])
+               return -EINVAL;
+
+       filp->f_pos += cnt;
+
+       return cnt;
+}
+
+static struct file_operations tracing_iter_fops = {    /* read/write access to the trace_flags option bits */
+       .open           = tracing_open_generic,
+       .read           = tracing_iter_ctrl_read,
+       .write          = tracing_iter_ctrl_write,
+};
+
+/*
+ * Text returned by tracing_readme_read().  Lines starting with "# "
+ * are shell commands; the other lines show their expected output.
+ */
+static const char readme_msg[] =
+       "tracing mini-HOWTO:\n\n"
+       "# mkdir /debug\n"
+       "# mount -t debugfs nodev /debug\n\n"
+       "# cat /debug/tracing/available_tracers\n"
+       "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+       "# cat /debug/tracing/current_tracer\n"
+       "none\n"
+       "# echo sched_switch > /debug/tracing/current_tracer\n"
+       "# cat /debug/tracing/current_tracer\n"
+       "sched_switch\n"
+       "# cat /debug/tracing/iter_ctrl\n"
+       "noprint-parent nosym-offset nosym-addr noverbose\n"
+       "# echo print-parent > /debug/tracing/iter_ctrl\n"
+       "# echo 1 > /debug/tracing/tracing_enabled\n"
+       "# cat /debug/tracing/trace > /tmp/trace.txt\n"
+       "# echo 0 > /debug/tracing/tracing_enabled\n"
+;
+
+/* Read handler for the README file: serve the static readme_msg text. */
+static ssize_t
+tracing_readme_read(struct file *filp, char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+       size_t msg_len = strlen(readme_msg);
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, readme_msg, msg_len);
+}
+
+static struct file_operations tracing_readme_fops = {  /* read-only: serves readme_msg */
+       .open           = tracing_open_generic,
+       .read           = tracing_readme_read,
+};
+
+/*
+ * Read handler for the tracing control file: report the ctrl value of
+ * the trace array stored in the file's private data as a decimal
+ * number followed by a newline.
+ */
+static ssize_t
+tracing_ctrl_read(struct file *filp, char __user *ubuf,
+                 size_t cnt, loff_t *ppos)
+{
+       struct trace_array *tr = filp->private_data;
+       char text[64];
+       int len = sprintf(text, "%ld\n", tr->ctrl);
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
+}
+
+/*
+ * Write handler for the tracing control file: parse a decimal number,
+ * reduce it to 0 or 1 and, when it differs from the trace array's
+ * current ctrl value, switch tracer_enabled accordingly and notify the
+ * current tracer through its ctrl_update hook.
+ *
+ * Returns cnt on success, -EINVAL when the input is too long, -EFAULT
+ * when the user buffer cannot be read, or the strict_strtoul() error.
+ */
+static ssize_t
+tracing_ctrl_write(struct file *filp, const char __user *ubuf,
+                  size_t cnt, loff_t *ppos)
+{
+       struct trace_array *tr = filp->private_data;
+       char buf[64];
+       unsigned long val;      /* strict_strtoul() stores an unsigned long */
+       int ret;
+
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       ret = strict_strtoul(buf, 10, &val);
+       if (ret < 0)
+               return ret;
+
+       /* any non-zero value means "on" */
+       val = !!val;
+
+       mutex_lock(&trace_types_lock);
+       if (tr->ctrl ^ val) {
+               if (val)
+                       tracer_enabled = 1;
+               else
+                       tracer_enabled = 0;
+
+               tr->ctrl = val;
+
+               if (current_trace && current_trace->ctrl_update)
+                       current_trace->ctrl_update(tr);
+       }
+       mutex_unlock(&trace_types_lock);
+
+       filp->f_pos += cnt;
+
+       return cnt;
+}
+
+/*
+ * Read handler for the current tracer file: report the name of
+ * current_trace followed by a newline, or just a newline when no
+ * tracer is set.
+ */
+static ssize_t
+tracing_set_trace_read(struct file *filp, char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+       char name[max_tracer_type_len+2];
+       int len;
+
+       /* current_trace can be switched by a concurrent write */
+       mutex_lock(&trace_types_lock);
+       if (!current_trace)
+               len = sprintf(name, "\n");
+       else
+               len = sprintf(name, "%s\n", current_trace->name);
+       mutex_unlock(&trace_types_lock);
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, name, len);
+}
+
+/*
+ * Write handler for the current tracer file: look up the tracer named
+ * by the input (trailing whitespace ignored) and make it the current
+ * tracer, resetting the old one and initialising the new one.
+ * Selecting the tracer that is already current is a no-op.
+ *
+ * Input longer than max_tracer_type_len is truncated to that length.
+ *
+ * Returns the number of bytes consumed, -EFAULT when the user buffer
+ * cannot be read, or -EINVAL when no tracer has the given name.
+ */
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+{
+       struct trace_array *tr = &global_trace;
+       struct tracer *t;
+       char buf[max_tracer_type_len+1];
+       ssize_t ret;
+       int i;
+
+       if (cnt > max_tracer_type_len)
+               cnt = max_tracer_type_len;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       /* strip ending whitespace. */
+       for (i = cnt - 1; i > 0 && isspace((unsigned char)buf[i]); i--)
+               buf[i] = 0;
+
+       ret = cnt;
+
+       mutex_lock(&trace_types_lock);
+       for (t = trace_types; t; t = t->next) {
+               if (strcmp(t->name, buf) == 0)
+                       break;
+       }
+       if (!t) {
+               /* report an unknown name instead of pretending success */
+               ret = -EINVAL;
+               goto out;
+       }
+       if (t == current_trace)
+               goto out;
+
+       if (current_trace && current_trace->reset)
+               current_trace->reset(tr);
+
+       current_trace = t;
+       if (t->init)
+               t->init(tr);
+
+ out:
+       mutex_unlock(&trace_types_lock);
+
+       if (ret > 0)
+               filp->f_pos += ret;
+
+       return ret;
+}
+
+/*
+ * Read handler for a latency value file.  The file's private data
+ * points at an unsigned long holding nanoseconds; it is reported in
+ * microseconds, except that the all-ones value is reported as -1.
+ */
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       unsigned long *ptr = filp->private_data;
+       char buf[64];
+       long usecs;
+       int r;
+
+       /* keep both arms of the choice in the signed type that is printed */
+       if (*ptr == (unsigned long)-1)
+               usecs = -1;
+       else
+               usecs = (long)nsecs_to_usecs(*ptr);
+
+       r = snprintf(buf, sizeof(buf), "%ld\n", usecs);
+       /*
+        * snprintf() returns the length it wanted to write; on
+        * truncation only sizeof(buf) - 1 characters are in the buffer.
+        */
+       if (r >= (int)sizeof(buf))
+               r = sizeof(buf) - 1;
+       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+/*
+ * Write handler for a latency value file: parse a decimal number of
+ * microseconds and store it, multiplied by 1000, in the unsigned long
+ * the file's private data points at.
+ *
+ * Returns cnt on success, -EINVAL when the input is too long, -EFAULT
+ * when the user buffer cannot be read, or the strict_strtoul() error.
+ */
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+                     size_t cnt, loff_t *ppos)
+{
+       /* same type as the read side and as strict_strtoul() expects */
+       unsigned long *ptr = filp->private_data;
+       char buf[64];
+       unsigned long val;
+       int ret;
+
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       ret = strict_strtoul(buf, 10, &val);
+       if (ret < 0)
+               return ret;
+
+       *ptr = val * 1000;
+
+       return cnt;
+}
+
+/* Number of openers of the trace pipe; at most one is allowed. */
+static atomic_t tracing_reader;
+
+/*
+ * Open handler for the trace pipe.  Only a single reader may have the
+ * pipe open at a time.  On success an iterator bound to global_trace
+ * and the current tracer is stored in the file's private data.
+ *
+ * Returns 0, -ENODEV when tracing is disabled, -EBUSY when the pipe is
+ * already open, or -ENOMEM.
+ */
+static int tracing_open_pipe(struct inode *inode, struct file *filp)
+{
+       struct trace_iterator *iter;
+
+       if (tracing_disabled)
+               return -ENODEV;
+
+       /* We only allow one reader of the pipe */
+       if (atomic_inc_return(&tracing_reader) != 1) {
+               atomic_dec(&tracing_reader);
+               return -EBUSY;
+       }
+
+       /* create a buffer to store the information to pass to userspace */
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter) {
+               /* give the reader slot back, or the pipe stays busy forever */
+               atomic_dec(&tracing_reader);
+               return -ENOMEM;
+       }
+
+       mutex_lock(&trace_types_lock);
+       iter->tr = &global_trace;
+       iter->trace = current_trace;
+       filp->private_data = iter;
+
+       if (iter->trace && iter->trace->pipe_open)
+               iter->trace->pipe_open(iter);
+       mutex_unlock(&trace_types_lock);
+
+       return 0;
+}
+
+/*
+ * Release handler for the trace pipe: free the iterator allocated at
+ * open time and give up the single reader slot.  Always returns 0.
+ */
+static int tracing_release_pipe(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+       atomic_dec(&tracing_reader);
+
+       return 0;
+}
+
+/*
+ * Poll handler for the trace pipe.  With TRACE_ITER_BLOCK set the pipe
+ * always reports readable.  Otherwise it reports readable when the
+ * buffers hold data, checking once before and once after registering
+ * on trace_wait; it returns 0 when they are empty both times.
+ */
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+       struct trace_iterator *iter = filp->private_data;
+       const unsigned int readable = POLLIN | POLLRDNORM;
+
+       /* Always select as readable when in blocking mode */
+       if (trace_flags & TRACE_ITER_BLOCK)
+               return readable;
+
+       if (!trace_empty(iter))
+               return readable;
+
+       poll_wait(filp, &trace_wait, poll_table);
+
+       /* look again: data may have arrived before we were queued */
+       if (!trace_empty(iter))
+               return readable;
+
+       return 0;
+}
+
+/*
+ * Consumer reader (read handler of tracing_pipe_fops).
+ *
+ * Order of operations:
+ *  1) hand out whatever is still sitting in iter->seq from a previous call;
+ *  2) give the current tracer's ->read() hook a chance to satisfy the read;
+ *  3) while the buffers are empty, sleep in 100 msec steps (or return
+ *     -EAGAIN for O_NONBLOCK files, -EINTR when a signal is pending);
+ *  4) with irqs off and every non-empty per-CPU buffer disabled and locked,
+ *     format and consume whole lines into iter->seq;
+ *  5) copy the formatted text to user space.
+ *
+ * Returns the value of trace_seq_to_user() / ->read(), 0 when there is
+ * nothing (more) to report, or a negative errno as described above.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+                 size_t cnt, loff_t *ppos)
+{
+       struct trace_iterator *iter = filp->private_data;
+       struct trace_array_cpu *data;
+       static cpumask_t mask;
+       unsigned long flags;
+#ifdef CONFIG_FTRACE
+       int ftrace_save;
+#endif
+       int cpu;
+       ssize_t sret;
+
+       /* return any leftover data */
+       sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+       if (sret != -EBUSY)
+               return sret;
+       sret = 0;
+
+       trace_seq_reset(&iter->seq);
+
+       mutex_lock(&trace_types_lock);
+       /* a tracer-specific read hook takes precedence when it returns non-zero */
+       if (iter->trace->read) {
+               sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+               if (sret)
+                       goto out;
+       }
+
+       while (trace_empty(iter)) {
+
+               if ((filp->f_flags & O_NONBLOCK)) {
+                       sret = -EAGAIN;
+                       goto out;
+               }
+
+               /*
+                * This is a make-shift waitqueue. The reason we don't use
+                * an actual wait queue is because:
+                *  1) we only ever have one waiter
+                *  2) the tracing, traces all functions, we don't want
+                *     the overhead of calling wake_up and friends
+                *     (and tracing them too)
+                *     Anyway, this is really very primitive wakeup.
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               iter->tr->waiter = current;
+
+               mutex_unlock(&trace_types_lock);
+
+               /* sleep for 100 msecs, and try again. */
+               schedule_timeout(HZ/10);
+
+               mutex_lock(&trace_types_lock);
+
+               iter->tr->waiter = NULL;
+
+               if (signal_pending(current)) {
+                       sret = -EINTR;
+                       goto out;
+               }
+
+               /* the tracer was switched while we slept: return with sret == 0 */
+               if (iter->trace != current_trace)
+                       goto out;
+
+               /*
+                * We block until we read something and tracing is disabled.
+                * We still block if tracing is disabled, but we have never
+                * read anything. This allows a user to cat this file, and
+                * then enable tracing. But after we have read something,
+                * we give an EOF when tracing is again disabled.
+                *
+                * iter->pos will be 0 if we haven't read anything.
+                */
+               if (!tracer_enabled && iter->pos)
+                       break;
+
+               continue;
+       }
+
+       /* stop when tracing is finished */
+       if (trace_empty(iter))
+               goto out;
+
+       /* never format more than fits below one page per call */
+       if (cnt >= PAGE_SIZE)
+               cnt = PAGE_SIZE - 1;
+
+       /* reset all but tr, trace, and overruns */
+       memset(&iter->seq, 0,
+              sizeof(struct trace_iterator) -
+              offsetof(struct trace_iterator, seq));
+       iter->pos = -1;
+
+       /*
+        * We need to stop all tracing on all CPUS to read
+        * the next buffer. This is a bit expensive, but is
+        * not done often. We fill all what we can read,
+        * and then release the locks again.
+        */
+
+       cpus_clear(mask);
+       local_irq_save(flags);
+#ifdef CONFIG_FTRACE
+       ftrace_save = ftrace_enabled;
+       ftrace_enabled = 0;
+#endif
+       smp_wmb();
+       /* disable every CPU buffer that holds data and remember it in mask */
+       for_each_tracing_cpu(cpu) {
+               data = iter->tr->data[cpu];
+
+               if (!head_page(data) || !data->trace_idx)
+                       continue;
+
+               atomic_inc(&data->disabled);
+               cpu_set(cpu, mask);
+       }
+
+       /* lock the selected buffers and account overruns seen since last read */
+       for_each_cpu_mask(cpu, mask) {
+               data = iter->tr->data[cpu];
+               __raw_spin_lock(&data->lock);
+
+               if (data->overrun > iter->last_overrun[cpu])
+                       iter->overrun[cpu] +=
+                               data->overrun - iter->last_overrun[cpu];
+               iter->last_overrun[cpu] = data->overrun;
+       }
+
+       while (find_next_entry_inc(iter) != NULL) {
+               int ret;
+               int len = iter->seq.len;
+
+               ret = print_trace_line(iter);
+               if (!ret) {
+                       /* don't print partial lines */
+                       iter->seq.len = len;
+                       break;
+               }
+
+               trace_consume(iter);
+
+               if (iter->seq.len >= cnt)
+                       break;
+       }
+
+       /* undo in the same order: unlock, re-enable, restore ftrace and irqs */
+       for_each_cpu_mask(cpu, mask) {
+               data = iter->tr->data[cpu];
+               __raw_spin_unlock(&data->lock);
+       }
+
+       for_each_cpu_mask(cpu, mask) {
+               data = iter->tr->data[cpu];
+               atomic_dec(&data->disabled);
+       }
+#ifdef CONFIG_FTRACE
+       ftrace_enabled = ftrace_save;
+#endif
+       local_irq_restore(flags);
+
+       /* Now copy what we have to the user */
+       sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+       if (iter->seq.readpos >= iter->seq.len)
+               trace_seq_reset(&iter->seq);
+       if (sret == -EBUSY)
+               sret = 0;
+
+out:
+       mutex_unlock(&trace_types_lock);
+
+       return sret;
+}
+
+/*
+ * Read handler of tracing_entries_fops: reports the entry count of the
+ * trace_array found in filp->private_data as a decimal line.
+ */
+static ssize_t
+tracing_entries_read(struct file *filp, char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       struct trace_array *tr = filp->private_data;
+       char text[64];
+       int len = sprintf(text, "%lu\n", tr->entries);
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, text, len);
+}
+
+/*
+ * Write handler of tracing_entries_fops: resize the trace buffers.
+ *
+ * The user writes a decimal entry count (at least 1). Resizing is only
+ * permitted while current_trace is &no_tracer. Growing is refused with
+ * -ENOMEM when the pages needed exceed a quarter of
+ * determine_dirtyable_memory() plus the pages already allocated.
+ *
+ * Note that cnt doubles as the return value: on failure a negative errno
+ * is stored in it and handed back through the ssize_t return type.
+ *
+ * Returns the number of bytes consumed, or -EINVAL (input too long or 0),
+ * -EFAULT, the strict_strtoul() error, -EBUSY (a tracer is active) or
+ * -ENOMEM.
+ */
+static ssize_t
+tracing_entries_write(struct file *filp, const char __user *ubuf,
+                     size_t cnt, loff_t *ppos)
+{
+       unsigned long val;
+       char buf[64];
+       int i, ret;
+
+       /* leave room for the terminating NUL written below */
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       ret = strict_strtoul(buf, 10, &val);
+       if (ret < 0)
+               return ret;
+
+       /* must have at least 1 entry */
+       if (!val)
+               return -EINVAL;
+
+       mutex_lock(&trace_types_lock);
+
+       if (current_trace != &no_tracer) {
+               cnt = -EBUSY;
+               pr_info("ftrace: set current_tracer to none"
+                       " before modifying buffer size\n");
+               goto out;
+       }
+
+       if (val > global_trace.entries) {
+               long pages_requested;
+               unsigned long freeable_pages;
+
+               /* make sure we have enough memory before mapping */
+               pages_requested =
+                       (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
+
+               /* account for each buffer (and max_tr) */
+               pages_requested *= tracing_nr_buffers * 2;
+
+               /* Check for overflow */
+               if (pages_requested < 0) {
+                       cnt = -ENOMEM;
+                       goto out;
+               }
+
+               freeable_pages = determine_dirtyable_memory();
+
+               /* we only allow to request 1/4 of useable memory */
+               if (pages_requested >
+                   ((freeable_pages + tracing_pages_allocated) / 4)) {
+                       cnt = -ENOMEM;
+                       goto out;
+               }
+
+               /* each trace_alloc_page() call adds one page to every buffer */
+               while (global_trace.entries < val) {
+                       if (trace_alloc_page()) {
+                               cnt = -ENOMEM;
+                               goto out;
+                       }
+                       /* double check that we don't go over the known pages */
+                       if (tracing_pages_allocated > pages_requested)
+                               break;
+               }
+
+       } else {
+               /* include the number of entries in val (inc of page entries) */
+               while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
+                       trace_free_page();
+       }
+
+       /* check integrity */
+       for_each_tracing_cpu(i)
+               check_pages(global_trace.data[i]);
+
+       filp->f_pos += cnt;
+
+       /* If check pages failed, return ENOMEM */
+       if (tracing_disabled)
+               cnt = -ENOMEM;
+ out:
+       /* keep max_tr in step with global_trace, also after a partial resize */
+       max_tr.entries = global_trace.entries;
+       mutex_unlock(&trace_types_lock);
+
+       return cnt;
+}
+
+/*
+ * file_operations tables for the debugfs files created in
+ * tracer_init_debugfs().
+ */
+
+/* used for both "tracing_max_latency" and "tracing_thresh" */
+static struct file_operations tracing_max_lat_fops = {
+       .open           = tracing_open_generic,
+       .read           = tracing_max_lat_read,
+       .write          = tracing_max_lat_write,
+};
+
+/* "tracing_enabled" */
+static struct file_operations tracing_ctrl_fops = {
+       .open           = tracing_open_generic,
+       .read           = tracing_ctrl_read,
+       .write          = tracing_ctrl_write,
+};
+
+/* "current_tracer" */
+static struct file_operations set_tracer_fops = {
+       .open           = tracing_open_generic,
+       .read           = tracing_set_trace_read,
+       .write          = tracing_set_trace_write,
+};
+
+/* "trace_pipe": consuming reader with its own open/poll/release */
+static struct file_operations tracing_pipe_fops = {
+       .open           = tracing_open_pipe,
+       .poll           = tracing_poll_pipe,
+       .read           = tracing_read_pipe,
+       .release        = tracing_release_pipe,
+};
+
+/* "trace_entries" */
+static struct file_operations tracing_entries_fops = {
+       .open           = tracing_open_generic,
+       .read           = tracing_entries_read,
+       .write          = tracing_entries_write,
+};
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/*
+ * Generic read handler that prints the unsigned long pointed to by
+ * filp->private_data as a decimal line.
+ *
+ * The value is unsigned, so it is formatted with %lu (as
+ * tracing_entries_read() does) rather than %ld, which would show large
+ * counts as negative numbers.
+ */
+static ssize_t
+tracing_read_long(struct file *filp, char __user *ubuf,
+                 size_t cnt, loff_t *ppos)
+{
+       unsigned long *p = filp->private_data;
+       char buf[64];
+       int r;
+
+       r = sprintf(buf, "%lu\n", *p);
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+/* read-only table; used for "dyn_ftrace_total_info" in tracer_init_debugfs() */
+static struct file_operations tracing_read_long_fops = {
+       .open           = tracing_open_generic,
+       .read           = tracing_read_long,
+};
+#endif
+
+/* cached dentry of the "tracing" debugfs directory, NULL until created */
+static struct dentry *d_tracer;
+
+/*
+ * Return the "tracing" debugfs directory, creating it on first use.
+ * Returns NULL when the directory could not be created; the failure is
+ * only reported the first time it happens.
+ */
+struct dentry *tracing_init_dentry(void)
+{
+       static int once;
+
+       if (!d_tracer) {
+               d_tracer = debugfs_create_dir("tracing", NULL);
+
+               if (!d_tracer && !once) {
+                       once = 1;
+                       pr_warning("Could not create debugfs directory 'tracing'\n");
+               }
+       }
+
+       return d_tracer;
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
+/*
+ * Create the tracing control files below the directory returned by
+ * tracing_init_dentry(). A failure to create a file is only reported
+ * with a warning; the remaining files are still attempted.
+ *
+ * Each warning names the file that actually failed (several of them
+ * used to report 'trace' or a misspelled 'tracing_threash' instead).
+ */
+static __init void tracer_init_debugfs(void)
+{
+       struct dentry *d_tracer;
+       struct dentry *entry;
+
+       d_tracer = tracing_init_dentry();
+
+       entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
+                                   &global_trace, &tracing_ctrl_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+
+       entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+                                   NULL, &tracing_iter_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+
+       entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
+                                   NULL, &tracing_cpumask_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
+
+       entry = debugfs_create_file("latency_trace", 0444, d_tracer,
+                                   &global_trace, &tracing_lt_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs 'latency_trace' entry\n");
+
+       entry = debugfs_create_file("trace", 0444, d_tracer,
+                                   &global_trace, &tracing_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs 'trace' entry\n");
+
+       entry = debugfs_create_file("available_tracers", 0444, d_tracer,
+                                   &global_trace, &show_traces_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'available_tracers' entry\n");
+
+       /*
+        * NOTE(review): set_tracer_fops has a .write handler but the file
+        * is created read-only (0444) - confirm whether 0644 was intended.
+        */
+       entry = debugfs_create_file("current_tracer", 0444, d_tracer,
+                                   &global_trace, &set_tracer_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'current_tracer' entry\n");
+
+       entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
+                                   &tracing_max_latency,
+                                   &tracing_max_lat_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'tracing_max_latency' entry\n");
+
+       entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
+                                   &tracing_thresh, &tracing_max_lat_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'tracing_thresh' entry\n");
+
+       entry = debugfs_create_file("README", 0644, d_tracer,
+                                   NULL, &tracing_readme_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs 'README' entry\n");
+
+       entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
+                                   NULL, &tracing_pipe_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'trace_pipe' entry\n");
+
+       entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+                                   &global_trace, &tracing_entries_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'trace_entries' entry\n");
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+       entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+                                   &ftrace_update_tot_cnt,
+                                   &tracing_read_long_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'dyn_ftrace_total_info' entry\n");
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+       init_tracer_sysprof_debugfs(d_tracer);
+#endif
+}
+
+/*
+ * Grow every trace buffer by one page.
+ *
+ * All pages are allocated up front onto a private list; only when every
+ * allocation succeeded are they moved onto the per-CPU trace_pages lists,
+ * so a failure leaves the buffers untouched. With CONFIG_TRACER_MAX_TRACE
+ * a second page per CPU is allocated for max_tr; the PG_lru flag is set
+ * on max_tr pages and cleared on global_trace pages to tell them apart.
+ *
+ * On success tracing_pages_allocated is raised by the number of pages
+ * added and global_trace.entries by ENTRIES_PER_PAGE.
+ *
+ * Returns 0 on success, -ENOMEM if a page could not be allocated.
+ */
+static int trace_alloc_page(void)
+{
+       struct trace_array_cpu *data;
+       struct page *page, *tmp;
+       LIST_HEAD(pages);
+       void *array;
+       unsigned pages_allocated = 0;
+       int i;
+
+       /* first allocate a page for each CPU */
+       for_each_tracing_cpu(i) {
+               array = (void *)__get_free_page(GFP_KERNEL);
+               if (array == NULL) {
+                       printk(KERN_ERR "tracer: failed to allocate page "
+                              "for trace buffer!\n");
+                       goto free_pages;
+               }
+
+               pages_allocated++;
+               page = virt_to_page(array);
+               list_add(&page->lru, &pages);
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+               array = (void *)__get_free_page(GFP_KERNEL);
+               if (array == NULL) {
+                       printk(KERN_ERR "tracer: failed to allocate page "
+                              "for trace buffer!\n");
+                       goto free_pages;
+               }
+               pages_allocated++;
+               page = virt_to_page(array);
+               list_add(&page->lru, &pages);
+#endif
+       }
+
+       /* Now that we successfully allocate a page per CPU, add them */
+       for_each_tracing_cpu(i) {
+               data = global_trace.data[i];
+               page = list_entry(pages.next, struct page, lru);
+               list_del_init(&page->lru);
+               list_add_tail(&page->lru, &data->trace_pages);
+               ClearPageLRU(page);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+               data = max_tr.data[i];
+               page = list_entry(pages.next, struct page, lru);
+               list_del_init(&page->lru);
+               list_add_tail(&page->lru, &data->trace_pages);
+               SetPageLRU(page);
+#endif
+       }
+       tracing_pages_allocated += pages_allocated;
+       global_trace.entries += ENTRIES_PER_PAGE;
+
+       return 0;
+
+ free_pages:
+       /* give back whatever was collected before the failure */
+       list_for_each_entry_safe(page, tmp, &pages, lru) {
+               list_del_init(&page->lru);
+               __free_page(page);
+       }
+       return -ENOMEM;
+}
+
+/*
+ * Shrink every trace buffer by one page (the first page on each per-CPU
+ * trace_pages list) and reset the affected buffers.
+ *
+ * tracing_pages_allocated is decremented once per page that is really
+ * freed, mirroring the accounting in trace_alloc_page(): the max_tr page
+ * only exists, and is only counted, with CONFIG_TRACER_MAX_TRACE.
+ *
+ * global_trace.entries is lowered by ENTRIES_PER_PAGE even when an empty
+ * list is hit; in that case tracing_disabled is set and -1 is returned.
+ *
+ * Returns 0 on success, -1 if a buffer unexpectedly had no pages.
+ */
+static int trace_free_page(void)
+{
+       struct trace_array_cpu *data;
+       struct page *page;
+       struct list_head *p;
+       int i;
+       int ret = 0;
+
+       /* free one page from each buffer */
+       for_each_tracing_cpu(i) {
+               data = global_trace.data[i];
+               p = data->trace_pages.next;
+               if (p == &data->trace_pages) {
+                       /* should never happen */
+                       WARN_ON(1);
+                       tracing_disabled = 1;
+                       ret = -1;
+                       break;
+               }
+               page = list_entry(p, struct page, lru);
+               ClearPageLRU(page);
+               list_del(&page->lru);
+               tracing_pages_allocated--;
+               __free_page(page);
+
+               tracing_reset(data);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+               data = max_tr.data[i];
+               p = data->trace_pages.next;
+               if (p == &data->trace_pages) {
+                       /* should never happen */
+                       WARN_ON(1);
+                       tracing_disabled = 1;
+                       ret = -1;
+                       break;
+               }
+               page = list_entry(p, struct page, lru);
+               /* max_tr pages carry PG_lru as a marker; drop it before freeing */
+               ClearPageLRU(page);
+               list_del(&page->lru);
+               tracing_pages_allocated--;
+               __free_page(page);
+
+               tracing_reset(data);
+#endif
+       }
+       global_trace.entries -= ENTRIES_PER_PAGE;
+
+       return ret;
+}
+
+/*
+ * Boot-time setup of the tracer (registered as an fs_initcall).
+ *
+ * Binds the per-CPU buffer descriptors to global_trace and max_tr, gives
+ * every buffer its first page, grows the buffers with trace_alloc_page()
+ * until trace_nr_entries is reached (or memory runs out), creates the
+ * debugfs files, registers no_tracer as the current tracer and finally
+ * clears tracing_disabled.
+ *
+ * Returns 0 on success, -ENOMEM if a first page could not be allocated.
+ * In that case every page linked so far is freed again, including the
+ * pages of the CPU on which the allocation failed.
+ */
+__init static int tracer_alloc_buffers(void)
+{
+       struct trace_array_cpu *data;
+       struct page *page, *tmp;
+       void *array;
+       int pages = 0;
+       int ret = -ENOMEM;
+       int i;
+
+       /* TODO: make the number of buffers hot pluggable with CPUS */
+       tracing_nr_buffers = num_possible_cpus();
+       tracing_buffer_mask = cpu_possible_map;
+
+       /* Allocate the first page for all buffers */
+       for_each_tracing_cpu(i) {
+               data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
+               max_tr.data[i] = &per_cpu(max_data, i);
+
+               /*
+                * Set up the list heads before allocating anything, so the
+                * error path can safely walk the lists of the failing CPU.
+                */
+               INIT_LIST_HEAD(&data->trace_pages);
+#ifdef CONFIG_TRACER_MAX_TRACE
+               INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
+#endif
+
+               array = (void *)__get_free_page(GFP_KERNEL);
+               if (array == NULL) {
+                       printk(KERN_ERR "tracer: failed to allocate page "
+                              "for trace buffer!\n");
+                       goto free_buffers;
+               }
+
+               /* set the array to the list */
+               page = virt_to_page(array);
+               list_add(&page->lru, &data->trace_pages);
+               /* use the LRU flag to differentiate the two buffers */
+               ClearPageLRU(page);
+
+               data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+               max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+/* Only allocate if we are actually using the max trace */
+#ifdef CONFIG_TRACER_MAX_TRACE
+               array = (void *)__get_free_page(GFP_KERNEL);
+               if (array == NULL) {
+                       printk(KERN_ERR "tracer: failed to allocate page "
+                              "for trace buffer!\n");
+                       goto free_buffers;
+               }
+
+               page = virt_to_page(array);
+               list_add(&page->lru, &max_tr.data[i]->trace_pages);
+               SetPageLRU(page);
+#endif
+       }
+
+       /*
+        * Since we allocate by orders of pages, we may be able to
+        * round up a bit.
+        */
+       global_trace.entries = ENTRIES_PER_PAGE;
+       pages++;
+
+       while (global_trace.entries < trace_nr_entries) {
+               if (trace_alloc_page())
+                       break;
+               pages++;
+       }
+       max_tr.entries = global_trace.entries;
+
+       pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
+               pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
+       pr_info("   actual entries %lu\n", global_trace.entries);
+
+       tracer_init_debugfs();
+
+       trace_init_cmdlines();
+
+       register_tracer(&no_tracer);
+       current_trace = &no_tracer;
+
+       /* All seems OK, enable tracing */
+       global_trace.ctrl = tracer_enabled;
+       tracing_disabled = 0;
+
+       return 0;
+
+ free_buffers:
+       /*
+        * i is the CPU whose allocation failed. Its global page may already
+        * be linked (when the max_tr allocation was the one that failed),
+        * so the unwind has to start at i itself, not at i - 1.
+        */
+       for ( ; i >= 0; i--) {
+               data = global_trace.data[i];
+               if (data) {
+                       list_for_each_entry_safe(page, tmp,
+                                                &data->trace_pages, lru) {
+                               list_del_init(&page->lru);
+                               __free_page(page);
+                       }
+               }
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+               data = max_tr.data[i];
+               if (data) {
+                       list_for_each_entry_safe(page, tmp,
+                                                &data->trace_pages, lru) {
+                               list_del_init(&page->lru);
+                               /* drop the marker flag first, as trace_free_page() does */
+                               ClearPageLRU(page);
+                               __free_page(page);
+                       }
+               }
+#endif
+       }
+       return ret;
+}
+fs_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644 (file)
index 0000000..f69f867
--- /dev/null
@@ -0,0 +1,339 @@
+#ifndef _LINUX_KERNEL_TRACE_H
+#define _LINUX_KERNEL_TRACE_H
+
+#include <linux/fs.h>
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/clocksource.h>
+#include <linux/mmiotrace.h>
+
+/*
+ * Kinds of trace record. __TRACE_FIRST_TYPE (0) and __TRACE_LAST_TYPE
+ * bracket the real values.
+ * NOTE(review): presumably stored in trace_entry.type to select the
+ * matching union member - confirm against trace.c.
+ */
+enum trace_type {
+       __TRACE_FIRST_TYPE = 0,
+
+       TRACE_FN,
+       TRACE_CTX,
+       TRACE_WAKE,
+       TRACE_STACK,
+       TRACE_SPECIAL,
+       TRACE_MMIO_RW,
+       TRACE_MMIO_MAP,
+
+       __TRACE_LAST_TYPE
+};
+
+/*
+ * Function trace entry - function address and parent function address:
+ */
+struct ftrace_entry {
+       unsigned long           ip;
+       unsigned long           parent_ip;
+};
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ * the prev_* fields describe the task switched from, the next_* fields
+ * the task switched to.
+ */
+struct ctx_switch_entry {
+       unsigned int            prev_pid;
+       unsigned char           prev_prio;
+       unsigned char           prev_state;
+       unsigned int            next_pid;
+       unsigned char           next_prio;
+       unsigned char           next_state;
+};
+
+/*
+ * Special (free-form) trace entry: three caller-defined values, matching
+ * the arg1..arg3 parameters of trace_special().
+ */
+struct special_entry {
+       unsigned long           arg1;
+       unsigned long           arg2;
+       unsigned long           arg3;
+};
+
+/*
+ * Stack-trace entry: holds at most FTRACE_STACK_ENTRIES caller values.
+ */
+
+#define FTRACE_STACK_ENTRIES   8
+
+struct stack_entry {
+       unsigned long           caller[FTRACE_STACK_ENTRIES];
+};
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ *
+ * A common header is followed by an anonymous union with one member per
+ * record kind.
+ * NOTE(review): type presumably holds an enum trace_type value and t a
+ * timestamp - confirm against the writers in trace.c.
+ */
+struct trace_entry {
+       char                    type;
+       char                    cpu;
+       char                    flags;
+       char                    preempt_count;
+       int                     pid;
+       cycle_t                 t;
+       union {
+               struct ftrace_entry             fn;
+               struct ctx_switch_entry         ctx;
+               struct special_entry            special;
+               struct stack_entry              stack;
+               struct mmiotrace_rw             mmiorw;
+               struct mmiotrace_map            mmiomap;
+       };
+};
+
+/* size in bytes of one entry, i.e. of the largest union member plus header */
+#define TRACE_ENTRY_SIZE       sizeof(struct trace_entry)
+
+/*
+ * The CPU trace array - it consists of thousands of trace entries
+ * plus some other descriptor data: (for example which task started
+ * the trace, etc.)
+ *
+ * trace_pages: list of the buffer's pages, linked through page->lru.
+ * disabled:    raised by tracing_read_pipe() for the duration of its read.
+ * lock:        held by tracing_read_pipe() while it consumes entries.
+ * trace_idx:   tracing_read_pipe() skips buffers where this is 0.
+ * overrun:     compared with the iterator's last_overrun[] to account
+ *              overruns that happened between reads.
+ */
+struct trace_array_cpu {
+       struct list_head        trace_pages;
+       atomic_t                disabled;
+       raw_spinlock_t          lock;
+       struct lock_class_key   lock_key;
+
+       /* these fields get copied into max-trace: */
+       unsigned                trace_head_idx;
+       unsigned                trace_tail_idx;
+       void                    *trace_head; /* producer */
+       void                    *trace_tail; /* consumer */
+       unsigned long           trace_idx;
+       unsigned long           overrun;
+       unsigned long           saved_latency;
+       unsigned long           critical_start;
+       unsigned long           critical_end;
+       unsigned long           critical_sequence;
+       unsigned long           nice;
+       unsigned long           policy;
+       unsigned long           rt_priority;
+       cycle_t                 preempt_timestamp;
+       pid_t                   pid;
+       uid_t                   uid;
+       char                    comm[TASK_COMM_LEN];
+};
+
+struct trace_iterator;
+
+/*
+ * The trace array - an array of per-CPU trace arrays. This is the
+ * highest level data structure that individual tracers deal with.
+ * They have on/off state as well:
+ *
+ * entries:    entries per CPU buffer; grows/shrinks by ENTRIES_PER_PAGE
+ *             in trace_alloc_page()/trace_free_page().
+ * ctrl:       the on/off state tested by the tracers' init/reset/
+ *             ctrl_update hooks.
+ * cpu:        set by the function tracer to the CPU it was started on.
+ * time_start: set by the function tracer from ftrace_now() on reset.
+ * waiter:     the task sleeping in tracing_read_pipe(), NULL otherwise.
+ * data:       per-CPU buffer descriptors.
+ */
+struct trace_array {
+       unsigned long           entries;
+       long                    ctrl;
+       int                     cpu;
+       cycle_t                 time_start;
+       struct task_struct      *waiter;
+       struct trace_array_cpu  *data[NR_CPUS];
+};
+
+/*
+ * A specific tracer, represented by methods that operate on a trace array:
+ *
+ * name is the tracer's name string. Hooks may be left NULL (the function
+ * tracer sets only init, reset, ctrl_update and, optionally, selftest).
+ * read, when set, is called first by tracing_read_pipe(); a non-zero
+ * result is returned to the reader as is.
+ * NOTE(review): next presumably links the registered tracers into a
+ * list - confirm against register_tracer().
+ */
+struct tracer {
+       const char              *name;
+       void                    (*init)(struct trace_array *tr);
+       void                    (*reset)(struct trace_array *tr);
+       void                    (*open)(struct trace_iterator *iter);
+       void                    (*pipe_open)(struct trace_iterator *iter);
+       void                    (*close)(struct trace_iterator *iter);
+       void                    (*start)(struct trace_iterator *iter);
+       void                    (*stop)(struct trace_iterator *iter);
+       ssize_t                 (*read)(struct trace_iterator *iter,
+                                       struct file *filp, char __user *ubuf,
+                                       size_t cnt, loff_t *ppos);
+       void                    (*ctrl_update)(struct trace_array *tr);
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+       int                     (*selftest)(struct tracer *trace,
+                                           struct trace_array *tr);
+#endif
+       int                     (*print_line)(struct trace_iterator *iter);
+       struct tracer           *next;
+       int                     print_max;
+};
+
+/*
+ * One page of formatted trace output waiting to be copied to user space.
+ * tracing_read_pipe() resets the sequence once readpos has reached len.
+ * NOTE(review): len is presumably the number of valid bytes in buffer and
+ * readpos how far trace_seq_to_user() has copied - confirm in trace.c.
+ */
+struct trace_seq {
+       unsigned char           buffer[PAGE_SIZE];
+       unsigned int            len;
+       unsigned int            readpos;
+};
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ *
+ * Field order matters: tracing_read_pipe() clears everything from seq
+ * to the end of the structure with a single memset(), so fields that
+ * must survive between reads have to stay above seq.
+ */
+struct trace_iterator {
+       struct trace_array      *tr;
+       struct tracer           *trace;
+       void                    *private;
+       long                    last_overrun[NR_CPUS];
+       long                    overrun[NR_CPUS];
+
+       /* The below is zeroed out in pipe_read */
+       struct trace_seq        seq;
+       struct trace_entry      *ent;
+       int                     cpu;
+
+       struct trace_entry      *prev_ent;
+       int                     prev_cpu;
+
+       unsigned long           iter_flags;
+       loff_t                  pos;
+       unsigned long           next_idx[NR_CPUS];
+       struct list_head        *next_page[NR_CPUS];
+       unsigned                next_page_idx[NR_CPUS];
+       long                    idx;
+};
+
+void tracing_reset(struct trace_array_cpu *data);
+int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *tracing_init_dentry(void);
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
+
+void ftrace(struct trace_array *tr,
+                           struct trace_array_cpu *data,
+                           unsigned long ip,
+                           unsigned long parent_ip,
+                           unsigned long flags);
+void tracing_sched_switch_trace(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct task_struct *prev,
+                               struct task_struct *next,
+                               unsigned long flags);
+void tracing_record_cmdline(struct task_struct *tsk);
+
+void tracing_sched_wakeup_trace(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct task_struct *wakee,
+                               struct task_struct *cur,
+                               unsigned long flags);
+void trace_special(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long arg1,
+                  unsigned long arg2,
+                  unsigned long arg3);
+void trace_function(struct trace_array *tr,
+                   struct trace_array_cpu *data,
+                   unsigned long ip,
+                   unsigned long parent_ip,
+                   unsigned long flags);
+
+void tracing_start_cmdline_record(void);
+void tracing_stop_cmdline_record(void);
+int register_tracer(struct tracer *type);
+void unregister_tracer(struct tracer *type);
+
+extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+
+extern unsigned long tracing_max_latency;
+extern unsigned long tracing_thresh;
+
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr_single(struct trace_array *tr,
+                         struct task_struct *tsk, int cpu);
+
+extern cycle_t ftrace_now(int cpu);
+
+#ifdef CONFIG_FTRACE
+void tracing_start_function_trace(void);
+void tracing_stop_function_trace(void);
+#else
+# define tracing_start_function_trace()                do { } while (0)
+# define tracing_stop_function_trace()         do { } while (0)
+#endif
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+/*
+ * Callback type for context-switch hooks and the node that carries one:
+ * the callback, its private pointer and a link to the next node.
+ * NOTE(review): registration and invocation are not visible in this
+ * header - confirm how __rq and private are used by the callers.
+ */
+typedef void
+(*tracer_switch_func_t)(void *private,
+                       void *__rq,
+                       struct task_struct *prev,
+                       struct task_struct *next);
+
+struct tracer_switch_ops {
+       tracer_switch_func_t            func;
+       void                            *private;
+       struct tracer_switch_ops        *next;
+};
+
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern unsigned long ftrace_update_tot_cnt;
+#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
+extern int DYN_FTRACE_TEST_NAME(void);
+#endif
+
+#ifdef CONFIG_MMIOTRACE
+extern void __trace_mmiotrace_rw(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct mmiotrace_rw *rw);
+extern void __trace_mmiotrace_map(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct mmiotrace_map *map);
+#endif
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+#ifdef CONFIG_FTRACE
+extern int trace_selftest_startup_function(struct tracer *trace,
+                                          struct trace_array *tr);
+#endif
+#ifdef CONFIG_IRQSOFF_TRACER
+extern int trace_selftest_startup_irqsoff(struct tracer *trace,
+                                         struct trace_array *tr);
+#endif
+#ifdef CONFIG_PREEMPT_TRACER
+extern int trace_selftest_startup_preemptoff(struct tracer *trace,
+                                            struct trace_array *tr);
+#endif
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
+                                                struct trace_array *tr);
+#endif
+#ifdef CONFIG_SCHED_TRACER
+extern int trace_selftest_startup_wakeup(struct tracer *trace,
+                                        struct trace_array *tr);
+#endif
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern int trace_selftest_startup_sched_switch(struct tracer *trace,
+                                              struct trace_array *tr);
+#endif
+#ifdef CONFIG_SYSPROF_TRACER
+extern int trace_selftest_startup_sysprof(struct tracer *trace,
+                                              struct trace_array *tr);
+#endif
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
+
+extern void *head_page(struct trace_array_cpu *data);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+                                size_t cnt);
+extern long ns2usecs(cycle_t nsec);
+
+extern unsigned long trace_flags;
+
+/*
+ * trace_iterator_flags is an enumeration that defines bit
+ * positions into trace_flags that controls the output.
+ *
+ * NOTE: These bits must match the trace_options array in
+ *       trace.c.
+ *
+ * Each value is a distinct single-bit mask, tested with
+ * (trace_flags & TRACE_ITER_xxx).
+ */
+enum trace_iterator_flags {
+       TRACE_ITER_PRINT_PARENT         = 0x01,
+       TRACE_ITER_SYM_OFFSET           = 0x02,
+       TRACE_ITER_SYM_ADDR             = 0x04,
+       TRACE_ITER_VERBOSE              = 0x08,
+       TRACE_ITER_RAW                  = 0x10,
+       TRACE_ITER_HEX                  = 0x20,
+       TRACE_ITER_BIN                  = 0x40,
+       TRACE_ITER_BLOCK                = 0x80,
+       TRACE_ITER_STACKTRACE           = 0x100,
+       TRACE_ITER_SCHED_TREE           = 0x200,
+};
+
+#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644 (file)
index 0000000..3121448
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+/*
+ * Restart the trace held in @tr: take a new start timestamp on tr->cpu
+ * and call tracing_reset() on the per-cpu data of every online CPU.
+ */
+static void function_reset(struct trace_array *tr)
+{
+       int cpu;
+
+       tr->time_start = ftrace_now(tr->cpu);
+
+       for_each_online_cpu(cpu)
+               tracing_reset(tr->data[cpu]);
+}
+
+/*
+ * Start function tracing on @tr.
+ *
+ * The current CPU is stored in tr->cpu and the trace is reset between
+ * get_cpu() and put_cpu(); only afterwards are cmdline recording and
+ * the function tracer switched on.
+ */
+static void start_function_trace(struct trace_array *tr)
+{
+       tr->cpu = get_cpu();
+       function_reset(tr);
+       put_cpu();
+
+       tracing_start_cmdline_record();
+       tracing_start_function_trace();
+}
+
+/*
+ * Stop function tracing: undo start_function_trace() in reverse order
+ * (function tracer first, then cmdline recording). @tr is unused.
+ */
+static void stop_function_trace(struct trace_array *tr)
+{
+       tracing_stop_function_trace();
+       tracing_stop_cmdline_record();
+}
+
+/* .init callback: start tracing right away when tr->ctrl is set. */
+static void function_trace_init(struct trace_array *tr)
+{
+       if (!tr->ctrl)
+               return;
+
+       start_function_trace(tr);
+}
+
+/* .reset callback: stop tracing, but only if tr->ctrl says it is running. */
+static void function_trace_reset(struct trace_array *tr)
+{
+       if (!tr->ctrl)
+               return;
+
+       stop_function_trace(tr);
+}
+
+/* .ctrl_update callback: bring the tracer in line with the new tr->ctrl. */
+static void function_trace_ctrl_update(struct trace_array *tr)
+{
+       if (!tr->ctrl) {
+               stop_function_trace(tr);
+               return;
+       }
+
+       start_function_trace(tr);
+}
+
+/*
+ * Tracer descriptor registered under the name "ftrace". The selftest
+ * hook is only wired up when CONFIG_FTRACE_SELFTEST is set.
+ */
+static struct tracer function_trace __read_mostly =
+{
+       .name        = "ftrace",
+       .init        = function_trace_init,
+       .reset       = function_trace_reset,
+       .ctrl_update = function_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest    = trace_selftest_startup_function,
+#endif
+};
+
+/*
+ * Boot-time registration of the function tracer; returns whatever
+ * register_tracer() returns.
+ */
+static __init int init_function_trace(void)
+{
+       return register_tracer(&function_trace);
+}
+
+device_initcall(init_function_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644 (file)
index 0000000..421d6fe
--- /dev/null
@@ -0,0 +1,486 @@
+/*
+ * trace irqs off critical timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * From code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+/* trace array used by the hooks below; assigned in __irqsoff_tracer_init() */
+static struct trace_array              *irqsoff_trace __read_mostly;
+/* set by start_irqsoff_tracer(), cleared by stop_irqsoff_tracer() */
+static int                             tracer_enabled __read_mostly;
+
+/* set to 1 by start_critical_timing(), cleared by stop_critical_timing() */
+static DEFINE_PER_CPU(int, tracing_cpu);
+
+/* held in check_critical_timing() while the maximum latency is updated */
+static DEFINE_SPINLOCK(max_trace_lock);
+
+/* bits stored in trace_type, tested by preempt_trace() and irq_trace() */
+enum {
+       TRACER_IRQS_OFF         = (1 << 1),
+       TRACER_PREEMPT_OFF      = (1 << 2),
+};
+
+/* combination of TRACER_* bits, written by the tracers' init callbacks */
+static int trace_type __read_mostly;
+
+#ifdef CONFIG_PREEMPT_TRACER
+/*
+ * Non-zero when TRACER_PREEMPT_OFF is selected in trace_type and
+ * preempt_count() is currently non-zero.
+ */
+static inline int
+preempt_trace(void)
+{
+       return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+/* without CONFIG_PREEMPT_TRACER the check is a constant 0 */
+# define preempt_trace() (0)
+#endif
+
+#ifdef CONFIG_IRQSOFF_TRACER
+/*
+ * Non-zero when TRACER_IRQS_OFF is selected in trace_type and
+ * irqs_disabled() is currently true.
+ */
+static inline int
+irq_trace(void)
+{
+       return ((trace_type & TRACER_IRQS_OFF) &&
+               irqs_disabled());
+}
+#else
+/* without CONFIG_IRQSOFF_TRACER the check is a constant 0 */
+# define irq_trace() (0)
+#endif
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesn't
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp      unsigned long max_sequence;
+
+#ifdef CONFIG_FTRACE
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ *
+ * @ip and @parent_ip are passed straight through to trace_function().
+ * An entry is only recorded while tracing_cpu is set for this CPU,
+ * irqs are disabled according to the saved flags, and no other user
+ * holds data->disabled.
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct trace_array *tr = irqsoff_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+
+       /*
+        * Does not matter if we preempt. We test the flags
+        * afterward, to see if irqs are disabled or not.
+        * If we preempt and get a false positive, the flags
+        * test will fail.
+        */
+       cpu = raw_smp_processor_id();
+       if (likely(!per_cpu(tracing_cpu, cpu)))
+               return;
+
+       local_save_flags(flags);
+       /* slight chance to get a false positive on tracing_cpu */
+       if (!irqs_disabled_flags(flags))
+               return;
+
+       data = tr->data[cpu];
+       /* data->disabled doubles as a recursion guard: only the first user traces */
+       disabled = atomic_inc_return(&data->disabled);
+
+       if (likely(disabled == 1))
+               trace_function(tr, data, ip, parent_ip, flags);
+
+       atomic_dec(&data->disabled);
+}
+
+/* ftrace_ops handed to (un)register_ftrace_function() by this tracer */
+static struct ftrace_ops trace_ops __read_mostly =
+{
+       .func = irqsoff_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Decide whether a measured latency should be reported/recorded.
+ *
+ * With a non-zero tracing_thresh every @delta at or above the threshold
+ * qualifies; otherwise @delta must be strictly greater than the current
+ * tracing_max_latency.
+ *
+ * Returns 1 to report, 0 to skip.
+ */
+static int report_latency(cycle_t delta)
+{
+       if (tracing_thresh)
+               return delta >= tracing_thresh;
+
+       return delta > tracing_max_latency;
+}
+
+/*
+ * End the critical section that began at data->preempt_timestamp and,
+ * if its duration qualifies per report_latency(), record it as the new
+ * maximum latency.
+ *
+ * @tr:        trace array being recorded into
+ * @data:      per-cpu data belonging to @cpu
+ * @parent_ip: address stored in data->critical_end on a new maximum
+ * @cpu:       CPU the timestamps are taken on
+ *
+ * Whatever the outcome, the per-cpu measurement state is re-armed at
+ * the end so that the next section is timed from "now".
+ */
+static void
+check_critical_timing(struct trace_array *tr,
+                     struct trace_array_cpu *data,
+                     unsigned long parent_ip,
+                     int cpu)
+{
+       cycle_t T0, T1, delta;
+       unsigned long flags;
+
+       /* stay in raw timestamp units; nothing below needs usecs */
+       T0 = data->preempt_timestamp;
+       T1 = ftrace_now(cpu);
+       delta = T1-T0;
+
+       /* flags are also consumed by the trace_function() call at "out" */
+       local_save_flags(flags);
+
+       /* unlocked pre-check keeps the common case off the spinlock */
+       if (!report_latency(delta))
+               goto out;
+
+       spin_lock_irqsave(&max_trace_lock, flags);
+
+       /* check if we are still the max latency */
+       if (!report_latency(delta))
+               goto out_unlock;
+
+       trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+
+       /* a maximum was recorded elsewhere since we started: skip ours */
+       if (data->critical_sequence != max_sequence)
+               goto out_unlock;
+
+       tracing_max_latency = delta;
+
+       data->critical_end = parent_ip;
+
+       update_max_tr_single(tr, current, cpu);
+
+       max_sequence++;
+
+out_unlock:
+       spin_unlock_irqrestore(&max_trace_lock, flags);
+
+out:
+       /* re-arm: new sequence snapshot, new timestamp, empty buffer */
+       data->critical_sequence = max_sequence;
+       data->preempt_timestamp = ftrace_now(cpu);
+       tracing_reset(data);
+       trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
+}
+
+/*
+ * Begin timing a critical section on the current CPU.
+ *
+ * @ip:        address recorded in the first trace entry
+ * @parent_ip: caller of @ip; used as critical_start when non-zero,
+ *             otherwise @ip is used
+ *
+ * Does nothing when the tracer is disabled, when this CPU is already
+ * timing a section, or when the per-cpu data is missing or in use.
+ */
+static inline void
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+       int cpu;
+       struct trace_array *tr = irqsoff_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+
+       if (likely(!tracer_enabled))
+               return;
+
+       cpu = raw_smp_processor_id();
+
+       /* a section is already being timed on this CPU */
+       if (per_cpu(tracing_cpu, cpu))
+               return;
+
+       data = tr->data[cpu];
+
+       if (unlikely(!data) || atomic_read(&data->disabled))
+               return;
+
+       /* keep other users of data->disabled out while the state is set up */
+       atomic_inc(&data->disabled);
+
+       data->critical_sequence = max_sequence;
+       data->preempt_timestamp = ftrace_now(cpu);
+       data->critical_start = parent_ip ? : ip;
+       tracing_reset(data);
+
+       local_save_flags(flags);
+
+       trace_function(tr, data, ip, parent_ip, flags);
+
+       /* from here on irqsoff_tracer_call() records on this CPU */
+       per_cpu(tracing_cpu, cpu) = 1;
+
+       atomic_dec(&data->disabled);
+}
+
+/*
+ * Finish timing the critical section on the current CPU.
+ *
+ * @ip:        address recorded in the closing trace entry
+ * @parent_ip: caller of @ip; passed to check_critical_timing() when
+ *             non-zero, otherwise @ip is passed
+ *
+ * The per-cpu tracing_cpu flag is cleared even when the tracer has been
+ * disabled in the meantime; the latency check itself only runs when the
+ * tracer is enabled and the per-cpu data is usable.
+ */
+static inline void
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+       int cpu;
+       struct trace_array *tr = irqsoff_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+
+       cpu = raw_smp_processor_id();
+       /* Always clear the tracing cpu on stopping the trace */
+       if (unlikely(per_cpu(tracing_cpu, cpu)))
+               per_cpu(tracing_cpu, cpu) = 0;
+       else
+               return;
+
+       if (!tracer_enabled)
+               return;
+
+       data = tr->data[cpu];
+
+       /* nothing to close if no section was started or the data is busy */
+       if (unlikely(!data) || unlikely(!head_page(data)) ||
+           !data->critical_start || atomic_read(&data->disabled))
+               return;
+
+       atomic_inc(&data->disabled);
+
+       local_save_flags(flags);
+       trace_function(tr, data, ip, parent_ip, flags);
+       check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+       /* mark the section as closed */
+       data->critical_start = 0;
+       atomic_dec(&data->disabled);
+}
+
+/*
+ * start and stop critical timings, used for stoppage (in idle)
+ *
+ * start_critical_timings() begins a measurement only if preempt_trace()
+ * or irq_trace() currently holds.
+ */
+void start_critical_timings(void)
+{
+       if (preempt_trace() || irq_trace())
+               start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+/*
+ * Counterpart of start_critical_timings(): ends the measurement only if
+ * preempt_trace() or irq_trace() currently holds.
+ */
+void stop_critical_timings(void)
+{
+       if (preempt_trace() || irq_trace())
+               stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+#ifdef CONFIG_PROVE_LOCKING
+/*
+ * CONFIG_PROVE_LOCKING variant of the hardirqs-on hook: stop timing with
+ * the supplied addresses unless preempt_trace() holds or irq_trace()
+ * does not.
+ */
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+       if (preempt_trace() || !irq_trace())
+               return;
+
+       stop_critical_timing(a0, a1);
+}
+
+/*
+ * CONFIG_PROVE_LOCKING variant of the hardirqs-off hook: start timing
+ * with the supplied addresses unless preempt_trace() holds or
+ * irq_trace() does not.
+ */
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+       if (preempt_trace() || !irq_trace())
+               return;
+
+       start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ *
+ * Empty definitions provided by this file in the !CONFIG_PROVE_LOCKING
+ * build; none of them does any work.
+ */
+
+void early_boot_irqs_off(void)
+{
+}
+
+void early_boot_irqs_on(void)
+{
+}
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off events:
+ *
+ * trace_hardirqs_on() closes the timed section, using the caller's
+ * addresses, unless preempt_trace() holds or irq_trace() does not.
+ */
+void trace_hardirqs_on(void)
+{
+       if (preempt_trace() || !irq_trace())
+               return;
+
+       stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+/*
+ * Opens a timed section, using the caller's addresses, unless
+ * preempt_trace() holds or irq_trace() does not.
+ */
+void trace_hardirqs_off(void)
+{
+       if (preempt_trace() || !irq_trace())
+               return;
+
+       start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+/*
+ * Same as trace_hardirqs_on(), except that @caller_addr is passed on as
+ * the parent address instead of CALLER_ADDR1.
+ */
+void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+       if (preempt_trace() || !irq_trace())
+               return;
+
+       stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+/*
+ * Same as trace_hardirqs_off(), except that @caller_addr is passed on as
+ * the parent address instead of CALLER_ADDR1.
+ */
+void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+       if (preempt_trace() || !irq_trace())
+               return;
+
+       start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+/*
+ * Preemption hooks: unlike the hardirq hooks they call straight into
+ * stop/start_critical_timing() with no preempt_trace()/irq_trace()
+ * filtering.
+ */
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+       stop_critical_timing(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+       start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+/*
+ * Switch the tracer on: the ftrace callback is registered before
+ * tracer_enabled is raised. @tr is unused.
+ */
+static void start_irqsoff_tracer(struct trace_array *tr)
+{
+       register_ftrace_function(&trace_ops);
+       tracer_enabled = 1;
+}
+
+/*
+ * Switch the tracer off, in the reverse order of start_irqsoff_tracer():
+ * tracer_enabled is cleared first, then the ftrace callback is removed.
+ * @tr is unused.
+ */
+static void stop_irqsoff_tracer(struct trace_array *tr)
+{
+       tracer_enabled = 0;
+       unregister_ftrace_function(&trace_ops);
+}
+
+/*
+ * Common part of the three init callbacks: publish @tr as the array the
+ * hooks record into and, if tr->ctrl is set, start the tracer.
+ */
+static void __irqsoff_tracer_init(struct trace_array *tr)
+{
+       irqsoff_trace = tr;
+       /* make sure that the tracer is visible */
+       smp_wmb();
+
+       if (tr->ctrl)
+               start_irqsoff_tracer(tr);
+}
+
+/* .reset callback: stop the tracer if tr->ctrl says it is running. */
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+       if (!tr->ctrl)
+               return;
+
+       stop_irqsoff_tracer(tr);
+}
+
+/* .ctrl_update callback: follow the new value of tr->ctrl. */
+static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+{
+       if (!tr->ctrl) {
+               stop_irqsoff_tracer(tr);
+               return;
+       }
+
+       start_irqsoff_tracer(tr);
+}
+
+/* .open callback: stop the trace while dumping, if it is running. */
+static void irqsoff_tracer_open(struct trace_iterator *iter)
+{
+       struct trace_array *tr = iter->tr;
+
+       if (!tr->ctrl)
+               return;
+
+       stop_irqsoff_tracer(tr);
+}
+
+/* .close callback: restart the trace that the open callback stopped. */
+static void irqsoff_tracer_close(struct trace_iterator *iter)
+{
+       struct trace_array *tr = iter->tr;
+
+       if (!tr->ctrl)
+               return;
+
+       start_irqsoff_tracer(tr);
+}
+
+#ifdef CONFIG_IRQSOFF_TRACER
+/* .init callback of "irqsoff": time irqs-off sections only. */
+static void irqsoff_tracer_init(struct trace_array *tr)
+{
+       trace_type = TRACER_IRQS_OFF;
+
+       __irqsoff_tracer_init(tr);
+}
+/* tracer descriptor registered under the name "irqsoff" */
+static struct tracer irqsoff_tracer __read_mostly =
+{
+       .name           = "irqsoff",
+       .init           = irqsoff_tracer_init,
+       .reset          = irqsoff_tracer_reset,
+       .open           = irqsoff_tracer_open,
+       .close          = irqsoff_tracer_close,
+       .ctrl_update    = irqsoff_tracer_ctrl_update,
+       .print_max      = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest    = trace_selftest_startup_irqsoff,
+#endif
+};
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+/* tracer not configured: registration compiles to nothing */
+# define register_irqsoff(trace) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+/* .init callback of "preemptoff": time preempt-off sections only. */
+static void preemptoff_tracer_init(struct trace_array *tr)
+{
+       trace_type = TRACER_PREEMPT_OFF;
+
+       __irqsoff_tracer_init(tr);
+}
+
+/* tracer descriptor registered under the name "preemptoff" */
+static struct tracer preemptoff_tracer __read_mostly =
+{
+       .name           = "preemptoff",
+       .init           = preemptoff_tracer_init,
+       .reset          = irqsoff_tracer_reset,
+       .open           = irqsoff_tracer_open,
+       .close          = irqsoff_tracer_close,
+       .ctrl_update    = irqsoff_tracer_ctrl_update,
+       .print_max      = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest    = trace_selftest_startup_preemptoff,
+#endif
+};
+# define register_preemptoff(trace) register_tracer(&trace)
+#else
+/* tracer not configured: registration compiles to nothing */
+# define register_preemptoff(trace) do { } while (0)
+#endif
+
+#if defined(CONFIG_IRQSOFF_TRACER) && \
+       defined(CONFIG_PREEMPT_TRACER)
+
+/* .init callback of "preemptirqsoff": time both kinds of section. */
+static void preemptirqsoff_tracer_init(struct trace_array *tr)
+{
+       trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
+
+       __irqsoff_tracer_init(tr);
+}
+
+/* tracer descriptor registered under the name "preemptirqsoff" */
+static struct tracer preemptirqsoff_tracer __read_mostly =
+{
+       .name           = "preemptirqsoff",
+       .init           = preemptirqsoff_tracer_init,
+       .reset          = irqsoff_tracer_reset,
+       .open           = irqsoff_tracer_open,
+       .close          = irqsoff_tracer_close,
+       .ctrl_update    = irqsoff_tracer_ctrl_update,
+       .print_max      = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest    = trace_selftest_startup_preemptirqsoff,
+#endif
+};
+
+# define register_preemptirqsoff(trace) register_tracer(&trace)
+#else
+/* needs both tracers configured; otherwise registration is a no-op */
+# define register_preemptirqsoff(trace) do { } while (0)
+#endif
+
+/*
+ * Boot-time registration of whichever of the three tracers are
+ * configured. The register_*() results are not checked; always
+ * returns 0.
+ */
+__init static int init_irqsoff_tracer(void)
+{
+       register_irqsoff(irqsoff_tracer);
+       register_preemptoff(preemptoff_tracer);
+       register_preemptirqsoff(preemptirqsoff_tracer);
+
+       return 0;
+}
+device_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644 (file)
index 0000000..b13dc19
--- /dev/null
@@ -0,0 +1,295 @@
+/*
+ * Memory mapped I/O tracing
+ *
+ * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define DEBUG 1
+
+#include <linux/kernel.h>
+#include <linux/mmiotrace.h>
+#include <linux/pci.h>
+
+#include "trace.h"
+
+/*
+ * Cursor over PCI devices, kept in iter->private while mmio_read()
+ * prints one "PCIDEV" header line per device.
+ */
+struct header_iter {
+       struct pci_dev *dev;
+};
+
+/* trace array in use; set by mmio_trace_init(), NULL after mmio_trace_reset() */
+static struct trace_array *mmio_trace_array;
+/* true once the "lost events" warning has been printed since the last reset */
+static bool overrun_detected;
+
+/*
+ * Clear the overrun marker, take a new start timestamp on tr->cpu and
+ * call tracing_reset() on the per-cpu data of every online CPU.
+ */
+static void mmio_reset_data(struct trace_array *tr)
+{
+       int cpu;
+
+       overrun_detected = false;
+       tr->time_start = ftrace_now(tr->cpu);
+
+       for_each_online_cpu(cpu)
+               tracing_reset(tr->data[cpu]);
+}
+
+/*
+ * .init callback: remember @tr for the mmio_trace_*() entry points and,
+ * if tr->ctrl is set, reset the data and enable mmiotrace.
+ */
+static void mmio_trace_init(struct trace_array *tr)
+{
+       pr_debug("in %s\n", __func__);
+       mmio_trace_array = tr;
+       if (tr->ctrl) {
+               mmio_reset_data(tr);
+               enable_mmiotrace();
+       }
+}
+
+/*
+ * .reset callback: disable mmiotrace if it was running, reset the data
+ * unconditionally and forget the trace array.
+ */
+static void mmio_trace_reset(struct trace_array *tr)
+{
+       pr_debug("in %s\n", __func__);
+       if (tr->ctrl)
+               disable_mmiotrace();
+       mmio_reset_data(tr);
+       mmio_trace_array = NULL;
+}
+
+/*
+ * .ctrl_update callback: on enable, reset the data before turning
+ * mmiotrace on; on disable, just turn it off (the data is kept).
+ */
+static void mmio_trace_ctrl_update(struct trace_array *tr)
+{
+       pr_debug("in %s\n", __func__);
+       if (tr->ctrl) {
+               mmio_reset_data(tr);
+               enable_mmiotrace();
+       } else {
+               disable_mmiotrace();
+       }
+}
+
+/*
+ * Append one "PCIDEV" header line describing @dev to @s.
+ *
+ * Line layout: bus number and devfn, vendor and device id, irq, then
+ * for resource indices 0..6 the user-visible start address OR-ed with
+ * the PCI_REGION_FLAG_MASK bits of the resource flags, then for the
+ * same indices the resource size (0 unless start < end), and finally
+ * the bound driver's name, if any.
+ *
+ * Returns the sum of the trace_seq_printf() return values.
+ */
+static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+{
+       int ret = 0;
+       int i;
+       resource_size_t start, end;
+       const struct pci_driver *drv = pci_dev_driver(dev);
+
+       /* XXX: incomplete checks for trace_seq_printf() return value */
+       ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+                               dev->bus->number, dev->devfn,
+                               dev->vendor, dev->device, dev->irq);
+       /*
+        * XXX: is pci_resource_to_user() appropriate, since we are
+        * supposed to interpret the __ioremap() phys_addr argument based on
+        * these printed values?
+        */
+       /* first pass: addresses with region flags */
+       for (i = 0; i < 7; i++) {
+               pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+               ret += trace_seq_printf(s, " %llx",
+                       (unsigned long long)(start |
+                       (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
+       }
+       /* second pass: sizes */
+       for (i = 0; i < 7; i++) {
+               pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
+               ret += trace_seq_printf(s, " %llx",
+                       dev->resource[i].start < dev->resource[i].end ?
+                       (unsigned long long)(end - start) + 1 : 0);
+       }
+       if (drv)
+               ret += trace_seq_printf(s, " %s\n", drv->name);
+       else
+               ret += trace_seq_printf(s, " \n");
+       return ret;
+}
+
+/*
+ * Release a header iterator: drop the reference on the device it points
+ * at and free the iterator itself. A NULL @hiter is accepted.
+ */
+static void destroy_header_iter(struct header_iter *hiter)
+{
+       if (hiter) {
+               pci_dev_put(hiter->dev);
+               kfree(hiter);
+       }
+}
+
+/*
+ * .pipe_open callback: emit the "VERSION" line and set up a header
+ * iterator positioned on the first PCI device. If the allocation fails
+ * iter->private is not set here and no PCIDEV lines get printed.
+ */
+static void mmio_pipe_open(struct trace_iterator *iter)
+{
+       struct header_iter *hiter;
+       struct trace_seq *s = &iter->seq;
+
+       trace_seq_printf(s, "VERSION 20070824\n");
+
+       hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
+       if (!hiter)
+               return;
+
+       hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
+       iter->private = hiter;
+}
+
+/*
+ * .close callback: free a header iterator that is still attached.
+ *
+ * XXX: This is not called when the pipe is closed!
+ */
+static void mmio_close(struct trace_iterator *iter)
+{
+       struct header_iter *hiter = iter->private;
+       destroy_header_iter(hiter);
+       iter->private = NULL;
+}
+
+/*
+ * Collect and clear the overrun counters of all online CPUs.
+ *
+ * Returns the total that was accumulated in iter->overrun[] before the
+ * counters were zeroed.
+ */
+static unsigned long count_overruns(struct trace_iterator *iter)
+{
+       unsigned long total = 0;
+       int i;
+
+       for_each_online_cpu(i) {
+               total += iter->overrun[i];
+               iter->overrun[i] = 0;
+       }
+
+       return total;
+}
+
+/*
+ * .read callback: produces out-of-band lines ahead of the trace data.
+ *
+ * Pending overruns are reported first, as a "MARK ... Lost N events."
+ * line. Otherwise, while a header iterator is attached, each call
+ * prints one PCIDEV line and advances to the next device; the iterator
+ * is destroyed once the device list is exhausted.
+ *
+ * Returns 0 when there is nothing to print (no iterator attached) or
+ * when trace_seq_to_user() reports -EBUSY; otherwise returns the
+ * trace_seq_to_user() result.
+ */
+static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
+                               char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+       ssize_t ret;
+       struct header_iter *hiter = iter->private;
+       struct trace_seq *s = &iter->seq;
+       unsigned long n;
+
+       n = count_overruns(iter);
+       if (n) {
+               /* XXX: This is later than where events were lost. */
+               trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
+               /* warn in the kernel log only once per reset */
+               if (!overrun_detected)
+                       pr_warning("mmiotrace has lost events.\n");
+               overrun_detected = true;
+               goto print_out;
+       }
+
+       if (!hiter)
+               return 0;
+
+       /*
+        * NOTE(review): hiter->dev is not checked for NULL here; if the
+        * pci_get_device() call in mmio_pipe_open() found no device this
+        * would dereference NULL - confirm whether that can happen.
+        */
+       mmio_print_pcidev(s, hiter->dev);
+       hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
+
+       if (!hiter->dev) {
+               destroy_header_iter(hiter);
+               iter->private = NULL;
+       }
+
+print_out:
+       ret = trace_seq_to_user(s, ubuf, cnt);
+       return (ret == -EBUSY) ? 0 : ret;
+}
+
+/*
+ * Print the TRACE_MMIO_RW entry at iter->ent as one text line.
+ *
+ * The timestamp is converted to usecs and split by do_div() into whole
+ * seconds (left in t) and the microsecond remainder.
+ *
+ * Returns 1 if trace_seq_printf() reported success, 0 otherwise.
+ */
+static int mmio_print_rw(struct trace_iterator *iter)
+{
+       struct trace_entry *entry = iter->ent;
+       struct mmiotrace_rw *rw = &entry->mmiorw;
+       struct trace_seq *s     = &iter->seq;
+       unsigned long long t    = ns2usecs(entry->t);
+       unsigned long usec_rem  = do_div(t, 1000000ULL);
+       /* unsigned long so that the argument really matches "%lu" */
+       unsigned long secs      = (unsigned long)t;
+       int ret = 1;
+
+       switch (rw->opcode) {
+       case MMIO_READ:
+               ret = trace_seq_printf(s,
+                       "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+                       rw->width, secs, usec_rem, rw->map_id,
+                       (unsigned long long)rw->phys,
+                       rw->value, rw->pc, 0);
+               break;
+       case MMIO_WRITE:
+               ret = trace_seq_printf(s,
+                       "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+                       rw->width, secs, usec_rem, rw->map_id,
+                       (unsigned long long)rw->phys,
+                       rw->value, rw->pc, 0);
+               break;
+       case MMIO_UNKNOWN_OP:
+               /* "%02x" takes an unsigned int: narrow the masked bytes */
+               ret = trace_seq_printf(s,
+                       "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+                       secs, usec_rem, rw->map_id,
+                       (unsigned long long)rw->phys,
+                       (unsigned int)((rw->value >> 16) & 0xff),
+                       (unsigned int)((rw->value >> 8) & 0xff),
+                       (unsigned int)((rw->value >> 0) & 0xff),
+                       rw->pc, 0);
+               break;
+       default:
+               ret = trace_seq_printf(s, "rw what?\n");
+               break;
+       }
+       if (ret)
+               return 1;
+       return 0;
+}
+
+/*
+ * Print the TRACE_MMIO_MAP entry at iter->ent as one text line.
+ *
+ * The timestamp is converted to usecs and split by do_div() into whole
+ * seconds (left in t) and the microsecond remainder.
+ *
+ * Returns 1 if trace_seq_printf() reported success, 0 otherwise.
+ */
+static int mmio_print_map(struct trace_iterator *iter)
+{
+       struct trace_entry *entry = iter->ent;
+       struct mmiotrace_map *m = &entry->mmiomap;
+       struct trace_seq *s     = &iter->seq;
+       unsigned long long t    = ns2usecs(entry->t);
+       unsigned long usec_rem  = do_div(t, 1000000ULL);
+       /* unsigned long so that the argument really matches "%lu" */
+       unsigned long secs      = (unsigned long)t;
+       int ret = 1;
+
+       /*
+        * NOTE(review): the opcode is read through the mmiorw member
+        * although this entry is printed via mmiomap; presumably both
+        * members place opcode at the same offset - confirm.
+        */
+       switch (entry->mmiorw.opcode) {
+       case MMIO_PROBE:
+               ret = trace_seq_printf(s,
+                       "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+                       secs, usec_rem, m->map_id,
+                       (unsigned long long)m->phys, m->virt, m->len,
+                       0UL, 0);
+               break;
+       case MMIO_UNPROBE:
+               ret = trace_seq_printf(s,
+                       "UNMAP %lu.%06lu %d 0x%lx %d\n",
+                       secs, usec_rem, m->map_id, 0UL, 0);
+               break;
+       default:
+               ret = trace_seq_printf(s, "map what?\n");
+               break;
+       }
+       if (ret)
+               return 1;
+       return 0;
+}
+
+/*
+ * .print_line callback: dispatch on the entry type.
+ *
+ * return 0 to abort printing without consuming current entry in pipe mode
+ */
+static int mmio_print_line(struct trace_iterator *iter)
+{
+       switch (iter->ent->type) {
+       case TRACE_MMIO_RW:
+               return mmio_print_rw(iter);
+       case TRACE_MMIO_MAP:
+               return mmio_print_map(iter);
+       default:
+               return 1; /* ignore unknown entries */
+       }
+}
+
+/* tracer descriptor registered under the name "mmiotrace" */
+static struct tracer mmio_tracer __read_mostly =
+{
+       .name           = "mmiotrace",
+       .init           = mmio_trace_init,
+       .reset          = mmio_trace_reset,
+       .pipe_open      = mmio_pipe_open,
+       .close          = mmio_close,
+       .read           = mmio_read,
+       .ctrl_update    = mmio_trace_ctrl_update,
+       .print_line     = mmio_print_line,
+};
+
+/* Boot-time registration; returns whatever register_tracer() returns. */
+__init static int init_mmio_trace(void)
+{
+       return register_tracer(&mmio_tracer);
+}
+device_initcall(init_mmio_trace);
+
+/*
+ * Record the MMIO access described by @rw into the current CPU's data
+ * of the active trace array.
+ *
+ * NOTE(review): unlike mmio_trace_mapping() this does not disable
+ * preemption around smp_processor_id(); presumably the callers already
+ * run non-preemptibly - confirm.
+ * NOTE(review): mmio_trace_array is dereferenced without a NULL check,
+ * although mmio_trace_reset() sets it to NULL - confirm the callers
+ * cannot run at that point.
+ */
+void mmio_trace_rw(struct mmiotrace_rw *rw)
+{
+       struct trace_array *tr = mmio_trace_array;
+       struct trace_array_cpu *data = tr->data[smp_processor_id()];
+       __trace_mmiotrace_rw(tr, data, rw);
+}
+
+/*
+ * Record the map/unmap event described by @map into the current CPU's
+ * data of the active trace array. Preemption is disabled from the CPU
+ * lookup until the entry has been written.
+ *
+ * NOTE(review): mmio_trace_array is dereferenced without a NULL check -
+ * confirm the callers cannot run after mmio_trace_reset().
+ */
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+       struct trace_array *tr = mmio_trace_array;
+       struct trace_array_cpu *data;
+
+       preempt_disable();
+       data = tr->data[smp_processor_id()];
+       __trace_mmiotrace_map(tr, data, map);
+       preempt_enable();
+}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644 (file)
index 0000000..cb817a2
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * trace context switch
+ *
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/marker.h>
+#include <linux/ftrace.h>
+
+#include "trace.h"
+
+/* trace array recorded into; its address is the marker probes' private data */
+static struct trace_array      *ctx_trace;
+/* set by start_sched_trace(), cleared by stop_sched_trace() */
+static int __read_mostly       tracer_enabled;
+/* start/stop balance; probes are registered on 0->1 and removed on 1->0 */
+static atomic_t                        sched_ref;
+
+/*
+ * Handle one context switch from @prev to @next.
+ *
+ * @private points at the trace array pointer (&ctx_trace is what gets
+ * registered with the marker). The cmdlines of both tasks are recorded
+ * even when the tracer itself is disabled; the switch entry is only
+ * written when tracer_enabled is set and data->disabled is not held by
+ * anyone else. @__rq is unused.
+ */
+static void
+sched_switch_func(void *private, void *__rq, struct task_struct *prev,
+                       struct task_struct *next)
+{
+       struct trace_array **ptr = private;
+       struct trace_array *tr = *ptr;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+
+       tracing_record_cmdline(prev);
+       tracing_record_cmdline(next);
+
+       if (!tracer_enabled)
+               return;
+
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       /* data->disabled doubles as a recursion guard */
+       disabled = atomic_inc_return(&data->disabled);
+
+       if (likely(disabled == 1))
+               tracing_sched_switch_trace(tr, data, prev, next, flags);
+
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+}
+
+/*
+ * Marker probe for "kernel_sched_schedule".
+ *
+ * Unpacks the va_list in the order given by the format string
+ * registered in tracing_sched_register() and forwards the rq, prev and
+ * next pointers to sched_switch_func(). Does nothing while sched_ref
+ * is zero.
+ */
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+                     const char *format, va_list *args)
+{
+       struct task_struct *prev;
+       struct task_struct *next;
+       struct rq *__rq;
+
+       if (!atomic_read(&sched_ref))
+               return;
+
+       /* skip prev_pid %d next_pid %d prev_state %ld */
+       (void)va_arg(*args, int);
+       (void)va_arg(*args, int);
+       (void)va_arg(*args, long);
+       __rq = va_arg(*args, typeof(__rq));
+       prev = va_arg(*args, typeof(prev));
+       next = va_arg(*args, typeof(next));
+
+       /*
+        * If tracer_switch_func only points to the local
+        * switch func, it still needs the ptr passed to it.
+        */
+       sched_switch_func(probe_data, __rq, prev, next);
+}
+
+/*
+ * Handle one wakeup of @wakee while @curr is running.
+ *
+ * @private points at the trace array pointer. Nothing is done while the
+ * tracer is disabled; otherwise the cmdline of @curr is recorded and a
+ * wakeup entry is written unless data->disabled is held by someone
+ * else. @__rq is unused.
+ */
+static void
+wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
+                       task_struct *curr)
+{
+       struct trace_array **ptr = private;
+       struct trace_array *tr = *ptr;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+
+       if (!tracer_enabled)
+               return;
+
+       tracing_record_cmdline(curr);
+
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       /* data->disabled doubles as a recursion guard */
+       disabled = atomic_inc_return(&data->disabled);
+
+       if (likely(disabled == 1))
+               tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+}
+
+/*
+ * Marker probe for "kernel_sched_wakeup" and "kernel_sched_wakeup_new".
+ *
+ * Unpacks the va_list in the order given by the registered format
+ * string, records the cmdlines of both tasks and forwards to
+ * wakeup_func(). Does nothing while the tracer is disabled.
+ */
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+                const char *format, va_list *args)
+{
+       struct task_struct *curr;
+       struct task_struct *task;
+       struct rq *__rq;
+
+       if (likely(!tracer_enabled))
+               return;
+
+       /* Skip pid %d state %ld */
+       (void)va_arg(*args, int);
+       (void)va_arg(*args, long);
+       /* now get the meat: "rq %p task %p rq->curr %p" */
+       __rq = va_arg(*args, typeof(__rq));
+       task = va_arg(*args, typeof(task));
+       curr = va_arg(*args, typeof(curr));
+
+       tracing_record_cmdline(task);
+       /* wakeup_func() records curr's cmdline once more */
+       tracing_record_cmdline(curr);
+
+       wakeup_func(probe_data, __rq, task, curr);
+}
+
+/*
+ * Restart the trace held in @tr: take a new start timestamp on tr->cpu
+ * and call tracing_reset() on the per-cpu data of every online CPU.
+ */
+static void sched_switch_reset(struct trace_array *tr)
+{
+       int cpu;
+
+       tr->time_start = ftrace_now(tr->cpu);
+
+       for_each_online_cpu(cpu)
+               tracing_reset(tr->data[cpu]);
+}
+
+/*
+ * Attach the three scheduler marker probes (wakeup, wakeup_new,
+ * schedule), each with &ctx_trace as private data.
+ *
+ * On failure the probes registered so far are removed again, in reverse
+ * order, so the function is all-or-nothing.
+ *
+ * Returns 0 on success or the error from marker_probe_register().
+ */
+static int tracing_sched_register(void)
+{
+       int ret;
+
+       ret = marker_probe_register("kernel_sched_wakeup",
+                       "pid %d state %ld ## rq %p task %p rq->curr %p",
+                       wake_up_callback,
+                       &ctx_trace);
+       if (ret) {
+               pr_info("wakeup trace: Couldn't add marker"
+                       " probe to kernel_sched_wakeup\n");
+               return ret;
+       }
+
+       ret = marker_probe_register("kernel_sched_wakeup_new",
+                       "pid %d state %ld ## rq %p task %p rq->curr %p",
+                       wake_up_callback,
+                       &ctx_trace);
+       if (ret) {
+               pr_info("wakeup trace: Couldn't add marker"
+                       " probe to kernel_sched_wakeup_new\n");
+               goto fail_deprobe;
+       }
+
+       ret = marker_probe_register("kernel_sched_schedule",
+               "prev_pid %d next_pid %d prev_state %ld "
+               "## rq %p prev %p next %p",
+               sched_switch_callback,
+               &ctx_trace);
+       if (ret) {
+               pr_info("sched trace: Couldn't add marker"
+                       " probe to kernel_sched_schedule\n");
+               goto fail_deprobe_wake_new;
+       }
+
+       return ret;
+       /* unwind: each label removes what was registered before the failure */
+fail_deprobe_wake_new:
+       marker_probe_unregister("kernel_sched_wakeup_new",
+                               wake_up_callback,
+                               &ctx_trace);
+fail_deprobe:
+       marker_probe_unregister("kernel_sched_wakeup",
+                               wake_up_callback,
+                               &ctx_trace);
+       return ret;
+}
+
+/*
+ * Detach the three scheduler marker probes, in the reverse of the order
+ * used by tracing_sched_register().
+ */
+static void tracing_sched_unregister(void)
+{
+       marker_probe_unregister("kernel_sched_schedule",
+                               sched_switch_callback,
+                               &ctx_trace);
+       marker_probe_unregister("kernel_sched_wakeup_new",
+                               wake_up_callback,
+                               &ctx_trace);
+       marker_probe_unregister("kernel_sched_wakeup",
+                               wake_up_callback,
+                               &ctx_trace);
+}
+
+/*
+ * Take a reference on the scheduler probes; the first reference
+ * registers them. The result of the registration is not checked.
+ */
+static void tracing_start_sched_switch(void)
+{
+       if (atomic_inc_return(&sched_ref) == 1)
+               tracing_sched_register();
+}
+
+/*
+ * Drop a reference on the scheduler probes; dropping the last one
+ * unregisters them.
+ */
+static void tracing_stop_sched_switch(void)
+{
+       if (atomic_dec_and_test(&sched_ref))
+               tracing_sched_unregister();
+}
+
+/*
+ * Non-static entry point for taking a reference on the scheduler
+ * probes; simply forwards to tracing_start_sched_switch().
+ */
+void tracing_start_cmdline_record(void)
+{
+       tracing_start_sched_switch();
+}
+
+/*
+ * Non-static entry point for dropping a reference on the scheduler
+ * probes; simply forwards to tracing_stop_sched_switch().
+ */
+void tracing_stop_cmdline_record(void)
+{
+       tracing_stop_sched_switch();
+}
+
+/*
+ * Start the sched_switch tracer: reset the data, take a reference on
+ * the scheduler probes and only then raise tracer_enabled.
+ */
+static void start_sched_trace(struct trace_array *tr)
+{
+       sched_switch_reset(tr);
+       tracing_start_cmdline_record();
+       tracer_enabled = 1;
+}
+
+/*
+ * Stop the sched_switch tracer: clear tracer_enabled first, then drop
+ * the reference on the scheduler probes. @tr is unused.
+ */
+static void stop_sched_trace(struct trace_array *tr)
+{
+       tracer_enabled = 0;
+       tracing_stop_cmdline_record();
+}
+
+/*
+ * .init callback: publish @tr for the probes and start tracing right
+ * away when tr->ctrl is set.
+ */
+static void sched_switch_trace_init(struct trace_array *tr)
+{
+       ctx_trace = tr;
+
+       if (!tr->ctrl)
+               return;
+
+       start_sched_trace(tr);
+}
+
+/* .reset callback: stop tracing if tr->ctrl says it is running. */
+static void sched_switch_trace_reset(struct trace_array *tr)
+{
+       if (!tr->ctrl)
+               return;
+
+       stop_sched_trace(tr);
+}
+
+/* .ctrl_update callback: follow the new value of tr->ctrl. */
+static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+{
+       if (!tr->ctrl) {
+               stop_sched_trace(tr);
+               return;
+       }
+
+       /* When starting a new trace, reset the buffers */
+       start_sched_trace(tr);
+}
+
+/*
+ * Tracer descriptor registered under the name "sched_switch". The
+ * selftest hook is only wired up when CONFIG_FTRACE_SELFTEST is set.
+ */
+static struct tracer sched_switch_trace __read_mostly =
+{
+       .name           = "sched_switch",
+       .init           = sched_switch_trace_init,
+       .reset          = sched_switch_trace_reset,
+       .ctrl_update    = sched_switch_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest    = trace_selftest_startup_sched_switch,
+#endif
+};
+
+/*
+ * Boot-time setup of the sched_switch tracer.
+ *
+ * If sched_ref is already non-zero by the time this runs, the probes
+ * are registered here. Returns the registration error if there is one,
+ * otherwise the result of register_tracer().
+ */
+__init static int init_sched_switch_trace(void)
+{
+       int ret = 0;
+
+       if (atomic_read(&sched_ref))
+               ret = tracing_sched_register();
+       if (ret) {
+               pr_info("error registering scheduler trace\n");
+               return ret;
+       }
+       return register_tracer(&sched_switch_trace);
+}
+device_initcall(init_sched_switch_trace);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644 (file)
index 0000000..3c8d61d
--- /dev/null
@@ -0,0 +1,448 @@
+/*
+ * trace task wakeup timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Based on code from the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/marker.h>
+
+#include "trace.h"
+
+/* trace array in use; set by wakeup_tracer_init(), its address is the probe data */
+static struct trace_array      *wakeup_trace;
+/* set to 1 by start_wakeup_tracer(), cleared by stop_wakeup_tracer() */
+static int __read_mostly       tracer_enabled;
+
+/*
+ * The wakeup currently being timed. wakeup_task holds a reference
+ * (get_task_struct() in wakeup_check_start(), put_task_struct() in
+ * __wakeup_reset()); wakeup_cpu and wakeup_prio are the task_cpu() and
+ * prio sampled at wakeup time.
+ */
+static struct task_struct      *wakeup_task;
+static int                     wakeup_cpu;
+/* -1 converts to the largest unsigned value, i.e. "no task tracked yet" */
+static unsigned                        wakeup_prio = -1;
+
+/* held whenever __wakeup_reset() runs (it asserts this) and while the state above is updated */
+static DEFINE_SPINLOCK(wakeup_lock);
+
+static void __wakeup_reset(struct trace_array *tr);
+
+#ifdef CONFIG_FTRACE
+/*
+ * The wakeup tracer uses its own tracer function to keep the overhead down.
+ *
+ * Function-trace callback (installed through trace_ops below). It
+ * records @ip/@parent_ip only while a wakeup is being timed
+ * (wakeup_task is set) and only when called on the CPU that
+ * task_cpu(wakeup_task) reports; everything else returns early.
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct trace_array *tr = wakeup_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int resched;
+       int cpu;
+
+       /* cheap unlocked check; repeated under wakeup_lock below */
+       if (likely(!wakeup_task))
+               return;
+
+       /* sample need_resched() before disabling preemption, see the end of the function */
+       resched = need_resched();
+       preempt_disable_notrace();
+
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       /* per-cpu recursion guard: only the first entry on this CPU may trace */
+       disabled = atomic_inc_return(&data->disabled);
+       if (unlikely(disabled != 1))
+               goto out;
+
+       spin_lock_irqsave(&wakeup_lock, flags);
+
+       if (unlikely(!wakeup_task))
+               goto unlock;
+
+       /*
+        * The task can't disappear because it needs to
+        * wake up first, and we have the wakeup_lock.
+        */
+       if (task_cpu(wakeup_task) != cpu)
+               goto unlock;
+
+       trace_function(tr, data, ip, parent_ip, flags);
+
+ unlock:
+       spin_unlock_irqrestore(&wakeup_lock, flags);
+
+ out:
+       atomic_dec(&data->disabled);
+
+       /*
+        * To prevent recursion from the scheduler, if the
+        * resched flag was set before we entered, then
+        * don't reschedule.
+        */
+       if (resched)
+               preempt_enable_no_resched_notrace();
+       else
+               preempt_enable_notrace();
+}
+
+/* ftrace_ops registered/unregistered by start/stop_wakeup_tracer() */
+static struct ftrace_ops trace_ops __read_mostly =
+{
+       .func = wakeup_tracer_call,
+};
+#endif /* CONFIG_FTRACE */
+
+/*
+ * Decide whether a measured latency @delta is worth recording.
+ *
+ * With a threshold configured (tracing_thresh non-zero) every latency
+ * at or above the threshold is reported; without one, only a latency
+ * strictly greater than the current tracing_max_latency is.
+ * Returns 1 to report, 0 to ignore.
+ */
+static int report_latency(cycle_t delta)
+{
+       if (tracing_thresh)
+               return delta >= tracing_thresh;
+
+       return delta > tracing_max_latency;
+}
+
+/*
+ * Context-switch probe body for the wakeup tracer.
+ *
+ * @private: pointer to the trace_array pointer registered with the marker
+ * @rq:      run queue passed by the marker (unused here)
+ * @prev:    task being switched out (unused here)
+ * @next:    task being switched in
+ *
+ * When @next is the task whose wakeup is being timed, the latency is the
+ * time from the preempt_timestamp stored at wakeup until now. If
+ * report_latency() accepts it, tracing_max_latency is updated and the
+ * trace is saved with update_max_tr(). Once the locked section is
+ * reached, the wakeup state is always reset on the way out.
+ */
+static void notrace
+wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
+       struct task_struct *next)
+{
+       struct trace_array **ptr = private;
+       struct trace_array *tr = *ptr;
+       struct trace_array_cpu *data;
+       cycle_t T0, T1, delta;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+
+       if (unlikely(!tracer_enabled))
+               return;
+
+       /*
+        * When we start a new trace, we set wakeup_task to NULL
+        * and then set tracer_enabled = 1. We want to make sure
+        * that another CPU does not see the tracer_enabled = 1
+        * and the wakeup_task with an older task, that might
+        * actually be the same as next.
+        */
+       smp_rmb();
+
+       if (next != wakeup_task)
+               return;
+
+       /* The task we are waiting for is waking up */
+       data = tr->data[wakeup_cpu];
+
+       /* disable local data, not wakeup_cpu data */
+       cpu = raw_smp_processor_id();
+       disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+       /*
+        * Recursion guard: a count other than 1 is the exceptional case,
+        * as in wakeup_tracer_call() and wakeup_check_start().
+        */
+       if (unlikely(disabled != 1))
+               goto out;
+
+       spin_lock_irqsave(&wakeup_lock, flags);
+
+       /* We could race with grabbing wakeup_lock */
+       if (unlikely(!tracer_enabled || next != wakeup_task))
+               goto out_unlock;
+
+       trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+
+       /*
+        * Work on the raw timestamps only; no usecs conversion is
+        * needed to decide whether this latency is a new maximum.
+        */
+       T0 = data->preempt_timestamp;
+       T1 = ftrace_now(cpu);
+       delta = T1-T0;
+
+       if (!report_latency(delta))
+               goto out_unlock;
+
+       tracing_max_latency = delta;
+
+       update_max_tr(tr, wakeup_task, wakeup_cpu);
+
+out_unlock:
+       /* this wakeup is finished either way: drop the task, clear the buffers */
+       __wakeup_reset(tr);
+       spin_unlock_irqrestore(&wakeup_lock, flags);
+out:
+       atomic_dec(&tr->data[cpu]->disabled);
+}
+
+/*
+ * Marker probe for "kernel_sched_schedule".
+ *
+ * Unpacks the marker's varargs, which must be consumed in exactly the
+ * order of the format string registered in start_wakeup_tracer()
+ * ("prev_pid %d next_pid %d prev_state %ld ## rq %p prev %p next %p"),
+ * records prev's command line and hands over to wakeup_sched_switch().
+ * @probe_data is the &wakeup_trace pointer given at registration.
+ */
+static notrace void
+sched_switch_callback(void *probe_data, void *call_data,
+                     const char *format, va_list *args)
+{
+       struct task_struct *prev;
+       struct task_struct *next;
+       struct rq *__rq;
+
+       /* skip prev_pid %d next_pid %d prev_state %ld */
+       (void)va_arg(*args, int);
+       (void)va_arg(*args, int);
+       (void)va_arg(*args, long);
+       __rq = va_arg(*args, typeof(__rq));
+       prev = va_arg(*args, typeof(prev));
+       next = va_arg(*args, typeof(next));
+
+       tracing_record_cmdline(prev);
+
+       /*
+        * If tracer_switch_func only points to the local
+        * switch func, it still needs the ptr passed to it.
+        */
+       wakeup_sched_switch(probe_data, __rq, prev, next);
+}
+
+/*
+ * Forget the wakeup currently being timed. Caller must hold wakeup_lock.
+ *
+ * Resets the trace data of every possible CPU, puts wakeup_cpu and
+ * wakeup_prio back to -1, and drops the reference held on wakeup_task
+ * (if any) before clearing the pointer.
+ */
+static void __wakeup_reset(struct trace_array *tr)
+{
+       int cpu;
+
+       assert_spin_locked(&wakeup_lock);
+
+       for_each_possible_cpu(cpu)
+               tracing_reset(tr->data[cpu]);
+
+       wakeup_cpu = -1;
+       wakeup_prio = -1;
+
+       if (wakeup_task) {
+               put_task_struct(wakeup_task);
+               wakeup_task = NULL;
+       }
+}
+
+/*
+ * Locked wrapper around __wakeup_reset(): takes wakeup_lock with
+ * interrupts disabled for the duration of the reset.
+ */
+static void wakeup_reset(struct trace_array *tr)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&wakeup_lock, flags);
+       __wakeup_reset(tr);
+       spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+/*
+ * Possibly start timing the wakeup of @p.
+ *
+ * @tr:   trace array to record into
+ * @p:    task being woken
+ * @curr: task currently running on the run queue
+ *
+ * Only an RT task whose prio value is below both the currently tracked
+ * wakeup_prio and curr->prio is accepted. On acceptance the previous
+ * wakeup state is reset, a reference on @p is taken, and the start
+ * timestamp is stored in the per-cpu data of the CPU @p is assigned to.
+ */
+static void
+wakeup_check_start(struct trace_array *tr, struct task_struct *p,
+                  struct task_struct *curr)
+{
+       int cpu = smp_processor_id();
+       unsigned long flags;
+       long disabled;
+
+       if (likely(!rt_task(p)) ||
+                       p->prio >= wakeup_prio ||
+                       p->prio >= curr->prio)
+               return;
+
+       /* per-cpu recursion guard, released at "out" */
+       disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+       if (unlikely(disabled != 1))
+               goto out;
+
+       /* interrupts should be off from try_to_wake_up */
+       spin_lock(&wakeup_lock);
+
+       /* check for races. */
+       if (!tracer_enabled || p->prio >= wakeup_prio)
+               goto out_locked;
+
+       /* reset the trace */
+       __wakeup_reset(tr);
+
+       wakeup_cpu = task_cpu(p);
+       wakeup_prio = p->prio;
+
+       /* the reference is dropped again by __wakeup_reset() */
+       wakeup_task = p;
+       get_task_struct(wakeup_task);
+
+       /* flags are only read here, for the trace entry; irqs are left as they are */
+       local_save_flags(flags);
+
+       tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
+       trace_function(tr, tr->data[wakeup_cpu],
+                      CALLER_ADDR1, CALLER_ADDR2, flags);
+
+out_locked:
+       spin_unlock(&wakeup_lock);
+out:
+       atomic_dec(&tr->data[cpu]->disabled);
+}
+
+/*
+ * Marker probe for "kernel_sched_wakeup" and "kernel_sched_wakeup_new".
+ *
+ * Does nothing while the tracer is disabled. Otherwise unpacks the
+ * marker's varargs in the order of the registered format string
+ * ("pid %d state %ld ## rq %p task %p rq->curr %p"), records the command
+ * lines of both tasks and calls wakeup_check_start().
+ * @probe_data is the &wakeup_trace pointer given at registration.
+ */
+static notrace void
+wake_up_callback(void *probe_data, void *call_data,
+                const char *format, va_list *args)
+{
+       struct trace_array **ptr = probe_data;
+       struct trace_array *tr = *ptr;
+       struct task_struct *curr;
+       struct task_struct *task;
+       struct rq *__rq;
+
+       if (likely(!tracer_enabled))
+               return;
+
+       /* Skip pid %d state %ld */
+       (void)va_arg(*args, int);
+       (void)va_arg(*args, long);
+       /* now get the meat: "rq %p task %p rq->curr %p" */
+       __rq = va_arg(*args, typeof(__rq));
+       task = va_arg(*args, typeof(task));
+       curr = va_arg(*args, typeof(curr));
+
+       tracing_record_cmdline(task);
+       tracing_record_cmdline(curr);
+
+       wakeup_check_start(tr, task, curr);
+}
+
+/*
+ * Start the wakeup tracer on @tr.
+ *
+ * Registers the three marker probes (wakeup, wakeup_new, schedule),
+ * resets the wakeup state, registers the function-trace callback and
+ * finally sets tracer_enabled. If a probe cannot be registered, the
+ * ones registered so far are removed again and the tracer stays
+ * disabled; the failure is only logged because this function returns
+ * void.
+ */
+static void start_wakeup_tracer(struct trace_array *tr)
+{
+       int ret;
+
+       ret = marker_probe_register("kernel_sched_wakeup",
+                       "pid %d state %ld ## rq %p task %p rq->curr %p",
+                       wake_up_callback,
+                       &wakeup_trace);
+       if (ret) {
+               pr_info("wakeup trace: Couldn't add marker"
+                       " probe to kernel_sched_wakeup\n");
+               return;
+       }
+
+       ret = marker_probe_register("kernel_sched_wakeup_new",
+                       "pid %d state %ld ## rq %p task %p rq->curr %p",
+                       wake_up_callback,
+                       &wakeup_trace);
+       if (ret) {
+               pr_info("wakeup trace: Couldn't add marker"
+                       " probe to kernel_sched_wakeup_new\n");
+               goto fail_deprobe;
+       }
+
+       ret = marker_probe_register("kernel_sched_schedule",
+               "prev_pid %d next_pid %d prev_state %ld "
+               "## rq %p prev %p next %p",
+               sched_switch_callback,
+               &wakeup_trace);
+       if (ret) {
+               /* same prefix as the messages above: this is the wakeup tracer */
+               pr_info("wakeup trace: Couldn't add marker"
+                       " probe to kernel_sched_schedule\n");
+               goto fail_deprobe_wake_new;
+       }
+
+       wakeup_reset(tr);
+
+       /*
+        * Don't let the tracer_enabled = 1 show up before
+        * the wakeup_task is reset. This may be overkill since
+        * wakeup_reset does a spin_unlock after setting the
+        * wakeup_task to NULL, but I want to be safe.
+        * This is a slow path anyway.
+        */
+       smp_wmb();
+
+       register_ftrace_function(&trace_ops);
+
+       tracer_enabled = 1;
+
+       return;
+fail_deprobe_wake_new:
+       marker_probe_unregister("kernel_sched_wakeup_new",
+                               wake_up_callback,
+                               &wakeup_trace);
+fail_deprobe:
+       marker_probe_unregister("kernel_sched_wakeup",
+                               wake_up_callback,
+                               &wakeup_trace);
+}
+
+/*
+ * Stop the wakeup tracer: clear tracer_enabled first, then unregister
+ * the function-trace callback and the three marker probes, in the
+ * reverse order of start_wakeup_tracer(). @tr is not used here, and the
+ * wakeup state (wakeup_task etc.) is not reset by this function.
+ */
+static void stop_wakeup_tracer(struct trace_array *tr)
+{
+       tracer_enabled = 0;
+       unregister_ftrace_function(&trace_ops);
+       marker_probe_unregister("kernel_sched_schedule",
+                               sched_switch_callback,
+                               &wakeup_trace);
+       marker_probe_unregister("kernel_sched_wakeup_new",
+                               wake_up_callback,
+                               &wakeup_trace);
+       marker_probe_unregister("kernel_sched_wakeup",
+                               wake_up_callback,
+                               &wakeup_trace);
+}
+
+/*
+ * Tracer ->init hook: remember @tr in wakeup_trace (the probes receive
+ * its address as their private data) and start tracing if tr->ctrl is
+ * set.
+ */
+static void wakeup_tracer_init(struct trace_array *tr)
+{
+       wakeup_trace = tr;
+
+       if (tr->ctrl)
+               start_wakeup_tracer(tr);
+}
+
+/*
+ * Tracer ->reset hook: if tracing is switched on (tr->ctrl), stop the
+ * tracer and then reset the wakeup state so the reference on
+ * wakeup_task is dropped.
+ */
+static void wakeup_tracer_reset(struct trace_array *tr)
+{
+       if (tr->ctrl) {
+               stop_wakeup_tracer(tr);
+               /* make sure we put back any tasks we are tracing */
+               wakeup_reset(tr);
+       }
+}
+
+/*
+ * Tracer ->ctrl_update hook: start or stop the tracer according to the
+ * current value of tr->ctrl.
+ *
+ * NOTE(review): unlike wakeup_tracer_reset(), the stop path here does
+ * not call wakeup_reset(), so a reference on wakeup_task appears to
+ * stay held until the tracer is started again - confirm this is
+ * intended.
+ */
+static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+{
+       if (tr->ctrl)
+               start_wakeup_tracer(tr);
+       else
+               stop_wakeup_tracer(tr);
+}
+
+/*
+ * Tracer ->open hook: pause tracing while the trace is being read.
+ * wakeup_tracer_close() resumes it.
+ */
+static void wakeup_tracer_open(struct trace_iterator *iter)
+{
+       /* stop the trace while dumping */
+       if (iter->tr->ctrl)
+               stop_wakeup_tracer(iter->tr);
+}
+
+/*
+ * Tracer ->close hook: resume tracing after a read. Starting the tracer
+ * goes through wakeup_reset(), so any wakeup tracked before the read is
+ * discarded.
+ */
+static void wakeup_tracer_close(struct trace_iterator *iter)
+{
+       /* forget about any processes we were recording */
+       if (iter->tr->ctrl)
+               start_wakeup_tracer(iter->tr);
+}
+
+/*
+ * Tracer descriptor for the "wakeup" tracer; registered from
+ * init_wakeup_tracer(). The selftest hook is only wired up when
+ * CONFIG_FTRACE_SELFTEST is set.
+ */
+static struct tracer wakeup_tracer __read_mostly =
+{
+       .name           = "wakeup",
+       .init           = wakeup_tracer_init,
+       .reset          = wakeup_tracer_reset,
+       .open           = wakeup_tracer_open,
+       .close          = wakeup_tracer_close,
+       .ctrl_update    = wakeup_tracer_ctrl_update,
+       .print_max      = 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest    = trace_selftest_startup_wakeup,
+#endif
+};
+
+/*
+ * Boot-time setup (device_initcall) of the wakeup tracer: registers it
+ * and passes the result of register_tracer() back to the caller.
+ */
+__init static int init_wakeup_tracer(void)
+{
+       return register_tracer(&wakeup_tracer);
+}
+device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644 (file)
index 0000000..0911b7e
--- /dev/null
@@ -0,0 +1,563 @@
+/* Include in trace.c */
+
+#include <linux/kthread.h>
+#include <linux/delay.h>
+
+/*
+ * Return 1 if @entry carries one of the entry types the selftests
+ * accept, 0 for anything else.
+ */
+static inline int trace_valid_entry(struct trace_entry *entry)
+{
+       switch (entry->type) {
+       case TRACE_FN:
+       case TRACE_CTX:
+       case TRACE_WAKE:
+       case TRACE_STACK:
+       case TRACE_SPECIAL:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Sanity-check the trace buffer of one CPU.
+ *
+ * Walks the page list of @data for tr->entries slots and verifies that
+ *  - the head page is the first page on the list,
+ *  - each of the first data->trace_idx entries has a valid type,
+ *  - the page list ends exactly when tr->entries slots were visited.
+ *
+ * Returns 0 if everything is consistent. On any mismatch it sets
+ * tracing_disabled, prints a diagnostic and returns -1.
+ */
+static int
+trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
+{
+       struct trace_entry *entries;
+       struct page *page;
+       int idx = 0;
+       int i;
+
+       BUG_ON(list_empty(&data->trace_pages));
+       page = list_entry(data->trace_pages.next, struct page, lru);
+       entries = page_address(page);
+
+       check_pages(data);
+       if (head_page(data) != entries)
+               goto failed;
+
+       /*
+        * The starting trace buffer always has valid elements,
+        * if any element exists.
+        */
+       entries = head_page(data);
+
+       /* i counts slots over the whole buffer, idx the slot within the current page */
+       for (i = 0; i < tr->entries; i++) {
+
+               if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
+                       printk(KERN_CONT ".. invalid entry %d ",
+                               entries[idx].type);
+                       goto failed;
+               }
+
+               idx++;
+               if (idx >= ENTRIES_PER_PAGE) {
+                       page = virt_to_page(entries);
+                       if (page->lru.next == &data->trace_pages) {
+                               /* last page: it may only end on the very last slot */
+                               if (i != tr->entries - 1) {
+                                       printk(KERN_CONT ".. entries buffer mismatch");
+                                       goto failed;
+                               }
+                       } else {
+                               page = list_entry(page->lru.next, struct page, lru);
+                               entries = page_address(page);
+                       }
+                       idx = 0;
+               }
+       }
+
+       /* after tr->entries slots we must be on the last page of the list */
+       page = virt_to_page(entries);
+       if (page->lru.next != &data->trace_pages) {
+               printk(KERN_CONT ".. too many entries");
+               goto failed;
+       }
+
+       return 0;
+
+ failed:
+       /* disable tracing */
+       tracing_disabled = 1;
+       printk(KERN_CONT ".. corrupted trace buffer .. ");
+       return -1;
+}
+
+/*
+ * Test the trace buffer to see if all the elements
+ * are still sane.
+ *
+ * Runs trace_test_buffer_cpu() for every possible CPU that has a head
+ * page, with interrupts disabled and ftrace_max_lock held, and stops at
+ * the first failure. If @count is non-NULL it receives the sum of
+ * trace_idx over the CPUs visited. Returns 0 on success, otherwise the
+ * error from trace_test_buffer_cpu().
+ */
+static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+{
+       unsigned long flags, cnt = 0;
+       int cpu, ret = 0;
+
+       /* Don't allow flipping of max traces now */
+       raw_local_irq_save(flags);
+       __raw_spin_lock(&ftrace_max_lock);
+       for_each_possible_cpu(cpu) {
+               if (!head_page(tr->data[cpu]))
+                       continue;
+
+               cnt += tr->data[cpu]->trace_idx;
+
+               ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
+               if (ret)
+                       break;
+       }
+       __raw_spin_unlock(&ftrace_max_lock);
+       raw_local_irq_restore(flags);
+
+       if (count)
+               *count = cnt;
+
+       return ret;
+}
+
+#ifdef CONFIG_FTRACE
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
+/*
+ * Test dynamic code modification and ftrace filters.
+ *
+ * @trace: tracer under test
+ * @tr:    trace array to record into
+ * @func:  the function to filter on, passed as a pointer so the
+ *         compiler cannot optimize the call away
+ *
+ * Sets a filter that matches only @func, verifies that nothing is
+ * traced while @func is not called, and that exactly one entry shows up
+ * after calling it once. ftrace_enabled and tracer_enabled are restored
+ * to their values on entry on every return path.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
+                                          struct trace_array *tr,
+                                          int (*func)(void))
+{
+       unsigned long count;
+       int ret;
+       int save_ftrace_enabled = ftrace_enabled;
+       int save_tracer_enabled = tracer_enabled;
+       char *func_name;
+
+       /* The ftrace test PASSED */
+       printk(KERN_CONT "PASSED\n");
+       pr_info("Testing dynamic ftrace: ");
+
+       /* enable tracing, and record the filter function */
+       ftrace_enabled = 1;
+       tracer_enabled = 1;
+
+       /* passed in by parameter to fool gcc from optimizing */
+       func();
+
+       /* update the records */
+       ret = ftrace_force_update();
+       if (ret) {
+               printk(KERN_CONT ".. ftraced failed .. ");
+               /* no filter has been set yet: only undo the forced enables */
+               ftrace_enabled = save_ftrace_enabled;
+               tracer_enabled = save_tracer_enabled;
+               return ret;
+       }
+
+       /*
+        * Some archs *cough*PowerPC*cough* add characters to the
+        * start of the function names. We simply put a '*' to
+        * accommodate them.
+        */
+       func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+
+       /* filter only on our function */
+       ftrace_set_filter(func_name, strlen(func_name), 1);
+
+       /* enable tracing */
+       tr->ctrl = 1;
+       trace->init(tr);
+       /* Sleep for a 1/10 of a second */
+       msleep(100);
+
+       /* we should have nothing in the buffer */
+       ret = trace_test_buffer(tr, &count);
+       if (ret)
+               goto out;
+
+       if (count) {
+               ret = -1;
+               printk(KERN_CONT ".. filter did not filter .. ");
+               goto out;
+       }
+
+       /* call our function again */
+       func();
+
+       /* sleep again */
+       msleep(100);
+
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       ftrace_enabled = 0;
+
+       /* check the trace buffer */
+       ret = trace_test_buffer(tr, &count);
+       trace->reset(tr);
+
+       /* we should only have one item */
+       if (!ret && count != 1) {
+               /* count is unsigned long, hence %lu */
+               printk(KERN_CONT ".. filter failed count=%lu ..", count);
+               ret = -1;
+               goto out;
+       }
+ out:
+       ftrace_enabled = save_ftrace_enabled;
+       tracer_enabled = save_tracer_enabled;
+
+       /* Enable tracing on all functions again */
+       ftrace_set_filter(NULL, 0, 1);
+
+       return ret;
+}
+#else
+# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+#endif /* CONFIG_DYNAMIC_FTRACE */
+/*
+ * Simple verification test of ftrace function tracer.
+ * Enable ftrace, sleep 1/10 second, and then read the trace
+ * buffer to see if all is in order.
+ *
+ * On success the dynamic-ftrace test is run as well (a no-op returning
+ * 0 without CONFIG_DYNAMIC_FTRACE). ftrace_enabled and tracer_enabled
+ * are restored before returning, and any failure after tracing was
+ * started ends in ftrace_kill(). Returns 0 on success.
+ */
+int
+trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long count;
+       int ret;
+       int save_ftrace_enabled = ftrace_enabled;
+       int save_tracer_enabled = tracer_enabled;
+
+       /* make sure msleep has been recorded */
+       msleep(1);
+
+       /* force the recorded functions to be traced */
+       ret = ftrace_force_update();
+       if (ret) {
+               printk(KERN_CONT ".. ftraced failed .. ");
+               return ret;
+       }
+
+       /* start the tracing */
+       ftrace_enabled = 1;
+       tracer_enabled = 1;
+
+       tr->ctrl = 1;
+       trace->init(tr);
+       /* Sleep for a 1/10 of a second */
+       msleep(100);
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       ftrace_enabled = 0;
+
+       /* check the trace buffer */
+       ret = trace_test_buffer(tr, &count);
+       trace->reset(tr);
+
+       if (!ret && !count) {
+               printk(KERN_CONT ".. no entries found ..");
+               ret = -1;
+               goto out;
+       }
+
+       /* NOTE: this overwrites ret, including a failure from trace_test_buffer() above */
+       ret = trace_selftest_startup_dynamic_tracing(trace, tr,
+                                                    DYN_FTRACE_TEST_NAME);
+
+ out:
+       ftrace_enabled = save_ftrace_enabled;
+       tracer_enabled = save_tracer_enabled;
+
+       /* kill ftrace totally if we failed */
+       if (ret)
+               ftrace_kill();
+
+       return ret;
+}
+#endif /* CONFIG_FTRACE */
+
+#ifdef CONFIG_IRQSOFF_TRACER
+/*
+ * Selftest for the irqsoff tracer: start the tracer, keep interrupts
+ * disabled for udelay(100), stop the tracer and check both the live
+ * buffer (@tr) and max_tr. Fails with -1 if max_tr holds no entries.
+ * tracing_max_latency is restored before returning.
+ */
+int
+trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long save_max = tracing_max_latency;
+       unsigned long count;
+       int ret;
+
+       /* start the tracing */
+       tr->ctrl = 1;
+       trace->init(tr);
+       /* reset the max latency */
+       tracing_max_latency = 0;
+       /* disable interrupts for a bit */
+       local_irq_disable();
+       udelay(100);
+       local_irq_enable();
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       /* check both trace buffers */
+       ret = trace_test_buffer(tr, NULL);
+       if (!ret)
+               ret = trace_test_buffer(&max_tr, &count);
+       trace->reset(tr);
+
+       /* count is only set when the max_tr check ran; !ret guards the read */
+       if (!ret && !count) {
+               printk(KERN_CONT ".. no entries found ..");
+               ret = -1;
+       }
+
+       tracing_max_latency = save_max;
+
+       return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+/*
+ * Selftest for the preemptoff tracer: start the tracer, keep
+ * preemption disabled for udelay(100), stop the tracer and check both
+ * the live buffer (@tr) and max_tr. Fails with -1 if max_tr holds no
+ * entries. tracing_max_latency is restored before returning.
+ */
+int
+trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long save_max = tracing_max_latency;
+       unsigned long count;
+       int ret;
+
+       /* start the tracing */
+       tr->ctrl = 1;
+       trace->init(tr);
+       /* reset the max latency */
+       tracing_max_latency = 0;
+       /* disable preemption for a bit */
+       preempt_disable();
+       udelay(100);
+       preempt_enable();
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       /* check both trace buffers */
+       ret = trace_test_buffer(tr, NULL);
+       if (!ret)
+               ret = trace_test_buffer(&max_tr, &count);
+       trace->reset(tr);
+
+       /* count is only set when the max_tr check ran; !ret guards the read */
+       if (!ret && !count) {
+               printk(KERN_CONT ".. no entries found ..");
+               ret = -1;
+       }
+
+       tracing_max_latency = save_max;
+
+       return ret;
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+
+#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
+/*
+ * Selftest for the combined preempt/irqs-off tracer. Runs two passes;
+ * each disables preemption and interrupts for udelay(100) and then
+ * checks both the live buffer (@tr) and max_tr, failing with -1 if
+ * max_tr holds no entries. The tracer is reset and tracing_max_latency
+ * restored on every path that reaches "out".
+ *
+ * NOTE(review): the second pass is commented as "disabling interrupts
+ * first this time", but its disable/enable sequence is identical to
+ * the first pass - confirm which order was intended.
+ */
+int
+trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long save_max = tracing_max_latency;
+       unsigned long count;
+       int ret;
+
+       /* start the tracing */
+       tr->ctrl = 1;
+       trace->init(tr);
+
+       /* reset the max latency */
+       tracing_max_latency = 0;
+
+       /* disable preemption and interrupts for a bit */
+       preempt_disable();
+       local_irq_disable();
+       udelay(100);
+       preempt_enable();
+       /* reverse the order of preempt vs irqs */
+       local_irq_enable();
+
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       /* check both trace buffers */
+       ret = trace_test_buffer(tr, NULL);
+       if (ret)
+               goto out;
+
+       ret = trace_test_buffer(&max_tr, &count);
+       if (ret)
+               goto out;
+
+       if (!ret && !count) {
+               printk(KERN_CONT ".. no entries found ..");
+               ret = -1;
+               goto out;
+       }
+
+       /* do the test by disabling interrupts first this time */
+       tracing_max_latency = 0;
+       /* restart through ctrl_update(); init() already ran for the first pass */
+       tr->ctrl = 1;
+       trace->ctrl_update(tr);
+       preempt_disable();
+       local_irq_disable();
+       udelay(100);
+       preempt_enable();
+       /* reverse the order of preempt vs irqs */
+       local_irq_enable();
+
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       /* check both trace buffers */
+       ret = trace_test_buffer(tr, NULL);
+       if (ret)
+               goto out;
+
+       ret = trace_test_buffer(&max_tr, &count);
+
+       if (!ret && !count) {
+               printk(KERN_CONT ".. no entries found ..");
+               ret = -1;
+               goto out;
+       }
+
+ out:
+       trace->reset(tr);
+       tracing_max_latency = save_max;
+
+       return ret;
+}
+#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
+
+#ifdef CONFIG_SCHED_TRACER
+/*
+ * Helper thread for the wakeup selftest.
+ *
+ * @data: the struct completion the test waits on
+ *
+ * Switches itself to SCHED_FIFO priority 5, signals the completion,
+ * goes to sleep until the test wakes it, and then idles in 100ms
+ * sleeps until kthread_stop() is called. Always returns 0.
+ */
+static int trace_wakeup_test_thread(void *data)
+{
+       /* Make this a RT thread, doesn't need to be too high */
+       struct sched_param param = { .sched_priority = 5 };
+       struct completion *x = data;
+
+       sched_setscheduler(current, SCHED_FIFO, &param);
+
+       /* Make it know we have a new prio */
+       complete(x);
+
+       /* now go to sleep and let the test wake us up */
+       set_current_state(TASK_INTERRUPTIBLE);
+       schedule();
+
+       /* we are awake, now wait to disappear */
+       while (!kthread_should_stop()) {
+               /*
+                * This is an RT task, do short sleeps to let
+                * others run.
+                */
+               msleep(100);
+       }
+
+       return 0;
+}
+
+/*
+ * Selftest for the wakeup tracer: create an RT helper thread, wait
+ * until it has raised its priority, start the tracer, let the helper
+ * go to sleep, wake it up, then stop the tracer and check both the
+ * live buffer (@tr) and max_tr. Fails with -1 if the thread cannot be
+ * created or if max_tr holds no entries. tracing_max_latency is
+ * restored and the helper thread stopped before returning.
+ */
+int
+trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long save_max = tracing_max_latency;
+       struct task_struct *p;
+       struct completion isrt;
+       unsigned long count;
+       int ret;
+
+       init_completion(&isrt);
+
+       /* create a high prio thread */
+       p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
+       if (IS_ERR(p)) {
+               printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
+               return -1;
+       }
+
+       /* make sure the thread is running at an RT prio */
+       wait_for_completion(&isrt);
+
+       /* start the tracing */
+       tr->ctrl = 1;
+       trace->init(tr);
+       /* reset the max latency */
+       tracing_max_latency = 0;
+
+       /* sleep to let the RT thread sleep too */
+       msleep(100);
+
+       /*
+        * Yes this is slightly racy. It is possible that for some
+        * strange reason that the RT thread we created, did not
+        * call schedule for 100ms after doing the completion,
+        * and we do a wakeup on a task that already is awake.
+        * But that is extremely unlikely, and the worst thing that
+        * happens in such a case, is that we disable tracing.
+        * Honestly, if this race does happen something is horribly
+        * wrong with the system.
+        */
+
+       wake_up_process(p);
+
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       /* check both trace buffers */
+       ret = trace_test_buffer(tr, NULL);
+       if (!ret)
+               ret = trace_test_buffer(&max_tr, &count);
+
+
+       trace->reset(tr);
+
+       tracing_max_latency = save_max;
+
+       /* kill the thread */
+       kthread_stop(p);
+
+       /* count is only set when the max_tr check ran; !ret guards the read */
+       if (!ret && !count) {
+               printk(KERN_CONT ".. no entries found ..");
+               ret = -1;
+       }
+
+       return ret;
+}
+#endif /* CONFIG_SCHED_TRACER */
+
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+/*
+ * Selftest for the sched_switch tracer: trace for 100ms, stop, and
+ * check the buffer. Fails with -1 if the buffer check passes but no
+ * entries were recorded.
+ */
+int
+trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long count;
+       int ret;
+
+       /* start the tracing */
+       tr->ctrl = 1;
+       trace->init(tr);
+       /* Sleep for a 1/10 of a second */
+       msleep(100);
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       /* check the trace buffer */
+       ret = trace_test_buffer(tr, &count);
+       trace->reset(tr);
+
+       if (!ret && !count) {
+               printk(KERN_CONT ".. no entries found ..");
+               ret = -1;
+       }
+
+       return ret;
+}
+#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
+
+#ifdef CONFIG_SYSPROF_TRACER
+/*
+ * Selftest for the sysprof tracer: trace for 100ms, stop, and check the
+ * buffer for corruption. Unlike the other selftests, an empty buffer is
+ * not treated as a failure; count is collected but not examined.
+ */
+int
+trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long count;
+       int ret;
+
+       /* start the tracing */
+       tr->ctrl = 1;
+       trace->init(tr);
+       /* Sleep for a 1/10 of a second */
+       msleep(100);
+       /* stop the tracing. */
+       tr->ctrl = 0;
+       trace->ctrl_update(tr);
+       /* check the trace buffer */
+       ret = trace_test_buffer(tr, &count);
+       trace->reset(tr);
+
+       return ret;
+}
+#endif /* CONFIG_SYSPROF_TRACER */
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644 (file)
index 0000000..54dd77c
--- /dev/null
@@ -0,0 +1,7 @@
+#include "trace.h"
+
+/*
+ * Empty function that serves as the filter target of the dynamic
+ * ftrace selftest. The name is supplied by the DYN_FTRACE_TEST_NAME
+ * macro; the function always returns 0.
+ */
+int DYN_FTRACE_TEST_NAME(void)
+{
+       /* used to call mcount */
+       return 0;
+}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
new file mode 100644 (file)
index 0000000..2301e1e
--- /dev/null
@@ -0,0 +1,363 @@
+/*
+ * trace stack traces
+ *
+ * Copyright (C) 2004-2008, Soeren Sandmann
+ * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/debugfs.h>
+#include <linux/hrtimer.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/fs.h>
+
+#include <asm/stacktrace.h>
+
+#include "trace.h"
+
+static struct trace_array      *sysprof_trace;
+static int __read_mostly       tracer_enabled;
+
+/*
+ * 1 msec sample interval by default:
+ */
+static unsigned long sample_period = 1000000;
+static const unsigned int sample_max_depth = 512;
+
+static DEFINE_MUTEX(sample_timer_lock);
+/*
+ * Per CPU hrtimers that do the profiling:
+ */
+static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
+
+/*
+ * One frame record as read from the user stack by copy_stack_frame():
+ * the pointer to the next frame followed by the return address.
+ */
+struct stack_frame {
+       const void __user       *next_fp;
+       unsigned long           return_address;
+};
+
+/*
+ * Read one struct stack_frame from user address @fp into @frame with
+ * page faults disabled.
+ *
+ * Returns 1 when the frame was copied, 0 when @fp fails access_ok()
+ * or the copy did not complete.
+ */
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+       int failed;
+
+       if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+               return 0;
+
+       pagefault_disable();
+       failed = __copy_from_user_inatomic(frame, fp, sizeof(*frame)) != 0;
+       pagefault_enable();
+
+       return failed ? 0 : 1;
+}
+
+/*
+ * Cookie handed to the stacktrace_ops callbacks by trace_kernel():
+ * @data and @tr are passed on to __trace_special(), @pos counts the
+ * entries written so far and is checked against sample_max_depth.
+ */
+struct backtrace_info {
+       struct trace_array_cpu  *data;
+       struct trace_array      *tr;
+       int                     pos;
+};
+
+/* stacktrace_ops .warning_symbol callback: intentionally does nothing. */
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       /* Ignore warnings */
+}
+
+/* stacktrace_ops .warning callback: intentionally does nothing. */
+static void backtrace_warning(void *data, char *msg)
+{
+       /* Ignore warnings */
+}
+
+/*
+ * stacktrace_ops .stack callback: always returns -1.
+ * NOTE(review): presumably -1 tells dump_trace() not to follow the
+ * named stack - confirm against the dump_trace() implementation.
+ */
+static int backtrace_stack(void *data, char *name)
+{
+       /* Don't bother with IRQ stacks for now */
+       return -1;
+}
+
+/*
+ * stacktrace_ops .address callback: record @addr as a type 1 special
+ * entry, but only for reliable addresses and only while fewer than
+ * sample_max_depth entries have been written.
+ */
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+       struct backtrace_info *bt = data;
+
+       if (!reliable)
+               return;
+
+       if (bt->pos >= sample_max_depth)
+               return;
+
+       __trace_special(bt->tr, bt->data, 1, addr, 0);
+       bt->pos++;
+}
+
+/*
+ * Callbacks passed to dump_trace() by trace_kernel(): warnings are
+ * dropped, backtrace_stack() answers -1 for every stack and each
+ * address goes through backtrace_address().
+ */
+static const struct stacktrace_ops backtrace_ops = {
+       .warning                = backtrace_warning,
+       .warning_symbol         = backtrace_warning_symbol,
+       .stack                  = backtrace_stack,
+       .address                = backtrace_address,
+};
+
+static int
+trace_kernel(struct pt_regs *regs, struct trace_array *tr,
+            struct trace_array_cpu *data)
+{
+       struct backtrace_info info;
+       unsigned long bp;
+       char *stack;
+
+       info.tr = tr;
+       info.data = data;
+       info.pos = 1;
+
+       __trace_special(info.tr, info.data, 1, regs->ip, 0);
+
+       stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+       bp = regs->bp;
+#else
+       bp = 0;
+#endif
+
+       dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
+
+       return info.pos;
+}
+
+/*
+ * Record one profiling sample for the task currently running on @cpu.
+ *
+ * @regs: register state handed in by the timer callback; NULL makes
+ *        this a no-op
+ * @cpu:  index of the per-cpu buffer in sysprof_trace->data[]
+ *
+ * Nothing is recorded for pid 0, nor for a task that was sampled in
+ * user mode while its state is not TASK_RUNNING. A sample is a run of
+ * __trace_special() entries told apart by their first value:
+ *    0 - start of sample, carries current->pid
+ *    1 - kernel addresses, written by trace_kernel()
+ *    2 - user ip, then one entry per user stack frame
+ *   -1 - the walk stopped because sample_max_depth was reached
+ *    3 - end of sample, carries current->pid and the entry count
+ */
+static void timer_notify(struct pt_regs *regs, int cpu)
+{
+       struct trace_array_cpu *data;
+       struct stack_frame frame;
+       struct trace_array *tr;
+       const void __user *fp;
+       int is_user;
+       int i;
+
+       if (!regs)
+               return;
+
+       tr = sysprof_trace;
+       data = tr->data[cpu];
+       is_user = user_mode(regs);
+
+       if (!current || current->pid == 0)
+               return;
+
+       if (is_user && current->state != TASK_RUNNING)
+               return;
+
+       __trace_special(tr, data, 0, 0, current->pid);
+
+       if (!is_user)
+               i = trace_kernel(regs, tr, data);
+       else
+               i = 0;
+
+       /*
+        * Trace user stack if we are not a kernel thread
+        */
+       if (current->mm && i < sample_max_depth) {
+               /*
+                * From here on use the register frame located just
+                * below thread.sp0 instead of the one passed in.
+                * NOTE(review): presumably these are the saved
+                * user-mode registers - confirm per architecture.
+                */
+               regs = (struct pt_regs *)current->thread.sp0 - 1;
+
+               fp = (void __user *)regs->bp;
+
+               __trace_special(tr, data, 2, regs->ip, 0);
+
+               while (i < sample_max_depth) {
+                       /* next_fp is a pointer: clear it with NULL, not 0 */
+                       frame.next_fp = NULL;
+                       frame.return_address = 0;
+                       if (!copy_stack_frame(fp, &frame))
+                               break;
+                       /* a frame pointer below the stack pointer ends the walk */
+                       if ((unsigned long)fp < regs->sp)
+                               break;
+
+                       __trace_special(tr, data, 2, frame.return_address,
+                                       (unsigned long)fp);
+                       fp = frame.next_fp;
+
+                       i++;
+               }
+
+       }
+
+       /*
+        * Special trace entry if we overflow the max depth:
+        */
+       if (i == sample_max_depth)
+               __trace_special(tr, data, -1, -1, -1);
+
+       __trace_special(tr, data, 3, current->pid, i);
+}
+
+/*
+ * hrtimer callback: take one sample on the current CPU, push the
+ * expiry forward by sample_period and ask for the timer to be
+ * restarted.
+ */
+static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
+{
+       struct pt_regs *regs = get_irq_regs();
+       int cpu = smp_processor_id();
+
+       /* trace here */
+       timer_notify(regs, cpu);
+
+       hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
+       return HRTIMER_RESTART;
+}
+
+/*
+ * Initialize the sampling hrtimer that belongs to @cpu and arm it to
+ * fire sample_period nanoseconds from now.
+ */
+static void start_stack_timer(int cpu)
+{
+       struct hrtimer *timer = &per_cpu(stack_trace_hrtimer, cpu);
+
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       timer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       timer->function = stack_trace_timer_fn;
+
+       hrtimer_start(timer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+}
+
+/*
+ * Start the sampling timer of every online CPU. The calling task is
+ * restricted to each CPU in turn before start_stack_timer() runs for
+ * it, and its original affinity mask is put back at the end.
+ */
+static void start_stack_timers(void)
+{
+       cpumask_t saved_mask = current->cpus_allowed;
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+               start_stack_timer(cpu);
+       }
+       set_cpus_allowed_ptr(current, &saved_mask);
+}
+
+/* Cancel the sampling hrtimer that belongs to @cpu. */
+static void stop_stack_timer(int cpu)
+{
+       hrtimer_cancel(&per_cpu(stack_trace_hrtimer, cpu));
+}
+
+/* Cancel the sampling hrtimer of every online CPU. */
+static void stop_stack_timers(void)
+{
+       int i;
+
+       for_each_online_cpu(i) {
+               stop_stack_timer(i);
+       }
+}
+
+/*
+ * Stamp @tr with a new start time and reset the per-cpu trace data
+ * of every online CPU.
+ */
+static void stack_reset(struct trace_array *tr)
+{
+       int i;
+
+       tr->time_start = ftrace_now(tr->cpu);
+
+       for_each_online_cpu(i) {
+               tracing_reset(tr->data[i]);
+       }
+}
+
+/*
+ * Begin sampling into @tr: reset its buffers, start the per-cpu
+ * timers and mark the tracer enabled, all under sample_timer_lock.
+ */
+static void start_stack_trace(struct trace_array *tr)
+{
+       mutex_lock(&sample_timer_lock);
+       stack_reset(tr);
+       start_stack_timers();
+       tracer_enabled = 1;
+       mutex_unlock(&sample_timer_lock);
+}
+
+/*
+ * Stop sampling: cancel the per-cpu timers and mark the tracer
+ * disabled, all under sample_timer_lock. @tr itself is not touched.
+ */
+static void stop_stack_trace(struct trace_array *tr)
+{
+       mutex_lock(&sample_timer_lock);
+       stop_stack_timers();
+       tracer_enabled = 0;
+       mutex_unlock(&sample_timer_lock);
+}
+
+/*
+ * Tracer .init hook: remember @tr as the array samples go to and
+ * start sampling right away when tr->ctrl is set.
+ */
+static void stack_trace_init(struct trace_array *tr)
+{
+       sysprof_trace = tr;
+
+       if (!tr->ctrl)
+               return;
+
+       start_stack_trace(tr);
+}
+
+/* Tracer .reset hook: stop sampling if tr->ctrl says it is running. */
+static void stack_trace_reset(struct trace_array *tr)
+{
+       if (!tr->ctrl)
+               return;
+
+       stop_stack_trace(tr);
+}
+
+/*
+ * Tracer .ctrl_update hook: follow the new value of tr->ctrl by
+ * starting or stopping the sampling.
+ */
+static void stack_trace_ctrl_update(struct trace_array *tr)
+{
+       if (!tr->ctrl) {
+               stop_stack_trace(tr);
+               return;
+       }
+
+       /* When starting a new trace, reset the buffers */
+       start_stack_trace(tr);
+}
+
+/*
+ * Description of the "sysprof" tracer as handed to register_tracer();
+ * the selftest hook is only present with CONFIG_FTRACE_SELFTEST.
+ */
+static struct tracer stack_trace __read_mostly =
+{
+       .name           = "sysprof",
+       .init           = stack_trace_init,
+       .reset          = stack_trace_reset,
+       .ctrl_update    = stack_trace_ctrl_update,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest    = trace_selftest_startup_sysprof,
+#endif
+};
+
+/* Register the sysprof tracer; runs as a device_initcall. */
+static int __init init_stack_trace(void)
+{
+       int err = register_tracer(&stack_trace);
+
+       return err;
+}
+device_initcall(init_stack_trace);
+
+#define MAX_LONG_DIGITS 22
+
+/*
+ * debugfs read handler: report sample_period, converted with
+ * nsecs_to_usecs(), as one decimal number followed by a newline.
+ */
+static ssize_t
+sysprof_sample_read(struct file *filp, char __user *ubuf,
+                   size_t cnt, loff_t *ppos)
+{
+       char buf[MAX_LONG_DIGITS];
+       int len;
+
+       len = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+}
+
+/*
+ * debugfs write handler: set the sample period from a decimal number
+ * of microseconds.
+ *
+ * At most MAX_LONG_DIGITS-1 bytes of @ubuf are looked at. Values
+ * below 100 usecs are raised to 100; values too large to be expressed
+ * in nanoseconds in an unsigned long are lowered to the largest one
+ * that fits. The per-cpu timers are restarted with the new period
+ * only while the tracer is running.
+ *
+ * Returns the number of bytes consumed, or -EFAULT when @ubuf cannot
+ * be read.
+ */
+static ssize_t
+sysprof_sample_write(struct file *filp, const char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       char buf[MAX_LONG_DIGITS];
+       unsigned long val;
+
+       if (cnt > MAX_LONG_DIGITS-1)
+               cnt = MAX_LONG_DIGITS-1;
+
+       if (copy_from_user(buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       val = simple_strtoul(buf, NULL, 10);
+       /*
+        * Enforce a minimum sample period of 100 usecs:
+        */
+       if (val < 100)
+               val = 100;
+       /*
+        * Keep val * 1000 from wrapping around, which could otherwise
+        * yield a period below the minimum enforced above:
+        */
+       if (val > ~0UL / 1000)
+               val = ~0UL / 1000;
+
+       mutex_lock(&sample_timer_lock);
+       /*
+        * Only touch the timers while the tracer is running. Otherwise
+        * they may never have been set up by start_stack_timer(), and
+        * starting them here would begin sampling although nobody asked
+        * for it, possibly before sysprof_trace has been assigned.
+        */
+       if (tracer_enabled)
+               stop_stack_timers();
+       sample_period = val * 1000;
+       if (tracer_enabled)
+               start_stack_timers();
+       mutex_unlock(&sample_timer_lock);
+
+       return cnt;
+}
+
+/* File operations of the "sysprof_sample_period" debugfs file. */
+static struct file_operations sysprof_sample_fops = {
+       .read           = sysprof_sample_read,
+       .write          = sysprof_sample_write,
+};
+
+/*
+ * Create the "sysprof_sample_period" control file (mode 0644) below
+ * @d_tracer. A failure is only reported with a warning; nothing is
+ * returned to the caller.
+ */
+void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
+{
+       struct dentry *entry;
+
+       entry = debugfs_create_file("sysprof_sample_period", 0644,
+                       d_tracer, NULL, &sysprof_sample_fops);
+       if (entry)
+               return;
+       /* name the file that actually failed, not an unrelated one */
+       pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
+}
index d2099f41aa1ebe46bc614c7187290f7a32dd941c..d8b6279a9b4232cf7ca92305199b4a915cd0f591 100644 (file)
@@ -634,6 +634,8 @@ config LATENCYTOP
          Enable this option if you want to use the LatencyTOP tool
          to find out which userspace is blocking on what kernel operations.
 
+source kernel/trace/Kconfig
+
 config PROVIDE_OHCI1394_DMA_INIT
        bool "Remote debugging over FireWire early on boot"
        depends on PCI && X86
index 74b0cfb1fcc3c0eb009648484a368116d0681f88..4b836a53c08f1dcf84b060c3480f6a9155c3960f 100644 (file)
@@ -8,6 +8,15 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
         sha1.o irq_regs.o reciprocal_div.o argv_split.o \
         proportions.o prio_heap.o ratelimit.o
 
+ifdef CONFIG_FTRACE
+# Do not profile string.o, since it may be used in early boot or vdso
+CFLAGS_REMOVE_string.o = -pg
+# Also do not profile any debug utilities
+CFLAGS_REMOVE_spinlock_debug.o = -pg
+CFLAGS_REMOVE_list_debug.o = -pg
+CFLAGS_REMOVE_debugobjects.o = -pg
+endif
+
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 
index 6c90fb90e19c2ec25571947b7b98f88262e23ba4..3b4dc098181e47ae4d3a239aa4c27dcd6d9a7d47 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/kallsyms.h>
 #include <linux/sched.h>
 
-unsigned int debug_smp_processor_id(void)
+notrace unsigned int debug_smp_processor_id(void)
 {
        unsigned long preempt_count = preempt_count();
        int this_cpu = raw_smp_processor_id();
@@ -37,7 +37,7 @@ unsigned int debug_smp_processor_id(void)
        /*
         * Avoid recursion:
         */
-       preempt_disable();
+       preempt_disable_notrace();
 
        if (!printk_ratelimit())
                goto out_enable;
@@ -49,7 +49,7 @@ unsigned int debug_smp_processor_id(void)
        dump_stack();
 
 out_enable:
-       preempt_enable_no_resched();
+       preempt_enable_no_resched_notrace();
 out:
        return this_cpu;
 }
index 789b6adbef37f1f38c3887f10ba130595903408a..b38f700825fca31b81a48ecce6d60b7856bfcf84 100644 (file)
@@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages);
 static struct prop_descriptor vm_completions;
 static struct prop_descriptor vm_dirties;
 
-static unsigned long determine_dirtyable_memory(void);
-
 /*
  * couple the period to the dirty_ratio:
  *
@@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 #endif
 }
 
-static unsigned long determine_dirtyable_memory(void)
+/**
+ * determine_dirtyable_memory - amount of memory that may be used
+ *
+ * Returns the number of pages that can currently be freed and used
+ * by the kernel for direct mappings.
+ */
+unsigned long determine_dirtyable_memory(void)
 {
        unsigned long x;
 
index 8e440233c27dacf8e3083417047d1f8024f25131..ea48b82a37076123c09c04346f796bc8e68da5db 100644 (file)
@@ -96,7 +96,8 @@ basename_flags = -D"KBUILD_BASENAME=KBUILD_STR($(call name-fix,$(basetarget)))"
 modname_flags  = $(if $(filter 1,$(words $(modname))),\
                  -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
 
-_c_flags       = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o)
+orig_c_flags   = $(KBUILD_CFLAGS) $(ccflags-y) $(CFLAGS_$(basetarget).o)
+_c_flags       = $(filter-out $(CFLAGS_REMOVE_$(basetarget).o), $(orig_c_flags))
 _a_flags       = $(KBUILD_AFLAGS) $(asflags-y) $(AFLAGS_$(basetarget).o)
 _cpp_flags     = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(@F))