Merge branch 'core/percpu' into percpu-cpumask-x86-for-linus-2

author Ingo Molnar <mingo@elte.hu>

Thu, 26 Mar 2009 20:39:17 +0000 (21:39 +0100)

committer Ingo Molnar <mingo@elte.hu>

Fri, 27 Mar 2009 16:28:43 +0000 (17:28 +0100)
author Ingo Molnar <mingo@elte.hu>
Thu, 26 Mar 2009 20:39:17 +0000 (21:39 +0100)
committer Ingo Molnar <mingo@elte.hu>
Fri, 27 Mar 2009 16:28:43 +0000 (17:28 +0100)
diff --combined Documentation/kernel-parameters.txt

index fa4e1239a8fa308b69c57995b575e347ca0530eb,28de395fa096a4077b1b521ca23cab9f05b66502..6b979d1d09ab50489c79ac94f7a537733ce2c98b
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -44,7 -44,6 +44,7 @@@ parameter is applicable
         FB      The frame buffer device is enabled.
         HW      Appropriate hardware is enabled.
         IA-64   IA-64 architecture is enabled.
+ +      IMA     Integrity measurement architecture is enabled.
         IOSCHED More than one I/O scheduler is enabled.
         IP_PNP  IP DHCP, BOOTP, or RARP is enabled.
         ISAPNP  ISA PnP code is enabled.
@@@ -493,12 -492,10 +493,12 @@@ and is between 256 and 4096 characters
                         Default: 64
   
         hpet=           [X86-32,HPET] option to control HPET usage
- -                      Format: { enable (default) | disable | force }
+ +                      Format: { enable (default) | disable | force |
+ +                              verbose }
                         disable: disable HPET and use PIT instead
                         force: allow force-enabling of undocumented chips (ICH4,
                         VIA, nVidia)
+ +                      verbose: show contents of HPET registers during setup
   
         com20020=       [HW,NET] ARCnet - COM20020 chipset
                         Format:
@@@ -832,9 -829,6 +832,9 @@@
   
         hvc_iucv=       [S390] Number of z/VM IUCV hypervisor console (HVC)
                                terminal devices. Valid values: 0..8
+ +      hvc_iucv_allow= [S390] Comma-separated list of z/VM user IDs.
+ +                             If specified, z/VM IUCV HVC accepts connections
+ +                             from listed z/VM user IDs only.
   
         i8042.debug     [HW] Toggle i8042 debug mode
         i8042.direct    [HW] Put keyboard port into non-translated mode
@@@ -908,15 -902,6 +908,15 @@@
         ihash_entries=  [KNL]
                         Set number of hash buckets for inode cache.
   
+ +      ima_audit=      [IMA]
+ +                      Format: { "0" | "1" }
+ +                      0 -- basic integrity auditing messages. (Default)
+ +                      1 -- enable informational integrity auditing messages.
+ +
+ +      ima_hash=       [IMA]
+ +                      Format: { "sha1" | "md5" }
+ +                      default: "sha1"
+ +
         in2000=         [HW,SCSI]
                         See header of drivers/scsi/in2000.c.
   
@@@ -1325,8 -1310,13 +1325,13 @@@
   
         memtest=        [KNL,X86] Enable memtest
                         Format: <integer>
-                       range: 0,4 : pattern number
                         default : 0 <disable>
+                       Specifies the number of memtest passes to be
+                       performed. Each pass selects another test
+                       pattern from a given set of patterns. Memtest
+                       fills the memory with this pattern, validates
+                       memory contents and reserves bad memory
+                       regions that are detected.
   
         meye.*=         [HW] Set MotionEye Camera parameters
                         See Documentation/video4linux/meye.txt.
@@@ -1831,6 -1821,11 +1836,6 @@@
                         autoconfiguration.
                         Ranges are in pairs (memory base and size).
   
- -      dynamic_printk  Enables pr_debug()/dev_dbg() calls if
- -                      CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled.
- -                      These can also be switched on/off via
- -                      <debugfs>/dynamic_printk/modules
- -
         print-fatal-signals=
                         [KNL] debug: print fatal signals
                         print-fatal-signals=1: print segfault info to
diff --combined Makefile

index 1ab3ebfc909198065a5d0b10a6e3ee0f1e768b3c,27fb890a2bffe029236a2199637518bdaf951c6c..c6307b6d069f8ec9a969ca6ad617245b95486d94
--- 1/Makefile
--- 2/Makefile
+++ b/Makefile
@@@ -1,8 -1,8 +1,8 @@@
   VERSION = 2
   PATCHLEVEL = 6
   SUBLEVEL = 29
- -EXTRAVERSION = -rc6
- -NAME = Erotic Pickled Herring
+ +EXTRAVERSION =
+ +NAME = Temporary Tasmanian Devil
   
   # *DOCUMENTATION*
   # To see a list of typical targets execute "make help"
@@@ -533,8 -533,9 +533,9 @@@ KBUILD_CFLAGS += $(call cc-option,-Wfra
   endif
   
   # Force gcc to behave correctly even for buggy distributions
- # Arch Makefiles may override this setting
+ ifndef CONFIG_CC_STACKPROTECTOR
   KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector)
+ endif
   
   ifdef CONFIG_FRAME_POINTER
   KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
@@@ -566,12 -567,6 +567,12 @@@ KBUILD_CFLAGS += $(call cc-option,-Wdec
   # disable pointer signed / unsigned warnings in gcc 4.0
   KBUILD_CFLAGS += $(call cc-option,-Wno-pointer-sign,)
   
+ +# disable invalid "can't wrap" optimizations for signed / pointers
+ +KBUILD_CFLAGS += $(call cc-option,-fwrapv)
+ +
+ +# revert to pre-gcc-4.4 behaviour of .eh_frame
+ +KBUILD_CFLAGS += $(call cc-option,-fno-dwarf2-cfi-asm)
+ +
   # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
   # But warn user when we do so
   warn-assign = \
@@@ -910,18 -905,12 +911,18 @@@ localver = $(subst $(space),, $(string
   # and if the SCM is known, a tag from the SCM is appended.
   # The appended tag is determined by the SCM used.
   #
- -# Currently, only git is supported.
- -# Other SCMs can edit scripts/setlocalversion and add the appropriate
- -# checks as needed.
+ +# .scmversion is used when generating rpm packages so we do not lose
+ +# the version information from the SCM when we do the build of the kernel
+ +# from the copied source
   ifdef CONFIG_LOCALVERSION_AUTO
- -      _localver-auto = $(shell $(CONFIG_SHELL) \
- -                        $(srctree)/scripts/setlocalversion $(srctree))
+ +
+ +ifeq ($(wildcard .scmversion),)
+ +        _localver-auto = $(shell $(CONFIG_SHELL) \
+ +                         $(srctree)/scripts/setlocalversion $(srctree))
+ +else
+ +        _localver-auto = $(shell cat .scmversion 2> /dev/null)
+ +endif
+ +
         localver-auto  = $(LOCALVERSION)$(_localver-auto)
   endif
   
@@@ -1549,7 -1538,7 +1550,7 @@@ quiet_cmd_depmod = DEPMOD  $(KERNELRELE
         cmd_depmod = \
         if [ -r System.map -a -x $(DEPMOD) ]; then                              \
                 $(DEPMOD) -ae -F System.map                                     \
- -              $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) -r)   \
+ +              $(if $(strip $(INSTALL_MOD_PATH)), -b $(INSTALL_MOD_PATH) )     \
                 $(KERNELRELEASE);                                               \
         fi
   
diff --combined arch/alpha/kernel/irq.c

index d3812eb84015bcf5f9d573d1db382fb2d04e3fbf,7bc7489223f3e03b77aec838838202e8aed3f3b4..cc783466142754b97475522de5fd39aaa5b85f37
--- 1/arch/alpha/kernel/irq.c
--- 2/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@@ -55,7 -55,7 +55,7 @@@ int irq_select_affinity(unsigned int ir
                 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
         last_cpu = cpu;
   
-       irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+       cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
         irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
         return 0;
   }
@@@ -90,7 -90,7 +90,7 @@@ show_interrupts(struct seq_file *p, voi
                 seq_printf(p, "%10u ", kstat_irqs(irq));
   #else
                 for_each_online_cpu(j)
- -                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[irq]);
+ +                      seq_printf(p, "%10u ", kstat_irqs_cpu(irq, j));
   #endif
                 seq_printf(p, " %14s", irq_desc[irq].chip->typename);
                 seq_printf(p, "  %c%s",
diff --combined arch/arm/kernel/irq.c

index 7296f041628663f2c88258b90f2e411d99d3a4aa,45eacb5a2ecd80fb7a30dc56d6f3386f1c842f19..6874c7dca75aeb287319019ebff639a61f473a5a
--- 1/arch/arm/kernel/irq.c
--- 2/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@@ -76,7 -76,7 +76,7 @@@ int show_interrupts(struct seq_file *p
   
                 seq_printf(p, "%3d: ", i);
                 for_each_present_cpu(cpu)
- -                      seq_printf(p, "%10u ", kstat_cpu(cpu).irqs[i]);
+ +                      seq_printf(p, "%10u ", kstat_irqs_cpu(i, cpu));
                 seq_printf(p, " %10s", irq_desc[i].chip->name ? : "-");
                 seq_printf(p, "  %s", action->name);
                 for (action = action->next; action; action = action->next)
@@@ -104,6 -104,11 +104,11 @@@ static struct irq_desc bad_irq_desc = 
         .lock = __SPIN_LOCK_UNLOCKED(bad_irq_desc.lock),
   };
   
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ /* We are not allocating bad_irq_desc.affinity or .pending_mask */
+ #error "ARM architecture does not support CONFIG_CPUMASK_OFFSTACK."
+ #endif
+ 
   /*
    * do_IRQ handles all hardware IRQ's.  Decoded IRQs should not
    * come via this function.  Instead, they should provide their
@@@ -161,7 -166,7 +166,7 @@@ void __init init_IRQ(void
                 irq_desc[irq].status |= IRQ_NOREQUEST | IRQ_NOPROBE;
   
   #ifdef CONFIG_SMP
-       bad_irq_desc.affinity = CPU_MASK_ALL;
+       cpumask_setall(bad_irq_desc.affinity);
         bad_irq_desc.cpu = smp_processor_id();
   #endif
         init_arch_irq();
@@@ -191,15 -196,16 +196,16 @@@ void migrate_irqs(void
                 struct irq_desc *desc = irq_desc + i;
   
                 if (desc->cpu == cpu) {
-                       unsigned int newcpu = any_online_cpu(desc->affinity);
- 
-                       if (newcpu == NR_CPUS) {
+                       unsigned int newcpu = cpumask_any_and(desc->affinity,
+                                                             cpu_online_mask);
+                       if (newcpu >= nr_cpu_ids) {
                                 if (printk_ratelimit())
                                         printk(KERN_INFO "IRQ%u no longer affine to CPU%u\n",
                                                i, cpu);
   
-                               cpus_setall(desc->affinity);
-                               newcpu = any_online_cpu(desc->affinity);
+                               cpumask_setall(desc->affinity);
+                               newcpu = cpumask_any_and(desc->affinity,
+                                                        cpu_online_mask);
                         }
   
                         route_irq(desc, i, newcpu);
diff --combined arch/blackfin/kernel/irqchip.c

index bd052a67032e467bfef3a4aa30eb6dd1b4bc02e7,23e9aa080710f095e3389b74892c2b6933dbdbda..401bd32aa499f10be2009a4a622b0e1a530994b4
--- 1/arch/blackfin/kernel/irqchip.c
--- 2/arch/blackfin/kernel/irqchip.c
+++ b/arch/blackfin/kernel/irqchip.c
@@@ -70,6 -70,11 +70,11 @@@ static struct irq_desc bad_irq_desc = 
   #endif
   };
   
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ /* We are not allocating a variable-sized bad_irq_desc.affinity */
+ #error "Blackfin architecture does not support CONFIG_CPUMASK_OFFSTACK."
+ #endif
+ 
   int show_interrupts(struct seq_file *p, void *v)
   {
         int i = *(loff_t *) v, j;
@@@ -83,7 -88,7 +88,7 @@@
                         goto skip;
                 seq_printf(p, "%3d: ", i);
                 for_each_online_cpu(j)
- -                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ +                      seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
                 seq_printf(p, " %8s", irq_desc[i].chip->name);
                 seq_printf(p, "  %s", action->name);
                 for (action = action->next; action; action = action->next)
@@@ -144,15 -149,11 +149,15 @@@ asmlinkage void asm_do_IRQ(unsigned in
   #endif
         generic_handle_irq(irq);
   
- -#ifndef CONFIG_IPIPE  /* Useless and bugous over the I-pipe: IRQs are threaded. */
- -      /* If we're the only interrupt running (ignoring IRQ15 which is for
- -         syscalls), lower our priority to IRQ14 so that softirqs run at
- -         that level.  If there's another, lower-level interrupt, irq_exit
- -         will defer softirqs to that.  */
+ +#ifndef CONFIG_IPIPE
+ +      /*
+ +       * If we're the only interrupt running (ignoring IRQ15 which
+ +       * is for syscalls), lower our priority to IRQ14 so that
+ +       * softirqs run at that level.  If there's another,
+ +       * lower-level interrupt, irq_exit will defer softirqs to
+ +       * that. If the interrupt pipeline is enabled, we are already
+ +       * running at IRQ14 priority, so we don't need this code.
+ +       */
         CSYNC();
         pending = bfin_read_IPEND() & ~0x8000;
         other_ints = pending & (pending - 1);
diff --combined arch/ia64/kernel/irq.c

index 4f596613bffd86ad842d781b1143928782465a09,226233a6fa19a2d3d8eb9f9ab5684dfda50b2c9d..7429752ef5ade56035e39b3bdd295bbd573497e3
--- 1/arch/ia64/kernel/irq.c
--- 2/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@@ -80,7 -80,7 +80,7 @@@ int show_interrupts(struct seq_file *p
                 seq_printf(p, "%10u ", kstat_irqs(i));
   #else
                 for_each_online_cpu(j) {
- -                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ +                      seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
                 }
   #endif
                 seq_printf(p, " %14s", irq_desc[i].chip->name);
@@@ -103,7 -103,7 +103,7 @@@ static char irq_redir [NR_IRQS]; // = 
   void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
   {
         if (irq < NR_IRQS) {
-               cpumask_copy(&irq_desc[irq].affinity,
+               cpumask_copy(irq_desc[irq].affinity,
                              cpumask_of(cpu_logical_id(hwid)));
                 irq_redir[irq] = (char) (redir & 0xff);
         }
@@@ -148,7 -148,7 +148,7 @@@ static void migrate_irqs(void
                 if (desc->status == IRQ_PER_CPU)
                         continue;
   
-               if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask)
+               if (cpumask_any_and(irq_desc[irq].affinity, cpu_online_mask)
                     >= nr_cpu_ids) {
                         /*
                          * Save it for phase 2 processing
diff --combined arch/ia64/kernel/msi_ia64.c

index 368ee4e5266d43e9b23e2fe18bdd2199f7676bed,dcb6b7c51ea7eb8dc3c8514bf24ef38ee18a6fe2..2b15e233f7fef6b016f50367c90f9c91ba096b63
--- 1/arch/ia64/kernel/msi_ia64.c
--- 2/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@@ -7,7 -7,44 +7,7 @@@
   #include <linux/msi.h>
   #include <linux/dmar.h>
   #include <asm/smp.h>
- -
- -/*
- - * Shifts for APIC-based data
- - */
- -
- -#define MSI_DATA_VECTOR_SHIFT         0
- -#define           MSI_DATA_VECTOR(v)          (((u8)v) << MSI_DATA_VECTOR_SHIFT)
- -#define MSI_DATA_VECTOR_MASK          0xffffff00
- -
- -#define MSI_DATA_DELIVERY_SHIFT               8
- -#define     MSI_DATA_DELIVERY_FIXED   (0 << MSI_DATA_DELIVERY_SHIFT)
- -#define     MSI_DATA_DELIVERY_LOWPRI  (1 << MSI_DATA_DELIVERY_SHIFT)
- -
- -#define MSI_DATA_LEVEL_SHIFT          14
- -#define     MSI_DATA_LEVEL_DEASSERT   (0 << MSI_DATA_LEVEL_SHIFT)
- -#define     MSI_DATA_LEVEL_ASSERT     (1 << MSI_DATA_LEVEL_SHIFT)
- -
- -#define MSI_DATA_TRIGGER_SHIFT                15
- -#define     MSI_DATA_TRIGGER_EDGE     (0 << MSI_DATA_TRIGGER_SHIFT)
- -#define     MSI_DATA_TRIGGER_LEVEL    (1 << MSI_DATA_TRIGGER_SHIFT)
- -
- -/*
- - * Shift/mask fields for APIC-based bus address
- - */
- -
- -#define MSI_TARGET_CPU_SHIFT          4
- -#define MSI_ADDR_HEADER                       0xfee00000
- -
- -#define MSI_ADDR_DESTID_MASK          0xfff0000f
- -#define     MSI_ADDR_DESTID_CPU(cpu)  ((cpu) << MSI_TARGET_CPU_SHIFT)
- -
- -#define MSI_ADDR_DESTMODE_SHIFT               2
- -#define     MSI_ADDR_DESTMODE_PHYS    (0 << MSI_ADDR_DESTMODE_SHIFT)
- -#define           MSI_ADDR_DESTMODE_LOGIC     (1 << MSI_ADDR_DESTMODE_SHIFT)
- -
- -#define MSI_ADDR_REDIRECTION_SHIFT    3
- -#define     MSI_ADDR_REDIRECTION_CPU  (0 << MSI_ADDR_REDIRECTION_SHIFT)
- -#define     MSI_ADDR_REDIRECTION_LOWPRI       (1 << MSI_ADDR_REDIRECTION_SHIFT)
+ +#include <asm/msidef.h>
   
   static struct irq_chip        ia64_msi_chip;
   
@@@ -28,8 -65,8 +28,8 @@@ static void ia64_set_msi_irq_affinity(u
         read_msi_msg(irq, &msg);
   
         addr = msg.address_lo;
- -      addr &= MSI_ADDR_DESTID_MASK;
- -      addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+ +      addr &= MSI_ADDR_DEST_ID_MASK;
+ +      addr |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
         msg.address_lo = addr;
   
         data = msg.data;
@@@ -38,7 -75,7 +38,7 @@@
         msg.data = data;
   
         write_msi_msg(irq, &msg);
-       irq_desc[irq].affinity = cpumask_of_cpu(cpu);
+       cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
   }
   #endif /* CONFIG_SMP */
   
@@@ -61,9 -98,9 +61,9 @@@ int ia64_setup_msi_irq(struct pci_dev *
         msg.address_hi = 0;
         msg.address_lo =
                 MSI_ADDR_HEADER |
- -              MSI_ADDR_DESTMODE_PHYS |
+ +              MSI_ADDR_DEST_MODE_PHYS |
                 MSI_ADDR_REDIRECTION_CPU |
- -              MSI_ADDR_DESTID_CPU(dest_phys_id);
+ +              MSI_ADDR_DEST_ID_CPU(dest_phys_id);
   
         msg.data =
                 MSI_DATA_TRIGGER_EDGE |
@@@ -146,11 -183,11 +146,11 @@@ static void dmar_msi_set_affinity(unsig
   
         msg.data &= ~MSI_DATA_VECTOR_MASK;
         msg.data |= MSI_DATA_VECTOR(cfg->vector);
- -      msg.address_lo &= ~MSI_ADDR_DESTID_MASK;
- -      msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+ +      msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ +      msg.address_lo |= MSI_ADDR_DEST_ID_CPU(cpu_physical_id(cpu));
   
         dmar_msi_write(irq, &msg);
-       irq_desc[irq].affinity = *mask;
+       cpumask_copy(irq_desc[irq].affinity, mask);
   }
   #endif /* CONFIG_SMP */
   
@@@ -178,9 -215,9 +178,9 @@@ msi_compose_msg(struct pci_dev *pdev, u
         msg->address_hi = 0;
         msg->address_lo =
                 MSI_ADDR_HEADER |
- -              MSI_ADDR_DESTMODE_PHYS |
+ +              MSI_ADDR_DEST_MODE_PHYS |
                 MSI_ADDR_REDIRECTION_CPU |
- -              MSI_ADDR_DESTID_CPU(dest);
+ +              MSI_ADDR_DEST_ID_CPU(dest);
   
         msg->data =
                 MSI_DATA_TRIGGER_EDGE |
diff --combined arch/parisc/kernel/irq.c

index adfd617b4c18c66dfdfca0202b6a960157f4bf73,49482806863fa522d71e6e37d90a5c470e1e4023..1c740f5cbd6347f0046dc1560fb52dc806fd3103
--- 1/arch/parisc/kernel/irq.c
--- 2/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@@ -112,7 -112,7 +112,7 @@@ void cpu_end_irq(unsigned int irq
   }
   
   #ifdef CONFIG_SMP
- -int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
+ +int cpu_check_affinity(unsigned int irq, const struct cpumask *dest)
   {
         int cpu_dest;
   
@@@ -120,25 -120,23 +120,25 @@@
         if (CHECK_IRQ_PER_CPU(irq)) {
                 /* Bad linux design decision.  The mask has already
                  * been set; we must reset it */
- -              cpumask_setall(irq_desc[irq].affinity);
+ +              cpumask_setall(&irq_desc[irq].affinity);
                 return -EINVAL;
         }
   
         /* whatever mask they set, we just allow one CPU */
         cpu_dest = first_cpu(*dest);
- -      *dest = cpumask_of_cpu(cpu_dest);
   
- -      return 0;
+ +      return cpu_dest;
   }
   
   static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
   {
- -      if (cpu_check_affinity(irq, dest))
+ +      int cpu_dest;
+ +
+ +      cpu_dest = cpu_check_affinity(irq, dest);
+ +      if (cpu_dest < 0)
                 return;
   
-       cpumask_copy(&irq_desc[irq].affinity, &cpumask_of_cpu(cpu_dest));
- -      cpumask_copy(irq_desc[irq].affinity, dest);
++      cpumask_copy(&irq_desc[irq].affinity, dest);
   }
   #endif
   
@@@ -185,7 -183,7 +185,7 @@@ int show_interrupts(struct seq_file *p
                 seq_printf(p, "%3d: ", i);
   #ifdef CONFIG_SMP
                 for_each_online_cpu(j)
- -                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ +                      seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
   #else
                 seq_printf(p, "%10u ", kstat_irqs(i));
   #endif
@@@ -297,7 -295,7 +297,7 @@@ int txn_alloc_irq(unsigned int bits_wid
   unsigned long txn_affinity_addr(unsigned int irq, int cpu)
   {
   #ifdef CONFIG_SMP
- -      cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
+ +      cpumask_copy(&irq_desc[irq].affinity, cpumask_of(cpu));
   #endif
   
         return per_cpu(cpu_data, cpu).txn_addr;
@@@ -354,7 -352,7 +354,7 @@@ void do_cpu_irq_mask(struct pt_regs *re
         irq = eirr_to_irq(eirr_val);
   
   #ifdef CONFIG_SMP
- -      cpumask_copy(&dest, irq_desc[irq].affinity);
+ +      cpumask_copy(&dest, &irq_desc[irq].affinity);
         if (CHECK_IRQ_PER_CPU(irq_desc[irq].status) &&
             !cpu_isset(smp_processor_id(), dest)) {
                 int cpu = first_cpu(dest);
diff --combined arch/powerpc/kernel/irq.c

index 17efb7118db1140296bf8fa447b1b1133867c398,ad1e5ac721d86f557bac20079b407c8929459273..1b55ffdf002652d09376ce7e1a22d32c1101610f
--- 1/arch/powerpc/kernel/irq.c
--- 2/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@@ -190,7 -190,7 +190,7 @@@ int show_interrupts(struct seq_file *p
                 seq_printf(p, "%3d: ", i);
   #ifdef CONFIG_SMP
                 for_each_online_cpu(j)
- -                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ +                      seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
   #else
                 seq_printf(p, "%10u ", kstat_irqs(i));
   #endif /* CONFIG_SMP */
@@@ -231,7 -231,7 +231,7 @@@ void fixup_irqs(cpumask_t map
                 if (irq_desc[irq].status & IRQ_PER_CPU)
                         continue;
   
-               cpus_and(mask, irq_desc[irq].affinity, map);
+               cpumask_and(&mask, irq_desc[irq].affinity, &map);
                 if (any_online_cpu(mask) == NR_CPUS) {
                         printk("Breaking affinity for irq %i\n", irq);
                         mask = map;
diff --combined arch/sparc/kernel/irq_64.c

index 8ba064f08a6fbdd31bed882e1137ff924bd6a350,3d2c6baae96bf05b251d188ce1739150af3065fd..d0d6a515499ac952f487a49e389b2bdd5519bd02
--- 1/arch/sparc/kernel/irq_64.c
--- 2/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@@ -185,7 -185,7 +185,7 @@@ int show_interrupts(struct seq_file *p
                 seq_printf(p, "%10u ", kstat_irqs(i));
   #else
                 for_each_online_cpu(j)
- -                      seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ +                      seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
   #endif
                 seq_printf(p, " %9s", irq_desc[i].chip->typename);
                 seq_printf(p, "  %s", action->name);
@@@ -252,9 -252,10 +252,10 @@@ struct irq_handler_data 
   #ifdef CONFIG_SMP
   static int irq_choose_cpu(unsigned int virt_irq)
   {
-       cpumask_t mask = irq_desc[virt_irq].affinity;
+       cpumask_t mask;
         int cpuid;
   
+       cpumask_copy(&mask, irq_desc[virt_irq].affinity);
         if (cpus_equal(mask, CPU_MASK_ALL)) {
                 static int irq_rover;
                 static DEFINE_SPINLOCK(irq_rover_lock);
@@@ -323,25 -324,17 +324,25 @@@ static void sun4u_set_affinity(unsigne
         sun4u_irq_enable(virt_irq);
   }
   
+ +/* Don't do anything.  The desc->status check for IRQ_DISABLED in
+ + * handler_irq() will skip the handler call and that will leave the
+ + * interrupt in the sent state.  The next ->enable() call will hit the
+ + * ICLR register to reset the state machine.
+ + *
+ + * This scheme is necessary, instead of clearing the Valid bit in the
+ + * IMAP register, to handle the case of IMAP registers being shared by
+ + * multiple INOs (and thus ICLR registers).  Since we use a different
+ + * virtual IRQ for each shared IMAP instance, the generic code thinks
+ + * there is only one user so it prematurely calls ->disable() on
+ + * free_irq().
+ + *
+ + * We have to provide an explicit ->disable() method instead of using
+ + * NULL to get the default.  The reason is that if the generic code
+ + * sees that, it also hooks up a default ->shutdown method which
+ + * invokes ->mask() which we do not want.  See irq_chip_set_defaults().
+ + */
   static void sun4u_irq_disable(unsigned int virt_irq)
   {
- -      struct irq_handler_data *data = get_irq_chip_data(virt_irq);
- -
- -      if (likely(data)) {
- -              unsigned long imap = data->imap;
- -              unsigned long tmp = upa_readq(imap);
- -
- -              tmp &= ~IMAP_VALID;
- -              upa_writeq(tmp, imap);
- -      }
   }
   
   static void sun4u_irq_eoi(unsigned int virt_irq)
@@@ -754,8 -747,7 +755,8 @@@ void handler_irq(int irq, struct pt_reg
   
                 desc = irq_desc + virt_irq;
   
- -              desc->handle_irq(virt_irq, desc);
+ +              if (!(desc->status & IRQ_DISABLED))
+ +                      desc->handle_irq(virt_irq, desc);
   
                 bucket_pa = next_pa;
         }
@@@ -805,7 -797,7 +806,7 @@@ void fixup_irqs(void
                     !(irq_desc[irq].status & IRQ_PER_CPU)) {
                         if (irq_desc[irq].chip->set_affinity)
                                 irq_desc[irq].chip->set_affinity(irq,
-                                       &irq_desc[irq].affinity);
+                                       irq_desc[irq].affinity);
                 }
                 spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
         }
diff --combined arch/sparc/kernel/time_64.c

index 642562d83ec44603dddfbc6276fcb7a2e52a211d,db310aa00183cb0b92243cc234f9fcaedc03deeb..f95066b6f8053e8a93e8cfafed0b102408793593
--- 1/arch/sparc/kernel/time_64.c
--- 2/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@@ -36,10 -36,10 +36,10 @@@
   #include <linux/clocksource.h>
   #include <linux/of_device.h>
   #include <linux/platform_device.h>
+ +#include <linux/irq.h>
   
   #include <asm/oplib.h>
   #include <asm/timer.h>
- -#include <asm/irq.h>
   #include <asm/io.h>
   #include <asm/prom.h>
   #include <asm/starfire.h>
@@@ -729,7 -729,7 +729,7 @@@ void timer_interrupt(int irq, struct pt
   
         irq_enter();
   
-       kstat_this_cpu.irqs[0]++;
+       kstat_incr_irqs_this_cpu(0, irq_to_desc(0));
   
         if (unlikely(!evt->event_handler)) {
                 printk(KERN_WARNING
diff --combined arch/x86/Kconfig

index 3a330a437c6f9d55b44a0545b306242b6f55edba,f5cef3fbf9a5b21588b71add9f3abff0af847e99..06c02c00d7d9b040a95c477570a66555325a3df7
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -5,7 -5,7 +5,7 @@@ mainmenu "Linux Kernel Configuration fo
   config 64BIT
         bool "64-bit kernel" if ARCH = "x86"
         default ARCH = "x86_64"
-       help
+       ---help---
           Say yes to build a 64-bit kernel - formerly known as x86_64
           Say no to build a 32-bit kernel - formerly known as i386
   
@@@ -34,12 -34,15 +34,15 @@@ config X8
         select HAVE_FUNCTION_TRACER
         select HAVE_FUNCTION_GRAPH_TRACER
         select HAVE_FUNCTION_TRACE_MCOUNT_TEST
-       select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
-       select HAVE_ARCH_KGDB if !X86_VOYAGER
+       select HAVE_KVM
+       select HAVE_ARCH_KGDB
         select HAVE_ARCH_TRACEHOOK
         select HAVE_GENERIC_DMA_COHERENT if X86_32
         select HAVE_EFFICIENT_UNALIGNED_ACCESS
         select USER_STACKTRACE_SUPPORT
+       select HAVE_KERNEL_GZIP
+       select HAVE_KERNEL_BZIP2
+       select HAVE_KERNEL_LZMA
   
   config ARCH_DEFCONFIG
         string
@@@ -133,18 -136,19 +136,19 @@@ config ARCH_HAS_CACHE_LINE_SIZ
         def_bool y
   
   config HAVE_SETUP_PER_CPU_AREA
-       def_bool X86_64_SMP || (X86_SMP && !X86_VOYAGER)
+       def_bool y
+ 
+ config HAVE_DYNAMIC_PER_CPU_AREA
+       def_bool y
   
   config HAVE_CPUMASK_OF_CPU_MAP
         def_bool X86_64_SMP
   
   config ARCH_HIBERNATION_POSSIBLE
         def_bool y
-       depends on !SMP || !X86_VOYAGER
   
   config ARCH_SUSPEND_POSSIBLE
         def_bool y
-       depends on !X86_VOYAGER
   
   config ZONE_DMA32
         bool
@@@ -165,9 -169,6 +169,9 @@@ config GENERIC_HARDIRQ
         bool
         default y
   
+ +config GENERIC_HARDIRQS_NO__DO_IRQ
+ +       def_bool y
+ +
   config GENERIC_IRQ_PROBE
         bool
         default y
@@@ -177,11 -178,6 +181,6 @@@ config GENERIC_PENDING_IR
         depends on GENERIC_HARDIRQS && SMP
         default y
   
- config X86_SMP
-       bool
-       depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64)
-       default y
- 
   config USE_GENERIC_SMP_HELPERS
         def_bool y
         depends on SMP
@@@ -197,19 -193,17 +196,17 @@@ config X86_64_SM
   config X86_HT
         bool
         depends on SMP
-       depends on (X86_32 && !X86_VOYAGER) || X86_64
-       default y
- 
- config X86_BIOS_REBOOT
-       bool
-       depends on !X86_VOYAGER
         default y
   
   config X86_TRAMPOLINE
         bool
-       depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
+       depends on SMP || (64BIT && ACPI_SLEEP)
         default y
   
+ config X86_32_LAZY_GS
+       def_bool y
+       depends on X86_32 && !CC_STACKPROTECTOR
+ 
   config KTIME_SCALAR
         def_bool X86_32
   source "init/Kconfig"
@@@ -247,14 -241,24 +244,24 @@@ config SM
   
           If you don't know what to do here, say N.
   
- config X86_HAS_BOOT_CPU_ID
-       def_bool y
-       depends on X86_VOYAGER
+ config X86_X2APIC
+       bool "Support x2apic"
+       depends on X86_LOCAL_APIC && X86_64
+       ---help---
+         This enables x2apic support on CPUs that have this feature.
+ 
+         This allows 32-bit apic IDs (so it can support very large systems),
+         and accesses the local apic via MSRs not via mmio.
+ 
+         ( On certain CPU models you may need to enable INTR_REMAP too,
+           to get functional x2apic mode. )
+ 
+         If you don't know what to do here, say N.
   
   config SPARSE_IRQ
         bool "Support sparse irq numbering"
         depends on PCI_MSI || HT_IRQ
-       help
+       ---help---
           This enables support for sparse irqs. This is useful for distro
           kernels that want to define a high CONFIG_NR_CPUS value but still
           want to have low kernel memory footprint on smaller machines.
@@@ -268,114 -272,140 +275,140 @@@ config NUMA_MIGRATE_IRQ_DES
         bool "Move irq desc when changing irq smp_affinity"
         depends on SPARSE_IRQ && NUMA
         default n
-       help
+       ---help---
           This enables moving irq_desc to the cpu/node that will handle the irq.
   
           If you don't know what to do here, say N.
   
- config X86_FIND_SMP_CONFIG
-       def_bool y
-       depends on X86_MPPARSE || X86_VOYAGER
- 
   config X86_MPPARSE
         bool "Enable MPS table" if ACPI
         default y
         depends on X86_LOCAL_APIC
-       help
+       ---help---
           For old SMP systems that do not have proper ACPI support. On newer systems
           (esp. with 64-bit CPUs) with ACPI support, the MADT and DSDT will override it.
   
- choice
-       prompt "Subarchitecture Type"
-       default X86_PC
+ config X86_BIGSMP
+       bool "Support for big SMP systems with more than 8 CPUs"
+       depends on X86_32 && SMP
+       ---help---
+         This option is needed for the systems that have more than 8 CPUs.
   
- config X86_PC
-       bool "PC-compatible"
-       help
-         Choose this option if your computer is a standard PC or compatible.
+ if X86_32
+ config X86_EXTENDED_PLATFORM
+       bool "Support for extended (non-PC) x86 platforms"
+       default y
+       ---help---
+         If you disable this option then the kernel will only support
+         standard PC platforms. (which covers the vast majority of
+         systems out there.)
+ 
+         If you enable this option then you'll be able to select support
+         for the following (non-PC) 32 bit x86 platforms:
+               AMD Elan
+               NUMAQ (IBM/Sequent)
+               RDC R-321x SoC
+               SGI 320/540 (Visual Workstation)
+               Summit/EXA (IBM x440)
+               Unisys ES7000 IA32 series
+ 
+         If you have one of these systems, or if you want to build a
+         generic distribution kernel, say Y here - otherwise say N.
+ endif
+ 
+ if X86_64
+ config X86_EXTENDED_PLATFORM
+       bool "Support for extended (non-PC) x86 platforms"
+       default y
+       ---help---
+         If you disable this option then the kernel will only support
+         standard PC platforms. (which covers the vast majority of
+         systems out there.)
+ 
+         If you enable this option then you'll be able to select support
+         for the following (non-PC) 64 bit x86 platforms:
+               ScaleMP vSMP
+               SGI Ultraviolet
+ 
+         If you have one of these systems, or if you want to build a
+         generic distribution kernel, say Y here - otherwise say N.
+ endif
+ # This is an alphabetically sorted list of 64 bit extended platforms
+ # Please maintain the alphabetic order if and when there are additions
+ 
+ config X86_VSMP
+       bool "ScaleMP vSMP"
+       select PARAVIRT
+       depends on X86_64 && PCI
+       depends on X86_EXTENDED_PLATFORM
+       ---help---
+         Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
+         supposed to run on these EM64T-based machines.  Only choose this option
+         if you have one of these machines.
+ 
+ config X86_UV
+       bool "SGI Ultraviolet"
+       depends on X86_64
+       depends on X86_EXTENDED_PLATFORM
+       select X86_X2APIC
+       ---help---
+         This option is needed in order to support SGI Ultraviolet systems.
+         If you don't have one of these, you should say N here.
+ 
+ # Following is an alphabetically sorted list of 32 bit extended platforms
+ # Please maintain the alphabetic order if and when there are additions
   
   config X86_ELAN
         bool "AMD Elan"
         depends on X86_32
-       help
+       depends on X86_EXTENDED_PLATFORM
+       ---help---
           Select this for an AMD Elan processor.
   
           Do not use this option for K6/Athlon/Opteron processors!
   
           If unsure, say N here.
   
- config X86_VOYAGER
-       bool "Voyager (NCR)"
-       depends on X86_32 && (SMP || BROKEN) && !PCI
-       help
-         Voyager is an MCA-based 32-way capable SMP architecture proprietary
-         to NCR Corp.  Machine classes 345x/35xx/4100/51xx are Voyager-based.
- 
-         *** WARNING ***
- 
-         If you do not specifically know you have a Voyager based machine,
-         say N here, otherwise the kernel you build will not be bootable.
- 
- config X86_GENERICARCH
-        bool "Generic architecture"
+ config X86_RDC321X
+       bool "RDC R-321x SoC"
         depends on X86_32
-        help
-           This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
+       depends on X86_EXTENDED_PLATFORM
+       select M486
+       select X86_REBOOTFIXUPS
+       ---help---
+         This option is needed for RDC R-321x system-on-chip, also known
+         as R-8610-(G).
+         If you don't have one of these chips, you should say N here.
+ 
+ config X86_32_NON_STANDARD
+       bool "Support non-standard 32-bit SMP architectures"
+       depends on X86_32 && SMP
+       depends on X86_EXTENDED_PLATFORM
+       ---help---
+         This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default
           subarchitectures.  It is intended for a generic binary kernel.
           If you select them all, the kernel will probe them one by one and will
           fall back to the default.
   
- if X86_GENERICARCH
+ # Alphabetically sorted list of Non standard 32 bit platforms
   
   config X86_NUMAQ
         bool "NUMAQ (IBM/Sequent)"
-       depends on SMP && X86_32 && PCI && X86_MPPARSE
+       depends on X86_32_NON_STANDARD
         select NUMA
-       help
+       select X86_MPPARSE
+       ---help---
           This option is used for getting Linux to run on a NUMAQ (IBM/Sequent)
           NUMA multiquad box. This changes the way that processors are
           bootstrapped, and uses Clustered Logical APIC addressing mode instead
           of Flat Logical.  You will need a new lynxer.elf file to flash your
           firmware with - send email to <Martin.Bligh@us.ibm.com>.
   
- config X86_SUMMIT
-       bool "Summit/EXA (IBM x440)"
-       depends on X86_32 && SMP
-       help
-         This option is needed for IBM systems that use the Summit/EXA chipset.
-         In particular, it is needed for the x440.
- 
- config X86_ES7000
-       bool "Support for Unisys ES7000 IA32 series"
-       depends on X86_32 && SMP
-       help
-         Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
-         supposed to run on an IA32-based Unisys ES7000 system.
- 
- config X86_BIGSMP
-       bool "Support for big SMP systems with more than 8 CPUs"
-       depends on X86_32 && SMP
-       help
-         This option is needed for the systems that have more than 8 CPUs
-         and if the system is not of any sub-arch type above.
- 
- endif
- 
- config X86_VSMP
-       bool "Support for ScaleMP vSMP"
-       select PARAVIRT
-       depends on X86_64 && PCI
-       help
-         Support for ScaleMP vSMP systems.  Say 'Y' here if this kernel is
-         supposed to run on these EM64T-based machines.  Only choose this option
-         if you have one of these machines.
- 
- endchoice
- 
   config X86_VISWS
         bool "SGI 320/540 (Visual Workstation)"
-       depends on X86_32 && PCI && !X86_VOYAGER && X86_MPPARSE && PCI_GODIRECT
-       help
+       depends on X86_32 && PCI && X86_MPPARSE && PCI_GODIRECT
+       depends on X86_32_NON_STANDARD
+       ---help---
           The SGI Visual Workstation series is an IA32-based workstation
           based on SGI systems chips with some legacy PC hardware attached.
   
@@@ -384,21 -414,25 +417,25 @@@
           A kernel compiled for the Visual Workstation will run on general
           PCs as well. See <file:Documentation/sgi-visws.txt> for details.
   
- config X86_RDC321X
-       bool "RDC R-321x SoC"
-       depends on X86_32
-       select M486
-       select X86_REBOOTFIXUPS
-       help
-         This option is needed for RDC R-321x system-on-chip, also known
-         as R-8610-(G).
-         If you don't have one of these chips, you should say N here.
+ config X86_SUMMIT
+       bool "Summit/EXA (IBM x440)"
+       depends on X86_32_NON_STANDARD
+       ---help---
+         This option is needed for IBM systems that use the Summit/EXA chipset.
+         In particular, it is needed for the x440.
+ 
+ config X86_ES7000
+       bool "Unisys ES7000 IA32 series"
+       depends on X86_32_NON_STANDARD && X86_BIGSMP
+       ---help---
+         Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
+         supposed to run on an IA32-based Unisys ES7000 system.
   
   config SCHED_OMIT_FRAME_POINTER
         def_bool y
         prompt "Single-depth WCHAN output"
         depends on X86
-       help
+       ---help---
           Calculate simpler /proc/<PID>/wchan values. If this option
           is disabled then wchan values will recurse back to the
           caller function. This provides more accurate wchan values,
@@@ -408,7 -442,7 +445,7 @@@
   
   menuconfig PARAVIRT_GUEST
         bool "Paravirtualized guest support"
-       help
+       ---help---
           Say Y here to get to see options related to running Linux under
           various hypervisors.  This option alone does not add any kernel code.
   
@@@ -422,8 -456,7 +459,7 @@@ config VM
         bool "VMI Guest support"
         select PARAVIRT
         depends on X86_32
-       depends on !X86_VOYAGER
-       help
+       ---help---
           VMI provides a paravirtualized interface to the VMware ESX server
           (it could be used by other hypervisors in theory too, but is not
           at the moment), by linking the kernel to a GPL-ed ROM module
@@@ -433,8 -466,7 +469,7 @@@ config KVM_CLOC
         bool "KVM paravirtualized clock"
         select PARAVIRT
         select PARAVIRT_CLOCK
-       depends on !X86_VOYAGER
-       help
+       ---help---
           Turning on this option will allow you to run a paravirtualized clock
           when running over the KVM hypervisor. Instead of relying on a PIT
           (or probably other) emulation by the underlying device model, the host
@@@ -444,17 -476,15 +479,15 @@@
   config KVM_GUEST
         bool "KVM Guest support"
         select PARAVIRT
-       depends on !X86_VOYAGER
-       help
-        This option enables various optimizations for running under the KVM
-        hypervisor.
+       ---help---
+         This option enables various optimizations for running under the KVM
+         hypervisor.
   
   source "arch/x86/lguest/Kconfig"
   
   config PARAVIRT
         bool "Enable paravirtualization code"
-       depends on !X86_VOYAGER
-       help
+       ---help---
           This changes the kernel so it can modify itself when it is run
           under a hypervisor, potentially improving performance significantly
           over full virtualization.  However, when run without a hypervisor
@@@ -467,51 -497,51 +500,51 @@@ config PARAVIRT_CLOC
   endif
   
   config PARAVIRT_DEBUG
-        bool "paravirt-ops debugging"
-        depends on PARAVIRT && DEBUG_KERNEL
-        help
-          Enable to debug paravirt_ops internals.  Specifically, BUG if
-        a paravirt_op is missing when it is called.
+       bool "paravirt-ops debugging"
+       depends on PARAVIRT && DEBUG_KERNEL
+       ---help---
+         Enable to debug paravirt_ops internals.  Specifically, BUG if
+         a paravirt_op is missing when it is called.
   
   config MEMTEST
         bool "Memtest"
-       help
+       ---help---
           This option adds a kernel parameter 'memtest', which allows memtest
           to be set.
-               memtest=0, mean disabled; -- default
-               memtest=1, mean do 1 test pattern;
-               ...
-               memtest=4, mean do 4 test patterns.
+               memtest=0, mean disabled; -- default
+               memtest=1, mean do 1 test pattern;
+               ...
+               memtest=4, mean do 4 test patterns.
           If you are unsure how to answer this question, answer N.
   
   config X86_SUMMIT_NUMA
         def_bool y
-       depends on X86_32 && NUMA && X86_GENERICARCH
+       depends on X86_32 && NUMA && X86_32_NON_STANDARD
   
   config X86_CYCLONE_TIMER
         def_bool y
-       depends on X86_GENERICARCH
+       depends on X86_32_NON_STANDARD
   
   source "arch/x86/Kconfig.cpu"
   
   config HPET_TIMER
         def_bool X86_64
         prompt "HPET Timer Support" if X86_32
-       help
-          Use the IA-PC HPET (High Precision Event Timer) to manage
-          time in preference to the PIT and RTC, if a HPET is
-          present.
-          HPET is the next generation timer replacing legacy 8254s.
-          The HPET provides a stable time base on SMP
-          systems, unlike the TSC, but it is more expensive to access,
-          as it is off-chip.  You can find the HPET spec at
-          <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
+       ---help---
+         Use the IA-PC HPET (High Precision Event Timer) to manage
+         time in preference to the PIT and RTC, if a HPET is
+         present.
+         HPET is the next generation timer replacing legacy 8254s.
+         The HPET provides a stable time base on SMP
+         systems, unlike the TSC, but it is more expensive to access,
+         as it is off-chip.  You can find the HPET spec at
+         <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
   
-          You can safely choose Y here.  However, HPET will only be
-          activated if the platform and the BIOS support this feature.
-          Otherwise the 8254 will be used for timing services.
+         You can safely choose Y here.  However, HPET will only be
+         activated if the platform and the BIOS support this feature.
+         Otherwise the 8254 will be used for timing services.
   
-          Choose N to continue using the legacy 8254 timer.
+         Choose N to continue using the legacy 8254 timer.
   
   config HPET_EMULATE_RTC
         def_bool y
@@@ -522,7 -552,7 +555,7 @@@
   config DMI
         default y
         bool "Enable DMI scanning" if EMBEDDED
-       help
+       ---help---
           Enabled scanning of DMI to identify machine quirks. Say Y
           here unless you have verified that your setup is not
           affected by entries in the DMI blacklist. Required by PNP
@@@ -534,7 -564,7 +567,7 @@@ config GART_IOMM
         select SWIOTLB
         select AGP
         depends on X86_64 && PCI
-       help
+       ---help---
           Support for full DMA access of devices with 32bit memory access only
           on systems with more than 3GB. This is usually needed for USB,
           sound, many IDE/SATA chipsets and some other devices.
@@@ -549,7 -579,7 +582,7 @@@ config CALGARY_IOMM
         bool "IBM Calgary IOMMU support"
         select SWIOTLB
         depends on X86_64 && PCI && EXPERIMENTAL
-       help
+       ---help---
           Support for hardware IOMMUs in IBM's xSeries x366 and x460
           systems. Needed to run systems with more than 3GB of memory
           properly with 32-bit PCI devices that do not support DAC
@@@ -567,7 -597,7 +600,7 @@@ config CALGARY_IOMMU_ENABLED_BY_DEFAUL
         def_bool y
         prompt "Should Calgary be enabled by default?"
         depends on CALGARY_IOMMU
-       help
+       ---help---
           Should Calgary be enabled by default? if you choose 'y', Calgary
           will be used (if it exists). If you choose 'n', Calgary will not be
           used even if it exists. If you choose 'n' and would like to use
@@@ -579,7 -609,7 +612,7 @@@ config AMD_IOMM
         select SWIOTLB
         select PCI_MSI
         depends on X86_64 && PCI && ACPI
-       help
+       ---help---
           With this option you can enable support for AMD IOMMU hardware in
           your system. An IOMMU is a hardware component which provides
           remapping of DMA memory accesses from devices. With an AMD IOMMU you
@@@ -594,7 -624,7 +627,7 @@@ config AMD_IOMMU_STAT
         bool "Export AMD IOMMU statistics to debugfs"
         depends on AMD_IOMMU
         select DEBUG_FS
-       help
+       ---help---
           This option enables code in the AMD IOMMU driver to collect various
           statistics about what's happening in the driver and exports that
           information to userspace via debugfs.
@@@ -603,7 -633,7 +636,7 @@@
   # need this always selected by IOMMU for the VIA workaround
   config SWIOTLB
         def_bool y if X86_64
-       help
+       ---help---
           Support for software bounce buffers used on x86-64 systems
           which don't have a hardware IOMMU (e.g. the current generation
           of Intel's x86-64 CPUs). Using this PCI devices which can only
@@@ -621,7 -651,7 +654,7 @@@ config MAXSM
         depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
         select CPUMASK_OFFSTACK
         default n
-       help
+       ---help---
           Configure maximum number of CPUS and NUMA Nodes for this architecture.
           If unsure, say N.
   
@@@ -632,7 -662,7 +665,7 @@@ config NR_CPU
         default "4096" if MAXSMP
         default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
         default "8" if SMP
-       help
+       ---help---
           This allows you to specify the maximum number of CPUs which this
           kernel will support.  The maximum supported value is 512 and the
           minimum value which makes sense is 2.
@@@ -643,7 -673,7 +676,7 @@@
   config SCHED_SMT
         bool "SMT (Hyperthreading) scheduler support"
         depends on X86_HT
-       help
+       ---help---
           SMT scheduler support improves the CPU scheduler's decision making
           when dealing with Intel Pentium 4 chips with HyperThreading at a
           cost of slightly increased overhead in some places. If unsure say
@@@ -653,7 -683,7 +686,7 @@@ config SCHED_M
         def_bool y
         prompt "Multi-core scheduler support"
         depends on X86_HT
-       help
+       ---help---
           Multi-core scheduler support improves the CPU scheduler's decision
           making when dealing with multi-core CPU chips at a cost of slightly
           increased overhead in some places. If unsure say N here.
@@@ -662,8 -692,8 +695,8 @@@ source "kernel/Kconfig.preempt
   
   config X86_UP_APIC
         bool "Local APIC support on uniprocessors"
-       depends on X86_32 && !SMP && !(X86_VOYAGER || X86_GENERICARCH)
-       help
+       depends on X86_32 && !SMP && !X86_32_NON_STANDARD
+       ---help---
           A local APIC (Advanced Programmable Interrupt Controller) is an
           integrated interrupt controller in the CPU. If you have a single-CPU
           system which has a processor with a local APIC, you can say Y here to
@@@ -676,7 -706,7 +709,7 @@@
   config X86_UP_IOAPIC
         bool "IO-APIC support on uniprocessors"
         depends on X86_UP_APIC
-       help
+       ---help---
           An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
           SMP-capable replacement for PC-style interrupt controllers. Most
           SMP systems and many recent uniprocessor systems have one.
@@@ -687,11 -717,11 +720,11 @@@
   
   config X86_LOCAL_APIC
         def_bool y
-       depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+       depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
   
   config X86_IO_APIC
         def_bool y
-       depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+       depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
   
   config X86_VISWS_APIC
         def_bool y
@@@ -701,7 -731,7 +734,7 @@@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQ
         bool "Reroute for broken boot IRQs"
         default n
         depends on X86_IO_APIC
-       help
+       ---help---
           This option enables a workaround that fixes a source of
           spurious interrupts. This is recommended when threaded
           interrupt handling is used on systems where the generation of
@@@ -723,7 -753,6 +756,6 @@@
   
   config X86_MCE
         bool "Machine Check Exception"
-       depends on !X86_VOYAGER
         ---help---
           Machine Check Exception support allows the processor to notify the
           kernel if it detects a problem (e.g. overheating, component failure).
@@@ -742,7 -771,7 +774,7 @@@ config X86_MCE_INTE
         def_bool y
         prompt "Intel MCE features"
         depends on X86_64 && X86_MCE && X86_LOCAL_APIC
-       help
+       ---help---
            Additional support for intel specific MCE features such as
            the thermal monitor.
   
@@@ -750,14 -779,14 +782,14 @@@ config X86_MCE_AM
         def_bool y
         prompt "AMD MCE features"
         depends on X86_64 && X86_MCE && X86_LOCAL_APIC
-       help
+       ---help---
            Additional support for AMD specific MCE features such as
            the DRAM Error Threshold.
   
   config X86_MCE_NONFATAL
         tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
         depends on X86_32 && X86_MCE
-       help
+       ---help---
           Enabling this feature starts a timer that triggers every 5 seconds which
           will look at the machine check registers to see if anything happened.
           Non-fatal problems automatically get corrected (but still logged).
@@@ -770,7 -799,7 +802,7 @@@
   config X86_MCE_P4THERMAL
         bool "check for P4 thermal throttling interrupt."
         depends on X86_32 && X86_MCE && (X86_UP_APIC || SMP)
-       help
+       ---help---
           Enabling this feature will cause a message to be printed when the P4
           enters thermal throttling.
   
@@@ -778,11 -807,11 +810,11 @@@ config VM8
         bool "Enable VM86 support" if EMBEDDED
         default y
         depends on X86_32
-       help
-           This option is required by programs like DOSEMU to run 16-bit legacy
+       ---help---
+         This option is required by programs like DOSEMU to run 16-bit legacy
           code on X86 processors. It also may be needed by software like
-           XFree86 to initialize some video cards via BIOS. Disabling this
-           option saves about 6k.
+         XFree86 to initialize some video cards via BIOS. Disabling this
+         option saves about 6k.
   
   config TOSHIBA
         tristate "Toshiba Laptop support"
@@@ -856,33 -885,33 +888,33 @@@ config MICROCOD
           module will be called microcode.
   
   config MICROCODE_INTEL
-        bool "Intel microcode patch loading support"
-        depends on MICROCODE
-        default MICROCODE
-        select FW_LOADER
-        --help---
-          This options enables microcode patch loading support for Intel
-          processors.
- 
-          For latest news and information on obtaining all the required
-          Intel ingredients for this driver, check:
-          <http://www.urbanmyth.org/microcode/>.
+       bool "Intel microcode patch loading support"
+       depends on MICROCODE
+       default MICROCODE
+       select FW_LOADER
+       ---help---
+         This option enables microcode patch loading support for Intel
+         processors.
+ 
+         For latest news and information on obtaining all the required
+         Intel ingredients for this driver, check:
+         <http://www.urbanmyth.org/microcode/>.
   
   config MICROCODE_AMD
-        bool "AMD microcode patch loading support"
-        depends on MICROCODE
-        select FW_LOADER
-        --help---
-          If you select this option, microcode patch loading support for AMD
-        processors will be enabled.
+       bool "AMD microcode patch loading support"
+       depends on MICROCODE
+       select FW_LOADER
+       ---help---
+         If you select this option, microcode patch loading support for AMD
+         processors will be enabled.
   
-    config MICROCODE_OLD_INTERFACE
+ config MICROCODE_OLD_INTERFACE
         def_bool y
         depends on MICROCODE
   
   config X86_MSR
         tristate "/dev/cpu/*/msr - Model-specific register support"
-       help
+       ---help---
           This device gives privileged processes access to the x86
           Model-Specific Registers (MSRs).  It is a character device with
           major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
@@@ -891,7 -920,7 +923,7 @@@
   
   config X86_CPUID
         tristate "/dev/cpu/*/cpuid - CPU information support"
-       help
+       ---help---
           This device gives processes access to the x86 CPUID instruction to
           be executed on a specific processor.  It is a character device
           with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
@@@ -943,7 -972,7 +975,7 @@@ config NOHIGHME
   config HIGHMEM4G
         bool "4GB"
         depends on !X86_NUMAQ
-       help
+       ---help---
           Select this if you have a 32-bit processor and between 1 and 4
           gigabytes of physical RAM.
   
@@@ -951,7 -980,7 +983,7 @@@ config HIGHMEM64
         bool "64GB"
         depends on !M386 && !M486
         select X86_PAE
-       help
+       ---help---
           Select this if you have a 32-bit processor and more than 4
           gigabytes of physical RAM.
   
@@@ -962,7 -991,7 +994,7 @@@ choic
         prompt "Memory split" if EMBEDDED
         default VMSPLIT_3G
         depends on X86_32
-       help
+       ---help---
           Select the desired split between kernel and user memory.
   
           If the address range available to the kernel is less than the
@@@ -1008,20 -1037,20 +1040,20 @@@ config HIGHME
   config X86_PAE
         bool "PAE (Physical Address Extension) Support"
         depends on X86_32 && !HIGHMEM4G
-       help
+       ---help---
           PAE is required for NX support, and furthermore enables
           larger swapspace support for non-overcommit purposes. It
           has the cost of more pagetable lookup overhead, and also
           consumes more pagetable space per process.
   
   config ARCH_PHYS_ADDR_T_64BIT
-        def_bool X86_64 || X86_PAE
+       def_bool X86_64 || X86_PAE
   
   config DIRECT_GBPAGES
         bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
         default y
         depends on X86_64
-       help
+       ---help---
           Allow the kernel linear mapping to use 1GB pages on CPUs that
           support it. This can improve the kernel's performance a tiny bit by
           reducing TLB pressure. If in doubt, say "Y".
@@@ -1031,9 -1060,8 +1063,8 @@@ config NUM
         bool "Numa Memory Allocation and Scheduler Support"
         depends on SMP
         depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
-       default n if X86_PC
         default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
-       help
+       ---help---
           Enable NUMA (Non Uniform Memory Access) support.
   
           The kernel will try to allocate memory used by a CPU on the
@@@ -1056,19 -1084,19 +1087,19 @@@ config K8_NUM
         def_bool y
         prompt "Old style AMD Opteron NUMA detection"
         depends on X86_64 && NUMA && PCI
-       help
-        Enable K8 NUMA node topology detection.  You should say Y here if
-        you have a multi processor AMD K8 system. This uses an old
-        method to read the NUMA configuration directly from the builtin
-        Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
-        instead, which also takes priority if both are compiled in.
+       ---help---
+         Enable K8 NUMA node topology detection.  You should say Y here if
+         you have a multi processor AMD K8 system. This uses an old
+         method to read the NUMA configuration directly from the builtin
+         Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
+         instead, which also takes priority if both are compiled in.
   
   config X86_64_ACPI_NUMA
         def_bool y
         prompt "ACPI NUMA detection"
         depends on X86_64 && NUMA && ACPI && PCI
         select ACPI_NUMA
-       help
+       ---help---
           Enable ACPI SRAT based node topology detection.
   
   # Some NUMA nodes have memory ranges that span
@@@ -1083,7 -1111,7 +1114,7 @@@ config NODES_SPAN_OTHER_NODE
   config NUMA_EMU
         bool "NUMA emulation"
         depends on X86_64 && NUMA
-       help
+       ---help---
           Enable NUMA emulation. A flat machine will be split
           into virtual nodes when booted with "numa=fake=N", where N is the
           number of nodes. This is only useful for debugging.
@@@ -1096,11 -1124,11 +1127,11 @@@ config NODES_SHIF
         default "4" if X86_NUMAQ
         default "3"
         depends on NEED_MULTIPLE_NODES
-       help
+       ---help---
           Specify the maximum number of NUMA Nodes available on the target
           system.  Increases memory reserved to accommodate various tables.
   
- config HAVE_ARCH_BOOTMEM_NODE
+ config HAVE_ARCH_BOOTMEM
         def_bool y
         depends on X86_32 && NUMA
   
@@@ -1134,7 -1162,7 +1165,7 @@@ config ARCH_SPARSEMEM_DEFAUL
   
   config ARCH_SPARSEMEM_ENABLE
         def_bool y
-       depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
+       depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
         select SPARSEMEM_STATIC if X86_32
         select SPARSEMEM_VMEMMAP_ENABLE if X86_64
   
@@@ -1151,61 -1179,61 +1182,61 @@@ source "mm/Kconfig
   config HIGHPTE
         bool "Allocate 3rd-level pagetables from highmem"
         depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
-       help
+       ---help---
           The VM uses one page table entry for each page of physical memory.
           For systems with a lot of RAM, this can be wasteful of precious
           low memory.  Setting this option will put user-space page table
           entries in high memory.
   
   config X86_CHECK_BIOS_CORRUPTION
-         bool "Check for low memory corruption"
-       help
-        Periodically check for memory corruption in low memory, which
-        is suspected to be caused by BIOS.  Even when enabled in the
-        configuration, it is disabled at runtime.  Enable it by
-        setting "memory_corruption_check=1" on the kernel command
-        line.  By default it scans the low 64k of memory every 60
-        seconds; see the memory_corruption_check_size and
-        memory_corruption_check_period parameters in
-        Documentation/kernel-parameters.txt to adjust this.
- 
-        When enabled with the default parameters, this option has
-        almost no overhead, as it reserves a relatively small amount
-        of memory and scans it infrequently.  It both detects corruption
-        and prevents it from affecting the running system.
- 
-        It is, however, intended as a diagnostic tool; if repeatable
-        BIOS-originated corruption always affects the same memory,
-        you can use memmap= to prevent the kernel from using that
-        memory.
+       bool "Check for low memory corruption"
+       ---help---
+         Periodically check for memory corruption in low memory, which
+         is suspected to be caused by BIOS.  Even when enabled in the
+         configuration, it is disabled at runtime.  Enable it by
+         setting "memory_corruption_check=1" on the kernel command
+         line.  By default it scans the low 64k of memory every 60
+         seconds; see the memory_corruption_check_size and
+         memory_corruption_check_period parameters in
+         Documentation/kernel-parameters.txt to adjust this.
+ 
+         When enabled with the default parameters, this option has
+         almost no overhead, as it reserves a relatively small amount
+         of memory and scans it infrequently.  It both detects corruption
+         and prevents it from affecting the running system.
+ 
+         It is, however, intended as a diagnostic tool; if repeatable
+         BIOS-originated corruption always affects the same memory,
+         you can use memmap= to prevent the kernel from using that
+         memory.
   
   config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
-         bool "Set the default setting of memory_corruption_check"
+       bool "Set the default setting of memory_corruption_check"
         depends on X86_CHECK_BIOS_CORRUPTION
         default y
-       help
-        Set whether the default state of memory_corruption_check is
-        on or off.
+       ---help---
+         Set whether the default state of memory_corruption_check is
+         on or off.
   
   config X86_RESERVE_LOW_64K
-         bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
+       bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
         default y
-       help
-        Reserve the first 64K of physical RAM on BIOSes that are known
-        to potentially corrupt that memory range. A numbers of BIOSes are
-        known to utilize this area during suspend/resume, so it must not
-        be used by the kernel.
+       ---help---
+         Reserve the first 64K of physical RAM on BIOSes that are known
+         to potentially corrupt that memory range. A number of BIOSes are
+         known to utilize this area during suspend/resume, so it must not
+         be used by the kernel.
   
-        Set this to N if you are absolutely sure that you trust the BIOS
-        to get all its memory reservations and usages right.
+         Set this to N if you are absolutely sure that you trust the BIOS
+         to get all its memory reservations and usages right.
   
-        If you have doubts about the BIOS (e.g. suspend/resume does not
-        work or there's kernel crashes after certain hardware hotplug
-        events) and it's not AMI or Phoenix, then you might want to enable
-        X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
-        corruption patterns.
+         If you have doubts about the BIOS (e.g. suspend/resume does not
+         work or there's kernel crashes after certain hardware hotplug
+         events) and it's not AMI or Phoenix, then you might want to enable
+         X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
+         corruption patterns.
   
-        Say Y if unsure.
+         Say Y if unsure.
   
   config MATH_EMULATION
         bool
@@@ -1271,7 -1299,7 +1302,7 @@@ config MTRR_SANITIZE
         def_bool y
         prompt "MTRR cleanup support"
         depends on MTRR
-       help
+       ---help---
           Convert MTRR layout from continuous to discrete, so X drivers can
           add writeback entries.
   
@@@ -1286,7 -1314,7 +1317,7 @@@ config MTRR_SANITIZER_ENABLE_DEFAUL
         range 0 1
         default "0"
         depends on MTRR_SANITIZER
-       help
+       ---help---
           Enable mtrr cleanup default value
   
   config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
@@@ -1294,7 -1322,7 +1325,7 @@@
         range 0 7
         default "1"
         depends on MTRR_SANITIZER
-       help
+       ---help---
           mtrr cleanup spare entries default, it can be changed via
           mtrr_spare_reg_nr=N on the kernel command line.
   
@@@ -1302,7 -1330,7 +1333,7 @@@ config X86_PA
         bool
         prompt "x86 PAT support"
         depends on MTRR
-       help
+       ---help---
           Use PAT attributes to setup page level cache control.
   
           PATs are the modern equivalents of MTRRs and are much more
@@@ -1317,20 -1345,20 +1348,20 @@@ config EF
         bool "EFI runtime service support"
         depends on ACPI
         ---help---
-       This enables the kernel to use EFI runtime services that are
-       available (such as the EFI variable services).
+         This enables the kernel to use EFI runtime services that are
+         available (such as the EFI variable services).
   
-       This option is only useful on systems that have EFI firmware.
-       In addition, you should use the latest ELILO loader available
-       at <http://elilo.sourceforge.net> in order to take advantage
-       of EFI runtime services. However, even with this option, the
-       resultant kernel should continue to boot on existing non-EFI
-       platforms.
+         This option is only useful on systems that have EFI firmware.
+         In addition, you should use the latest ELILO loader available
+         at <http://elilo.sourceforge.net> in order to take advantage
+         of EFI runtime services. However, even with this option, the
+         resultant kernel should continue to boot on existing non-EFI
+         platforms.
   
   config SECCOMP
         def_bool y
         prompt "Enable seccomp to safely compute untrusted bytecode"
-       help
+       ---help---
           This kernel feature is useful for number crunching applications
           that may need to compute untrusted bytecode during their
           execution. By using pipes or other transports made available to
@@@ -1343,13 -1371,16 +1374,16 @@@
   
           If unsure, say Y. Only embedded should say N here.
   
+ config CC_STACKPROTECTOR_ALL
+       bool
+ 
   config CC_STACKPROTECTOR
         bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
-       depends on X86_64 && EXPERIMENTAL && BROKEN
-       help
-          This option turns on the -fstack-protector GCC feature. This
-         feature puts, at the beginning of critical functions, a canary
-         value on the stack just before the return address, and validates
+       select CC_STACKPROTECTOR_ALL
+       ---help---
+         This option turns on the -fstack-protector GCC feature. This
+         feature puts, at the beginning of functions, a canary value on
+         the stack just before the return address, and validates
           the value just before actually returning.  Stack based buffer
           overflows (that need to overwrite this return address) now also
           overwrite the canary, which gets detected and the attack is then
@@@ -1357,22 -1388,14 +1391,14 @@@
   
           This feature requires gcc version 4.2 or above, or a distribution
           gcc with the feature backported. Older versions are automatically
-         detected and for those versions, this configuration option is ignored.
- 
- config CC_STACKPROTECTOR_ALL
-       bool "Use stack-protector for all functions"
-       depends on CC_STACKPROTECTOR
-       help
-         Normally, GCC only inserts the canary value protection for
-         functions that use large-ish on-stack buffers. By enabling
-         this option, GCC will be asked to do this for ALL functions.
+         detected and for those versions, this configuration option is
+         ignored. (and a warning is printed during bootup)
   
   source kernel/Kconfig.hz
   
   config KEXEC
         bool "kexec system call"
-       depends on X86_BIOS_REBOOT
-       help
+       ---help---
           kexec is a system call that implements the ability to shutdown your
           current kernel, and to start another kernel.  It is like a reboot
           but it is independent of the system firmware.   And like a reboot
@@@ -1389,7 -1412,7 +1415,7 @@@
   config CRASH_DUMP
         bool "kernel crash dumps"
         depends on X86_64 || (X86_32 && HIGHMEM)
-       help
+       ---help---
           Generate crash dump after being started by kexec.
           This should be normally only set in special crash dump kernels
           which are loaded in the main kernel with kexec-tools into
@@@ -1404,7 -1427,7 +1430,7 @@@ config KEXEC_JUM
         bool "kexec jump (EXPERIMENTAL)"
         depends on EXPERIMENTAL
         depends on KEXEC && HIBERNATION && X86_32
-       help
+       ---help---
           Jump between original kernel and kexeced kernel and invoke
           code in physical address mode via KEXEC
   
@@@ -1413,7 -1436,7 +1439,7 @@@ config PHYSICAL_STAR
         default "0x1000000" if X86_NUMAQ
         default "0x200000" if X86_64
         default "0x100000"
-       help
+       ---help---
           This gives the physical address where the kernel is loaded.
   
           If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then
@@@ -1454,7 -1477,7 +1480,7 @@@
   config RELOCATABLE
         bool "Build a relocatable kernel (EXPERIMENTAL)"
         depends on EXPERIMENTAL
-       help
+       ---help---
           This builds a kernel image that retains relocation information
           so it can be loaded someplace besides the default 1MB.
           The relocations tend to make the kernel binary about 10% larger,
@@@ -1474,7 -1497,7 +1500,7 @@@ config PHYSICAL_ALIG
         default "0x100000" if X86_32
         default "0x200000" if X86_64
         range 0x2000 0x400000
-       help
+       ---help---
           This value puts the alignment restrictions on physical address
           where kernel is loaded and run from. Kernel is compiled for an
           address which meets above alignment restriction.
@@@ -1495,7 -1518,7 +1521,7 @@@
   
   config HOTPLUG_CPU
         bool "Support for hot-pluggable CPUs"
-       depends on SMP && HOTPLUG && !X86_VOYAGER
+       depends on SMP && HOTPLUG
         ---help---
           Say Y here to allow turning CPUs off and on. CPUs can be
           controlled through /sys/devices/system/cpu.
@@@ -1507,7 -1530,7 +1533,7 @@@ config COMPAT_VDS
         def_bool y
         prompt "Compat VDSO support"
         depends on X86_32 || IA32_EMULATION
-       help
+       ---help---
           Map the 32-bit VDSO to the predictable old-style address too.
         ---help---
           Say N here if you are running a sufficiently recent glibc
@@@ -1519,7 -1542,7 +1545,7 @@@
   config CMDLINE_BOOL
         bool "Built-in kernel command line"
         default n
-       help
+       ---help---
           Allow for specifying boot arguments to the kernel at
           build time.  On some systems (e.g. embedded ones), it is
           necessary or convenient to provide some or all of the
@@@ -1537,7 -1560,7 +1563,7 @@@ config CMDLIN
         string "Built-in kernel command string"
         depends on CMDLINE_BOOL
         default ""
-       help
+       ---help---
           Enter arguments here that should be compiled into the kernel
           image and used at boot time.  If the boot loader provides a
           command line at boot time, it is appended to this string to
@@@ -1554,7 -1577,7 +1580,7 @@@ config CMDLINE_OVERRID
         bool "Built-in command line overrides boot loader arguments"
         default n
         depends on CMDLINE_BOOL
-       help
+       ---help---
           Set this option to 'Y' to have the kernel ignore the boot loader
           command line, and use ONLY the built-in command line.
   
@@@ -1576,7 -1599,6 +1602,6 @@@ config HAVE_ARCH_EARLY_PFN_TO_NI
         depends on NUMA
   
   menu "Power management and ACPI options"
-       depends on !X86_VOYAGER
   
   config ARCH_HIBERNATION_HEADER
         def_bool y
@@@ -1654,7 -1676,7 +1679,7 @@@ if AP
   
   config APM_IGNORE_USER_SUSPEND
         bool "Ignore USER SUSPEND"
-       help
+       ---help---
           This option will ignore USER SUSPEND requests. On machines with a
           compliant APM BIOS, you want to say N. However, on the NEC Versa M
           series notebooks, it is necessary to say Y because of a BIOS bug.
@@@ -1678,7 -1700,7 +1703,7 @@@ config APM_DO_ENABL
   
   config APM_CPU_IDLE
         bool "Make CPU Idle calls when idle"
-       help
+       ---help---
           Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
           On some machines, this can activate improved power savings, such as
           a slowed CPU clock rate, when the machine is idle. These idle calls
@@@ -1689,7 -1711,7 +1714,7 @@@
   
   config APM_DISPLAY_BLANK
         bool "Enable console blanking using APM"
-       help
+       ---help---
           Enable console blanking using the APM. Some laptops can use this to
           turn off the LCD backlight when the screen blanker of the Linux
           virtual console blanks the screen. Note that this is only used by
@@@ -1702,7 -1724,7 +1727,7 @@@
   
   config APM_ALLOW_INTS
         bool "Allow interrupts during APM BIOS calls"
-       help
+       ---help---
           Normally we disable external interrupts while we are making calls to
           the APM BIOS as a measure to lessen the effects of a badly behaving
           BIOS implementation.  The BIOS should reenable interrupts if it
@@@ -1727,7 -1749,7 +1752,7 @@@ config PC
         bool "PCI support"
         default y
         select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
-       help
+       ---help---
           Find out whether you have a PCI motherboard. PCI is the name of a
           bus system, i.e. the way the CPU talks to the other stuff inside
           your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or
@@@ -1798,7 -1820,7 +1823,7 @@@ config PCI_MMCONFI
   config DMAR
         bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
         depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL
-       help
+       ---help---
           DMA remapping (DMAR) devices support enables independent address
           translations for Direct Memory Access (DMA) from devices.
           These DMA remapping devices are reported via ACPI tables
@@@ -1820,29 -1842,30 +1845,30 @@@ config DMAR_GFX_W
         def_bool y
         prompt "Support for Graphics workaround"
         depends on DMAR
-       help
-        Current Graphics drivers tend to use physical address
-        for DMA and avoid using DMA APIs. Setting this config
-        option permits the IOMMU driver to set a unity map for
-        all the OS-visible memory. Hence the driver can continue
-        to use physical addresses for DMA.
+       ---help---
+         Current Graphics drivers tend to use physical address
+         for DMA and avoid using DMA APIs. Setting this config
+         option permits the IOMMU driver to set a unity map for
+         all the OS-visible memory. Hence the driver can continue
+         to use physical addresses for DMA.
   
   config DMAR_FLOPPY_WA
         def_bool y
         depends on DMAR
-       help
-        Floppy disk drivers are know to bypass DMA API calls
-        thereby failing to work when IOMMU is enabled. This
-        workaround will setup a 1:1 mapping for the first
-        16M to make floppy (an ISA device) work.
+       ---help---
+         Floppy disk drivers are known to bypass DMA API calls
+         thereby failing to work when IOMMU is enabled. This
+         workaround will setup a 1:1 mapping for the first
+         16M to make floppy (an ISA device) work.
   
   config INTR_REMAP
         bool "Support for Interrupt Remapping (EXPERIMENTAL)"
         depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
-       help
-        Supports Interrupt remapping for IO-APIC and MSI devices.
-        To use x2apic mode in the CPU's which support x2APIC enhancements or
-        to support platforms with CPU's having > 8 bit APIC ID, say Y.
+       select X86_X2APIC
+       ---help---
+         Supports Interrupt remapping for IO-APIC and MSI devices.
+         To use x2apic mode in the CPU's which support x2APIC enhancements or
+         to support platforms with CPU's having > 8 bit APIC ID, say Y.
   
   source "drivers/pci/pcie/Kconfig"
   
@@@ -1856,8 -1879,7 +1882,7 @@@ if X86_3
   
   config ISA
         bool "ISA support"
-       depends on !X86_VOYAGER
-       help
+       ---help---
           Find out whether you have ISA slots on your motherboard.  ISA is the
           name of a bus system, i.e. the way the CPU talks to the other stuff
           inside your box.  Other bus systems are PCI, EISA, MicroChannel
@@@ -1883,9 -1905,8 +1908,8 @@@ config EIS
   source "drivers/eisa/Kconfig"
   
   config MCA
-       bool "MCA support" if !X86_VOYAGER
-       default y if X86_VOYAGER
-       help
+       bool "MCA support"
+       ---help---
           MicroChannel Architecture is found in some IBM PS/2 machines and
           laptops.  It is a bus system similar to PCI or ISA. See
           <file:Documentation/mca.txt> (and especially the web page given
@@@ -1895,8 -1916,7 +1919,7 @@@ source "drivers/mca/Kconfig
   
   config SCx200
         tristate "NatSemi SCx200 support"
-       depends on !X86_VOYAGER
-       help
+       ---help---
           This provides basic support for National Semiconductor's
           (now AMD's) Geode processors.  The driver probes for the
           PCI-IDs of several on-chip devices, so its a good dependency
@@@ -1908,7 -1928,7 +1931,7 @@@ config SCx200HR_TIME
         tristate "NatSemi SCx200 27MHz High-Resolution Timer Support"
         depends on SCx200 && GENERIC_TIME
         default y
-       help
+       ---help---
           This driver provides a clocksource built upon the on-chip
           27MHz high-resolution timer.  Its also a workaround for
           NSC Geode SC-1100's buggy TSC, which loses time when the
@@@ -1919,7 -1939,7 +1942,7 @@@ config GEODE_MFGPT_TIME
         def_bool y
         prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
         depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
-       help
+       ---help---
           This driver provides a clock event source based on the MFGPT
           timer(s) in the CS5535 and CS5536 companion chip for the geode.
           MFGPTs have a better resolution and max interval than the
@@@ -1928,7 -1948,7 +1951,7 @@@
   config OLPC
         bool "One Laptop Per Child support"
         default n
-       help
+       ---help---
           Add support for detecting the unique features of the OLPC
           XO hardware.
   
@@@ -1953,16 -1973,16 +1976,16 @@@ config IA32_EMULATIO
         bool "IA32 Emulation"
         depends on X86_64
         select COMPAT_BINFMT_ELF
-       help
+       ---help---
           Include code to run 32-bit programs under a 64-bit kernel. You should
           likely turn this on, unless you're 100% sure that you don't have any
           32-bit programs left.
   
   config IA32_AOUT
-        tristate "IA32 a.out support"
-        depends on IA32_EMULATION
-        help
-          Support old a.out binaries in the 32bit emulation.
+       tristate "IA32 a.out support"
+       depends on IA32_EMULATION
+       ---help---
+         Support old a.out binaries in the 32bit emulation.
   
   config COMPAT
         def_bool y
diff --combined arch/x86/include/asm/fixmap.h

index 23696d44a0af85fb2d3bf407d8b44def4874bf81,dca8f03da5b29574af570717c435ea3a5320f22e..63a79c77d220058f74eb81a70e70e3f3721be76d
--- 1/arch/x86/include/asm/fixmap.h
--- 2/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@@ -1,11 -1,155 +1,145 @@@
+ /*
+  * fixmap.h: compile-time virtual memory allocation
+  *
+  * This file is subject to the terms and conditions of the GNU General Public
+  * License.  See the file "COPYING" in the main directory of this archive
+  * for more details.
+  *
+  * Copyright (C) 1998 Ingo Molnar
+  *
+  * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+  * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+  */
+ 
   #ifndef _ASM_X86_FIXMAP_H
   #define _ASM_X86_FIXMAP_H
   
- -#ifdef CONFIG_EFI
- -#include <asm/efi.h>
- -#endif
+ #ifndef __ASSEMBLY__
+ #include <linux/kernel.h>
+ #include <asm/acpi.h>
+ #include <asm/apicdef.h>
+ #include <asm/page.h>
+ #ifdef CONFIG_X86_32
+ #include <linux/threads.h>
+ #include <asm/kmap_types.h>
+ #else
+ #include <asm/vsyscall.h>
+ #endif
+ 
+ /*
+  * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
+  * uses fixmaps that rely on FIXADDR_TOP for proper address calculation.
+  * Because of this, FIXADDR_TOP x86 integration was left as later work.
+  */
+ #ifdef CONFIG_X86_32
+ /* used by vmalloc.c, vsyscall.lds.S.
+  *
+  * Leave one empty page between vmalloc'ed areas and
+  * the start of the fixmap.
+  */
+ extern unsigned long __FIXADDR_TOP;
+ #define FIXADDR_TOP   ((unsigned long)__FIXADDR_TOP)
+ 
+ #define FIXADDR_USER_START     __fix_to_virt(FIX_VDSO)
+ #define FIXADDR_USER_END       __fix_to_virt(FIX_VDSO - 1)
+ #else
+ #define FIXADDR_TOP   (VSYSCALL_END-PAGE_SIZE)
+ 
+ /* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+ #define FIXADDR_USER_START    ((unsigned long)VSYSCALL32_VSYSCALL)
+ #define FIXADDR_USER_END      (FIXADDR_USER_START + PAGE_SIZE)
+ #endif
+ 
+ 
+ /*
+  * Here we define all the compile-time 'special' virtual
+  * addresses. The point is to have a constant address at
+  * compile time, but to set the physical address only
+  * in the boot process.
+  * for x86_32: We allocate these special addresses
+  * from the end of virtual memory (0xfffff000) backwards.
+  * Also this lets us do fail-safe vmalloc(), we
+  * can guarantee that these special addresses and
+  * vmalloc()-ed addresses never overlap.
+  *
+  * These 'compile-time allocated' memory buffers are
+  * fixed-size 4k pages (or larger if used with an increment
+  * higher than 1). Use set_fixmap(idx,phys) to associate
+  * physical memory with fixmap indices.
+  *
+  * TLB entries of such buffers will not be flushed across
+  * task switches.
+  */
+ enum fixed_addresses {
   #ifdef CONFIG_X86_32
- # include "fixmap_32.h"
+       FIX_HOLE,
+       FIX_VDSO,
   #else
- # include "fixmap_64.h"
+       VSYSCALL_LAST_PAGE,
+       VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+                           + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+       VSYSCALL_HPET,
   #endif
- -#ifdef CONFIG_X86_64
- -#ifdef CONFIG_EFI
- -      FIX_EFI_IO_MAP_LAST_PAGE,
- -      FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
- -                                + MAX_EFI_IO_PAGES - 1,
- -#endif
- -#endif
+       FIX_DBGP_BASE,
+       FIX_EARLYCON_MEM_BASE,
+ #ifdef CONFIG_X86_LOCAL_APIC
+       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
+ #endif
+ #ifdef CONFIG_X86_IO_APIC
+       FIX_IO_APIC_BASE_0,
+       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+ #endif
+ #ifdef CONFIG_X86_VISWS_APIC
+       FIX_CO_CPU,     /* Cobalt timer */
+       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */
+       FIX_LI_PCIA,    /* Lithium PCI Bridge A */
+       FIX_LI_PCIB,    /* Lithium PCI Bridge B */
+ #endif
+ #ifdef CONFIG_X86_F00F_BUG
+       FIX_F00F_IDT,   /* Virtual mapping for IDT */
+ #endif
+ #ifdef CONFIG_X86_CYCLONE_TIMER
+       FIX_CYCLONE_TIMER, /*cyclone timer register*/
+ #endif
+ #ifdef CONFIG_X86_32
+       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+ #ifdef CONFIG_PCI_MMCONFIG
+       FIX_PCIE_MCFG,
+ #endif
+ #endif
+ #ifdef CONFIG_PARAVIRT
+       FIX_PARAVIRT_BOOTMAP,
+ #endif
+       __end_of_permanent_fixed_addresses,
+ #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+       FIX_OHCI1394_BASE,
+ #endif
+       /*
+        * 256 temporary boot-time mappings, used by early_ioremap(),
+        * before ioremap() is functional.
+        *
+        * We round it up to the next 256 pages boundary so that we
+        * can have a single pgd entry and a single pte table:
+        */
+ #define NR_FIX_BTMAPS         64
+ #define FIX_BTMAPS_SLOTS      4
+       FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
+                       (__end_of_permanent_fixed_addresses & 255),
+       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
+ #ifdef CONFIG_X86_32
+       FIX_WP_TEST,
+ #endif
+       __end_of_fixed_addresses
+ };
+ 
+ 
+ extern void reserve_top_address(unsigned long reserve);
+ 
+ #define FIXADDR_SIZE  (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+ #define FIXADDR_BOOT_SIZE     (__end_of_fixed_addresses << PAGE_SHIFT)
+ #define FIXADDR_START         (FIXADDR_TOP - FIXADDR_SIZE)
+ #define FIXADDR_BOOT_START    (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
   
   extern int fixmaps_set;
   
@@@ -69,4 -213,5 +203,5 @@@ static inline unsigned long virt_to_fix
         BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
         return __virt_to_fix(vaddr);
   }
+ #endif /* !__ASSEMBLY__ */
   #endif /* _ASM_X86_FIXMAP_H */
diff --combined arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c

index 3babe1f1e912eba061378087b5d4211022e4f772,22590cf688aedd45f9165c5ee6d9ea571f20a61c..23da96e57b17ed610b7a2a940c9055e402590e11
--- 1/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
--- 2/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@@ -1,5 -1,5 +1,5 @@@
   /*
- - * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $)
+ + * acpi-cpufreq.c - ACPI Processor P-States Driver
    *
    *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
    *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
@@@ -36,18 -36,16 +36,18 @@@
   #include <linux/ftrace.h>
   
   #include <linux/acpi.h>
+ +#include <linux/io.h>
+ +#include <linux/delay.h>
+ +#include <linux/uaccess.h>
+ +
   #include <acpi/processor.h>
   
- -#include <asm/io.h>
   #include <asm/msr.h>
   #include <asm/processor.h>
   #include <asm/cpufeature.h>
- -#include <asm/delay.h>
- -#include <asm/uaccess.h>
   
- -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
+ +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
+ +              "acpi-cpufreq", msg)
   
   MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
   MODULE_DESCRIPTION("ACPI Processor P-States Driver");
@@@ -97,7 -95,7 +97,7 @@@ static unsigned extract_io(u32 value, s
   
         perf = data->acpi_data;
   
- -      for (i=0; i<perf->state_count; i++) {
+ +      for (i = 0; i < perf->state_count; i++) {
                 if (value == perf->states[i].status)
                         return data->freq_table[i].frequency;
         }
@@@ -112,7 -110,7 +112,7 @@@ static unsigned extract_msr(u32 msr, st
         msr &= INTEL_MSR_RANGE;
         perf = data->acpi_data;
   
- -      for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+ +      for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
                 if (msr == perf->states[data->freq_table[i].index].status)
                         return data->freq_table[i].frequency;
         }
@@@ -140,13 -138,15 +140,13 @@@ struct io_addr 
         u8 bit_width;
   };
   
- -typedef union {
- -      struct msr_addr msr;
- -      struct io_addr io;
- -} drv_addr_union;
- -
   struct drv_cmd {
         unsigned int type;
         const struct cpumask *mask;
- -      drv_addr_union addr;
+ +      union {
+ +              struct msr_addr msr;
+ +              struct io_addr io;
+ +      } addr;
         u32 val;
   };
   
@@@ -369,7 -369,7 +369,7 @@@ static unsigned int check_freqs(const s
         unsigned int cur_freq;
         unsigned int i;
   
- -      for (i=0; i<100; i++) {
+ +      for (i = 0; i < 100; i++) {
                 cur_freq = extract_freq(get_cur_val(mask), data);
                 if (cur_freq == freq)
                         return 1;
@@@ -494,7 -494,7 +494,7 @@@ acpi_cpufreq_guess_freq(struct acpi_cpu
                 unsigned long freq;
                 unsigned long freqn = perf->states[0].core_frequency * 1000;
   
- -              for (i=0; i<(perf->state_count-1); i++) {
+ +              for (i = 0; i < (perf->state_count-1); i++) {
                         freq = freqn;
                         freqn = perf->states[i+1].core_frequency * 1000;
                         if ((2 * cpu_khz) > (freqn + freq)) {
@@@ -601,7 -601,7 +601,7 @@@ static int acpi_cpufreq_cpu_init(struc
         if (!data)
                 return -ENOMEM;
   
-       data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+       data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
         per_cpu(drv_data, cpu) = data;
   
         if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
@@@ -673,7 -673,7 +673,7 @@@
   
         /* detect transition latency */
         policy->cpuinfo.transition_latency = 0;
- -      for (i=0; i<perf->state_count; i++) {
+ +      for (i = 0; i < perf->state_count; i++) {
                 if ((perf->states[i].transition_latency * 1000) >
                     policy->cpuinfo.transition_latency)
                         policy->cpuinfo.transition_latency =
@@@ -682,8 -682,8 +682,8 @@@
   
         data->max_freq = perf->states[0].core_frequency * 1000;
         /* table init */
- -      for (i=0; i<perf->state_count; i++) {
- -              if (i>0 && perf->states[i].core_frequency >=
+ +      for (i = 0; i < perf->state_count; i++) {
+ +              if (i > 0 && perf->states[i].core_frequency >=
                     data->freq_table[valid_states-1].frequency / 1000)
                         continue;
   
diff --combined arch/x86/kernel/cpu/cpufreq/e_powersaver.c

index 3f83ea12c47a5972c2a32e3f5750e6d79fa9f105,41ab3f064cb14fc26bca09b8fc3f55fd98824a87..35a257dd4bb76f848762bccf5fe9e10db3248a77
--- 1/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
--- 2/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
@@@ -12,12 -12,12 +12,12 @@@
   #include <linux/cpufreq.h>
   #include <linux/ioport.h>
   #include <linux/slab.h>
+ +#include <linux/timex.h>
+ +#include <linux/io.h>
+ +#include <linux/delay.h>
   
   #include <asm/msr.h>
   #include <asm/tsc.h>
- -#include <asm/timex.h>
- -#include <asm/io.h>
- -#include <asm/delay.h>
   
   #define EPS_BRAND_C7M 0
   #define EPS_BRAND_C7  1
@@@ -184,7 -184,7 +184,7 @@@ static int eps_cpu_init(struct cpufreq_
                 break;
         }
   
- -      switch(brand) {
+ +      switch (brand) {
         case EPS_BRAND_C7M:
                 printk(KERN_CONT "C7-M\n");
                 break;
@@@ -204,12 -204,12 +204,12 @@@
         }
         /* Enable Enhanced PowerSaver */
         rdmsrl(MSR_IA32_MISC_ENABLE, val);
-       if (!(val & 1 << 16)) {
-               val |= 1 << 16;
+       if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
+               val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
                 wrmsrl(MSR_IA32_MISC_ENABLE, val);
                 /* Can be locked at 0 */
                 rdmsrl(MSR_IA32_MISC_ENABLE, val);
-               if (!(val & 1 << 16)) {
+               if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
                         printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
                         return -ENODEV;
                 }
@@@ -218,20 -218,17 +218,20 @@@
         /* Print voltage and multiplier */
         rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
         current_voltage = lo & 0xff;
- -      printk(KERN_INFO "eps: Current voltage = %dmV\n", current_voltage * 16 + 700);
+ +      printk(KERN_INFO "eps: Current voltage = %dmV\n",
+ +                      current_voltage * 16 + 700);
         current_multiplier = (lo >> 8) & 0xff;
         printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
   
         /* Print limits */
         max_voltage = hi & 0xff;
- -      printk(KERN_INFO "eps: Highest voltage = %dmV\n", max_voltage * 16 + 700);
+ +      printk(KERN_INFO "eps: Highest voltage = %dmV\n",
+ +                      max_voltage * 16 + 700);
         max_multiplier = (hi >> 8) & 0xff;
         printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
         min_voltage = (hi >> 16) & 0xff;
- -      printk(KERN_INFO "eps: Lowest voltage = %dmV\n", min_voltage * 16 + 700);
+ +      printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
+ +                      min_voltage * 16 + 700);
         min_multiplier = (hi >> 24) & 0xff;
         printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
   
@@@ -321,7 -318,7 +321,7 @@@ static int eps_cpu_exit(struct cpufreq_
         return 0;
   }
   
- -static struct freq_attr* eps_attr[] = {
+ +static struct freq_attr *eps_attr[] = {
         &cpufreq_freq_attr_scaling_available_freqs,
         NULL,
   };
@@@ -359,7 -356,7 +359,7 @@@ static void __exit eps_exit(void
         cpufreq_unregister_driver(&eps_driver);
   }
   
- -MODULE_AUTHOR("Rafa³ Bilski <rafalbilski@interia.pl>");
+ +MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
   MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
   MODULE_LICENSE("GPL");
   
diff --combined arch/x86/kernel/cpu/intel.c

index 5fff00c70de0d1d8ddd3e3ce1ecad2ecec147247,25c559ba8d546fd4e332720165b84b7f5aea6a8e..1a89a2b68d1539a92939e4d33747a1bcd916390c
--- 1/arch/x86/kernel/cpu/intel.c
--- 2/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@@ -4,7 -4,6 +4,7 @@@
   #include <linux/string.h>
   #include <linux/bitops.h>
   #include <linux/smp.h>
+ +#include <linux/sched.h>
   #include <linux/thread_info.h>
   #include <linux/module.h>
   
@@@ -25,7 -24,6 +25,6 @@@
   #ifdef CONFIG_X86_LOCAL_APIC
   #include <asm/mpspec.h>
   #include <asm/apic.h>
- #include <mach_apic.h>
   #endif
   
   static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
@@@ -57,18 -55,25 +56,30 @@@
   
         /*
          * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
- -       * with P/T states and does not stop in deep C-states
+ +       * with P/T states and does not stop in deep C-states.
+ +       *
+ +       * It is also reliable across cores and sockets. (but not across
+ +       * cabinets - we turn it off in that case explicitly.)
          */
         if (c->x86_power & (1 << 8)) {
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ +              set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
+ +              sched_clock_stable = 1;
         }
   
+       /*
+        * There is a known erratum on Pentium III and Core Solo
+        * and Core Duo CPUs.
+        * " Page with PAT set to WC while associated MTRR is UC
+        *   may consolidate to UC "
+        * Because of this erratum, it is better to stick with
+        * setting WC in MTRR rather than using PAT on these CPUs.
+        *
+        * Enable PAT WC only on P4, Core 2 or later CPUs.
+        */
+       if (c->x86 == 6 && c->x86_model < 15)
+               clear_cpu_cap(c, X86_FEATURE_PAT);
   }
   
   #ifdef CONFIG_X86_32
@@@ -141,10 -146,10 +152,10 @@@ static void __cpuinit intel_workarounds
          */
         if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
                 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
-               if ((lo & (1<<9)) == 0) {
+               if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
                         printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
                         printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
-                       lo |= (1<<9);   /* Disable hw prefetching */
+                       lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
                         wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
                 }
         }
diff --combined arch/x86/kernel/efi.c

index eb1ef3b67dd50278c7ce88bd92f5895d7f06ede6,b205272ad3947e8318659720f73917df8a7bcde5..1736acc4d7aa6cfc13bc8ebd0db0e2727a8ebf5e
--- 1/arch/x86/kernel/efi.c
--- 2/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@@ -366,10 -366,12 +366,12 @@@ void __init efi_init(void
                                         SMBIOS_TABLE_GUID)) {
                         efi.smbios = config_tables[i].table;
                         printk(" SMBIOS=0x%lx ", config_tables[i].table);
+ #ifdef CONFIG_X86_UV
                 } else if (!efi_guidcmp(config_tables[i].guid,
                                         UV_SYSTEM_TABLE_GUID)) {
                         efi.uv_systab = config_tables[i].table;
                         printk(" UVsystab=0x%lx ", config_tables[i].table);
+ #endif
                 } else if (!efi_guidcmp(config_tables[i].guid,
                                         HCDP_TABLE_GUID)) {
                         efi.hcdp = config_tables[i].table;
@@@ -467,7 -469,7 +469,7 @@@ void __init efi_enter_virtual_mode(void
         efi_memory_desc_t *md;
         efi_status_t status;
         unsigned long size;
- -      u64 end, systab, addr, npages;
+ +      u64 end, systab, addr, npages, end_pfn;
         void *p, *va;
   
         efi.systab = NULL;
@@@ -479,10 -481,7 +481,10 @@@
                 size = md->num_pages << EFI_PAGE_SHIFT;
                 end = md->phys_addr + size;
   
- -              if (PFN_UP(end) <= max_low_pfn_mapped)
+ +              end_pfn = PFN_UP(end);
+ +              if (end_pfn <= max_low_pfn_mapped
+ +                  || (end_pfn > (1UL << (32 - PAGE_SHIFT))
+ +                      && end_pfn <= max_pfn_mapped))
                         va = __va(md->phys_addr);
                 else
                         va = efi_ioremap(md->phys_addr, size);
diff --combined arch/x86/kernel/efi_64.c

index cb783b92c50cce5b9d123c8bff127ba967a9b230,a4ee29127fdf24d916af929339a12faab1c7e618..22c3b7828c50fa1f0c61e17d6680cbf19d0b6a17
--- 1/arch/x86/kernel/efi_64.c
--- 2/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@@ -36,6 -36,7 +36,7 @@@
   #include <asm/proto.h>
   #include <asm/efi.h>
   #include <asm/cacheflush.h>
+ #include <asm/fixmap.h>
   
   static pgd_t save_pgd __initdata;
   static unsigned long efi_flags __initdata;
@@@ -99,11 -100,24 +100,11 @@@ void __init efi_call_phys_epilog(void
   
   void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
   {
- -      static unsigned pages_mapped __initdata;
- -      unsigned i, pages;
- -      unsigned long offset;
+ +      unsigned long last_map_pfn;
   
- -      pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
- -      offset = phys_addr & ~PAGE_MASK;
- -      phys_addr &= PAGE_MASK;
- -
- -      if (pages_mapped + pages > MAX_EFI_IO_PAGES)
+ +      last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+ +      if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
                 return NULL;
   
- -      for (i = 0; i < pages; i++) {
- -              __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
- -                           phys_addr, PAGE_KERNEL);
- -              phys_addr += PAGE_SIZE;
- -              pages_mapped++;
- -      }
- -
- -      return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
- -                                           (pages_mapped - pages)) + offset;
+ +      return (void __iomem *)__va(phys_addr);
   }
diff --combined arch/x86/kernel/reboot.c

index 4526b3a75ed2aef73579212c7fde9ca6ed15e946,1cc18d439bbbd377bd6941eece6fa2397508d359..2aef36d8aca2783a2cf9cb04f74fe9a72c564734
--- 1/arch/x86/kernel/reboot.c
--- 2/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@@ -14,6 -14,7 +14,7 @@@
   #include <asm/reboot.h>
   #include <asm/pci_x86.h>
   #include <asm/virtext.h>
+ #include <asm/cpu.h>
   
   #ifdef CONFIG_X86_32
   # include <linux/dmi.h>
@@@ -23,8 -24,6 +24,6 @@@
   # include <asm/iommu.h>
   #endif
   
- #include <mach_ipi.h>
- 
   /*
    * Power off function, if any
    */
@@@ -217,14 -216,6 +216,14 @@@ static struct dmi_system_id __initdata 
                         DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
                 },
         },
+ +      {       /* Handle problems with rebooting on Dell XPS710 */
+ +              .callback = set_bios_reboot,
+ +              .ident = "Dell XPS710",
+ +              .matches = {
+ +                      DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ +                      DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+ +              },
+ +      },
         { }
   };
   
@@@ -658,7 -649,7 +657,7 @@@ static int crash_nmi_callback(struct no
   
   static void smp_send_nmi_allbutself(void)
   {
-       send_IPI_allbutself(NMI_VECTOR);
+       apic->send_IPI_allbutself(NMI_VECTOR);
   }
   
   static struct notifier_block crash_nmi_nb = {
diff --combined arch/x86/kernel/setup.c

index 6a8811a693245e2b96f8815df4debdc5c9111749,4c54bc0d8ff3cc632f10a2639628dbf0592f7715..b746deb9ebc649685c4c167f50e525541b8da292
--- 1/arch/x86/kernel/setup.c
--- 2/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@@ -74,14 -74,15 +74,15 @@@
   #include <asm/e820.h>
   #include <asm/mpspec.h>
   #include <asm/setup.h>
- #include <asm/arch_hooks.h>
   #include <asm/efi.h>
+ #include <asm/timer.h>
+ #include <asm/i8259.h>
   #include <asm/sections.h>
   #include <asm/dmi.h>
   #include <asm/io_apic.h>
   #include <asm/ist.h>
   #include <asm/vmi.h>
- #include <setup_arch.h>
+ #include <asm/setup_arch.h>
   #include <asm/bios_ebda.h>
   #include <asm/cacheflush.h>
   #include <asm/processor.h>
@@@ -89,7 -90,7 +90,7 @@@
   
   #include <asm/system.h>
   #include <asm/vsyscall.h>
- #include <asm/smp.h>
+ #include <asm/cpu.h>
   #include <asm/desc.h>
   #include <asm/dma.h>
   #include <asm/iommu.h>
@@@ -97,7 -98,6 +98,6 @@@
   #include <asm/mmu_context.h>
   #include <asm/proto.h>
   
- #include <mach_apic.h>
   #include <asm/paravirt.h>
   #include <asm/hypervisor.h>
   
@@@ -112,6 -112,20 +112,20 @@@
   #define ARCH_SETUP
   #endif
   
+ unsigned int boot_cpu_id __read_mostly;
+ 
+ #ifdef CONFIG_X86_64
+ int default_cpu_present_to_apicid(int mps_cpu)
+ {
+       return __default_cpu_present_to_apicid(mps_cpu);
+ }
+ 
+ int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+ {
+       return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+ }
+ #endif
+ 
   #ifndef CONFIG_DEBUG_BOOT_PARAMS
   struct boot_params __initdata boot_params;
   #else
@@@ -586,20 -600,7 +600,7 @@@ static int __init setup_elfcorehdr(cha
   early_param("elfcorehdr", setup_elfcorehdr);
   #endif
   
- static int __init default_update_genapic(void)
- {
- #ifdef CONFIG_X86_SMP
- # if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
-       genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
- # endif
- #endif
- 
-       return 0;
- }
- 
- static struct x86_quirks default_x86_quirks __initdata = {
-       .update_genapic         = default_update_genapic,
- };
+ static struct x86_quirks default_x86_quirks __initdata;
   
   struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
   
@@@ -656,7 -657,6 +657,6 @@@ void __init setup_arch(char **cmdline_p
   #ifdef CONFIG_X86_32
         memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
         visws_early_detect();
-       pre_setup_arch_hook();
   #else
         printk(KERN_INFO "Command line: %s\n", boot_command_line);
   #endif
@@@ -770,9 -770,6 +770,9 @@@
   
         finish_e820_parsing();
   
+ +      if (efi_enabled)
+ +              efi_init();
+ +
         dmi_scan_machine();
   
         dmi_check_system(bad_bios_dmi_table);
@@@ -792,6 -789,8 +792,6 @@@
         insert_resource(&iomem_resource, &data_resource);
         insert_resource(&iomem_resource, &bss_resource);
   
- -      if (efi_enabled)
- -              efi_init();
   
   #ifdef CONFIG_X86_32
         if (ppro_with_ram_bug()) {
@@@ -824,8 -823,7 +824,7 @@@
   #else
         num_physpages = max_pfn;
   
-       if (cpu_has_x2apic)
-               check_x2apic();
+       check_x2apic();
   
         /* How many end-of-memory variables you have, grandma! */
         /* need this before calling reserve_initrd */
@@@ -865,9 -863,7 +864,7 @@@
   
         reserve_initrd();
   
- #ifdef CONFIG_X86_64
         vsmp_init();
- #endif
   
         io_delay_init();
   
@@@ -893,12 -889,11 +890,11 @@@
          */
         acpi_reserve_bootmem();
   #endif
- #ifdef CONFIG_X86_FIND_SMP_CONFIG
         /*
          * Find and reserve possible boot-time SMP configuration:
          */
         find_smp_config();
- #endif
+ 
         reserve_crashkernel();
   
   #ifdef CONFIG_X86_64
@@@ -925,9 -920,7 +921,7 @@@
         map_vsyscall();
   #endif
   
- #ifdef CONFIG_X86_GENERICARCH
         generic_apic_probe();
- #endif
   
         early_quirks();
   
@@@ -978,4 -971,95 +972,95 @@@
   #endif
   }
   
+ #ifdef CONFIG_X86_32
   
+ /**
+  * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+  *
+  * Description:
+  *    Perform any necessary interrupt initialisation prior to setting up
+  *    the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+  *    interrupts should be initialised here if the machine emulates a PC
+  *    in any way.
+  **/
+ void __init x86_quirk_pre_intr_init(void)
+ {
+       if (x86_quirks->arch_pre_intr_init) {
+               if (x86_quirks->arch_pre_intr_init())
+                       return;
+       }
+       init_ISA_irqs();
+ }
+ 
+ /**
+  * x86_quirk_intr_init - post gate setup interrupt initialisation
+  *
+  * Description:
+  *    Fill in any interrupts that may have been left out by the general
+  *    init_IRQ() routine.  interrupts having to do with the machine rather
+  *    than the devices on the I/O bus (like APIC interrupts in intel MP
+  *    systems) are started here.
+  **/
+ void __init x86_quirk_intr_init(void)
+ {
+       if (x86_quirks->arch_intr_init) {
+               if (x86_quirks->arch_intr_init())
+                       return;
+       }
+ }
+ 
+ /**
+  * x86_quirk_trap_init - initialise system specific traps
+  *
+  * Description:
+  *    Called as the final act of trap_init().  Used in VISWS to initialise
+  *    the various board specific APIC traps.
+  **/
+ void __init x86_quirk_trap_init(void)
+ {
+       if (x86_quirks->arch_trap_init) {
+               if (x86_quirks->arch_trap_init())
+                       return;
+       }
+ }
+ 
+ static struct irqaction irq0  = {
+       .handler = timer_interrupt,
+       .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+       .mask = CPU_MASK_NONE,
+       .name = "timer"
+ };
+ 
+ /**
+  * x86_quirk_pre_time_init - do any specific initialisations before.
+  *
+  **/
+ void __init x86_quirk_pre_time_init(void)
+ {
+       if (x86_quirks->arch_pre_time_init)
+               x86_quirks->arch_pre_time_init();
+ }
+ 
+ /**
+  * x86_quirk_time_init - do any specific initialisations for the system timer.
+  *
+  * Description:
+  *    Must plug the system timer interrupt source at HZ into the IRQ listed
+  *    in irq_vectors.h:TIMER_IRQ
+  **/
+ void __init x86_quirk_time_init(void)
+ {
+       if (x86_quirks->arch_time_init) {
+               /*
+                * A nonzero return code does not mean failure, it means
+                * that the architecture quirk does not want any
+                * generic (timer) setup to be performed after this:
+                */
+               if (x86_quirks->arch_time_init())
+                       return;
+       }
+ 
+       irq0.mask = cpumask_of_cpu(0);
+       setup_irq(0, &irq0);
+ }
+ #endif /* CONFIG_X86_32 */
diff --combined arch/x86/kernel/tsc.c

index 08afa1579e6d84d9ba94e10045d83761def46850,83d53ce5d4c4a98aa703032f4f7390292a35536f..7a567ebe63614381e1cd511f3730ecc898cbd3e0
--- 1/arch/x86/kernel/tsc.c
--- 2/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@@ -17,21 -17,20 +17,21 @@@
   #include <asm/delay.h>
   #include <asm/hypervisor.h>
   
- -unsigned int cpu_khz;           /* TSC clocks / usec, not used here */
+ +unsigned int __read_mostly cpu_khz;   /* TSC clocks / usec, not used here */
   EXPORT_SYMBOL(cpu_khz);
- -unsigned int tsc_khz;
+ +
+ +unsigned int __read_mostly tsc_khz;
   EXPORT_SYMBOL(tsc_khz);
   
   /*
    * TSC can be unstable due to cpufreq or due to unsynced TSCs
    */
- -static int tsc_unstable;
+ +static int __read_mostly tsc_unstable;
   
   /* native_sched_clock() is called before tsc_init(), so
      we must start with the TSC soft disabled to prevent
      erroneous rdtsc usage on !cpu_has_tsc processors */
- -static int tsc_disabled = -1;
+ +static int __read_mostly tsc_disabled = -1;
   
   static int tsc_clocksource_reliable;
   /*
@@@ -274,43 -273,30 +274,43 @@@ static unsigned long pit_calibrate_tsc(
    * use the TSC value at the transitions to calculate a pretty
    * good value for the TSC frequencty.
    */
- -static inline int pit_expect_msb(unsigned char val)
+ +static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
   {
- -      int count = 0;
+ +      int count;
+ +      u64 tsc = 0;
   
         for (count = 0; count < 50000; count++) {
                 /* Ignore LSB */
                 inb(0x42);
                 if (inb(0x42) != val)
                         break;
+ +              tsc = get_cycles();
         }
- -      return count > 50;
+ +      *deltap = get_cycles() - tsc;
+ +      *tscp = tsc;
+ +
+ +      /*
+ +       * We require _some_ success, but the quality control
+ +       * will be based on the error terms on the TSC values.
+ +       */
+ +      return count > 5;
   }
   
   /*
- - * How many MSB values do we want to see? We aim for a
- - * 15ms calibration, which assuming a 2us counter read
- - * error should give us roughly 150 ppm precision for
- - * the calibration.
+ + * How many MSB values do we want to see? We aim for
+ + * a maximum error rate of 500ppm (in practice the
+ + * real error is much smaller), but refuse to spend
+ + * more than 25ms on it.
    */
- -#define QUICK_PIT_MS 15
- -#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+ +#define MAX_QUICK_PIT_MS 25
+ +#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
   
   static unsigned long quick_pit_calibrate(void)
   {
+ +      int i;
+ +      u64 tsc, delta;
+ +      unsigned long d1, d2;
+ +
         /* Set the Gate high, disable speaker */
         outb((inb(0x61) & ~0x02) | 0x01, 0x61);
   
@@@ -329,52 -315,45 +329,52 @@@
         outb(0xff, 0x42);
         outb(0xff, 0x42);
   
- -      if (pit_expect_msb(0xff)) {
- -              int i;
- -              u64 t1, t2, delta;
- -              unsigned char expect = 0xfe;
- -
- -              t1 = get_cycles();
- -              for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
- -                      if (!pit_expect_msb(expect))
- -                              goto failed;
+ +      /*
+ +       * The PIT starts counting at the next edge, so we
+ +       * need to delay for a microsecond. The easiest way
+ +       * to do that is to just read back the 16-bit counter
+ +       * once from the PIT.
+ +       */
+ +      inb(0x42);
+ +      inb(0x42);
+ +
+ +      if (pit_expect_msb(0xff, &tsc, &d1)) {
+ +              for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
+ +                      if (!pit_expect_msb(0xff-i, &delta, &d2))
+ +                              break;
+ +
+ +                      /*
+ +                       * Iterate until the error is less than 500 ppm
+ +                       */
+ +                      delta -= tsc;
+ +                      if (d1+d2 < delta >> 11)
+ +                              goto success;
                 }
- -              t2 = get_cycles();
- -
- -              /*
- -               * Make sure we can rely on the second TSC timestamp:
- -               */
- -              if (!pit_expect_msb(expect))
- -                      goto failed;
- -
- -              /*
- -               * Ok, if we get here, then we've seen the
- -               * MSB of the PIT decrement QUICK_PIT_ITERATIONS
- -               * times, and each MSB had many hits, so we never
- -               * had any sudden jumps.
- -               *
- -               * As a result, we can depend on there not being
- -               * any odd delays anywhere, and the TSC reads are
- -               * reliable.
- -               *
- -               * kHz = ticks / time-in-seconds / 1000;
- -               * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
- -               * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
- -               */
- -              delta = (t2 - t1)*PIT_TICK_RATE;
- -              do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
- -              printk("Fast TSC calibration using PIT\n");
- -              return delta;
         }
- -failed:
+ +      printk("Fast TSC calibration failed\n");
         return 0;
+ +
+ +success:
+ +      /*
+ +       * Ok, if we get here, then we've seen the
+ +       * MSB of the PIT decrement 'i' times, and the
+ +       * error has shrunk to less than 500 ppm.
+ +       *
+ +       * As a result, we can depend on there not being
+ +       * any odd delays anywhere, and the TSC reads are
+ +       * reliable (within the error). We also adjust the
+ +       * delta to the middle of the error bars, just
+ +       * because it looks nicer.
+ +       *
+ +       * kHz = ticks / time-in-seconds / 1000;
+ +       * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
+ +       * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
+ +       */
+ +      delta += (long)(d2 - d1)/2;
+ +      delta *= PIT_TICK_RATE;
+ +      do_div(delta, i*256*1000);
+ +      printk("Fast TSC calibration using PIT\n");
+ +      return delta;
   }
   
   /**
@@@ -544,6 -523,8 +544,6 @@@ unsigned long native_calibrate_tsc(void
         return tsc_pit_min;
   }
   
- -#ifdef CONFIG_X86_32
- -/* Only called from the Powernow K7 cpu freq driver */
   int recalibrate_cpu_khz(void)
   {
   #ifndef CONFIG_SMP
@@@ -565,6 -546,7 +565,6 @@@
   
   EXPORT_SYMBOL(recalibrate_cpu_khz);
   
- -#endif /* CONFIG_X86_32 */
   
   /* Accelerators for sched_clock()
    * convert from cycles(64bits) => nanoseconds (64bits)
@@@ -791,7 -773,7 +791,7 @@@ __cpuinit int unsynchronized_tsc(void
         if (!cpu_has_tsc || tsc_unstable)
                 return 1;
   
- #ifdef CONFIG_X86_SMP
+ #ifdef CONFIG_SMP
         if (apic_is_clustered_box())
                 return 1;
   #endif
diff --combined arch/x86/lguest/boot.c

index 960a8d9c049c697e8505e003e96e7cffcd089f86,f3a5305b8adfcf0fc243692287bd9ec4769c977f..9fe4ddaa8f6ff1fc53bbe09443d32d733876131c
--- 1/arch/x86/lguest/boot.c
--- 2/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@@ -173,24 -173,29 +173,29 @@@ static unsigned long save_fl(void
   {
         return lguest_data.irq_enabled;
   }
+ PV_CALLEE_SAVE_REGS_THUNK(save_fl);
   
   /* restore_flags() just sets the flags back to the value given. */
   static void restore_fl(unsigned long flags)
   {
         lguest_data.irq_enabled = flags;
   }
+ PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
   
   /* Interrupts go off... */
   static void irq_disable(void)
   {
         lguest_data.irq_enabled = 0;
   }
+ PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
   
   /* Interrupts go on... */
   static void irq_enable(void)
   {
         lguest_data.irq_enabled = X86_EFLAGS_IF;
   }
+ PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+ 
   /*:*/
   /*M:003 Note that we don't check for outstanding interrupts when we re-enable
    * them (or when we unmask an interrupt).  This seems to work for the moment,
@@@ -278,7 -283,7 +283,7 @@@ static void lguest_load_tls(struct thre
         /* There's one problem which normal hardware doesn't have: the Host
          * can't handle us removing entries we're currently using.  So we clear
          * the GS register here: if it's needed it'll be reloaded anyway. */
-       loadsegment(gs, 0);
+       lazy_load_gs(0);
         lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
   }
   
@@@ -343,11 -348,6 +348,11 @@@ static void lguest_cpuid(unsigned int *
                  * flush_tlb_user() for both user and kernel mappings unless
                  * the Page Global Enable (PGE) feature bit is set. */
                 *dx |= 0x00002000;
+ +              /* We also lie, and say we're family id 5.  6 or greater
+ +               * leads to a rdmsr in early_init_intel which we can't handle.
+ +               * Family ID is returned as bits 8-12 in ax. */
+ +              *ax &= 0xFFFFF0FF;
+ +              *ax |= 0x00000500;
                 break;
         case 0x80000000:
                 /* Futureproof this a little: if they ask how much extended
@@@ -594,21 -594,19 +599,21 @@@ static void __init lguest_init_IRQ(void
                 /* Some systems map "vectors" to interrupts weirdly.  Lguest has
                  * a straightforward 1 to 1 mapping, so force that here. */
                 __get_cpu_var(vector_irq)[vector] = i;
- -              if (vector != SYSCALL_VECTOR) {
- -                      set_intr_gate(vector,
- -                                    interrupt[vector-FIRST_EXTERNAL_VECTOR]);
- -                      set_irq_chip_and_handler_name(i, &lguest_irq_controller,
- -                                                    handle_level_irq,
- -                                                    "level");
- -              }
+ +              if (vector != SYSCALL_VECTOR)
+ +                      set_intr_gate(vector, interrupt[i]);
         }
         /* This call is required to set up for 4k stacks, where we have
          * separate stacks for hard and soft interrupts. */
         irq_ctx_init(smp_processor_id());
   }
   
+ +void lguest_setup_irq(unsigned int irq)
+ +{
+ +      irq_to_desc_alloc_cpu(irq, 0);
+ +      set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
+ +                                    handle_level_irq, "level");
+ +}
+ +
   /*
    * Time.
    *
@@@ -830,13 -828,14 +835,14 @@@ static u32 lguest_apic_safe_wait_icr_id
         return 0;
   }
   
- static struct apic_ops lguest_basic_apic_ops = {
-       .read = lguest_apic_read,
-       .write = lguest_apic_write,
-       .icr_read = lguest_apic_icr_read,
-       .icr_write = lguest_apic_icr_write,
-       .wait_icr_idle = lguest_apic_wait_icr_idle,
-       .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
+ static void set_lguest_basic_apic_ops(void)
+ {
+       apic->read = lguest_apic_read;
+       apic->write = lguest_apic_write;
+       apic->icr_read = lguest_apic_icr_read;
+       apic->icr_write = lguest_apic_icr_write;
+       apic->wait_icr_idle = lguest_apic_wait_icr_idle;
+       apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
   };
   #endif
   
@@@ -991,10 -990,10 +997,10 @@@ __init void lguest_init(void
   
         /* interrupt-related operations */
         pv_irq_ops.init_IRQ = lguest_init_IRQ;
-       pv_irq_ops.save_fl = save_fl;
-       pv_irq_ops.restore_fl = restore_fl;
-       pv_irq_ops.irq_disable = irq_disable;
-       pv_irq_ops.irq_enable = irq_enable;
+       pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+       pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+       pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+       pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
         pv_irq_ops.safe_halt = lguest_safe_halt;
   
         /* init-time operations */
@@@ -1037,7 -1036,7 +1043,7 @@@
   
   #ifdef CONFIG_X86_LOCAL_APIC
         /* apic read/write intercepts */
-       apic_ops = &lguest_basic_apic_ops;
+       set_lguest_basic_apic_ops();
   #endif
   
         /* time operations */
diff --combined arch/x86/mm/pageattr.c

index 7233bd7e357bbccf2e6f091322606b083ca6f9d1,8253bc97587e4950e2c2ea3170da767245ac825f..9c4294986af779ed62ab088af0dae0f0048eb0c9
--- 1/arch/x86/mm/pageattr.c
--- 2/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@@ -482,6 -482,13 +482,13 @@@ static int split_large_page(pte_t *kpte
         pbase = (pte_t *)page_address(base);
         paravirt_alloc_pte(&init_mm, page_to_pfn(base));
         ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+       /*
+        * If we ever want to utilize the PAT bit, we need to
+        * update this function to make sure it's converted from
+        * bit 12 to bit 7 when we cross from the 2MB level to
+        * the 4K level:
+        */
+       WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
   
   #ifdef CONFIG_X86_64
         if (level == PG_LEVEL_1G) {
@@@ -515,17 -522,6 +522,17 @@@
          * primary protection behavior:
          */
         __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+ +
+ +      /*
+ +       * Intel Atom errata AAH41 workaround.
+ +       *
+ +       * The real fix should be in hw or in a microcode update, but
+ +       * we also probabilistically try to reduce the window of having
+ +       * a large TLB mixed with 4K TLBs while instruction fetches are
+ +       * going on.
+ +       */
+ +      __flush_tlb_all();
+ +
         base = NULL;
   
   out_unlock:
diff --combined drivers/acpi/osl.c

index 1e35f342957c2cf63241433ab30fef0d1cd5459e,2b6c5902825437470d50e7e841a23d3c37b10d61..eb8980d67368e12b45667ddc87cba141bf4024de
--- 1/drivers/acpi/osl.c
--- 2/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@@ -272,14 -272,21 +272,21 @@@ acpi_os_map_memory(acpi_physical_addres
   }
   EXPORT_SYMBOL_GPL(acpi_os_map_memory);
   
- void acpi_os_unmap_memory(void __iomem * virt, acpi_size size)
+ void __ref acpi_os_unmap_memory(void __iomem *virt, acpi_size size)
   {
-       if (acpi_gbl_permanent_mmap) {
+       if (acpi_gbl_permanent_mmap)
                 iounmap(virt);
-       }
+       else
+               __acpi_unmap_table(virt, size);
   }
   EXPORT_SYMBOL_GPL(acpi_os_unmap_memory);
   
+ void __init early_acpi_os_unmap_memory(void __iomem *virt, acpi_size size)
+ {
+       if (!acpi_gbl_permanent_mmap)
+               __acpi_unmap_table(virt, size);
+ }
+ 
   #ifdef ACPI_FUTURE_USAGE
   acpi_status
   acpi_os_get_physical_address(void *virt, acpi_physical_address * phys)
@@@ -1317,6 -1324,54 +1324,6 @@@ acpi_os_validate_interface (char *inter
         return AE_SUPPORT;
   }
   
- -#ifdef        CONFIG_X86
- -
- -struct aml_port_desc {
- -      uint    start;
- -      uint    end;
- -      char*   name;
- -      char    warned;
- -};
- -
- -static struct aml_port_desc aml_invalid_port_list[] = {
- -      {0x20, 0x21, "PIC0", 0},
- -      {0xA0, 0xA1, "PIC1", 0},
- -      {0x4D0, 0x4D1, "ELCR", 0}
- -};
- -
- -/*
- - * valid_aml_io_address()
- - *
- - * if valid, return true
- - * else invalid, warn once, return false
- - */
- -static bool valid_aml_io_address(uint address, uint length)
- -{
- -      int i;
- -      int entries = sizeof(aml_invalid_port_list) / sizeof(struct aml_port_desc);
- -
- -      for (i = 0; i < entries; ++i) {
- -              if ((address >= aml_invalid_port_list[i].start &&
- -                      address <= aml_invalid_port_list[i].end) ||
- -                      (address + length >= aml_invalid_port_list[i].start &&
- -                      address  + length <= aml_invalid_port_list[i].end))
- -              {
- -                      if (!aml_invalid_port_list[i].warned)
- -                      {
- -                              printk(KERN_ERR "ACPI: Denied BIOS AML access"
- -                                      " to invalid port 0x%x+0x%x (%s)\n",
- -                                      address, length,
- -                                      aml_invalid_port_list[i].name);
- -                              aml_invalid_port_list[i].warned = 1;
- -                      }
- -                      return false;   /* invalid */
- -              }
- -      }
- -      return true;    /* valid */
- -}
- -#else
- -static inline bool valid_aml_io_address(uint address, uint length) { return true; }
- -#endif
   /******************************************************************************
    *
    * FUNCTION:    acpi_os_validate_address
@@@ -1346,6 -1401,8 +1353,6 @@@ acpi_os_validate_address 
   
         switch (space_id) {
         case ACPI_ADR_SPACE_SYSTEM_IO:
- -              if (!valid_aml_io_address(address, length))
- -                      return AE_AML_ILLEGAL_ADDRESS;
         case ACPI_ADR_SPACE_SYSTEM_MEMORY:
                 /* Only interference checks against SystemIO and SytemMemory
                    are needed */
diff --combined drivers/net/sfc/efx.c

index 6eff9ca6c6c81c87df26b418f6c31cb3a94fa908,847e9bb0098f2bf0b2543a1cb53e1135a3f5939a..00c23b1babcad720154fe7f73b346cddc1b3b3f9
--- 1/drivers/net/sfc/efx.c
--- 2/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@@ -133,16 -133,6 +133,16 @@@ static int phy_flash_cfg
   module_param(phy_flash_cfg, int, 0644);
   MODULE_PARM_DESC(phy_flash_cfg, "Set PHYs into reflash mode initially");
   
+ +static unsigned irq_adapt_low_thresh = 10000;
+ +module_param(irq_adapt_low_thresh, uint, 0644);
+ +MODULE_PARM_DESC(irq_adapt_low_thresh,
+ +               "Threshold score for reducing IRQ moderation");
+ +
+ +static unsigned irq_adapt_high_thresh = 20000;
+ +module_param(irq_adapt_high_thresh, uint, 0644);
+ +MODULE_PARM_DESC(irq_adapt_high_thresh,
+ +               "Threshold score for increasing IRQ moderation");
+ +
   /**************************************************************************
    *
    * Utility functions and prototypes
@@@ -192,6 -182,7 +192,6 @@@ static int efx_process_channel(struct e
                 channel->rx_pkt = NULL;
         }
   
- -      efx_flush_lro(channel);
         efx_rx_strategy(channel);
   
         efx_fast_push_rx_descriptors(&efx->rx_queue[channel->channel]);
@@@ -233,41 -224,12 +233,41 @@@ static int efx_poll(struct napi_struct 
         rx_packets = efx_process_channel(channel, budget);
   
         if (rx_packets < budget) {
+ +              struct efx_nic *efx = channel->efx;
+ +
+ +              if (channel->used_flags & EFX_USED_BY_RX &&
+ +                  efx->irq_rx_adaptive &&
+ +                  unlikely(++channel->irq_count == 1000)) {
+ +                      unsigned old_irq_moderation = channel->irq_moderation;
+ +
+ +                      if (unlikely(channel->irq_mod_score <
+ +                                   irq_adapt_low_thresh)) {
+ +                              channel->irq_moderation =
+ +                                      max_t(int,
+ +                                            channel->irq_moderation -
+ +                                            FALCON_IRQ_MOD_RESOLUTION,
+ +                                            FALCON_IRQ_MOD_RESOLUTION);
+ +                      } else if (unlikely(channel->irq_mod_score >
+ +                                          irq_adapt_high_thresh)) {
+ +                              channel->irq_moderation =
+ +                                      min(channel->irq_moderation +
+ +                                          FALCON_IRQ_MOD_RESOLUTION,
+ +                                          efx->irq_rx_moderation);
+ +                      }
+ +
+ +                      if (channel->irq_moderation != old_irq_moderation)
+ +                              falcon_set_int_moderation(channel);
+ +
+ +                      channel->irq_count = 0;
+ +                      channel->irq_mod_score = 0;
+ +              }
+ +
                 /* There is no race here; although napi_disable() will
- -               * only wait for netif_rx_complete(), this isn't a problem
+ +               * only wait for napi_complete(), this isn't a problem
                  * since efx_channel_processed() will have no effect if
                  * interrupts have already been disabled.
                  */
- -              netif_rx_complete(napi);
+ +              napi_complete(napi);
                 efx_channel_processed(channel);
         }
   
@@@ -596,8 -558,6 +596,8 @@@ static void efx_link_status_changed(str
   
   }
   
+ +static void efx_fini_port(struct efx_nic *efx);
+ +
   /* This call reinitialises the MAC to pick up new PHY settings. The
    * caller must hold the mac_lock */
   void __efx_reconfigure_port(struct efx_nic *efx)
@@@ -633,8 -593,8 +633,8 @@@
   
   fail:
         EFX_ERR(efx, "failed to reconfigure MAC\n");
- -      efx->phy_op->fini(efx);
- -      efx->port_initialized = false;
+ +      efx->port_enabled = false;
+ +      efx_fini_port(efx);
   }
   
   /* Reinitialise the MAC to pick up new PHY settings, even if the port is
@@@ -894,20 -854,27 +894,27 @@@ static void efx_fini_io(struct efx_nic 
    * interrupts across them. */
   static int efx_wanted_rx_queues(void)
   {
-       cpumask_t core_mask;
+       cpumask_var_t core_mask;
         int count;
         int cpu;
   
-       cpus_clear(core_mask);
+       if (!alloc_cpumask_var(&core_mask, GFP_KERNEL)) {
+               printk(KERN_WARNING
+                      "efx.c: allocation failure, irq balancing hobbled\n");
+               return 1;
+       }
+ 
+       cpumask_clear(core_mask);
         count = 0;
         for_each_online_cpu(cpu) {
-               if (!cpu_isset(cpu, core_mask)) {
+               if (!cpumask_test_cpu(cpu, core_mask)) {
                         ++count;
-                       cpus_or(core_mask, core_mask,
-                               topology_core_siblings(cpu));
+                       cpumask_or(core_mask, core_mask,
+                                  topology_core_cpumask(cpu));
                 }
         }
   
+       free_cpumask_var(core_mask);
         return count;
   }
   
@@@ -1030,7 -997,7 +1037,7 @@@ static int efx_probe_nic(struct efx_ni
         efx_set_channels(efx);
   
         /* Initialise the interrupt moderation settings */
- -      efx_init_irq_moderation(efx, tx_irq_mod_usec, rx_irq_mod_usec);
+ +      efx_init_irq_moderation(efx, tx_irq_mod_usec, rx_irq_mod_usec, true);
   
         return 0;
   }
@@@ -1227,8 -1194,7 +1234,8 @@@ void efx_flush_queues(struct efx_nic *e
    **************************************************************************/
   
   /* Set interrupt moderation parameters */
- -void efx_init_irq_moderation(struct efx_nic *efx, int tx_usecs, int rx_usecs)
+ +void efx_init_irq_moderation(struct efx_nic *efx, int tx_usecs, int rx_usecs,
+ +                           bool rx_adaptive)
   {
         struct efx_tx_queue *tx_queue;
         struct efx_rx_queue *rx_queue;
@@@ -1238,8 -1204,6 +1245,8 @@@
         efx_for_each_tx_queue(tx_queue, efx)
                 tx_queue->channel->irq_moderation = tx_usecs;
   
+ +      efx->irq_rx_adaptive = rx_adaptive;
+ +      efx->irq_rx_moderation = rx_usecs;
         efx_for_each_rx_queue(rx_queue, efx)
                 rx_queue->channel->irq_moderation = rx_usecs;
   }
@@@ -1312,11 -1276,18 +1319,11 @@@ static int efx_ioctl(struct net_device 
   static int efx_init_napi(struct efx_nic *efx)
   {
         struct efx_channel *channel;
- -      int rc;
   
         efx_for_each_channel(channel, efx) {
                 channel->napi_dev = efx->net_dev;
- -              rc = efx_lro_init(&channel->lro_mgr, efx);
- -              if (rc)
- -                      goto err;
         }
         return 0;
- - err:
- -      efx_fini_napi(efx);
- -      return rc;
   }
   
   static void efx_fini_napi(struct efx_nic *efx)
@@@ -1324,6 -1295,7 +1331,6 @@@
         struct efx_channel *channel;
   
         efx_for_each_channel(channel, efx) {
- -              efx_lro_fini(&channel->lro_mgr);
                 channel->napi_dev = NULL;
         }
   }
@@@ -1711,8 -1683,7 +1718,8 @@@ int efx_reset_up(struct efx_nic *efx, e
                         rc = efx->phy_op->init(efx);
                         if (rc)
                                 ok = false;
- -              } else
+ +              }
+ +              if (!ok)
                         efx->port_initialized = false;
         }
   
@@@ -1893,8 -1864,8 +1900,8 @@@ static struct efx_phy_operations efx_du
   
   static struct efx_board efx_dummy_board_info = {
         .init           = efx_port_dummy_op_int,
- -      .init_leds      = efx_port_dummy_op_int,
- -      .set_fault_led  = efx_port_dummy_op_blink,
+ +      .init_leds      = efx_port_dummy_op_void,
+ +      .set_id_led     = efx_port_dummy_op_blink,
         .monitor        = efx_port_dummy_op_int,
         .blink          = efx_port_dummy_op_blink,
         .fini           = efx_port_dummy_op_void,
@@@ -2156,7 -2127,7 +2163,7 @@@ static int __devinit efx_pci_probe(stru
         net_dev->features |= (NETIF_F_IP_CSUM | NETIF_F_SG |
                               NETIF_F_HIGHDMA | NETIF_F_TSO);
         if (lro)
- -              net_dev->features |= NETIF_F_LRO;
+ +              net_dev->features |= NETIF_F_GRO;
         /* Mask for features that also apply to VLAN devices */
         net_dev->vlan_features |= (NETIF_F_ALL_CSUM | NETIF_F_SG |
                                    NETIF_F_HIGHDMA | NETIF_F_TSO);
diff --combined drivers/net/sfc/falcon.c

index 23a1b148d5b236347531e42a030bdbd912d1d3c5,064307c2277eff8bb400bb1f161426ff6c61ee29..d4629ab2c614d64cbd553d1235fc60b79c86fb39
--- 1/drivers/net/sfc/falcon.c
--- 2/drivers/net/sfc/falcon.c
+++ b/drivers/net/sfc/falcon.c
@@@ -39,16 -39,11 +39,16 @@@
    * @next_buffer_table: First available buffer table id
    * @pci_dev2: The secondary PCI device if present
    * @i2c_data: Operations and state for I2C bit-bashing algorithm
+ + * @int_error_count: Number of internal errors seen recently
+ + * @int_error_expire: Time at which error count will be expired
    */
   struct falcon_nic_data {
         unsigned next_buffer_table;
         struct pci_dev *pci_dev2;
         struct i2c_algo_bit_data i2c_data;
+ +
+ +      unsigned int_error_count;
+ +      unsigned long int_error_expire;
   };
   
   /**************************************************************************
@@@ -124,12 -119,8 +124,12 @@@ MODULE_PARM_DESC(rx_xon_thresh_bytes, "
   #define FALCON_EVQ_SIZE 4096
   #define FALCON_EVQ_MASK (FALCON_EVQ_SIZE - 1)
   
- -/* Max number of internal errors. After this resets will not be performed */
- -#define FALCON_MAX_INT_ERRORS 4
+ +/* If FALCON_MAX_INT_ERRORS internal errors occur within
+ + * FALCON_INT_ERROR_EXPIRE seconds, we consider the NIC broken and
+ + * disable it.
+ + */
+ +#define FALCON_INT_ERROR_EXPIRE 3600
+ +#define FALCON_MAX_INT_ERRORS 5
   
   /* We poll for events every FLUSH_INTERVAL ms, and check FLUSH_POLL_COUNT times
    */
@@@ -155,6 -146,13 +155,6 @@@
   /* Dummy SRAM size code */
   #define SRM_NB_BSZ_ONCHIP_ONLY (-1)
   
- -/* Be nice if these (or equiv.) were in linux/pci_regs.h, but they're not. */
- -#define PCI_EXP_DEVCAP_PWR_VAL_LBN    18
- -#define PCI_EXP_DEVCAP_PWR_SCL_LBN    26
- -#define PCI_EXP_DEVCTL_PAYLOAD_LBN    5
- -#define PCI_EXP_LNKSTA_LNK_WID                0x3f0
- -#define PCI_EXP_LNKSTA_LNK_WID_LBN    4
- -
   #define FALCON_IS_DUAL_FUNC(efx)              \
         (falcon_rev(efx) < FALCON_REV_B0)
   
@@@ -340,10 -338,10 +340,10 @@@ static int falcon_alloc_special_buffer(
         nic_data->next_buffer_table += buffer->entries;
   
         EFX_LOG(efx, "allocating special buffers %d-%d at %llx+%x "
-               "(virt %p phys %lx)\n", buffer->index,
+               "(virt %p phys %llx)\n", buffer->index,
                 buffer->index + buffer->entries - 1,
-               (unsigned long long)buffer->dma_addr, len,
-               buffer->addr, virt_to_phys(buffer->addr));
+               (u64)buffer->dma_addr, len,
+               buffer->addr, (u64)virt_to_phys(buffer->addr));
   
         return 0;
   }
@@@ -355,10 -353,10 +355,10 @@@ static void falcon_free_special_buffer(
                 return;
   
         EFX_LOG(efx, "deallocating special buffers %d-%d at %llx+%x "
-               "(virt %p phys %lx)\n", buffer->index,
+               "(virt %p phys %llx)\n", buffer->index,
                 buffer->index + buffer->entries - 1,
-               (unsigned long long)buffer->dma_addr, buffer->len,
-               buffer->addr, virt_to_phys(buffer->addr));
+               (u64)buffer->dma_addr, buffer->len,
+               buffer->addr, (u64)virt_to_phys(buffer->addr));
   
         pci_free_consistent(efx->pci_dev, buffer->len, buffer->addr,
                             buffer->dma_addr);
@@@ -729,9 -727,6 +729,9 @@@ static void falcon_handle_tx_event(stru
                 tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, TX_EV_DESC_PTR);
                 tx_ev_q_label = EFX_QWORD_FIELD(*event, TX_EV_Q_LABEL);
                 tx_queue = &efx->tx_queue[tx_ev_q_label];
+ +              channel->irq_mod_score +=
+ +                      (tx_ev_desc_ptr - tx_queue->read_count) &
+ +                      efx->type->txd_ring_mask;
                 efx_xmit_done(tx_queue, tx_ev_desc_ptr);
         } else if (EFX_QWORD_FIELD(*event, TX_EV_WQ_FF_FULL)) {
                 /* Rewrite the FIFO write pointer */
@@@ -901,8 -896,6 +901,8 @@@ static void falcon_handle_rx_event(stru
                         discard = true;
         }
   
+ +      channel->irq_mod_score += 2;
+ +
         /* Handle received packet */
         efx_rx_packet(rx_queue, rx_ev_desc_ptr, rx_ev_byte_cnt,
                       checksummed, discard);
@@@ -1080,15 -1073,14 +1080,15 @@@ void falcon_set_int_moderation(struct e
                  * program is based at 0.  So actual interrupt moderation
                  * achieved is ((x + 1) * res).
                  */
- -              unsigned int res = 5;
- -              channel->irq_moderation -= (channel->irq_moderation % res);
- -              if (channel->irq_moderation < res)
- -                      channel->irq_moderation = res;
+ +              channel->irq_moderation -= (channel->irq_moderation %
+ +                                          FALCON_IRQ_MOD_RESOLUTION);
+ +              if (channel->irq_moderation < FALCON_IRQ_MOD_RESOLUTION)
+ +                      channel->irq_moderation = FALCON_IRQ_MOD_RESOLUTION;
                 EFX_POPULATE_DWORD_2(timer_cmd,
                                      TIMER_MODE, TIMER_MODE_INT_HLDOFF,
                                      TIMER_VAL,
- -                                   (channel->irq_moderation / res) - 1);
+ +                                   channel->irq_moderation /
+ +                                   FALCON_IRQ_MOD_RESOLUTION - 1);
         } else {
                 EFX_POPULATE_DWORD_2(timer_cmd,
                                      TIMER_MODE, TIMER_MODE_DIS,
@@@ -1195,29 -1187,31 +1195,29 @@@ static void falcon_poll_flush_events(st
         struct efx_channel *channel = &efx->channel[0];
         struct efx_tx_queue *tx_queue;
         struct efx_rx_queue *rx_queue;
- -      unsigned int read_ptr, i;
+ +      unsigned int read_ptr = channel->eventq_read_ptr;
+ +      unsigned int end_ptr = (read_ptr - 1) & FALCON_EVQ_MASK;
   
- -      read_ptr = channel->eventq_read_ptr;
- -      for (i = 0; i < FALCON_EVQ_SIZE; ++i) {
+ +      do {
                 efx_qword_t *event = falcon_event(channel, read_ptr);
                 int ev_code, ev_sub_code, ev_queue;
                 bool ev_failed;
+ +
                 if (!falcon_event_present(event))
                         break;
   
                 ev_code = EFX_QWORD_FIELD(*event, EV_CODE);
- -              if (ev_code != DRIVER_EV_DECODE)
- -                      continue;
- -
                 ev_sub_code = EFX_QWORD_FIELD(*event, DRIVER_EV_SUB_CODE);
- -              switch (ev_sub_code) {
- -              case TX_DESCQ_FLS_DONE_EV_DECODE:
+ +              if (ev_code == DRIVER_EV_DECODE &&
+ +                  ev_sub_code == TX_DESCQ_FLS_DONE_EV_DECODE) {
                         ev_queue = EFX_QWORD_FIELD(*event,
                                                    DRIVER_EV_TX_DESCQ_ID);
                         if (ev_queue < EFX_TX_QUEUE_COUNT) {
                                 tx_queue = efx->tx_queue + ev_queue;
                                 tx_queue->flushed = true;
                         }
- -                      break;
- -              case RX_DESCQ_FLS_DONE_EV_DECODE:
+ +              } else if (ev_code == DRIVER_EV_DECODE &&
+ +                         ev_sub_code == RX_DESCQ_FLS_DONE_EV_DECODE) {
                         ev_queue = EFX_QWORD_FIELD(*event,
                                                    DRIVER_EV_RX_DESCQ_ID);
                         ev_failed = EFX_QWORD_FIELD(*event,
@@@ -1231,10 -1225,11 +1231,10 @@@
                                 else
                                         rx_queue->flushed = true;
                         }
- -                      break;
                 }
   
                 read_ptr = (read_ptr + 1) & FALCON_EVQ_MASK;
- -      }
+ +      } while (read_ptr != end_ptr);
   }
   
   /* Handle tx and rx flushes at the same time, since they run in
@@@ -1382,6 -1377,7 +1382,6 @@@ static irqreturn_t falcon_fatal_interru
         efx_oword_t *int_ker = efx->irq_status.addr;
         efx_oword_t fatal_intr;
         int error, mem_perr;
- -      static int n_int_errors;
   
         falcon_read(efx, &fatal_intr, FATAL_INTR_REG_KER);
         error = EFX_OWORD_FIELD(fatal_intr, INT_KER_ERROR);
@@@ -1408,14 -1404,7 +1408,14 @@@
                 pci_clear_master(nic_data->pci_dev2);
         falcon_disable_interrupts(efx);
   
- -      if (++n_int_errors < FALCON_MAX_INT_ERRORS) {
+ +      /* Count errors and reset or disable the NIC accordingly */
+ +      if (nic_data->int_error_count == 0 ||
+ +          time_after(jiffies, nic_data->int_error_expire)) {
+ +              nic_data->int_error_count = 0;
+ +              nic_data->int_error_expire =
+ +                      jiffies + FALCON_INT_ERROR_EXPIRE * HZ;
+ +      }
+ +      if (++nic_data->int_error_count < FALCON_MAX_INT_ERRORS) {
                 EFX_ERR(efx, "SYSTEM ERROR - reset scheduled\n");
                 efx_schedule_reset(efx, RESET_TYPE_INT_ERROR);
         } else {
@@@ -1434,7 -1423,6 +1434,7 @@@ static irqreturn_t falcon_legacy_interr
   {
         struct efx_nic *efx = dev_id;
         efx_oword_t *int_ker = efx->irq_status.addr;
+ +      irqreturn_t result = IRQ_NONE;
         struct efx_channel *channel;
         efx_dword_t reg;
         u32 queues;
@@@ -1449,24 -1437,23 +1449,24 @@@
         if (unlikely(syserr))
                 return falcon_fatal_interrupt(efx);
   
- -      if (queues == 0)
- -              return IRQ_NONE;
- -
- -      efx->last_irq_cpu = raw_smp_processor_id();
- -      EFX_TRACE(efx, "IRQ %d on CPU %d status " EFX_DWORD_FMT "\n",
- -                irq, raw_smp_processor_id(), EFX_DWORD_VAL(reg));
- -
         /* Schedule processing of any interrupting queues */
- -      channel = &efx->channel[0];
- -      while (queues) {
- -              if (queues & 0x01)
+ +      efx_for_each_channel(channel, efx) {
+ +              if ((queues & 1) ||
+ +                  falcon_event_present(
+ +                          falcon_event(channel, channel->eventq_read_ptr))) {
                         efx_schedule_channel(channel);
- -              channel++;
+ +                      result = IRQ_HANDLED;
+ +              }
                 queues >>= 1;
         }
   
- -      return IRQ_HANDLED;
+ +      if (result == IRQ_HANDLED) {
+ +              efx->last_irq_cpu = raw_smp_processor_id();
+ +              EFX_TRACE(efx, "IRQ %d on CPU %d status " EFX_DWORD_FMT "\n",
+ +                        irq, raw_smp_processor_id(), EFX_DWORD_VAL(reg));
+ +      }
+ +
+ +      return result;
   }
   
   
@@@ -2262,7 -2249,6 +2262,7 @@@ static int falcon_probe_phy(struct efx_
                 efx->phy_op = &falcon_sft9001_phy_ops;
                 break;
         case PHY_TYPE_QT2022C2:
+ +      case PHY_TYPE_QT2025C:
                 efx->phy_op = &falcon_xfp_phy_ops;
                 break;
         default:
@@@ -2357,10 -2343,10 +2357,10 @@@ int falcon_probe_port(struct efx_nic *e
                                  FALCON_MAC_STATS_SIZE);
         if (rc)
                 return rc;
-       EFX_LOG(efx, "stats buffer at %llx (virt %p phys %lx)\n",
-               (unsigned long long)efx->stats_buffer.dma_addr,
+       EFX_LOG(efx, "stats buffer at %llx (virt %p phys %llx)\n",
+               (u64)efx->stats_buffer.dma_addr,
                 efx->stats_buffer.addr,
-               virt_to_phys(efx->stats_buffer.addr));
+               (u64)virt_to_phys(efx->stats_buffer.addr));
   
         return 0;
   }
@@@ -2935,9 -2921,9 +2935,9 @@@ int falcon_probe_nic(struct efx_nic *ef
                 goto fail4;
         BUG_ON(efx->irq_status.dma_addr & 0x0f);
   
-       EFX_LOG(efx, "INT_KER at %llx (virt %p phys %lx)\n",
-               (unsigned long long)efx->irq_status.dma_addr,
-               efx->irq_status.addr, virt_to_phys(efx->irq_status.addr));
+       EFX_LOG(efx, "INT_KER at %llx (virt %p phys %llx)\n",
+               (u64)efx->irq_status.dma_addr,
+               efx->irq_status.addr, (u64)virt_to_phys(efx->irq_status.addr));
   
         falcon_probe_spi_devices(efx);
   
@@@ -3127,10 -3113,8 +3127,10 @@@ void falcon_remove_nic(struct efx_nic *
         struct falcon_nic_data *nic_data = efx->nic_data;
         int rc;
   
+ +      /* Remove I2C adapter and clear it in preparation for a retry */
         rc = i2c_del_adapter(&efx->i2c_adap);
         BUG_ON(rc);
+ +      memset(&efx->i2c_adap, 0, sizeof(efx->i2c_adap));
   
         falcon_remove_spi_devices(efx);
         falcon_free_buffer(efx, &efx->irq_status);
diff --combined drivers/pci/intr_remapping.c

index b721c2fbe8f5005c871874cd39c90c5f03ab4d40,8e44db040db7f4dba8cf8c12dc2087012965e16b..9d07a05d26f1125227fa8f37a73ac0c948ed32d2
--- 1/drivers/pci/intr_remapping.c
--- 2/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@@ -6,6 -6,7 +6,7 @@@
   #include <linux/irq.h>
   #include <asm/io_apic.h>
   #include <asm/smp.h>
+ #include <asm/cpu.h>
   #include <linux/intel-iommu.h>
   #include "intr_remapping.h"
   
@@@ -20,7 -21,7 +21,7 @@@ struct irq_2_iommu 
         u8  irte_mask;
   };
   
- -#ifdef CONFIG_SPARSE_IRQ
+ +#ifdef CONFIG_GENERIC_HARDIRQS
   static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
   {
         struct irq_2_iommu *iommu;
diff --combined include/asm-generic/vmlinux.lds.h

index aca40b93bd28f604f12d9f04d8443277a6f3dba0,5406e70aba864d1d03f154e36b2f212baacb93c4..a654d724d3b05b7d1e6a280d994f2a3f4df58dd1
--- 1/include/asm-generic/vmlinux.lds.h
--- 2/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@@ -80,11 -80,6 +80,11 @@@
         VMLINUX_SYMBOL(__start___tracepoints) = .;                      \
         *(__tracepoints)                                                \
         VMLINUX_SYMBOL(__stop___tracepoints) = .;                       \
+ +      /* implement dynamic printk debug */                            \
+ +      . = ALIGN(8);                                                   \
+ +      VMLINUX_SYMBOL(__start___verbose) = .;                          \
+ +      *(__verbose)                                                    \
+ +      VMLINUX_SYMBOL(__stop___verbose) = .;                           \
         LIKELY_PROFILE()                                                \
         BRANCH_PROFILE()
   
@@@ -314,7 -309,15 +314,7 @@@
         CPU_DISCARD(init.data)                                          \
         CPU_DISCARD(init.rodata)                                        \
         MEM_DISCARD(init.data)                                          \
- -      MEM_DISCARD(init.rodata)                                        \
- -      /* implement dynamic printk debug */                            \
- -      VMLINUX_SYMBOL(__start___verbose_strings) = .;                  \
- -      *(__verbose_strings)                                            \
- -      VMLINUX_SYMBOL(__stop___verbose_strings) = .;                   \
- -      . = ALIGN(8);                                                   \
- -      VMLINUX_SYMBOL(__start___verbose) = .;                          \
- -      *(__verbose)                                                    \
- -      VMLINUX_SYMBOL(__stop___verbose) = .;
+ +      MEM_DISCARD(init.rodata)
   
   #define INIT_TEXT                                                     \
         *(.init.text)                                                   \
@@@ -427,12 -430,59 +427,59 @@@
         *(.initcall7.init)                                              \
         *(.initcall7s.init)
   
+ /**
+  * PERCPU_VADDR - define output section for percpu area
+  * @vaddr: explicit base address (optional)
+  * @phdr: destination PHDR (optional)
+  *
+  * Macro which expands to output section for percpu area.  If @vaddr
+  * is not blank, it specifies explicit base address and all percpu
+  * symbols will be offset from the given address.  If blank, @vaddr
+  * always equals @laddr + LOAD_OFFSET.
+  *
+  * @phdr defines the output PHDR to use if not blank.  Be warned that
+  * output PHDR is sticky.  If @phdr is specified, the next output
+  * section in the linker script will go there too.  @phdr should have
+  * a leading colon.
+  *
+  * Note that this macro defines __per_cpu_load as an absolute symbol.
+  * If there is no need to put the percpu section at a predetermined
+  * address, use PERCPU().
+  */
+ #define PERCPU_VADDR(vaddr, phdr)                                     \
+       VMLINUX_SYMBOL(__per_cpu_load) = .;                             \
+       .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load)          \
+                               - LOAD_OFFSET) {                        \
+               VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
+               *(.data.percpu.first)                                   \
+               *(.data.percpu.page_aligned)                            \
+               *(.data.percpu)                                         \
+               *(.data.percpu.shared_aligned)                          \
+               VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
+       } phdr                                                          \
+       . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
+ 
+ /**
+  * PERCPU - define output section for percpu area, simple version
+  * @align: required alignment
+  *
+  * Aligns to @align and emits the output section for percpu area.  This
+  * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and
+  * __per_cpu_start will be identical.
+  *
+  * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except
+  * that __per_cpu_load is defined as a relative symbol against
+  * .data.percpu which is required for relocatable x86_32
+  * configuration.
+  */
   #define PERCPU(align)                                                 \
         . = ALIGN(align);                                               \
-       VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
-       .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {          \
+       .data.percpu    : AT(ADDR(.data.percpu) - LOAD_OFFSET) {        \
+               VMLINUX_SYMBOL(__per_cpu_load) = .;                     \
+               VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
+               *(.data.percpu.first)                                   \
                 *(.data.percpu.page_aligned)                            \
                 *(.data.percpu)                                         \
                 *(.data.percpu.shared_aligned)                          \
-       }                                                               \
-       VMLINUX_SYMBOL(__per_cpu_end) = .;
+               VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
+       }
diff --combined include/linux/interrupt.h

index 91658d0765982c04ee79e665411c82a17de10cb7,472f11765f608318093ed82919f5c7fdf4f00a7d..0c9cb63e689530cfbc6a060b8a0b07ff2175bd62
--- 1/include/linux/interrupt.h
--- 2/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@@ -61,17 -61,6 +61,17 @@@
   
   typedef irqreturn_t (*irq_handler_t)(int, void *);
   
+ +/**
+ + * struct irqaction - per interrupt action descriptor
+ + * @handler:  interrupt handler function
+ + * @flags:    flags (see IRQF_* above)
+ + * @mask:     no comment as it is useless and about to be removed
+ + * @name:     name of the device
+ + * @dev_id:   cookie to identify the device
+ + * @next:     pointer to the next irqaction for shared interrupts
+ + * @irq:      interrupt number
+ + * @dir:      pointer to the proc/irq/NN/name entry
+ + */
   struct irqaction {
         irq_handler_t handler;
         unsigned long flags;
@@@ -473,17 -462,12 +473,18 @@@ static inline void init_irq_proc(void
   }
   #endif
   
+ +#if defined(CONFIG_GENERIC_HARDIRQS) && defined(CONFIG_DEBUG_SHIRQ)
+ +extern void debug_poll_all_shared_irqs(void);
+ +#else
+ +static inline void debug_poll_all_shared_irqs(void) { }
+ +#endif
+ +
   int show_interrupts(struct seq_file *p, void *v);
   
   struct irq_desc;
   
   extern int early_irq_init(void);
+ extern int arch_probe_nr_irqs(void);
   extern int arch_early_irq_init(void);
   extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
   
diff --combined include/linux/irq.h

index 6db939a575bd4780fe39bb5a91ff7fa3bac08757,27a67536511ef45907a4108c46983fa1977acb44..873e4ac11b813accc72c6699265b039f62e7c575
--- 1/include/linux/irq.h
--- 2/include/linux/irq.h
+++ b/include/linux/irq.h
@@@ -160,10 -160,12 +160,10 @@@ struct irq_2_iommu
    */
   struct irq_desc {
         unsigned int            irq;
- -#ifdef CONFIG_SPARSE_IRQ
         struct timer_rand_state *timer_rand_state;
         unsigned int            *kstat_irqs;
- -# ifdef CONFIG_INTR_REMAP
+ +#ifdef CONFIG_INTR_REMAP
         struct irq_2_iommu      *irq_2_iommu;
- -# endif
   #endif
         irq_flow_handler_t      handle_irq;
         struct irq_chip         *chip;
@@@ -180,11 -182,11 +180,11 @@@
         unsigned int            irqs_unhandled;
         spinlock_t              lock;
   #ifdef CONFIG_SMP
-       cpumask_t               affinity;
+       cpumask_var_t           affinity;
         unsigned int            cpu;
- #endif
   #ifdef CONFIG_GENERIC_PENDING_IRQ
-       cpumask_t               pending_mask;
+       cpumask_var_t           pending_mask;
+ #endif
   #endif
   #ifdef CONFIG_PROC_FS
         struct proc_dir_entry   *dir;
@@@ -200,6 -202,12 +200,6 @@@ extern void arch_free_chip_data(struct 
   extern struct irq_desc irq_desc[NR_IRQS];
   #else /* CONFIG_SPARSE_IRQ */
   extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
- -
- -#define kstat_irqs_this_cpu(DESC) \
- -      ((DESC)->kstat_irqs[smp_processor_id()])
- -#define kstat_incr_irqs_this_cpu(irqno, DESC) \
- -      ((DESC)->kstat_irqs[smp_processor_id()]++)
- -
   #endif /* CONFIG_SPARSE_IRQ */
   
   extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
@@@ -218,6 -226,7 +218,6 @@@ irq_remap_to_desc(unsigned int irq, str
    * Migration helpers for obsolete names, they will go away:
    */
   #define hw_interrupt_type     irq_chip
- -typedef struct irq_chip               hw_irq_controller;
   #define no_irq_type           no_irq_chip
   typedef struct irq_desc               irq_desc_t;
   
@@@ -227,7 -236,6 +227,7 @@@
   #include <asm/hw_irq.h>
   
   extern int setup_irq(unsigned int irq, struct irqaction *new);
+ +extern void remove_irq(unsigned int irq, struct irqaction *act);
   
   #ifdef CONFIG_GENERIC_HARDIRQS
   
@@@ -272,7 -280,7 +272,7 @@@ static inline int irq_balancing_disable
   }
   
   /* Handle irq action chains: */
- -extern int handle_IRQ_event(unsigned int irq, struct irqaction *action);
+ +extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action);
   
   /*
    * Built-in IRQ handlers for various IRQ types,
@@@ -317,7 -325,7 +317,7 @@@ static inline void generic_handle_irq(u
   
   /* Handling of unhandled and spurious interrupts: */
   extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
- -                         int action_ret);
+ +                         irqreturn_t action_ret);
   
   /* Resending of interrupts :*/
   void check_irq_resend(struct irq_desc *desc, unsigned int irq);
@@@ -414,4 -422,84 +414,84 @@@ extern int set_irq_msi(unsigned int irq
   
   #endif /* !CONFIG_S390 */
   
+ #ifdef CONFIG_SMP
+ /**
+  * init_alloc_desc_masks - allocate cpumasks for irq_desc
+  * @desc:     pointer to irq_desc struct
+  * @cpu:      cpu which will be handling the cpumasks
+  * @boot:     true if need bootmem
+  *
+  * Allocates affinity and pending_mask cpumask if required.
+  * Returns true if successful (or not required).
+  * Side effect: affinity has all bits set, pending_mask has all bits clear.
+  */
+ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+                                                               bool boot)
+ {
+       int node;
+ 
+       if (boot) {
+               alloc_bootmem_cpumask_var(&desc->affinity);
+               cpumask_setall(desc->affinity);
+ 
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+               alloc_bootmem_cpumask_var(&desc->pending_mask);
+               cpumask_clear(desc->pending_mask);
+ #endif
+               return true;
+       }
+ 
+       node = cpu_to_node(cpu);
+ 
+       if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+               return false;
+       cpumask_setall(desc->affinity);
+ 
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+       if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+               free_cpumask_var(desc->affinity);
+               return false;
+       }
+       cpumask_clear(desc->pending_mask);
+ #endif
+       return true;
+ }
+ 
+ /**
+  * init_copy_desc_masks - copy cpumasks for irq_desc
+  * @old_desc: pointer to old irq_desc struct
+  * @new_desc: pointer to new irq_desc struct
+  *
+  * Ensures affinity and pending_masks are copied to new irq_desc.
+  * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
+  * irq_desc struct so the copy is redundant.
+  */
+ 
+ static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+                                       struct irq_desc *new_desc)
+ {
+ #ifdef CONFIG_CPUMASKS_OFFSTACK
+       cpumask_copy(new_desc->affinity, old_desc->affinity);
+ 
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+       cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
+ #endif
+ #endif
+ }
+ 
+ #else /* !CONFIG_SMP */
+ 
+ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+                                                               bool boot)
+ {
+       return true;
+ }
+ 
+ static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+                                       struct irq_desc *new_desc)
+ {
+ }
+ 
+ #endif        /* CONFIG_SMP */
+ 
   #endif /* _LINUX_IRQ_H */
diff --combined include/linux/irqnr.h

index 52ebbb4b161d46372ad3c53be48b2e14cf61811a,887477bc2ab0841287460aef51517beb2377ea58..ec87b212ff7d03c446bc16b0429baeb86c5d645c
--- 1/include/linux/irqnr.h
--- 2/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@@ -20,6 -20,7 +20,7 @@@
   
   # define for_each_irq_desc_reverse(irq, desc)                          \
         for (irq = nr_irqs - 1; irq >= 0; irq--)
+ 
   #else /* CONFIG_GENERIC_HARDIRQS */
   
   extern int nr_irqs;
@@@ -28,17 -29,13 +29,17 @@@ extern struct irq_desc *irq_to_desc(uns
   # define for_each_irq_desc(irq, desc)                                 \
         for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;           \
              irq++, desc = irq_to_desc(irq))                            \
- -              if (desc)
+ +              if (!desc)                                              \
+ +                      ;                                               \
+ +              else
   
   
   # define for_each_irq_desc_reverse(irq, desc)                         \
         for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0;      \
              irq--, desc = irq_to_desc(irq))                            \
- -              if (desc)
+ +              if (!desc)                                              \
+ +                      ;                                               \
+ +              else
   
   #endif /* CONFIG_GENERIC_HARDIRQS */
   
diff --combined include/linux/sched.h

index ff904b0606d472fa05d4f069000afbe62fd356ae,f0a50b20e8a03c2f98dec1cd859aa6e06853f7e2..1d19c025f9d2e78b34692b3f350d9bef6fe522ed
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -998,7 -998,6 +998,7 @@@ struct sched_class 
                               struct rq *busiest, struct sched_domain *sd,
                               enum cpu_idle_type idle);
         void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+ +      int (*needs_post_schedule) (struct rq *this_rq);
         void (*post_schedule) (struct rq *this_rq);
         void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
   
@@@ -1053,10 -1052,6 +1053,10 @@@ struct sched_entity 
         u64                     last_wakeup;
         u64                     avg_overlap;
   
+ +      u64                     start_runtime;
+ +      u64                     avg_wakeup;
+ +      u64                     nr_migrations;
+ +
   #ifdef CONFIG_SCHEDSTATS
         u64                     wait_start;
         u64                     wait_max;
@@@ -1072,6 -1067,7 +1072,6 @@@
         u64                     exec_max;
         u64                     slice_max;
   
- -      u64                     nr_migrations;
         u64                     nr_migrations_cold;
         u64                     nr_failed_migrations_affine;
         u64                     nr_failed_migrations_running;
@@@ -1168,7 -1164,6 +1168,7 @@@ struct task_struct 
   #endif
   
         struct list_head tasks;
+ +      struct plist_node pushable_tasks;
   
         struct mm_struct *mm, *active_mm;
   
@@@ -1180,15 -1175,12 +1180,14 @@@
         /* ??? */
         unsigned int personality;
         unsigned did_exec:1;
+ +      unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
+ +                               * execve */
         pid_t pid;
         pid_t tgid;
   
- #ifdef CONFIG_CC_STACKPROTECTOR
         /* Canary value for the -fstack-protector gcc feature */
         unsigned long stack_canary;
- #endif
+ 
         /* 
          * pointers to (original) parent process, youngest child, younger sibling,
          * older sibling, respectively.  (p->father can be replaced with 
@@@ -1426,9 -1418,6 +1425,9 @@@
   #endif
   };
   
+ +/* Future-safe accessor for struct task_struct's cpus_allowed. */
+ +#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
+ +
   /*
    * Priority of a process goes from 0..MAX_PRIO-1, valid RT
    * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@@ -1680,16 -1669,6 +1679,16 @@@ static inline int set_cpus_allowed(stru
         return set_cpus_allowed_ptr(p, &new_mask);
   }
   
+ +/*
+ + * Architectures can set this to 1 if they have specified
+ + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ + * but then during bootup it turns out that sched_clock()
+ + * is reliable after all:
+ + */
+ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ +extern int sched_clock_stable;
+ +#endif
+ +
   extern unsigned long long sched_clock(void);
   
   extern void sched_clock_init(void);
@@@ -2107,6 -2086,19 +2106,19 @@@ static inline int object_is_on_stack(vo
   
   extern void thread_info_cache_init(void);
   
+ #ifdef CONFIG_DEBUG_STACK_USAGE
+ static inline unsigned long stack_not_used(struct task_struct *p)
+ {
+       unsigned long *n = end_of_stack(p);
+ 
+       do {    /* Skip over canary */
+               n++;
+       } while (!*n);
+ 
+       return (unsigned long)n - (unsigned long)end_of_stack(p);
+ }
+ #endif
+ 
   /* set thread flags in other task's structures
    * - see asm/thread_info.h for TIF_xxxx flags available
    */
@@@ -2311,13 -2303,9 +2323,13 @@@ extern long sched_group_rt_runtime(stru
   extern int sched_group_set_rt_period(struct task_group *tg,
                                       long rt_period_us);
   extern long sched_group_rt_period(struct task_group *tg);
+ +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
   #endif
   #endif
   
+ +extern int task_can_switch_user(struct user_struct *up,
+ +                                      struct task_struct *tsk);
+ +
   #ifdef CONFIG_TASK_XACCT
   static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
   {
diff --combined init/Kconfig

index 68699137b1479044434a7a8f6e063bd2cb1b59d8,95a66131403a522651f6a0974a0c5538b8127945..14c483d2b7c90b4a5713b6ed116fa16260f1228d
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -101,6 -101,66 +101,66 @@@ config LOCALVERSION_AUT
   
           which is done within the script "scripts/setlocalversion".)
   
+ config HAVE_KERNEL_GZIP
+       bool
+ 
+ config HAVE_KERNEL_BZIP2
+       bool
+ 
+ config HAVE_KERNEL_LZMA
+       bool
+ 
+ choice
+       prompt "Kernel compression mode"
+       default KERNEL_GZIP
+       depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA
+       help
+         The linux kernel is a kind of self-extracting executable.
+         Several compression algorithms are available, which differ
+         in efficiency, compression and decompression speed.
+         Compression speed is only relevant when building a kernel.
+         Decompression speed is relevant at each boot.
+ 
+         If you have any problems with bzip2 or lzma compressed
+         kernels, mail me (Alain Knaff) <alain@knaff.lu>. (An older
+         version of this functionality (bzip2 only), for 2.4, was
+         supplied by Christian Ludwig)
+ 
+         High compression options are mostly useful for users, who
+         are low on disk space (embedded systems), but for whom ram
+         size matters less.
+ 
+         If in doubt, select 'gzip'
+ 
+ config KERNEL_GZIP
+       bool "Gzip"
+       depends on HAVE_KERNEL_GZIP
+       help
+         The old and tried gzip compression. Its compression ratio is
+         the poorest among the 3 choices; however its speed (both
+         compression and decompression) is the fastest.
+ 
+ config KERNEL_BZIP2
+       bool "Bzip2"
+       depends on HAVE_KERNEL_BZIP2
+       help
+         Its compression ratio and speed is intermediate.
+         Decompression speed is slowest among the three.  The kernel
+         size is about 10% smaller with bzip2, in comparison to gzip.
+         Bzip2 uses a large amount of memory. For modern kernels you
+         will need at least 8MB RAM or more for booting.
+ 
+ config KERNEL_LZMA
+       bool "LZMA"
+       depends on HAVE_KERNEL_LZMA
+       help
+         The most recent compression algorithm.
+         Its ratio is best, decompression speed is between the other
+         two. Compression is slowest.  The kernel size is about 33%
+         smaller with LZMA in comparison to gzip.
+ 
+ endchoice
+ 
   config SWAP
         bool "Support for paging of anonymous memory (swap)"
         depends on MMU && BLOCK
@@@ -675,9 -735,6 +735,9 @@@ config CC_OPTIMIZE_FOR_SIZ
   config SYSCTL
         bool
   
+ +config ANON_INODES
+ +      bool
+ +
   menuconfig EMBEDDED
         bool "Configure standard kernel features (for small systems)"
         help
@@@ -783,6 -840,18 +843,6 @@@ config PCSPKR_PLATFOR
             This option allows to disable the internal PC-Speaker
             support, saving some memory.
   
- -config COMPAT_BRK
- -      bool "Disable heap randomization"
- -      default y
- -      help
- -        Randomizing heap placement makes heap exploits harder, but it
- -        also breaks ancient binaries (including anything libc5 based).
- -        This option changes the bootup default to heap randomization
- -        disabled, and can be overriden runtime by setting
- -        /proc/sys/kernel/randomize_va_space to 2.
- -
- -        On non-ancient distros (post-2000 ones) N is usually a safe choice.
- -
   config BASE_FULL
         default y
         bool "Enable full-sized data structures for core" if EMBEDDED
@@@ -800,6 -869,9 +860,6 @@@ config FUTE
           support for "fast userspace mutexes".  The resulting kernel may not
           run glibc-based applications correctly.
   
- -config ANON_INODES
- -      bool
- -
   config EPOLL
         bool "Enable eventpoll support" if EMBEDDED
         default y
@@@ -885,18 -957,6 +945,18 @@@ config SLUB_DEBU
           SLUB sysfs support. /sys/slab will not exist and there will be
           no support for cache validation etc.
   
+ +config COMPAT_BRK
+ +      bool "Disable heap randomization"
+ +      default y
+ +      help
+ +        Randomizing heap placement makes heap exploits harder, but it
+ +        also breaks ancient binaries (including anything libc5 based).
+ +        This option changes the bootup default to heap randomization
+ +        disabled, and can be overridden at runtime by setting
+ +        /proc/sys/kernel/randomize_va_space to 2.
+ +
+ +        On non-ancient distros (post-2000 ones) N is usually a safe choice.
+ +
   choice
         prompt "Choose SLAB allocator"
         default SLUB
@@@ -966,6 -1026,7 +1026,6 @@@ config SLABINF
   
   config RT_MUTEXES
         boolean
- -      select PLIST
   
   config BASE_SMALL
         int
diff --combined init/main.c

index 83697e160b3a786d3d91c14000f116a441f61bfc,6441083f827355108980280d7c7dd08f1ec16f5e..6bf83afd654da44b5b52383a4697d4d98e0ff6fa
--- 1/init/main.c
--- 2/init/main.c
+++ b/init/main.c
@@@ -14,6 -14,7 +14,7 @@@
   #include <linux/proc_fs.h>
   #include <linux/kernel.h>
   #include <linux/syscalls.h>
+ #include <linux/stackprotector.h>
   #include <linux/string.h>
   #include <linux/ctype.h>
   #include <linux/delay.h>
@@@ -97,7 -98,7 +98,7 @@@ static inline void mark_rodata_ro(void
   extern void tc_init(void);
   #endif
   
- -enum system_states system_state;
+ +enum system_states system_state __read_mostly;
   EXPORT_SYMBOL(system_state);
   
   /*
@@@ -135,14 -136,14 +136,14 @@@ unsigned int __initdata setup_max_cpus 
    * greater than 0, limits the maximum number of CPUs activated in
    * SMP mode to <NUM>.
    */
- #ifndef CONFIG_X86_IO_APIC
- static inline void disable_ioapic_setup(void) {};
- #endif
+ 
+ void __weak arch_disable_smp_support(void) { }
   
   static int __init nosmp(char *str)
   {
         setup_max_cpus = 0;
-       disable_ioapic_setup();
+       arch_disable_smp_support();
+ 
         return 0;
   }
   
@@@ -152,14 -153,14 +153,14 @@@ static int __init maxcpus(char *str
   {
         get_option(&str, &setup_max_cpus);
         if (setup_max_cpus == 0)
-               disable_ioapic_setup();
+               arch_disable_smp_support();
   
         return 0;
   }
   
   early_param("maxcpus", maxcpus);
   #else
- #define setup_max_cpus NR_CPUS
+ const unsigned int setup_max_cpus = NR_CPUS;
   #endif
   
   /*
@@@ -463,7 -464,6 +464,7 @@@ static noinline void __init_refok rest_
          * at least once to get things moving:
          */
         init_idle_bootup_task(current);
+ +      rcu_scheduler_starting();
         preempt_enable_no_resched();
         schedule();
         preempt_disable();
@@@ -540,6 -540,12 +541,12 @@@ asmlinkage void __init start_kernel(voi
          */
         lockdep_init();
         debug_objects_early_init();
+ 
+       /*
+        * Set up the initial canary ASAP:
+        */
+       boot_init_stack_canary();
+ 
         cgroup_init_early();
   
         local_irq_disable();
diff --combined kernel/fork.c

index 4854c2c4a82eae9ed1032ee1b66abc46124499ad,8de303bdd4e51915c20c463c2ab0217e94b5b3ad..6715ebc3761de3ed10eeda84ff932e59ff8289c5
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -61,6 -61,7 +61,7 @@@
   #include <linux/proc_fs.h>
   #include <linux/blkdev.h>
   #include <trace/sched.h>
+ #include <linux/magic.h>
   
   #include <asm/pgtable.h>
   #include <asm/pgalloc.h>
@@@ -212,6 -213,8 +213,8 @@@ static struct task_struct *dup_task_str
   {
         struct task_struct *tsk;
         struct thread_info *ti;
+       unsigned long *stackend;
+ 
         int err;
   
         prepare_to_copy(orig);
@@@ -237,6 -240,8 +240,8 @@@
                 goto out;
   
         setup_thread_stack(tsk, orig);
+       stackend = end_of_stack(tsk);
+       *stackend = STACK_END_MAGIC;    /* for overflow detection */
   
   #ifdef CONFIG_CC_STACKPROTECTOR
         tsk->stack_canary = get_random_int();
@@@ -1179,6 -1184,10 +1184,6 @@@ static struct task_struct *copy_process
   #endif
         clear_all_latency_tracing(p);
   
- -      /* Our parent execution domain becomes current domain
- -         These must match for thread signalling to apply */
- -      p->parent_exec_id = p->self_exec_id;
- -
         /* ok, now we should be set up.. */
         p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
         p->pdeath_signal = 0;
@@@ -1216,13 -1225,10 +1221,13 @@@
                 set_task_cpu(p, smp_processor_id());
   
         /* CLONE_PARENT re-uses the old parent */
- -      if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+ +      if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                 p->real_parent = current->real_parent;
- -      else
+ +              p->parent_exec_id = current->parent_exec_id;
+ +      } else {
                 p->real_parent = current;
+ +              p->parent_exec_id = current->self_exec_id;
+ +      }
   
         spin_lock(&current->sighand->siglock);
   
diff --combined kernel/irq/chip.c

index 03d0bed2b8d925492a16e84951772330fccdcbd8,122fef4b0bd30d82ade4af7a84f281d019030280..c687ba4363f2b4a95a5c3988998286a1e8ab699b
--- 1/kernel/irq/chip.c
--- 2/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@@ -46,7 -46,10 +46,10 @@@ void dynamic_irq_init(unsigned int irq
         desc->irq_count = 0;
         desc->irqs_unhandled = 0;
   #ifdef CONFIG_SMP
-       cpumask_setall(&desc->affinity);
+       cpumask_setall(desc->affinity);
+ #ifdef CONFIG_GENERIC_PENDING_IRQ
+       cpumask_clear(desc->pending_mask);
+ #endif
   #endif
         spin_unlock_irqrestore(&desc->lock, flags);
   }
@@@ -78,7 -81,6 +81,7 @@@ void dynamic_irq_cleanup(unsigned int i
         desc->handle_irq = handle_bad_irq;
         desc->chip = &no_irq_chip;
         desc->name = NULL;
+ +      clear_kstat_irqs(desc);
         spin_unlock_irqrestore(&desc->lock, flags);
   }
   
@@@ -291,8 -293,7 +294,8 @@@ static inline void mask_ack_irq(struct 
                 desc->chip->mask_ack(irq);
         else {
                 desc->chip->mask(irq);
- -              desc->chip->ack(irq);
+ +              if (desc->chip->ack)
+ +                      desc->chip->ack(irq);
         }
   }
   
@@@ -478,8 -479,7 +481,8 @@@ handle_edge_irq(unsigned int irq, struc
         kstat_incr_irqs_this_cpu(irq, desc);
   
         /* Start handling the irq */
- -      desc->chip->ack(irq);
+ +      if (desc->chip->ack)
+ +              desc->chip->ack(irq);
         desc = irq_remap_to_desc(irq, desc);
   
         /* Mark the IRQ currently in progress.*/
diff --combined kernel/irq/handle.c

index f6cdda68e5c6c04b4c0da4626db67d05a78af3b0,f51eaee921b603b202bf184cdfdaee3a8da2ca08..9ebf77968871550a365713d7a29a6f131f231ee5
--- 1/kernel/irq/handle.c
--- 2/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@@ -17,6 -17,7 +17,7 @@@
   #include <linux/kernel_stat.h>
   #include <linux/rculist.h>
   #include <linux/hash.h>
+ #include <linux/bootmem.h>
   
   #include "internals.h"
   
@@@ -69,6 -70,7 +70,7 @@@ int nr_irqs = NR_IRQS
   EXPORT_SYMBOL_GPL(nr_irqs);
   
   #ifdef CONFIG_SPARSE_IRQ
+ 
   static struct irq_desc irq_desc_init = {
         .irq        = -1,
         .status     = IRQ_DISABLED,
@@@ -76,28 -78,23 +78,25 @@@
         .handle_irq = handle_bad_irq,
         .depth      = 1,
         .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
- #ifdef CONFIG_SMP
-       .affinity   = CPU_MASK_ALL
- #endif
   };
   
   void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
   {
- -      unsigned long bytes;
- -      char *ptr;
         int node;
- -
- -      /* Compute how many bytes we need per irq and allocate them */
- -      bytes = nr * sizeof(unsigned int);
+ +      void *ptr;
   
         node = cpu_to_node(cpu);
- -      ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
- -      printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+ +      ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
   
- -      if (ptr)
- -              desc->kstat_irqs = (unsigned int *)ptr;
+ +      /*
+ +       * don't overwrite if we cannot get a new one:
+ +       * init_copy_kstat_irqs() could still use the old one
+ +       */
+ +      if (ptr) {
+ +              printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n",
+ +                       cpu, node);
+ +              desc->kstat_irqs = ptr;
+ +      }
   }
   
   static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
@@@ -115,6 -112,10 +114,10 @@@
                 printk(KERN_ERR "can not alloc kstat_irqs\n");
                 BUG_ON(1);
         }
+       if (!init_alloc_desc_masks(desc, cpu, false)) {
+               printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+               BUG_ON(1);
+       }
         arch_init_chip_data(desc, cpu);
   }
   
@@@ -123,7 -124,7 +126,7 @@@
    */
   DEFINE_SPINLOCK(sparse_irq_lock);
   
- struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+ struct irq_desc **irq_desc_ptrs __read_mostly;
   
   static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
         [0 ... NR_IRQS_LEGACY-1] = {
@@@ -133,14 -134,10 +136,10 @@@
                 .handle_irq = handle_bad_irq,
                 .depth      = 1,
                 .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
- #ifdef CONFIG_SMP
-               .affinity   = CPU_MASK_ALL
- #endif
         }
   };
   
- /* FIXME: use bootmem alloc ...*/
- static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+ static unsigned int *kstat_irqs_legacy;
   
   int __init early_irq_init(void)
   {
@@@ -150,18 -147,30 +149,30 @@@
   
         init_irq_default_affinity();
   
+        /* initialize nr_irqs based on nr_cpu_ids */
+       arch_probe_nr_irqs();
+       printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
+ 
         desc = irq_desc_legacy;
         legacy_count = ARRAY_SIZE(irq_desc_legacy);
   
+       /* allocate irq_desc_ptrs array based on nr_irqs */
+       irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+ 
+       /* allocate based on nr_cpu_ids */
+       /* FIXME: invert kstat_irqs, and it'd be a per_cpu_alloc'd thing */
+       kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
+                                         sizeof(int));
+ 
         for (i = 0; i < legacy_count; i++) {
                 desc[i].irq = i;
-               desc[i].kstat_irqs = kstat_irqs_legacy[i];
+               desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
                 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
- 
+               init_alloc_desc_masks(&desc[i], 0, true);
                 irq_desc_ptrs[i] = desc + i;
         }
   
-       for (i = legacy_count; i < NR_IRQS; i++)
+       for (i = legacy_count; i < nr_irqs; i++)
                 irq_desc_ptrs[i] = NULL;
   
         return arch_early_irq_init();
@@@ -169,7 -178,10 +180,10 @@@
   
   struct irq_desc *irq_to_desc(unsigned int irq)
   {
-       return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+       if (irq_desc_ptrs && irq < nr_irqs)
+               return irq_desc_ptrs[irq];
+ 
+       return NULL;
   }
   
   struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@@ -178,10 -190,9 +192,9 @@@
         unsigned long flags;
         int node;
   
-       if (irq >= NR_IRQS) {
-               printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
-                               irq, NR_IRQS);
-               WARN_ON(1);
+       if (irq >= nr_irqs) {
+               WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
+                       irq, nr_irqs);
                 return NULL;
         }
   
@@@ -223,13 -234,9 +236,10 @@@ struct irq_desc irq_desc[NR_IRQS] __cac
                 .handle_irq = handle_bad_irq,
                 .depth = 1,
                 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
- #ifdef CONFIG_SMP
-               .affinity = CPU_MASK_ALL
- #endif
         }
   };
   
+ +static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
   int __init early_irq_init(void)
   {
         struct irq_desc *desc;
@@@ -238,14 -245,15 +248,16 @@@
   
         init_irq_default_affinity();
   
+       printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+ 
         desc = irq_desc;
         count = ARRAY_SIZE(irq_desc);
   
         for (i = 0; i < count; i++) {
                 desc[i].irq = i;
+               init_alloc_desc_masks(&desc[i], 0, true);
+ +              desc[i].kstat_irqs = kstat_irqs_all[i];
         }
- 
         return arch_early_irq_init();
   }
   
@@@ -260,11 -268,6 +272,11 @@@ struct irq_desc *irq_to_desc_alloc_cpu(
   }
   #endif /* !CONFIG_SPARSE_IRQ */
   
+ +void clear_kstat_irqs(struct irq_desc *desc)
+ +{
+ +      memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+ +}
+ +
   /*
    * What should we do if we get a hw irq event on an illegal vector?
    * Each architecture has to answer this themself.
@@@ -338,8 -341,6 +350,8 @@@ irqreturn_t handle_IRQ_event(unsigned i
         irqreturn_t ret, retval = IRQ_NONE;
         unsigned int status = 0;
   
+ +      WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
+ +
         if (!(action->flags & IRQF_DISABLED))
                 local_irq_enable_in_hardirq();
   
@@@ -359,11 -360,6 +371,11 @@@
   }
   
   #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+ +
+ +#ifdef CONFIG_ENABLE_WARN_DEPRECATED
+ +# warning __do_IRQ is deprecated. Please convert to proper flow handlers
+ +#endif
+ +
   /**
    * __do_IRQ - original all in one highlevel IRQ handler
    * @irq:      the interrupt number
@@@ -484,10 -480,12 +496,10 @@@ void early_init_irq_lock_class(void
         }
   }
   
- -#ifdef CONFIG_SPARSE_IRQ
   unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
   {
         struct irq_desc *desc = irq_to_desc(irq);
         return desc ? desc->kstat_irqs[cpu] : 0;
   }
- -#endif
   EXPORT_SYMBOL(kstat_irqs_cpu);
   
diff --combined kernel/irq/internals.h

index b60950bf5a16fb57286d95515bb0ceaa2022410a,40416a81a0f5af98b159bf675e3fd18e41648ce1..ee1aa9f8e8b9341af0c81ec9073f52e48375eb5b
--- 1/kernel/irq/internals.h
--- 2/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@@ -15,9 -15,15 +15,16 @@@ extern int __irq_set_trigger(struct irq
   
   extern struct lock_class_key irq_desc_lock_class;
   extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+ +extern void clear_kstat_irqs(struct irq_desc *desc);
   extern spinlock_t sparse_irq_lock;
+ 
+ #ifdef CONFIG_SPARSE_IRQ
+ /* irq_desc_ptrs allocated at boot time */
+ extern struct irq_desc **irq_desc_ptrs;
+ #else
+ /* irq_desc_ptrs is a fixed size array */
   extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+ #endif
   
   #ifdef CONFIG_PROC_FS
   extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
diff --combined kernel/irq/manage.c

index ea119effe096b47628c2b8061c2e76e8fcfb5d43,a3a5dc9ef346d813edf3b971926cb649d9c89d67..6458e99984c08f3af7a108ac0f7db03b21a7e5e8
--- 1/kernel/irq/manage.c
--- 2/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@@ -90,14 -90,14 +90,14 @@@ int irq_set_affinity(unsigned int irq, 
   
   #ifdef CONFIG_GENERIC_PENDING_IRQ
         if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-               cpumask_copy(&desc->affinity, cpumask);
+               cpumask_copy(desc->affinity, cpumask);
                 desc->chip->set_affinity(irq, cpumask);
         } else {
                 desc->status |= IRQ_MOVE_PENDING;
-               cpumask_copy(&desc->pending_mask, cpumask);
+               cpumask_copy(desc->pending_mask, cpumask);
         }
   #else
-       cpumask_copy(&desc->affinity, cpumask);
+       cpumask_copy(desc->affinity, cpumask);
         desc->chip->set_affinity(irq, cpumask);
   #endif
         desc->status |= IRQ_AFFINITY_SET;
@@@ -109,7 -109,7 +109,7 @@@
   /*
    * Generic version of the affinity autoselector.
    */
- -int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
+ +static int setup_affinity(unsigned int irq, struct irq_desc *desc)
   {
         if (!irq_can_set_affinity(irq))
                 return 0;
@@@ -119,21 -119,21 +119,21 @@@
          * one of the targets is online.
          */
         if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-               if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+               if (cpumask_any_and(desc->affinity, cpu_online_mask)
                     < nr_cpu_ids)
                         goto set_affinity;
                 else
                         desc->status &= ~IRQ_AFFINITY_SET;
         }
   
-       cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
+       cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
   set_affinity:
-       desc->chip->set_affinity(irq, &desc->affinity);
+       desc->chip->set_affinity(irq, desc->affinity);
   
         return 0;
   }
   #else
- -static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
+ +static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
   {
         return irq_select_affinity(irq);
   }
@@@ -149,14 -149,14 +149,14 @@@ int irq_select_affinity_usr(unsigned in
         int ret;
   
         spin_lock_irqsave(&desc->lock, flags);
- -      ret = do_irq_select_affinity(irq, desc);
+ +      ret = setup_affinity(irq, desc);
         spin_unlock_irqrestore(&desc->lock, flags);
   
         return ret;
   }
   
   #else
- -static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
+ +static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
   {
         return 0;
   }
@@@ -389,9 -389,9 +389,9 @@@ int __irq_set_trigger(struct irq_desc *
    * allocate special interrupts that are part of the architecture.
    */
   static int
- -__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
+ +__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
   {
- -      struct irqaction *old, **p;
+ +      struct irqaction *old, **old_ptr;
         const char *old_name = NULL;
         unsigned long flags;
         int shared = 0;
@@@ -423,8 -423,8 +423,8 @@@
          * The following block of code has to be executed atomically
          */
         spin_lock_irqsave(&desc->lock, flags);
- -      p = &desc->action;
- -      old = *p;
+ +      old_ptr = &desc->action;
+ +      old = *old_ptr;
         if (old) {
                 /*
                  * Can't share interrupts unless both agree to and are
@@@ -447,8 -447,8 +447,8 @@@
   
                 /* add new interrupt at end of irq queue */
                 do {
- -                      p = &old->next;
- -                      old = *p;
+ +                      old_ptr = &old->next;
+ +                      old = *old_ptr;
                 } while (old);
                 shared = 1;
         }
@@@ -488,7 -488,7 +488,7 @@@
                         desc->status |= IRQ_NO_BALANCING;
   
                 /* Set default affinity mask once everything is setup */
- -              do_irq_select_affinity(irq, desc);
+ +              setup_affinity(irq, desc);
   
         } else if ((new->flags & IRQF_TRIGGER_MASK)
                         && (new->flags & IRQF_TRIGGER_MASK)
@@@ -499,7 -499,7 +499,7 @@@
                                 (int)(new->flags & IRQF_TRIGGER_MASK));
         }
   
- -      *p = new;
+ +      *old_ptr = new;
   
         /* Reset broken irq detection when installing new handler */
         desc->irq_count = 0;
@@@ -549,117 -549,90 +549,117 @@@ int setup_irq(unsigned int irq, struct 
   
         return __setup_irq(irq, desc, act);
   }
+ +EXPORT_SYMBOL_GPL(setup_irq);
   
- -/**
- - *    free_irq - free an interrupt
- - *    @irq: Interrupt line to free
- - *    @dev_id: Device identity to free
- - *
- - *    Remove an interrupt handler. The handler is removed and if the
- - *    interrupt line is no longer in use by any driver it is disabled.
- - *    On a shared IRQ the caller must ensure the interrupt is disabled
- - *    on the card it drives before calling this function. The function
- - *    does not return until any executing interrupts for this IRQ
- - *    have completed.
- - *
- - *    This function must not be called from interrupt context.
+ + /*
+ + * Internal function to unregister an irqaction - used to free
+ + * regular and special interrupts that are part of the architecture.
    */
- -void free_irq(unsigned int irq, void *dev_id)
+ +static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
   {
         struct irq_desc *desc = irq_to_desc(irq);
- -      struct irqaction **p;
+ +      struct irqaction *action, **action_ptr;
         unsigned long flags;
   
- -      WARN_ON(in_interrupt());
+ +      WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
   
         if (!desc)
- -              return;
+ +              return NULL;
   
         spin_lock_irqsave(&desc->lock, flags);
- -      p = &desc->action;
+ +
+ +      /*
+ +       * There can be multiple actions per IRQ descriptor, find the right
+ +       * one based on the dev_id:
+ +       */
+ +      action_ptr = &desc->action;
         for (;;) {
- -              struct irqaction *action = *p;
+ +              action = *action_ptr;
   
- -              if (action) {
- -                      struct irqaction **pp = p;
+ +              if (!action) {
+ +                      WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ +                      spin_unlock_irqrestore(&desc->lock, flags);
   
- -                      p = &action->next;
- -                      if (action->dev_id != dev_id)
- -                              continue;
+ +                      return NULL;
+ +              }
   
- -                      /* Found it - now remove it from the list of entries */
- -                      *pp = action->next;
+ +              if (action->dev_id == dev_id)
+ +                      break;
+ +              action_ptr = &action->next;
+ +      }
   
- -                      /* Currently used only by UML, might disappear one day.*/
+ +      /* Found it - now remove it from the list of entries: */
+ +      *action_ptr = action->next;
+ +
+ +      /* Currently used only by UML, might disappear one day: */
   #ifdef CONFIG_IRQ_RELEASE_METHOD
- -                      if (desc->chip->release)
- -                              desc->chip->release(irq, dev_id);
+ +      if (desc->chip->release)
+ +              desc->chip->release(irq, dev_id);
   #endif
   
- -                      if (!desc->action) {
- -                              desc->status |= IRQ_DISABLED;
- -                              if (desc->chip->shutdown)
- -                                      desc->chip->shutdown(irq);
- -                              else
- -                                      desc->chip->disable(irq);
- -                      }
- -                      spin_unlock_irqrestore(&desc->lock, flags);
- -                      unregister_handler_proc(irq, action);
+ +      /* If this was the last handler, shut down the IRQ line: */
+ +      if (!desc->action) {
+ +              desc->status |= IRQ_DISABLED;
+ +              if (desc->chip->shutdown)
+ +                      desc->chip->shutdown(irq);
+ +              else
+ +                      desc->chip->disable(irq);
+ +      }
+ +      spin_unlock_irqrestore(&desc->lock, flags);
+ +
+ +      unregister_handler_proc(irq, action);
+ +
+ +      /* Make sure it's not being used on another CPU: */
+ +      synchronize_irq(irq);
   
- -                      /* Make sure it's not being used on another CPU */
- -                      synchronize_irq(irq);
- -#ifdef CONFIG_DEBUG_SHIRQ
- -                      /*
- -                       * It's a shared IRQ -- the driver ought to be
- -                       * prepared for it to happen even now it's
- -                       * being freed, so let's make sure....  We do
- -                       * this after actually deregistering it, to
- -                       * make sure that a 'real' IRQ doesn't run in
- -                       * parallel with our fake
- -                       */
- -                      if (action->flags & IRQF_SHARED) {
- -                              local_irq_save(flags);
- -                              action->handler(irq, dev_id);
- -                              local_irq_restore(flags);
- -                      }
- -#endif
- -                      kfree(action);
- -                      return;
- -              }
- -              printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
   #ifdef CONFIG_DEBUG_SHIRQ
- -              dump_stack();
- -#endif
- -              spin_unlock_irqrestore(&desc->lock, flags);
- -              return;
+ +      /*
+ +       * It's a shared IRQ -- the driver ought to be prepared for an IRQ
+ +       * event to happen even now it's being freed, so let's make sure that
+ +       * is so by doing an extra call to the handler ....
+ +       *
+ +       * ( We do this after actually deregistering it, to make sure that a
+ +       *   'real' IRQ doesn't run in parallel with our fake. )
+ +       */
+ +      if (action->flags & IRQF_SHARED) {
+ +              local_irq_save(flags);
+ +              action->handler(irq, dev_id);
+ +              local_irq_restore(flags);
         }
+ +#endif
+ +      return action;
+ +}
+ +
+ +/**
+ + *    remove_irq - free an interrupt
+ + *    @irq: Interrupt line to free
+ + *    @act: irqaction for the interrupt
+ + *
+ + * Used to remove interrupts statically setup by the early boot process.
+ + */
+ +void remove_irq(unsigned int irq, struct irqaction *act)
+ +{
+ +      __free_irq(irq, act->dev_id);
+ +}
+ +EXPORT_SYMBOL_GPL(remove_irq);
+ +
+ +/**
+ + *    free_irq - free an interrupt allocated with request_irq
+ + *    @irq: Interrupt line to free
+ + *    @dev_id: Device identity to free
+ + *
+ + *    Remove an interrupt handler. The handler is removed and if the
+ + *    interrupt line is no longer in use by any driver it is disabled.
+ + *    On a shared IRQ the caller must ensure the interrupt is disabled
+ + *    on the card it drives before calling this function. The function
+ + *    does not return until any executing interrupts for this IRQ
+ + *    have completed.
+ + *
+ + *    This function must not be called from interrupt context.
+ + */
+ +void free_irq(unsigned int irq, void *dev_id)
+ +{
+ +      kfree(__free_irq(irq, dev_id));
   }
   EXPORT_SYMBOL(free_irq);
   
@@@ -706,12 -679,11 +706,12 @@@ int request_irq(unsigned int irq, irq_h
          * the behavior is classified as "will not fix" so we need to
          * start nudging drivers away from using that idiom.
          */
- -      if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
- -                      == (IRQF_SHARED|IRQF_DISABLED))
- -              pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
- -                              "guaranteed on shared IRQs\n",
- -                              irq, devname);
+ +      if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
+ +                                      (IRQF_SHARED|IRQF_DISABLED)) {
+ +              pr_warning(
+ +                "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
+ +                      irq, devname);
+ +      }
   
   #ifdef CONFIG_LOCKDEP
         /*
@@@ -737,13 -709,15 +737,13 @@@
         if (!handler)
                 return -EINVAL;
   
- -      action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+ +      action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
         if (!action)
                 return -ENOMEM;
   
         action->handler = handler;
         action->flags = irqflags;
- -      cpus_clear(action->mask);
         action->name = devname;
- -      action->next = NULL;
         action->dev_id = dev_id;
   
         retval = __setup_irq(irq, desc, action);
diff --combined kernel/irq/numa_migrate.c

index aef18ab6b75bf1954f23ccff4a52a948d62b74d2,7f9b80434e32a295b463bc7aaa97457207d56686..243d6121e50e08c1b54972fd3bb61c827a8c3dfe
--- 1/kernel/irq/numa_migrate.c
--- 2/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@@ -17,11 -17,16 +17,11 @@@ static void init_copy_kstat_irqs(struc
                                  struct irq_desc *desc,
                                  int cpu, int nr)
   {
- -      unsigned long bytes;
- -
         init_kstat_irqs(desc, cpu, nr);
   
- -      if (desc->kstat_irqs != old_desc->kstat_irqs) {
- -              /* Compute how many bytes we need per irq and allocate them */
- -              bytes = nr * sizeof(unsigned int);
- -
- -              memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
- -      }
+ +      if (desc->kstat_irqs != old_desc->kstat_irqs)
+ +              memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
+ +                       nr * sizeof(*desc->kstat_irqs));
   }
   
   static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
@@@ -33,15 -38,22 +33,22 @@@
         old_desc->kstat_irqs = NULL;
   }
   
- static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
                  struct irq_desc *desc, int cpu)
   {
         memcpy(desc, old_desc, sizeof(struct irq_desc));
+       if (!init_alloc_desc_masks(desc, cpu, false)) {
+               printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+                               "for migration.\n", irq);
+               return false;
+       }
         spin_lock_init(&desc->lock);
         desc->cpu = cpu;
         lockdep_set_class(&desc->lock, &irq_desc_lock_class);
         init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+       init_copy_desc_masks(old_desc, desc);
         arch_init_copy_chip_data(old_desc, desc, cpu);
+       return true;
   }
   
   static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@@ -71,12 -83,18 +78,18 @@@ static struct irq_desc *__real_move_irq
         node = cpu_to_node(cpu);
         desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
         if (!desc) {
-               printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
+               printk(KERN_ERR "irq %d: can not get new irq_desc "
+                               "for migration.\n", irq);
+               /* still use old one */
+               desc = old_desc;
+               goto out_unlock;
+       }
+       if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
                 /* still use old one */
+               kfree(desc);
                 desc = old_desc;
                 goto out_unlock;
         }
-       init_copy_one_irq_desc(irq, old_desc, desc, cpu);
   
         irq_desc_ptrs[irq] = desc;
         spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --combined kernel/module.c

index 77672233387ffc095b746435a90cb0689cabcbf6,f0e04d6b67d8cb325526dce1fdb456265ba3f926..f77ac320d0b51d021b52ba4c48dba5680e7c4d01
--- 1/kernel/module.c
--- 2/kernel/module.c
+++ b/kernel/module.c
@@@ -51,6 -51,7 +51,7 @@@
   #include <linux/tracepoint.h>
   #include <linux/ftrace.h>
   #include <linux/async.h>
+ #include <linux/percpu.h>
   
   #if 0
   #define DEBUGP printk
@@@ -366,6 -367,34 +367,34 @@@ static struct module *find_module(cons
   }
   
   #ifdef CONFIG_SMP
+ 
+ #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ 
+ static void *percpu_modalloc(unsigned long size, unsigned long align,
+                            const char *name)
+ {
+       void *ptr;
+ 
+       if (align > PAGE_SIZE) {
+               printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+                      name, align, PAGE_SIZE);
+               align = PAGE_SIZE;
+       }
+ 
+       ptr = __alloc_reserved_percpu(size, align);
+       if (!ptr)
+               printk(KERN_WARNING
+                      "Could not allocate %lu bytes percpu data\n", size);
+       return ptr;
+ }
+ 
+ static void percpu_modfree(void *freeme)
+ {
+       free_percpu(freeme);
+ }
+ 
+ #else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+ 
   /* Number of blocks used and allocated. */
   static unsigned int pcpu_num_used, pcpu_num_allocated;
   /* Size of each block.  -ve means used. */
@@@ -480,21 -509,6 +509,6 @@@ static void percpu_modfree(void *freeme
         }
   }
   
- static unsigned int find_pcpusec(Elf_Ehdr *hdr,
-                                Elf_Shdr *sechdrs,
-                                const char *secstrings)
- {
-       return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
- }
- 
- static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
- {
-       int cpu;
- 
-       for_each_possible_cpu(cpu)
-               memcpy(pcpudest + per_cpu_offset(cpu), from, size);
- }
- 
   static int percpu_modinit(void)
   {
         pcpu_num_used = 2;
@@@ -513,7 -527,26 +527,26 @@@
         return 0;
   }
   __initcall(percpu_modinit);
+ 
+ #endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+ 
+ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+                                Elf_Shdr *sechdrs,
+                                const char *secstrings)
+ {
+       return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+ }
+ 
+ static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+ {
+       int cpu;
+ 
+       for_each_possible_cpu(cpu)
+               memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+ }
+ 
   #else /* ... !CONFIG_SMP */
+ 
   static inline void *percpu_modalloc(unsigned long size, unsigned long align,
                                     const char *name)
   {
@@@ -535,6 -568,7 +568,7 @@@ static inline void percpu_modcopy(void 
         /* pcpusec should be 0, and size of that section should be 0. */
         BUG_ON(size != 0);
   }
+ 
   #endif /* CONFIG_SMP */
   
   #define MODINFO_ATTR(field)   \
@@@ -822,7 -856,7 +856,7 @@@ SYSCALL_DEFINE2(delete_module, const ch
         mutex_lock(&module_mutex);
         /* Store the name of the last unloaded module for diagnostic purposes */
         strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
- -      unregister_dynamic_debug_module(mod->name);
+ +      ddebug_remove_module(mod->name);
         free_module(mod);
   
    out:
@@@ -1827,13 -1861,19 +1861,13 @@@ static inline void add_kallsyms(struct 
   }
   #endif /* CONFIG_KALLSYMS */
   
- -static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
+ +static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
   {
- -#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
- -      unsigned int i;
- -
- -      for (i = 0; i < num; i++) {
- -              register_dynamic_debug_module(debug[i].modname,
- -                                            debug[i].type,
- -                                            debug[i].logical_modname,
- -                                            debug[i].flag_names,
- -                                            debug[i].hash, debug[i].hash2);
- -      }
- -#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+ +#ifdef CONFIG_DYNAMIC_DEBUG
+ +      if (ddebug_add_module(debug, num, debug->modname))
+ +              printk(KERN_ERR "dynamic debug error adding module: %s\n",
+ +                                      debug->modname);
+ +#endif
   }
   
   static void *module_alloc_update_bounds(unsigned long size)
@@@ -2009,6 -2049,14 +2043,6 @@@ static noinline struct module *load_mod
         if (err < 0)
                 goto free_mod;
   
- -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
- -      mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
- -                                    mod->name);
- -      if (!mod->refptr) {
- -              err = -ENOMEM;
- -              goto free_mod;
- -      }
- -#endif
         if (pcpuindex) {
                 /* We have a special allocation for this section. */
                 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@@ -2016,7 -2064,7 +2050,7 @@@
                                          mod->name);
                 if (!percpu) {
                         err = -ENOMEM;
- -                      goto free_percpu;
+ +                      goto free_mod;
                 }
                 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
                 mod->percpu = percpu;
@@@ -2068,14 -2116,6 +2102,14 @@@
         /* Module has been moved. */
         mod = (void *)sechdrs[modindex].sh_addr;
   
+ +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ +      mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+ +                                    mod->name);
+ +      if (!mod->refptr) {
+ +              err = -ENOMEM;
+ +              goto free_init;
+ +      }
+ +#endif
         /* Now we've moved module, initialize linked lists, etc. */
         module_unload_init(mod);
   
@@@ -2207,13 -2247,12 +2241,13 @@@
         add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
   
         if (!mod->taints) {
- -              struct mod_debug *debug;
+ +              struct _ddebug *debug;
                 unsigned int num_debug;
   
                 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
                                      sizeof(*debug), &num_debug);
- -              dynamic_printk_setup(debug, num_debug);
+ +              if (debug)
+ +                      dynamic_debug_setup(debug, num_debug);
         }
   
         /* sechdrs[0].sh_size is always zero */
@@@ -2283,17 -2322,15 +2317,17 @@@
         ftrace_release(mod->module_core, mod->core_size);
    free_unload:
         module_unload_free(mod);
+ + free_init:
+ +#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ +      percpu_modfree(mod->refptr);
+ +#endif
         module_free(mod, mod->module_init);
    free_core:
         module_free(mod, mod->module_core);
+ +      /* mod will be freed with core. Don't access it beyond this line! */
    free_percpu:
         if (percpu)
                 percpu_modfree(percpu);
- -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
- -      percpu_modfree(mod->refptr);
- -#endif
    free_mod:
         kfree(args);
    free_hdr:
diff --combined kernel/sched.c

index 9f8506d68fdc1dcb54c48ef27530ca5602dbe1d6,0e5c38e1c8b5cdad3e2ab022fa8db8f79b88316c..f4c413bdd38d790e10957e970edc02657c245670
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -223,7 -223,7 +223,7 @@@ static void start_rt_bandwidth(struct r
   {
         ktime_t now;
   
- -      if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+ +      if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                 return;
   
         if (hrtimer_active(&rt_b->rt_period_timer))
@@@ -331,13 -331,6 +331,13 @@@ static DEFINE_PER_CPU(struct rt_rq, ini
    */
   static DEFINE_SPINLOCK(task_group_lock);
   
+ +#ifdef CONFIG_SMP
+ +static int root_task_group_empty(void)
+ +{
+ +      return list_empty(&root_task_group.children);
+ +}
+ +#endif
+ +
   #ifdef CONFIG_FAIR_GROUP_SCHED
   #ifdef CONFIG_USER_SCHED
   # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@@ -398,13 -391,6 +398,13 @@@ static inline void set_task_rq(struct t
   
   #else
   
+ +#ifdef CONFIG_SMP
+ +static int root_task_group_empty(void)
+ +{
+ +      return 1;
+ +}
+ +#endif
+ +
   static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
   static inline struct task_group *task_group(struct task_struct *p)
   {
@@@ -481,17 -467,11 +481,17 @@@ struct rt_rq 
         struct rt_prio_array active;
         unsigned long rt_nr_running;
   #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- -      int highest_prio; /* highest queued rt task prio */
+ +      struct {
+ +              int curr; /* highest queued rt task prio */
+ +#ifdef CONFIG_SMP
+ +              int next; /* next highest */
+ +#endif
+ +      } highest_prio;
   #endif
   #ifdef CONFIG_SMP
         unsigned long rt_nr_migratory;
         int overloaded;
+ +      struct plist_head pushable_tasks;
   #endif
         int rt_throttled;
         u64 rt_time;
@@@ -569,6 -549,7 +569,6 @@@ struct rq 
         unsigned long nr_running;
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
- -      unsigned char idle_at_tick;
   #ifdef CONFIG_NO_HZ
         unsigned long last_tick_seen;
         unsigned char in_nohz_recently;
@@@ -609,7 -590,6 +609,7 @@@
         struct root_domain *rd;
         struct sched_domain *sd;
   
+ +      unsigned char idle_at_tick;
         /* For active balancing */
         int active_balance;
         int push_cpu;
@@@ -638,6 -618,9 +638,6 @@@
         /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
   
         /* sys_sched_yield() stats */
- -      unsigned int yld_exp_empty;
- -      unsigned int yld_act_empty;
- -      unsigned int yld_both_empty;
         unsigned int yld_count;
   
         /* schedule() stats */
@@@ -1200,10 -1183,10 +1200,10 @@@ static void resched_task(struct task_st
   
         assert_spin_locked(&task_rq(p)->lock);
   
- -      if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ +      if (test_tsk_need_resched(p))
                 return;
   
- -      set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+ +      set_tsk_need_resched(p);
   
         cpu = task_cpu(p);
         if (cpu == smp_processor_id())
@@@ -1259,7 -1242,7 +1259,7 @@@ void wake_up_idle_cpu(int cpu
          * lockless. The worst case is that the other CPU runs the
          * idle task through an additional NOOP schedule()
          */
- -      set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+ +      set_tsk_need_resched(rq->idle);
   
         /* NEED_RESCHED must be visible before we test polling */
         smp_mb();
@@@ -1627,42 -1610,21 +1627,42 @@@ static inline void update_shares_locked
   
   #endif
   
+ +#ifdef CONFIG_PREEMPT
+ +
   /*
- - * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ + * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ + * way at the expense of forcing extra atomic operations in all
+ + * invocations.  This assures that the double_lock is acquired using the
+ + * same underlying policy as the spinlock_t on this architecture, which
+ + * reduces latency compared to the unfair variant below.  However, it
+ + * also adds more overhead and therefore may reduce throughput.
    */
- -static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ +      __releases(this_rq->lock)
+ +      __acquires(busiest->lock)
+ +      __acquires(this_rq->lock)
+ +{
+ +      spin_unlock(&this_rq->lock);
+ +      double_rq_lock(this_rq, busiest);
+ +
+ +      return 1;
+ +}
+ +
+ +#else
+ +/*
+ + * Unfair double_lock_balance: Optimizes throughput at the expense of
+ + * latency by eliminating extra atomic operations when the locks are
+ + * already in proper order on entry.  This favors lower cpu-ids and will
+ + * grant the double lock to lower cpus over higher ids under contention,
+ + * regardless of entry order into the function.
+ + */
+ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(this_rq->lock)
         __acquires(busiest->lock)
         __acquires(this_rq->lock)
   {
         int ret = 0;
   
- -      if (unlikely(!irqs_disabled())) {
- -              /* printk() doesn't work good under rq->lock */
- -              spin_unlock(&this_rq->lock);
- -              BUG_ON(1);
- -      }
         if (unlikely(!spin_trylock(&busiest->lock))) {
                 if (busiest < this_rq) {
                         spin_unlock(&this_rq->lock);
@@@ -1675,22 -1637,6 +1675,22 @@@
         return ret;
   }
   
+ +#endif /* CONFIG_PREEMPT */
+ +
+ +/*
+ + * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ + */
+ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ +{
+ +      if (unlikely(!irqs_disabled())) {
+ +              /* printk() doesn't work well under rq->lock */
+ +              spin_unlock(&this_rq->lock);
+ +              BUG_ON(1);
+ +      }
+ +
+ +      return _double_lock_balance(this_rq, busiest);
+ +}
+ +
   static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(busiest->lock)
   {
@@@ -1759,9 -1705,6 +1759,9 @@@ static void update_avg(u64 *avg, u64 sa
   
   static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
   {
+ +      if (wakeup)
+ +              p->se.start_runtime = p->se.sum_exec_runtime;
+ +
         sched_info_queued(p);
         p->sched_class->enqueue_task(rq, p, wakeup);
         p->se.on_rq = 1;
@@@ -1769,15 -1712,10 +1769,15 @@@
   
   static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
   {
- -      if (sleep && p->se.last_wakeup) {
- -              update_avg(&p->se.avg_overlap,
- -                         p->se.sum_exec_runtime - p->se.last_wakeup);
- -              p->se.last_wakeup = 0;
+ +      if (sleep) {
+ +              if (p->se.last_wakeup) {
+ +                      update_avg(&p->se.avg_overlap,
+ +                              p->se.sum_exec_runtime - p->se.last_wakeup);
+ +                      p->se.last_wakeup = 0;
+ +              } else {
+ +                      update_avg(&p->se.avg_wakeup,
+ +                              sysctl_sched_wakeup_granularity);
+ +              }
         }
   
         sched_info_dequeued(p);
@@@ -2079,7 -2017,7 +2079,7 @@@ unsigned long wait_task_inactive(struc
                  * it must be off the runqueue _entirely_, and not
                  * preempted!
                  *
- -               * So if it wa still runnable (but just not actively
+ +               * So if it was still runnable (but just not actively
                  * running right now), it's preempted, and we should
                  * yield - it could be a while.
                  */
@@@ -2329,7 -2267,7 +2329,7 @@@ static int try_to_wake_up(struct task_s
                 sync = 0;
   
   #ifdef CONFIG_SMP
- -      if (sched_feat(LB_WAKEUP_UPDATE)) {
+ +      if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                 struct sched_domain *sd;
   
                 this_cpu = raw_smp_processor_id();
@@@ -2407,22 -2345,6 +2407,22 @@@ out_activate
         activate_task(rq, p, 1);
         success = 1;
   
+ +      /*
+ +       * Only attribute actual wakeups done by this task.
+ +       */
+ +      if (!in_interrupt()) {
+ +              struct sched_entity *se = &current->se;
+ +              u64 sample = se->sum_exec_runtime;
+ +
+ +              if (se->last_wakeup)
+ +                      sample -= se->last_wakeup;
+ +              else
+ +                      sample -= se->start_runtime;
+ +              update_avg(&se->avg_wakeup, sample);
+ +
+ +              se->last_wakeup = se->sum_exec_runtime;
+ +      }
+ +
   out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
@@@ -2433,6 -2355,8 +2433,6 @@@
                 p->sched_class->task_wake_up(rq, p);
   #endif
   out:
- -      current->se.last_wakeup = current->se.sum_exec_runtime;
- -
         task_rq_unlock(rq, &flags);
   
         return success;
@@@ -2462,8 -2386,6 +2462,8 @@@ static void __sched_fork(struct task_st
         p->se.prev_sum_exec_runtime     = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
+ +      p->se.start_runtime             = 0;
+ +      p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
   
   #ifdef CONFIG_SCHEDSTATS
         p->se.wait_start                = 0;
@@@ -2526,8 -2448,6 +2526,8 @@@ void sched_fork(struct task_struct *p, 
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
   #endif
+ +      plist_node_init(&p->pushable_tasks, MAX_PRIO);
+ +
         put_cpu();
   }
   
@@@ -2571,7 -2491,7 +2571,7 @@@ void wake_up_new_task(struct task_struc
   #ifdef CONFIG_PREEMPT_NOTIFIERS
   
   /**
- - * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ + * preempt_notifier_register - tell me when current is being preempted & rescheduled
    * @notifier: notifier struct to register
    */
   void preempt_notifier_register(struct preempt_notifier *notifier)
@@@ -2668,12 -2588,6 +2668,12 @@@ static void finish_task_switch(struct r
   {
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
+ +#ifdef CONFIG_SMP
+ +      int post_schedule = 0;
+ +
+ +      if (current->sched_class->needs_post_schedule)
+ +              post_schedule = current->sched_class->needs_post_schedule(rq);
+ +#endif
   
         rq->prev_mm = NULL;
   
@@@ -2692,7 -2606,7 +2692,7 @@@
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
   #ifdef CONFIG_SMP
- -      if (current->sched_class->post_schedule)
+ +      if (post_schedule)
                 current->sched_class->post_schedule(rq);
   #endif
   
@@@ -2999,7 -2913,6 +2999,7 @@@ int can_migrate_task(struct task_struc
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned)
   {
+ +      int tsk_cache_hot = 0;
         /*
          * We do not migrate tasks that are:
          * 1) running (obviously), or
@@@ -3023,11 -2936,10 +3023,11 @@@
          * 2) too many balance attempts have failed.
          */
   
- -      if (!task_hot(p, rq->clock, sd) ||
- -                      sd->nr_balance_failed > sd->cache_nice_tries) {
+ +      tsk_cache_hot = task_hot(p, rq->clock, sd);
+ +      if (!tsk_cache_hot ||
+ +              sd->nr_balance_failed > sd->cache_nice_tries) {
   #ifdef CONFIG_SCHEDSTATS
- -              if (task_hot(p, rq->clock, sd)) {
+ +              if (tsk_cache_hot) {
                         schedstat_inc(sd, lb_hot_gained[idle]);
                         schedstat_inc(p, se.nr_forced_migrations);
                 }
@@@ -3035,7 -2947,7 +3035,7 @@@
                 return 1;
         }
   
- -      if (task_hot(p, rq->clock, sd)) {
+ +      if (tsk_cache_hot) {
                 schedstat_inc(p, se.nr_failed_migrations_hot);
                 return 0;
         }
@@@ -3075,16 -2987,6 +3075,16 @@@ next
         pulled++;
         rem_load_move -= p->se.load.weight;
   
+ +#ifdef CONFIG_PREEMPT
+ +      /*
+ +       * NEWIDLE balancing is a source of latency, so preemptible kernels
+ +       * will stop after the first task is pulled to minimize the critical
+ +       * section.
+ +       */
+ +      if (idle == CPU_NEWLY_IDLE)
+ +              goto out;
+ +#endif
+ +
         /*
          * We only want to steal up to the prescribed amount of weighted load.
          */
@@@ -3131,15 -3033,9 +3131,15 @@@ static int move_tasks(struct rq *this_r
                                 sd, idle, all_pinned, &this_best_prio);
                 class = class->next;
   
+ +#ifdef CONFIG_PREEMPT
+ +              /*
+ +               * NEWIDLE balancing is a source of latency, so preemptible
+ +               * kernels will stop after the first task is pulled to minimize
+ +               * the critical section.
+ +               */
                 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                         break;
- -
+ +#endif
         } while (class && max_load_move > total_load_moved);
   
         return total_load_moved > 0;
@@@ -3189,479 -3085,246 +3189,479 @@@ static int move_one_task(struct rq *thi
   
         return 0;
   }
+ +/********** Helpers for find_busiest_group ************************/
+ +/**
+ + * sd_lb_stats - Structure to store the statistics of a sched_domain
+ + *            during load balancing.
+ + */
+ +struct sd_lb_stats {
+ +      struct sched_group *busiest; /* Busiest group in this sd */
+ +      struct sched_group *this;  /* Local group in this sd */
+ +      unsigned long total_load;  /* Total load of all groups in sd */
+ +      unsigned long total_pwr;   /*   Total power of all groups in sd */
+ +      unsigned long avg_load;    /* Average load across all groups in sd */
+ +
+ +      /* Statistics of this group */
+ +      unsigned long this_load;
+ +      unsigned long this_load_per_task;
+ +      unsigned long this_nr_running;
+ +
+ +      /* Statistics of the busiest group */
+ +      unsigned long max_load;
+ +      unsigned long busiest_load_per_task;
+ +      unsigned long busiest_nr_running;
+ +
+ +      int group_imb; /* Is there imbalance in this sd */
+ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ +      int power_savings_balance; /* Is powersave balance needed for this sd */
+ +      struct sched_group *group_min; /* Least loaded group in sd */
+ +      struct sched_group *group_leader; /* Group which relieves group_min */
+ +      unsigned long min_load_per_task; /* load_per_task in group_min */
+ +      unsigned long leader_nr_running; /* Nr running of group_leader */
+ +      unsigned long min_nr_running; /* Nr running of group_min */
+ +#endif
+ +};
   
- -/*
- - * find_busiest_group finds and returns the busiest CPU group within the
- - * domain. It calculates and returns the amount of weighted load which
- - * should be moved to restore balance via the imbalance parameter.
+ +/**
+ + * sg_lb_stats - stats of a sched_group required for load_balancing
+ + */
+ +struct sg_lb_stats {
+ +      unsigned long avg_load; /*Avg load across the CPUs of the group */
+ +      unsigned long group_load; /* Total load over the CPUs of the group */
+ +      unsigned long sum_nr_running; /* Nr tasks running in the group */
+ +      unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ +      unsigned long group_capacity;
+ +      int group_imb; /* Is there an imbalance in the group ? */
+ +};
+ +
+ +/**
+ + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ + * @group: The group whose first cpu is to be returned.
    */
- -static struct sched_group *
- -find_busiest_group(struct sched_domain *sd, int this_cpu,
- -                 unsigned long *imbalance, enum cpu_idle_type idle,
- -                 int *sd_idle, const struct cpumask *cpus, int *balance)
+ +static inline unsigned int group_first_cpu(struct sched_group *group)
   {
- -      struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
- -      unsigned long max_load, avg_load, total_load, this_load, total_pwr;
- -      unsigned long max_pull;
- -      unsigned long busiest_load_per_task, busiest_nr_running;
- -      unsigned long this_load_per_task, this_nr_running;
- -      int load_idx, group_imb = 0;
- -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- -      int power_savings_balance = 1;
- -      unsigned long leader_nr_running = 0, min_load_per_task = 0;
- -      unsigned long min_nr_running = ULONG_MAX;
- -      struct sched_group *group_min = NULL, *group_leader = NULL;
- -#endif
+ +      return cpumask_first(sched_group_cpus(group));
+ +}
   
- -      max_load = this_load = total_load = total_pwr = 0;
- -      busiest_load_per_task = busiest_nr_running = 0;
- -      this_load_per_task = this_nr_running = 0;
+ +/**
+ + * get_sd_load_idx - Obtain the load index for a given sched domain.
+ + * @sd: The sched_domain whose load_idx is to be obtained.
+ + * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
+ + */
+ +static inline int get_sd_load_idx(struct sched_domain *sd,
+ +                                      enum cpu_idle_type idle)
+ +{
+ +      int load_idx;
   
- -      if (idle == CPU_NOT_IDLE)
+ +      switch (idle) {
+ +      case CPU_NOT_IDLE:
                 load_idx = sd->busy_idx;
- -      else if (idle == CPU_NEWLY_IDLE)
+ +              break;
+ +
+ +      case CPU_NEWLY_IDLE:
                 load_idx = sd->newidle_idx;
- -      else
+ +              break;
+ +      default:
                 load_idx = sd->idle_idx;
+ +              break;
+ +      }
   
- -      do {
- -              unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
- -              int local_group;
- -              int i;
- -              int __group_imb = 0;
- -              unsigned int balance_cpu = -1, first_idle_cpu = 0;
- -              unsigned long sum_nr_running, sum_weighted_load;
- -              unsigned long sum_avg_load_per_task;
- -              unsigned long avg_load_per_task;
+ +      return load_idx;
+ +}
   
- -              local_group = cpumask_test_cpu(this_cpu,
- -                                             sched_group_cpus(group));
   
- -              if (local_group)
- -                      balance_cpu = cpumask_first(sched_group_cpus(group));
+ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ +/**
+ + * init_sd_power_savings_stats - Initialize power savings statistics for
+ + * the given sched_domain, during load balancing.
+ + *
+ + * @sd: Sched domain whose power-savings statistics are to be initialized.
+ + * @sds: Variable containing the statistics for sd.
+ + * @idle: Idle status of the CPU at which we're performing load-balancing.
+ + */
+ +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+ +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
+ +{
+ +      /*
+ +       * Busy processors will not participate in power savings
+ +       * balance.
+ +       */
+ +      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+ +              sds->power_savings_balance = 0;
+ +      else {
+ +              sds->power_savings_balance = 1;
+ +              sds->min_nr_running = ULONG_MAX;
+ +              sds->leader_nr_running = 0;
+ +      }
+ +}
   
- -              /* Tally up the load of all CPUs in the group */
- -              sum_weighted_load = sum_nr_running = avg_load = 0;
- -              sum_avg_load_per_task = avg_load_per_task = 0;
+ +/**
+ + * update_sd_power_savings_stats - Update the power saving stats for a
+ + * sched_domain while performing load balancing.
+ + *
+ + * @group: sched_group belonging to the sched_domain under consideration.
+ + * @sds: Variable containing the statistics of the sched_domain
+ + * @local_group: Does group contain the CPU for which we're performing
+ + *            load balancing ?
+ + * @sgs: Variable containing the statistics of the group.
+ + */
+ +static inline void update_sd_power_savings_stats(struct sched_group *group,
+ +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+ +{
+ +
+ +      if (!sds->power_savings_balance)
+ +              return;
   
- -              max_cpu_load = 0;
- -              min_cpu_load = ~0UL;
+ +      /*
+ +       * If the local group is idle or completely loaded
+ +       * no need to do power savings balance at this domain
+ +       */
+ +      if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+ +                              !sds->this_nr_running))
+ +              sds->power_savings_balance = 0;
   
- -              for_each_cpu_and(i, sched_group_cpus(group), cpus) {
- -                      struct rq *rq = cpu_rq(i);
+ +      /*
+ +       * If a group is already running at full capacity or idle,
+ +       * don't include that group in power savings calculations
+ +       */
+ +      if (!sds->power_savings_balance ||
+ +              sgs->sum_nr_running >= sgs->group_capacity ||
+ +              !sgs->sum_nr_running)
+ +              return;
   
- -                      if (*sd_idle && rq->nr_running)
- -                              *sd_idle = 0;
+ +      /*
+ +       * Calculate the group which has the least non-idle load.
+ +       * This is the group from where we need to pick up the load
+ +       * for saving power
+ +       */
+ +      if ((sgs->sum_nr_running < sds->min_nr_running) ||
+ +          (sgs->sum_nr_running == sds->min_nr_running &&
+ +           group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+ +              sds->group_min = group;
+ +              sds->min_nr_running = sgs->sum_nr_running;
+ +              sds->min_load_per_task = sgs->sum_weighted_load /
+ +                                              sgs->sum_nr_running;
+ +      }
   
- -                      /* Bias balancing toward cpus of our domain */
- -                      if (local_group) {
- -                              if (idle_cpu(i) && !first_idle_cpu) {
- -                                      first_idle_cpu = 1;
- -                                      balance_cpu = i;
- -                              }
+ +      /*
+ +       * Calculate the group which is almost near its
+ +       * capacity but still has some space to pick up some load
+ +       * from other group and save more power
+ +       */
+ +      if (sgs->sum_nr_running > sgs->group_capacity - 1)
+ +              return;
   
- -                              load = target_load(i, load_idx);
- -                      } else {
- -                              load = source_load(i, load_idx);
- -                              if (load > max_cpu_load)
- -                                      max_cpu_load = load;
- -                              if (min_cpu_load > load)
- -                                      min_cpu_load = load;
- -                      }
+ +      if (sgs->sum_nr_running > sds->leader_nr_running ||
+ +          (sgs->sum_nr_running == sds->leader_nr_running &&
+ +           group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+ +              sds->group_leader = group;
+ +              sds->leader_nr_running = sgs->sum_nr_running;
+ +      }
+ +}
   
- -                      avg_load += load;
- -                      sum_nr_running += rq->nr_running;
- -                      sum_weighted_load += weighted_cpuload(i);
+ +/**
+ + * check_power_save_busiest_group - Check if we have potential to perform
+ + *    some power-savings balance. If yes, set the busiest group to be
+ + *    the least loaded group in the sched_domain, so that its CPUs can
+ + *    be put to idle.
+ + *
+ + * @sds: Variable containing the statistics of the sched_domain
+ + *    under consideration.
+ + * @this_cpu: Cpu at which we're currently performing load-balancing.
+ + * @imbalance: Variable to store the imbalance.
+ + *
+ + * Returns 1 if there is potential to perform power-savings balance.
+ + * Else returns 0.
+ + */
+ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+ +                                      int this_cpu, unsigned long *imbalance)
+ +{
+ +      if (!sds->power_savings_balance)
+ +              return 0;
   
- -                      sum_avg_load_per_task += cpu_avg_load_per_task(i);
- -              }
+ +      if (sds->this != sds->group_leader ||
+ +                      sds->group_leader == sds->group_min)
+ +              return 0;
   
- -              /*
- -               * First idle cpu or the first cpu(busiest) in this sched group
- -               * is eligible for doing load balancing at this and above
- -               * domains. In the newly idle case, we will allow all the cpu's
- -               * to do the newly idle load balance.
- -               */
- -              if (idle != CPU_NEWLY_IDLE && local_group &&
- -                  balance_cpu != this_cpu && balance) {
- -                      *balance = 0;
- -                      goto ret;
- -              }
+ +      *imbalance = sds->min_load_per_task;
+ +      sds->busiest = sds->group_min;
   
- -              total_load += avg_load;
- -              total_pwr += group->__cpu_power;
+ +      if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+ +              cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+ +                      group_first_cpu(sds->group_leader);
+ +      }
   
- -              /* Adjust by relative CPU power of the group */
- -              avg_load = sg_div_cpu_power(group,
- -                              avg_load * SCHED_LOAD_SCALE);
+ +      return 1;
   
+ +}
+ +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+ +static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+ +      struct sd_lb_stats *sds, enum cpu_idle_type idle)
+ +{
+ +      return;
+ +}
   
- -              /*
- -               * Consider the group unbalanced when the imbalance is larger
- -               * than the average weight of two tasks.
- -               *
- -               * APZ: with cgroup the avg task weight can vary wildly and
- -               *      might not be a suitable number - should we keep a
- -               *      normalized nr_running number somewhere that negates
- -               *      the hierarchy?
- -               */
- -              avg_load_per_task = sg_div_cpu_power(group,
- -                              sum_avg_load_per_task * SCHED_LOAD_SCALE);
+ +static inline void update_sd_power_savings_stats(struct sched_group *group,
+ +      struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+ +{
+ +      return;
+ +}
+ +
+ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+ +                                      int this_cpu, unsigned long *imbalance)
+ +{
+ +      return 0;
+ +}
+ +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+ +
+ +
+ +/**
+ + * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ + * @group: sched_group whose statistics are to be updated.
+ + * @this_cpu: Cpu for which load balance is currently performed.
+ + * @idle: Idle status of this_cpu
+ + * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ + * @sd_idle: Idle status of the sched_domain containing group.
+ + * @local_group: Does group contain this_cpu.
+ + * @cpus: Set of cpus considered for load balancing.
+ + * @balance: Should we balance.
+ + * @sgs: variable to hold the statistics for this group.
+ + */
+ +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+ +                      enum cpu_idle_type idle, int load_idx, int *sd_idle,
+ +                      int local_group, const struct cpumask *cpus,
+ +                      int *balance, struct sg_lb_stats *sgs)
+ +{
+ +      unsigned long load, max_cpu_load, min_cpu_load;
+ +      int i;
+ +      unsigned int balance_cpu = -1, first_idle_cpu = 0;
+ +      unsigned long sum_avg_load_per_task;
+ +      unsigned long avg_load_per_task;
   
- -              if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
- -                      __group_imb = 1;
+ +      if (local_group)
+ +              balance_cpu = group_first_cpu(group);
   
- -              group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+ +      /* Tally up the load of all CPUs in the group */
+ +      sum_avg_load_per_task = avg_load_per_task = 0;
+ +      max_cpu_load = 0;
+ +      min_cpu_load = ~0UL;
   
+ +      for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ +              struct rq *rq = cpu_rq(i);
+ +
+ +              if (*sd_idle && rq->nr_running)
+ +                      *sd_idle = 0;
+ +
+ +              /* Bias balancing toward cpus of our domain */
                 if (local_group) {
- -                      this_load = avg_load;
- -                      this = group;
- -                      this_nr_running = sum_nr_running;
- -                      this_load_per_task = sum_weighted_load;
- -              } else if (avg_load > max_load &&
- -                         (sum_nr_running > group_capacity || __group_imb)) {
- -                      max_load = avg_load;
- -                      busiest = group;
- -                      busiest_nr_running = sum_nr_running;
- -                      busiest_load_per_task = sum_weighted_load;
- -                      group_imb = __group_imb;
+ +                      if (idle_cpu(i) && !first_idle_cpu) {
+ +                              first_idle_cpu = 1;
+ +                              balance_cpu = i;
+ +                      }
+ +
+ +                      load = target_load(i, load_idx);
+ +              } else {
+ +                      load = source_load(i, load_idx);
+ +                      if (load > max_cpu_load)
+ +                              max_cpu_load = load;
+ +                      if (min_cpu_load > load)
+ +                              min_cpu_load = load;
                 }
   
- -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- -              /*
- -               * Busy processors will not participate in power savings
- -               * balance.
- -               */
- -              if (idle == CPU_NOT_IDLE ||
- -                              !(sd->flags & SD_POWERSAVINGS_BALANCE))
- -                      goto group_next;
+ +              sgs->group_load += load;
+ +              sgs->sum_nr_running += rq->nr_running;
+ +              sgs->sum_weighted_load += weighted_cpuload(i);
   
- -              /*
- -               * If the local group is idle or completely loaded
- -               * no need to do power savings balance at this domain
- -               */
- -              if (local_group && (this_nr_running >= group_capacity ||
- -                                  !this_nr_running))
- -                      power_savings_balance = 0;
+ +              sum_avg_load_per_task += cpu_avg_load_per_task(i);
+ +      }
   
- -              /*
- -               * If a group is already running at full capacity or idle,
- -               * don't include that group in power savings calculations
- -               */
- -              if (!power_savings_balance || sum_nr_running >= group_capacity
- -                  || !sum_nr_running)
- -                      goto group_next;
+ +      /*
+ +       * First idle cpu or the first cpu(busiest) in this sched group
+ +       * is eligible for doing load balancing at this and above
+ +       * domains. In the newly idle case, we will allow all the cpus
+ +       * to do the newly idle load balance.
+ +       */
+ +      if (idle != CPU_NEWLY_IDLE && local_group &&
+ +          balance_cpu != this_cpu && balance) {
+ +              *balance = 0;
+ +              return;
+ +      }
   
- -              /*
- -               * Calculate the group which has the least non-idle load.
- -               * This is the group from where we need to pick up the load
- -               * for saving power
- -               */
- -              if ((sum_nr_running < min_nr_running) ||
- -                  (sum_nr_running == min_nr_running &&
- -                   cpumask_first(sched_group_cpus(group)) >
- -                   cpumask_first(sched_group_cpus(group_min)))) {
- -                      group_min = group;
- -                      min_nr_running = sum_nr_running;
- -                      min_load_per_task = sum_weighted_load /
- -                                              sum_nr_running;
- -              }
+ +      /* Adjust by relative CPU power of the group */
+ +      sgs->avg_load = sg_div_cpu_power(group,
+ +                      sgs->group_load * SCHED_LOAD_SCALE);
   
- -              /*
- -               * Calculate the group which is almost near its
- -               * capacity but still has some space to pick up some load
- -               * from other group and save more power
- -               */
- -              if (sum_nr_running <= group_capacity - 1) {
- -                      if (sum_nr_running > leader_nr_running ||
- -                          (sum_nr_running == leader_nr_running &&
- -                           cpumask_first(sched_group_cpus(group)) <
- -                           cpumask_first(sched_group_cpus(group_leader)))) {
- -                              group_leader = group;
- -                              leader_nr_running = sum_nr_running;
- -                      }
+ +
+ +      /*
+ +       * Consider the group unbalanced when the imbalance is larger
+ +       * than the average weight of two tasks.
+ +       *
+ +       * APZ: with cgroup the avg task weight can vary wildly and
+ +       *      might not be a suitable number - should we keep a
+ +       *      normalized nr_running number somewhere that negates
+ +       *      the hierarchy?
+ +       */
+ +      avg_load_per_task = sg_div_cpu_power(group,
+ +                      sum_avg_load_per_task * SCHED_LOAD_SCALE);
+ +
+ +      if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ +              sgs->group_imb = 1;
+ +
+ +      sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+ +
+ +}
+ +
+ +/**
+ + * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ + * @sd: sched_domain whose statistics are to be updated.
+ + * @this_cpu: Cpu for which load balance is currently performed.
+ + * @idle: Idle status of this_cpu
+ + * @sd_idle: Idle status of the sched_domain containing group.
+ + * @cpus: Set of cpus considered for load balancing.
+ + * @balance: Should we balance.
+ + * @sds: variable to hold the statistics for this sched_domain.
+ + */
+ +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+ +                      enum cpu_idle_type idle, int *sd_idle,
+ +                      const struct cpumask *cpus, int *balance,
+ +                      struct sd_lb_stats *sds)
+ +{
+ +      struct sched_group *group = sd->groups;
+ +      struct sg_lb_stats sgs;
+ +      int load_idx;
+ +
+ +      init_sd_power_savings_stats(sd, sds, idle);
+ +      load_idx = get_sd_load_idx(sd, idle);
+ +
+ +      do {
+ +              int local_group;
+ +
+ +              local_group = cpumask_test_cpu(this_cpu,
+ +                                             sched_group_cpus(group));
+ +              memset(&sgs, 0, sizeof(sgs));
+ +              update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+ +                              local_group, cpus, balance, &sgs);
+ +
+ +              if (local_group && balance && !(*balance))
+ +                      return;
+ +
+ +              sds->total_load += sgs.group_load;
+ +              sds->total_pwr += group->__cpu_power;
+ +
+ +              if (local_group) {
+ +                      sds->this_load = sgs.avg_load;
+ +                      sds->this = group;
+ +                      sds->this_nr_running = sgs.sum_nr_running;
+ +                      sds->this_load_per_task = sgs.sum_weighted_load;
+ +              } else if (sgs.avg_load > sds->max_load &&
+ +                         (sgs.sum_nr_running > sgs.group_capacity ||
+ +                              sgs.group_imb)) {
+ +                      sds->max_load = sgs.avg_load;
+ +                      sds->busiest = group;
+ +                      sds->busiest_nr_running = sgs.sum_nr_running;
+ +                      sds->busiest_load_per_task = sgs.sum_weighted_load;
+ +                      sds->group_imb = sgs.group_imb;
                 }
- -group_next:
- -#endif
+ +
+ +              update_sd_power_savings_stats(group, sds, local_group, &sgs);
                 group = group->next;
         } while (group != sd->groups);
   
- -      if (!busiest || this_load >= max_load || busiest_nr_running == 0)
- -              goto out_balanced;
- -
- -      avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+ +}
   
- -      if (this_load >= avg_load ||
- -                      100*max_load <= sd->imbalance_pct*this_load)
- -              goto out_balanced;
+ +/**
+ + * fix_small_imbalance - Calculate the minor imbalance that exists
+ + *                    amongst the groups of a sched_domain, during
+ + *                    load balancing.
+ + * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ + * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ + * @imbalance: Variable to store the imbalance.
+ + */
+ +static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+ +                              int this_cpu, unsigned long *imbalance)
+ +{
+ +      unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ +      unsigned int imbn = 2;
+ +
+ +      if (sds->this_nr_running) {
+ +              sds->this_load_per_task /= sds->this_nr_running;
+ +              if (sds->busiest_load_per_task >
+ +                              sds->this_load_per_task)
+ +                      imbn = 1;
+ +      } else
+ +              sds->this_load_per_task =
+ +                      cpu_avg_load_per_task(this_cpu);
   
- -      busiest_load_per_task /= busiest_nr_running;
- -      if (group_imb)
- -              busiest_load_per_task = min(busiest_load_per_task, avg_load);
+ +      if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+ +                      sds->busiest_load_per_task * imbn) {
+ +              *imbalance = sds->busiest_load_per_task;
+ +              return;
+ +      }
   
         /*
- -       * We're trying to get all the cpus to the average_load, so we don't
- -       * want to push ourselves above the average load, nor do we wish to
- -       * reduce the max loaded cpu below the average load, as either of these
- -       * actions would just result in more rebalancing later, and ping-pong
- -       * tasks around. Thus we look for the minimum possible imbalance.
- -       * Negative imbalances (*we* are more loaded than anyone else) will
- -       * be counted as no imbalance for these purposes -- we can't fix that
- -       * by pulling tasks to us. Be careful of negative numbers as they'll
- -       * appear as very large values with unsigned longs.
+ +       * OK, we don't have enough imbalance to justify moving tasks,
+ +       * however we may be able to increase total CPU power used by
+ +       * moving them.
          */
- -      if (max_load <= busiest_load_per_task)
- -              goto out_balanced;
   
+ +      pwr_now += sds->busiest->__cpu_power *
+ +                      min(sds->busiest_load_per_task, sds->max_load);
+ +      pwr_now += sds->this->__cpu_power *
+ +                      min(sds->this_load_per_task, sds->this_load);
+ +      pwr_now /= SCHED_LOAD_SCALE;
+ +
+ +      /* Amount of load we'd subtract */
+ +      tmp = sg_div_cpu_power(sds->busiest,
+ +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+ +      if (sds->max_load > tmp)
+ +              pwr_move += sds->busiest->__cpu_power *
+ +                      min(sds->busiest_load_per_task, sds->max_load - tmp);
+ +
+ +      /* Amount of load we'd add */
+ +      if (sds->max_load * sds->busiest->__cpu_power <
+ +              sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+ +              tmp = sg_div_cpu_power(sds->this,
+ +                      sds->max_load * sds->busiest->__cpu_power);
+ +      else
+ +              tmp = sg_div_cpu_power(sds->this,
+ +                      sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+ +      pwr_move += sds->this->__cpu_power *
+ +                      min(sds->this_load_per_task, sds->this_load + tmp);
+ +      pwr_move /= SCHED_LOAD_SCALE;
+ +
+ +      /* Move if we gain throughput */
+ +      if (pwr_move > pwr_now)
+ +              *imbalance = sds->busiest_load_per_task;
+ +}
+ +
+ +/**
+ + * calculate_imbalance - Calculate the amount of imbalance present within the
+ + *                     groups of a given sched_domain during load balance.
+ + * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ + * @this_cpu: Cpu for which load balance is currently being performed.
+ + * @imbalance: The variable to store the imbalance.
+ + */
+ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+ +              unsigned long *imbalance)
+ +{
+ +      unsigned long max_pull;
         /*
          * In the presence of smp nice balancing, certain scenarios can have
          * max load less than avg load(as we skip the groups at or below
          * its cpu_power, while calculating max_load..)
          */
- -      if (max_load < avg_load) {
+ +      if (sds->max_load < sds->avg_load) {
                 *imbalance = 0;
- -              goto small_imbalance;
+ +              return fix_small_imbalance(sds, this_cpu, imbalance);
         }
   
         /* Don't want to pull so many tasks that a group would go idle */
- -      max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+ +      max_pull = min(sds->max_load - sds->avg_load,
+ +                      sds->max_load - sds->busiest_load_per_task);
   
         /* How much load to actually move to equalise the imbalance */
- -      *imbalance = min(max_pull * busiest->__cpu_power,
- -                              (avg_load - this_load) * this->__cpu_power)
+ +      *imbalance = min(max_pull * sds->busiest->__cpu_power,
+ +              (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                         / SCHED_LOAD_SCALE;
   
         /*
@@@ -3670,110 -3333,78 +3670,110 @@@
          * a think about bumping its value to force at least one task to be
          * moved
          */
- -      if (*imbalance < busiest_load_per_task) {
- -              unsigned long tmp, pwr_now, pwr_move;
- -              unsigned int imbn;
- -
- -small_imbalance:
- -              pwr_move = pwr_now = 0;
- -              imbn = 2;
- -              if (this_nr_running) {
- -                      this_load_per_task /= this_nr_running;
- -                      if (busiest_load_per_task > this_load_per_task)
- -                              imbn = 1;
- -              } else
- -                      this_load_per_task = cpu_avg_load_per_task(this_cpu);
+ +      if (*imbalance < sds->busiest_load_per_task)
+ +              return fix_small_imbalance(sds, this_cpu, imbalance);
   
- -              if (max_load - this_load + busiest_load_per_task >=
- -                                      busiest_load_per_task * imbn) {
- -                      *imbalance = busiest_load_per_task;
- -                      return busiest;
- -              }
+ +}
+ +/******* find_busiest_group() helpers end here *********************/
   
- -              /*
- -               * OK, we don't have enough imbalance to justify moving tasks,
- -               * however we may be able to increase total CPU power used by
- -               * moving them.
- -               */
+ +/**
+ + * find_busiest_group - Returns the busiest group within the sched_domain
+ + * if there is an imbalance. If there isn't an imbalance, and
+ + * the user has opted for power-savings, it returns a group whose
+ + * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ + * such a group exists.
+ + *
+ + * Also calculates the amount of weighted load which should be moved
+ + * to restore balance.
+ + *
+ + * @sd: The sched_domain whose busiest group is to be returned.
+ + * @this_cpu: The cpu for which load balancing is currently being performed.
+ + * @imbalance: Variable which stores amount of weighted load which should
+ + *            be moved to restore balance/put a group to idle.
+ + * @idle: The idle status of this_cpu.
+ + * @sd_idle: The idleness of sd
+ + * @cpus: The set of CPUs under consideration for load-balancing.
+ + * @balance: Pointer to a variable indicating if this_cpu
+ + *    is the appropriate cpu to perform load balancing at this level.
+ + *
+ + * Returns:   - the busiest group if imbalance exists.
+ + *            - If no imbalance and user has opted for power-savings balance,
+ + *               return the least loaded group whose CPUs can be
+ + *               put to idle by rebalancing its tasks onto our group.
+ + */
+ +static struct sched_group *
+ +find_busiest_group(struct sched_domain *sd, int this_cpu,
+ +                 unsigned long *imbalance, enum cpu_idle_type idle,
+ +                 int *sd_idle, const struct cpumask *cpus, int *balance)
+ +{
+ +      struct sd_lb_stats sds;
   
- -              pwr_now += busiest->__cpu_power *
- -                              min(busiest_load_per_task, max_load);
- -              pwr_now += this->__cpu_power *
- -                              min(this_load_per_task, this_load);
- -              pwr_now /= SCHED_LOAD_SCALE;
- -
- -              /* Amount of load we'd subtract */
- -              tmp = sg_div_cpu_power(busiest,
- -                              busiest_load_per_task * SCHED_LOAD_SCALE);
- -              if (max_load > tmp)
- -                      pwr_move += busiest->__cpu_power *
- -                              min(busiest_load_per_task, max_load - tmp);
- -
- -              /* Amount of load we'd add */
- -              if (max_load * busiest->__cpu_power <
- -                              busiest_load_per_task * SCHED_LOAD_SCALE)
- -                      tmp = sg_div_cpu_power(this,
- -                                      max_load * busiest->__cpu_power);
- -              else
- -                      tmp = sg_div_cpu_power(this,
- -                              busiest_load_per_task * SCHED_LOAD_SCALE);
- -              pwr_move += this->__cpu_power *
- -                              min(this_load_per_task, this_load + tmp);
- -              pwr_move /= SCHED_LOAD_SCALE;
+ +      memset(&sds, 0, sizeof(sds));
   
- -              /* Move if we gain throughput */
- -              if (pwr_move > pwr_now)
- -                      *imbalance = busiest_load_per_task;
- -      }
+ +      /*
+ +       * Compute the various statistics relevant for load balancing at
+ +       * this level.
+ +       */
+ +      update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+ +                                      balance, &sds);
+ +
+ +      /* Cases where imbalance does not exist from POV of this_cpu */
+ +      /* 1) this_cpu is not the appropriate cpu to perform load balancing
+ +       *    at this level.
+ +       * 2) There is no busy sibling group to pull from.
+ +       * 3) This group is the busiest group.
+ +       * 4) This group is more busy than the avg busyness at this
+ +       *    sched_domain.
+ +       * 5) The imbalance is within the specified limit.
+ +       * 6) Any rebalance would lead to ping-pong
+ +       */
+ +      if (balance && !(*balance))
+ +              goto ret;
   
- -      return busiest;
+ +      if (!sds.busiest || sds.busiest_nr_running == 0)
+ +              goto out_balanced;
   
- -out_balanced:
- -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- -      if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- -              goto ret;
+ +      if (sds.this_load >= sds.max_load)
+ +              goto out_balanced;
   
- -      if (this == group_leader && group_leader != group_min) {
- -              *imbalance = min_load_per_task;
- -              if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
- -                      cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
- -                              cpumask_first(sched_group_cpus(group_leader));
- -              }
- -              return group_min;
- -      }
- -#endif
+ +      sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+ +
+ +      if (sds.this_load >= sds.avg_load)
+ +              goto out_balanced;
+ +
+ +      if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ +              goto out_balanced;
+ +
+ +      sds.busiest_load_per_task /= sds.busiest_nr_running;
+ +      if (sds.group_imb)
+ +              sds.busiest_load_per_task =
+ +                      min(sds.busiest_load_per_task, sds.avg_load);
+ +
+ +      /*
+ +       * We're trying to get all the cpus to the average_load, so we don't
+ +       * want to push ourselves above the average load, nor do we wish to
+ +       * reduce the max loaded cpu below the average load, as either of these
+ +       * actions would just result in more rebalancing later, and ping-pong
+ +       * tasks around. Thus we look for the minimum possible imbalance.
+ +       * Negative imbalances (*we* are more loaded than anyone else) will
+ +       * be counted as no imbalance for these purposes -- we can't fix that
+ +       * by pulling tasks to us. Be careful of negative numbers as they'll
+ +       * appear as very large values with unsigned longs.
+ +       */
+ +      if (sds.max_load <= sds.busiest_load_per_task)
+ +              goto out_balanced;
+ +
+ +      /* Looks like there is an imbalance. Compute it */
+ +      calculate_imbalance(&sds, this_cpu, imbalance);
+ +      return sds.busiest;
+ +
+ +out_balanced:
+ +      /*
+ +       * There is no obvious imbalance. But check if we can do some balancing
+ +       * to save power.
+ +       */
+ +      if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+ +              return sds.busiest;
   ret:
         *imbalance = 0;
         return NULL;
@@@ -4426,11 -4057,6 +4426,11 @@@ static void run_rebalance_domains(struc
   #endif
   }
   
+ +static inline int on_null_domain(int cpu)
+ +{
+ +      return !rcu_dereference(cpu_rq(cpu)->sd);
+ +}
+ +
   /*
    * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
    *
@@@ -4488,9 -4114,7 +4488,9 @@@ static inline void trigger_load_balance
             cpumask_test_cpu(cpu, nohz.cpu_mask))
                 return;
   #endif
- -      if (time_after_eq(jiffies, rq->next_balance))
+ +      /* Don't need to rebalance while attached to NULL domain */
+ +      if (time_after_eq(jiffies, rq->next_balance) &&
+ +          likely(!on_null_domain(cpu)))
                 raise_softirq(SCHED_SOFTIRQ);
   }
   
@@@ -4884,33 -4508,11 +4884,33 @@@ static inline void schedule_debug(struc
   #endif
   }
   
+ +static void put_prev_task(struct rq *rq, struct task_struct *prev)
+ +{
+ +      if (prev->state == TASK_RUNNING) {
+ +              u64 runtime = prev->se.sum_exec_runtime;
+ +
+ +              runtime -= prev->se.prev_sum_exec_runtime;
+ +              runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+ +
+ +              /*
+ +               * In order to avoid avg_overlap growing stale when we are
+ +               * indeed overlapping and hence not getting put to sleep, grow
+ +               * the avg_overlap on preemption.
+ +               *
+ +               * We use the average preemption runtime because that
+ +               * correlates to the amount of cache footprint a task can
+ +               * build up.
+ +               */
+ +              update_avg(&prev->se.avg_overlap, runtime);
+ +      }
+ +      prev->sched_class->put_prev_task(rq, prev);
+ +}
+ +
   /*
    * Pick up the highest-prio task:
    */
   static inline struct task_struct *
- -pick_next_task(struct rq *rq, struct task_struct *prev)
+ +pick_next_task(struct rq *rq)
   {
         const struct sched_class *class;
         struct task_struct *p;
@@@ -4984,8 -4586,8 +4984,8 @@@ need_resched_nonpreemptible
         if (unlikely(!rq->nr_running))
                 idle_balance(cpu, rq);
   
- -      prev->sched_class->put_prev_task(rq, prev);
- -      next = pick_next_task(rq, prev);
+ +      put_prev_task(rq, prev);
+ +      next = pick_next_task(rq);
   
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
@@@ -5040,7 -4642,7 +5040,7 @@@ asmlinkage void __sched preempt_schedul
                  * between schedule and now.
                  */
                 barrier();
- -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ +      } while (need_resched());
   }
   EXPORT_SYMBOL(preempt_schedule);
   
@@@ -5069,7 -4671,7 +5069,7 @@@ asmlinkage void __sched preempt_schedul
                  * between schedule and now.
                  */
                 barrier();
- -      } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ +      } while (need_resched());
   }
   
   #endif /* CONFIG_PREEMPT */
@@@ -5543,7 -5145,7 +5543,7 @@@ SYSCALL_DEFINE1(nice, int, increment
         if (increment > 40)
                 increment = 40;
   
- -      nice = PRIO_TO_NICE(current->static_prio) + increment;
+ +      nice = TASK_NICE(current) + increment;
         if (nice < -20)
                 nice = -20;
         if (nice > 19)
@@@ -6342,12 -5944,7 +6342,7 @@@ void sched_show_task(struct task_struc
                 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
   #endif
   #ifdef CONFIG_DEBUG_STACK_USAGE
-       {
-               unsigned long *n = end_of_stack(p);
-               while (!*n)
-                       n++;
-               free = (unsigned long)n - (unsigned long)end_of_stack(p);
-       }
+       free = stack_not_used(p);
   #endif
         printk(KERN_CONT "%5lu %5d %6d\n", free,
                 task_pid_nr(p), task_pid_nr(p->real_parent));
@@@ -6821,7 -6418,7 +6816,7 @@@ static void migrate_dead_tasks(unsigne
                 if (!rq->nr_running)
                         break;
                 update_rq_clock(rq);
- -              next = pick_next_task(rq, rq->curr);
+ +              next = pick_next_task(rq);
                 if (!next)
                         break;
                 next->sched_class->put_prev_task(rq, next);
@@@ -8616,15 -8213,11 +8611,15 @@@ static void init_rt_rq(struct rt_rq *rt
         __set_bit(MAX_RT_PRIO, array->bitmap);
   
   #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- -      rt_rq->highest_prio = MAX_RT_PRIO;
+ +      rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ +#ifdef CONFIG_SMP
+ +      rt_rq->highest_prio.next = MAX_RT_PRIO;
+ +#endif
   #endif
   #ifdef CONFIG_SMP
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
+ +      plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
   #endif
   
         rt_rq->rt_time = 0;
@@@ -9626,16 -9219,6 +9621,16 @@@ static int sched_rt_global_constraints(
   
         return ret;
   }
+ +
+ +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+ +{
+ +      /* Don't accept realtime tasks when there is no way for them to run */
+ +      if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ +              return 0;
+ +
+ +      return 1;
+ +}
+ +
   #else /* !CONFIG_RT_GROUP_SCHED */
   static int sched_rt_global_constraints(void)
   {
@@@ -9729,7 -9312,8 +9724,7 @@@ cpu_cgroup_can_attach(struct cgroup_sub
                       struct task_struct *tsk)
   {
   #ifdef CONFIG_RT_GROUP_SCHED
- -      /* Don't accept realtime tasks when there is no way for them to run */
- -      if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+ +      if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
                 return -EINVAL;
   #else
         /* We don't support RT-tasks being in separate groups */
@@@ -9892,7 -9476,7 +9887,7 @@@ cpuacct_destroy(struct cgroup_subsys *s
   
   static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
   {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
         u64 data;
   
   #ifndef CONFIG_64BIT
@@@ -9911,7 -9495,7 +9906,7 @@@
   
   static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
   {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
   
   #ifndef CONFIG_64BIT
         /*
@@@ -10000,14 -9584,14 +9995,14 @@@ static void cpuacct_charge(struct task_
         struct cpuacct *ca;
         int cpu;
   
- -      if (!cpuacct_subsys.active)
+ +      if (unlikely(!cpuacct_subsys.active))
                 return;
   
         cpu = task_cpu(tsk);
         ca = task_ca(tsk);
   
         for (; ca; ca = ca->parent) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
   }
diff --combined kernel/sched_rt.c

index c79dc7844012d7b69926298673bdffc6fa3ff79b,da932f4c85240abce39c62461b08a8d2c298bf9f..299d012b4394e8c62d3a677502e41c41802ab444
--- 1/kernel/sched_rt.c
--- 2/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@@ -3,40 -3,6 +3,40 @@@
    * policies)
    */
   
+ +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+ +{
+ +      return container_of(rt_se, struct task_struct, rt);
+ +}
+ +
+ +#ifdef CONFIG_RT_GROUP_SCHED
+ +
+ +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+ +{
+ +      return rt_rq->rq;
+ +}
+ +
+ +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+ +{
+ +      return rt_se->rt_rq;
+ +}
+ +
+ +#else /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+ +{
+ +      return container_of(rt_rq, struct rq, rt);
+ +}
+ +
+ +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+ +{
+ +      struct task_struct *p = rt_task_of(rt_se);
+ +      struct rq *rq = task_rq(p);
+ +
+ +      return &rq->rt;
+ +}
+ +
+ +#endif /* CONFIG_RT_GROUP_SCHED */
+ +
   #ifdef CONFIG_SMP
   
   static inline int rt_overloaded(struct rq *rq)
@@@ -71,69 -37,25 +71,69 @@@ static inline void rt_clear_overload(st
         cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
   }
   
- -static void update_rt_migration(struct rq *rq)
+ +static void update_rt_migration(struct rt_rq *rt_rq)
   {
- -      if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
- -              if (!rq->rt.overloaded) {
- -                      rt_set_overload(rq);
- -                      rq->rt.overloaded = 1;
+ +      if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
+ +              if (!rt_rq->overloaded) {
+ +                      rt_set_overload(rq_of_rt_rq(rt_rq));
+ +                      rt_rq->overloaded = 1;
                 }
- -      } else if (rq->rt.overloaded) {
- -              rt_clear_overload(rq);
- -              rq->rt.overloaded = 0;
+ +      } else if (rt_rq->overloaded) {
+ +              rt_clear_overload(rq_of_rt_rq(rt_rq));
+ +              rt_rq->overloaded = 0;
         }
   }
- -#endif /* CONFIG_SMP */
   
- -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+ +static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +      if (rt_se->nr_cpus_allowed > 1)
+ +              rt_rq->rt_nr_migratory++;
+ +
+ +      update_rt_migration(rt_rq);
+ +}
+ +
+ +static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +      if (rt_se->nr_cpus_allowed > 1)
+ +              rt_rq->rt_nr_migratory--;
+ +
+ +      update_rt_migration(rt_rq);
+ +}
+ +
+ +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ +      plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ +      plist_node_init(&p->pushable_tasks, p->prio);
+ +      plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ +}
+ +
+ +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ +      plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ +}
+ +
+ +#else
+ +
+ +static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
   {
- -      return container_of(rt_se, struct task_struct, rt);
   }
   
+ +static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+ +{
+ +}
+ +
+ +static inline
+ +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +}
+ +
+ +static inline
+ +void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +}
+ +
+ +#endif /* CONFIG_SMP */
+ +
   static inline int on_rt_rq(struct sched_rt_entity *rt_se)
   {
         return !list_empty(&rt_se->run_list);
@@@ -157,6 -79,16 +157,6 @@@ static inline u64 sched_rt_period(struc
   #define for_each_leaf_rt_rq(rt_rq, rq) \
         list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
   
- -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
- -{
- -      return rt_rq->rq;
- -}
- -
- -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
- -{
- -      return rt_se->rt_rq;
- -}
- -
   #define for_each_sched_rt_entity(rt_se) \
         for (; rt_se; rt_se = rt_se->parent)
   
@@@ -176,7 -108,7 +176,7 @@@ static void sched_rt_rq_enqueue(struct 
         if (rt_rq->rt_nr_running) {
                 if (rt_se && !on_rt_rq(rt_se))
                         enqueue_rt_entity(rt_se);
- -              if (rt_rq->highest_prio < curr->prio)
+ +              if (rt_rq->highest_prio.curr < curr->prio)
                         resched_task(curr);
         }
   }
@@@ -244,6 -176,19 +244,6 @@@ static inline u64 sched_rt_period(struc
   #define for_each_leaf_rt_rq(rt_rq, rq) \
         for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
   
- -static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
- -{
- -      return container_of(rt_rq, struct rq, rt);
- -}
- -
- -static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
- -{
- -      struct task_struct *p = rt_task_of(rt_se);
- -      struct rq *rq = task_rq(p);
- -
- -      return &rq->rt;
- -}
- -
   #define for_each_sched_rt_entity(rt_se) \
         for (; rt_se; rt_se = NULL)
   
@@@ -528,7 -473,7 +528,7 @@@ static inline int rt_se_prio(struct sch
         struct rt_rq *rt_rq = group_rt_rq(rt_se);
   
         if (rt_rq)
- -              return rt_rq->highest_prio;
+ +              return rt_rq->highest_prio.curr;
   #endif
   
         return rt_task_of(rt_se)->prio;
@@@ -602,174 -547,91 +602,174 @@@ static void update_curr_rt(struct rq *r
         }
   }
   
- -static inline
- -void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +#if defined CONFIG_SMP
+ +
+ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+ +
+ +static inline int next_prio(struct rq *rq)
   {
- -      WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- -      rt_rq->rt_nr_running++;
- -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- -      if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
- -#ifdef CONFIG_SMP
- -              struct rq *rq = rq_of_rt_rq(rt_rq);
- -#endif
+ +      struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+ +
+ +      if (next && rt_prio(next->prio))
+ +              return next->prio;
+ +      else
+ +              return MAX_RT_PRIO;
+ +}
+ +
+ +static void
+ +inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+ +{
+ +      struct rq *rq = rq_of_rt_rq(rt_rq);
+ +
+ +      if (prio < prev_prio) {
+ +
+ +              /*
+ +               * If the new task is higher in priority than anything on the
+ +               * run-queue, we know that the previous high becomes our
+ +               * next-highest.
+ +               */
+ +              rt_rq->highest_prio.next = prev_prio;
   
- -              rt_rq->highest_prio = rt_se_prio(rt_se);
- -#ifdef CONFIG_SMP
                 if (rq->online)
- -                      cpupri_set(&rq->rd->cpupri, rq->cpu,
- -                                 rt_se_prio(rt_se));
- -#endif
- -      }
- -#endif
- -#ifdef CONFIG_SMP
- -      if (rt_se->nr_cpus_allowed > 1) {
- -              struct rq *rq = rq_of_rt_rq(rt_rq);
+ +                      cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
   
- -              rq->rt.rt_nr_migratory++;
- -      }
+ +      } else if (prio == rt_rq->highest_prio.curr)
+ +              /*
+ +               * If the next task is equal in priority to the highest on
+ +               * the run-queue, then we implicitly know that the next highest
+ +               * task cannot be any lower than current
+ +               */
+ +              rt_rq->highest_prio.next = prio;
+ +      else if (prio < rt_rq->highest_prio.next)
+ +              /*
+ +               * Otherwise, we need to recompute next-highest
+ +               */
+ +              rt_rq->highest_prio.next = next_prio(rq);
+ +}
   
- -      update_rt_migration(rq_of_rt_rq(rt_rq));
- -#endif
- -#ifdef CONFIG_RT_GROUP_SCHED
- -      if (rt_se_boosted(rt_se))
- -              rt_rq->rt_nr_boosted++;
+ +static void
+ +dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+ +{
+ +      struct rq *rq = rq_of_rt_rq(rt_rq);
   
- -      if (rt_rq->tg)
- -              start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
- -#else
- -      start_rt_bandwidth(&def_rt_bandwidth);
- -#endif
+ +      if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
+ +              rt_rq->highest_prio.next = next_prio(rq);
+ +
+ +      if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+ +              cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
   }
   
+ +#else /* CONFIG_SMP */
+ +
   static inline
- -void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
- -{
- -#ifdef CONFIG_SMP
- -      int highest_prio = rt_rq->highest_prio;
- -#endif
+ +void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+ +static inline
+ +void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+ +
+ +#endif /* CONFIG_SMP */
   
- -      WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- -      WARN_ON(!rt_rq->rt_nr_running);
- -      rt_rq->rt_nr_running--;
   #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ +static void
+ +inc_rt_prio(struct rt_rq *rt_rq, int prio)
+ +{
+ +      int prev_prio = rt_rq->highest_prio.curr;
+ +
+ +      if (prio < prev_prio)
+ +              rt_rq->highest_prio.curr = prio;
+ +
+ +      inc_rt_prio_smp(rt_rq, prio, prev_prio);
+ +}
+ +
+ +static void
+ +dec_rt_prio(struct rt_rq *rt_rq, int prio)
+ +{
+ +      int prev_prio = rt_rq->highest_prio.curr;
+ +
         if (rt_rq->rt_nr_running) {
- -              struct rt_prio_array *array;
   
- -              WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
- -              if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
- -                      /* recalculate */
- -                      array = &rt_rq->active;
- -                      rt_rq->highest_prio =
+ +              WARN_ON(prio < prev_prio);
+ +
+ +              /*
+ +               * This may have been our highest task, and therefore
+ +               * we may have some recomputation to do
+ +               */
+ +              if (prio == prev_prio) {
+ +                      struct rt_prio_array *array = &rt_rq->active;
+ +
+ +                      rt_rq->highest_prio.curr =
                                 sched_find_first_bit(array->bitmap);
- -              } /* otherwise leave rq->highest prio alone */
+ +              }
+ +
         } else
- -              rt_rq->highest_prio = MAX_RT_PRIO;
- -#endif
- -#ifdef CONFIG_SMP
- -      if (rt_se->nr_cpus_allowed > 1) {
- -              struct rq *rq = rq_of_rt_rq(rt_rq);
- -              rq->rt.rt_nr_migratory--;
- -      }
+ +              rt_rq->highest_prio.curr = MAX_RT_PRIO;
   
- -      if (rt_rq->highest_prio != highest_prio) {
- -              struct rq *rq = rq_of_rt_rq(rt_rq);
+ +      dec_rt_prio_smp(rt_rq, prio, prev_prio);
+ +}
   
- -              if (rq->online)
- -                      cpupri_set(&rq->rd->cpupri, rq->cpu,
- -                                 rt_rq->highest_prio);
- -      }
+ +#else
+ +
+ +static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+ +static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+ +
+ +#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
   
- -      update_rt_migration(rq_of_rt_rq(rt_rq));
- -#endif /* CONFIG_SMP */
   #ifdef CONFIG_RT_GROUP_SCHED
+ +
+ +static void
+ +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +      if (rt_se_boosted(rt_se))
+ +              rt_rq->rt_nr_boosted++;
+ +
+ +      if (rt_rq->tg)
+ +              start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+ +}
+ +
+ +static void
+ +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
         if (rt_se_boosted(rt_se))
                 rt_rq->rt_nr_boosted--;
   
         WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
- -#endif
+ +}
+ +
+ +#else /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static void
+ +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +      start_rt_bandwidth(&def_rt_bandwidth);
+ +}
+ +
+ +static inline
+ +void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+ +
+ +#endif /* CONFIG_RT_GROUP_SCHED */
+ +
+ +static inline
+ +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +      int prio = rt_se_prio(rt_se);
+ +
+ +      WARN_ON(!rt_prio(prio));
+ +      rt_rq->rt_nr_running++;
+ +
+ +      inc_rt_prio(rt_rq, prio);
+ +      inc_rt_migration(rt_se, rt_rq);
+ +      inc_rt_group(rt_se, rt_rq);
+ +}
+ +
+ +static inline
+ +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+ +{
+ +      WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+ +      WARN_ON(!rt_rq->rt_nr_running);
+ +      rt_rq->rt_nr_running--;
+ +
+ +      dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ +      dec_rt_migration(rt_se, rt_rq);
+ +      dec_rt_group(rt_se, rt_rq);
   }
   
   static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@@ -856,9 -718,6 +856,9 @@@ static void enqueue_task_rt(struct rq *
   
         enqueue_rt_entity(rt_se);
   
+ +      if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ +              enqueue_pushable_task(rq, p);
+ +
         inc_cpu_load(rq, p->se.load.weight);
   }
   
@@@ -869,8 -728,6 +869,8 @@@ static void dequeue_task_rt(struct rq *
         update_curr_rt(rq);
         dequeue_rt_entity(rt_se);
   
+ +      dequeue_pushable_task(rq, p);
+ +
         dec_cpu_load(rq, p->se.load.weight);
   }
   
@@@ -1021,7 -878,7 +1021,7 @@@ static struct sched_rt_entity *pick_nex
         return next;
   }
   
- -static struct task_struct *pick_next_task_rt(struct rq *rq)
+ +static struct task_struct *_pick_next_task_rt(struct rq *rq)
   {
         struct sched_rt_entity *rt_se;
         struct task_struct *p;
@@@ -1043,18 -900,6 +1043,18 @@@
   
         p = rt_task_of(rt_se);
         p->se.exec_start = rq->clock;
+ +
+ +      return p;
+ +}
+ +
+ +static struct task_struct *pick_next_task_rt(struct rq *rq)
+ +{
+ +      struct task_struct *p = _pick_next_task_rt(rq);
+ +
+ +      /* The running task is never eligible for pushing */
+ +      if (p)
+ +              dequeue_pushable_task(rq, p);
+ +
         return p;
   }
   
@@@ -1062,13 -907,6 +1062,13 @@@ static void put_prev_task_rt(struct rq 
   {
         update_curr_rt(rq);
         p->se.exec_start = 0;
+ +
+ +      /*
+ +       * The previous task needs to be made eligible for pushing
+ +       * if it is still active
+ +       */
+ +      if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ +              enqueue_pushable_task(rq, p);
   }
   
   #ifdef CONFIG_SMP
@@@ -1122,12 -960,13 +1122,13 @@@ static struct task_struct *pick_next_hi
   
   static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
   
- static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+ static inline int pick_optimal_cpu(int this_cpu,
+                                  const struct cpumask *mask)
   {
         int first;
   
         /* "this_cpu" is cheaper to preempt than a remote processor */
-       if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+       if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
                 return this_cpu;
   
         first = cpumask_first(mask);
@@@ -1143,6 -982,7 +1144,7 @@@ static int find_lowest_rq(struct task_s
         struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
         int this_cpu = smp_processor_id();
         int cpu      = task_cpu(task);
+       cpumask_var_t domain_mask;
   
         if (task->rt.nr_cpus_allowed == 1)
                 return -1; /* No other targets possible */
@@@ -1175,19 -1015,25 +1177,25 @@@
         if (this_cpu == cpu)
                 this_cpu = -1; /* Skip this_cpu opt if the same */
   
-       for_each_domain(cpu, sd) {
-               if (sd->flags & SD_WAKE_AFFINE) {
-                       cpumask_t domain_mask;
-                       int       best_cpu;
+       if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
+               for_each_domain(cpu, sd) {
+                       if (sd->flags & SD_WAKE_AFFINE) {
+                               int best_cpu;
   
-                       cpumask_and(&domain_mask, sched_domain_span(sd),
-                                   lowest_mask);
+                               cpumask_and(domain_mask,
+                                           sched_domain_span(sd),
+                                           lowest_mask);
   
-                       best_cpu = pick_optimal_cpu(this_cpu,
-                                                   &domain_mask);
-                       if (best_cpu != -1)
-                               return best_cpu;
+                               best_cpu = pick_optimal_cpu(this_cpu,
+                                                           domain_mask);
+ 
+                               if (best_cpu != -1) {
+                                       free_cpumask_var(domain_mask);
+                                       return best_cpu;
+                               }
+                       }
                 }
+               free_cpumask_var(domain_mask);
         }
   
         /*
@@@ -1234,7 -1080,7 +1242,7 @@@ static struct rq *find_lock_lowest_rq(s
                 }
   
                 /* If this rq is still suitable use it. */
- -              if (lowest_rq->rt.highest_prio > task->prio)
+ +              if (lowest_rq->rt.highest_prio.curr > task->prio)
                         break;
   
                 /* try again */
@@@ -1245,31 -1091,6 +1253,31 @@@
         return lowest_rq;
   }
   
+ +static inline int has_pushable_tasks(struct rq *rq)
+ +{
+ +      return !plist_head_empty(&rq->rt.pushable_tasks);
+ +}
+ +
+ +static struct task_struct *pick_next_pushable_task(struct rq *rq)
+ +{
+ +      struct task_struct *p;
+ +
+ +      if (!has_pushable_tasks(rq))
+ +              return NULL;
+ +
+ +      p = plist_first_entry(&rq->rt.pushable_tasks,
+ +                            struct task_struct, pushable_tasks);
+ +
+ +      BUG_ON(rq->cpu != task_cpu(p));
+ +      BUG_ON(task_current(rq, p));
+ +      BUG_ON(p->rt.nr_cpus_allowed <= 1);
+ +
+ +      BUG_ON(!p->se.on_rq);
+ +      BUG_ON(!rt_task(p));
+ +
+ +      return p;
+ +}
+ +
   /*
    * If the current CPU has more than one RT task, see if the non
    * running task can migrate over to a CPU that is running a task
@@@ -1279,11 -1100,13 +1287,11 @@@ static int push_rt_task(struct rq *rq
   {
         struct task_struct *next_task;
         struct rq *lowest_rq;
- -      int ret = 0;
- -      int paranoid = RT_MAX_TRIES;
   
         if (!rq->rt.overloaded)
                 return 0;
   
- -      next_task = pick_next_highest_task_rt(rq, -1);
+ +      next_task = pick_next_pushable_task(rq);
         if (!next_task)
                 return 0;
   
@@@ -1312,34 -1135,16 +1320,34 @@@
                 struct task_struct *task;
                 /*
                  * find lock_lowest_rq releases rq->lock
- -               * so it is possible that next_task has changed.
- -               * If it has, then try again.
+ +               * so it is possible that next_task has migrated.
+ +               *
+ +               * We need to make sure that the task is still on the same
+ +               * run-queue and is also still the next task eligible for
+ +               * pushing.
                  */
- -              task = pick_next_highest_task_rt(rq, -1);
- -              if (unlikely(task != next_task) && task && paranoid--) {
- -                      put_task_struct(next_task);
- -                      next_task = task;
- -                      goto retry;
+ +              task = pick_next_pushable_task(rq);
+ +              if (task_cpu(next_task) == rq->cpu && task == next_task) {
+ +                      /*
+ +                       * If we get here, the task hasnt moved at all, but
+ +                       * it has failed to push.  We will not try again,
+ +                       * since the other cpus will pull from us when they
+ +                       * are ready.
+ +                       */
+ +                      dequeue_pushable_task(rq, next_task);
+ +                      goto out;
                 }
- -              goto out;
+ +
+ +              if (!task)
+ +                      /* No more tasks, just exit */
+ +                      goto out;
+ +
+ +              /*
+ +               * Something has shifted, try again.
+ +               */
+ +              put_task_struct(next_task);
+ +              next_task = task;
+ +              goto retry;
         }
   
         deactivate_task(rq, next_task, 0);
@@@ -1350,12 -1155,23 +1358,12 @@@
   
         double_unlock_balance(rq, lowest_rq);
   
- -      ret = 1;
   out:
         put_task_struct(next_task);
   
- -      return ret;
+ +      return 1;
   }
   
- -/*
- - * TODO: Currently we just use the second highest prio task on
- - *       the queue, and stop when it can't migrate (or there's
- - *       no more RT tasks).  There may be a case where a lower
- - *       priority RT task has a different affinity than the
- - *       higher RT task. In this case the lower RT task could
- - *       possibly be able to migrate where as the higher priority
- - *       RT task could not.  We currently ignore this issue.
- - *       Enhancements are welcome!
- - */
   static void push_rt_tasks(struct rq *rq)
   {
         /* push_rt_task will return true if it moved an RT */
@@@ -1366,35 -1182,33 +1374,35 @@@
   static int pull_rt_task(struct rq *this_rq)
   {
         int this_cpu = this_rq->cpu, ret = 0, cpu;
- -      struct task_struct *p, *next;
+ +      struct task_struct *p;
         struct rq *src_rq;
   
         if (likely(!rt_overloaded(this_rq)))
                 return 0;
   
- -      next = pick_next_task_rt(this_rq);
- -
         for_each_cpu(cpu, this_rq->rd->rto_mask) {
                 if (this_cpu == cpu)
                         continue;
   
                 src_rq = cpu_rq(cpu);
+ +
+ +              /*
+ +               * Don't bother taking the src_rq->lock if the next highest
+ +               * task is known to be lower-priority than our current task.
+ +               * This may look racy, but if this value is about to go
+ +               * logically higher, the src_rq will push this task away.
+ +               * And if its going logically lower, we do not care
+ +               */
+ +              if (src_rq->rt.highest_prio.next >=
+ +                  this_rq->rt.highest_prio.curr)
+ +                      continue;
+ +
                 /*
                  * We can potentially drop this_rq's lock in
                  * double_lock_balance, and another CPU could
- -               * steal our next task - hence we must cause
- -               * the caller to recalculate the next task
- -               * in that case:
+ +               * alter this_rq
                  */
- -              if (double_lock_balance(this_rq, src_rq)) {
- -                      struct task_struct *old_next = next;
- -
- -                      next = pick_next_task_rt(this_rq);
- -                      if (next != old_next)
- -                              ret = 1;
- -              }
+ +              double_lock_balance(this_rq, src_rq);
   
                 /*
                  * Are there still pullable RT tasks?
@@@ -1408,7 -1222,7 +1416,7 @@@
                  * Do we have an RT task that preempts
                  * the to-be-scheduled task?
                  */
- -              if (p && (!next || (p->prio < next->prio))) {
+ +              if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
                         WARN_ON(p == src_rq->curr);
                         WARN_ON(!p->se.on_rq);
   
@@@ -1418,9 -1232,12 +1426,9 @@@
                          * This is just that p is wakeing up and hasn't
                          * had a chance to schedule. We only pull
                          * p if it is lower in priority than the
- -                       * current task on the run queue or
- -                       * this_rq next task is lower in prio than
- -                       * the current task on that rq.
+ +                       * current task on the run queue
                          */
- -                      if (p->prio < src_rq->curr->prio ||
- -                          (next && next->prio < src_rq->curr->prio))
+ +                      if (p->prio < src_rq->curr->prio)
                                 goto skip;
   
                         ret = 1;
@@@ -1433,7 -1250,13 +1441,7 @@@
                          * case there's an even higher prio task
                          * in another runqueue. (low likelyhood
                          * but possible)
- -                       *
- -                       * Update next so that we won't pick a task
- -                       * on another cpu with a priority lower (or equal)
- -                       * than the one we just picked.
                          */
- -                      next = p;
- -
                 }
    skip:
                 double_unlock_balance(this_rq, src_rq);
@@@ -1445,27 -1268,24 +1453,27 @@@
   static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
   {
         /* Try to pull RT tasks here if we lower this rq's prio */
- -      if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+ +      if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
                 pull_rt_task(rq);
   }
   
+ +/*
+ + * assumes rq->lock is held
+ + */
+ +static int needs_post_schedule_rt(struct rq *rq)
+ +{
+ +      return has_pushable_tasks(rq);
+ +}
+ +
   static void post_schedule_rt(struct rq *rq)
   {
         /*
- -       * If we have more than one rt_task queued, then
- -       * see if we can push the other rt_tasks off to other CPUS.
- -       * Note we may release the rq lock, and since
- -       * the lock was owned by prev, we need to release it
- -       * first via finish_lock_switch and then reaquire it here.
+ +       * This is only called if needs_post_schedule_rt() indicates that
+ +       * we need to push tasks away
          */
- -      if (unlikely(rq->rt.overloaded)) {
- -              spin_lock_irq(&rq->lock);
- -              push_rt_tasks(rq);
- -              spin_unlock_irq(&rq->lock);
- -      }
+ +      spin_lock_irq(&rq->lock);
+ +      push_rt_tasks(rq);
+ +      spin_unlock_irq(&rq->lock);
   }
   
   /*
@@@ -1476,8 -1296,7 +1484,8 @@@ static void task_wake_up_rt(struct rq *
   {
         if (!task_running(rq, p) &&
             !test_tsk_need_resched(rq->curr) &&
- -          rq->rt.overloaded)
+ +          has_pushable_tasks(rq) &&
+ +          p->rt.nr_cpus_allowed > 1)
                 push_rt_tasks(rq);
   }
   
@@@ -1513,24 -1332,6 +1521,24 @@@ static void set_cpus_allowed_rt(struct 
         if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
                 struct rq *rq = task_rq(p);
   
+ +              if (!task_current(rq, p)) {
+ +                      /*
+ +                       * Make sure we dequeue this task from the pushable list
+ +                       * before going further.  It will either remain off of
+ +                       * the list because we are no longer pushable, or it
+ +                       * will be requeued.
+ +                       */
+ +                      if (p->rt.nr_cpus_allowed > 1)
+ +                              dequeue_pushable_task(rq, p);
+ +
+ +                      /*
+ +                       * Requeue if our weight is changing and still > 1
+ +                       */
+ +                      if (weight > 1)
+ +                              enqueue_pushable_task(rq, p);
+ +
+ +              }
+ +
                 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
                         rq->rt.rt_nr_migratory++;
                 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@@ -1538,7 -1339,7 +1546,7 @@@
                         rq->rt.rt_nr_migratory--;
                 }
   
- -              update_rt_migration(rq);
+ +              update_rt_migration(&rq->rt);
         }
   
         cpumask_copy(&p->cpus_allowed, new_mask);
@@@ -1553,7 -1354,7 +1561,7 @@@ static void rq_online_rt(struct rq *rq
   
         __enable_runtime(rq);
   
- -      cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+ +      cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
   }
   
   /* Assumes rq->lock is held */
@@@ -1645,7 -1446,7 +1653,7 @@@ static void prio_changed_rt(struct rq *
                  * can release the rq lock and p could migrate.
                  * Only reschedule if p is still on the same runqueue.
                  */
- -              if (p->prio > rq->rt.highest_prio && rq->curr == p)
+ +              if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
                         resched_task(p);
   #else
                 /* For UP simply resched on drop of prio */
@@@ -1716,9 -1517,6 +1724,9 @@@ static void set_curr_task_rt(struct rq 
         struct task_struct *p = rq->curr;
   
         p->se.exec_start = rq->clock;
+ +
+ +      /* The running task is never eligible for pushing */
+ +      dequeue_pushable_task(rq, p);
   }
   
   static const struct sched_class rt_sched_class = {
@@@ -1741,7 -1539,6 +1749,7 @@@
         .rq_online              = rq_online_rt,
         .rq_offline             = rq_offline_rt,
         .pre_schedule           = pre_schedule_rt,
+ +      .needs_post_schedule    = needs_post_schedule_rt,
         .post_schedule          = post_schedule_rt,
         .task_wake_up           = task_wake_up_rt,
         .switched_from          = switched_from_rt,
diff --combined kernel/softirq.c

index 9041ea7948feffbf887ed92fb8ce1e73ad55c81c,0365b4899a3d37e17c9f7fe4042e7ed3b4d0617b..57d3f67f6f38af7fdfb0fad66ff1cdbef790951e
--- 1/kernel/softirq.c
--- 2/kernel/softirq.c
+++ b/kernel/softirq.c
@@@ -626,7 -626,6 +626,7 @@@ static int ksoftirqd(void * __bind_cpu
                         preempt_enable_no_resched();
                         cond_resched();
                         preempt_disable();
+ +                      rcu_qsctr_inc((long)__bind_cpu);
                 }
                 preempt_enable();
                 set_current_state(TASK_INTERRUPTIBLE);
@@@ -796,6 -795,11 +796,11 @@@ int __init __weak early_irq_init(void
         return 0;
   }
   
+ int __init __weak arch_probe_nr_irqs(void)
+ {
+       return 0;
+ }
+ 
   int __init __weak arch_early_irq_init(void)
   {
         return 0;
diff --combined lib/Kconfig

index 54aaf4feaf6c01e05b436866af7522301890eaa2,daa481824d9c9a68438d097ec4bedd9c2114dc2e..2a9c69f3448216e8ce3f6a0f5488a947dfa55060
--- 1/lib/Kconfig
--- 2/lib/Kconfig
+++ b/lib/Kconfig
@@@ -97,6 -97,20 +97,20 @@@ config LZO_COMPRES
   config LZO_DECOMPRESS
         tristate
   
+ #
+ # These all provide a common interface (hence the apparent duplication with
+ # ZLIB_INFLATE; DECOMPRESS_GZIP is just a wrapper.)
+ #
+ config DECOMPRESS_GZIP
+       select ZLIB_INFLATE
+       tristate
+ 
+ config DECOMPRESS_BZIP2
+       tristate
+ 
+ config DECOMPRESS_LZMA
+       tristate
+ 
   #
   # Generic allocator support is selected if needed
   #
@@@ -136,6 -150,12 +150,6 @@@ config TEXTSEARCH_B
   config TEXTSEARCH_FSM
         tristate
   
- -#
- -# plist support is select#ed if needed
- -#
- -config PLIST
- -      boolean
- -
   config HAS_IOMEM
         boolean
         depends on !NO_IOMEM
@@@ -168,10 -188,4 +182,10 @@@ config DISABLE_OBSOLETE_CPUMASK_FUNCTIO
          bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
          depends on EXPERIMENTAL && BROKEN
   
+ +#
+ +# Netlink attribute parsing support is select'ed if needed
+ +#
+ +config NLATTR
+ +      bool
+ +
   endmenu
diff --combined lib/Makefile

index 8bdc647e6d6256f4d2c0821cff16415153a02dfe,790de7c25d0d01eac70dcfce6327ca100919da5a..051a33a8e0285b99c0bf4e98a61580d9e32431a8
--- 1/lib/Makefile
--- 2/lib/Makefile
+++ b/lib/Makefile
@@@ -12,7 -12,7 +12,7 @@@ lib-y := ctype.o string.o vsprintf.o cm
          idr.o int_sqrt.o extable.o prio_tree.o \
          sha1.o irq_regs.o reciprocal_div.o argv_split.o \
          proportions.o prio_heap.o ratelimit.o show_mem.o \
-        is_single_threaded.o plist.o
- -       is_single_threaded.o decompress.o
++       is_single_threaded.o plist.o decompress.o
   
   lib-$(CONFIG_MMU) += ioremap.o
   lib-$(CONFIG_SMP) += cpumask.o
@@@ -41,6 -41,7 +41,6 @@@ lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += 
   lib-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
   obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
   obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
- -obj-$(CONFIG_PLIST) += plist.o
   obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
   obj-$(CONFIG_DEBUG_LIST) += list_debug.o
   obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
@@@ -65,6 -66,10 +65,10 @@@ obj-$(CONFIG_REED_SOLOMON) += reed_solo
   obj-$(CONFIG_LZO_COMPRESS) += lzo/
   obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
   
+ lib-$(CONFIG_DECOMPRESS_GZIP) += decompress_inflate.o
+ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
+ lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o
+ 
   obj-$(CONFIG_TEXTSEARCH) += textsearch.o
   obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
   obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
@@@ -82,9 -87,7 +86,9 @@@ obj-$(CONFIG_HAVE_LMB) += lmb.
   
   obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
   
- -obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o
+ +obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
+ +
+ +obj-$(CONFIG_NLATTR) += nlattr.o
   
   hostprogs-y   := gen_crc32table
   clean-files   := crc32table.h
diff --combined net/ipv4/af_inet.c

index d5aaabbb7cb381054df71725d0ca2edc1a8289ff,3a3dad8013548d5581ae8e9250f8e818a2a3a122..7f03373b8c07b8efd7d3bba01221297dfad5ebaa
--- 1/net/ipv4/af_inet.c
--- 2/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@@ -369,6 -369,7 +369,6 @@@ lookup_protocol
         sock_init_data(sock, sk);
   
         sk->sk_destruct    = inet_sock_destruct;
- -      sk->sk_family      = PF_INET;
         sk->sk_protocol    = protocol;
         sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
   
@@@ -1252,10 -1253,10 +1252,10 @@@ static struct sk_buff **inet_gro_receiv
         int proto;
         int id;
   
- -      if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+ +      iph = skb_gro_header(skb, sizeof(*iph));
+ +      if (unlikely(!iph))
                 goto out;
   
- -      iph = ip_hdr(skb);
         proto = iph->protocol & (MAX_INET_PROTOS - 1);
   
         rcu_read_lock();
@@@ -1263,13 -1264,13 +1263,13 @@@
         if (!ops || !ops->gro_receive)
                 goto out_unlock;
   
- -      if (iph->version != 4 || iph->ihl != 5)
+ +      if (*(u8 *)iph != 0x45)
                 goto out_unlock;
   
         if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
                 goto out_unlock;
   
- -      flush = ntohs(iph->tot_len) != skb->len ||
+ +      flush = ntohs(iph->tot_len) != skb_gro_len(skb) ||
                 iph->frag_off != htons(IP_DF);
         id = ntohs(iph->id);
   
@@@ -1281,25 -1282,24 +1281,25 @@@
   
                 iph2 = ip_hdr(p);
   
- -              if (iph->protocol != iph2->protocol ||
- -                  iph->tos != iph2->tos ||
- -                  memcmp(&iph->saddr, &iph2->saddr, 8)) {
+ +              if ((iph->protocol ^ iph2->protocol) |
+ +                  (iph->tos ^ iph2->tos) |
+ +                  (iph->saddr ^ iph2->saddr) |
+ +                  (iph->daddr ^ iph2->daddr)) {
                         NAPI_GRO_CB(p)->same_flow = 0;
                         continue;
                 }
   
                 /* All fields must match except length and checksum. */
                 NAPI_GRO_CB(p)->flush |=
- -                      memcmp(&iph->frag_off, &iph2->frag_off, 4) ||
- -                      (u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) != id;
+ +                      (iph->ttl ^ iph2->ttl) |
+ +                      ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
   
                 NAPI_GRO_CB(p)->flush |= flush;
         }
   
         NAPI_GRO_CB(skb)->flush |= flush;
- -      __skb_pull(skb, sizeof(*iph));
- -      skb_reset_transport_header(skb);
+ +      skb_gro_pull(skb, sizeof(*iph));
+ +      skb_set_transport_header(skb, skb_gro_offset(skb));
   
         pp = ops->gro_receive(head, skb);
   
@@@ -1375,10 -1375,10 +1375,10 @@@ EXPORT_SYMBOL_GPL(snmp_fold_field)
   int snmp_mib_init(void *ptr[2], size_t mibsize)
   {
         BUG_ON(ptr == NULL);
-       ptr[0] = __alloc_percpu(mibsize);
+       ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
         if (!ptr[0])
                 goto err0;
-       ptr[1] = __alloc_percpu(mibsize);
+       ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
         if (!ptr[1])
                 goto err1;
         return 0;
@@@ -1500,8 -1500,8 +1500,8 @@@ static int ipv4_proc_init(void)
    *    IP protocol layer initialiser
    */
   
- -static struct packet_type ip_packet_type = {
- -      .type = __constant_htons(ETH_P_IP),
+ +static struct packet_type ip_packet_type __read_mostly = {
+ +      .type = cpu_to_be16(ETH_P_IP),
         .func = ip_rcv,
         .gso_send_check = inet_gso_send_check,
         .gso_segment = inet_gso_segment,
diff --combined net/ipv4/route.c

index 5caee609be06bb9f3b205a7a1a8628f655212ec5,bf895401218fe634df8432d914a02221d7c62766..c40debe51b38ace1224d3c9b9342ad0c73513e1c
--- 1/net/ipv4/route.c
--- 2/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@@ -151,7 -151,7 +151,7 @@@ static void rt_emergency_hash_rebuild(s
   
   static struct dst_ops ipv4_dst_ops = {
         .family =               AF_INET,
- -      .protocol =             __constant_htons(ETH_P_IP),
+ +      .protocol =             cpu_to_be16(ETH_P_IP),
         .gc =                   rt_garbage_collect,
         .check =                ipv4_dst_check,
         .destroy =              ipv4_dst_destroy,
@@@ -2696,7 -2696,7 +2696,7 @@@ static void ipv4_rt_blackhole_update_pm
   
   static struct dst_ops ipv4_dst_blackhole_ops = {
         .family                 =       AF_INET,
- -      .protocol               =       __constant_htons(ETH_P_IP),
+ +      .protocol               =       cpu_to_be16(ETH_P_IP),
         .destroy                =       ipv4_dst_destroy,
         .check                  =       ipv4_dst_check,
         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
@@@ -2779,8 -2779,7 +2779,8 @@@ int ip_route_output_key(struct net *net
         return ip_route_output_flow(net, rp, flp, NULL, 0);
   }
   
- -static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ +static int rt_fill_info(struct net *net,
+ +                      struct sk_buff *skb, u32 pid, u32 seq, int event,
                         int nowait, unsigned int flags)
   {
         struct rtable *rt = skb->rtable;
@@@ -2845,8 -2844,8 +2845,8 @@@
                 __be32 dst = rt->rt_dst;
   
                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
- -                  IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
- -                      int err = ipmr_get_route(skb, r, nowait);
+ +                  IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+ +                      int err = ipmr_get_route(net, skb, r, nowait);
                         if (err <= 0) {
                                 if (!nowait) {
                                         if (err == 0)
@@@ -2951,7 -2950,7 +2951,7 @@@ static int inet_rtm_getroute(struct sk_
         if (rtm->rtm_flags & RTM_F_NOTIFY)
                 rt->rt_flags |= RTCF_NOTIFY;
   
- -      err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+ +      err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                            RTM_NEWROUTE, 0, 0);
         if (err <= 0)
                 goto errout_free;
@@@ -2989,7 -2988,7 +2989,7 @@@ int ip_rt_dump(struct sk_buff *skb,  st
                         if (rt_is_expired(rt))
                                 continue;
                         skb->dst = dst_clone(&rt->u.dst);
- -                      if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+ +                      if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
                                          1, NLM_F_MULTI) <= 0) {
                                 dst_release(xchg(&skb->dst, NULL));
@@@ -3377,7 -3376,7 +3377,7 @@@ int __init ip_rt_init(void
         int rc = 0;
   
   #ifdef CONFIG_NET_CLS_ROUTE
-       ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
+       ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
         if (!ip_rt_acct)
                 panic("IP: failed to allocate ip_rt_acct\n");
   #endif
diff --combined scripts/Makefile.lib

index c18fa150b6fe502319c92bd09f98ee984d48819c,3b949a354470e7022cd71769351324b155d307e1..979619574f70f700368d2d7742760c9cdb1602f2
--- 1/scripts/Makefile.lib
--- 2/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@@ -97,7 -97,7 +97,7 @@@ modname_flags  = $(if $(filter 1,$(word
                    -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
   
   #hash values
- -ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
+ +ifdef CONFIG_DYNAMIC_DEBUG
   debug_flags = -D"DEBUG_HASH=$(shell ./scripts/basic/hash djb2 $(@D)$(modname))"\
                 -D"DEBUG_HASH2=$(shell ./scripts/basic/hash r5 $(@D)$(modname))"
   else
@@@ -186,3 -186,17 +186,17 @@@ quiet_cmd_gzip = GZIP    $
   cmd_gzip = gzip -f -9 < $< > $@
   
   
+ # Bzip2
+ # ---------------------------------------------------------------------------
+ 
+ # Bzip2 does not include size in file... so we have to fake that
+ size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size
+ 
+ quiet_cmd_bzip2 = BZIP2    $@
+ cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false)
+ 
+ # Lzma
+ # ---------------------------------------------------------------------------
+ 
+ quiet_cmd_lzma = LZMA    $@
+ cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false)
author	Ingo Molnar <mingo@elte.hu>
	Thu, 26 Mar 2009 20:39:17 +0000 (21:39 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Fri, 27 Mar 2009 16:28:43 +0000 (17:28 +0100)
		1	2
Documentation/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
arch/alpha/kernel/irq.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/arm/kernel/irq.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/blackfin/kernel/irqchip.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/ia64/kernel/irq.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/ia64/kernel/msi_ia64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/parisc/kernel/irq.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/powerpc/kernel/irq.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/sparc/kernel/irq_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/sparc/kernel/time_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/fixmap.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/cpufreq/e_powersaver.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/intel.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/efi.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/efi_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/reboot.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/setup.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/tsc.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/lguest/boot.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/pageattr.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/acpi/osl.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/net/sfc/efx.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/net/sfc/falcon.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/pci/intr_remapping.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/asm-generic/vmlinux.lds.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/interrupt.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/irq.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/irqnr.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
init/main.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/irq/chip.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/irq/handle.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/irq/internals.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/irq/manage.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/irq/numa_migrate.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/module.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched_rt.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/softirq.c	patch \|	diff1 \|	diff2 \|	blob \| history
lib/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
lib/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
net/ipv4/af_inet.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/ipv4/route.c	patch \|	diff1 \|	diff2 \|	blob \| history
scripts/Makefile.lib	patch \|	diff1 \|	diff2 \|	blob \| history