Merge branch 'linus' into sched/core

author Ingo Molnar <mingo@elte.hu>

Thu, 14 Oct 2010 07:11:43 +0000 (09:11 +0200)

committer Ingo Molnar <mingo@elte.hu>

Thu, 14 Oct 2010 07:11:46 +0000 (09:11 +0200)
author Ingo Molnar <mingo@elte.hu>
Thu, 14 Oct 2010 07:11:43 +0000 (09:11 +0200)
committer Ingo Molnar <mingo@elte.hu>
Thu, 14 Oct 2010 07:11:46 +0000 (09:11 +0200)
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt

index f1c5c4bccd3e8ed6674903eedbc59db3cf76ff9e..902d3151f527919ab190d8f30ff9253e3da2c8d0 100644 (file)
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -14,25 +14,39 @@ to /proc/cpuinfo.
         identifier (rather than the kernel's).  The actual value is
         architecture and platform dependent.
  
-3) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
+3) /sys/devices/system/cpu/cpuX/topology/book_id:
+
+       the book ID of cpuX. Typically it is the hardware platform's
+       identifier (rather than the kernel's).  The actual value is
+       architecture and platform dependent.
+
+4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
  
         internel kernel map of cpuX's hardware threads within the same
         core as cpuX
  
-4) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
  
         internal kernel map of cpuX's hardware threads within the same
         physical_package_id.
  
+6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
+
+       internal kernel map of cpuX's hardware threads within the same
+       book_id.
+
  To implement it in an architecture-neutral way, a new source file,
-drivers/base/topology.c, is to export the 4 attributes.
+drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
+related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
  
  For an architecture to support this feature, it must define some of
  these macros in include/asm-XXX/topology.h:
  #define topology_physical_package_id(cpu)
  #define topology_core_id(cpu)
+#define topology_book_id(cpu)
  #define topology_thread_cpumask(cpu)
  #define topology_core_cpumask(cpu)
+#define topology_book_cpumask(cpu)
  
  The type of **_id is int.
  The type of siblings is (const) struct cpumask *.
@@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h:
  3) thread_siblings: just the given CPU
  4) core_siblings: just the given CPU
  
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+
  Additionally, CPU topology information is provided under
  /sys/devices/system/cpu and includes these files.  The internal
  source for the output is in brackets ("[]").
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig

index f0777a47e3a531c17156a76694648aaf89cacd44..74a2f1b607a473b6bf39ced372bf89b96e0db452 100644 (file)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -198,6 +198,13 @@ config HOTPLUG_CPU
           can be controlled through /sys/devices/system/cpu/cpu#.
           Say N if you want to disable CPU hotplug.
  
+config SCHED_BOOK
+       bool "Book scheduler support"
+       depends on SMP
+       help
+         Book scheduler support improves the CPU scheduler's decision making
+         when dealing with machines that have several books.
+
  config MATHEMU
         bool "IEEE FPU emulation"
         depends on MARCH_G5
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h

index 831bd033ea77b3f2c72c408f40621a82d22ac017..051107a2c5e249397f71163544ae105dbde6d302 100644 (file)
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -3,15 +3,32 @@
  
  #include <linux/cpumask.h>
  
-#define mc_capable()   (1)
-
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
-
  extern unsigned char cpu_core_id[NR_CPUS];
  extern cpumask_t cpu_core_map[NR_CPUS];
  
+static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+{
+       return &cpu_core_map[cpu];
+}
+
  #define topology_core_id(cpu)          (cpu_core_id[cpu])
  #define topology_core_cpumask(cpu)     (&cpu_core_map[cpu])
+#define mc_capable()                   (1)
+
+#ifdef CONFIG_SCHED_BOOK
+
+extern unsigned char cpu_book_id[NR_CPUS];
+extern cpumask_t cpu_book_map[NR_CPUS];
+
+static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
+{
+       return &cpu_book_map[cpu];
+}
+
+#define topology_book_id(cpu)          (cpu_book_id[cpu])
+#define topology_book_cpumask(cpu)     (&cpu_book_map[cpu])
+
+#endif /* CONFIG_SCHED_BOOK */
  
  int topology_set_cpu_management(int fc);
  void topology_schedule_update(void);
@@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void)
  };
  #endif
  
+#define SD_BOOK_INIT   SD_CPU_INIT
+
  #include <asm-generic/topology.h>
  
  #endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c

index bcef00766a646dec0a5158bdf10c0574d8177293..13559c9938470b9d7d0c2597f6418e518d865c78 100644 (file)
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -57,8 +57,8 @@ struct tl_info {
         union tl_entry tle[0];
  };
  
-struct core_info {
-       struct core_info *next;
+struct mask_info {
+       struct mask_info *next;
         unsigned char id;
         cpumask_t mask;
  };
@@ -66,7 +66,6 @@ struct core_info {
  static int topology_enabled;
  static void topology_work_fn(struct work_struct *work);
  static struct tl_info *tl_info;
-static struct core_info core_info;
  static int machine_has_topology;
  static struct timer_list topology_timer;
  static void set_topology_timer(void);
@@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn);
  /* topology_lock protects the core linked list */
  static DEFINE_SPINLOCK(topology_lock);
  
+static struct mask_info core_info;
  cpumask_t cpu_core_map[NR_CPUS];
  unsigned char cpu_core_id[NR_CPUS];
  
-static cpumask_t cpu_coregroup_map(unsigned int cpu)
+#ifdef CONFIG_SCHED_BOOK
+static struct mask_info book_info;
+cpumask_t cpu_book_map[NR_CPUS];
+unsigned char cpu_book_id[NR_CPUS];
+#endif
+
+static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
  {
-       struct core_info *core = &core_info;
-       unsigned long flags;
         cpumask_t mask;
  
         cpus_clear(mask);
         if (!topology_enabled || !machine_has_topology)
                 return cpu_possible_map;
-       spin_lock_irqsave(&topology_lock, flags);
-       while (core) {
-               if (cpu_isset(cpu, core->mask)) {
-                       mask = core->mask;
+       while (info) {
+               if (cpu_isset(cpu, info->mask)) {
+                       mask = info->mask;
                         break;
                 }
-               core = core->next;
+               info = info->next;
         }
-       spin_unlock_irqrestore(&topology_lock, flags);
         if (cpus_empty(mask))
                 mask = cpumask_of_cpu(cpu);
         return mask;
  }
  
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
-{
-       return &cpu_core_map[cpu];
-}
-
-static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
+static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book,
+                            struct mask_info *core)
  {
         unsigned int cpu;
  
@@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
  
                 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
                 for_each_present_cpu(lcpu) {
-                       if (cpu_logical_map(lcpu) == rcpu) {
-                               cpu_set(lcpu, core->mask);
-                               cpu_core_id[lcpu] = core->id;
-                               smp_cpu_polarization[lcpu] = tl_cpu->pp;
-                       }
+                       if (cpu_logical_map(lcpu) != rcpu)
+                               continue;
+#ifdef CONFIG_SCHED_BOOK
+                       cpu_set(lcpu, book->mask);
+                       cpu_book_id[lcpu] = book->id;
+#endif
+                       cpu_set(lcpu, core->mask);
+                       cpu_core_id[lcpu] = core->id;
+                       smp_cpu_polarization[lcpu] = tl_cpu->pp;
                 }
         }
  }
  
-static void clear_cores(void)
+static void clear_masks(void)
  {
-       struct core_info *core = &core_info;
+       struct mask_info *info;
  
-       while (core) {
-               cpus_clear(core->mask);
-               core = core->next;
+       info = &core_info;
+       while (info) {
+               cpus_clear(info->mask);
+               info = info->next;
+       }
+#ifdef CONFIG_SCHED_BOOK
+       info = &book_info;
+       while (info) {
+               cpus_clear(info->mask);
+               info = info->next;
         }
+#endif
  }
  
  static union tl_entry *next_tle(union tl_entry *tle)
@@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle)
  
  static void tl_to_cores(struct tl_info *info)
  {
+#ifdef CONFIG_SCHED_BOOK
+       struct mask_info *book = &book_info;
+#else
+       struct mask_info *book = NULL;
+#endif
+       struct mask_info *core = &core_info;
         union tl_entry *tle, *end;
-       struct core_info *core = &core_info;
+
  
         spin_lock_irq(&topology_lock);
-       clear_cores();
+       clear_masks();
         tle = info->tle;
         end = (union tl_entry *)((unsigned long)info + info->length);
         while (tle < end) {
                 switch (tle->nl) {
-               case 5:
-               case 4:
-               case 3:
+#ifdef CONFIG_SCHED_BOOK
                 case 2:
+                       book = book->next;
+                       book->id = tle->container.id;
                         break;
+#endif
                 case 1:
                         core = core->next;
                         core->id = tle->container.id;
                         break;
                 case 0:
-                       add_cpus_to_core(&tle->cpu, core);
+                       add_cpus_to_mask(&tle->cpu, book, core);
                         break;
                 default:
-                       clear_cores();
+                       clear_masks();
                         machine_has_topology = 0;
                         goto out;
                 }
@@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc)
  
  static void update_cpu_core_map(void)
  {
+       unsigned long flags;
         int cpu;
  
-       for_each_possible_cpu(cpu)
-               cpu_core_map[cpu] = cpu_coregroup_map(cpu);
+       spin_lock_irqsave(&topology_lock, flags);
+       for_each_possible_cpu(cpu) {
+               cpu_core_map[cpu] = cpu_group_map(&core_info, cpu);
+#ifdef CONFIG_SCHED_BOOK
+               cpu_book_map[cpu] = cpu_group_map(&book_info, cpu);
+#endif
+       }
+       spin_unlock_irqrestore(&topology_lock, flags);
+}
+
+static void store_topology(struct tl_info *info)
+{
+#ifdef CONFIG_SCHED_BOOK
+       int rc;
+
+       rc = stsi(info, 15, 1, 3);
+       if (rc != -ENOSYS)
+               return;
+#endif
+       stsi(info, 15, 1, 2);
  }
  
  int arch_update_cpu_topology(void)
@@ -238,7 +274,7 @@ int arch_update_cpu_topology(void)
                 topology_update_polarization_simple();
                 return 0;
         }
-       stsi(info, 15, 1, 2);
+       store_topology(info);
         tl_to_cores(info);
         update_cpu_core_map();
         for_each_online_cpu(cpu) {
@@ -299,12 +335,24 @@ out:
  }
  __initcall(init_topology_update);
  
+static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset)
+{
+       int i, nr_masks;
+
+       nr_masks = info->mag[NR_MAG - offset];
+       for (i = 0; i < info->mnest - offset; i++)
+               nr_masks *= info->mag[NR_MAG - offset - 1 - i];
+       nr_masks = max(nr_masks, 1);
+       for (i = 0; i < nr_masks; i++) {
+               mask->next = alloc_bootmem(sizeof(struct mask_info));
+               mask = mask->next;
+       }
+}
+
  void __init s390_init_cpu_topology(void)
  {
         unsigned long long facility_bits;
         struct tl_info *info;
-       struct core_info *core;
-       int nr_cores;
         int i;
  
         if (stfle(&facility_bits, 1) <= 0)
@@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void)
  
         tl_info = alloc_bootmem_pages(PAGE_SIZE);
         info = tl_info;
-       stsi(info, 15, 1, 2);
-
-       nr_cores = info->mag[NR_MAG - 2];
-       for (i = 0; i < info->mnest - 2; i++)
-               nr_cores *= info->mag[NR_MAG - 3 - i];
-
+       store_topology(info);
         pr_info("The CPU configuration topology of the machine is:");
         for (i = 0; i < NR_MAG; i++)
                 printk(" %d", info->mag[i]);
         printk(" / %d\n", info->mnest);
-
-       core = &core_info;
-       for (i = 0; i < nr_cores; i++) {
-               core->next = alloc_bootmem(sizeof(struct core_info));
-               core = core->next;
-               if (!core)
-                       goto error;
-       }
-       return;
-error:
-       machine_has_topology = 0;
+       alloc_masks(info, &core_info, 2);
+#ifdef CONFIG_SCHED_BOOK
+       alloc_masks(info, &book_info, 3);
+#endif
  }
diff --git a/drivers/base/topology.c b/drivers/base/topology.c

index 9fc630ce1ddb4b46f93dd2c35a459849399efa8b..f6f37a05a0c3a5664503040bcdfaae32516d6e6c 100644 (file)
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev,            \
         return sprintf(buf, "%d\n", topology_##name(cpu));      \
  }
  
-#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
+#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \
+    defined(topology_book_cpumask)
  static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
  {
         ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask);
  define_one_ro_named(core_siblings, show_core_cpumask);
  define_one_ro_named(core_siblings_list, show_core_cpumask_list);
  
+#ifdef CONFIG_SCHED_BOOK
+define_id_show_func(book_id);
+define_one_ro(book_id);
+define_siblings_show_func(book_cpumask);
+define_one_ro_named(book_siblings, show_book_cpumask);
+define_one_ro_named(book_siblings_list, show_book_cpumask_list);
+#endif
+
  static struct attribute *default_attrs[] = {
         &attr_physical_package_id.attr,
         &attr_core_id.attr,
@@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = {
         &attr_thread_siblings_list.attr,
         &attr_core_siblings.attr,
         &attr_core_siblings_list.attr,
+#ifdef CONFIG_SCHED_BOOK
+       &attr_book_id.attr,
+       &attr_book_siblings.attr,
+       &attr_book_siblings_list.attr,
+#endif
         NULL
  };
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 1e2a6db2d7dd03466bf850dc5011860c23e8f9c9..cdf56693ecbf5460a8302805de2b8724c45101d7 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,7 @@ enum sched_domain_level {
         SD_LV_NONE = 0,
         SD_LV_SIBLING,
         SD_LV_MC,
+       SD_LV_BOOK,
         SD_LV_CPU,
         SD_LV_NODE,
         SD_LV_ALLNODES,
@@ -1681,8 +1682,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
  /*
   * Per process flags
   */
-#define PF_ALIGNWARN   0x00000001      /* Print alignment warning msgs */
-                                       /* Not implemented yet, only for 486*/
  #define PF_STARTING    0x00000002      /* being created */
  #define PF_EXITING     0x00000004      /* getting shut down */
  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
diff --git a/include/linux/topology.h b/include/linux/topology.h

index 64e084ff5e5c9a6e68257068821b79110a760f02..b91a40e847d236d9046dc3154a7e7a58ea594776 100644 (file)
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
         .balance_interval       = 64,                                   \
  }
  
+#ifdef CONFIG_SCHED_BOOK
+#ifndef SD_BOOK_INIT
+#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
+#endif
+#endif /* CONFIG_SCHED_BOOK */
+
  #ifdef CONFIG_NUMA
  #ifndef SD_NODE_INIT
  #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h

index 9208c92aeab5eee575b21f3000ea8034f6c00788..f6334782a593847907b82fcfc5756040b1bd6ff8 100644 (file)
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime,
                         (unsigned long long)__entry->vruntime)
  );
  
+/*
+ * Tracepoint for showing priority inheritance modifying a task's
+ * priority.
+ */
+TRACE_EVENT(sched_pi_setprio,
+
+       TP_PROTO(struct task_struct *tsk, int newprio),
+
+       TP_ARGS(tsk, newprio),
+
+       TP_STRUCT__entry(
+               __array( char,  comm,   TASK_COMM_LEN   )
+               __field( pid_t, pid                     )
+               __field( int,   oldprio                 )
+               __field( int,   newprio                 )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+               __entry->pid            = tsk->pid;
+               __entry->oldprio        = tsk->prio;
+               __entry->newprio        = newprio;
+       ),
+
+       TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
+                       __entry->comm, __entry->pid,
+                       __entry->oldprio, __entry->newprio)
+);
+
  #endif /* _TRACE_SCHED_H */
  
  /* This part must be outside protection */
diff --git a/kernel/sched.c b/kernel/sched.c

index dc85ceb908322cad7196339f4df8dd58c37b1cec..2111491f642473e5b2a3662462f03266678473ac 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
          */
         cpumask_var_t rto_mask;
         atomic_t rto_count;
-#ifdef CONFIG_SMP
         struct cpupri cpupri;
-#endif
  };
  
  /*
@@ -437,7 +435,7 @@ struct root_domain {
   */
  static struct root_domain def_root_domain;
  
-#endif
+#endif /* CONFIG_SMP */
  
  /*
   * This is the main, per-CPU runqueue data structure.
@@ -723,7 +721,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                 size_t cnt, loff_t *ppos)
  {
         char buf[64];
-       char *cmp = buf;
+       char *cmp;
         int neg = 0;
         int i;
  
@@ -734,6 +732,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                 return -EFAULT;
  
         buf[cnt] = 0;
+       cmp = strstrip(buf);
  
         if (strncmp(buf, "NO_", 3) == 0) {
                 neg = 1;
@@ -741,9 +740,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
         }
  
         for (i = 0; sched_feat_names[i]; i++) {
-               int len = strlen(sched_feat_names[i]);
-
-               if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
                         if (neg)
                                 sysctl_sched_features &= ~(1UL << i);
                         else
@@ -2852,14 +2849,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
          */
         arch_start_context_switch(prev);
  
-       if (likely(!mm)) {
+       if (!mm) {
                 next->active_mm = oldmm;
                 atomic_inc(&oldmm->mm_count);
                 enter_lazy_tlb(oldmm, next);
         } else
                 switch_mm(oldmm, mm, next);
  
-       if (likely(!prev->mm)) {
+       if (!prev->mm) {
                 prev->active_mm = NULL;
                 rq->prev_mm = oldmm;
         }
@@ -4358,6 +4355,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
  
         rq = task_rq_lock(p, &flags);
  
+       trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
         on_rq = p->se.on_rq;
@@ -6514,6 +6512,7 @@ struct s_data {
         cpumask_var_t           nodemask;
         cpumask_var_t           this_sibling_map;
         cpumask_var_t           this_core_map;
+       cpumask_var_t           this_book_map;
         cpumask_var_t           send_covered;
         cpumask_var_t           tmpmask;
         struct sched_group      **sched_group_nodes;
@@ -6525,6 +6524,7 @@ enum s_alloc {
         sa_rootdomain,
         sa_tmpmask,
         sa_send_covered,
+       sa_this_book_map,
         sa_this_core_map,
         sa_this_sibling_map,
         sa_nodemask,
@@ -6560,31 +6560,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
  #ifdef CONFIG_SCHED_MC
  static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
  static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
  
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
  cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
                   struct sched_group **sg, struct cpumask *mask)
  {
         int group;
-
+#ifdef CONFIG_SCHED_SMT
         cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
         group = cpumask_first(mask);
+#else
+       group = cpu;
+#endif
         if (sg)
                 *sg = &per_cpu(sched_group_core, group).sg;
         return group;
  }
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
  static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-                 struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
+       int group = cpu;
+#ifdef CONFIG_SCHED_MC
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#endif
         if (sg)
-               *sg = &per_cpu(sched_group_core, cpu).sg;
-       return cpu;
+               *sg = &per_cpu(sched_group_book, group).sg;
+       return group;
  }
-#endif
+#endif /* CONFIG_SCHED_BOOK */
  
  static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
  static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6594,7 +6611,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
                   struct sched_group **sg, struct cpumask *mask)
  {
         int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+       cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
         cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
         group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
@@ -6855,6 +6875,9 @@ SD_INIT_FUNC(CPU)
  #ifdef CONFIG_SCHED_MC
   SD_INIT_FUNC(MC)
  #endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
  
  static int default_relax_domain_level = -1;
  
@@ -6904,6 +6927,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
                 free_cpumask_var(d->tmpmask); /* fall through */
         case sa_send_covered:
                 free_cpumask_var(d->send_covered); /* fall through */
+       case sa_this_book_map:
+               free_cpumask_var(d->this_book_map); /* fall through */
         case sa_this_core_map:
                 free_cpumask_var(d->this_core_map); /* fall through */
         case sa_this_sibling_map:
@@ -6950,8 +6975,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
                 return sa_nodemask;
         if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
                 return sa_this_sibling_map;
-       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+       if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
                 return sa_this_core_map;
+       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+               return sa_this_book_map;
         if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
                 return sa_send_covered;
         d->rd = alloc_rootdomain();
@@ -7009,6 +7036,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
         return sd;
  }
  
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)
+{
+       struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+       sd = &per_cpu(book_domains, i).sd;
+       SD_INIT(sd, BOOK);
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+       sd->parent = parent;
+       parent->child = sd;
+       cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+       return sd;
+}
+
  static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
         const struct cpumask *cpu_map, struct sched_domain_attr *attr,
         struct sched_domain *parent, int i)
@@ -7065,6 +7109,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
                                                 &cpu_to_core_group,
                                                 d->send_covered, d->tmpmask);
                 break;
+#endif
+#ifdef CONFIG_SCHED_BOOK
+       case SD_LV_BOOK: /* set up book groups */
+               cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+               if (cpu == cpumask_first(d->this_book_map))
+                       init_sched_build_groups(d->this_book_map, cpu_map,
+                                               &cpu_to_book_group,
+                                               d->send_covered, d->tmpmask);
+               break;
  #endif
         case SD_LV_CPU: /* set up physical groups */
                 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
@@ -7113,12 +7166,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
  
                 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
                 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+               sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
                 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
                 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
         }
  
         for_each_cpu(i, cpu_map) {
                 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+               build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
                 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
         }
  
@@ -7149,6 +7204,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 init_sched_groups_power(i, sd);
         }
  #endif
+#ifdef CONFIG_SCHED_BOOK
+       for_each_cpu(i, cpu_map) {
+               sd = &per_cpu(book_domains, i).sd;
+               init_sched_groups_power(i, sd);
+       }
+#endif
  
         for_each_cpu(i, cpu_map) {
                 sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7235,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
                 sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+               sd = &per_cpu(book_domains, i).sd;
  #else
                 sd = &per_cpu(phys_domains, i).sd;
  #endif
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index db3f674ca49dbe93a611716b650bb8c715464da3..623e9aceef8f1bde4812525cb0bda5e6feb59270 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3031,7 +3031,14 @@ redo:
  
         if (!ld_moved) {
                 schedstat_inc(sd, lb_failed[idle]);
-               sd->nr_balance_failed++;
+               /*
+                * Increment the failure counter only on periodic balance.
+                * We do not want newidle balance, which can be very
+                * frequent, to pollute the failure counter and cause
+                * excessive cache_hot migrations and active balances.
+                */
+               if (idle != CPU_NEWLY_IDLE)
+                       sd->nr_balance_failed++;
  
                 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
                                         this_cpu)) {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index d10c80ebb67a2821038a9c59b912bf8e323d67e3..baef30f08405560d56bbe930bf366bd56eef5ef7 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
          * runqueue. Otherwise simply start this RT task
          * on its current runqueue.
          *
-        * We want to avoid overloading runqueues. Even if
-        * the RT task is of higher priority than the current RT task.
-        * RT tasks behave differently than other tasks. If
-        * one gets preempted, we try to push it off to another queue.
-        * So trying to keep a preempting RT task on the same
-        * cache hot CPU will force the running RT task to
-        * a cold CPU. So we waste all the cache for the lower
-        * RT task in hopes of saving some of a RT task
-        * that is just being woken and probably will have
-        * cold cache anyway.
+        * We want to avoid overloading runqueues. If the woken
+        * task has a higher priority, then it will stay on this CPU
+        * and the lower prio task should be moved to another CPU.
+        * Even though this will probably make the lower prio task
+        * lose its cache, we do not want to bounce a higher prio task
+        * around just because it gave up its CPU, perhaps for a
+        * lock?
+        *
+        * For equal prio tasks, we just let the scheduler sort it out.
          */
         if (unlikely(rt_task(rq->curr)) &&
+           (rq->curr->rt.nr_cpus_allowed < 2 ||
+            rq->curr->prio < p->prio) &&
             (p->rt.nr_cpus_allowed > 1)) {
                 int cpu = find_lowest_rq(p);
  
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
         if (!task_running(rq, p) &&
             !test_tsk_need_resched(rq->curr) &&
             has_pushable_tasks(rq) &&
-           p->rt.nr_cpus_allowed > 1)
+           p->rt.nr_cpus_allowed > 1 &&
+           rt_task(rq->curr) &&
+           (rq->curr->rt.nr_cpus_allowed < 2 ||
+            rq->curr->prio < p->prio))
                 push_rt_tasks(rq);
  }
author	Ingo Molnar <mingo@elte.hu>
	Thu, 14 Oct 2010 07:11:43 +0000 (09:11 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Thu, 14 Oct 2010 07:11:46 +0000 (09:11 +0200)
Documentation/cputopology.txt		patch \| blob \| history
arch/s390/Kconfig		patch \| blob \| history
arch/s390/include/asm/topology.h		patch \| blob \| history
arch/s390/kernel/topology.c		patch \| blob \| history
drivers/base/topology.c		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/topology.h		patch \| blob \| history
include/trace/events/sched.h		patch \| blob \| history
kernel/sched.c		patch \| blob \| history
kernel/sched_fair.c		patch \| blob \| history
kernel/sched_rt.c		patch \| blob \| history