cpu hotplug, sched: Introduce cpu_active_map and redo sched domain managment (take 2)

author Max Krasnyansky <maxk@qualcomm.com>

Tue, 15 Jul 2008 11:43:49 +0000 (04:43 -0700)

committer Ingo Molnar <mingo@elte.hu>

Fri, 18 Jul 2008 11:22:25 +0000 (13:22 +0200)
author Max Krasnyansky <maxk@qualcomm.com>
Tue, 15 Jul 2008 11:43:49 +0000 (04:43 -0700)
committer Ingo Molnar <mingo@elte.hu>
Fri, 18 Jul 2008 11:22:25 +0000 (13:22 +0200)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h

index c24875bd9c5b0ef1b6347f61a87716c5e8673914..d614d24727985cf0825bd76a94b9910baea7b4f8 100644 (file)
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -359,13 +359,14 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
  
  /*
   * The following particular system cpumasks and operations manage
- * possible, present and online cpus.  Each of them is a fixed size
+ * possible, present, active and online cpus.  Each of them is a fixed size
   * bitmap of size NR_CPUS.
   *
   *  #ifdef CONFIG_HOTPLUG_CPU
   *     cpu_possible_map - has bit 'cpu' set iff cpu is populatable
   *     cpu_present_map  - has bit 'cpu' set iff cpu is populated
   *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
+ *     cpu_active_map   - has bit 'cpu' set iff cpu available to migration
   *  #else
   *     cpu_possible_map - has bit 'cpu' set iff cpu is populated
   *     cpu_present_map  - copy of cpu_possible_map
@@ -416,6 +417,7 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
  extern cpumask_t cpu_possible_map;
  extern cpumask_t cpu_online_map;
  extern cpumask_t cpu_present_map;
+extern cpumask_t cpu_active_map;
  
  #if NR_CPUS > 1
  #define num_online_cpus()      cpus_weight(cpu_online_map)
@@ -424,6 +426,7 @@ extern cpumask_t cpu_present_map;
  #define cpu_online(cpu)                cpu_isset((cpu), cpu_online_map)
  #define cpu_possible(cpu)      cpu_isset((cpu), cpu_possible_map)
  #define cpu_present(cpu)       cpu_isset((cpu), cpu_present_map)
+#define cpu_active(cpu)                cpu_isset((cpu), cpu_active_map)
  #else
  #define num_online_cpus()      1
  #define num_possible_cpus()    1
@@ -431,6 +434,7 @@ extern cpumask_t cpu_present_map;
  #define cpu_online(cpu)                ((cpu) == 0)
  #define cpu_possible(cpu)      ((cpu) == 0)
  #define cpu_present(cpu)       ((cpu) == 0)
+#define cpu_active(cpu)                ((cpu) == 0)
  #endif
  
  #define cpu_is_offline(cpu)    unlikely(!cpu_online(cpu))
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h

index 038578362b47fba21d555593b635c5d0eabebe15..e8f450c499b0ac42302fe6c0cf91ecb4115973f2 100644 (file)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -78,6 +78,8 @@ extern void cpuset_track_online_nodes(void);
  
  extern int current_cpuset_is_being_rebound(void);
  
+extern void rebuild_sched_domains(void);
+
  #else /* !CONFIG_CPUSETS */
  
  static inline int cpuset_init_early(void) { return 0; }
@@ -156,6 +158,11 @@ static inline int current_cpuset_is_being_rebound(void)
         return 0;
  }
  
+static inline void rebuild_sched_domains(void)
+{
+       partition_sched_domains(0, NULL, NULL);
+}
+
  #endif /* !CONFIG_CPUSETS */
  
  #endif /* _LINUX_CPUSET_H */
diff --git a/init/main.c b/init/main.c

index edeace036fd96b08de9f5b57fffcb04cf8f7c44c..dd25259530eacb334d618cc862451b98b6708157 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -415,6 +415,13 @@ static void __init smp_init(void)
  {
         unsigned int cpu;
  
+       /*
+        * Set up the current CPU as possible to migrate to.
+        * The other ones will be done by cpu_up/cpu_down()
+        */
+       cpu = smp_processor_id();
+       cpu_set(cpu, cpu_active_map);
+
         /* FIXME: This should be done in userspace --RR */
         for_each_present_cpu(cpu) {
                 if (num_online_cpus() >= setup_max_cpus)
diff --git a/kernel/cpu.c b/kernel/cpu.c

index cfb1d43ab801b69f24c47c0cdb17332c25f6f99e..a1ac7ea245d7ba96727ae3647552564cefab120d 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
         cpu_hotplug.refcount = 0;
  }
  
+cpumask_t cpu_active_map;
+
  #ifdef CONFIG_HOTPLUG_CPU
  
  void get_online_cpus(void)
@@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu)
         int err = 0;
  
         cpu_maps_update_begin();
-       if (cpu_hotplug_disabled)
+
+       if (cpu_hotplug_disabled) {
                 err = -EBUSY;
-       else
-               err = _cpu_down(cpu, 0);
+               goto out;
+       }
+
+       cpu_clear(cpu, cpu_active_map);
+
+       err = _cpu_down(cpu, 0);
+
+       if (cpu_online(cpu))
+               cpu_set(cpu, cpu_active_map);
  
+out:
         cpu_maps_update_done();
         return err;
  }
@@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu)
         }
  
         cpu_maps_update_begin();
-       if (cpu_hotplug_disabled)
+
+       if (cpu_hotplug_disabled) {
                 err = -EBUSY;
-       else
-               err = _cpu_up(cpu, 0);
+               goto out;
+       }
  
+       err = _cpu_up(cpu, 0);
+
+       if (cpu_online(cpu))
+               cpu_set(cpu, cpu_active_map);
+
+out:
         cpu_maps_update_done();
         return err;
  }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index 459d601947a82a29693bbaa7444c74542311919e..3c3ef02f65f109a52a6623f5353ea71e473a796a 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
   *     partition_sched_domains().
   */
  
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
  {
         struct kfifo *q;        /* queue of cpusets to be scanned */
         struct cpuset *cp;      /* scans q */
diff --git a/kernel/sched.c b/kernel/sched.c

index 1ee18dbb451659de9feb1de076708a89777d22a6..c237624a8a042be35bfe49eeb17e6b04f7915e22 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
  
         rq = task_rq_lock(p, &flags);
         if (!cpu_isset(dest_cpu, p->cpus_allowed)
-           || unlikely(cpu_is_offline(dest_cpu)))
+           || unlikely(!cpu_active(dest_cpu)))
                 goto out;
  
         /* force the process onto the specified CPU */
@@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick)
                 /*
                  * If we are going offline and still the leader, give up!
                  */
-               if (cpu_is_offline(cpu) &&
+               if (!cpu_active(cpu) &&
                     atomic_read(&nohz.load_balancer) == cpu) {
                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                 BUG();
@@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         struct rq *rq_dest, *rq_src;
         int ret = 0, on_rq;
  
-       if (unlikely(cpu_is_offline(dest_cpu)))
+       if (unlikely(!cpu_active(dest_cpu)))
                 return ret;
  
         rq_src = cpu_rq(src_cpu);
@@ -7553,18 +7553,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
  {
  }
  
-/*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-       ndoms_cur = 0;
-       if (doms_cur != &fallback_doms)
-               kfree(doms_cur);
-       doms_cur = &fallback_doms;
-}
-
  /*
   * Set up scheduler domains and groups. Callers must hold the hotplug lock.
   * For now this just excludes isolated cpus, but could be used to
@@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
   * ownership of it and will kfree it when done with it. If the caller
   * failed the kmalloc call, then it can pass in doms_new == NULL,
   * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms', it also forces the domains to be rebuilt.
   *
   * Call with hotplug lock held
   */
@@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
         /* always unregister in case we don't destroy any domains */
         unregister_sched_domain_sysctl();
  
-       if (doms_new == NULL) {
-               ndoms_new = 1;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-               dattr_new = NULL;
-       }
+       if (doms_new == NULL)
+               ndoms_new = 0;
  
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
@@ -7677,6 +7661,14 @@ match1:
                 ;
         }
  
+       if (doms_new == NULL) {
+               ndoms_cur = 0;
+               ndoms_new = 1;
+               doms_new = &fallback_doms;
+               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+               dattr_new = NULL;
+       }
+
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
                 for (j = 0; j < ndoms_cur; j++) {
@@ -7707,17 +7699,10 @@ match2:
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
  int arch_reinit_sched_domains(void)
  {
-       int err;
-
         get_online_cpus();
-       mutex_lock(&sched_domains_mutex);
-       detach_destroy_domains(&cpu_online_map);
-       free_sched_domains();
-       err = arch_init_sched_domains(&cpu_online_map);
-       mutex_unlock(&sched_domains_mutex);
+       rebuild_sched_domains();
         put_online_cpus();
-
-       return err;
+       return 0;
  }
  
  static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7783,14 +7768,30 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
  }
  #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
+#ifndef CONFIG_CPUSETS
  /*
- * Force a reinitialization of the sched domains hierarchy. The domains
- * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
+ * Add online and remove offline CPUs from the scheduler domains.
+ * When cpusets are enabled they take over this function.
   */
  static int update_sched_domains(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
+{
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               partition_sched_domains(0, NULL, NULL);
+               return NOTIFY_OK;
+
+       default:
+               return NOTIFY_DONE;
+       }
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+                               unsigned long action, void *hcpu)
  {
         int cpu = (int)(long)hcpu;
  
@@ -7798,44 +7799,18 @@ static int update_sched_domains(struct notifier_block *nfb,
         case CPU_DOWN_PREPARE:
         case CPU_DOWN_PREPARE_FROZEN:
                 disable_runtime(cpu_rq(cpu));
-               /* fall-through */
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               detach_destroy_domains(&cpu_online_map);
-               free_sched_domains();
                 return NOTIFY_OK;
  
-
         case CPU_DOWN_FAILED:
         case CPU_DOWN_FAILED_FROZEN:
         case CPU_ONLINE:
         case CPU_ONLINE_FROZEN:
                 enable_runtime(cpu_rq(cpu));
-               /* fall-through */
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               /*
-                * Fall through and re-initialise the domains.
-                */
-               break;
+               return NOTIFY_OK;
+
         default:
                 return NOTIFY_DONE;
         }
-
-#ifndef CONFIG_CPUSETS
-       /*
-        * Create default domain partitioning if cpusets are disabled.
-        * Otherwise we let cpusets rebuild the domains based on the
-        * current setup.
-        */
-
-       /* The hotplug lock is already held by cpu_up/cpu_down */
-       arch_init_sched_domains(&cpu_online_map);
-#endif
-
-       return NOTIFY_OK;
  }
  
  void __init sched_init_smp(void)
@@ -7855,8 +7830,15 @@ void __init sched_init_smp(void)
                 cpu_set(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
         /* XXX: Theoretical race here - CPU may be hotplugged now */
         hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+       /* RT runtime code needs to handle some hotplug events */
+       hotcpu_notifier(update_runtime, 0);
+
         init_hrtick();
  
         /* Move init over to a non-isolated CPU */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index f2aa987027d695750f2ca4b8f917d02171eeb3b8..d924c679dfac84bcd027e636cc2f8f3cf3e7a718 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq)
   * not idle and an idle cpu is available.  The span of cpus to
   * search starts with cpus closest then further out as needed,
   * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
   *
   * Returns the CPU we should wake onto.
   */
@@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p)
                     || ((sd->flags & SD_WAKE_IDLE_FAR)
                         && !task_hot(p, task_rq(p)->clock, sd))) {
                         cpus_and(tmp, sd->span, p->cpus_allowed);
+                       cpus_and(tmp, tmp, cpu_active_map);
                         for_each_cpu_mask(i, tmp) {
                                 if (idle_cpu(i)) {
                                         if (i != task_cpu(p)) {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index d3d1cccb3d7b3a2b4dac15be9a8ee53bcf35ee2d..50735bb961491bbe645f048b0715e0725782ed61 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -933,6 +933,13 @@ static int find_lowest_rq(struct task_struct *task)
         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
                 return -1; /* No targets found */
  
+       /*
+        * Only consider CPUs that are usable for migration.
+        * I guess we might want to change cpupri_find() to ignore those
+        * in the first place.
+        */
+       cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+
         /*
          * At this point we have built a mask of cpus representing the
          * lowest priority tasks in the system.  Now we want to elect
author	Max Krasnyansky <maxk@qualcomm.com>
	Tue, 15 Jul 2008 11:43:49 +0000 (04:43 -0700)
committer	Ingo Molnar <mingo@elte.hu>
	Fri, 18 Jul 2008 11:22:25 +0000 (13:22 +0200)
include/linux/cpumask.h		patch \| blob \| history
include/linux/cpuset.h		patch \| blob \| history
init/main.c		patch \| blob \| history
kernel/cpu.c		patch \| blob \| history
kernel/cpuset.c		patch \| blob \| history
kernel/sched.c		patch \| blob \| history
kernel/sched_fair.c		patch \| blob \| history
kernel/sched_rt.c		patch \| blob \| history