sched: Nominate idle load balancer from a semi-idle package.

[sfrench/cifs-2.6.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 6cc1fd5d5072b69638c562d7e01697d4c9870684..b0fefa300b40189c2a5e1ecbb3fb995040c12234 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                    struct rq_iterator *iterator);
  #endif
  
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+       CPUACCT_STAT_USER,      /* ... user mode */
+       CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+
+       CPUACCT_STAT_NSTATS,
+};
+
  #ifdef CONFIG_CGROUP_CPUACCT
  static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val);
  #else
  static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val) {}
  #endif
  
  static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4228,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
  static struct {
         atomic_t load_balancer;
         cpumask_var_t cpu_mask;
+       cpumask_var_t ilb_grp_nohz_mask;
  } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
  };
  
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:       The cpu whose lowest level of sched domain is to
+ *             be returned.
+ * @flag:      The flag to check for the lowest sched_domain
+ *             for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+       struct sched_domain *sd;
+
+       for_each_domain(cpu, sd)
+               if (sd && (sd->flags & flag))
+                       break;
+
+       return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:       The cpu whose domains we're iterating over.
+ * @sd:                variable holding the value of the power_savings_sd
+ *             for cpu.
+ * @flag:      The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+       for (sd = lowest_flag_domain(cpu, flag); \
+               (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns:    1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi-idle if it has at least one idle CPU
+ * and at least one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+       cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                                       sched_group_cpus(ilb_group));
+
+       /*
+        * A sched_group is semi-idle when it has at least one busy cpu
+        * and at least one idle cpu.
+        */
+       if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+               return 0;
+
+       if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+               return 0;
+
+       return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:       The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:    The id of the idle load balancer if it exists,
+ *             else a value >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpus which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+       struct sched_domain *sd;
+       struct sched_group *ilb_group;
+
+       /*
+        * Have idle load balancer selection from semi-idle packages only
+        * when power-aware load balancing is enabled
+        */
+       if (!(sched_smt_power_savings || sched_mc_power_savings))
+               goto out_done;
+
+       /*
+        * Optimize for the case when we have no idle CPUs or only one
+        * idle CPU. Don't walk the sched_domain hierarchy in such cases
+        */
+       if (cpumask_weight(nohz.cpu_mask) < 2)
+               goto out_done;
+
+       for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+               ilb_group = sd->groups;
+
+               do {
+                       if (is_semi_idle_group(ilb_group))
+                               return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+                       ilb_group = ilb_group->next;
+
+               } while (ilb_group != sd->groups);
+       }
+
+out_done:
+       return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+       return cpumask_first(nohz.cpu_mask); /* cpu_mask is a cpumask_var_t */
+}
+#endif
+
  /*
   * This routine will try to nominate the ilb (idle load balancing)
   * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4456,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                 }
  
                 if (atomic_read(&nohz.load_balancer) == -1) {
-                       /*
-                        * simple selection for now: Nominate the
-                        * first cpu in the nohz list to be the next
-                        * ilb owner.
-                        *
-                        * TBD: Traverse the sched domains and nominate
-                        * the nearest cpu in the nohz.cpu_mask.
-                        */
-                       int ilb = cpumask_first(nohz.cpu_mask);
+                       int ilb = find_new_ilb(cpu);
  
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@ -4511,9 +4631,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
  EXPORT_PER_CPU_SYMBOL(kstat);
  
  /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
   * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
   */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+       u64 ns = 0;
+
+       if (task_current(rq, p)) {
+               update_rq_clock(rq);
+               ns = rq->clock - p->se.exec_start;
+               if ((s64)ns < 0)
+                       ns = 0;
+       }
+
+       return ns;
+}
+
  unsigned long long task_delta_exec(struct task_struct *p)
  {
         unsigned long flags;
@@ -4521,16 +4657,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
         u64 ns = 0;
  
         rq = task_rq_lock(p, &flags);
+       ns = do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
  
-       if (task_current(rq, p)) {
-               u64 delta_exec;
+       return ns;
+}
  
-               update_rq_clock(rq);
-               delta_exec = rq->clock - p->se.exec_start;
-               if ((s64)delta_exec > 0)
-                       ns = delta_exec;
-       }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that has not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns = 0;
+
+       rq = task_rq_lock(p, &flags);
+       ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
+
+       return ns;
+}
+
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that has not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include the pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+       struct task_cputime totals;
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns;
  
+       rq = task_rq_lock(p, &flags);
+       thread_group_cputime(p, &totals);
+       ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, &flags);
  
         return ns;
@@ -4559,6 +4728,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
         else
                 cpustat->user = cputime64_add(cpustat->user, tmp);
+
+       cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
         /* Account for user time used */
         acct_update_integrals(p);
  }
@@ -4620,6 +4791,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
  
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
         /* Account for system time used */
         acct_update_integrals(p);
  }
@@ -7302,7 +7475,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-               printk(KERN_CONT " %s", str);
+               printk(KERN_CONT " %s (__cpu_power = %d)", str,
+                                               group->__cpu_power);
  
                 group = group->next;
         } while (group != sd->groups);
@@ -8985,6 +9159,7 @@ void __init sched_init(void)
  #ifdef CONFIG_SMP
  #ifdef CONFIG_NO_HZ
         alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+       alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
  #endif
         alloc_bootmem_cpumask_var(&cpu_isolated_map);
  #endif /* SMP */
@@ -9925,6 +10100,7 @@ struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
         u64 *cpuusage;
+       struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
  };
  
@@ -9949,20 +10125,32 @@ static struct cgroup_subsys_state *cpuacct_create(
         struct cgroup_subsys *ss, struct cgroup *cgrp)
  {
         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+       int i;
  
         if (!ca)
-               return ERR_PTR(-ENOMEM);
+               goto out;
  
         ca->cpuusage = alloc_percpu(u64);
-       if (!ca->cpuusage) {
-               kfree(ca);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!ca->cpuusage)
+               goto out_free_ca;
+
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               if (percpu_counter_init(&ca->cpustat[i], 0))
+                       goto out_free_counters;
  
         if (cgrp->parent)
                 ca->parent = cgroup_ca(cgrp->parent);
  
         return &ca->css;
+
+out_free_counters:
+       while (--i >= 0)
+               percpu_counter_destroy(&ca->cpustat[i]);
+       free_percpu(ca->cpuusage);
+out_free_ca:
+       kfree(ca);
+out:
+       return ERR_PTR(-ENOMEM);
  }
  
  /* destroy an existing cpu accounting group */
@@ -9970,7 +10158,10 @@ static void
  cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
  {
         struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
  
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               percpu_counter_destroy(&ca->cpustat[i]);
         free_percpu(ca->cpuusage);
         kfree(ca);
  }
@@ -10057,6 +10248,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
         return 0;
  }
  
+static const char *cpuacct_stat_desc[] = {
+       [CPUACCT_STAT_USER] = "user",
+       [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+               struct cgroup_map_cb *cb)
+{
+       struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
+
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+               s64 val = percpu_counter_read(&ca->cpustat[i]);
+               val = cputime64_to_clock_t(val);
+               cb->fill(cb, cpuacct_stat_desc[i], val);
+       }
+       return 0;
+}
+
  static struct cftype files[] = {
         {
                 .name = "usage",
@@ -10067,7 +10277,10 @@ static struct cftype files[] = {
                 .name = "usage_percpu",
                 .read_seq_string = cpuacct_percpu_seq_read,
         },
-
+       {
+               .name = "stat",
+               .read_map = cpuacct_stats_show,
+       },
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10089,12 +10302,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
                 return;
  
         cpu = task_cpu(tsk);
+
+       rcu_read_lock();
+
         ca = task_ca(tsk);
  
         for (; ca; ca = ca->parent) {
                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
+
+       rcu_read_unlock();
+}
+
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val)
+{
+       struct cpuacct *ca;
+
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       rcu_read_lock();
+       ca = task_ca(tsk);
+
+       do {
+               percpu_counter_add(&ca->cpustat[idx], val);
+               ca = ca->parent;
+       } while (ca);
+       rcu_read_unlock();
  }
  
  struct cgroup_subsys cpuacct_subsys = {