Merge branch 'locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

[sfrench/cifs-2.6.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 8e2558c2ba67be440ef7bcb2d95ff8138c806404..196d48babbef87c088214e4f252b4397d2bf25be 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
   */
  static DEFINE_SPINLOCK(task_group_lock);
  
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+       return list_empty(&root_task_group.children);
+}
+#endif
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_USER_SCHED
  # define INIT_TASK_GROUP_LOAD  (2*NICE_0_LOAD)
@@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
  
  #else
  
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+       return 1;
+}
+#endif
+
  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
  static inline struct task_group *task_group(struct task_struct *p)
  {
@@ -467,11 +481,17 @@ struct rt_rq {
         struct rt_prio_array active;
         unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       int highest_prio; /* highest queued rt task prio */
+       struct {
+               int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+               int next; /* next highest */
+#endif
+       } highest_prio;
  #endif
  #ifdef CONFIG_SMP
         unsigned long rt_nr_migratory;
         int overloaded;
+       struct plist_head pushable_tasks;
  #endif
         int rt_throttled;
         u64 rt_time;
@@ -549,7 +569,6 @@ struct rq {
         unsigned long nr_running;
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-       unsigned char idle_at_tick;
  #ifdef CONFIG_NO_HZ
         unsigned long last_tick_seen;
         unsigned char in_nohz_recently;
@@ -590,6 +609,7 @@ struct rq {
         struct root_domain *rd;
         struct sched_domain *sd;
  
+       unsigned char idle_at_tick;
         /* For active balancing */
         int active_balance;
         int push_cpu;
@@ -618,9 +638,6 @@ struct rq {
         /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
         /* sys_sched_yield() stats */
-       unsigned int yld_exp_empty;
-       unsigned int yld_act_empty;
-       unsigned int yld_both_empty;
         unsigned int yld_count;
  
         /* schedule() stats */
@@ -1183,10 +1200,10 @@ static void resched_task(struct task_struct *p)
  
         assert_spin_locked(&task_rq(p)->lock);
  
-       if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+       if (test_tsk_need_resched(p))
                 return;
  
-       set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+       set_tsk_need_resched(p);
  
         cpu = task_cpu(p);
         if (cpu == smp_processor_id())
@@ -1242,7 +1259,7 @@ void wake_up_idle_cpu(int cpu)
          * lockless. The worst case is that the other CPU runs the
          * idle task through an additional NOOP schedule()
          */
-       set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+       set_tsk_need_resched(rq->idle);
  
         /* NEED_RESCHED must be visible before we test polling */
         smp_mb();
@@ -1610,21 +1627,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
  
  #endif
  
+#ifdef CONFIG_PREEMPT
+
  /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations.  This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below.  However, it
+ * also adds more overhead and therefore may reduce throughput.
   */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+
+       return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry.  This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(this_rq->lock)
         __acquires(busiest->lock)
         __acquires(this_rq->lock)
  {
         int ret = 0;
  
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
         if (unlikely(!spin_trylock(&busiest->lock))) {
                 if (busiest < this_rq) {
                         spin_unlock(&this_rq->lock);
@@ -1637,6 +1675,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
         return ret;
  }
  
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+
+       return _double_lock_balance(this_rq, busiest);
+}
+
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(busiest->lock)
  {
@@ -1705,6 +1759,9 @@ static void update_avg(u64 *avg, u64 sample)
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
+       if (wakeup)
+               p->se.start_runtime = p->se.sum_exec_runtime;
+
         sched_info_queued(p);
         p->sched_class->enqueue_task(rq, p, wakeup);
         p->se.on_rq = 1;
@@ -1712,10 +1769,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
-       if (sleep && p->se.last_wakeup) {
-               update_avg(&p->se.avg_overlap,
-                          p->se.sum_exec_runtime - p->se.last_wakeup);
-               p->se.last_wakeup = 0;
+       if (sleep) {
+               if (p->se.last_wakeup) {
+                       update_avg(&p->se.avg_overlap,
+                               p->se.sum_exec_runtime - p->se.last_wakeup);
+                       p->se.last_wakeup = 0;
+               } else {
+                       update_avg(&p->se.avg_wakeup,
+                               sysctl_sched_wakeup_granularity);
+               }
         }
  
         sched_info_dequeued(p);
@@ -2017,7 +2079,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                  * it must be off the runqueue _entirely_, and not
                  * preempted!
                  *
-                * So if it wa still runnable (but just not actively
+                * So if it was still runnable (but just not actively
                  * running right now), it's preempted, and we should
                  * yield - it could be a while.
                  */
@@ -2267,7 +2329,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                 sync = 0;
  
  #ifdef CONFIG_SMP
-       if (sched_feat(LB_WAKEUP_UPDATE)) {
+       if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                 struct sched_domain *sd;
  
                 this_cpu = raw_smp_processor_id();
@@ -2345,6 +2407,22 @@ out_activate:
         activate_task(rq, p, 1);
         success = 1;
  
+       /*
+        * Only attribute actual wakeups done by this task.
+        */
+       if (!in_interrupt()) {
+               struct sched_entity *se = &current->se;
+               u64 sample = se->sum_exec_runtime;
+
+               if (se->last_wakeup)
+                       sample -= se->last_wakeup;
+               else
+                       sample -= se->start_runtime;
+               update_avg(&se->avg_wakeup, sample);
+
+               se->last_wakeup = se->sum_exec_runtime;
+       }
+
  out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
@@ -2355,8 +2433,6 @@ out_running:
                 p->sched_class->task_wake_up(rq, p);
  #endif
  out:
-       current->se.last_wakeup = current->se.sum_exec_runtime;
-
         task_rq_unlock(rq, &flags);
  
         return success;
@@ -2386,6 +2462,8 @@ static void __sched_fork(struct task_struct *p)
         p->se.prev_sum_exec_runtime     = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
+       p->se.start_runtime             = 0;
+       p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
         p->se.wait_start                = 0;
@@ -2448,6 +2526,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
  #endif
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
         put_cpu();
  }
  
@@ -2491,7 +2571,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
  #ifdef CONFIG_PREEMPT_NOTIFIERS
  
  /**
- * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
   * @notifier: notifier struct to register
   */
  void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2588,6 +2668,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
  {
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
+#ifdef CONFIG_SMP
+       int post_schedule = 0;
+
+       if (current->sched_class->needs_post_schedule)
+               post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
  
         rq->prev_mm = NULL;
  
@@ -2606,7 +2692,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
-       if (current->sched_class->post_schedule)
+       if (post_schedule)
                 current->sched_class->post_schedule(rq);
  #endif
  
@@ -2913,6 +2999,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned)
  {
+       int tsk_cache_hot = 0;
         /*
          * We do not migrate tasks that are:
          * 1) running (obviously), or
@@ -2936,10 +3023,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
          * 2) too many balance attempts have failed.
          */
  
-       if (!task_hot(p, rq->clock, sd) ||
-                       sd->nr_balance_failed > sd->cache_nice_tries) {
+       tsk_cache_hot = task_hot(p, rq->clock, sd);
+       if (!tsk_cache_hot ||
+               sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
-               if (task_hot(p, rq->clock, sd)) {
+               if (tsk_cache_hot) {
                         schedstat_inc(sd, lb_hot_gained[idle]);
                         schedstat_inc(p, se.nr_forced_migrations);
                 }
@@ -2947,7 +3035,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
                 return 1;
         }
  
-       if (task_hot(p, rq->clock, sd)) {
+       if (tsk_cache_hot) {
                 schedstat_inc(p, se.nr_failed_migrations_hot);
                 return 0;
         }
@@ -2987,6 +3075,16 @@ next:
         pulled++;
         rem_load_move -= p->se.load.weight;
  
+#ifdef CONFIG_PREEMPT
+       /*
+        * NEWIDLE balancing is a source of latency, so preemptible kernels
+        * will stop after the first task is pulled to minimize the critical
+        * section.
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               goto out;
+#endif
+
         /*
          * We only want to steal up to the prescribed amount of weighted load.
          */
@@ -3033,9 +3131,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                                 sd, idle, all_pinned, &this_best_prio);
                 class = class->next;
  
+#ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
                 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                         break;
-
+#endif
         } while (class && max_load_move > total_load_moved);
  
         return total_load_moved > 0;
@@ -3085,246 +3189,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
  
         return 0;
  }
-
+/********** Helpers for find_busiest_group ************************/
  /*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *             during load balancing.
   */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const struct cpumask *cpus, int *balance)
-{
-       struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-       unsigned long max_load, avg_load, total_load, this_load, total_pwr;
-       unsigned long max_pull;
-       unsigned long busiest_load_per_task, busiest_nr_running;
-       unsigned long this_load_per_task, this_nr_running;
-       int load_idx, group_imb = 0;
+struct sd_lb_stats {
+       struct sched_group *busiest; /* Busiest group in this sd */
+       struct sched_group *this;  /* Local group in this sd */
+       unsigned long total_load;  /* Total load of all groups in sd */
+       unsigned long total_pwr;   /*   Total power of all groups in sd */
+       unsigned long avg_load;    /* Average load across all groups in sd */
+
+       /** Statistics of this group */
+       unsigned long this_load;
+       unsigned long this_load_per_task;
+       unsigned long this_nr_running;
+
+       /* Statistics of the busiest group */
+       unsigned long max_load;
+       unsigned long busiest_load_per_task;
+       unsigned long busiest_nr_running;
+
+       int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       int power_savings_balance = 1;
-       unsigned long leader_nr_running = 0, min_load_per_task = 0;
-       unsigned long min_nr_running = ULONG_MAX;
-       struct sched_group *group_min = NULL, *group_leader = NULL;
+       int power_savings_balance; /* Is powersave balance needed for this sd */
+       struct sched_group *group_min; /* Least loaded group in sd */
+       struct sched_group *group_leader; /* Group which relieves group_min */
+       unsigned long min_load_per_task; /* load_per_task in group_min */
+       unsigned long leader_nr_running; /* Nr running of group_leader */
+       unsigned long min_nr_running; /* Nr running of group_min */
  #endif
+};
+
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+       unsigned long avg_load; /*Avg load across the CPUs of the group */
+       unsigned long group_load; /* Total load over the CPUs of the group */
+       unsigned long sum_nr_running; /* Nr tasks running in the group */
+       unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+       unsigned long group_capacity;
+       int group_imb; /* Is there an imbalance in the group ? */
+};
  
-       max_load = this_load = total_load = total_pwr = 0;
-       busiest_load_per_task = busiest_nr_running = 0;
-       this_load_per_task = this_nr_running = 0;
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+       return cpumask_first(sched_group_cpus(group));
+}
  
-       if (idle == CPU_NOT_IDLE)
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+                                       enum cpu_idle_type idle)
+{
+       int load_idx;
+
+       switch (idle) {
+       case CPU_NOT_IDLE:
                 load_idx = sd->busy_idx;
-       else if (idle == CPU_NEWLY_IDLE)
+               break;
+
+       case CPU_NEWLY_IDLE:
                 load_idx = sd->newidle_idx;
-       else
+               break;
+       default:
                 load_idx = sd->idle_idx;
+               break;
+       }
  
-       do {
-               unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
-               int local_group;
-               int i;
-               int __group_imb = 0;
-               unsigned int balance_cpu = -1, first_idle_cpu = 0;
-               unsigned long sum_nr_running, sum_weighted_load;
-               unsigned long sum_avg_load_per_task;
-               unsigned long avg_load_per_task;
+       return load_idx;
+}
  
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
  
-               if (local_group)
-                       balance_cpu = cpumask_first(sched_group_cpus(group));
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+       /*
+        * Busy processors will not participate in power savings
+        * balance.
+        */
+       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+               sds->power_savings_balance = 0;
+       else {
+               sds->power_savings_balance = 1;
+               sds->min_nr_running = ULONG_MAX;
+               sds->leader_nr_running = 0;
+       }
+}
  
-               /* Tally up the load of all CPUs in the group */
-               sum_weighted_load = sum_nr_running = avg_load = 0;
-               sum_avg_load_per_task = avg_load_per_task = 0;
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ *             load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
  
-               max_cpu_load = 0;
-               min_cpu_load = ~0UL;
+       if (!sds->power_savings_balance)
+               return;
  
-               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-                       struct rq *rq = cpu_rq(i);
+       /*
+        * If the local group is idle or completely loaded
+        * no need to do power savings balance at this domain
+        */
+       if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+                               !sds->this_nr_running))
+               sds->power_savings_balance = 0;
  
-                       if (*sd_idle && rq->nr_running)
-                               *sd_idle = 0;
+       /*
+        * If a group is already running at full capacity or idle,
+        * don't include that group in power savings calculations
+        */
+       if (!sds->power_savings_balance ||
+               sgs->sum_nr_running >= sgs->group_capacity ||
+               !sgs->sum_nr_running)
+               return;
  
-                       /* Bias balancing toward cpus of our domain */
-                       if (local_group) {
-                               if (idle_cpu(i) && !first_idle_cpu) {
-                                       first_idle_cpu = 1;
-                                       balance_cpu = i;
-                               }
+       /*
+        * Calculate the group which has the least non-idle load.
+        * This is the group from where we need to pick up the load
+        * for saving power
+        */
+       if ((sgs->sum_nr_running < sds->min_nr_running) ||
+           (sgs->sum_nr_running == sds->min_nr_running &&
+            group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+               sds->group_min = group;
+               sds->min_nr_running = sgs->sum_nr_running;
+               sds->min_load_per_task = sgs->sum_weighted_load /
+                                               sgs->sum_nr_running;
+       }
  
-                               load = target_load(i, load_idx);
-                       } else {
-                               load = source_load(i, load_idx);
-                               if (load > max_cpu_load)
-                                       max_cpu_load = load;
-                               if (min_cpu_load > load)
-                                       min_cpu_load = load;
-                       }
+       /*
+        * Calculate the group which is almost near its
+        * capacity but still has some space to pick up some load
+        * from other group and save more power
+        */
+       if (sgs->sum_nr_running > sgs->group_capacity - 1)
+               return;
  
-                       avg_load += load;
-                       sum_nr_running += rq->nr_running;
-                       sum_weighted_load += weighted_cpuload(i);
+       if (sgs->sum_nr_running > sds->leader_nr_running ||
+           (sgs->sum_nr_running == sds->leader_nr_running &&
+            group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+               sds->group_leader = group;
+               sds->leader_nr_running = sgs->sum_nr_running;
+       }
+}
  
-                       sum_avg_load_per_task += cpu_avg_load_per_task(i);
-               }
+/**
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
+ * @sds: Variable containing the statistics of the sched_domain
+ *     under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that it's CPUs can be put to idle.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+{
+       if (!sds->power_savings_balance)
+               return 0;
  
-               /*
-                * First idle cpu or the first cpu(busiest) in this sched group
-                * is eligible for doing load balancing at this and above
-                * domains. In the newly idle case, we will allow all the cpu's
-                * to do the newly idle load balance.
-                */
-               if (idle != CPU_NEWLY_IDLE && local_group &&
-                   balance_cpu != this_cpu && balance) {
-                       *balance = 0;
-                       goto ret;
-               }
+       if (sds->this != sds->group_leader ||
+                       sds->group_leader == sds->group_min)
+               return 0;
  
-               total_load += avg_load;
-               total_pwr += group->__cpu_power;
+       *imbalance = sds->min_load_per_task;
+       sds->busiest = sds->group_min;
  
-               /* Adjust by relative CPU power of the group */
-               avg_load = sg_div_cpu_power(group,
-                               avg_load * SCHED_LOAD_SCALE);
+       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                       group_first_cpu(sds->group_leader);
+       }
  
+       return 1;
  
-               /*
-                * Consider the group unbalanced when the imbalance is larger
-                * than the average weight of two tasks.
-                *
-                * APZ: with cgroup the avg task weight can vary wildly and
-                *      might not be a suitable number - should we keep a
-                *      normalized nr_running number somewhere that negates
-                *      the hierarchy?
-                */
-               avg_load_per_task = sg_div_cpu_power(group,
-                               sum_avg_load_per_task * SCHED_LOAD_SCALE);
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+       return;
+}
+
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+       return;
+}
  
-               if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-                       __group_imb = 1;
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+{
+       return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
  
-               group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
  
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+                       enum cpu_idle_type idle, int load_idx, int *sd_idle,
+                       int local_group, const struct cpumask *cpus,
+                       int *balance, struct sg_lb_stats *sgs)
+{
+       unsigned long load, max_cpu_load, min_cpu_load;
+       int i;
+       unsigned int balance_cpu = -1, first_idle_cpu = 0;
+       unsigned long sum_avg_load_per_task;
+       unsigned long avg_load_per_task;
+
+       if (local_group)
+               balance_cpu = group_first_cpu(group);
+
+       /* Tally up the load of all CPUs in the group */
+       sum_avg_load_per_task = avg_load_per_task = 0;
+       max_cpu_load = 0;
+       min_cpu_load = ~0UL;
+
+       for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+               struct rq *rq = cpu_rq(i);
+
+               if (*sd_idle && rq->nr_running)
+                       *sd_idle = 0;
+
+               /* Bias balancing toward cpus of our domain */
                 if (local_group) {
-                       this_load = avg_load;
-                       this = group;
-                       this_nr_running = sum_nr_running;
-                       this_load_per_task = sum_weighted_load;
-               } else if (avg_load > max_load &&
-                          (sum_nr_running > group_capacity || __group_imb)) {
-                       max_load = avg_load;
-                       busiest = group;
-                       busiest_nr_running = sum_nr_running;
-                       busiest_load_per_task = sum_weighted_load;
-                       group_imb = __group_imb;
+                       if (idle_cpu(i) && !first_idle_cpu) {
+                               first_idle_cpu = 1;
+                               balance_cpu = i;
+                       }
+
+                       load = target_load(i, load_idx);
+               } else {
+                       load = source_load(i, load_idx);
+                       if (load > max_cpu_load)
+                               max_cpu_load = load;
+                       if (min_cpu_load > load)
+                               min_cpu_load = load;
                 }
  
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-               /*
-                * Busy processors will not participate in power savings
-                * balance.
-                */
-               if (idle == CPU_NOT_IDLE ||
-                               !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                       goto group_next;
+               sgs->group_load += load;
+               sgs->sum_nr_running += rq->nr_running;
+               sgs->sum_weighted_load += weighted_cpuload(i);
  
-               /*
-                * If the local group is idle or completely loaded
-                * no need to do power savings balance at this domain
-                */
-               if (local_group && (this_nr_running >= group_capacity ||
-                                   !this_nr_running))
-                       power_savings_balance = 0;
+               sum_avg_load_per_task += cpu_avg_load_per_task(i);
+       }
  
-               /*
-                * If a group is already running at full capacity or idle,
-                * don't include that group in power savings calculations
-                */
-               if (!power_savings_balance || sum_nr_running >= group_capacity
-                   || !sum_nr_running)
-                       goto group_next;
+       /*
+        * First idle cpu or the first cpu(busiest) in this sched group
+        * is eligible for doing load balancing at this and above
+        * domains. In the newly idle case, we will allow all the cpu's
+        * to do the newly idle load balance.
+        */
+       if (idle != CPU_NEWLY_IDLE && local_group &&
+           balance_cpu != this_cpu && balance) {
+               *balance = 0;
+               return;
+       }
  
-               /*
-                * Calculate the group which has the least non-idle load.
-                * This is the group from where we need to pick up the load
-                * for saving power
-                */
-               if ((sum_nr_running < min_nr_running) ||
-                   (sum_nr_running == min_nr_running &&
-                    cpumask_first(sched_group_cpus(group)) >
-                    cpumask_first(sched_group_cpus(group_min)))) {
-                       group_min = group;
-                       min_nr_running = sum_nr_running;
-                       min_load_per_task = sum_weighted_load /
-                                               sum_nr_running;
-               }
+       /* Adjust by relative CPU power of the group */
+       sgs->avg_load = sg_div_cpu_power(group,
+                       sgs->group_load * SCHED_LOAD_SCALE);
  
-               /*
-                * Calculate the group which is almost near its
-                * capacity but still has some space to pick up some load
-                * from other group and save more power
-                */
-               if (sum_nr_running <= group_capacity - 1) {
-                       if (sum_nr_running > leader_nr_running ||
-                           (sum_nr_running == leader_nr_running &&
-                            cpumask_first(sched_group_cpus(group)) <
-                            cpumask_first(sched_group_cpus(group_leader)))) {
-                               group_leader = group;
-                               leader_nr_running = sum_nr_running;
-                       }
+
+       /*
+        * Consider the group unbalanced when the imbalance is larger
+        * than the average weight of two tasks.
+        *
+        * APZ: with cgroup the avg task weight can vary wildly and
+        *      might not be a suitable number - should we keep a
+        *      normalized nr_running number somewhere that negates
+        *      the hierarchy?
+        */
+       avg_load_per_task = sg_div_cpu_power(group,
+                       sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+               sgs->group_imb = 1;
+
+       sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+}
+
+/**
+ * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+                       enum cpu_idle_type idle, int *sd_idle,
+                       const struct cpumask *cpus, int *balance,
+                       struct sd_lb_stats *sds)
+{
+       struct sched_group *group = sd->groups;
+       struct sg_lb_stats sgs;
+       int load_idx;
+
+       init_sd_power_savings_stats(sd, sds, idle);
+       load_idx = get_sd_load_idx(sd, idle);
+
+       do {
+               int local_group;
+
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
+               memset(&sgs, 0, sizeof(sgs));
+               update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+                               local_group, cpus, balance, &sgs);
+
+               if (local_group && balance && !(*balance))
+                       return;
+
+               sds->total_load += sgs.group_load;
+               sds->total_pwr += group->__cpu_power;
+
+               if (local_group) {
+                       sds->this_load = sgs.avg_load;
+                       sds->this = group;
+                       sds->this_nr_running = sgs.sum_nr_running;
+                       sds->this_load_per_task = sgs.sum_weighted_load;
+               } else if (sgs.avg_load > sds->max_load &&
+                          (sgs.sum_nr_running > sgs.group_capacity ||
+                               sgs.group_imb)) {
+                       sds->max_load = sgs.avg_load;
+                       sds->busiest = group;
+                       sds->busiest_nr_running = sgs.sum_nr_running;
+                       sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->group_imb = sgs.group_imb;
                 }
-group_next:
-#endif
+
+               update_sd_power_savings_stats(group, sds, local_group, &sgs);
                 group = group->next;
         } while (group != sd->groups);
  
-       if (!busiest || this_load >= max_load || busiest_nr_running == 0)
-               goto out_balanced;
-
-       avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+}
  
-       if (this_load >= avg_load ||
-                       100*max_load <= sd->imbalance_pct*this_load)
-               goto out_balanced;
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ *                     amongst the groups of a sched_domain, during
+ *                     load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+                               int this_cpu, unsigned long *imbalance)
+{
+       unsigned long tmp, pwr_now = 0, pwr_move = 0;
+       unsigned int imbn = 2;
+
+       if (sds->this_nr_running) {
+               sds->this_load_per_task /= sds->this_nr_running;
+               if (sds->busiest_load_per_task >
+                               sds->this_load_per_task)
+                       imbn = 1;
+       } else
+               sds->this_load_per_task =
+                       cpu_avg_load_per_task(this_cpu);
  
-       busiest_load_per_task /= busiest_nr_running;
-       if (group_imb)
-               busiest_load_per_task = min(busiest_load_per_task, avg_load);
+       if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+                       sds->busiest_load_per_task * imbn) {
+               *imbalance = sds->busiest_load_per_task;
+               return;
+       }
  
         /*
-        * We're trying to get all the cpus to the average_load, so we don't
-        * want to push ourselves above the average load, nor do we wish to
-        * reduce the max loaded cpu below the average load, as either of these
-        * actions would just result in more rebalancing later, and ping-pong
-        * tasks around. Thus we look for the minimum possible imbalance.
-        * Negative imbalances (*we* are more loaded than anyone else) will
-        * be counted as no imbalance for these purposes -- we can't fix that
-        * by pulling tasks to us. Be careful of negative numbers as they'll
-        * appear as very large values with unsigned longs.
+        * OK, we don't have enough imbalance to justify moving tasks,
+        * however we may be able to increase total CPU power used by
+        * moving them.
          */
-       if (max_load <= busiest_load_per_task)
-               goto out_balanced;
  
+       pwr_now += sds->busiest->__cpu_power *
+                       min(sds->busiest_load_per_task, sds->max_load);
+       pwr_now += sds->this->__cpu_power *
+                       min(sds->this_load_per_task, sds->this_load);
+       pwr_now /= SCHED_LOAD_SCALE;
+
+       /* Amount of load we'd subtract */
+       tmp = sg_div_cpu_power(sds->busiest,
+                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       if (sds->max_load > tmp)
+               pwr_move += sds->busiest->__cpu_power *
+                       min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+       /* Amount of load we'd add */
+       if (sds->max_load * sds->busiest->__cpu_power <
+               sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+               tmp = sg_div_cpu_power(sds->this,
+                       sds->max_load * sds->busiest->__cpu_power);
+       else
+               tmp = sg_div_cpu_power(sds->this,
+                       sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+       pwr_move += sds->this->__cpu_power *
+                       min(sds->this_load_per_task, sds->this_load + tmp);
+       pwr_move /= SCHED_LOAD_SCALE;
+
+       /* Move if we gain throughput */
+       if (pwr_move > pwr_now)
+               *imbalance = sds->busiest_load_per_task;
+}
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *                      groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+               unsigned long *imbalance)
+{
+       unsigned long max_pull;
         /*
          * In the presence of smp nice balancing, certain scenarios can have
          * max load less than avg load(as we skip the groups at or below
          * its cpu_power, while calculating max_load..)
          */
-       if (max_load < avg_load) {
+       if (sds->max_load < sds->avg_load) {
                 *imbalance = 0;
-               goto small_imbalance;
+               return fix_small_imbalance(sds, this_cpu, imbalance);
         }
  
         /* Don't want to pull so many tasks that a group would go idle */
-       max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+       max_pull = min(sds->max_load - sds->avg_load,
+                       sds->max_load - sds->busiest_load_per_task);
  
         /* How much load to actually move to equalise the imbalance */
-       *imbalance = min(max_pull * busiest->__cpu_power,
-                               (avg_load - this_load) * this->__cpu_power)
+       *imbalance = min(max_pull * sds->busiest->__cpu_power,
+               (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                         / SCHED_LOAD_SCALE;
  
         /*
@@ -3333,78 +3671,110 @@ group_next:
          * a think about bumping its value to force at least one task to be
          * moved
          */
-       if (*imbalance < busiest_load_per_task) {
-               unsigned long tmp, pwr_now, pwr_move;
-               unsigned int imbn;
-
-small_imbalance:
-               pwr_move = pwr_now = 0;
-               imbn = 2;
-               if (this_nr_running) {
-                       this_load_per_task /= this_nr_running;
-                       if (busiest_load_per_task > this_load_per_task)
-                               imbn = 1;
-               } else
-                       this_load_per_task = cpu_avg_load_per_task(this_cpu);
+       if (*imbalance < sds->busiest_load_per_task)
+               return fix_small_imbalance(sds, this_cpu, imbalance);
  
-               if (max_load - this_load + busiest_load_per_task >=
-                                       busiest_load_per_task * imbn) {
-                       *imbalance = busiest_load_per_task;
-                       return busiest;
-               }
+}
+/******* find_busiest_group() helpers end here *********************/
  
-               /*
-                * OK, we don't have enough imbalance to justify moving tasks,
-                * however we may be able to increase total CPU power used by
-                * moving them.
-                */
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *             be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *     is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:    - the busiest group if imbalance exists.
+ *             - If no imbalance and user has opted for power-savings balance,
+ *                return the least loaded group whose CPUs can be
+ *                put to idle by rebalancing its tasks onto our group.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+                  unsigned long *imbalance, enum cpu_idle_type idle,
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+       struct sd_lb_stats sds;
  
-               pwr_now += busiest->__cpu_power *
-                               min(busiest_load_per_task, max_load);
-               pwr_now += this->__cpu_power *
-                               min(this_load_per_task, this_load);
-               pwr_now /= SCHED_LOAD_SCALE;
-
-               /* Amount of load we'd subtract */
-               tmp = sg_div_cpu_power(busiest,
-                               busiest_load_per_task * SCHED_LOAD_SCALE);
-               if (max_load > tmp)
-                       pwr_move += busiest->__cpu_power *
-                               min(busiest_load_per_task, max_load - tmp);
-
-               /* Amount of load we'd add */
-               if (max_load * busiest->__cpu_power <
-                               busiest_load_per_task * SCHED_LOAD_SCALE)
-                       tmp = sg_div_cpu_power(this,
-                                       max_load * busiest->__cpu_power);
-               else
-                       tmp = sg_div_cpu_power(this,
-                               busiest_load_per_task * SCHED_LOAD_SCALE);
-               pwr_move += this->__cpu_power *
-                               min(this_load_per_task, this_load + tmp);
-               pwr_move /= SCHED_LOAD_SCALE;
+       memset(&sds, 0, sizeof(sds));
  
-               /* Move if we gain throughput */
-               if (pwr_move > pwr_now)
-                       *imbalance = busiest_load_per_task;
-       }
+       /*
+        * Compute the various statistics relavent for load balancing at
+        * this level.
+        */
+       update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+                                       balance, &sds);
+
+       /* Cases where imbalance does not exist from POV of this_cpu */
+       /* 1) this_cpu is not the appropriate cpu to perform load balancing
+        *    at this level.
+        * 2) There is no busy sibling group to pull from.
+        * 3) This group is the busiest group.
+        * 4) This group is more busy than the avg busieness at this
+        *    sched_domain.
+        * 5) The imbalance is within the specified limit.
+        * 6) Any rebalance would lead to ping-pong
+        */
+       if (balance && !(*balance))
+               goto ret;
  
-       return busiest;
+       if (!sds.busiest || sds.busiest_nr_running == 0)
+               goto out_balanced;
  
-out_balanced:
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-               goto ret;
+       if (sds.this_load >= sds.max_load)
+               goto out_balanced;
  
-       if (this == group_leader && group_leader != group_min) {
-               *imbalance = min_load_per_task;
-               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-                               cpumask_first(sched_group_cpus(group_leader));
-               }
-               return group_min;
-       }
-#endif
+       sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+
+       if (sds.this_load >= sds.avg_load)
+               goto out_balanced;
+
+       if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+               goto out_balanced;
+
+       sds.busiest_load_per_task /= sds.busiest_nr_running;
+       if (sds.group_imb)
+               sds.busiest_load_per_task =
+                       min(sds.busiest_load_per_task, sds.avg_load);
+
+       /*
+        * We're trying to get all the cpus to the average_load, so we don't
+        * want to push ourselves above the average load, nor do we wish to
+        * reduce the max loaded cpu below the average load, as either of these
+        * actions would just result in more rebalancing later, and ping-pong
+        * tasks around. Thus we look for the minimum possible imbalance.
+        * Negative imbalances (*we* are more loaded than anyone else) will
+        * be counted as no imbalance for these purposes -- we can't fix that
+        * by pulling tasks to us. Be careful of negative numbers as they'll
+        * appear as very large values with unsigned longs.
+        */
+       if (sds.max_load <= sds.busiest_load_per_task)
+               goto out_balanced;
+
+       /* Looks like there is an imbalance. Compute it */
+       calculate_imbalance(&sds, this_cpu, imbalance);
+       return sds.busiest;
+
+out_balanced:
+       /*
+        * There is no obvious imbalance. But check if we can do some balancing
+        * to save power.
+        */
+       if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+               return sds.busiest;
  ret:
         *imbalance = 0;
         return NULL;
@@ -4057,6 +4427,11 @@ static void run_rebalance_domains(struct softirq_action *h)
  #endif
  }
  
+static inline int on_null_domain(int cpu)
+{
+       return !rcu_dereference(cpu_rq(cpu)->sd);
+}
+
  /*
   * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
   *
@@ -4114,7 +4489,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
             cpumask_test_cpu(cpu, nohz.cpu_mask))
                 return;
  #endif
-       if (time_after_eq(jiffies, rq->next_balance))
+       /* Don't need to rebalance while attached to NULL domain */
+       if (time_after_eq(jiffies, rq->next_balance) &&
+           likely(!on_null_domain(cpu)))
                 raise_softirq(SCHED_SOFTIRQ);
  }
  
@@ -4508,11 +4885,33 @@ static inline void schedule_debug(struct task_struct *prev)
  #endif
  }
  
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+       if (prev->state == TASK_RUNNING) {
+               u64 runtime = prev->se.sum_exec_runtime;
+
+               runtime -= prev->se.prev_sum_exec_runtime;
+               runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+
+               /*
+                * In order to avoid avg_overlap growing stale when we are
+                * indeed overlapping and hence not getting put to sleep, grow
+                * the avg_overlap on preemption.
+                *
+                * We use the average preemption runtime because that
+                * correlates to the amount of cache footprint a task can
+                * build up.
+                */
+               update_avg(&prev->se.avg_overlap, runtime);
+       }
+       prev->sched_class->put_prev_task(rq, prev);
+}
+
  /*
   * Pick up the highest-prio task:
   */
  static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq)
  {
         const struct sched_class *class;
         struct task_struct *p;
@@ -4543,15 +4942,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
  /*
   * schedule() is the main scheduler function.
   */
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
  
-need_resched:
-       preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_qsctr_inc(cpu);
@@ -4586,8 +4983,8 @@ need_resched_nonpreemptible:
         if (unlikely(!rq->nr_running))
                 idle_balance(cpu, rq);
  
-       prev->sched_class->put_prev_task(rq, prev);
-       next = pick_next_task(rq, prev);
+       put_prev_task(rq, prev);
+       next = pick_next_task(rq);
  
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
@@ -4608,13 +5005,80 @@ need_resched_nonpreemptible:
  
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
+}
  
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+       preempt_disable();
+       __schedule();
         preempt_enable_no_resched();
         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                 goto need_resched;
  }
  EXPORT_SYMBOL(schedule);
  
+#ifdef CONFIG_SMP
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+{
+       unsigned int cpu;
+       struct rq *rq;
+
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       /*
+        * Need to access the cpu field knowing that
+        * DEBUG_PAGEALLOC could have unmapped it if
+        * the mutex owner just released it and exited.
+        */
+       if (probe_kernel_address(&owner->cpu, cpu))
+               goto out;
+#else
+       cpu = owner->cpu;
+#endif
+
+       /*
+        * Even if the access succeeded (likely case),
+        * the cpu field may no longer be valid.
+        */
+       if (cpu >= nr_cpumask_bits)
+               goto out;
+
+       /*
+        * We need to validate that we can do a
+        * get_cpu() and that we have the percpu area.
+        */
+       if (!cpu_online(cpu))
+               goto out;
+
+       rq = cpu_rq(cpu);
+
+       for (;;) {
+               /*
+                * Owner changed, break to re-assess state.
+                */
+               if (lock->owner != owner)
+                       break;
+
+               /*
+                * Is that owner really running on that cpu?
+                */
+               if (task_thread_info(rq->curr) != owner || need_resched())
+                       return 0;
+
+               cpu_relax();
+       }
+out:
+       return 1;
+}
+#endif
+
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@ -4642,7 +5106,7 @@ asmlinkage void __sched preempt_schedule(void)
                  * between schedule and now.
                  */
                 barrier();
-       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+       } while (need_resched());
  }
  EXPORT_SYMBOL(preempt_schedule);
  
@@ -4671,7 +5135,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
                  * between schedule and now.
                  */
                 barrier();
-       } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+       } while (need_resched());
  }
  
  #endif /* CONFIG_PREEMPT */
@@ -5145,7 +5609,7 @@ SYSCALL_DEFINE1(nice, int, increment)
         if (increment > 40)
                 increment = 40;
  
-       nice = PRIO_TO_NICE(current->static_prio) + increment;
+       nice = TASK_NICE(current) + increment;
         if (nice < -20)
                 nice = -20;
         if (nice > 19)
@@ -5944,12 +6408,7 @@ void sched_show_task(struct task_struct *p)
                 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
  #endif
  #ifdef CONFIG_DEBUG_STACK_USAGE
-       {
-               unsigned long *n = end_of_stack(p);
-               while (!*n)
-                       n++;
-               free = (unsigned long)n - (unsigned long)end_of_stack(p);
-       }
+       free = stack_not_used(p);
  #endif
         printk(KERN_CONT "%5lu %5d %6d\n", free,
                 task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -6423,7 +6882,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
                 if (!rq->nr_running)
                         break;
                 update_rq_clock(rq);
-               next = pick_next_task(rq, rq->curr);
+               next = pick_next_task(rq);
                 if (!next)
                         break;
                 next->sched_class->put_prev_task(rq, next);
@@ -8218,11 +8677,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
         __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       rt_rq->highest_prio = MAX_RT_PRIO;
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
  #endif
  #ifdef CONFIG_SMP
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
+       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
         rt_rq->rt_time = 0;
@@ -9490,7 +9953,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
  
  static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
         u64 data;
  
  #ifndef CONFIG_64BIT
@@ -9509,7 +9972,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
  
  static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
  {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
  
  #ifndef CONFIG_64BIT
         /*
@@ -9598,14 +10061,14 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
         struct cpuacct *ca;
         int cpu;
  
-       if (!cpuacct_subsys.active)
+       if (unlikely(!cpuacct_subsys.active))
                 return;
  
         cpu = task_cpu(tsk);
         ca = task_ca(tsk);
  
         for (; ca; ca = ca->parent) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
  }