Merge branches 'x86-build-for-linus', 'x86-cleanups-for-linus' and 'x86-debug-for...
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe29c267a0abe73828b02a799a737d..1211575a220895dab1c9c2eba9fa20bf688420ff 100644
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
                return;
 
        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+       if (delta < 0)
+               return;
        rq->clock += delta;
        update_rq_clock_task(rq, delta);
 }
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        char buf[64];
        char *cmp;
        int i;
+       struct inode *inode;
 
        if (cnt > 63)
                cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        buf[cnt] = 0;
        cmp = strstrip(buf);
 
+       /* Ensure the static_key remains in a consistent state */
+       inode = file_inode(filp);
+       mutex_lock(&inode->i_mutex);
        i = sched_feat_set(cmp);
+       mutex_unlock(&inode->i_mutex);
        if (i == __SCHED_FEAT_NR)
                return -EINVAL;
 
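For context, sched_feat_write() is the write handler behind the debugfs file /sys/kernel/debug/sched_features, so the i_mutex taken above serializes concurrent writers toggling scheduler features. A minimal userspace sketch of the interface this handler serves, assuming debugfs is mounted at its usual location and root privileges; the feature name is only an example (any entry listed in the file works, with a NO_ prefix to clear it):

#include <stdio.h>

int main(void)
{
        /* Toggle one scheduler feature off; reading the same file back
         * shows the current settings. */
        FILE *f = fopen("/sys/kernel/debug/sched_features", "w");

        if (!f)
                return 1;
        fputs("NO_GENTLE_FAIR_SLEEPERS", f);
        fclose(f);
        return 0;
}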
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 
 /*
- * resched_task - mark a task 'to be rescheduled now'.
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
  *
  * On UP this means the setting of the need_resched flag, on SMP it
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
-void resched_task(struct task_struct *p)
+void resched_curr(struct rq *rq)
 {
+       struct task_struct *curr = rq->curr;
        int cpu;
 
-       lockdep_assert_held(&task_rq(p)->lock);
+       lockdep_assert_held(&rq->lock);
 
-       if (test_tsk_need_resched(p))
+       if (test_tsk_need_resched(curr))
                return;
 
-       cpu = task_cpu(p);
+       cpu = cpu_of(rq);
 
        if (cpu == smp_processor_id()) {
-               set_tsk_need_resched(p);
+               set_tsk_need_resched(curr);
                set_preempt_need_resched();
                return;
        }
 
-       if (set_nr_and_not_polling(p))
+       if (set_nr_and_not_polling(curr))
                smp_send_reschedule(cpu);
        else
                trace_sched_wake_idle_without_ipi(cpu);
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
 
        if (!raw_spin_trylock_irqsave(&rq->lock, flags))
                return;
-       resched_task(cpu_curr(cpu));
+       resched_curr(rq);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
 
 static bool wake_up_full_nohz_cpu(int cpu)
 {
+       /*
+        * We just need the target to call irq_exit() and re-evaluate
+        * the next tick. The nohz full kick at least implies that.
+        * If needed we can still optimize that later with an
+        * empty IRQ.
+        */
        if (tick_nohz_full_cpu(cpu)) {
                if (cpu != smp_processor_id() ||
                    tick_nohz_tick_stopped())
-                       smp_send_reschedule(cpu);
+                       tick_nohz_full_kick_cpu(cpu);
                return true;
        }
 
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
 #ifdef CONFIG_NO_HZ_FULL
 bool sched_can_stop_tick(void)
 {
-       struct rq *rq;
-
-       rq = this_rq();
-
-       /* Make sure rq->nr_running update is visible after the IPI */
-       smp_rmb();
-
-       /* More than one running task need preemption */
-       if (rq->nr_running > 1)
-               return false;
+       /*
+        * More than one running task needs preemption.
+        * nr_running update is assumed to be visible
+        * after IPI is sent from wakers.
+        */
+       if (this_rq()->nr_running > 1)
+               return false;
 
-       return true;
+       return true;
 }
 #endif /* CONFIG_NO_HZ_FULL */
 
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
                        if (class == rq->curr->sched_class)
                                break;
                        if (class == p->sched_class) {
-                               resched_task(rq->curr);
+                               resched_curr(rq);
                                break;
                        }
                }
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
         */
        preempt_fold_need_resched();
 
-       if (llist_empty(&this_rq()->wake_list)
-                       && !tick_nohz_full_cpu(smp_processor_id())
-                       && !got_nohz_idle_kick())
+       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
                return;
 
        /*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
         * somewhat pessimize the simple resched case.
         */
        irq_enter();
-       tick_nohz_full_check();
        sched_ttwu_pending();
 
        /*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 {
        u64 ns = 0;
 
-       if (task_current(rq, p)) {
+       /*
+        * Must be ->curr _and_ ->on_rq.  If dequeued, we would
+        * project cycles that may never be accounted to this
+        * thread, breaking clock_gettime().
+        */
+       if (task_current(rq, p) && p->on_rq) {
                update_rq_clock(rq);
                ns = rq_clock_task(rq) - p->se.exec_start;
                if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
         * If we race with it leaving cpu, we'll take a lock. So we're correct.
         * If we race with it entering cpu, unaccounted time is 0. This is
         * indistinguishable from the read occurring a few cycles earlier.
+        * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+        * been accounted, so we're correct here as well.
         */
-       if (!p->on_cpu)
+       if (!p->on_cpu || !p->on_rq)
                return p->se.sum_exec_runtime;
 #endif
 
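The ->on_rq checks above protect the thread CPU-time clock reported to userspace: a task that is still rq->curr but already dequeued must not have unaccounted cycles projected into its runtime. A minimal, hedged userspace sketch of the interface involved (not part of the patch); CLOCK_THREAD_CPUTIME_ID reads are what ultimately funnel into task_sched_runtime() for the calling thread:

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* Read the calling thread's accumulated CPU time. */
        if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts))
                return 1;
        printf("thread cputime: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}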
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        }
 
        trace_sched_pi_setprio(p, prio);
-       p->pi_top_task = rt_mutex_get_top_task(p);
        oldprio = p->prio;
        prev_class = p->sched_class;
        on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         *          running task
         */
        if (dl_prio(prio)) {
-               if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
-                       dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+               struct task_struct *pi_task = rt_mutex_get_top_task(p);
+               if (!dl_prio(p->normal_prio) ||
+                   (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
                        p->dl.dl_throttled = 0;
                        enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
                 * lowered its priority, then reschedule its CPU:
                 */
                if (delta < 0 || (delta > 0 && task_running(rq, p)))
-                       resched_task(rq->curr);
+                       resched_curr(rq);
        }
 out_unlock:
        task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
        dl_se->dl_yielded = 0;
 }
 
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY        -1
+
 static void __setscheduler_params(struct task_struct *p,
                const struct sched_attr *attr)
 {
        int policy = attr->sched_policy;
 
-       if (policy == -1) /* setparam */
+       if (policy == SETPARAM_POLICY)
                policy = p->policy;
 
        p->policy = policy;
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
                .sched_nice     = PRIO_TO_NICE(p->static_prio),
        };
 
-       /*
-        * Fixup the legacy SCHED_RESET_ON_FORK hack
-        */
-       if (policy & SCHED_RESET_ON_FORK) {
+       /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+       if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
                attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
                policy &= ~SCHED_RESET_ON_FORK;
                attr.sched_policy = policy;
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
  */
 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
-       return do_sched_setscheduler(pid, -1, param);
+       return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
 }
 
 /**
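SETPARAM_POLICY just names the existing convention: sched_setparam() updates a task's scheduling parameters while leaving its policy untouched. A short, hedged userspace illustration of that contract; the priority value is arbitrary, and an RT policy (set earlier, e.g. via sched_setscheduler()) plus suitable privileges are assumed:

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* pid 0 means the calling thread; only the priority parameter
         * changes, the previously set policy is preserved. */
        if (sched_setparam(0, &sp)) {
                perror("sched_setparam");
                return 1;
        }
        return 0;
}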
@@ -4147,7 +4166,6 @@ static void __cond_resched(void)
 
 int __sched _cond_resched(void)
 {
-       rcu_cond_resched();
        if (should_resched()) {
                __cond_resched();
                return 1;
@@ -4166,18 +4184,15 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
-       bool need_rcu_resched = rcu_should_resched();
        int resched = should_resched();
        int ret = 0;
 
        lockdep_assert_held(lock);
 
-       if (spin_needbreak(lock) || resched || need_rcu_resched) {
+       if (spin_needbreak(lock) || resched) {
                spin_unlock(lock);
                if (resched)
                        __cond_resched();
-               else if (unlikely(need_rcu_resched))
-                       rcu_resched();
                else
                        cpu_relax();
                ret = 1;
@@ -4191,7 +4206,6 @@ int __sched __cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
 
-       rcu_cond_resched();  /* BH disabled OK, just recording QSes. */
        if (should_resched()) {
                local_bh_enable();
                __cond_resched();
@@ -4290,7 +4304,7 @@ again:
                 * fairness.
                 */
                if (preempt && rq != p_rq)
-                       resched_task(p_rq->curr);
+                       resched_curr(p_rq);
        }
 
 out_unlock:
@@ -6470,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                sched_domain_level_max = max(sched_domain_level_max, sd->level);
                child->parent = sd;
                sd->child = child;
+
+               if (!cpumask_subset(sched_domain_span(child),
+                                   sched_domain_span(sd))) {
+                       pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+                       pr_err("     the %s domain not a subset of the %s domain\n",
+                                       child->name, sd->name);
+#endif
+                       /* Fixup, ensure @sd has at least @child cpus. */
+                       cpumask_or(sched_domain_span(sd),
+                                  sched_domain_span(sd),
+                                  sched_domain_span(child));
+               }
+
        }
        set_domain_attribute(sd, attr);
 
@@ -7097,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
        __setscheduler(rq, p, &attr);
        if (on_rq) {
                enqueue_task(rq, p, 0);
-               resched_task(rq->curr);
+               resched_curr(rq);
        }
 
        check_class_changed(rq, p, prev_class, old_prio);
@@ -7808,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
        if (period > max_cfs_quota_period)
                return -EINVAL;
 
+       /*
+        * Prevent race between setting of cfs_rq->runtime_enabled and
+        * unthrottle_offline_cfs_rqs().
+        */
+       get_online_cpus();
        mutex_lock(&cfs_constraints_mutex);
        ret = __cfs_schedulable(tg, period, quota);
        if (ret)
@@ -7833,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
        }
        raw_spin_unlock_irq(&cfs_b->lock);
 
-       for_each_possible_cpu(i) {
+       for_each_online_cpu(i) {
                struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                struct rq *rq = cfs_rq->rq;
 
@@ -7849,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
                cfs_bandwidth_usage_dec();
 out_unlock:
        mutex_unlock(&cfs_constraints_mutex);
+       put_online_cpus();
 
        return ret;
 }
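get_online_cpus() above pins the online CPU mask so the for_each_online_cpu() walk cannot race with hotplug while runtime_enabled is being flipped. tg_set_cfs_bandwidth() is normally reached from writes to the cpu cgroup's bandwidth knobs; a hedged sketch of that userspace path, where the group name "demo" and the cgroup v1 mount point are assumptions:

#include <stdio.h>

static int write_u64(const char *path, unsigned long long val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%llu\n", val);
        return fclose(f);
}

int main(void)
{
        /* Cap the "demo" group at half a CPU: 50ms of quota per 100ms
         * period. The cpu.cfs_quota_us write ends up in
         * tg_set_cfs_bandwidth() via tg_set_cfs_quota(). */
        write_u64("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", 100000);
        write_u64("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", 50000);
        return 0;
}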
@@ -8088,7 +8122,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
-       .base_cftypes   = cpu_files,
+       .legacy_cftypes = cpu_files,
        .early_init     = 1,
 };