sched/core: Fix DEBUG_SPINLOCK annotation for rq->lock
[sfrench/cifs-2.6.git] / kernel / sched / core.c
index 644fa2e3d993b5ef1bd1c57e4286168daadf14ff..e7c535eee0a6d493a2a43eba210c08c6858b63d1 100644 (file)
@@ -508,7 +508,8 @@ void resched_cpu(int cpu)
        unsigned long flags;
 
        raw_spin_lock_irqsave(&rq->lock, flags);
-       resched_curr(rq);
+       if (cpu_online(cpu) || cpu == smp_processor_id())
+               resched_curr(rq);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -1629,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 
 #ifdef CONFIG_SMP
        if (cpu == rq->cpu) {
-               schedstat_inc(rq->ttwu_local);
-               schedstat_inc(p->se.statistics.nr_wakeups_local);
+               __schedstat_inc(rq->ttwu_local);
+               __schedstat_inc(p->se.statistics.nr_wakeups_local);
        } else {
                struct sched_domain *sd;
 
-               schedstat_inc(p->se.statistics.nr_wakeups_remote);
+               __schedstat_inc(p->se.statistics.nr_wakeups_remote);
                rcu_read_lock();
                for_each_domain(rq->cpu, sd) {
                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd->ttwu_wake_remote);
+                               __schedstat_inc(sd->ttwu_wake_remote);
                                break;
                        }
                }
@@ -1646,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
        }
 
        if (wake_flags & WF_MIGRATED)
-               schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+               __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
 #endif /* CONFIG_SMP */
 
-       schedstat_inc(rq->ttwu_count);
-       schedstat_inc(p->se.statistics.nr_wakeups);
+       __schedstat_inc(rq->ttwu_count);
+       __schedstat_inc(p->se.statistics.nr_wakeups);
 
        if (wake_flags & WF_SYNC)
-               schedstat_inc(p->se.statistics.nr_wakeups_sync);
+               __schedstat_inc(p->se.statistics.nr_wakeups_sync);
 }
 
 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2045,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         * If the owning (remote) CPU is still in the middle of schedule() with
         * this task as prev, wait until its done referencing the task.
         *
-        * Pairs with the smp_store_release() in finish_lock_switch().
+        * Pairs with the smp_store_release() in finish_task().
         *
         * This ensures that tasks getting woken will be fully ordered against
         * their previous state and preserve Program Order.
@@ -2056,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        p->state = TASK_WAKING;
 
        if (p->in_iowait) {
-               delayacct_blkio_end();
+               delayacct_blkio_end(p);
                atomic_dec(&task_rq(p)->nr_iowait);
        }
 
@@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 #else /* CONFIG_SMP */
 
        if (p->in_iowait) {
-               delayacct_blkio_end();
+               delayacct_blkio_end(p);
                atomic_dec(&task_rq(p)->nr_iowait);
        }
 
@@ -2122,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
 
        if (!task_on_rq_queued(p)) {
                if (p->in_iowait) {
-                       delayacct_blkio_end();
+                       delayacct_blkio_end(p);
                        atomic_dec(&rq->nr_iowait);
                }
                ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
@@ -2460,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
         * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
         * as we're not fully set-up yet.
         */
+       p->recent_used_cpu = task_cpu(p);
        __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
        rq = __task_rq_lock(p, &rf);
@@ -2571,6 +2573,62 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 
 #endif /* CONFIG_PREEMPT_NOTIFIERS */
 
+static inline void prepare_task(struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+       /*
+        * Claim the task as running, we do this before switching to it
+        * such that any running task will have this set.
+        */
+       next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_task(struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+       /*
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        *
+        * In particular, the load of prev->state in finish_task_switch() must
+        * happen before this.
+        *
+        * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+        */
+       smp_store_release(&prev->on_cpu, 0);
+#endif
+}
+
+static inline void
+prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
+{
+       /*
+        * Since the runqueue lock will be released by the next
+        * task (which is an invalid locking op but in the case
+        * of the scheduler it's an obvious special-case), so we
+        * do an early lockdep release here:
+        */
+       rq_unpin_lock(rq, rf);
+       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#ifdef CONFIG_DEBUG_SPINLOCK
+       /* this is a valid case when another task releases the spinlock */
+       rq->lock.owner = next;
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
+       /*
+        * If we are tracking spinlock dependencies then we have to
+        * fix up the runqueue lock - which gets 'carried over' from
+        * prev into current:
+        */
+       spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+       raw_spin_unlock_irq(&rq->lock);
+}
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -2591,7 +2649,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
        sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
-       prepare_lock_switch(rq, next);
+       prepare_task(next);
        prepare_arch_switch(next);
 }
 
@@ -2646,29 +2704,34 @@ static struct rq *finish_task_switch(struct task_struct *prev)
         * the scheduled task must drop that reference.
         *
         * We must observe prev->state before clearing prev->on_cpu (in
-        * finish_lock_switch), otherwise a concurrent wakeup can get prev
+        * finish_task), otherwise a concurrent wakeup can get prev
         * running on another CPU and we could rave with its RUNNING -> DEAD
         * transition, resulting in a double drop.
         */
        prev_state = prev->state;
        vtime_task_switch(prev);
        perf_event_task_sched_in(prev, current);
-       /*
-        * The membarrier system call requires a full memory barrier
-        * after storing to rq->curr, before going back to user-space.
-        *
-        * TODO: This smp_mb__after_unlock_lock can go away if PPC end
-        * up adding a full barrier to switch_mm(), or we should figure
-        * out if a smp_mb__after_unlock_lock is really the proper API
-        * to use.
-        */
-       smp_mb__after_unlock_lock();
-       finish_lock_switch(rq, prev);
+       finish_task(prev);
+       finish_lock_switch(rq);
        finish_arch_post_lock_switch();
 
        fire_sched_in_preempt_notifiers(current);
-       if (mm)
+       /*
+        * When switching through a kernel thread, the loop in
+        * membarrier_{private,global}_expedited() may have observed that
+        * kernel thread and not issued an IPI. It is therefore possible to
+        * schedule between user->kernel->user threads without passing though
+        * switch_mm(). Membarrier requires a barrier after storing to
+        * rq->curr, before returning to userspace, so provide them here:
+        *
+        * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+        *   provided by mmdrop(),
+        * - a sync_core for SYNC_CORE.
+        */
+       if (mm) {
+               membarrier_mm_sync_core_before_usermode(mm);
                mmdrop(mm);
+       }
        if (unlikely(prev_state == TASK_DEAD)) {
                if (prev->sched_class->task_dead)
                        prev->sched_class->task_dead(prev);
@@ -2772,6 +2835,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
         */
        arch_start_context_switch(prev);
 
+       /*
+        * If mm is non-NULL, we pass through switch_mm(). If mm is
+        * NULL, we will pass through mmdrop() in finish_task_switch().
+        * Both of these contain the full memory barrier required by
+        * membarrier after storing to rq->curr, before returning to
+        * user-space.
+        */
        if (!mm) {
                next->active_mm = oldmm;
                mmgrab(oldmm);
@@ -2786,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
-       /*
-        * Since the runqueue lock will be released by the next
-        * task (which is an invalid locking op but in the case
-        * of the scheduler it's an obvious special-case), so we
-        * do an early lockdep release here:
-        */
-       rq_unpin_lock(rq, rf);
-       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+       prepare_lock_switch(rq, next, rf);
 
        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);
@@ -3308,6 +3371,9 @@ static void __sched notrace __schedule(bool preempt)
         * Make sure that signal_pending_state()->signal_pending() below
         * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
         * done by the caller to avoid the race with signal_wake_up().
+        *
+        * The membarrier system call requires a full memory barrier
+        * after coming from user-space, before storing to rq->curr.
         */
        rq_lock(rq, &rf);
        smp_mb__after_spinlock();
@@ -3355,17 +3421,16 @@ static void __sched notrace __schedule(bool preempt)
                /*
                 * The membarrier system call requires each architecture
                 * to have a full memory barrier after updating
-                * rq->curr, before returning to user-space. For TSO
-                * (e.g. x86), the architecture must provide its own
-                * barrier in switch_mm(). For weakly ordered machines
-                * for which spin_unlock() acts as a full memory
-                * barrier, finish_lock_switch() in common code takes
-                * care of this barrier. For weakly ordered machines for
-                * which spin_unlock() acts as a RELEASE barrier (only
-                * arm64 and PowerPC), arm64 has a full barrier in
-                * switch_to(), and PowerPC has
-                * smp_mb__after_unlock_lock() before
-                * finish_lock_switch().
+                * rq->curr, before returning to user-space.
+                *
+                * Here are the schemes providing that barrier on the
+                * various architectures:
+                * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+                *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+                * - finish_lock_switch() for weakly-ordered
+                *   architectures where spin_unlock is a full barrier,
+                * - switch_to() for arm64 (weakly-ordered, spin_unlock
+                *   is a RELEASE barrier),
                 */
                ++*switch_count;
 
@@ -4040,8 +4105,7 @@ recheck:
                        return -EINVAL;
        }
 
-       if (attr->sched_flags &
-               ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
+       if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
                return -EINVAL;
 
        /*
@@ -4108,6 +4172,9 @@ recheck:
        }
 
        if (user) {
+               if (attr->sched_flags & SCHED_FLAG_SUGOV)
+                       return -EINVAL;
+
                retval = security_task_setscheduler(p);
                if (retval)
                        return retval;
@@ -4163,7 +4230,8 @@ change:
                }
 #endif
 #ifdef CONFIG_SMP
-               if (dl_bandwidth_enabled() && dl_policy(policy)) {
+               if (dl_bandwidth_enabled() && dl_policy(policy) &&
+                               !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
                        cpumask_t *span = rq->rd->span;
 
                        /*
@@ -4293,6 +4361,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+       return __sched_setscheduler(p, attr, false, true);
+}
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
@@ -4799,7 +4872,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
 
        ret = sched_getaffinity(pid, mask);
        if (ret == 0) {
-               size_t retlen = min_t(size_t, len, cpumask_size());
+               unsigned int retlen = min(len, cpumask_size());
 
                if (copy_to_user(user_mask_ptr, mask, retlen))
                        ret = -EFAULT;