Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 28 Sep 2019 19:39:07 +0000 (12:39 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 28 Sep 2019 19:39:07 +0000 (12:39 -0700)
Pull scheduler fixes from Ingo Molnar:

 - Apply a number of membarrier-related fixes and cleanups, which fix
   a use-after-free race in the membarrier code

 - Introduce proper RCU protection for tasks on the runqueue - to get
   rid of the subtle task_rcu_dereference() interface that was easy to
   get wrong

 - Misc fixes, but also an EAS speedup

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid redundant EAS calculation
  sched/core: Remove double update_max_interval() call on CPU startup
  sched/core: Fix preempt_schedule() interrupt return comment
  sched/fair: Fix -Wunused-but-set-variable warnings
  sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr()
  sched/membarrier: Return -ENOMEM to userspace on memory allocation failure
  sched/membarrier: Skip IPIs when mm->mm_users == 1
  selftests, sched/membarrier: Add multi-threaded test
  sched/membarrier: Fix p->mm->membarrier_state racy load
  sched/membarrier: Call sync_core only before usermode for same mm
  sched/membarrier: Remove redundant check
  sched/membarrier: Fix private expedited registration check
  tasks, sched/core: RCUify the assignment of rq->curr
  tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code
  tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue
  tasks: Add a count of task RCU users
  sched/core: Convert vcpu_is_preempted() from macro to an inline function
  sched/fair: Remove unused cfs_rq_clock_task() function

17 files changed:
fs/exec.c
include/linux/mm_types.h
include/linux/rcuwait.h
include/linux/sched.h
include/linux/sched/mm.h
include/linux/sched/task.h
kernel/exit.c
kernel/fork.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/membarrier.c
kernel/sched/sched.h
tools/testing/selftests/membarrier/.gitignore
tools/testing/selftests/membarrier/Makefile
tools/testing/selftests/membarrier/membarrier_test_impl.h [moved from tools/testing/selftests/membarrier/membarrier_test.c with 95% similarity]
tools/testing/selftests/membarrier/membarrier_test_multi_thread.c [new file with mode: 0644]
tools/testing/selftests/membarrier/membarrier_test_single_thread.c [new file with mode: 0644]

index f7f6a140856a8315c9403eeaac0c2376071e95ec..555e93c7dec82ae6b81db2c5402d838824e50c3c 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm)
        }
        task_lock(tsk);
        active_mm = tsk->active_mm;
+       membarrier_exec_mmap(mm);
        tsk->mm = mm;
        tsk->active_mm = mm;
        activate_mm(active_mm, mm);
@@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename,
        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
-       membarrier_execve(current);
        rseq_execve(current);
        acct_update_integrals(current);
        task_numa_free(current, false);
index 5183e0d77dfa6cf6fdcfb1c928bf315a61aec900..2222fa795284183344d603c5286fcfd8c40dffa3 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -383,6 +383,16 @@ struct mm_struct {
                unsigned long highest_vm_end;   /* highest vma end address */
                pgd_t * pgd;
 
+#ifdef CONFIG_MEMBARRIER
+               /**
+                * @membarrier_state: Flags controlling membarrier behavior.
+                *
+                * This field is close to @pgd to hopefully fit in the same
+                * cache-line, which needs to be touched by switch_mm().
+                */
+               atomic_t membarrier_state;
+#endif
+
                /**
                 * @mm_users: The number of users including userspace.
                 *
@@ -452,9 +462,7 @@ struct mm_struct {
                unsigned long flags; /* Must use atomic bitops to access */
 
                struct core_state *core_state; /* coredumping support */
-#ifdef CONFIG_MEMBARRIER
-               atomic_t membarrier_state;
-#endif
+
 #ifdef CONFIG_AIO
                spinlock_t                      ioctx_lock;
                struct kioctx_table __rcu       *ioctx_table;
index 563290fc194f247d92ab2cd430fcf99d8706f725..75c97e4bbc574794e9320e4a01085537dc16c698 100644 (file)
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -6,16 +6,11 @@
 
 /*
  * rcuwait provides a way of blocking and waking up a single
- * task in an rcu-safe manner; where it is forbidden to use
- * after exit_notify(). task_struct is not properly rcu protected,
- * unless dealing with rcu-aware lists, ie: find_task_by_*().
+ * task in an rcu-safe manner.
  *
- * Alternatively we have task_rcu_dereference(), but the return
- * semantics have different implications which would break the
- * wakeup side. The only time @task is non-nil is when a user is
- * blocked (or checking if it needs to) on a condition, and reset
- * as soon as we know that the condition has succeeded and are
- * awoken.
+ * The only time @task is non-nil is when a user is blocked (or
+ * checking if it needs to) on a condition, and reset as soon as we
+ * know that the condition has succeeded and are awoken.
  */
 struct rcuwait {
        struct task_struct __rcu *task;
@@ -37,13 +32,6 @@ extern void rcuwait_wake_up(struct rcuwait *w);
  */
 #define rcuwait_wait_event(w, condition)                               \
 ({                                                                     \
-       /*                                                              \
-        * Complain if we are called after do_exit()/exit_notify(),     \
-        * as we cannot rely on the rcu critical region for the         \
-        * wakeup side.                                                 \
-        */                                                             \
-       WARN_ON(current->exit_state);                                   \
-                                                                       \
        rcu_assign_pointer((w)->task, current);                         \
        for (;;) {                                                      \
                /*                                                      \
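
A minimal usage sketch of the rcuwait API after this simplification (not
part of the diff; 'done' is a hypothetical condition flag):

	struct rcuwait w;

	rcuwait_init(&w);

	/* Waiter: publishes 'current' in w->task and blocks on the condition. */
	rcuwait_wait_event(&w, READ_ONCE(done));

	/* Waker: dereferences w->task under RCU and wakes the waiter. */
	rcuwait_wake_up(&w);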
index 70db597d6fd4f2ceb53b1a71c0db243e681d7070..2c2e56bd8913250e88982ee8e47e36b23081fe23 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1130,7 +1130,10 @@ struct task_struct {
 
        struct tlbflush_unmap_batch     tlb_ubc;
 
-       struct rcu_head                 rcu;
+       union {
+               refcount_t              rcu_users;
+               struct rcu_head         rcu;
+       };
 
        /* Cache last used pipe for splice(): */
        struct pipe_inode_info          *splice_pipe;
@@ -1839,7 +1842,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
  * running or not.
  */
 #ifndef vcpu_is_preempted
-# define vcpu_is_preempted(cpu)        false
+static inline bool vcpu_is_preempted(int cpu)
+{
+       return false;
+}
 #endif
 
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
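
The macro-to-inline conversion also matters for compiler warnings: the old
macro discarded its argument at preprocessing time, so a hypothetical caller
like the sketch below had 'cpu' set but never used, tripping
-Wunused-but-set-variable; the inline function always evaluates it:

	int cpu = task_cpu(p);

	if (vcpu_is_preempted(cpu))	/* old macro expanded to plain 'false' */
		return;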
index 4a7944078cc35dddcea4963f1099e81d147c6936..e6770012db1819e3c8d2975ffbd1a9e84fdac372 100644 (file)
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -362,16 +362,16 @@ enum {
 
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
+       if (current->mm != mm)
+               return;
        if (likely(!(atomic_read(&mm->membarrier_state) &
                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
                return;
        sync_core_before_usermode();
 }
 
-static inline void membarrier_execve(struct task_struct *t)
-{
-       atomic_set(&t->mm->membarrier_state, 0);
-}
+extern void membarrier_exec_mmap(struct mm_struct *mm);
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -380,7 +380,7 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
 {
 }
 #endif
-static inline void membarrier_execve(struct task_struct *t)
+static inline void membarrier_exec_mmap(struct mm_struct *mm)
 {
 }
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
index 3d90ed8f75f0992e98d9169f087c74b8fd5a4d74..4b1c3b664f517e5e35e6f581282e432649f734d7 100644 (file)
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -119,7 +119,7 @@ static inline void put_task_struct(struct task_struct *t)
                __put_task_struct(t);
 }
 
-struct task_struct *task_rcu_dereference(struct task_struct **ptask);
+void put_task_struct_rcu_user(struct task_struct *task);
 
 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
 extern int arch_task_struct_size __read_mostly;
index 22ab6a4bdc513ce04c059b81bad920dd0308b2c9..a46a50d67002d9877b00c8052ba922e02e9740af 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
        put_task_struct(tsk);
 }
 
+void put_task_struct_rcu_user(struct task_struct *task)
+{
+       if (refcount_dec_and_test(&task->rcu_users))
+               call_rcu(&task->rcu, delayed_put_task_struct);
+}
 
 void release_task(struct task_struct *p)
 {
@@ -222,76 +227,13 @@ repeat:
 
        write_unlock_irq(&tasklist_lock);
        release_thread(p);
-       call_rcu(&p->rcu, delayed_put_task_struct);
+       put_task_struct_rcu_user(p);
 
        p = leader;
        if (unlikely(zap_leader))
                goto repeat;
 }
 
-/*
- * Note that if this function returns a valid task_struct pointer (!NULL)
- * task->usage must remain >0 for the duration of the RCU critical section.
- */
-struct task_struct *task_rcu_dereference(struct task_struct **ptask)
-{
-       struct sighand_struct *sighand;
-       struct task_struct *task;
-
-       /*
-        * We need to verify that release_task() was not called and thus
-        * delayed_put_task_struct() can't run and drop the last reference
-        * before rcu_read_unlock(). We check task->sighand != NULL,
-        * but we can read the already freed and reused memory.
-        */
-retry:
-       task = rcu_dereference(*ptask);
-       if (!task)
-               return NULL;
-
-       probe_kernel_address(&task->sighand, sighand);
-
-       /*
-        * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
-        * was already freed we can not miss the preceding update of this
-        * pointer.
-        */
-       smp_rmb();
-       if (unlikely(task != READ_ONCE(*ptask)))
-               goto retry;
-
-       /*
-        * We've re-checked that "task == *ptask", now we have two different
-        * cases:
-        *
-        * 1. This is actually the same task/task_struct. In this case
-        *    sighand != NULL tells us it is still alive.
-        *
-        * 2. This is another task which got the same memory for task_struct.
-        *    We can't know this of course, and we can not trust
-        *    sighand != NULL.
-        *
-        *    In this case we actually return a random value, but this is
-        *    correct.
-        *
-        *    If we return NULL - we can pretend that we actually noticed that
-        *    *ptask was updated when the previous task has exited. Or pretend
-        *    that probe_slab_address(&sighand) reads NULL.
-        *
-        *    If we return the new task (because sighand is not NULL for any
-        *    reason) - this is fine too. This (new) task can't go away before
-        *    another gp pass.
-        *
-        *    And note: We could even eliminate the false positive if re-read
-        *    task->sighand once again to avoid the falsely NULL. But this case
-        *    is very unlikely so we don't care.
-        */
-       if (!sighand)
-               return NULL;
-
-       return task;
-}
-
 void rcuwait_wake_up(struct rcuwait *w)
 {
        struct task_struct *task;
@@ -311,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w)
         */
        smp_mb(); /* (B) */
 
-       /*
-        * Avoid using task_rcu_dereference() magic as long as we are careful,
-        * see comment in rcuwait_wait_event() regarding ->exit_state.
-        */
        task = rcu_dereference(w->task);
        if (task)
                wake_up_process(task);
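
With tasks now guaranteed to stay valid for an RCU grace period after
leaving the runqueue, the task_rcu_dereference() callers converted below can
use plain RCU accessors; a sketch of the new pattern (mirroring the
task_numa_compare() change in kernel/sched/fair.c, with inspect_task() a
hypothetical consumer):

	rcu_read_lock();
	p = rcu_dereference(cpu_rq(cpu)->curr);
	if (p && !((p->flags & PF_EXITING) || is_idle_task(p)))
		inspect_task(p);
	rcu_read_unlock();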
index 60763c043aa3c7d71da8c3a1eaa78b4b0b634c68..f9572f416126283dd2e8ac6ce3bfd66899e58016 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -915,10 +915,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
                tsk->cpus_ptr = &tsk->cpus_mask;
 
        /*
-        * One for us, one for whoever does the "release_task()" (usually
-        * parent)
+        * One for the user space visible state that goes away when reaped.
+        * One for the scheduler.
         */
-       refcount_set(&tsk->usage, 2);
+       refcount_set(&tsk->rcu_users, 2);
+       /* One for the rcu users */
+       refcount_set(&tsk->usage, 1);
 #ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
 #endif
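
How the two counters now interact, as a sketch grounded in the hunks above
and in kernel/exit.c: rcu_users starts at 2 (one reference for the reaper,
one for the scheduler), while usage starts at 1 and is only dropped after
both rcu_users references are gone and a grace period has elapsed:

	/* release_task() (reaper) and finish_task_switch() (scheduler) each do: */
	if (refcount_dec_and_test(&p->rcu_users))
		call_rcu(&p->rcu, delayed_put_task_struct);

	/* delayed_put_task_struct(), one grace period later: */
	put_task_struct(tsk);	/* drops the initial usage of 1 */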
index f9a1346a5fa9502be6ca45ecb1bd822bf706725e..7880f4f64d0eea19dab03a0408625a505b4454ed 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        if (cpumask_equal(p->cpus_ptr, new_mask))
                goto out;
 
-       if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+       if (dest_cpu >= nr_cpu_ids) {
                ret = -EINVAL;
                goto out;
        }
@@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
 
-       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
        if (task_running(rq, p) || p->state == TASK_WAKING) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
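
The fix works because cpumask_any_and() returns a CPU number >= nr_cpu_ids
when the two masks do not intersect, so a single early call both validates
the request and picks a destination that is guaranteed valid by the time the
migration below runs (sketch of the resulting flow):

	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
	if (dest_cpu >= nr_cpu_ids)
		return -EINVAL;	/* empty intersection */

	/* dest_cpu is now always a valid, allowed CPU: */
	struct migration_arg arg = { p, dest_cpu };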
@@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                /* Task is done with its stack. */
                put_task_stack(prev);
 
-               put_task_struct(prev);
+               put_task_struct_rcu_user(prev);
        }
 
        tick_nohz_task_switch();
@@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
                else
                        prev->active_mm = NULL;
        } else {                                        // to user
+               membarrier_switch_mm(rq, prev->active_mm, next->mm);
                /*
                 * sys_membarrier() requires an smp_mb() between setting
-                * rq->curr and returning to userspace.
+                * rq->curr / membarrier_switch_mm() and returning to userspace.
                 *
                 * The below provides this either through switch_mm(), or in
                 * case 'prev->active_mm == next->mm' through
                 * finish_task_switch()'s mmdrop().
                 */
-
                switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
                if (!prev->mm) {                        // from kernel
@@ -4042,7 +4042,11 @@ static void __sched notrace __schedule(bool preempt)
 
        if (likely(prev != next)) {
                rq->nr_switches++;
-               rq->curr = next;
+               /*
+                * RCU users of rcu_dereference(rq->curr) may not see
+                * changes to task_struct made by pick_next_task().
+                */
+               RCU_INIT_POINTER(rq->curr, next);
                /*
                 * The membarrier system call requires each architecture
                 * to have a full memory barrier after updating
@@ -4223,9 +4227,8 @@ static void __sched notrace preempt_schedule_common(void)
 
 #ifdef CONFIG_PREEMPTION
 /*
- * this is the entry point to schedule() from in-kernel preemption
- * off of preempt_enable. Kernel preemptions off return from interrupt
- * occur there and call schedule directly.
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable.
  */
 asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
@@ -4296,7 +4299,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 #endif /* CONFIG_PREEMPTION */
 
 /*
- * this is the entry point to schedule() from kernel preemption
+ * This is the entry point to schedule() from kernel preemption
  * off of irq context.
 * Note that this is called and returns with irqs disabled. This will
  * protect us against recursive calling from irq.
@@ -6069,7 +6072,8 @@ void init_idle(struct task_struct *idle, int cpu)
        __set_task_cpu(idle, cpu);
        rcu_read_unlock();
 
-       rq->curr = rq->idle = idle;
+       rq->idle = idle;
+       rcu_assign_pointer(rq->curr, idle);
        idle->on_rq = TASK_ON_RQ_QUEUED;
 #ifdef CONFIG_SMP
        idle->on_cpu = 1;
@@ -6430,8 +6434,6 @@ int sched_cpu_activate(unsigned int cpu)
        }
        rq_unlock_irqrestore(rq, &rf);
 
-       update_max_interval();
-
        return 0;
 }
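
A note on the two publication styles above, as a sketch of the reasoning
rather than new code: __schedule() deliberately uses RCU_INIT_POINTER()
because, per the added comment, readers get no ordering guarantee against
pick_next_task()'s updates anyway (the barriers membarrier needs come from
switch_mm() or mmdrop()), while init_idle() publishes the fully initialized
idle task with release semantics:

	RCU_INIT_POINTER(rq->curr, next);	/* __schedule(): no ordering implied */
	rcu_assign_pointer(rq->curr, idle);	/* init_idle(): release store */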
 
index d4bbf68c31611fcd6fa3da456ef435021cefae53..83ab35e2374f632e0211029d7e1baa70150c388a 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se)
        /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
 }
 
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 static void attach_entity_cfs_rq(struct sched_entity *se);
 
 /*
@@ -1603,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env,
                return;
 
        rcu_read_lock();
-       cur = task_rcu_dereference(&dst_rq->curr);
+       cur = rcu_dereference(dst_rq->curr);
        if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
                cur = NULL;
 
@@ -4354,21 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void)
 }
 
 /*
- * Replenish runtime according to assigned quota and update expiration time.
- * We use sched_clock_cpu directly instead of rq->clock to avoid adding
- * additional synchronization around rq->lock.
+ * Replenish runtime according to assigned quota. We use sched_clock_cpu
+ * directly instead of rq->clock to avoid adding additional synchronization
+ * around rq->lock.
  *
  * requires cfs_b->lock
  */
 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 {
-       u64 now;
-
-       if (cfs_b->quota == RUNTIME_INF)
-               return;
-
-       now = sched_clock_cpu(smp_processor_id());
-       cfs_b->runtime = cfs_b->quota;
+       if (cfs_b->quota != RUNTIME_INF)
+               cfs_b->runtime = cfs_b->quota;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4376,15 +4370,6 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
        return &tg->cfs_bandwidth;
 }
 
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
-       if (unlikely(cfs_rq->throttle_count))
-               return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
-
-       return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
-}
-
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -4476,7 +4461,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 
        cfs_rq->throttle_count--;
        if (!cfs_rq->throttle_count) {
-               /* adjust cfs_rq_clock_task() */
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
 
@@ -4994,15 +4978,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-       u64 overrun;
-
        lockdep_assert_held(&cfs_b->lock);
 
        if (cfs_b->period_active)
                return;
 
        cfs_b->period_active = 1;
-       overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+       hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
@@ -5080,11 +5062,6 @@ static inline bool cfs_bandwidth_used(void)
        return false;
 }
 
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
-{
-       return rq_clock_task(rq_of(cfs_rq));
-}
-
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
@@ -6412,7 +6389,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                }
 
                /* Evaluate the energy impact of using this CPU. */
-               if (max_spare_cap_cpu >= 0) {
+               if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
                        cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
                        cur_delta -= base_energy_pd;
                        if (cur_delta < best_delta) {
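
The EAS speedup relies on prev_cpu's energy delta already being computed
once per performance domain earlier in find_energy_efficient_cpu(), so
costing it again as max_spare_cap_cpu would be redundant; a simplified
sketch of the surrounding logic (not the full loop):

	if (cpu == prev_cpu) {
		prev_delta = compute_energy(p, prev_cpu, pd);
		prev_delta -= base_energy_pd;
		best_delta = min(best_delta, prev_delta);
	}

	/* Only cost a candidate that was not already costed as prev_cpu: */
	if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu)
		cur_delta = compute_energy(p, max_spare_cap_cpu, pd);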
index aa8d7580410887d319623353c88b37dfe54d7645..a39bed2c784f42be244082e5d80320f72e9d8daf 100644 (file)
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,10 +30,42 @@ static void ipi_mb(void *info)
        smp_mb();       /* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_rq_state(void *info)
+{
+       struct mm_struct *mm = (struct mm_struct *) info;
+
+       if (current->mm != mm)
+               return;
+       this_cpu_write(runqueues.membarrier_state,
+                      atomic_read(&mm->membarrier_state));
+       /*
+        * Issue a memory barrier after setting
+        * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+        * guarantee that no memory access following registration is reordered
+        * before registration.
+        */
+       smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+       /*
+        * Issue a memory barrier before clearing membarrier_state to
+        * guarantee that no memory access prior to exec is reordered after
+        * clearing this state.
+        */
+       smp_mb();
+       atomic_set(&mm->membarrier_state, 0);
+       /*
+        * Keep the runqueue membarrier_state in sync with this mm
+        * membarrier_state.
+        */
+       this_cpu_write(runqueues.membarrier_state, 0);
+}
+
 static int membarrier_global_expedited(void)
 {
        int cpu;
-       bool fallback = false;
        cpumask_var_t tmpmask;
 
        if (num_online_cpus() == 1)
@@ -45,17 +77,11 @@ static int membarrier_global_expedited(void)
         */
        smp_mb();       /* system call entry is not a mb. */
 
-       /*
-        * Expedited membarrier commands guarantee that they won't
-        * block, hence the GFP_NOWAIT allocation flag and fallback
-        * implementation.
-        */
-       if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
-               /* Fallback for OOM. */
-               fallback = true;
-       }
+       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               return -ENOMEM;
 
        cpus_read_lock();
+       rcu_read_lock();
        for_each_online_cpu(cpu) {
                struct task_struct *p;
 
@@ -70,23 +96,28 @@ static int membarrier_global_expedited(void)
                if (cpu == raw_smp_processor_id())
                        continue;
 
-               rcu_read_lock();
-               p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-               if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-                                  MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
-                       if (!fallback)
-                               __cpumask_set_cpu(cpu, tmpmask);
-                       else
-                               smp_call_function_single(cpu, ipi_mb, NULL, 1);
-               }
-               rcu_read_unlock();
-       }
-       if (!fallback) {
-               preempt_disable();
-               smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
-               preempt_enable();
-               free_cpumask_var(tmpmask);
+               if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+                   MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+                       continue;
+
+               /*
+                * Skip the CPU if it runs a kernel thread. The scheduler
+                * leaves the prior task mm in place as an optimization when
+                * scheduling a kthread.
+                */
+               p = rcu_dereference(cpu_rq(cpu)->curr);
+               if (p->flags & PF_KTHREAD)
+                       continue;
+
+               __cpumask_set_cpu(cpu, tmpmask);
        }
+       rcu_read_unlock();
+
+       preempt_disable();
+       smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+       preempt_enable();
+
+       free_cpumask_var(tmpmask);
        cpus_read_unlock();
 
        /*
@@ -101,22 +132,22 @@ static int membarrier_global_expedited(void)
 static int membarrier_private_expedited(int flags)
 {
        int cpu;
-       bool fallback = false;
        cpumask_var_t tmpmask;
+       struct mm_struct *mm = current->mm;
 
        if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
-               if (!(atomic_read(&current->mm->membarrier_state) &
+               if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
                        return -EPERM;
        } else {
-               if (!(atomic_read(&current->mm->membarrier_state) &
+               if (!(atomic_read(&mm->membarrier_state) &
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
                        return -EPERM;
        }
 
-       if (num_online_cpus() == 1)
+       if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
                return 0;
 
        /*
@@ -125,17 +156,11 @@ static int membarrier_private_expedited(int flags)
         */
        smp_mb();       /* system call entry is not a mb. */
 
-       /*
-        * Expedited membarrier commands guarantee that they won't
-        * block, hence the GFP_NOWAIT allocation flag and fallback
-        * implementation.
-        */
-       if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
-               /* Fallback for OOM. */
-               fallback = true;
-       }
+       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               return -ENOMEM;
 
        cpus_read_lock();
+       rcu_read_lock();
        for_each_online_cpu(cpu) {
                struct task_struct *p;
 
@@ -150,21 +175,17 @@ static int membarrier_private_expedited(int flags)
                if (cpu == raw_smp_processor_id())
                        continue;
-               rcu_read_lock();
-               p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-               if (p && p->mm == current->mm) {
-                       if (!fallback)
-                               __cpumask_set_cpu(cpu, tmpmask);
-                       else
-                               smp_call_function_single(cpu, ipi_mb, NULL, 1);
-               }
-               rcu_read_unlock();
-       }
-       if (!fallback) {
-               preempt_disable();
-               smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
-               preempt_enable();
-               free_cpumask_var(tmpmask);
+               p = rcu_dereference(cpu_rq(cpu)->curr);
+               if (p && p->mm == mm)
+                       __cpumask_set_cpu(cpu, tmpmask);
        }
+       rcu_read_unlock();
+
+       preempt_disable();
+       smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+       preempt_enable();
+
+       free_cpumask_var(tmpmask);
        cpus_read_unlock();
 
        /*
@@ -177,32 +198,78 @@ static int membarrier_private_expedited(int flags)
        return 0;
 }
 
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+       int membarrier_state = atomic_read(&mm->membarrier_state);
+       cpumask_var_t tmpmask;
+       int cpu;
+
+       if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+               this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+               /*
+                * For single mm user, we can simply issue a memory barrier
+                * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+                * mm and in the current runqueue to guarantee that no memory
+                * access following registration is reordered before
+                * registration.
+                */
+               smp_mb();
+               return 0;
+       }
+
+       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               return -ENOMEM;
+
+       /*
+        * For mm with multiple users, we need to ensure all future
+        * scheduler executions will observe @mm's new membarrier
+        * state.
+        */
+       synchronize_rcu();
+
+       /*
+        * For each cpu runqueue, if the task's mm matches @mm, ensure that
+        * all @mm's membarrier state set bits are also set in the runqueue's
+        * membarrier state. This ensures that a runqueue scheduling
+        * between threads which are users of @mm has its membarrier state
+        * updated.
+        */
+       cpus_read_lock();
+       rcu_read_lock();
+       for_each_online_cpu(cpu) {
+               struct rq *rq = cpu_rq(cpu);
+               struct task_struct *p;
+
+               p = rcu_dereference(rq->curr);
+               if (p && p->mm == mm)
+                       __cpumask_set_cpu(cpu, tmpmask);
+       }
+       rcu_read_unlock();
+
+       preempt_disable();
+       smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+       preempt_enable();
+
+       free_cpumask_var(tmpmask);
+       cpus_read_unlock();
+
+       return 0;
+}
+
 static int membarrier_register_global_expedited(void)
 {
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
+       int ret;
 
        if (atomic_read(&mm->membarrier_state) &
            MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
                return 0;
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
-       if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
-               /*
-                * For single mm user, single threaded process, we can
-                * simply issue a memory barrier after setting
-                * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
-                * no memory access following registration is reordered
-                * before registration.
-                */
-               smp_mb();
-       } else {
-               /*
-                * For multi-mm user threads, we need to ensure all
-                * future scheduler executions will observe the new
-                * thread flag state for this mm.
-                */
-               synchronize_rcu();
-       }
+       ret = sync_runqueues_membarrier_state(mm);
+       if (ret)
+               return ret;
        atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
                  &mm->membarrier_state);
 
@@ -213,12 +280,15 @@ static int membarrier_register_private_expedited(int flags)
 {
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
-       int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+       int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+           set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+           ret;
 
        if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
                if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
                        return -EINVAL;
-               state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+               ready_state =
+                       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
        }
 
        /*
@@ -226,20 +296,15 @@ static int membarrier_register_private_expedited(int flags)
         * groups, which use the same mm. (CLONE_VM but not
         * CLONE_THREAD).
         */
-       if (atomic_read(&mm->membarrier_state) & state)
+       if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
                return 0;
-       atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
        if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-               atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-                         &mm->membarrier_state);
-       if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
-               /*
-                * Ensure all future scheduler executions will observe the
-                * new thread flag state for this process.
-                */
-               synchronize_rcu();
-       }
-       atomic_or(state, &mm->membarrier_state);
+               set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+       atomic_or(set_state, &mm->membarrier_state);
+       ret = sync_runqueues_membarrier_state(mm);
+       if (ret)
+               return ret;
+       atomic_or(ready_state, &mm->membarrier_state);
 
        return 0;
 }
@@ -253,8 +318,10 @@ static int membarrier_register_private_expedited(int flags)
  * command specified does not exist, not available on the running
  * kernel, or if the command argument is invalid, this system call
  * returns -EINVAL. For a given command, with flags argument set to 0,
- * this system call is guaranteed to always return the same value until
- * reboot.
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
  *
  * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
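
The updated contract is visible from userspace; a minimal sketch of a
registration-then-barrier flow that now has to handle -ENOMEM (it mirrors
the selftests below; the sys_membarrier() wrapper is the usual syscall(2)
idiom, not a libc function):

	#define _GNU_SOURCE
	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <errno.h>
	#include <stdio.h>

	static int sys_membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	int main(void)
	{
		if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
			if (errno == ENOMEM)	/* new with this series */
				perror("membarrier register");
			return 1;
		}
		/* Expedited barrier across all threads of this mm: */
		sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
		return 0;
	}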
index b3cb895d14a2088eef7a8853f00e7fc9b5823dd6..0db2c1b3361e03077d3971876f9dc06cf9bbb027 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -911,6 +911,10 @@ struct rq {
 
        atomic_t                nr_iowait;
 
+#ifdef CONFIG_MEMBARRIER
+       int membarrier_state;
+#endif
+
 #ifdef CONFIG_SMP
        struct root_domain              *rd;
        struct sched_domain __rcu       *sd;
@@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void)
 static inline bool sched_energy_enabled(void) { return false; }
 
 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+                                       struct mm_struct *prev_mm,
+                                       struct mm_struct *next_mm)
+{
+       int membarrier_state;
+
+       if (prev_mm == next_mm)
+               return;
+
+       membarrier_state = atomic_read(&next_mm->membarrier_state);
+       if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+               return;
+
+       WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_switch_mm(struct rq *rq,
+                                       struct mm_struct *prev_mm,
+                                       struct mm_struct *next_mm)
+{
+}
+#endif
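
How the cached per-runqueue state is consumed, taken from the
membarrier_global_expedited() hunk above: the IPI decision reads
rq->membarrier_state instead of dereferencing the remote task's mm:

	if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
	      MEMBARRIER_STATE_GLOBAL_EXPEDITED))
		continue;	/* no registered mm running here; skip the IPI */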
index 020c44f49a9e675e2419b9d50dc689206887de14..f2f7ec0a99b4d2aa6761112ee1a31489197aaa3c 100644 (file)
--- a/tools/testing/selftests/membarrier/.gitignore
+++ b/tools/testing/selftests/membarrier/.gitignore
@@ -1 +1,2 @@
-membarrier_test
+membarrier_test_multi_thread
+membarrier_test_single_thread
index 97e3bdf3d1e96f27f10bcef4e10a4dd373b6f602..34d1c81a2324a692193e5b16f9ec09f86a1a0c46 100644 (file)
--- a/tools/testing/selftests/membarrier/Makefile
+++ b/tools/testing/selftests/membarrier/Makefile
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0-only
 CFLAGS += -g -I../../../../usr/include/
+LDLIBS += -lpthread
 
-TEST_GEN_PROGS := membarrier_test
+TEST_GEN_PROGS := membarrier_test_single_thread \
+               membarrier_test_multi_thread
 
 include ../lib.mk
-
similarity index 95%
rename from tools/testing/selftests/membarrier/membarrier_test.c
rename to tools/testing/selftests/membarrier/membarrier_test_impl.h
index 70b4ddbf126b059e3336d0e25f74409b59fe52fb..186be69f0a59b1855e6e8d48ba60d96e3602036a 100644 (file)
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test_impl.h
@@ -1,10 +1,11 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 #define _GNU_SOURCE
 #include <linux/membarrier.h>
 #include <syscall.h>
 #include <stdio.h>
 #include <errno.h>
 #include <string.h>
+#include <pthread.h>
 
 #include "../kselftest.h"
 
@@ -223,7 +224,7 @@ static int test_membarrier_global_expedited_success(void)
        return 0;
 }
 
-static int test_membarrier(void)
+static int test_membarrier_fail(void)
 {
        int status;
 
@@ -233,10 +234,27 @@ static int test_membarrier(void)
        status = test_membarrier_flags_fail();
        if (status)
                return status;
-       status = test_membarrier_global_success();
+       status = test_membarrier_private_expedited_fail();
        if (status)
                return status;
-       status = test_membarrier_private_expedited_fail();
+       status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);
+       if (status < 0) {
+               ksft_test_result_fail("sys_membarrier() failed\n");
+               return status;
+       }
+       if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+               status = test_membarrier_private_expedited_sync_core_fail();
+               if (status)
+                       return status;
+       }
+       return 0;
+}
+
+static int test_membarrier_success(void)
+{
+       int status;
+
+       status = test_membarrier_global_success();
        if (status)
                return status;
        status = test_membarrier_register_private_expedited_success();
@@ -251,9 +269,6 @@ static int test_membarrier(void)
                return status;
        }
        if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
-               status = test_membarrier_private_expedited_sync_core_fail();
-               if (status)
-                       return status;
                status = test_membarrier_register_private_expedited_sync_core_success();
                if (status)
                        return status;
@@ -300,14 +315,3 @@ static int test_membarrier_query(void)
        ksft_test_result_pass("sys_membarrier available\n");
        return 0;
 }
-
-int main(int argc, char **argv)
-{
-       ksft_print_header();
-       ksft_set_plan(13);
-
-       test_membarrier_query();
-       test_membarrier();
-
-       return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
new file mode 100644 (file)
index 0000000..ac5613e
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/membarrier.h>
+#include <syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "membarrier_test_impl.h"
+
+static int thread_ready, thread_quit;
+static pthread_mutex_t test_membarrier_thread_mutex =
+       PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t test_membarrier_thread_cond =
+       PTHREAD_COND_INITIALIZER;
+
+void *test_membarrier_thread(void *arg)
+{
+       pthread_mutex_lock(&test_membarrier_thread_mutex);
+       thread_ready = 1;
+       pthread_cond_broadcast(&test_membarrier_thread_cond);
+       pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+       pthread_mutex_lock(&test_membarrier_thread_mutex);
+       while (!thread_quit)
+               pthread_cond_wait(&test_membarrier_thread_cond,
+                                 &test_membarrier_thread_mutex);
+       pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+       return NULL;
+}
+
+static int test_mt_membarrier(void)
+{
+       int i;
+       pthread_t test_thread;
+
+       pthread_create(&test_thread, NULL,
+                      test_membarrier_thread, NULL);
+
+       pthread_mutex_lock(&test_membarrier_thread_mutex);
+       while (!thread_ready)
+               pthread_cond_wait(&test_membarrier_thread_cond,
+                                 &test_membarrier_thread_mutex);
+       pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+       test_membarrier_fail();
+
+       test_membarrier_success();
+
+       pthread_mutex_lock(&test_membarrier_thread_mutex);
+       thread_quit = 1;
+       pthread_cond_broadcast(&test_membarrier_thread_cond);
+       pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+       pthread_join(test_thread, NULL);
+
+       return 0;
+}
+
+int main(int argc, char **argv)
+{
+       ksft_print_header();
+       ksft_set_plan(13);
+
+       test_membarrier_query();
+
+       /* Multi-threaded */
+       test_mt_membarrier();
+
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
new file mode 100644 (file)
index 0000000..c1c9639
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/membarrier.h>
+#include <syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "membarrier_test_impl.h"
+
+int main(int argc, char **argv)
+{
+       ksft_print_header();
+       ksft_set_plan(13);
+
+       test_membarrier_query();
+
+       test_membarrier_fail();
+
+       test_membarrier_success();
+
+       return ksft_exit_pass();
+}