Merge branch 'linus' into sched/urgent, to resolve conflicts
author Ingo Molnar <mingo@kernel.org>
Tue, 6 Feb 2018 20:12:31 +0000 (21:12 +0100)
committer Ingo Molnar <mingo@kernel.org>
Tue, 6 Feb 2018 20:12:31 +0000 (21:12 +0100)
 Conflicts:
arch/arm64/kernel/entry.S
arch/x86/Kconfig
include/linux/sched/mm.h
kernel/fork.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
25 files changed:
MAINTAINERS
arch/arm64/Kconfig
arch/arm64/kernel/entry.S
arch/powerpc/Kconfig
arch/powerpc/include/asm/membarrier.h [new file with mode: 0644]
arch/powerpc/mm/mmu_context.c
arch/x86/Kconfig
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64.S
arch/x86/include/asm/sync_core.h [new file with mode: 0644]
arch/x86/mm/tlb.c
include/linux/sched.h
include/linux/sched/mm.h
include/linux/sync_core.h [new file with mode: 0644]
include/uapi/linux/membarrier.h
init/Kconfig
kernel/fork.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/membarrier.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stats.h
kernel/sched/topology.c
tools/testing/selftests/membarrier/membarrier_test.c

index 836f6c7bbf2d50c67bb56e5a1a882f01d9d98a41..a284684b1463ce6c8aabb01d62dde836acf58b7d 100644 (file)
@@ -9018,6 +9018,7 @@ L:        linux-kernel@vger.kernel.org
 S:     Supported
 F:     kernel/sched/membarrier.c
 F:     include/uapi/linux/membarrier.h
+F:     arch/powerpc/include/asm/membarrier.h
 
 MEMORY MANAGEMENT
 L:     linux-mm@kvack.org
index 53612879fe567022a74ad989b48e664f7ac99013..7381eeb7ef8e40197ccdd4de2c61386ea8409110 100644 (file)
@@ -16,6 +16,7 @@ config ARM64
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
        select ARCH_HAS_KCOV
+       select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_SET_MEMORY
        select ARCH_HAS_SG_CHAIN
        select ARCH_HAS_STRICT_KERNEL_RWX
index b34e717d75970cba4f646d204265d5c0d84e89c0..cccd2788e63195199b67bf1006d7f5ea9a056702 100644 (file)
@@ -324,6 +324,10 @@ alternative_else_nop_endif
        ldp     x28, x29, [sp, #16 * 14]
        ldr     lr, [sp, #S_LR]
        add     sp, sp, #S_FRAME_SIZE           // restore sp
+       /*
+        * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on eret context synchronization
+        * when returning from an IPI handler and when returning to user-space.
+        */
 
        .if     \el == 0
 alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
index 9d3329811cc17f7a1d9a4ca8941b9462cf531047..73ce5dd076420720822890ab8425632d3632870c 100644 (file)
@@ -141,6 +141,7 @@ config PPC
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_PHYS_TO_DMA
        select ARCH_HAS_PMEM_API                if PPC64
+       select ARCH_HAS_MEMBARRIER_CALLBACKS
        select ARCH_HAS_SCALED_CPUTIME          if VIRT_CPU_ACCOUNTING_NATIVE
        select ARCH_HAS_SG_CHAIN
        select ARCH_HAS_STRICT_KERNEL_RWX       if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
new file mode 100644 (file)
index 0000000..6e20bb5
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef _ASM_POWERPC_MEMBARRIER_H
+#define _ASM_POWERPC_MEMBARRIER_H
+
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+                                            struct mm_struct *next,
+                                            struct task_struct *tsk)
+{
+       /*
+        * Only need the full barrier when switching between processes.
+        * Barrier when switching from kernel to userspace is not
+        * required here, given that it is implied by mmdrop(). Barrier
+        * when switching from userspace to kernel is not needed after
+        * store to rq->curr.
+        */
+       if (likely(!(atomic_read(&next->membarrier_state) &
+                    (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+                     MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
+               return;
+
+       /*
+        * The membarrier system call requires a full memory barrier
+        * after storing to rq->curr, before going back to user-space.
+        */
+       smp_mb();
+}
+
+#endif /* _ASM_POWERPC_MEMBARRIER_H */
index d60a62bf4fc763bd52008fff96fcbe9aca70b68a..0ab297c4cfad1486a54195022929ee022d75b964 100644 (file)
@@ -12,6 +12,7 @@
 
 #include <linux/mm.h>
 #include <linux/cpu.h>
+#include <linux/sched/mm.h>
 
 #include <asm/mmu_context.h>
 
@@ -58,6 +59,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                 *
                 * On the read side the barrier is in pte_xchg(), which orders
                 * the store to the PTE vs the load of mm_cpumask.
+                *
+                * This full barrier is needed by membarrier when switching
+                * between processes after store to rq->curr, before user-space
+                * memory accesses.
                 */
                smp_mb();
 
@@ -80,6 +85,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
        if (new_on_cpu)
                radix_kvm_prefetch_workaround(next);
+       else
+               membarrier_arch_switch_mm(prev, next, tsk);
 
        /*
         * The actual HW switching method differs between the various
index b0771ceabb4b223199a31adb7993c4b62ded0754..cefa6dbe80aebf6eae2a0ef61f390a9827985f8f 100644 (file)
@@ -55,6 +55,7 @@ config X86
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_KCOV                    if X86_64
        select ARCH_HAS_PHYS_TO_DMA
+       select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_PMEM_API                if X86_64
        select ARCH_HAS_REFCOUNT
        select ARCH_HAS_UACCESS_FLUSHCACHE      if X86_64
@@ -62,6 +63,7 @@ config X86
        select ARCH_HAS_SG_CHAIN
        select ARCH_HAS_STRICT_KERNEL_RWX
        select ARCH_HAS_STRICT_MODULE_RWX
+       select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
        select ARCH_HAS_UBSAN_SANITIZE_ALL
        select ARCH_HAS_ZONE_DEVICE             if X86_64
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
index 2a35b1e0fb902ab83784095114ee302957d65931..abee6d2b9311e4c1cea1ba7991a3220e4a7cd397 100644 (file)
@@ -566,6 +566,11 @@ restore_all:
 .Lrestore_nocheck:
        RESTORE_REGS 4                          # skip orig_eax/error_code
 .Lirq_return:
+       /*
+        * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
+        * when returning from an IPI handler and when returning from the
+        * scheduler to user-space.
+        */
        INTERRUPT_RETURN
 
 .section .fixup, "ax"
index c752abe89d80787395179f2f3c698d7480cbd34a..4a9bef6aca346c76945cd15c43090146214658f2 100644 (file)
@@ -691,6 +691,10 @@ GLOBAL(restore_regs_and_return_to_kernel)
        POP_EXTRA_REGS
        POP_C_REGS
        addq    $8, %rsp        /* skip regs->orig_ax */
+       /*
+        * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
+        * when returning from an IPI handler.
+        */
        INTERRUPT_RETURN
 
 ENTRY(native_iret)
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
new file mode 100644 (file)
index 0000000..c67caaf
--- /dev/null
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SYNC_CORE_H
+#define _ASM_X86_SYNC_CORE_H
+
+#include <linux/preempt.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode. x86 implements return to user-space through sysexit,
+ * sysret, and sysretq, which are not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+       /* With PTI, we unconditionally serialize before running user code. */
+       if (static_cpu_has(X86_FEATURE_PTI))
+               return;
+       /*
+        * Return from interrupt and NMI is done through iret, which is core
+        * serializing.
+        */
+       if (in_irq() || in_nmi())
+               return;
+       sync_core();
+}
+
+#endif /* _ASM_X86_SYNC_CORE_H */
index 012d0262484894c3a2e9ee4590f41e9f5756aa3d..8dcc0607f80584748f92fe43aba9a32685fc6f9b 100644 (file)
@@ -229,6 +229,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 #endif
        this_cpu_write(cpu_tlbstate.is_lazy, false);
 
+       /*
+        * The membarrier system call requires a full memory barrier and
+        * core serialization before returning to user-space, after
+        * storing to rq->curr. Writing to CR3 provides that full
+        * memory barrier and core serializing instruction.
+        */
        if (real_prev == next) {
                VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
                           next->context.ctx_id);
index 166144c04ef61af4b8ec47033d99b95a678aaeec..92744e3f155665a6e7161739963770708ae60f32 100644 (file)
@@ -555,6 +555,14 @@ struct task_struct {
        unsigned long                   wakee_flip_decay_ts;
        struct task_struct              *last_wakee;
 
+       /*
+        * recent_used_cpu is initially set as the last CPU used by a task
+        * that wakes affine another task. Waker/wakee relationships can
+        * push tasks around a CPU where each wakeup moves to the next one.
+        * Tracking a recently used CPU allows a quick search for a recently
+        * used CPU that may be idle.
+        */
+       int                             recent_used_cpu;
        int                             wake_cpu;
 #endif
        int                             on_rq;
index bd422561a75e51783d89b18d3cb0e70a271df935..1149533aa2fa2838d28b4351522314cb302bc0a4 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
+#include <linux/sync_core.h>
 
 /*
  * Routines for handling mm_structs
@@ -194,18 +195,48 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
 
 #ifdef CONFIG_MEMBARRIER
 enum {
-       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY        = (1U << 0),
-       MEMBARRIER_STATE_SWITCH_MM                      = (1U << 1),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY                = (1U << 0),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED                      = (1U << 1),
+       MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY                 = (1U << 2),
+       MEMBARRIER_STATE_GLOBAL_EXPEDITED                       = (1U << 3),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY      = (1U << 4),
+       MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE            = (1U << 5),
 };
 
+enum {
+       MEMBARRIER_FLAG_SYNC_CORE       = (1U << 0),
+};
+
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
+#include <asm/membarrier.h>
+#endif
+
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+       if (likely(!(atomic_read(&mm->membarrier_state) &
+                    MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
+               return;
+       sync_core_before_usermode();
+}
+
 static inline void membarrier_execve(struct task_struct *t)
 {
        atomic_set(&t->mm->membarrier_state, 0);
 }
 #else
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+                                            struct mm_struct *next,
+                                            struct task_struct *tsk)
+{
+}
+#endif
 static inline void membarrier_execve(struct task_struct *t)
 {
 }
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
new file mode 100644 (file)
index 0000000..013da4b
--- /dev/null
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SYNC_CORE_H
+#define _LINUX_SYNC_CORE_H
+
+#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+#include <asm/sync_core.h>
+#else
+/*
+ * This is a dummy sync_core_before_usermode() implementation that can be used
+ * on all architectures which return to user-space through core serializing
+ * instructions.
+ * If your architecture returns to user-space through non-core-serializing
+ * instructions, you need to write your own functions.
+ */
+static inline void sync_core_before_usermode(void)
+{
+}
+#endif
+
+#endif /* _LINUX_SYNC_CORE_H */
+
index 4e01ad7ffe9831c63f90a46d40445e467365beec..5891d7614c8c0d230f5ee85ecc0ae3d4f11dc2a9 100644 (file)
@@ -31,7 +31,7 @@
  * enum membarrier_cmd - membarrier system call command
  * @MEMBARRIER_CMD_QUERY:   Query the set of supported commands. It returns
  *                          a bitmask of valid commands.
- * @MEMBARRIER_CMD_SHARED:  Execute a memory barrier on all running threads.
+ * @MEMBARRIER_CMD_GLOBAL:  Execute a memory barrier on all running threads.
  *                          Upon return from system call, the caller thread
  *                          is ensured that all running threads have passed
  *                          through a state where all memory accesses to
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ *                          Execute a memory barrier on all running threads
+ *                          of all processes which previously registered
+ *                          with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This only covers threads from processes
+ *                          which registered with
+ *                          MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
+ *                          This command returns 0. Given that
+ *                          registration is about the intent to receive
+ *                          the barriers, it is valid to invoke
+ *                          MEMBARRIER_CMD_GLOBAL_EXPEDITED from a
+ *                          non-registered process.
+ * @MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ *                          Register the process intent to receive
+ *                          MEMBARRIER_CMD_GLOBAL_EXPEDITED memory
+ *                          barriers. Always returns 0.
  * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
  *                          Execute a memory barrier on each running
  *                          thread belonging to the same process as the current
@@ -51,7 +73,7 @@
  *                          to and return from the system call
  *                          (non-running threads are de facto in such a
  *                          state). This only covers threads from the
- *                          same processes as the caller thread. This
+ *                          same process as the caller thread. This
  *                          command returns 0 on success. The
  *                          "expedited" commands complete faster than
  *                          the non-expedited ones, they never block,
  *                          Register the process intent to use
  *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
  *                          returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ *                          In addition to providing the memory ordering
+ *                          guarantees described in
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED, the caller
+ *                          thread is ensured, upon return from the
+ *                          system call, that all its running thread
+ *                          siblings have executed a core serializing
+ *                          instruction. (Architectures are required to
+ *                          guarantee that non-running threads issue
+ *                          core serializing instructions before they
+ *                          resume user-space execution.) This only
+ *                          covers threads from the same process as the
+ *                          caller thread. This command returns 0 on
+ *                          success. The "expedited" commands complete
+ *                          faster than the non-expedited ones, they
+ *                          never block, but have the downside of
+ *                          causing extra overhead. If this command is
+ *                          not implemented by an architecture, -EINVAL
+ *                          is returned. A process needs to register its
+ *                          intent to use the private expedited sync
+ *                          core command prior to using it, otherwise
+ *                          this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE.
+ *                          If this command is not implemented by an
+ *                          architecture, -EINVAL is returned.
+ *                          Returns 0 on success.
+ * @MEMBARRIER_CMD_SHARED:
+ *                          Alias to MEMBARRIER_CMD_GLOBAL. Provided for
+ *                          header backward compatibility.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-       MEMBARRIER_CMD_QUERY                            = 0,
-       MEMBARRIER_CMD_SHARED                           = (1 << 0),
-       /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
-       /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
-       MEMBARRIER_CMD_PRIVATE_EXPEDITED                = (1 << 3),
-       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED       = (1 << 4),
+       MEMBARRIER_CMD_QUERY                                    = 0,
+       MEMBARRIER_CMD_GLOBAL                                   = (1 << 0),
+       MEMBARRIER_CMD_GLOBAL_EXPEDITED                         = (1 << 1),
+       MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED                = (1 << 2),
+       MEMBARRIER_CMD_PRIVATE_EXPEDITED                        = (1 << 3),
+       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED               = (1 << 4),
+       MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE              = (1 << 5),
+       MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE     = (1 << 6),
+
+       /* Alias for header backward compatibility. */
+       MEMBARRIER_CMD_SHARED                   = MEMBARRIER_CMD_GLOBAL,
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
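
For orientation only, here is a minimal user-space sketch that is not part of this merge; it illustrates how the commands documented above are meant to be used, assuming uapi headers that define the new MEMBARRIER_CMD_* values and a kernel exposing __NR_membarrier (the membarrier() wrapper below is just a local helper, not a libc API):

/*
 * Hypothetical example (not part of this merge): query membarrier support,
 * register the intent to use the private expedited sync-core command, then
 * issue it.
 */
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

        if (mask < 0 ||
            !(mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE)) {
                fprintf(stderr, "sync-core membarrier not supported\n");
                return 1;
        }
        /* Registration comes first, otherwise the command returns -EPERM. */
        if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0))
                return 1;
        /*
         * On return, all running sibling threads have executed a core
         * serializing instruction.
         */
        if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0))
                return 1;
        return 0;
}

Registration expresses the intent to receive the barriers, which is why the unregistered case returns -EPERM for the private expedited commands, while the global expedited command may be invoked from a non-registered process.
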
index a9a2e2c86671a18c02c49643151204dd3358157b..e37f4b2a64453bc30d7b84c550043d0f678d8634 100644 (file)
@@ -1412,6 +1412,12 @@ config USERFAULTFD
          Enable the userfaultfd() system call that allows to intercept and
          handle page faults in userland.
 
+config ARCH_HAS_MEMBARRIER_CALLBACKS
+       bool
+
+config ARCH_HAS_MEMBARRIER_SYNC_CORE
+       bool
+
 config EMBEDDED
        bool "Embedded system"
        option allnoconfig_y
@@ -1915,3 +1921,6 @@ config ASN1
          functions to call on what tags.
 
 source "kernel/Kconfig.locks"
+
+config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+       bool
index 5c372c954f3b9213fe3d2e22c4fa9816b788afa3..c7c112391d79876b880121195ab8e88905b43d24 100644 (file)
@@ -606,6 +606,11 @@ static void __mmdrop(struct mm_struct *mm)
 
 void mmdrop(struct mm_struct *mm)
 {
+       /*
+        * The full memory barrier implied by atomic_dec_and_test() is
+        * required by the membarrier system call before returning to
+        * user-space, after storing to rq->curr.
+        */
        if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                __mmdrop(mm);
 }
index 3da7a2444a911131589ce616147efd517c7bd354..36f113ac6353f8760590866b8a8655f1e355bb31 100644 (file)
@@ -1630,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 
 #ifdef CONFIG_SMP
        if (cpu == rq->cpu) {
-               schedstat_inc(rq->ttwu_local);
-               schedstat_inc(p->se.statistics.nr_wakeups_local);
+               __schedstat_inc(rq->ttwu_local);
+               __schedstat_inc(p->se.statistics.nr_wakeups_local);
        } else {
                struct sched_domain *sd;
 
-               schedstat_inc(p->se.statistics.nr_wakeups_remote);
+               __schedstat_inc(p->se.statistics.nr_wakeups_remote);
                rcu_read_lock();
                for_each_domain(rq->cpu, sd) {
                        if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd->ttwu_wake_remote);
+                               __schedstat_inc(sd->ttwu_wake_remote);
                                break;
                        }
                }
@@ -1647,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
        }
 
        if (wake_flags & WF_MIGRATED)
-               schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+               __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
 #endif /* CONFIG_SMP */
 
-       schedstat_inc(rq->ttwu_count);
-       schedstat_inc(p->se.statistics.nr_wakeups);
+       __schedstat_inc(rq->ttwu_count);
+       __schedstat_inc(p->se.statistics.nr_wakeups);
 
        if (wake_flags & WF_SYNC)
-               schedstat_inc(p->se.statistics.nr_wakeups_sync);
+               __schedstat_inc(p->se.statistics.nr_wakeups_sync);
 }
 
 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2461,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
         * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
         * as we're not fully set-up yet.
         */
+       p->recent_used_cpu = task_cpu(p);
        __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
        rq = __task_rq_lock(p, &rf);
@@ -2698,23 +2699,27 @@ static struct rq *finish_task_switch(struct task_struct *prev)
        prev_state = prev->state;
        vtime_task_switch(prev);
        perf_event_task_sched_in(prev, current);
-       /*
-        * The membarrier system call requires a full memory barrier
-        * after storing to rq->curr, before going back to user-space.
-        *
-        * TODO: This smp_mb__after_unlock_lock can go away if PPC end
-        * up adding a full barrier to switch_mm(), or we should figure
-        * out if a smp_mb__after_unlock_lock is really the proper API
-        * to use.
-        */
-       smp_mb__after_unlock_lock();
        finish_task(prev);
        finish_lock_switch(rq);
        finish_arch_post_lock_switch();
 
        fire_sched_in_preempt_notifiers(current);
-       if (mm)
+       /*
+        * When switching through a kernel thread, the loop in
+        * membarrier_{private,global}_expedited() may have observed that
+        * kernel thread and not issued an IPI. It is therefore possible to
+        * schedule between user->kernel->user threads without passing though
+        * schedule between user->kernel->user threads without passing through
+        * rq->curr, before returning to userspace, so provide them here:
+        *
+        * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+        *   provided by mmdrop(),
+        * - a sync_core for SYNC_CORE.
+        */
+       if (mm) {
+               membarrier_mm_sync_core_before_usermode(mm);
                mmdrop(mm);
+       }
        if (unlikely(prev_state == TASK_DEAD)) {
                if (prev->sched_class->task_dead)
                        prev->sched_class->task_dead(prev);
@@ -2818,6 +2823,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
         */
        arch_start_context_switch(prev);
 
+       /*
+        * If mm is non-NULL, we pass through switch_mm(). If mm is
+        * NULL, we will pass through mmdrop() in finish_task_switch().
+        * Both of these contain the full memory barrier required by
+        * membarrier after storing to rq->curr, before returning to
+        * user-space.
+        */
        if (!mm) {
                next->active_mm = oldmm;
                mmgrab(oldmm);
@@ -3354,6 +3366,9 @@ static void __sched notrace __schedule(bool preempt)
         * Make sure that signal_pending_state()->signal_pending() below
         * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
         * done by the caller to avoid the race with signal_wake_up().
+        *
+        * The membarrier system call requires a full memory barrier
+        * after coming from user-space, before storing to rq->curr.
         */
        rq_lock(rq, &rf);
        smp_mb__after_spinlock();
@@ -3401,17 +3416,16 @@ static void __sched notrace __schedule(bool preempt)
                /*
                 * The membarrier system call requires each architecture
                 * to have a full memory barrier after updating
-                * rq->curr, before returning to user-space. For TSO
-                * (e.g. x86), the architecture must provide its own
-                * barrier in switch_mm(). For weakly ordered machines
-                * for which spin_unlock() acts as a full memory
-                * barrier, finish_lock_switch() in common code takes
-                * care of this barrier. For weakly ordered machines for
-                * which spin_unlock() acts as a RELEASE barrier (only
-                * arm64 and PowerPC), arm64 has a full barrier in
-                * switch_to(), and PowerPC has
-                * smp_mb__after_unlock_lock() before
-                * finish_lock_switch().
+                * rq->curr, before returning to user-space.
+                *
+                * Here are the schemes providing that barrier on the
+                * various architectures:
+                * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+                *   switch_mm() relies on membarrier_arch_switch_mm() on PowerPC.
+                * - finish_lock_switch() for weakly-ordered
+                *   architectures where spin_unlock is a full barrier,
+                * - switch_to() for arm64 (weakly-ordered, spin_unlock
+                *   is a RELEASE barrier).
                 */
                ++*switch_count;
 
index 7b65359875009cc44b2f900d8f5cba166eb33ac3..5eb3ffc9be84c289b013949ccc41d41c3a43b9d8 100644 (file)
@@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
            likely(wait_start > prev_wait_start))
                wait_start -= prev_wait_start;
 
-       schedstat_set(se->statistics.wait_start, wait_start);
+       __schedstat_set(se->statistics.wait_start, wait_start);
 }
 
 static inline void
@@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         * time stamp can be adjusted to accumulate wait time
                         * prior to migration.
                         */
-                       schedstat_set(se->statistics.wait_start, delta);
+                       __schedstat_set(se->statistics.wait_start, delta);
                        return;
                }
                trace_sched_stat_wait(p, delta);
        }
 
-       schedstat_set(se->statistics.wait_max,
+       __schedstat_set(se->statistics.wait_max,
                      max(schedstat_val(se->statistics.wait_max), delta));
-       schedstat_inc(se->statistics.wait_count);
-       schedstat_add(se->statistics.wait_sum, delta);
-       schedstat_set(se->statistics.wait_start, 0);
+       __schedstat_inc(se->statistics.wait_count);
+       __schedstat_add(se->statistics.wait_sum, delta);
+       __schedstat_set(se->statistics.wait_start, 0);
 }
 
 static inline void
@@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        delta = 0;
 
                if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
-                       schedstat_set(se->statistics.sleep_max, delta);
+                       __schedstat_set(se->statistics.sleep_max, delta);
 
-               schedstat_set(se->statistics.sleep_start, 0);
-               schedstat_add(se->statistics.sum_sleep_runtime, delta);
+               __schedstat_set(se->statistics.sleep_start, 0);
+               __schedstat_add(se->statistics.sum_sleep_runtime, delta);
 
                if (tsk) {
                        account_scheduler_latency(tsk, delta >> 10, 1);
@@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        delta = 0;
 
                if (unlikely(delta > schedstat_val(se->statistics.block_max)))
-                       schedstat_set(se->statistics.block_max, delta);
+                       __schedstat_set(se->statistics.block_max, delta);
 
-               schedstat_set(se->statistics.block_start, 0);
-               schedstat_add(se->statistics.sum_sleep_runtime, delta);
+               __schedstat_set(se->statistics.block_start, 0);
+               __schedstat_add(se->statistics.sum_sleep_runtime, delta);
 
                if (tsk) {
                        if (tsk->in_iowait) {
-                               schedstat_add(se->statistics.iowait_sum, delta);
-                               schedstat_inc(se->statistics.iowait_count);
+                               __schedstat_add(se->statistics.iowait_sum, delta);
+                               __schedstat_inc(se->statistics.iowait_count);
                                trace_sched_stat_iowait(tsk, delta);
                        }
 
@@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
                struct task_struct *tsk = task_of(se);
 
                if (tsk->state & TASK_INTERRUPTIBLE)
-                       schedstat_set(se->statistics.sleep_start,
+                       __schedstat_set(se->statistics.sleep_start,
                                      rq_clock(rq_of(cfs_rq)));
                if (tsk->state & TASK_UNINTERRUPTIBLE)
-                       schedstat_set(se->statistics.block_start,
+                       __schedstat_set(se->statistics.block_start,
                                      rq_clock(rq_of(cfs_rq)));
        }
 }
@@ -5692,27 +5692,31 @@ static int wake_wide(struct task_struct *p)
  *                       scheduling latency of the CPUs. This seems to work
  *                       for the overloaded case.
  */
-
-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
-                int this_cpu, int prev_cpu, int sync)
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 {
        /*
         * If this_cpu is idle, it implies the wakeup is from interrupt
         * context. Only allow the move if cache is shared. Otherwise an
         * interrupt intensive workload could force all tasks onto one
         * node depending on the IO topology or IRQ affinity settings.
+        *
+        * If the prev_cpu is idle and cache affine then avoid a migration.
+        * There is no guarantee that the cache hot data from an interrupt
+        * is more important than cache hot data on the prev_cpu and from
+        * a cpufreq perspective, it's better to have higher utilisation
+        * on one CPU.
         */
        if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-               return true;
+               return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
        if (sync && cpu_rq(this_cpu)->nr_running == 1)
-               return true;
+               return this_cpu;
 
-       return false;
+       return nr_cpumask_bits;
 }
 
-static bool
+static int
 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
                   int this_cpu, int prev_cpu, int sync)
 {
@@ -5726,7 +5730,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
                unsigned long current_load = task_h_load(current);
 
                if (current_load > this_eff_load)
-                       return true;
+                       return this_cpu;
 
                this_eff_load -= current_load;
        }
@@ -5743,28 +5747,28 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
                prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
        prev_eff_load *= capacity_of(this_cpu);
 
-       return this_eff_load <= prev_eff_load;
+       return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                       int prev_cpu, int sync)
 {
        int this_cpu = smp_processor_id();
-       bool affine = false;
+       int target = nr_cpumask_bits;
 
-       if (sched_feat(WA_IDLE) && !affine)
-               affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+       if (sched_feat(WA_IDLE))
+               target = wake_affine_idle(this_cpu, prev_cpu, sync);
 
-       if (sched_feat(WA_WEIGHT) && !affine)
-               affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+       if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
+               target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
        schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
-       if (affine) {
-               schedstat_inc(sd->ttwu_move_affine);
-               schedstat_inc(p->se.statistics.nr_wakeups_affine);
-       }
+       if (target == nr_cpumask_bits)
+               return prev_cpu;
 
-       return affine;
+       schedstat_inc(sd->ttwu_move_affine);
+       schedstat_inc(p->se.statistics.nr_wakeups_affine);
+       return target;
 }
 
 static inline unsigned long task_util(struct task_struct *p);
@@ -6193,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
        struct sched_domain *sd;
-       int i;
+       int i, recent_used_cpu;
 
        if (idle_cpu(target))
                return target;
@@ -6204,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
                return prev;
 
+       /* Check a recently used CPU as a potential idle candidate */
+       recent_used_cpu = p->recent_used_cpu;
+       if (recent_used_cpu != prev &&
+           recent_used_cpu != target &&
+           cpus_share_cache(recent_used_cpu, target) &&
+           idle_cpu(recent_used_cpu) &&
+           cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+               /*
+                * Replace recent_used_cpu with prev as it is a potential
+                * candidate for the next wake.
+                */
+               p->recent_used_cpu = prev;
+               return recent_used_cpu;
+       }
+
        sd = rcu_dereference(per_cpu(sd_llc, target));
        if (!sd)
                return target;
@@ -6357,8 +6376,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                if (cpu == prev_cpu)
                        goto pick_cpu;
 
-               if (wake_affine(affine_sd, p, prev_cpu, sync))
-                       new_cpu = cpu;
+               new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
        }
 
        if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6372,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        if (!sd) {
 pick_cpu:
-               if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+               if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
                        new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
+                       if (want_affine)
+                               current->recent_used_cpu = cpu;
+               }
        } else {
                new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
        }
index 9bcbacba82a8115ddceda7fb26097a23c50d44f5..5d076263363971325a2aa8be51aed404959165d6 100644 (file)
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+       (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
+       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+#else
+#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
+#endif
+
 #define MEMBARRIER_CMD_BITMASK \
-       (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED       \
-       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
+       (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
+       | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
+       | MEMBARRIER_CMD_PRIVATE_EXPEDITED      \
+       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED     \
+       | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
 
 static void ipi_mb(void *info)
 {
        smp_mb();       /* IPIs should be serializing but paranoid. */
 }
 
-static int membarrier_private_expedited(void)
+static int membarrier_global_expedited(void)
 {
        int cpu;
        bool fallback = false;
        cpumask_var_t tmpmask;
 
-       if (!(atomic_read(&current->mm->membarrier_state)
-                       & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
-               return -EPERM;
+       if (num_online_cpus() == 1)
+               return 0;
+
+       /*
+        * Matches memory barriers around rq->curr modification in
+        * scheduler.
+        */
+       smp_mb();       /* system call entry is not a mb. */
+
+       /*
+        * Expedited membarrier commands guarantee that they won't
+        * block, hence the GFP_NOWAIT allocation flag and fallback
+        * implementation.
+        */
+       if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+               /* Fallback for OOM. */
+               fallback = true;
+       }
+
+       cpus_read_lock();
+       for_each_online_cpu(cpu) {
+               struct task_struct *p;
+
+               /*
+                * Skipping the current CPU is OK even though we can be
+                * migrated at any point. The current CPU, at the point
+                * where we read raw_smp_processor_id(), is ensured to
+                * be in program order with respect to the caller
+                * thread. Therefore, we can skip this CPU from the
+                * iteration.
+                */
+               if (cpu == raw_smp_processor_id())
+                       continue;
+               rcu_read_lock();
+               p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+               if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
+                                  MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+                       if (!fallback)
+                               __cpumask_set_cpu(cpu, tmpmask);
+                       else
+                               smp_call_function_single(cpu, ipi_mb, NULL, 1);
+               }
+               rcu_read_unlock();
+       }
+       if (!fallback) {
+               preempt_disable();
+               smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+               preempt_enable();
+               free_cpumask_var(tmpmask);
+       }
+       cpus_read_unlock();
+
+       /*
+        * Memory barrier on the caller thread _after_ we finished
+        * waiting for the last IPI. Matches memory barriers around
+        * rq->curr modification in scheduler.
+        */
+       smp_mb();       /* exit from system call is not a mb */
+       return 0;
+}
+
+static int membarrier_private_expedited(int flags)
+{
+       int cpu;
+       bool fallback = false;
+       cpumask_var_t tmpmask;
+
+       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+               if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+                       return -EINVAL;
+               if (!(atomic_read(&current->mm->membarrier_state) &
+                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
+                       return -EPERM;
+       } else {
+               if (!(atomic_read(&current->mm->membarrier_state) &
+                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
+                       return -EPERM;
+       }
 
        if (num_online_cpus() == 1)
                return 0;
@@ -105,21 +191,69 @@ static int membarrier_private_expedited(void)
        return 0;
 }
 
-static void membarrier_register_private_expedited(void)
+static int membarrier_register_global_expedited(void)
 {
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
 
+       if (atomic_read(&mm->membarrier_state) &
+           MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
+               return 0;
+       atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+       if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
+               /*
+                * For single mm user, single threaded process, we can
+                * simply issue a memory barrier after setting
+                * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
+                * no memory access following registration is reordered
+                * before registration.
+                */
+               smp_mb();
+       } else {
+               /*
+                * For multi-mm user threads, we need to ensure all
+                * future scheduler executions will observe the new
+                * thread flag state for this mm.
+                */
+               synchronize_sched();
+       }
+       atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
+                 &mm->membarrier_state);
+       return 0;
+}
+
+static int membarrier_register_private_expedited(int flags)
+{
+       struct task_struct *p = current;
+       struct mm_struct *mm = p->mm;
+       int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+
+       if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+               if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
+                       return -EINVAL;
+               state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+       }
+
        /*
         * We need to consider threads belonging to different thread
         * groups, which use the same mm. (CLONE_VM but not
         * CLONE_THREAD).
         */
-       if (atomic_read(&mm->membarrier_state)
-                       & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)
-               return;
-       atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
-                       &mm->membarrier_state);
+       if (atomic_read(&mm->membarrier_state) & state)
+               return 0;
+       atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
+       if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+               atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
+                         &mm->membarrier_state);
+       if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
+               /*
+                * Ensure all future scheduler executions will observe the
+                * new thread flag state for this process.
+                */
+               synchronize_sched();
+       }
+       atomic_or(state, &mm->membarrier_state);
+       return 0;
 }
 
 /**
@@ -159,21 +293,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
                int cmd_mask = MEMBARRIER_CMD_BITMASK;
 
                if (tick_nohz_full_enabled())
-                       cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+                       cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
                return cmd_mask;
        }
-       case MEMBARRIER_CMD_SHARED:
-               /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+       case MEMBARRIER_CMD_GLOBAL:
+               /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
                if (tick_nohz_full_enabled())
                        return -EINVAL;
                if (num_online_cpus() > 1)
                        synchronize_sched();
                return 0;
+       case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+               return membarrier_global_expedited();
+       case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+               return membarrier_register_global_expedited();
        case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-               return membarrier_private_expedited();
+               return membarrier_private_expedited(0);
        case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
-               membarrier_register_private_expedited();
-               return 0;
+               return membarrier_register_private_expedited(0);
+       case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+               return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+       case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+               return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
        default:
                return -EINVAL;
        }
index 862a513adca3d0285e7fa45ac878520ea2b70cf8..663b2355a3aa772d8bcc8c90b55a3e0e0e3a6e17 100644 (file)
@@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq)
 {
        struct task_struct *curr = rq->curr;
        struct sched_rt_entity *rt_se = &curr->rt;
+       u64 now = rq_clock_task(rq);
        u64 delta_exec;
 
        if (curr->sched_class != &rt_sched_class)
                return;
 
-       delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+       delta_exec = now - curr->se.exec_start;
        if (unlikely((s64)delta_exec <= 0))
                return;
 
@@ -968,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
        curr->se.sum_exec_runtime += delta_exec;
        account_group_exec_runtime(curr, delta_exec);
 
-       curr->se.exec_start = rq_clock_task(rq);
+       curr->se.exec_start = now;
        cgroup_account_cputime(curr, delta_exec);
 
        sched_rt_avg_update(rq, delta_exec);
@@ -1907,9 +1908,8 @@ static void push_rt_tasks(struct rq *rq)
  * the rt_loop_next will cause the iterator to perform another scan.
  *
  */
-static int rto_next_cpu(struct rq *rq)
+static int rto_next_cpu(struct root_domain *rd)
 {
-       struct root_domain *rd = rq->rd;
        int next;
        int cpu;
 
@@ -1985,19 +1985,24 @@ static void tell_cpu_to_push(struct rq *rq)
         * Otherwise it is finishing up and an ipi needs to be sent.
         */
        if (rq->rd->rto_cpu < 0)
-               cpu = rto_next_cpu(rq);
+               cpu = rto_next_cpu(rq->rd);
 
        raw_spin_unlock(&rq->rd->rto_lock);
 
        rto_start_unlock(&rq->rd->rto_loop_start);
 
-       if (cpu >= 0)
+       if (cpu >= 0) {
+               /* Make sure the rd does not get freed while pushing */
+               sched_get_rd(rq->rd);
                irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+       }
 }
 
 /* Called from hardirq context */
 void rto_push_irq_work_func(struct irq_work *work)
 {
+       struct root_domain *rd =
+               container_of(work, struct root_domain, rto_push_work);
        struct rq *rq;
        int cpu;
 
@@ -2013,18 +2018,20 @@ void rto_push_irq_work_func(struct irq_work *work)
                raw_spin_unlock(&rq->lock);
        }
 
-       raw_spin_lock(&rq->rd->rto_lock);
+       raw_spin_lock(&rd->rto_lock);
 
        /* Pass the IPI to the next rt overloaded queue */
-       cpu = rto_next_cpu(rq);
+       cpu = rto_next_cpu(rd);
 
-       raw_spin_unlock(&rq->rd->rto_lock);
+       raw_spin_unlock(&rd->rto_lock);
 
-       if (cpu < 0)
+       if (cpu < 0) {
+               sched_put_rd(rd);
                return;
+       }
 
        /* Try the next RT overloaded CPU */
-       irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+       irq_work_queue_on(&rd->rto_push_work, cpu);
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
index 2e95505e23c692a6c0ecd25b752f3bc6f72fe2d0..fb5fc458547ff83672dc4265c39a330c74a60e62 100644 (file)
@@ -691,6 +691,8 @@ extern struct mutex sched_domains_mutex;
 extern void init_defrootdomain(void);
 extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
+extern void sched_get_rd(struct root_domain *rd);
+extern void sched_put_rd(struct root_domain *rd);
 
 #ifdef HAVE_RT_PUSH_IPI
 extern void rto_push_irq_work_func(struct irq_work *work);
index baf500d12b7c9eaa951657598b781b6b67ab8b3e..8e7b58de61e7eba9f8287c734bb0bca26b7df296 100644 (file)
@@ -31,8 +31,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
                rq->rq_sched_info.run_delay += delta;
 }
 #define schedstat_enabled()            static_branch_unlikely(&sched_schedstats)
+#define __schedstat_inc(var)           do { var++; } while (0)
 #define schedstat_inc(var)             do { if (schedstat_enabled()) { var++; } } while (0)
+#define __schedstat_add(var, amt)      do { var += (amt); } while (0)
 #define schedstat_add(var, amt)                do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val)              do { var = (val); } while (0)
 #define schedstat_set(var, val)                do { if (schedstat_enabled()) { var = (val); } } while (0)
 #define schedstat_val(var)             (var)
 #define schedstat_val_or_zero(var)     ((schedstat_enabled()) ? (var) : 0)
@@ -48,8 +51,11 @@ static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
 #define schedstat_enabled()            0
+#define __schedstat_inc(var)           do { } while (0)
 #define schedstat_inc(var)             do { } while (0)
+#define __schedstat_add(var, amt)      do { } while (0)
 #define schedstat_add(var, amt)                do { } while (0)
+#define __schedstat_set(var, val)      do { } while (0)
 #define schedstat_set(var, val)                do { } while (0)
 #define schedstat_val(var)             0
 #define schedstat_val_or_zero(var)     0
index 034cbed7f88b4f14dc44fb9565706436805c68b8..519b024f4e94f9385e2e7df913dba2fb314410d1 100644 (file)
@@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
                call_rcu_sched(&old_rd->rcu, free_rootdomain);
 }
 
+void sched_get_rd(struct root_domain *rd)
+{
+       atomic_inc(&rd->refcount);
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+       if (!atomic_dec_and_test(&rd->refcount))
+               return;
+
+       call_rcu_sched(&rd->rcu, free_rootdomain);
+}
+
 static int init_rootdomain(struct root_domain *rd)
 {
        if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
index 9e674d9514d1d14b43e32c58ef530991f7f3561e..22bffd55a523c94983dd65f45705f9f8d30059bd 100644 (file)
@@ -16,49 +16,210 @@ static int sys_membarrier(int cmd, int flags)
 static int test_membarrier_cmd_fail(void)
 {
        int cmd = -1, flags = 0;
+       const char *test_name = "sys membarrier invalid command";
 
        if (sys_membarrier(cmd, flags) != -1) {
                ksft_exit_fail_msg(
-                       "sys membarrier invalid command test: command = %d, flags = %d. Should fail, but passed\n",
-                       cmd, flags);
+                       "%s test: command = %d, flags = %d. Should fail, but passed\n",
+                       test_name, cmd, flags);
+       }
+       if (errno != EINVAL) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+                       test_name, flags, EINVAL, strerror(EINVAL),
+                       errno, strerror(errno));
        }
 
        ksft_test_result_pass(
-               "sys membarrier invalid command test: command = %d, flags = %d. Failed as expected\n",
-               cmd, flags);
+               "%s test: command = %d, flags = %d, errno = %d. Failed as expected\n",
+               test_name, cmd, flags, errno);
        return 0;
 }
 
 static int test_membarrier_flags_fail(void)
 {
        int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_QUERY invalid flags";
+
+       if (sys_membarrier(cmd, flags) != -1) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d. Should fail, but passed\n",
+                       test_name, flags);
+       }
+       if (errno != EINVAL) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+                       test_name, flags, EINVAL, strerror(EINVAL),
+                       errno, strerror(errno));
+       }
+
+       ksft_test_result_pass(
+               "%s test: flags = %d, errno = %d. Failed as expected\n",
+               test_name, flags, errno);
+       return 0;
+}
+
+static int test_membarrier_global_success(void)
+{
+       int cmd = MEMBARRIER_CMD_GLOBAL, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_GLOBAL";
+
+       if (sys_membarrier(cmd, flags) != 0) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d, errno = %d\n",
+                       test_name, flags, errno);
+       }
+
+       ksft_test_result_pass(
+               "%s test: flags = %d\n", test_name, flags);
+       return 0;
+}
+
+static int test_membarrier_private_expedited_fail(void)
+{
+       int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED not registered failure";
+
+       if (sys_membarrier(cmd, flags) != -1) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d. Should fail, but passed\n",
+                       test_name, flags);
+       }
+       if (errno != EPERM) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+                       test_name, flags, EPERM, strerror(EPERM),
+                       errno, strerror(errno));
+       }
+
+       ksft_test_result_pass(
+               "%s test: flags = %d, errno = %d\n",
+               test_name, flags, errno);
+       return 0;
+}
+
+static int test_membarrier_register_private_expedited_success(void)
+{
+       int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED";
+
+       if (sys_membarrier(cmd, flags) != 0) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d, errno = %d\n",
+                       test_name, flags, errno);
+       }
+
+       ksft_test_result_pass(
+               "%s test: flags = %d\n",
+               test_name, flags);
+       return 0;
+}
+
+static int test_membarrier_private_expedited_success(void)
+{
+       int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED";
+
+       if (sys_membarrier(cmd, flags) != 0) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d, errno = %d\n",
+                       test_name, flags, errno);
+       }
+
+       ksft_test_result_pass(
+               "%s test: flags = %d\n",
+               test_name, flags);
+       return 0;
+}
+
+static int test_membarrier_private_expedited_sync_core_fail(void)
+{
+       int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE not registered failure";
 
        if (sys_membarrier(cmd, flags) != -1) {
                ksft_exit_fail_msg(
-                       "sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Should fail, but passed\n",
-                       flags);
+                       "%s test: flags = %d. Should fail, but passed\n",
+                       test_name, flags);
+       }
+       if (errno != EPERM) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+                       test_name, flags, EPERM, strerror(EPERM),
+                       errno, strerror(errno));
+       }
+
+       ksft_test_result_pass(
+               "%s test: flags = %d, errno = %d\n",
+               test_name, flags, errno);
+       return 0;
+}
+
+static int test_membarrier_register_private_expedited_sync_core_success(void)
+{
+       int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE";
+
+       if (sys_membarrier(cmd, flags) != 0) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d, errno = %d\n",
+                       test_name, flags, errno);
+       }
+
+       ksft_test_result_pass(
+               "%s test: flags = %d\n",
+               test_name, flags);
+       return 0;
+}
+
+static int test_membarrier_private_expedited_sync_core_success(void)
+{
+       int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE";
+
+       if (sys_membarrier(cmd, flags) != 0) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d, errno = %d\n",
+                       test_name, flags, errno);
        }
 
        ksft_test_result_pass(
-               "sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Failed as expected\n",
-               flags);
+               "%s test: flags = %d\n",
+               test_name, flags);
        return 0;
 }
 
-static int test_membarrier_success(void)
+static int test_membarrier_register_global_expedited_success(void)
 {
-       int cmd = MEMBARRIER_CMD_SHARED, flags = 0;
-       const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED\n";
+       int cmd = MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED";
 
        if (sys_membarrier(cmd, flags) != 0) {
                ksft_exit_fail_msg(
-                       "sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-                       flags);
+                       "%s test: flags = %d, errno = %d\n",
+                       test_name, flags, errno);
        }
 
        ksft_test_result_pass(
-               "sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-               flags);
+               "%s test: flags = %d\n",
+               test_name, flags);
+       return 0;
+}
+
+static int test_membarrier_global_expedited_success(void)
+{
+       int cmd = MEMBARRIER_CMD_GLOBAL_EXPEDITED, flags = 0;
+       const char *test_name = "sys membarrier MEMBARRIER_CMD_GLOBAL_EXPEDITED";
+
+       if (sys_membarrier(cmd, flags) != 0) {
+               ksft_exit_fail_msg(
+                       "%s test: flags = %d, errno = %d\n",
+                       test_name, flags, errno);
+       }
+
+       ksft_test_result_pass(
+               "%s test: flags = %d\n",
+               test_name, flags);
        return 0;
 }
 
@@ -72,7 +233,45 @@ static int test_membarrier(void)
        status = test_membarrier_flags_fail();
        if (status)
                return status;
-       status = test_membarrier_success();
+       status = test_membarrier_global_success();
+       if (status)
+               return status;
+       status = test_membarrier_private_expedited_fail();
+       if (status)
+               return status;
+       status = test_membarrier_register_private_expedited_success();
+       if (status)
+               return status;
+       status = test_membarrier_private_expedited_success();
+       if (status)
+               return status;
+       status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);
+       if (status < 0) {
+               ksft_test_result_fail("sys_membarrier() failed\n");
+               return status;
+       }
+       if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+               status = test_membarrier_private_expedited_sync_core_fail();
+               if (status)
+                       return status;
+               status = test_membarrier_register_private_expedited_sync_core_success();
+               if (status)
+                       return status;
+               status = test_membarrier_private_expedited_sync_core_success();
+               if (status)
+                       return status;
+       }
+       /*
+        * It is valid to send a global membarrier from a non-registered
+        * process.
+        */
+       status = test_membarrier_global_expedited_success();
+       if (status)
+               return status;
+       status = test_membarrier_register_global_expedited_success();
+       if (status)
+               return status;
+       status = test_membarrier_global_expedited_success();
        if (status)
                return status;
        return 0;
@@ -94,8 +293,10 @@ static int test_membarrier_query(void)
                }
                ksft_exit_fail_msg("sys_membarrier() failed\n");
        }
-       if (!(ret & MEMBARRIER_CMD_SHARED))
+       if (!(ret & MEMBARRIER_CMD_GLOBAL)) {
+               ksft_test_result_fail("sys_membarrier() CMD_GLOBAL query failed\n");
                ksft_exit_fail_msg("sys_membarrier is not supported.\n");
+       }
 
        ksft_test_result_pass("sys_membarrier available\n");
        return 0;
@@ -108,5 +309,5 @@ int main(int argc, char **argv)
        test_membarrier_query();
        test_membarrier();
 
-       ksft_exit_pass();
+       return ksft_exit_pass();
 }
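
Usage note (an assumption about the standard kselftest workflow, not something stated in this diff): the updated selftest is normally run from the kernel source tree with "make -C tools/testing/selftests TARGETS=membarrier run_tests", and the sync-core cases are only exercised when MEMBARRIER_CMD_QUERY reports MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE.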