Merge tag 'x86-asm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
author		Linus Torvalds <torvalds@linux-foundation.org>
		Tue, 31 Oct 2023 00:18:00 +0000 (14:18 -1000)
committer	Linus Torvalds <torvalds@linux-foundation.org>
		Tue, 31 Oct 2023 00:18:00 +0000 (14:18 -1000)
Pull x86 assembly code updates from Ingo Molnar:

 - Micro-optimize the x86 bitops code

 - Define target-specific {raw,this}_cpu_try_cmpxchg{64,128}() to
   improve code generation

 - Define and use raw_cpu_try_cmpxchg() in preempt_count_set() (a usage
   sketch follows this list)

 - Do not clobber %rsi in percpu_{try_,}cmpxchg{64,128}_op

 - Remove the unused __sw_hweight64() implementation on x86-32

 - Misc fixes and cleanups
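
A minimal usage sketch of the new try-cmpxchg per-CPU ops, mirroring the
preempt_count_set() conversion further down. The per-CPU variable
'demo_state' and the helper name are hypothetical, invented for
illustration; raw_cpu_read_4() and raw_cpu_try_cmpxchg_4() are the real
primitives touched by this merge. On failure the op writes the value it
actually found back through &old, so the loop needs no explicit re-read
and the compiler can branch directly on the CMPXCHG zero flag:

  #include <linux/percpu.h>	/* DEFINE_PER_CPU(), raw_cpu_*() */

  static DEFINE_PER_CPU(unsigned int, demo_state);	/* hypothetical */

  static __always_inline void demo_state_set_low_byte(unsigned int val)
  {
  	unsigned int old, new;

  	old = raw_cpu_read_4(demo_state);
  	do {
  		/* keep the upper bits, replace only the low byte */
  		new = (old & ~0xffU) | (val & 0xffU);
  	} while (!raw_cpu_try_cmpxchg_4(demo_state, &old, new));
  }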

* tag 'x86-asm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/lib: Address kernel-doc warnings
  x86/entry: Fix typos in comments
  x86/entry: Remove unused argument %rsi passed to exc_nmi()
  x86/bitops: Remove unused __sw_hweight64() assembly implementation on x86-32
  x86/percpu: Do not clobber %rsi in percpu_{try_,}cmpxchg{64,128}_op
  x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set()
  x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg()
  x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}
  x86/asm/bitops: Use __builtin_clz{l|ll} to evaluate constant expressions

arch/x86/entry/entry_64.S
arch/x86/include/asm/bitops.h
arch/x86/include/asm/percpu.h
arch/x86/include/asm/preempt.h
arch/x86/lib/csum-wrappers_64.c
arch/x86/lib/hweight.S

arch/x86/entry/entry_64.S
index be08efa33e9f715fb3e549ef36a1d7a717aa1d70..d656924eefc233a9121eeb5762a4c3563fd4e21b 100644 (file)
@@ -1163,8 +1163,8 @@ SYM_CODE_START(asm_exc_nmi)
         * anyway.
         *
         * To handle this case we do the following:
-        *  Check the a special location on the stack that contains
-        *  variable that is set when NMIs are executing.
+        *  Check a special location on the stack that contains a
+        *  variable that is set when NMIs are executing.
         *  The interrupted task's stack is also checked to see if it
         *  is an NMI stack.
         *  If the variable is not set and the stack is not the NMI
@@ -1237,7 +1237,6 @@ SYM_CODE_START(asm_exc_nmi)
         */
 
        movq    %rsp, %rdi
-       movq    $-1, %rsi
        call    exc_nmi
 
        /*
@@ -1295,8 +1294,8 @@ SYM_CODE_START(asm_exc_nmi)
         * end_repeat_nmi, then we are a nested NMI.  We must not
         * modify the "iret" frame because it's being written by
         * the outer NMI.  That's okay; the outer NMI handler is
-        * about to about to call exc_nmi() anyway, so we can just
-        * resume the outer NMI.
+        * about to call exc_nmi() anyway, so we can just resume
+        * the outer NMI.
         */
 
        movq    $repeat_nmi, %rdx
@@ -1451,7 +1450,6 @@ end_repeat_nmi:
        UNWIND_HINT_REGS
 
        movq    %rsp, %rdi
-       movq    $-1, %rsi
        call    exc_nmi
 
        /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
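
For context on the two 'movq $-1, %rsi' removals above (paraphrased from
the current tree, not shown in this diff): exc_nmi() is declared through
the RAW idtentry machinery with a single pt_regs argument, and in the
SysV AMD64 ABI that argument arrives in %rdi. Nothing consumes a second
argument in %rsi, so the loads were dead:

  /* paraphrased from arch/x86/include/asm/idtentry.h */
  __visible void exc_nmi(struct pt_regs *regs);	/* only %rdi carries an argument */
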
arch/x86/include/asm/bitops.h
index 2edf68475fec4138895968c90d7845e08da54d62..50e5ebf9d0a0dc74cb7248e419d619c6294f9021 100644 (file)
@@ -293,6 +293,9 @@ static __always_inline unsigned long variable_ffz(unsigned long word)
  */
 static __always_inline unsigned long __fls(unsigned long word)
 {
+       if (__builtin_constant_p(word))
+               return BITS_PER_LONG - 1 - __builtin_clzl(word);
+
        asm("bsr %1,%0"
            : "=r" (word)
            : "rm" (word));
@@ -360,6 +363,9 @@ static __always_inline int fls(unsigned int x)
 {
        int r;
 
+       if (__builtin_constant_p(x))
+               return x ? 32 - __builtin_clz(x) : 0;
+
 #ifdef CONFIG_X86_64
        /*
         * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
@@ -401,6 +407,9 @@ static __always_inline int fls(unsigned int x)
 static __always_inline int fls64(__u64 x)
 {
        int bitpos = -1;
+
+       if (__builtin_constant_p(x))
+               return x ? 64 - __builtin_clzll(x) : 0;
        /*
         * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
         * dest reg is undefined if x==0, but their CPU architect says its
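
The three __builtin_constant_p() fast paths added above let the compiler
evaluate __fls()/fls()/fls64() of constant arguments at build time instead
of emitting BSR, so the result can feed constant expressions. A purely
illustrative check (not part of this merge; assumes <linux/bitops.h> and
<linux/build_bug.h>, and the helper name is made up):

  static inline void fls_constant_folding_demo(void)
  {
  	BUILD_BUG_ON(fls(0) != 0);		/* x ? 32 - __builtin_clz(x) : 0 */
  	BUILD_BUG_ON(fls(0x80U) != 8);		/* bit 7 set -> 1-based result 8 */
  	BUILD_BUG_ON(__fls(0x80UL) != 7);	/* 0-based index of the MSB */
  	BUILD_BUG_ON(fls64(1ULL << 32) != 33);
  }
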
arch/x86/include/asm/percpu.h
index 34734d73046397058357e113df1b26a8ce75f24b..20624b80f89041fdb8ac4bbdbdac9b60b66e9682 100644 (file)
@@ -210,6 +210,25 @@ do {                                                                       \
        (typeof(_var))(unsigned long) pco_old__;                        \
 })
 
+#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval)         \
+({                                                                     \
+       bool success;                                                   \
+       __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
+       __pcpu_type_##size pco_old__ = *pco_oval__;                     \
+       __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval);       \
+       asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]",               \
+                                   __percpu_arg([var]))                \
+                 CC_SET(z)                                             \
+                 : CC_OUT(z) (success),                                \
+                   [oval] "+a" (pco_old__),                            \
+                   [var] "+m" (_var)                                   \
+                 : [nval] __pcpu_reg_##size(, pco_new__)               \
+                 : "memory");                                          \
+       if (unlikely(!success))                                         \
+               *pco_oval__ = pco_old__;                                \
+       likely(success);                                                \
+})
+
 #if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
 #define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval)            \
 ({                                                                     \
@@ -223,26 +242,63 @@ do {                                                                      \
        old__.var = _oval;                                              \
        new__.var = _nval;                                              \
                                                                        \
-       asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+       asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu",            \
                              "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
                  : [var] "+m" (_var),                                  \
                    "+a" (old__.low),                                   \
                    "+d" (old__.high)                                   \
                  : "b" (new__.low),                                    \
-                   "c" (new__.high)                                    \
-                 : "memory", "esi");                                   \
+                   "c" (new__.high),                                   \
+                   "S" (&(_var))                                       \
+                 : "memory");                                          \
                                                                        \
        old__.var;                                                      \
 })
 
 #define raw_cpu_cmpxchg64(pcp, oval, nval)     percpu_cmpxchg64_op(8,         , pcp, oval, nval)
 #define this_cpu_cmpxchg64(pcp, oval, nval)    percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval)       \
+({                                                                     \
+       bool success;                                                   \
+       u64 *_oval = (u64 *)(_ovalp);                                   \
+       union {                                                         \
+               u64 var;                                                \
+               struct {                                                \
+                       u32 low, high;                                  \
+               };                                                      \
+       } old__, new__;                                                 \
+                                                                       \
+       old__.var = *_oval;                                             \
+       new__.var = _nval;                                              \
+                                                                       \
+       asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu",            \
+                             "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+                 CC_SET(z)                                             \
+                 : CC_OUT(z) (success),                                \
+                   [var] "+m" (_var),                                  \
+                   "+a" (old__.low),                                   \
+                   "+d" (old__.high)                                   \
+                 : "b" (new__.low),                                    \
+                   "c" (new__.high),                                   \
+                   "S" (&(_var))                                       \
+                 : "memory");                                          \
+       if (unlikely(!success))                                         \
+               *_oval = old__.var;                                     \
+       likely(success);                                                \
+})
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)                percpu_try_cmpxchg64_op(8,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)       percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 #ifdef CONFIG_X86_64
 #define raw_cpu_cmpxchg64(pcp, oval, nval)     percpu_cmpxchg_op(8,         , pcp, oval, nval);
 #define this_cpu_cmpxchg64(pcp, oval, nval)    percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
 
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval)                percpu_try_cmpxchg_op(8,         , pcp, ovalp, nval);
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval)       percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);
+
 #define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval)           \
 ({                                                                     \
        union {                                                         \
@@ -255,20 +311,54 @@ do {                                                                      \
        old__.var = _oval;                                              \
        new__.var = _nval;                                              \
                                                                        \
-       asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+       asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",           \
                              "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
                  : [var] "+m" (_var),                                  \
                    "+a" (old__.low),                                   \
                    "+d" (old__.high)                                   \
                  : "b" (new__.low),                                    \
-                   "c" (new__.high)                                    \
-                 : "memory", "rsi");                                   \
+                   "c" (new__.high),                                   \
+                   "S" (&(_var))                                       \
+                 : "memory");                                          \
                                                                        \
        old__.var;                                                      \
 })
 
 #define raw_cpu_cmpxchg128(pcp, oval, nval)    percpu_cmpxchg128_op(16,         , pcp, oval, nval)
 #define this_cpu_cmpxchg128(pcp, oval, nval)   percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval)      \
+({                                                                     \
+       bool success;                                                   \
+       u128 *_oval = (u128 *)(_ovalp);                                 \
+       union {                                                         \
+               u128 var;                                               \
+               struct {                                                \
+                       u64 low, high;                                  \
+               };                                                      \
+       } old__, new__;                                                 \
+                                                                       \
+       old__.var = *_oval;                                             \
+       new__.var = _nval;                                              \
+                                                                       \
+       asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu",           \
+                             "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+                 CC_SET(z)                                             \
+                 : CC_OUT(z) (success),                                \
+                   [var] "+m" (_var),                                  \
+                   "+a" (old__.low),                                   \
+                   "+d" (old__.high)                                   \
+                 : "b" (new__.low),                                    \
+                   "c" (new__.high),                                   \
+                   "S" (&(_var))                                       \
+                 : "memory");                                          \
+       if (unlikely(!success))                                         \
+               *_oval = old__.var;                                     \
+       likely(success);                                                \
+})
+
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval)       percpu_try_cmpxchg128_op(16,         , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval)      percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
 #endif
 
 /*
@@ -343,6 +433,9 @@ do {                                                                        \
 #define raw_cpu_cmpxchg_1(pcp, oval, nval)     percpu_cmpxchg_op(1, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_2(pcp, oval, nval)     percpu_cmpxchg_op(2, , pcp, oval, nval)
 #define raw_cpu_cmpxchg_4(pcp, oval, nval)     percpu_cmpxchg_op(4, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval)        percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval)        percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
+#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval)        percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
 
 #define this_cpu_add_return_1(pcp, val)                percpu_add_return_op(1, volatile, pcp, val)
 #define this_cpu_add_return_2(pcp, val)                percpu_add_return_op(2, volatile, pcp, val)
@@ -350,6 +443,9 @@ do {                                                                        \
 #define this_cpu_cmpxchg_1(pcp, oval, nval)    percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_2(pcp, oval, nval)    percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)    percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval)       percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval)       percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval)       percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +460,7 @@ do {                                                                        \
 #define raw_cpu_add_return_8(pcp, val)         percpu_add_return_op(8, , pcp, val)
 #define raw_cpu_xchg_8(pcp, nval)              raw_percpu_xchg_op(pcp, nval)
 #define raw_cpu_cmpxchg_8(pcp, oval, nval)     percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval)        percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
 
 #define this_cpu_read_8(pcp)                   percpu_from_op(8, volatile, "mov", pcp)
 #define this_cpu_write_8(pcp, val)             percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +470,7 @@ do {                                                                        \
 #define this_cpu_add_return_8(pcp, val)                percpu_add_return_op(8, volatile, pcp, val)
 #define this_cpu_xchg_8(pcp, nval)             percpu_xchg_op(8, volatile, pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)    percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)       percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
 #endif
 
 static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
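
The new try-cmpxchg macros above all follow the same contract as the
generic try_cmpxchg() family. A plain-C model of that contract, for
illustration only (the real macros do this with a single segment-prefixed
CMPXCHG/CMPXCHG8B/CMPXCHG16B and report success via ZF; the function name
here is made up):

  #include <linux/types.h>

  static bool try_cmpxchg_model(unsigned long *slot, unsigned long *ovalp,
  			      unsigned long nval)
  {
  	unsigned long expected = *ovalp;

  	if (*slot == expected) {
  		*slot = nval;		/* success: install the new value */
  		return true;
  	}
  	*ovalp = *slot;			/* failure: report the value found */
  	return false;
  }
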
arch/x86/include/asm/preempt.h
index 2d13f25b1bd8f332b860f5bd48b6f47102ca0dae..4527e1430c6dc13dbfd0f49a4edd26bf9fce007d 100644 (file)
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
 {
        int old, new;
 
+       old = raw_cpu_read_4(pcpu_hot.preempt_count);
        do {
-               old = raw_cpu_read_4(pcpu_hot.preempt_count);
                new = (old & PREEMPT_NEED_RESCHED) |
                        (pc & ~PREEMPT_NEED_RESCHED);
-       } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
+       } while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
 }
 
 /*
arch/x86/lib/csum-wrappers_64.c
index 145f9a0bde29a16d391f69e589f876faabf9b035..f4df4d241526c64a5ad2eabdcbf5f0d8d56d6fd8 100644 (file)
@@ -14,8 +14,6 @@
  * @src: source address (user space)
  * @dst: destination address
  * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad source address.
  *
  * Returns an 32bit unfolded checksum of the buffer.
  * src and dst are best aligned to 64bits.
@@ -38,8 +36,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len)
  * @src: source address
  * @dst: destination address (user space)
  * @len: number of bytes to be copied.
- * @isum: initial sum that is added into the result (32bit unfolded)
- * @errp: set to -EFAULT for an bad destination address.
  *
  * Returns an 32bit unfolded checksum of the buffer.
  * src and dst are best aligned to 64bits.
@@ -62,7 +58,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len)
  * @src: source address
  * @dst: destination address
  * @len: number of bytes to be copied.
- * @sum: initial sum that is added into the result (32bit unfolded)
  *
  * Returns an 32bit unfolded checksum of the buffer.
  */
arch/x86/lib/hweight.S
index 5e5e9e3f8fb728a066ae24f94dd1d3d665a9c62a..774bdf3e6f0a9d633c60943e7b8f0ed11e9ee537 100644 (file)
@@ -36,8 +36,12 @@ SYM_FUNC_START(__sw_hweight32)
 SYM_FUNC_END(__sw_hweight32)
 EXPORT_SYMBOL(__sw_hweight32)
 
-SYM_FUNC_START(__sw_hweight64)
+/*
+ * No 32-bit variant, because it's implemented as an inline wrapper
+ * on top of __arch_hweight32():
+ */
 #ifdef CONFIG_X86_64
+SYM_FUNC_START(__sw_hweight64)
        pushq   %rdi
        pushq   %rdx
 
@@ -66,18 +70,6 @@ SYM_FUNC_START(__sw_hweight64)
        popq    %rdx
        popq    %rdi
        RET
-#else /* CONFIG_X86_32 */
-       /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
-       pushl   %ecx
-
-       call    __sw_hweight32
-       movl    %eax, %ecx                      # stash away result
-       movl    %edx, %eax                      # second part of input
-       call    __sw_hweight32
-       addl    %ecx, %eax                      # result
-
-       popl    %ecx
-       RET
-#endif
 SYM_FUNC_END(__sw_hweight64)
 EXPORT_SYMBOL(__sw_hweight64)
+#endif
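
The new comment explains why the 32-bit half of __sw_hweight64() was dead
code: on 32-bit kernels __arch_hweight64() is a C inline that sums two
32-bit popcounts, which is exactly what the removed assembly did by hand.
Roughly (paraphrased from memory of arch/x86/include/asm/arch_hweight.h,
so treat it as a sketch):

  #ifdef CONFIG_X86_32
  static __always_inline unsigned long __arch_hweight64(__u64 w)
  {
  	return __arch_hweight32((u32)w) +
  	       __arch_hweight32((u32)(w >> 32));
  }
  #endif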