crypto: serpent-sse2 - change transpose_4x4 to only use integer instructions
author Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Tue, 20 Dec 2011 10:58:06 +0000 (12:58 +0200)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 13 Jan 2012 05:38:40 +0000 (16:38 +1100)
The matrix transpose macro in serpent-sse2 uses a mix of SSE2 integer and SSE
floating point instructions, which might cause a performance penalty on some CPUs.

This patch replaces the transpose_4x4 macro with a version that uses only SSE2
integer instructions.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/serpent-sse2-i586-asm_32.S
arch/x86/crypto/serpent-sse2-x86_64-asm_64.S

index 4e37677ca85120644ed167914ee8611cf0607ec2..c00053d42f998ae6e5fc11dcf667ffc32334712a 100644 (file)
        pand x0,                x4; \
        pxor x2,                x4;
 
-#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
-       movdqa x2,              t3; \
-       movdqa x0,              t1; \
-       unpcklps x3,            t3; \
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        movdqa x0,              t2; \
-       unpcklps x1,            t1; \
-       unpckhps x1,            t2; \
-       movdqa t3,              x1; \
-       unpckhps x3,            x2; \
-       movdqa t1,              x0; \
-       movhlps t1,             x1; \
-       movdqa t2,              t1; \
-       movlhps t3,             x0; \
-       movlhps x2,             t1; \
-       movhlps t2,             x2; \
-       movdqa x2,              x3; \
-       movdqa t1,              x2;
+       punpckldq x1,           x0; \
+       punpckhdq x1,           t2; \
+       movdqa x2,              t1; \
+       punpckhdq x3,           x2; \
+       punpckldq x3,           t1; \
+       movdqa x0,              x1; \
+       punpcklqdq t1,          x0; \
+       punpckhqdq t1,          x1; \
+       movdqa t2,              x3; \
+       punpcklqdq x2,          t2; \
+       punpckhqdq x2,          x3; \
+       movdqa t2,              x2;
 
 #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
        movdqu (0*4*4)(in),     x0; \
index 7f24a1540821053d66c1460db83453a1febfb80e..3ee1ff04d3e9f30c04bd913c17d90ef8c1796e23 100644 (file)
        get_key(i, 1, RK1); \
        SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 
-#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
-       movdqa x2,              t3; \
-       movdqa x0,              t1; \
-       unpcklps x3,            t3; \
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
        movdqa x0,              t2; \
-       unpcklps x1,            t1; \
-       unpckhps x1,            t2; \
-       movdqa t3,              x1; \
-       unpckhps x3,            x2; \
-       movdqa t1,              x0; \
-       movhlps t1,             x1; \
-       movdqa t2,              t1; \
-       movlhps t3,             x0; \
-       movlhps x2,             t1; \
-       movhlps t2,             x2; \
-       movdqa x2,              x3; \
-       movdqa t1,              x2;
+       punpckldq x1,           x0; \
+       punpckhdq x1,           t2; \
+       movdqa x2,              t1; \
+       punpckhdq x3,           x2; \
+       punpckldq x3,           t1; \
+       movdqa x0,              x1; \
+       punpcklqdq t1,          x0; \
+       punpckhqdq t1,          x1; \
+       movdqa t2,              x3; \
+       punpcklqdq x2,          t2; \
+       punpckhqdq x2,          x3; \
+       movdqa t2,              x2;
 
 #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
        movdqu (0*4*4)(in),     x0; \