/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record in 'hash' for an even number of
 * 'num_blocks' consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *        int *hash, const char* input, size_t num_blocks );
 */
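
/*
 * Illustrative C-side usage (a sketch only; the glue code is not part of
 * this file, and the example function name is hypothetical):
 *
 *        #include <stddef.h>
 *
 *        extern void sha1_transform_avx2(int *hash, const char *input,
 *                                        size_t num_blocks);
 *
 *        void sha1_avx2_example(const char *data, size_t num_blocks)
 *        {
 *                int digest[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
 *                                  0x10325476, 0xc3d2e1f0 };
 *                size_t even = num_blocks & ~(size_t)1;
 *
 *                // an odd trailing block needs a separate, non-AVX2 path
 *                if (even)
 *                        sha1_transform_avx2(digest, data, even);
 *        }
 */
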
#include <linux/linkage.h>

#define CTX     %rdi    /* arg1 */
#define BUF     %rsi    /* arg2 */
#define CNT     %rdx    /* arg3 */

#define xmm_mov           vmovups
#define avx2_zeroupper    vzeroupper
#define BLOCKS_CTR        %r8
#define BUFFER_PTR        %r10
#define BUFFER_PTR2       %r13
#define PRECALC_BUF       %r14
#define WY_TMP2           %ymm9
#define YMM_SHUFB_BSWAP   %ymm10
/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE          (80*2*2 +16)

#define WK(t)           ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)   ((t)*2*2)(PRECALC_BUF)
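
/*
 * Layout sketch for WK(t) (illustration only; wk_offset() is a hypothetical
 * helper, not used by this file): each 32-byte slot holds W[t..t+3]+K for
 * the first block in its low 16 bytes and the same four rounds of the
 * second block in its high 16 bytes.
 *
 *        static inline unsigned int wk_offset(unsigned int t)
 *        {
 *                return ((t % 80) / 4) * 32      // 32-byte slot per 4 rounds
 *                     + (t % 4) * 4              // dword within the slot
 *                     + (t / 80) * 16;           // +16 selects the 2nd block
 *        }
 *        // e.g. wk_offset(0) == 0, wk_offset(5) == 36, wk_offset(80) == 16
 */
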
.macro UPDATE_HASH hash, val

.macro PRECALC_RESET_WY

.macro PRECALC_ROTATE_WY
/* Define register aliases */
.set WY_minus_04, WY_04
.set WY_minus_08, WY_08
.set WY_minus_12, WY_12
.set WY_minus_16, WY_16
.set WY_minus_20, WY_20
.set WY_minus_24, WY_24
.set WY_minus_28, WY_28

        .if (i == 0) # Initialize and rotate registers
        /* message scheduling pre-compute for rounds 0-15 */
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
                vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2), WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu WY_TMP, PRECALC_WK(i&~7)
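
/*
 * Scalar sketch of the rounds 0-15 precalc above (illustration only;
 * precalc_00_15() is a hypothetical helper, get_unaligned_be32() is the
 * usual kernel accessor): for each of the two interleaved blocks the
 * stored value is the big-endian message word plus the rounds 0-19
 * constant (K1 below).
 *
 *        static void precalc_00_15(const u8 *block, u32 w[16], u32 wk[16])
 *        {
 *                int t;
 *
 *                for (t = 0; t < 16; t++) {
 *                        w[t]  = get_unaligned_be32(block + 4 * t); // vpshufb
 *                        wk[t] = w[t] + 0x5a827999;                 // vpaddd K1
 *                }
 *        }
 */
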
        /*
         * message scheduling pre-compute for rounds 16-31
         * calculating last 32 w[i] values in 8 XMM registers
         * pre-calculate K+w[i] values and store to mem
         * for later load by ALU add instruction
         *
         * "brute force" vectorization for rounds 16-31 only
         * due to w[i]->w[i-3] dependency
         */
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_16, WY_minus_12, WY
                vpsrldq  $4, WY_minus_04, WY_TMP        /* w[i-3] */
        .elseif ((i & 7) == 1)
                vpxor    WY_minus_08, WY, WY
                vpxor    WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpslldq  $12, WY, WY_TMP2
        .elseif ((i & 7) == 3)
                vpslld   $1, WY, WY_TMP
        .elseif ((i & 7) == 4)
                vpor     WY, WY_TMP, WY_TMP
                vpslld   $2, WY_TMP2, WY
        .elseif ((i & 7) == 5)
                vpsrld   $30, WY_TMP2, WY_TMP2
                vpxor    WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor    WY_TMP2, WY_TMP, WY
                vpaddd   K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu  WY_TMP, PRECALC_WK(i&~7)
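
/*
 * Scalar sketch of the rounds 16-31 precalc above (illustration only;
 * precalc_16_31() is a hypothetical helper, rol32() is the kernel rotate
 * helper): this is the textbook recurrence, which the vector code follows
 * four lanes at a time despite the w[i]->w[i-3] dependency.
 *
 *        static void precalc_16_31(u32 w[32], u32 wk[32])
 *        {
 *                int t;
 *
 *                for (t = 16; t < 32; t++) {
 *                        w[t]  = rol32(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1);
 *                        wk[t] = w[t] + (t < 20 ? 0x5a827999 : 0x6ed9eba1);
 *                }
 *        }
 */
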
        /*
         * in the SHA-1 specification:
         *      w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
         * we instead compute the equivalent:
         *      w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         * which allows more efficient vectorization,
         * since the w[i]->w[i-3] dependency is broken
         * (see the scalar C sketch after this block)
         */
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
        .elseif ((i & 7) == 1)
                /* W is W_minus_32 before xor */
                vpxor    WY_minus_28, WY, WY
        .elseif ((i & 7) == 2)
                vpxor    WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 3)
        .elseif ((i & 7) == 4)
                vpslld   $2, WY, WY_TMP
        .elseif ((i & 7) == 5)
        .elseif ((i & 7) == 7)
                vpaddd   K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu  WY_TMP, PRECALC_WK(i&~7)
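
/*
 * Scalar sketch of the rounds 32-79 precalc above (illustration only;
 * precalc_32_79() is a hypothetical helper), using the transformed
 * recurrence described in the comment before this block:
 *
 *        static void precalc_32_79(u32 w[80], u32 wk[80])
 *        {
 *                int t;
 *
 *                for (t = 32; t < 80; t++) {
 *                        u32 k = t < 40 ? 0x6ed9eba1 :
 *                                t < 60 ? 0x8f1bbcdc : 0xca62c1d6;
 *
 *                        w[t]  = rol32(w[t-6] ^ w[t-16] ^ w[t-28] ^ w[t-32], 2);
 *                        wk[t] = w[t] + k;
 *                }
 *        }
 */
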
/* Macro relies on saved ROUND_Fx */

        .elseif (\f == RND_F2)
        .elseif (\f == RND_F3)

        .set round_id, (\r % 80)

        .if (round_id == 0)     /* Precalculate F for first round */
        .set ROUND_FUNC, RND_F1
        rorx    $(32-30), B, B          /* b>>>2 */

        RND_FUN ROUND_FUNC, \r

        .set ROUND_FUNC, RND_F2
        .elseif (round_id == 38)
        .set ROUND_FUNC, RND_F3
        .elseif (round_id == 58)
        .set ROUND_FUNC, RND_F2

        .set round_id, ( (\r+1) % 80)

        RND_FUN ROUND_FUNC, (\r+1)
        andn    C, A, T1                /* ~b&d */
        lea     (RE,RTB), E             /* Add F from the previous round */

        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round
         * (b & c) ^ andn[b, d]
         */
        xor     T1, A                   /* F1 = (b&c) ^ (~b&d) */

        lea     (RE,RTA), E             /* E += A >>> 5 */
        lea     (RE,RTB), E             /* Add F from the previous round */

        /* Calculate F for the next round */
        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        .if ((round_id) < 79)
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        .if ((round_id) < 79)

        add     TA, E                   /* E += A >>> 5 */

        .if ((round_id) < 79)
        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        lea     (RE,RTB), E             /* Add F from the previous round */

        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        /*
         * Calculate F for the next round
         * (b and c) or (d and (b or c))
         */

        add     TA, E                   /* E += A >>> 5 */
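
/*
 * For reference (standard SHA-1; sketch only, sha1_f() is a hypothetical
 * helper): the three boolean functions selected by RND_F1/RND_F2/RND_F3
 * and their round ranges.
 *
 *        static u32 sha1_f(int t, u32 b, u32 c, u32 d)
 *        {
 *                if (t < 20)                     // RND_F1, constant K1
 *                        return (b & c) ^ (~b & d);
 *                if (t >= 40 && t < 60)          // RND_F3, constant K3
 *                        return (b & c) | (d & (b | c));
 *                return b ^ c ^ d;               // RND_F2, K2 (20-39) / K4 (60-79)
 *        }
 */
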
/*
 * Add the constant only if the (%2 >= %3) condition is met (uses RTA as temp):
 * %1 += (%2 >= %3) ? %4 : 0
 */
.macro ADD_IF_GE a, b, c, d
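
/*
 * C equivalent of ADD_IF_GE (sketch only): advance \a by \d when at least
 * \c blocks remain, i.e.
 *
 *        if (b >= c)
 *                a += d;
 *
 * e.g. "ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64" bumps the second-block
 * pointer by 64 bytes only when at least 2 blocks are left to process.
 */
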
/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

        mov     %rsp, PRECALC_BUF
        lea     (2*4*80+32)(%rsp), WK_BUF

        # Precalc WK for first 2 blocks
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64

        /* Go to next block if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg    WK_BUF, PRECALC_BUF
        /*
         * The code loops through more than one block; we use the K_BASE
         * value as a signal of the last block, set below by:
         * cmovae BUFFER_PTR, K_BASE
         */
        test    BLOCKS_CTR, BLOCKS_CTR

        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E
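
        /*
         * Sketch of the intended effect of the five updates above in C
         * (assuming UPDATE_HASH performs the standard SHA-1 "add working
         * variable into state" step; TB holds the b value here):
         *
         *        hash[0] += a; hash[1] += b; hash[2] += c;
         *        hash[3] += d; hash[4] += e;
         */
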
        test    BLOCKS_CTR, BLOCKS_CTR

        /* Process second block */
        /*
         * rounds
         * 0+80, 2+80, 4+80, 6+80, 8+80
         * 10+80,12+80,14+80,16+80,18+80
         */

        /*
         * rounds
         * 20+80,22+80,24+80,26+80,28+80
         * 30+80,32+80,34+80,36+80,38+80
         */

        /*
         * rounds
         * 40+80,42+80,44+80,46+80,48+80
         * 50+80,52+80,54+80,56+80,58+80
         */

        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

        /*
         * rounds
         * 60+80,62+80,64+80,66+80,68+80
         * 70+80,72+80,74+80,76+80,78+80
         */
        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        /* Reset state for AVX2 reg permutation */

        xchg    WK_BUF, PRECALC_BUF
/*
 * macro implements the SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name

        RESERVE_STACK = (W_SIZE*4 + 8+24)

        sub     $RESERVE_STACK, %rsp

        /* Set up initial values */

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        add     $RESERVE_STACK, %rsp
#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

SHA1_VECTOR_ASM sha1_transform_avx2