/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * Visit http://software.intel.com/en-us/articles/
 * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record in 'hash' for an even number of
 * 'num_blocks' consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *        int *hash, const char* input, size_t num_blocks );
 */
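
/*
 * Illustrative C-side usage (a sketch only; the glue code is not part of
 * this file, and the example function name is hypothetical):
 *
 *        #include <stddef.h>
 *
 *        extern void sha1_transform_avx2(int *hash, const char *input,
 *                                        size_t num_blocks);
 *
 *        void sha1_avx2_example(const char *data, size_t num_blocks)
 *        {
 *                int digest[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
 *                                  0x10325476, 0xc3d2e1f0 };
 *                size_t even = num_blocks & ~(size_t)1;
 *
 *                // an odd trailing block needs a separate, non-AVX2 path
 *                if (even)
 *                        sha1_transform_avx2(digest, data, even);
 *        }
 */
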
#include <linux/linkage.h>

#define CTX     %rdi    /* arg1 */
#define BUF     %rsi    /* arg2 */
#define CNT     %rdx    /* arg3 */

#define xmm_mov           vmovups
#define avx2_zeroupper    vzeroupper
#define BLOCKS_CTR        %r8
#define BUFFER_PTR        %r10
#define BUFFER_PTR2       %r13
#define PRECALC_BUF       %r14
#define WY_TMP2           %ymm9
#define YMM_SHUFB_BSWAP   %ymm10
/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE          (80*2*2 +16)

#define WK(t)           ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)   ((t)*2*2)(PRECALC_BUF)
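
/*
 * Layout sketch for WK(t) (illustration only; wk_offset() is a hypothetical
 * helper, not used by this file): each 32-byte slot holds W[t..t+3]+K for
 * the first block in its low 16 bytes and the same four rounds of the
 * second block in its high 16 bytes.
 *
 *        static inline unsigned int wk_offset(unsigned int t)
 *        {
 *                return ((t % 80) / 4) * 32      // 32-byte slot per 4 rounds
 *                     + (t % 4) * 4              // dword within the slot
 *                     + (t / 80) * 16;           // +16 selects the 2nd block
 *        }
 *        // e.g. wk_offset(0) == 0, wk_offset(5) == 36, wk_offset(80) == 16
 */
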
.macro UPDATE_HASH hash, val

.macro PRECALC_RESET_WY

.macro PRECALC_ROTATE_WY
/* Define register aliases */
.set WY_minus_04, WY_04
.set WY_minus_08, WY_08
.set WY_minus_12, WY_12
.set WY_minus_16, WY_16
.set WY_minus_20, WY_20
.set WY_minus_24, WY_24
.set WY_minus_28, WY_28

        .if (i == 0) # Initialize and rotate registers
        /* message scheduling pre-compute for rounds 0-15 */
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
                vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2), WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu WY_TMP, PRECALC_WK(i&~7)
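
/*
 * Scalar sketch of the rounds 0-15 precalc above (illustration only;
 * precalc_00_15() is a hypothetical helper, get_unaligned_be32() is the
 * usual kernel accessor): for each of the two interleaved blocks the
 * stored value is the big-endian message word plus the rounds 0-19
 * constant (K1 below).
 *
 *        static void precalc_00_15(const u8 *block, u32 w[16], u32 wk[16])
 *        {
 *                int t;
 *
 *                for (t = 0; t < 16; t++) {
 *                        w[t]  = get_unaligned_be32(block + 4 * t); // vpshufb
 *                        wk[t] = w[t] + 0x5a827999;                 // vpaddd K1
 *                }
 *        }
 */
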
        /*
         * message scheduling pre-compute for rounds 16-31
         * calculating last 32 w[i] values in 8 XMM registers
         * pre-calculate K+w[i] values and store to mem
         * for later load by ALU add instruction
         *
         * "brute force" vectorization for rounds 16-31 only
         * due to w[i]->w[i-3] dependency
         */
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_16, WY_minus_12, WY
                vpsrldq  $4, WY_minus_04, WY_TMP        /* w[i-3] */
        .elseif ((i & 7) == 1)
                vpxor    WY_minus_08, WY, WY
                vpxor    WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpslldq  $12, WY, WY_TMP2
        .elseif ((i & 7) == 3)
                vpslld   $1, WY, WY_TMP
        .elseif ((i & 7) == 4)
                vpor     WY, WY_TMP, WY_TMP
                vpslld   $2, WY_TMP2, WY
        .elseif ((i & 7) == 5)
                vpsrld   $30, WY_TMP2, WY_TMP2
                vpxor    WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor    WY_TMP2, WY_TMP, WY
                vpaddd   K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu  WY_TMP, PRECALC_WK(i&~7)
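
/*
 * Scalar sketch of the rounds 16-31 precalc above (illustration only;
 * precalc_16_31() is a hypothetical helper, rol32() is the kernel rotate
 * helper): this is the textbook recurrence, which the vector code follows
 * four lanes at a time despite the w[i]->w[i-3] dependency.
 *
 *        static void precalc_16_31(u32 w[32], u32 wk[32])
 *        {
 *                int t;
 *
 *                for (t = 16; t < 32; t++) {
 *                        w[t]  = rol32(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1);
 *                        wk[t] = w[t] + (t < 20 ? 0x5a827999 : 0x6ed9eba1);
 *                }
 *        }
 */
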
        /*
         * in the SHA-1 specification:
         *      w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
         * we instead compute the equivalent:
         *      w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         * which allows more efficient vectorization,
         * since the w[i]->w[i-3] dependency is broken
         * (see the scalar C sketch after this block)
         */
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
        .elseif ((i & 7) == 1)
                /* W is W_minus_32 before xor */
                vpxor    WY_minus_28, WY, WY
        .elseif ((i & 7) == 2)
                vpxor    WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 3)
        .elseif ((i & 7) == 4)
                vpslld   $2, WY, WY_TMP
        .elseif ((i & 7) == 5)
        .elseif ((i & 7) == 7)
                vpaddd   K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu  WY_TMP, PRECALC_WK(i&~7)
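
/*
 * Scalar sketch of the rounds 32-79 precalc above (illustration only;
 * precalc_32_79() is a hypothetical helper), using the transformed
 * recurrence described in the comment before this block:
 *
 *        static void precalc_32_79(u32 w[80], u32 wk[80])
 *        {
 *                int t;
 *
 *                for (t = 32; t < 80; t++) {
 *                        u32 k = t < 40 ? 0x6ed9eba1 :
 *                                t < 60 ? 0x8f1bbcdc : 0xca62c1d6;
 *
 *                        w[t]  = rol32(w[t-6] ^ w[t-16] ^ w[t-28] ^ w[t-32], 2);
 *                        wk[t] = w[t] + k;
 *                }
 *        }
 */
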
/* Macro relies on saved ROUND_Fx */

        .elseif (\f == RND_F2)
        .elseif (\f == RND_F3)

        .set round_id, (\r % 80)

        .if (round_id == 0)     /* Precalculate F for first round */
        .set ROUND_FUNC, RND_F1
        rorx    $(32-30), B, B          /* b>>>2 */

        RND_FUN ROUND_FUNC, \r

        .set ROUND_FUNC, RND_F2
        .elseif (round_id == 38)
        .set ROUND_FUNC, RND_F3
        .elseif (round_id == 58)
        .set ROUND_FUNC, RND_F2

        .set round_id, ( (\r+1) % 80)

        RND_FUN ROUND_FUNC, (\r+1)
        andn    C, A, T1                /* ~b&d */
        lea     (RE,RTB), E             /* Add F from the previous round */

        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round
         * (b & c) ^ andn[b, d]
         */
        xor     T1, A                   /* F1 = (b&c) ^ (~b&d) */

        lea     (RE,RTA), E             /* E += A >>> 5 */
        lea     (RE,RTB), E             /* Add F from the previous round */

        /* Calculate F for the next round */
        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        .if ((round_id) < 79)
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        .if ((round_id) < 79)

        add     TA, E                   /* E += A >>> 5 */

        .if ((round_id) < 79)
        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        lea     (RE,RTB), E             /* Add F from the previous round */

        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        /*
         * Calculate F for the next round
         * (b and c) or (d and (b or c))
         */

        add     TA, E                   /* E += A >>> 5 */
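
/*
 * For reference (standard SHA-1; sketch only, sha1_f() is a hypothetical
 * helper): the three boolean functions selected by RND_F1/RND_F2/RND_F3
 * and their round ranges.
 *
 *        static u32 sha1_f(int t, u32 b, u32 c, u32 d)
 *        {
 *                if (t < 20)                     // RND_F1, constant K1
 *                        return (b & c) ^ (~b & d);
 *                if (t >= 40 && t < 60)          // RND_F3, constant K3
 *                        return (b & c) | (d & (b | c));
 *                return b ^ c ^ d;               // RND_F2, K2 (20-39) / K4 (60-79)
 *        }
 */
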
/*
 * Add the constant only if the (%2 >= %3) condition is met (uses RTA as temp):
 * %1 += (%2 >= %3) ? %4 : 0
 */
.macro ADD_IF_GE a, b, c, d
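
/*
 * C equivalent of ADD_IF_GE (sketch only): advance \a by \d when at least
 * \c blocks remain, i.e.
 *
 *        if (b >= c)
 *                a += d;
 *
 * e.g. "ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64" bumps the second-block
 * pointer by 64 bytes only when at least 2 blocks are left to process.
 */
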
/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

        mov     %rsp, PRECALC_BUF
        lea     (2*4*80+32)(%rsp), WK_BUF

        # Precalc WK for first 2 blocks
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64

        /* Go to next block if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg    WK_BUF, PRECALC_BUF
        /*
         * The code loops through more than one block; we use the K_BASE
         * value as a signal of the last block, set below by:
         * cmovae BUFFER_PTR, K_BASE
         */
        test    BLOCKS_CTR, BLOCKS_CTR

        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E
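
        /*
         * Sketch of the intended effect of the five updates above in C
         * (assuming UPDATE_HASH performs the standard SHA-1 "add working
         * variable into state" step; TB holds the b value here):
         *
         *        hash[0] += a; hash[1] += b; hash[2] += c;
         *        hash[3] += d; hash[4] += e;
         */
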
        test    BLOCKS_CTR, BLOCKS_CTR

        /* Process second block */
        /*
         * rounds
         * 0+80, 2+80, 4+80, 6+80, 8+80
         * 10+80,12+80,14+80,16+80,18+80
         */

        /*
         * rounds
         * 20+80,22+80,24+80,26+80,28+80
         * 30+80,32+80,34+80,36+80,38+80
         */

        /*
         * rounds
         * 40+80,42+80,44+80,46+80,48+80
         * 50+80,52+80,54+80,56+80,58+80
         */

        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

        /*
         * rounds
         * 60+80,62+80,64+80,66+80,68+80
         * 70+80,72+80,74+80,76+80,78+80
         */
        UPDATE_HASH (HASH_PTR), A
        UPDATE_HASH 4(HASH_PTR), TB
        UPDATE_HASH 8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        /* Reset state for AVX2 reg permutation */

        xchg    WK_BUF, PRECALC_BUF
/*
 * macro implements the SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name

        RESERVE_STACK = (W_SIZE*4 + 8+24)

        sub     $RESERVE_STACK, %rsp

        /* Set up initial values */

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        add     $RESERVE_STACK, %rsp
#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

SHA1_VECTOR_ASM sha1_transform_avx2