arch/x86/crypto/aesni-intel_asm.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Implement AES algorithm in Intel AES-NI instructions.
   4  *
   5  * The white paper of AES-NI instructions can be downloaded from:
   6  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   7  *
   8  * Copyright (C) 2008, Intel Corp.
   9  *    Author: Huang Ying <ying.huang@intel.com>
  10  *            Vinodh Gopal <vinodh.gopal@intel.com>
  11  *            Kahraman Akdemir
  12  *
  13  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14  * interface for 64-bit kernels.
  15  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  16  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  17  *             Adrian Hoban <adrian.hoban@intel.com>
  18  *             James Guilford (james.guilford@intel.com)
  19  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  20  *             Tadeusz Struk (tadeusz.struk@intel.com)
  21  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  22  *    Copyright (c) 2010, Intel Corporation.
  23  *
  24  * Ported x86_64 version to x86:
  25  *    Author: Mathias Krause <minipli@googlemail.com>
  26  */
  27
  28 #include <linux/linkage.h>
  29 #include <asm/inst.h>
  30 #include <asm/frame.h>
  31 #include <asm/nospec-branch.h>
  32
  33 /*
  34  * The following macros are used to move an (un)aligned 16 byte value to/from
  35  * an XMM register.  This can be done for either FP or integer values, for FP use
  36  * movaps (move aligned packed single) or integer use movdqa (move double quad
  37  * aligned).  It doesn't make a performance difference which instruction is used
  38  * since Nehalem (original Core i7) was released.  However, the movaps is a byte
  39  * shorter, so that is the one we'll use for now. (same for unaligned).
  40  */
  41 #define MOVADQ  movaps          /* 16-byte move, memory operand must be aligned */
  42 #define MOVUDQ  movups          /* 16-byte move, memory operand may be unaligned */
  43
  44 #ifdef __x86_64__
  45
  46 # constants in mergeable sections, linker can reorder and merge
     # Each constant below is one 16-byte value in its own 16-byte-aligned
     # mergeable section, so it can be loaded with an aligned XMM move.
  47 .section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
  48 .align 16
  49 .Lgf128mul_x_ble_mask:
  50         .octa 0x00000000000000010000000000000087
     # POLY and TWOONE are used together by PRECOMPUTE when reducing HashKey<<1.
  51 .section        .rodata.cst16.POLY, "aM", @progbits, 16
  52 .align 16
  53 POLY:   .octa 0xC2000000000000000000000000000001
  54 .section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  55 .align 16
  56 TWOONE: .octa 0x00000001000000000000000000000001
  57
     # SHUF_MASK is the PSHUFB control that reverses the 16 bytes of a register.
  58 .section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  59 .align 16
  60 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  61 .section        .rodata.cst16.MASK1, "aM", @progbits, 16
  62 .align 16
  63 MASK1:      .octa 0x0000000000000000ffffffffffffffff
  64 .section        .rodata.cst16.MASK2, "aM", @progbits, 16
  65 .align 16
  66 MASK2:      .octa 0xffffffffffffffff0000000000000000
     # ONE is added to the counter block with paddd to step the low counter dword.
  67 .section        .rodata.cst16.ONE, "aM", @progbits, 16
  68 .align 16
  69 ONE:        .octa 0x00000000000000000000000000000001
     # NOTE(review): the F_MIN_MASK literal has 31 hex digits, so its top nibble
     # is implicitly zero; confirm that this is the intended value.
  70 .section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  71 .align 16
  72 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  73 .section        .rodata.cst16.dec, "aM", @progbits, 16
  74 .align 16
  75 dec:        .octa 0x1
  76 .section        .rodata.cst16.enc, "aM", @progbits, 16
  77 .align 16
  78 enc:        .octa 0x2
  79
  80 # order of these constants should not change.
  81 # more specifically, ALL_F should follow SHIFT_MASK,
  82 # and zero should follow ALL_F
     # The GCM macros form addresses such as SHIFT_MASK+16-n, ALL_F+16-n and
     # ALL_F-SHIFT_MASK(reg), so unaligned 16-byte loads deliberately straddle
     # two neighbouring constants.  That is why these three live together in
     # plain .rodata instead of separate mergeable sections.
  83 .section        .rodata, "a", @progbits
  84 .align 16
  85 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  86 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  87             .octa 0x00000000000000000000000000000000
  88
  89 .text
  90
  91
     /*
      * Bytes pushed by FUNC_SAVE (three 8-byte registers).  The stack-passed
      * arguments arg7..arg11 below are addressed relative to %rsp after those
      * pushes; presumably the additional +8 skips the return address.
      */
  92 #define STACK_OFFSET    8*3
  93
     /*
      * Byte offsets into the context block that the GCM macros address through
      * %arg2 (for example AadHash(%arg2)).  Most fields are 16 bytes wide;
      * AadLen and InLen share one 16-byte slot as two 8-byte values.
      */
  94 #define AadHash 16*0
  95 #define AadLen 16*1
  96 #define InLen (16*1)+8
  97 #define PBlockEncKey 16*2
  98 #define OrigIV 16*3
  99 #define CurCount 16*4
 100 #define PBlockLen 16*5
 101 #define HashKey         16*6    // store HashKey <<1 mod poly here
 102 #define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
 103 #define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
 104 #define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
 105 #define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
 106                                 // bits of  HashKey <<1 mod poly here
 107                                 //(for Karatsuba purposes)
 108 #define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
 109                                 // bits of  HashKey^2 <<1 mod poly here
 110                                 // (for Karatsuba purposes)
 111 #define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
 112                                 // bits of  HashKey^3 <<1 mod poly here
 113                                 // (for Karatsuba purposes)
 114 #define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
 115                                 // bits of  HashKey^4 <<1 mod poly here
 116                                 // (for Karatsuba purposes)
 117
     /*
      * Argument names.  arg1..arg6 are bare register names (written as %arg1
      * and so on at the use sites); arg7..arg11 are complete stack operands.
      */
 118 #define arg1 rdi
 119 #define arg2 rsi
 120 #define arg3 rdx
 121 #define arg4 rcx
 122 #define arg5 r8
 123 #define arg6 r9
 124 #define arg7 STACK_OFFSET+8(%rsp)
 125 #define arg8 STACK_OFFSET+16(%rsp)
 126 #define arg9 STACK_OFFSET+24(%rsp)
 127 #define arg10 STACK_OFFSET+32(%rsp)
 128 #define arg11 STACK_OFFSET+40(%rsp)
     /*
      * Memory operand at byte offset 2*15*16 from the pointer in arg1.
      * NOTE(review): presumably the key-length field that follows two
      * 15-entry round-key arrays in the AES context; confirm against the
      * C structure definition.
      */
 129 #define keysize 2*15*16(%arg1)
 130 #endif
 131
 132
     /*
      * Symbolic register names.  None of them is referenced by the GCM macros
      * in the part of the file reviewed here; presumably they serve the plain
      * AES routines later in the file.  STATE and IN are shorthands for
      * STATE1 and IN1.
      */
 133 #define STATE1  %xmm0
 134 #define STATE2  %xmm4
 135 #define STATE3  %xmm5
 136 #define STATE4  %xmm6
 137 #define STATE   STATE1
 138 #define IN1     %xmm1
 139 #define IN2     %xmm7
 140 #define IN3     %xmm8
 141 #define IN4     %xmm9
 142 #define IN      IN1
 143 #define KEY     %xmm2
 144 #define IV      %xmm3
 145
 146 #define BSWAP_MASK %xmm10
 147 #define CTR     %xmm11
 148 #define INC     %xmm12
 149
     /* Same register as BSWAP_MASK, so the two cannot be live at the same time. */
 150 #define GF128MUL_MASK %xmm10
 151
     /* General-purpose aliases: 64-bit build first, 32-bit build after the #else. */
 152 #ifdef __x86_64__
 153 #define AREG    %rax
 154 #define KEYP    %rdi
 155 #define OUTP    %rsi
 156 #define UKEYP   OUTP
 157 #define INP     %rdx
 158 #define LEN     %rcx
 159 #define IVP     %r8
 160 #define KLEN    %r9d
 161 #define T1      %r10
 162 #define TKEYP   T1
 163 #define T2      %r11
 164 #define TCTR_LOW T2
 165 #else
     /* On 32-bit, OUTP (and therefore UKEYP) shares a register with AREG. */
 166 #define AREG    %eax
 167 #define KEYP    %edi
 168 #define OUTP    AREG
 169 #define UKEYP   OUTP
 170 #define INP     %edx
 171 #define LEN     %esi
 172 #define IVP     %ebp
 173 #define KLEN    %ebx
 174 #define T1      %ecx
 175 #define TKEYP   T1
 176 #endif
 177
 178 .macro FUNC_SAVE
     # Push r12, r13 and r14.  These three 8-byte pushes are the 8*3 bytes
     # that STACK_OFFSET accounts for when arg7..arg11 are addressed off %rsp.
     # FUNC_RESTORE pops them in the opposite order.
 179         push    %r12
 180         push    %r13
 181         push    %r14
 182 #
 183 # states of %xmm registers %xmm6:%xmm15 not saved
 184 # all %xmm registers are clobbered
 185 #
 186 .endm
 187
 188
 189 .macro FUNC_RESTORE
     # Undo FUNC_SAVE: pop r14, r13, r12 (reverse of the push order).
 190         pop     %r14
 191         pop     %r13
 192         pop     %r12
 193 .endm
 194
 195 # Precompute hashkeys.
 196 # Input: Hash subkey.
 197 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
 198 # once per key.
 199 # clobbers r12, and tmp xmm registers.
     #
     # SUBKEY     operand holding a pointer to the 16-byte hash subkey
     # TMP1-TMP7  scratch XMM registers, all overwritten
     # Stores HashKey..HashKey_4 and HashKey_k..HashKey_4_k at their offsets
     # from the context pointer in arg2.
 200 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 201         mov     \SUBKEY, %r12
 202         movdqu  (%r12), \TMP3
 203         movdqa  SHUF_MASK(%rip), \TMP2
 204         PSHUFB_XMM \TMP2, \TMP3         # byte-reverse the subkey
 205
 206         # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 207
         # 128-bit left shift by one built from two 64-bit shifts: the bit that
         # leaves the low qword is moved into the high qword through TMP2, and
         # the bit that leaves the high qword is kept in TMP1 for the reduction.
 208         movdqa  \TMP3, \TMP2
 209         psllq   $1, \TMP3
 210         psrlq   $63, \TMP2
 211         movdqa  \TMP2, \TMP1
 212         pslldq  $8, \TMP2
 213         psrldq  $8, \TMP1
 214         por     \TMP2, \TMP3
 215
 216         # reduce HashKey<<1
 217
         # Dwords 0 and 3 of TMP2 receive the shifted-out bit and are compared
         # with the ones in TWOONE; dwords 1 and 2 always compare equal but POLY
         # is zero there.  POLY is therefore XORed in only when that bit was 1.
 218         pshufd  $0x24, \TMP1, \TMP2
 219         pcmpeqd TWOONE(%rip), \TMP2
 220         pand    POLY(%rip), \TMP2
 221         pxor    \TMP2, \TMP3
 222         movdqu  \TMP3, HashKey(%arg2)
 223
         # pshufd $78 swaps the two qwords, so the pxor leaves high^low in both
         # halves of TMP1; this is the Karatsuba helper value for each power.
 224         movdqa     \TMP3, \TMP5
 225         pshufd     $78, \TMP3, \TMP1
 226         pxor       \TMP3, \TMP1
 227         movdqu     \TMP1, HashKey_k(%arg2)
 228
 229         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 230 # TMP5 = HashKey^2<<1 (mod poly)
 231         movdqu     \TMP5, HashKey_2(%arg2)
 232 # HashKey_2 = HashKey^2<<1 (mod poly)
 233         pshufd     $78, \TMP5, \TMP1
 234         pxor       \TMP5, \TMP1
 235         movdqu     \TMP1, HashKey_2_k(%arg2)
 236
 237         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 238 # TMP5 = HashKey^3<<1 (mod poly)
 239         movdqu     \TMP5, HashKey_3(%arg2)
 240         pshufd     $78, \TMP5, \TMP1
 241         pxor       \TMP5, \TMP1
 242         movdqu     \TMP1, HashKey_3_k(%arg2)
 243
 244         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 245 # TMP5 = HashKey^4<<1 (mod poly)
 246         movdqu     \TMP5, HashKey_4(%arg2)
 247         pshufd     $78, \TMP5, \TMP1
 248         pxor       \TMP5, \TMP1
 249         movdqu     \TMP1, HashKey_4_k(%arg2)
 250 .endm
 251
 252 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 253 # Clobbers rax, r10-r13 and xmm0-xmm7, xmm13, xmm14
     # (xmm7 is handed to PRECOMPUTE as a scratch register and xmm14 is loaded
     # by CALC_AAD_HASH).
     #
     # Iv      operand holding a pointer to the 16-byte initial counter block
     # SUBKEY  operand holding a pointer to the hash subkey
     # AAD     operand holding a pointer to the additional authenticated data
     # AADLEN  operand holding the AAD length in bytes
     # The context block is addressed through arg2.
 254 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 255         mov \AADLEN, %r11
 256         mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 257         xor %r11d, %r11d
 258         mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 259         mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 260         mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
 261         mov \Iv, %rax
 262         movdqu (%rax), %xmm0
 263         movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 264
 265         movdqa  SHUF_MASK(%rip), %xmm2
 266         PSHUFB_XMM %xmm2, %xmm0
 267         movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = byte-reversed iv
 268
 269         PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
 270         movdqu HashKey(%arg2), %xmm13
 271
         # Hash the AAD with the freshly stored HashKey; the result is written
         # to AadHash in the context by CALC_AAD_HASH itself.
 272         CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 273         %xmm4, %xmm5, %xmm6
 274 .endm
 275
 276 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 277 # struct has been initialized by GCM_INIT.
 278 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 279 # Clobbers rax, r10-r13, and xmm0-xmm15
     #
     # operation  literal enc or dec; selects the .ifc branches and the
     #            GHASH_4_ENCRYPT_4_PARALLEL_ variant
     # Register roles as used below:
     #   arg1  AES key context (round keys from offset 0, plus the keysize field)
     #   arg2  context block addressed through the AadHash..HashKey_4_k offsets
     #   arg3  output buffer, arg4 input buffer, both indexed by r11
     #   arg5  number of bytes to process in this call
     #   r11   running byte offset into the input and output buffers
     #   r13   bytes still to be processed by the whole-block code
     #   xmm8  running GHASH value, xmm0 current counter block
 280 .macro GCM_ENC_DEC operation
 281         movdqu AadHash(%arg2), %xmm8
 282         movdqu HashKey(%arg2), %xmm13
 283         add %arg5, InLen(%arg2)         # total input length += this call
 284
 285         xor %r11d, %r11d # initialise the data pointer offset as zero
         # Complete a partial block left over from a previous call, if any;
         # r11 comes back advanced by the number of bytes consumed.
 286         PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 287
 288         sub %r11, %arg5         # sub partial block data used
 289         mov %arg5, %r13         # save the number of bytes
 290
 291         and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 292         mov %r13, %r12
 293         # Encrypt/Decrypt first few blocks
 294
         # r12 = 16 * (number of whole blocks mod 4): that many blocks are done
         # one by one first so the main loop always sees a multiple of four.
 295         and     $(3<<4), %r12
 296         jz      _initial_num_blocks_is_0_\@
 297         cmp     $(2<<4), %r12
 298         jb      _initial_num_blocks_is_1_\@
 299         je      _initial_num_blocks_is_2_\@
 300 _initial_num_blocks_is_3_\@:
 301         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 303         sub     $48, %r13
 304         jmp     _initial_blocks_\@
 305 _initial_num_blocks_is_2_\@:
 306         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 308         sub     $32, %r13
 309         jmp     _initial_blocks_\@
 310 _initial_num_blocks_is_1_\@:
 311         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 313         sub     $16, %r13
 314         jmp     _initial_blocks_\@
 315 _initial_num_blocks_is_0_\@:
 316         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 317 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 318 _initial_blocks_\@:
 319
 320         # Main loop - Encrypt/Decrypt remaining blocks
 321
         # r13 is now a multiple of 64.  Nothing left: skip everything.  Exactly
         # 64 left: skip the loop and go straight to GHASH_LAST_4.
 322         cmp     $0, %r13
 323         je      _zero_cipher_left_\@
 324         sub     $64, %r13
 325         je      _four_cipher_left_\@
 326 _crypt_by_4_\@:
         # NOTE(review): the last macro argument is the literal enc for both
         # operations; the variant is chosen through the macro name suffix.
 327         GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 328         %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 329         %xmm7, %xmm8, enc
 330         add     $64, %r11
 331         sub     $64, %r13
 332         jne     _crypt_by_4_\@
 333 _four_cipher_left_\@:
 334         GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 335 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 336 _zero_cipher_left_\@:
 337         movdqu %xmm8, AadHash(%arg2)
 338         movdqu %xmm0, CurCount(%arg2)
 339
 340         mov     %arg5, %r13
 341         and     $15, %r13                       # %r13 = arg5 (mod 16)
 342         je      _multiple_of_16_bytes_\@
 343
         # Remember the trailing partial-block length so that a later call to
         # PARTIAL_BLOCK or GCM_COMPLETE can finish this block.
 344         mov %r13, PBlockLen(%arg2)
 345
 346         # Handle the last <16 Byte block separately
 347         paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 348         movdqu %xmm0, CurCount(%arg2)
 349         movdqa SHUF_MASK(%rip), %xmm10
 350         PSHUFB_XMM %xmm10, %xmm0
 351
 352         ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 353         movdqu %xmm0, PBlockEncKey(%arg2)
 354
         # With fewer than 16 bytes in total a 16-byte load would read outside
         # the buffer, so the tail is gathered byte-wise by READ_PARTIAL_BLOCK.
 355         cmp     $16, %arg5
 356         jge _large_enough_update_\@
 357
 358         lea (%arg4,%r11,1), %r10
 359         mov %r13, %r12
 360         READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 361         jmp _data_read_\@
 362
 363 _large_enough_update_\@:
         # Temporarily move r11 back so that one 16-byte load ends exactly at
         # the last input byte, then restore r11 afterwards.
 364         sub     $16, %r11
 365         add     %r13, %r11
 366
 367         # receive the last <16 Byte block
 368         movdqu  (%arg4, %r11, 1), %xmm1
 369
 370         sub     %r13, %r11
 371         add     $16, %r11
 372
 373         lea     SHIFT_MASK+16(%rip), %r12
 374         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 375         # (r13 is the number of bytes in plaintext mod 16)
 376         sub     %r13, %r12
 377         # get the appropriate shuffle mask
 378         movdqu  (%r12), %xmm2
 379         # shift right 16-r13 bytes
 380         PSHUFB_XMM  %xmm2, %xmm1
 381
 382 _data_read_\@:
         # r12 -> 16 bytes that start r13 bytes before the end of ALL_F: r13
         # bytes of 0xff followed by zero bytes from the constant after ALL_F.
 383         lea ALL_F+16(%rip), %r12
 384         sub %r13, %r12
 385
 386 .ifc \operation, dec
         # keep the ciphertext: GHASH is computed over ciphertext, not plaintext
 387         movdqa  %xmm1, %xmm2
 388 .endif
 389         pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 390         movdqu  (%r12), %xmm1
 391         # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 392         pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 393 .ifc \operation, dec
 394         pand    %xmm1, %xmm2
 395         movdqa SHUF_MASK(%rip), %xmm10
 396         PSHUFB_XMM %xmm10 ,%xmm2
 397
 398         pxor %xmm2, %xmm8
 399 .else
 400         movdqa SHUF_MASK(%rip), %xmm10
 401         PSHUFB_XMM %xmm10,%xmm0
 402
 403         pxor    %xmm0, %xmm8
 404 .endif
 405
         # The partial block is only XORed into the hash here; the GHASH
         # multiplication for it is done later, once the block is complete
         # (PARTIAL_BLOCK) or the message ends (GCM_COMPLETE).
 406         movdqu %xmm8, AadHash(%arg2)
 407 .ifc \operation, enc
 408         # xmm0 was byte-reversed above to feed the hash
 409         movdqa SHUF_MASK(%rip), %xmm10
 410         # shuffle xmm0 back to output as ciphertext
 411         PSHUFB_XMM %xmm10, %xmm0
 412 .endif
 413
 414         # Output %r13 bytes
         # More than 8 bytes: store the low qword at once, then continue with
         # the high qword.  The remaining 1..8 bytes are stored one at a time.
 415         MOVQ_R64_XMM %xmm0, %rax
 416         cmp $8, %r13
 417         jle _less_than_8_bytes_left_\@
 418         mov %rax, (%arg3 , %r11, 1)
 419         add $8, %r11
 420         psrldq $8, %xmm0
 421         MOVQ_R64_XMM %xmm0, %rax
 422         sub $8, %r13
 423 _less_than_8_bytes_left_\@:
 424         mov %al,  (%arg3, %r11, 1)
 425         add $1, %r11
 426         shr $8, %rax
 427         sub $1, %r13
 428         jne _less_than_8_bytes_left_\@
 429 _multiple_of_16_bytes_\@:
 430 .endm
 431
 432 # GCM_COMPLETE Finishes update of tag of last partial block
 433 # Output: Authorization Tag (AUTH_TAG)
 434 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
     #
     # AUTHTAG     operand holding a pointer to the tag output buffer
     # AUTHTAGLEN  operand holding the tag length in bytes
     # The context block is addressed through arg2.
     #
     # NOTE(review): the tag is stored in 16, 8, 4, 2 and 1 byte pieces.  For
     # lengths 1-3 and 9-11 the 4-byte store at _T_4_ would write more bytes
     # than requested; callers presumably never pass those lengths - confirm.
 435 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 436         movdqu AadHash(%arg2), %xmm8
 437         movdqu HashKey(%arg2), %xmm13
 438
 439         mov PBlockLen(%arg2), %r12
 440
         # A pending partial block was XORed into the hash but not multiplied
         # yet; do that multiplication now.
 441         cmp $0, %r12
 442         je _partial_done\@
 443
 444         GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 445
 446 _partial_done\@:
 447         mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 448         shl     $3, %r12                  # convert into number of bits
         # Move all 64 bits of len(A), exactly as is done for len(C) below.  A
         # 32-bit movd would drop the upper half of the bit count for very
         # large AAD lengths.
 449         MOVQ_R64_XMM    %r12, %xmm15      # len(A) in %xmm15
 450         mov InLen(%arg2), %r12
 451         shl     $3, %r12                  # len(C) in bits (*128)
 452         MOVQ_R64_XMM    %r12, %xmm1
 453
 454         pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 455         pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 456         pxor    %xmm15, %xmm8
 457         GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 458         # final GHASH computation
 459         movdqa SHUF_MASK(%rip), %xmm10
 460         PSHUFB_XMM %xmm10, %xmm8
 461
 462         movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 463         ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 464         pxor    %xmm8, %xmm0              # tag = E(K, Y0) XOR GHASH
 465 _return_T_\@:
 466         mov     \AUTHTAG, %r10                     # %r10 = authTag
 467         mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
 468         cmp     $16, %r11
 469         je      _T_16_\@
 470         cmp     $8, %r11
 471         jl      _T_4_\@
 472 _T_8_\@:
 473         MOVQ_R64_XMM    %xmm0, %rax
 474         mov     %rax, (%r10)
 475         add     $8, %r10
 476         sub     $8, %r11
 477         psrldq  $8, %xmm0
 478         cmp     $0, %r11
 479         je      _return_T_done_\@
 480 _T_4_\@:
 481         movd    %xmm0, %eax
 482         mov     %eax, (%r10)
 483         add     $4, %r10
 484         sub     $4, %r11
 485         psrldq  $4, %xmm0
 486         cmp     $0, %r11
 487         je      _return_T_done_\@
 488 _T_123_\@:
         # 1 to 3 bytes remain: store a word when at least two are left, then
         # one more byte when the count is odd.
 489         movd    %xmm0, %eax
 490         cmp     $2, %r11
 491         jl      _T_1_\@
 492         mov     %ax, (%r10)
 493         cmp     $2, %r11
 494         je      _return_T_done_\@
 495         add     $2, %r10
 496         sar     $16, %eax
 497 _T_1_\@:
 498         mov     %al, (%r10)
 499         jmp     _return_T_done_\@
 500 _T_16_\@:
 501         movdqu  %xmm0, (%r10)
 502 _return_T_done_\@:
 503 .endm
 504
 505 #ifdef __x86_64__
 506 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 507 *
 508 *
 509 * Input: A and B (128-bits each, bit-reflected)
 510 * Output: C = A*B*x mod poly, (i.e. >>1 )
 511 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 512 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 513 *
     * Operands:
     *   GH         in: first factor, out: the reduced product
     *   HK         second factor, left unchanged
     *   TMP1-TMP5  scratch XMM registers, all overwritten
     * The 256-bit product is formed with three carry-less multiplications
     * (Karatsuba) and then reduced in two phases of shifts and XORs.
 514 */
 515 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 516         movdqa    \GH, \TMP1
 517         pshufd    $78, \GH, \TMP2
 518         pshufd    $78, \HK, \TMP3
 519         pxor      \GH, \TMP2            # TMP2 = a1+a0
 520         pxor      \HK, \TMP3            # TMP3 = b1+b0
 521         PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
 522         PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
 523         PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 524         pxor      \GH, \TMP2
 525         pxor      \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
 526         movdqa    \TMP2, \TMP3
 527         pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 528         psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 529         pxor      \TMP3, \GH
 530         pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 531
 532         # first phase of the reduction
 533
 534         movdqa    \GH, \TMP2
 535         movdqa    \GH, \TMP3
 536         movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
 537                                         # in order to perform
 538                                         # independent shifts
 539         pslld     $31, \TMP2            # packed left shift <<31
 540         pslld     $30, \TMP3            # packed left shift <<30
 541         pslld     $25, \TMP4            # packed left shift <<25
 542         pxor      \TMP3, \TMP2          # xor the shifted versions
 543         pxor      \TMP4, \TMP2
 544         movdqa    \TMP2, \TMP5
 545         psrldq    $4, \TMP5             # right shift TMP5 1 DW
 546         pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 547         pxor      \TMP2, \GH
 548
 549         # second phase of the reduction
 550
 551         movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
 552                                         # in order to perform
 553                                         # independent shifts
 554         movdqa    \GH,\TMP3
 555         movdqa    \GH,\TMP4
 556         psrld     $1,\TMP2              # packed right shift >>1
 557         psrld     $2,\TMP3              # packed right shift >>2
 558         psrld     $7,\TMP4              # packed right shift >>7
 559         pxor      \TMP3,\TMP2           # xor the shifted versions
 560         pxor      \TMP4,\TMP2
 561         pxor      \TMP5, \TMP2
 562         pxor      \TMP2, \GH
 563         pxor      \TMP1, \GH            # result is in GH
 564 .endm
 565
 566 # Reads DLEN bytes starting at DPTR and stores in XMMDst
 567 # where 0 < DLEN < 16
 568 # Clobbers %rax, DLEN and XMM1
     #
     # DPTR    register holding the source address
     # DLEN    register holding the byte count; counted down to zero here
     # XMM1    scratch XMM register, written only when DLEN > 8
     # XMMDst  destination XMM register
     # Only bytes in DPTR .. DPTR+DLEN-1 are read.  A count of 8 or more is
     # served by one qword load plus single-byte loads; a smaller count by
     # single-byte loads alone.  Bytes are fetched from the highest address
     # down while rax is shifted left, so the lowest address lands in the
     # lowest byte.
     # NOTE(review): the bytes of XMMDst above DLEN are presumably zeroed by
     # MOVQ_R64_XMM; that macro is defined elsewhere - confirm.
 569 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 570         cmp $8, \DLEN
 571         jl _read_lt8_\@
 572         mov (\DPTR), %rax
 573         MOVQ_R64_XMM %rax, \XMMDst
 574         sub $8, \DLEN
 575         jz _done_read_partial_block_\@
 576         xor %eax, %eax
 577 _read_next_byte_\@:
 578         shl $8, %rax
 579         mov 7(\DPTR, \DLEN, 1), %al     # byte at DPTR + 8 + (DLEN-1)
 580         dec \DLEN
 581         jnz _read_next_byte_\@
 582         MOVQ_R64_XMM %rax, \XMM1
 583         pslldq $8, \XMM1                # move into the high qword
 584         por \XMM1, \XMMDst
 585         jmp _done_read_partial_block_\@
 586 _read_lt8_\@:
 587         xor %eax, %eax
 588 _read_next_byte_lt8_\@:
 589         shl $8, %rax
 590         mov -1(\DPTR, \DLEN, 1), %al    # byte at DPTR + (DLEN-1)
 591         dec \DLEN
 592         jnz _read_next_byte_lt8_\@
 593         MOVQ_R64_XMM %rax, \XMMDst
 594 _done_read_partial_block_\@:
 595 .endm
 596
 597 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 598 # clobbers r10-11, xmm14
     #
     # HASHKEY    XMM register holding the hash key passed on to GHASH_MUL
     # AAD        operand holding a pointer to the AAD
     # AADLEN     operand holding the AAD length in bytes
     # TMP1-TMP7  scratch XMM registers; TMP6 accumulates the hash
     # The result is stored at AadHash in the context addressed through arg2.
 599 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 600         TMP6 TMP7
 601         MOVADQ     SHUF_MASK(%rip), %xmm14
 602         mov        \AAD, %r10           # %r10 = AAD
 603         mov        \AADLEN, %r11                # %r11 = aadLen
 604         pxor       \TMP7, \TMP7
 605         pxor       \TMP6, \TMP6
 606
 607         cmp        $16, %r11
 608         jl         _get_AAD_rest\@
     # Whole 16-byte blocks: hash = (hash XOR block) * HASHKEY
 609 _get_AAD_blocks\@:
 610         movdqu     (%r10), \TMP7
 611         PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
 612         pxor       \TMP7, \TMP6
 613         GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 614         add        $16, %r10
 615         sub        $16, %r11
 616         cmp        $16, %r11
 617         jge        _get_AAD_blocks\@
 618
 619         movdqu     \TMP6, \TMP7
 620
 621         /* read the last <16B of AAD */
 622 _get_AAD_rest\@:
 623         cmp        $0, %r11
 624         je         _get_AAD_done\@
 625
         # READ_PARTIAL_BLOCK gathers the tail without reading past the end of
         # the AAD; it overwrites TMP7 and counts r11 down to zero.
 626         READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 627         PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
 628         pxor       \TMP6, \TMP7
 629         GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 630         movdqu \TMP7, \TMP6
 631
 632 _get_AAD_done\@:
 633         movdqu \TMP6, AadHash(%arg2)
 634 .endm
 635
 636 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 637 # between update calls.
 638 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 639 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
 640 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
     #
     # CYPH_PLAIN_OUT  register holding the output buffer pointer
     # PLAIN_CYPH_IN   register holding the input buffer pointer
     # PLAIN_CYPH_LEN  register holding the input length in bytes
     # DATA_OFFSET     register holding the buffer offset; advanced by the
     #                 number of bytes written here
     # AAD_HASH        XMM register holding the running hash; updated and also
     #                 stored to AadHash in the context
     # operation       literal enc or dec
     # Does nothing when PBlockLen in the context (addressed through arg2) is 0.
 641 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 642         AAD_HASH operation
 643         mov     PBlockLen(%arg2), %r13
 644         cmp     $0, %r13
 645         je      _partial_block_done_\@  # Leave Macro if no partial blocks
 646         # Read in input data without over reading
 647         cmp     $16, \PLAIN_CYPH_LEN
 648         jl      _fewer_than_16_bytes_\@
 649         movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
 650         jmp     _data_read_\@
 651
 652 _fewer_than_16_bytes_\@:
 653         lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 654         mov     \PLAIN_CYPH_LEN, %r12
 655         READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 656
 657         mov PBlockLen(%arg2), %r13
 658
 659 _data_read_\@:                          # Finished reading in data
 660
 661         movdqu  PBlockEncKey(%arg2), %xmm9
 662         movdqu  HashKey(%arg2), %xmm13
 663
 664         lea     SHIFT_MASK(%rip), %r12
 665
 666         # adjust the shuffle mask pointer to be able to shift r13 bytes
 667         # (16-r13 is the number of bytes in plaintext mod 16)
 668         add     %r13, %r12
 669         movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
 670         PSHUFB_XMM %xmm2, %xmm9         # shift right r13 bytes
 671
 672 .ifc \operation, dec
         # keep the ciphertext in xmm3: it is what gets hashed when decrypting
 673         movdqa  %xmm1, %xmm3
 674         pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)
 675
 676         mov     \PLAIN_CYPH_LEN, %r10
 677         add     %r13, %r10
 678         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 679         sub     $16, %r10
 680         # Determine if partial block is not being filled and
 681         # shift mask accordingly
 682         jge     _no_extra_mask_1_\@
 683         sub     %r10, %r12
 684 _no_extra_mask_1_\@:
 685
 686         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 687         # get the appropriate mask to mask out bottom r13 bytes of xmm9
 688         pand    %xmm1, %xmm9            # mask out bottom r13 bytes of xmm9
 689
 690         pand    %xmm1, %xmm3
 691         movdqa  SHUF_MASK(%rip), %xmm10
 692         PSHUFB_XMM      %xmm10, %xmm3
 693         PSHUFB_XMM      %xmm2, %xmm3
 694         pxor    %xmm3, \AAD_HASH
 695
         # r10 < 0 means the block is still not full after this input
 696         cmp     $0, %r10
 697         jl      _partial_incomplete_1_\@
 698
 699         # GHASH computation for the last <16 Byte block
 700         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 701         xor     %eax, %eax
 702
 703         mov     %rax, PBlockLen(%arg2)  # block completed, nothing pending
 704         jmp     _dec_done_\@
 705 _partial_incomplete_1_\@:
 706         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 707 _dec_done_\@:
 708         movdqu  \AAD_HASH, AadHash(%arg2)
 709 .else
 710         pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
 711
 712         mov     \PLAIN_CYPH_LEN, %r10
 713         add     %r13, %r10
 714         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 715         sub     $16, %r10
 716         # Determine if partial block is not being filled and
 717         # shift mask accordingly
 718         jge     _no_extra_mask_2_\@
 719         sub     %r10, %r12
 720 _no_extra_mask_2_\@:
 721
 722         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 723         # get the appropriate mask to mask out bottom r13 bytes of xmm9
 724         pand    %xmm1, %xmm9
 725
 726         movdqa  SHUF_MASK(%rip), %xmm1
 727         PSHUFB_XMM %xmm1, %xmm9
 728         PSHUFB_XMM %xmm2, %xmm9
 729         pxor    %xmm9, \AAD_HASH
 730
         # r10 < 0 means the block is still not full after this input
 731         cmp     $0, %r10
 732         jl      _partial_incomplete_2_\@
 733
 734         # GHASH computation for the last <16 Byte block
 735         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 736         xor     %eax, %eax
 737
 738         mov     %rax, PBlockLen(%arg2)  # block completed, nothing pending
 739         jmp     _encode_done_\@
 740 _partial_incomplete_2_\@:
 741         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 742 _encode_done_\@:
 743         movdqu  \AAD_HASH, AadHash(%arg2)
 744
 745         movdqa  SHUF_MASK(%rip), %xmm10
 746         # shuffle xmm9 back to output as ciphertext
 747         PSHUFB_XMM      %xmm10, %xmm9
 748         PSHUFB_XMM      %xmm2, %xmm9
 749 .endif
 750         # output encrypted Bytes
 751         cmp     $0, %r10
 752         jl      _partial_fill_\@
 753         mov     %r13, %r12
 754         mov     $16, %r13
 755         # Set r13 to be the number of bytes to write out
 756         sub     %r12, %r13
 757         jmp     _count_set_\@
 758 _partial_fill_\@:
 759         mov     \PLAIN_CYPH_LEN, %r13
 760 _count_set_\@:
         # More than 8 bytes: store the low qword at once, then continue with
         # the high qword.  The remaining 1..8 bytes are stored one at a time.
 761         movdqa  %xmm9, %xmm0
 762         MOVQ_R64_XMM    %xmm0, %rax
 763         cmp     $8, %r13
 764         jle     _less_than_8_bytes_left_\@
 765
 766         mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 767         add     $8, \DATA_OFFSET
 768         psrldq  $8, %xmm0
 769         MOVQ_R64_XMM    %xmm0, %rax
 770         sub     $8, %r13
 771 _less_than_8_bytes_left_\@:
 772         movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 773         add     $1, \DATA_OFFSET
 774         shr     $8, %rax
 775         sub     $1, %r13
 776         jne     _less_than_8_bytes_left_\@
 777 _partial_block_done_\@:
 778 .endm # PARTIAL_BLOCK
 779
 780 /*
 781 * if a = number of total plaintext bytes
 782 * b = floor(a/16)
 783 * num_initial_blocks = b mod 4
 784 * encrypt the initial num_initial_blocks blocks and apply ghash on
 785 * the ciphertext
 786 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 787 * are clobbered
 788 * arg1, %arg2, %arg3 are used as a pointer only, not modified
 789 */
 790
 791
 792 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 793         XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
        /*
         * Overview of the body below:
         *  - loads SHUF_MASK into %xmm14, AadHash into %xmm<i> and CurCount
         *    into XMM0;
         *  - when i is 5, 6 or 7: for every digit of i_seq, encrypts an
         *    incremented counter block, XORs it with 16 bytes of input at
         *    (%arg4,%r11), stores the result at (%arg3,%r11) and advances
         *    %r11 by 16; the blocks are then chained through GHASH_MUL,
         *    the last call being made on %xmm8;
         *  - when %r13 >= 64 (signed compare): handles four more blocks the
         *    same way through XMM1-XMM4, advances %r11 by 64 and XORs
         *    XMMDst into the byte-swapped XMM1.
         * With operation == dec the input block, not the output, is what is
         * byte-swapped and kept for GHASH.
         * NOTE(review): the GHASH chain names %xmm5-%xmm8 and aes_loop_pre_
         * names %xmm1-%xmm4 directly, so the caller presumably has to pass
         * matching registers as macro arguments - confirm at the call sites.
         * NOTE(review): %r13 is presumably the remaining length in bytes -
         * confirm at the call sites.
         */
 794         MOVADQ          SHUF_MASK(%rip), %xmm14
 795
 796         movdqu AadHash(%arg2), %xmm\i               # %xmm<i> = AadHash read from %arg2
 797
 798         # start AES for num_initial_blocks blocks
 799
 800         movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0 (counter block, from CurCount)
 801
 802 .if (\i == 5) || (\i == 6) || (\i == 7)
 803
 804         MOVADQ          ONE(%RIP),\TMP1
 805         MOVADQ          0(%arg1),\TMP2
 806 .irpc index, \i_seq
 807         paddd           \TMP1, \XMM0                 # INCR Y0
 808 .ifc \operation, dec
 809         movdqa     \XMM0, %xmm\index
 810 .else
 811         MOVADQ          \XMM0, %xmm\index
 812 .endif
 813         PSHUFB_XMM      %xmm14, %xmm\index      # perform a 16 byte swap
 814         pxor            \TMP2, %xmm\index       # XOR in the round key at offset 0
 815 .endr
 816         lea     0x10(%arg1),%r10
 817         mov     keysize,%eax
 818         shr     $2,%eax                         # 128->4, 192->6, 256->8
 819         add     $5,%eax                       # 128->9, 192->11, 256->13
 820
 821 aes_loop_initial_\@:
 822         MOVADQ  (%r10),\TMP1
 823 .irpc   index, \i_seq
 824         AESENC  \TMP1, %xmm\index
 825 .endr
 826         add     $16,%r10
 827         sub     $1,%eax
 828         jnz     aes_loop_initial_\@
 829
 830         MOVADQ  (%r10), \TMP1                # round key for the last round
 831 .irpc index, \i_seq
 832         AESENCLAST \TMP1, %xmm\index         # Last Round
 833 .endr
 834 .irpc index, \i_seq
 835         movdqu     (%arg4 , %r11, 1), \TMP1
 836         pxor       \TMP1, %xmm\index
 837         movdqu     %xmm\index, (%arg3 , %r11, 1)
 838         # write back plaintext/ciphertext for num_initial_blocks
 839         add        $16, %r11
 840
 841 .ifc \operation, dec
 842         movdqa     \TMP1, %xmm\index         # dec: keep the input block just read for GHASH
 843 .endif
 844         PSHUFB_XMM         %xmm14, %xmm\index
 845
 846                 # prepare plaintext/ciphertext for GHASH computation
 847 .endr
 848 .endif
 849
 850         # apply GHASH on num_initial_blocks blocks
 851
 852 .if \i == 5
 853         pxor       %xmm5, %xmm6
 854         GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 855         pxor       %xmm6, %xmm7
 856         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 857         pxor       %xmm7, %xmm8
 858         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 859 .elseif \i == 6
 860         pxor       %xmm6, %xmm7
 861         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 862         pxor       %xmm7, %xmm8
 863         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 864 .elseif \i == 7
 865         pxor       %xmm7, %xmm8
 866         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 867 .endif
 868         cmp        $64, %r13
 869         jl      _initial_blocks_done\@
 870         # %r13 < 64 (signed): no need for precomputed values
 871 /*
 872 *
 873 * Encrypt four further counter blocks into XMM1-XMM4.
 874 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
     * NOTE(review): the HashKey precomputations this comment used to announce
     * are not performed in the code below.
 875 */
 876         MOVADQ     ONE(%RIP),\TMP1
 877         paddd      \TMP1, \XMM0              # INCR Y0
 878         MOVADQ     \XMM0, \XMM1
 879         PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 880
 881         paddd      \TMP1, \XMM0              # INCR Y0
 882         MOVADQ     \XMM0, \XMM2
 883         PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 884
 885         paddd      \TMP1, \XMM0              # INCR Y0
 886         MOVADQ     \XMM0, \XMM3
 887         PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 888
 889         paddd      \TMP1, \XMM0              # INCR Y0
 890         MOVADQ     \XMM0, \XMM4
 891         PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 892
 893         MOVADQ     0(%arg1),\TMP1
 894         pxor       \TMP1, \XMM1
 895         pxor       \TMP1, \XMM2
 896         pxor       \TMP1, \XMM3
 897         pxor       \TMP1, \XMM4
 898 .irpc index, 1234 # do 4 rounds
 899         movaps 0x10*\index(%arg1), \TMP1
 900         AESENC     \TMP1, \XMM1
 901         AESENC     \TMP1, \XMM2
 902         AESENC     \TMP1, \XMM3
 903         AESENC     \TMP1, \XMM4
 904 .endr
 905 .irpc index, 56789 # do next 5 rounds
 906         movaps 0x10*\index(%arg1), \TMP1
 907         AESENC     \TMP1, \XMM1
 908         AESENC     \TMP1, \XMM2
 909         AESENC     \TMP1, \XMM3
 910         AESENC     \TMP1, \XMM4
 911 .endr
 912         lea        0xa0(%arg1),%r10
 913         mov        keysize,%eax
 914         shr        $2,%eax                      # 128->4, 192->6, 256->8
 915         sub        $4,%eax                      # 128->0, 192->2, 256->4
 916         jz         aes_loop_pre_done\@
 917
 918 aes_loop_pre_\@:
 919         MOVADQ     (%r10),\TMP2
 920 .irpc   index, 1234
 921         AESENC     \TMP2, %xmm\index
 922 .endr
 923         add        $16,%r10
 924         sub        $1,%eax
 925         jnz        aes_loop_pre_\@
 926
 927 aes_loop_pre_done\@:
 928         MOVADQ     (%r10), \TMP2
 929         AESENCLAST \TMP2, \XMM1
 930         AESENCLAST \TMP2, \XMM2
 931         AESENCLAST \TMP2, \XMM3
 932         AESENCLAST \TMP2, \XMM4
 933         movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 934         pxor       \TMP1, \XMM1
 935 .ifc \operation, dec
 936         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 937         movdqa     \TMP1, \XMM1
 938 .endif
 939         movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 940         pxor       \TMP1, \XMM2
 941 .ifc \operation, dec
 942         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 943         movdqa     \TMP1, \XMM2
 944 .endif
 945         movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 946         pxor       \TMP1, \XMM3
 947 .ifc \operation, dec
 948         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 949         movdqa     \TMP1, \XMM3
 950 .endif
 951         movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 952         pxor       \TMP1, \XMM4
 953 .ifc \operation, dec
 954         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 955         movdqa     \TMP1, \XMM4
 956 .else
 957         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 958         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 959         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 960         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 961 .endif
 962
 963         add        $64, %r11
 964         PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 965         pxor       \XMMDst, \XMM1
 966 # combine GHASHed value with the corresponding ciphertext
 967         PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 968         PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 969         PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 970
 971 _initial_blocks_done\@:
 972
 973 .endm
 974
 975 /*
 976 * encrypt 4 blocks at a time
 977 * ghash the 4 previously encrypted ciphertext blocks
 978 * %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
 979 * %r11 is the data offset value
     *
     * On entry XMM1-XMM4 hold the four blocks to be hashed; they are copied
     * to XMM5-XMM8 and multiplied by HashKey_4, HashKey_3, HashKey_2 and
     * HashKey.  XMM0 (the counter) is incremented four times.  On exit
     * XMM1-XMM4 hold the four new output blocks, byte-swapped, with the
     * reduced GHASH value XORed into XMM1.
     * %r10, %eax, %xmm15, XMM5-XMM8 and TMP1-TMP6 are clobbered.
     * The operation argument is not referenced in the body.
     * NOTE(review): the extra-round loop names %xmm1-%xmm4 directly, so
     * XMM1-XMM4 presumably have to be those registers - confirm at the
     * call sites.
 980 */
 981 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
 982 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 983
 984         movdqa    \XMM1, \XMM5
 985         movdqa    \XMM2, \XMM6
 986         movdqa    \XMM3, \XMM7
 987         movdqa    \XMM4, \XMM8
 988
 989         movdqa    SHUF_MASK(%rip), %xmm15
 990         # multiply XMM5 * HashKey_4 using karatsuba
 991
 992         movdqa    \XMM5, \TMP4
 993         pshufd    $78, \XMM5, \TMP6
 994         pxor      \XMM5, \TMP6
 995         paddd     ONE(%rip), \XMM0              # INCR CNT
 996         movdqu    HashKey_4(%arg2), \TMP5
 997         PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 998         movdqa    \XMM0, \XMM1
 999         paddd     ONE(%rip), \XMM0              # INCR CNT
1000         movdqa    \XMM0, \XMM2
1001         paddd     ONE(%rip), \XMM0              # INCR CNT
1002         movdqa    \XMM0, \XMM3
1003         paddd     ONE(%rip), \XMM0              # INCR CNT
1004         movdqa    \XMM0, \XMM4
1005         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1006         PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1007         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1008         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1009         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1010
1011         pxor      (%arg1), \XMM1
1012         pxor      (%arg1), \XMM2
1013         pxor      (%arg1), \XMM3
1014         pxor      (%arg1), \XMM4
1015         movdqu    HashKey_4_k(%arg2), \TMP5
1016         PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1017         movaps 0x10(%arg1), \TMP1
1018         AESENC    \TMP1, \XMM1              # Round 1
1019         AESENC    \TMP1, \XMM2
1020         AESENC    \TMP1, \XMM3
1021         AESENC    \TMP1, \XMM4
1022         movaps 0x20(%arg1), \TMP1
1023         AESENC    \TMP1, \XMM1              # Round 2
1024         AESENC    \TMP1, \XMM2
1025         AESENC    \TMP1, \XMM3
1026         AESENC    \TMP1, \XMM4
1027         movdqa    \XMM6, \TMP1
1028         pshufd    $78, \XMM6, \TMP2
1029         pxor      \XMM6, \TMP2
1030         movdqu    HashKey_3(%arg2), \TMP5
1031         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1032         movaps 0x30(%arg1), \TMP3
1033         AESENC    \TMP3, \XMM1              # Round 3
1034         AESENC    \TMP3, \XMM2
1035         AESENC    \TMP3, \XMM3
1036         AESENC    \TMP3, \XMM4
1037         PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1038         movaps 0x40(%arg1), \TMP3
1039         AESENC    \TMP3, \XMM1              # Round 4
1040         AESENC    \TMP3, \XMM2
1041         AESENC    \TMP3, \XMM3
1042         AESENC    \TMP3, \XMM4
1043         movdqu    HashKey_3_k(%arg2), \TMP5
1044         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1045         movaps 0x50(%arg1), \TMP3
1046         AESENC    \TMP3, \XMM1              # Round 5
1047         AESENC    \TMP3, \XMM2
1048         AESENC    \TMP3, \XMM3
1049         AESENC    \TMP3, \XMM4
1050         pxor      \TMP1, \TMP4
1051 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1052         pxor      \XMM6, \XMM5
1053         pxor      \TMP2, \TMP6
1054         movdqa    \XMM7, \TMP1
1055         pshufd    $78, \XMM7, \TMP2
1056         pxor      \XMM7, \TMP2
1057         movdqu    HashKey_2(%arg2), \TMP5
1058
1059         # Multiply XMM7 * HashKey_2 using karatsuba
1060
1061         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1062         movaps 0x60(%arg1), \TMP3
1063         AESENC    \TMP3, \XMM1              # Round 6
1064         AESENC    \TMP3, \XMM2
1065         AESENC    \TMP3, \XMM3
1066         AESENC    \TMP3, \XMM4
1067         PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1068         movaps 0x70(%arg1), \TMP3
1069         AESENC    \TMP3, \XMM1             # Round 7
1070         AESENC    \TMP3, \XMM2
1071         AESENC    \TMP3, \XMM3
1072         AESENC    \TMP3, \XMM4
1073         movdqu    HashKey_2_k(%arg2), \TMP5
1074         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1075         movaps 0x80(%arg1), \TMP3
1076         AESENC    \TMP3, \XMM1             # Round 8
1077         AESENC    \TMP3, \XMM2
1078         AESENC    \TMP3, \XMM3
1079         AESENC    \TMP3, \XMM4
1080         pxor      \TMP1, \TMP4
1081 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1082         pxor      \XMM7, \XMM5
1083         pxor      \TMP2, \TMP6
1084
1085         # Multiply XMM8 * HashKey
1086         # XMM8 and TMP5 hold the values for the two operands
1087
1088         movdqa    \XMM8, \TMP1
1089         pshufd    $78, \XMM8, \TMP2
1090         pxor      \XMM8, \TMP2
1091         movdqu    HashKey(%arg2), \TMP5
1092         PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1093         movaps 0x90(%arg1), \TMP3
1094         AESENC    \TMP3, \XMM1            # Round 9
1095         AESENC    \TMP3, \XMM2
1096         AESENC    \TMP3, \XMM3
1097         AESENC    \TMP3, \XMM4
1098         PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1099         lea       0xa0(%arg1),%r10
1100         mov       keysize,%eax
1101         shr       $2,%eax                       # 128->4, 192->6, 256->8
1102         sub       $4,%eax                       # 128->0, 192->2, 256->4
1103         jz        aes_loop_par_enc_done\@
1104
1105 aes_loop_par_enc\@:
1106         MOVADQ    (%r10),\TMP3
1107 .irpc   index, 1234
1108         AESENC    \TMP3, %xmm\index
1109 .endr
1110         add       $16,%r10
1111         sub       $1,%eax
1112         jnz       aes_loop_par_enc\@
1113
1114 aes_loop_par_enc_done\@:
1115         MOVADQ    (%r10), \TMP3
1116         AESENCLAST \TMP3, \XMM1           # last round
1117         AESENCLAST \TMP3, \XMM2
1118         AESENCLAST \TMP3, \XMM3
1119         AESENCLAST \TMP3, \XMM4
1120         movdqu    HashKey_k(%arg2), \TMP5
1121         PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1122         movdqu    (%arg4,%r11,1), \TMP3
1123         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1124         movdqu    16(%arg4,%r11,1), \TMP3
1125         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1126         movdqu    32(%arg4,%r11,1), \TMP3
1127         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1128         movdqu    48(%arg4,%r11,1), \TMP3
1129         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1130         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1131         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1132         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1133         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1134         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1135         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1136         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1137         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1138
1139         pxor      \TMP4, \TMP1
1140         pxor      \XMM8, \XMM5
1141         pxor      \TMP6, \TMP2
1142         pxor      \TMP1, \TMP2
1143         pxor      \XMM5, \TMP2
1144         movdqa    \TMP2, \TMP3
1145         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1146         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1147         pxor      \TMP3, \XMM5
1148         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1149
1150         # first phase of reduction
1151
1152         movdqa    \XMM5, \TMP2
1153         movdqa    \XMM5, \TMP3
1154         movdqa    \XMM5, \TMP4
1155 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1156         pslld     $31, \TMP2                   # packed left shift << 31
1157         pslld     $30, \TMP3                   # packed left shift << 30
1158         pslld     $25, \TMP4                   # packed left shift << 25
1159         pxor      \TMP3, \TMP2                 # xor the shifted versions
1160         pxor      \TMP4, \TMP2
1161         movdqa    \TMP2, \TMP5
1162         psrldq    $4, \TMP5                    # right shift T5 1 DW
1163         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1164         pxor      \TMP2, \XMM5
1165
1166         # second phase of reduction
1167
1168         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1169         movdqa    \XMM5,\TMP3
1170         movdqa    \XMM5,\TMP4
1171         psrld     $1, \TMP2                    # packed right shift >> 1
1172         psrld     $2, \TMP3                    # packed right shift >> 2
1173         psrld     $7, \TMP4                    # packed right shift >> 7
1174         pxor      \TMP3,\TMP2                  # xor the shifted versions
1175         pxor      \TMP4,\TMP2
1176         pxor      \TMP5, \TMP2
1177         pxor      \TMP2, \XMM5
1178         pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1179
1180         pxor      \XMM5, \XMM1                 # fold the reduced hash into the first new block
1181 .endm
1182
1183 /*
1184 * decrypt 4 blocks at a time
1185 * ghash the 4 previously decrypted ciphertext blocks
1186 * %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
1187 * %r11 is the data offset value
     *
     * On entry XMM1-XMM4 hold the four blocks to be hashed; they are copied
     * to XMM5-XMM8 and multiplied by HashKey_4, HashKey_3, HashKey_2 and
     * HashKey.  XMM0 (the counter) is incremented four times.  The XOR of
     * the encrypted counters with the input is written to the output
     * buffer; on exit XMM1-XMM4 hold the four input blocks just read,
     * byte-swapped, with the reduced GHASH value XORed into XMM1.
     * %r10, %eax, %xmm15, XMM5-XMM8 and TMP1-TMP6 are clobbered.
     * The operation argument is not referenced in the body.
     * NOTE(review): the extra-round loop names %xmm1-%xmm4 directly, so
     * XMM1-XMM4 presumably have to be those registers - confirm at the
     * call sites.
1188 */
1189 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1190 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1191
1192         movdqa    \XMM1, \XMM5
1193         movdqa    \XMM2, \XMM6
1194         movdqa    \XMM3, \XMM7
1195         movdqa    \XMM4, \XMM8
1196
1197         movdqa    SHUF_MASK(%rip), %xmm15
1198         # multiply XMM5 * HashKey_4 using karatsuba
1199
1200         movdqa    \XMM5, \TMP4
1201         pshufd    $78, \XMM5, \TMP6
1202         pxor      \XMM5, \TMP6
1203         paddd     ONE(%rip), \XMM0              # INCR CNT
1204         movdqu    HashKey_4(%arg2), \TMP5
1205         PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1206         movdqa    \XMM0, \XMM1
1207         paddd     ONE(%rip), \XMM0              # INCR CNT
1208         movdqa    \XMM0, \XMM2
1209         paddd     ONE(%rip), \XMM0              # INCR CNT
1210         movdqa    \XMM0, \XMM3
1211         paddd     ONE(%rip), \XMM0              # INCR CNT
1212         movdqa    \XMM0, \XMM4
1213         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1214         PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1215         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1216         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1217         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1218
1219         pxor      (%arg1), \XMM1
1220         pxor      (%arg1), \XMM2
1221         pxor      (%arg1), \XMM3
1222         pxor      (%arg1), \XMM4
1223         movdqu    HashKey_4_k(%arg2), \TMP5
1224         PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1225         movaps 0x10(%arg1), \TMP1
1226         AESENC    \TMP1, \XMM1              # Round 1
1227         AESENC    \TMP1, \XMM2
1228         AESENC    \TMP1, \XMM3
1229         AESENC    \TMP1, \XMM4
1230         movaps 0x20(%arg1), \TMP1
1231         AESENC    \TMP1, \XMM1              # Round 2
1232         AESENC    \TMP1, \XMM2
1233         AESENC    \TMP1, \XMM3
1234         AESENC    \TMP1, \XMM4
1235         movdqa    \XMM6, \TMP1
1236         pshufd    $78, \XMM6, \TMP2
1237         pxor      \XMM6, \TMP2
1238         movdqu    HashKey_3(%arg2), \TMP5
1239         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1240         movaps 0x30(%arg1), \TMP3
1241         AESENC    \TMP3, \XMM1              # Round 3
1242         AESENC    \TMP3, \XMM2
1243         AESENC    \TMP3, \XMM3
1244         AESENC    \TMP3, \XMM4
1245         PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1246         movaps 0x40(%arg1), \TMP3
1247         AESENC    \TMP3, \XMM1              # Round 4
1248         AESENC    \TMP3, \XMM2
1249         AESENC    \TMP3, \XMM3
1250         AESENC    \TMP3, \XMM4
1251         movdqu    HashKey_3_k(%arg2), \TMP5
1252         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1253         movaps 0x50(%arg1), \TMP3
1254         AESENC    \TMP3, \XMM1              # Round 5
1255         AESENC    \TMP3, \XMM2
1256         AESENC    \TMP3, \XMM3
1257         AESENC    \TMP3, \XMM4
1258         pxor      \TMP1, \TMP4
1259 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1260         pxor      \XMM6, \XMM5
1261         pxor      \TMP2, \TMP6
1262         movdqa    \XMM7, \TMP1
1263         pshufd    $78, \XMM7, \TMP2
1264         pxor      \XMM7, \TMP2
1265         movdqu    HashKey_2(%arg2), \TMP5
1266
1267         # Multiply XMM7 * HashKey_2 using karatsuba
1268
1269         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1270         movaps 0x60(%arg1), \TMP3
1271         AESENC    \TMP3, \XMM1              # Round 6
1272         AESENC    \TMP3, \XMM2
1273         AESENC    \TMP3, \XMM3
1274         AESENC    \TMP3, \XMM4
1275         PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1276         movaps 0x70(%arg1), \TMP3
1277         AESENC    \TMP3, \XMM1             # Round 7
1278         AESENC    \TMP3, \XMM2
1279         AESENC    \TMP3, \XMM3
1280         AESENC    \TMP3, \XMM4
1281         movdqu    HashKey_2_k(%arg2), \TMP5
1282         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1283         movaps 0x80(%arg1), \TMP3
1284         AESENC    \TMP3, \XMM1             # Round 8
1285         AESENC    \TMP3, \XMM2
1286         AESENC    \TMP3, \XMM3
1287         AESENC    \TMP3, \XMM4
1288         pxor      \TMP1, \TMP4
1289 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1290         pxor      \XMM7, \XMM5
1291         pxor      \TMP2, \TMP6
1292
1293         # Multiply XMM8 * HashKey
1294         # XMM8 and TMP5 hold the values for the two operands
1295
1296         movdqa    \XMM8, \TMP1
1297         pshufd    $78, \XMM8, \TMP2
1298         pxor      \XMM8, \TMP2
1299         movdqu    HashKey(%arg2), \TMP5
1300         PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1301         movaps 0x90(%arg1), \TMP3
1302         AESENC    \TMP3, \XMM1            # Round 9
1303         AESENC    \TMP3, \XMM2
1304         AESENC    \TMP3, \XMM3
1305         AESENC    \TMP3, \XMM4
1306         PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1307         lea       0xa0(%arg1),%r10
1308         mov       keysize,%eax
1309         shr       $2,%eax                       # 128->4, 192->6, 256->8
1310         sub       $4,%eax                       # 128->0, 192->2, 256->4
1311         jz        aes_loop_par_dec_done\@
1312
1313 aes_loop_par_dec\@:
1314         MOVADQ    (%r10),\TMP3
1315 .irpc   index, 1234
1316         AESENC    \TMP3, %xmm\index
1317 .endr
1318         add       $16,%r10
1319         sub       $1,%eax
1320         jnz       aes_loop_par_dec\@
1321
1322 aes_loop_par_dec_done\@:
1323         MOVADQ    (%r10), \TMP3
1324         AESENCLAST \TMP3, \XMM1           # last round
1325         AESENCLAST \TMP3, \XMM2
1326         AESENCLAST \TMP3, \XMM3
1327         AESENCLAST \TMP3, \XMM4
1328         movdqu    HashKey_k(%arg2), \TMP5
1329         PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1330         movdqu    (%arg4,%r11,1), \TMP3
1331         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1332         movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1333         movdqa    \TMP3, \XMM1                 # keep the input block for the next GHASH
1334         movdqu    16(%arg4,%r11,1), \TMP3
1335         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1336         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1337         movdqa    \TMP3, \XMM2
1338         movdqu    32(%arg4,%r11,1), \TMP3
1339         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1340         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1341         movdqa    \TMP3, \XMM3
1342         movdqu    48(%arg4,%r11,1), \TMP3
1343         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1344         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1345         movdqa    \TMP3, \XMM4
1346         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1347         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1348         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1349         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1350
1351         pxor      \TMP4, \TMP1
1352         pxor      \XMM8, \XMM5
1353         pxor      \TMP6, \TMP2
1354         pxor      \TMP1, \TMP2
1355         pxor      \XMM5, \TMP2
1356         movdqa    \TMP2, \TMP3
1357         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1358         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1359         pxor      \TMP3, \XMM5
1360         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1361
1362         # first phase of reduction
1363
1364         movdqa    \XMM5, \TMP2
1365         movdqa    \XMM5, \TMP3
1366         movdqa    \XMM5, \TMP4
1367 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1368         pslld     $31, \TMP2                   # packed left shift << 31
1369         pslld     $30, \TMP3                   # packed left shift << 30
1370         pslld     $25, \TMP4                   # packed left shift << 25
1371         pxor      \TMP3, \TMP2                 # xor the shifted versions
1372         pxor      \TMP4, \TMP2
1373         movdqa    \TMP2, \TMP5
1374         psrldq    $4, \TMP5                    # right shift T5 1 DW
1375         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1376         pxor      \TMP2, \XMM5
1377
1378         # second phase of reduction
1379
1380         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1381         movdqa    \XMM5,\TMP3
1382         movdqa    \XMM5,\TMP4
1383         psrld     $1, \TMP2                    # packed right shift >> 1
1384         psrld     $2, \TMP3                    # packed right shift >> 2
1385         psrld     $7, \TMP4                    # packed right shift >> 7
1386         pxor      \TMP3,\TMP2                  # xor the shifted versions
1387         pxor      \TMP4,\TMP2
1388         pxor      \TMP5, \TMP2
1389         pxor      \TMP2, \XMM5
1390         pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1391
1392         pxor      \XMM5, \XMM1                 # fold the reduced hash into the first new block
1393 .endm
1394
1395 /* GHASH the last 4 ciphertext blocks.
     * XMM1, XMM2, XMM3 and XMM4 are multiplied by HashKey_4, HashKey_3,
     * HashKey_2 and HashKey respectively (all read through %arg2), the
     * products are XORed together and reduced, and the result is left in
     * XMMDst.  XMM1-XMM4 and TMP1-TMP7 are overwritten.
     */
1396 .macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1397 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1398
1399         # Multiply XMM1 * HashKey_4 (using Karatsuba)
1400
1401         movdqa    \XMM1, \TMP6
1402         pshufd    $78, \XMM1, \TMP2
1403         pxor      \XMM1, \TMP2
1404         movdqu    HashKey_4(%arg2), \TMP5
1405         PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1406         PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1407         movdqu    HashKey_4_k(%arg2), \TMP4
1408         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1409         movdqa    \XMM1, \XMMDst
1410         movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1411
1412         # Multiply XMM2 * HashKey_3 (using Karatsuba)
1413
1414         movdqa    \XMM2, \TMP1
1415         pshufd    $78, \XMM2, \TMP2
1416         pxor      \XMM2, \TMP2
1417         movdqu    HashKey_3(%arg2), \TMP5
1418         PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1419         PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1420         movdqu    HashKey_3_k(%arg2), \TMP4
1421         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1422         pxor      \TMP1, \TMP6
1423         pxor      \XMM2, \XMMDst
1424         pxor      \TMP2, \XMM1
1425 # results accumulated in TMP6, XMMDst, XMM1
1426
1427         # Multiply XMM3 * HashKey_2 (using Karatsuba)
1428
1429         movdqa    \XMM3, \TMP1
1430         pshufd    $78, \XMM3, \TMP2
1431         pxor      \XMM3, \TMP2
1432         movdqu    HashKey_2(%arg2), \TMP5
1433         PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1434         PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1435         movdqu    HashKey_2_k(%arg2), \TMP4
1436         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1437         pxor      \TMP1, \TMP6
1438         pxor      \XMM3, \XMMDst
1439         pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1440
1441         # Multiply XMM4 * HashKey (using Karatsuba)
1442         movdqa    \XMM4, \TMP1
1443         pshufd    $78, \XMM4, \TMP2
1444         pxor      \XMM4, \TMP2
1445         movdqu    HashKey(%arg2), \TMP5
1446         PCLMULQDQ 0x11, \TMP5, \TMP1        # TMP1 = a1*b1
1447         PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1448         movdqu    HashKey_k(%arg2), \TMP4
1449         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1450         pxor      \TMP1, \TMP6
1451         pxor      \XMM4, \XMMDst
1452         pxor      \XMM1, \TMP2
1453         pxor      \TMP6, \TMP2
1454         pxor      \XMMDst, \TMP2
1455         # middle section of the temp results combined as in karatsuba algorithm
1456         movdqa    \TMP2, \TMP4
1457         pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1458         psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1459         pxor      \TMP4, \XMMDst
1460         pxor      \TMP2, \TMP6
1461 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1462         # first phase of the reduction
1463         movdqa    \XMMDst, \TMP2
1464         movdqa    \XMMDst, \TMP3
1465         movdqa    \XMMDst, \TMP4
1466 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1467         pslld     $31, \TMP2                # packed left shift << 31
1468         pslld     $30, \TMP3                # packed left shift << 30
1469         pslld     $25, \TMP4                # packed left shift << 25
1470         pxor      \TMP3, \TMP2              # xor the shifted versions
1471         pxor      \TMP4, \TMP2
1472         movdqa    \TMP2, \TMP7
1473         psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1474         pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1475         pxor      \TMP2, \XMMDst
1476
1477         # second phase of the reduction
1478         movdqa    \XMMDst, \TMP2
1479         # make 3 copies of XMMDst for doing 3 shift operations
1480         movdqa    \XMMDst, \TMP3
1481         movdqa    \XMMDst, \TMP4
1482         psrld     $1, \TMP2                 # packed right shift >> 1
1483         psrld     $2, \TMP3                 # packed right shift >> 2
1484         psrld     $7, \TMP4                 # packed right shift >> 7
1485         pxor      \TMP3, \TMP2              # xor the shifted versions
1486         pxor      \TMP4, \TMP2
1487         pxor      \TMP7, \TMP2
1488         pxor      \TMP2, \XMMDst
1489         pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1490 .endm
1491
1492
1493 /* Encryption of a single block: XMM0 is encrypted in place with the
     * round keys read from %arg1; the number of rounds follows keysize
1494 * uses eax & r10 as scratch; TMP1 and the flags are clobbered as well
1495 */
1496
1497 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1498
1499         pxor            (%arg1), \XMM0          # XOR in the round key at offset 0
1500         mov             keysize,%eax
1501         shr             $2,%eax                 # 128->4, 192->6, 256->8
1502         add             $5,%eax                 # 128->9, 192->11, 256->13
1503         lea             16(%arg1), %r10   # get first expanded key address
1504
1505 _esb_loop_\@:
1506         MOVADQ          (%r10),\TMP1
1507         AESENC          \TMP1,\XMM0
1508         add             $16,%r10
1509         sub             $1,%eax
1510         jnz             _esb_loop_\@
1511
1512         MOVADQ          (%r10),\TMP1
1513         AESENCLAST      \TMP1,\XMM0             # last round
1514 .endm
1515 /*****************************************************************************
1516 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1517 *                   struct gcm_context_data *data
1518 *                                      // Context data
1519 *                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1520 *                   const u8 *in,      // Ciphertext input
1521 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1522 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1523 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1524 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1525 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1526 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1527 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1528 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1529 *                                      // given authentication tag and only return the plaintext if they match.
1530 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1531 *                                      // (most likely), 12 or 8.
1532 *
1533 * Assumptions:
1534 *
1535 * keys:
1536 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
1537 *       set of 11 keys in the data structure void *aes_ctx
1538 *
1539 * iv:
1540 *       0                   1                   2                   3
1541 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1542 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543 *       |                             Salt  (From the SA)               |
1544 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545 *       |                     Initialization Vector                     |
1546 *       |         (This is the sequence number from IPSec header)       |
1547 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1548 *       |                              0x1                              |
1549 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1550 *
1551 *
1552 *
1553 * AAD:
1554 *       AAD padded to 128 bits with 0
1555 *       for example, assume AAD is a u32 vector
1556 *
1557 *       if AAD is 8 bytes:
1558 *       AAD[3] = {A0, A1};
1559 *       padded AAD in xmm register = {A1 A0 0 0}
1560 *
1561 *       0                   1                   2                   3
1562 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1563 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1564 *       |                               SPI (A1)                        |
1565 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1566 *       |                     32-bit Sequence Number (A0)               |
1567 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568 *       |                              0x0                              |
1569 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570 *
1571 *                                       AAD Format with 32-bit Sequence Number
1572 *
1573 *       if AAD is 12 bytes:
1574 *       AAD[3] = {A0, A1, A2};
1575 *       padded AAD in xmm register = {A2 A1 A0 0}
1576 *
1577 *       0                   1                   2                   3
1578 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1579 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1580 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1581 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582 *       |                               SPI (A2)                        |
1583 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1585 *       |                                                               |
1586 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1587 *       |                              0x0                              |
1588 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1589 *
1590 *                        AAD Format with 64-bit Extended Sequence Number
1591 *
1592 * poly = x^128 + x^127 + x^126 + x^121 + 1
1593 *
1594 *****************************************************************************/
1595 SYM_FUNC_START(aesni_gcm_dec)      /* chains GCM_INIT, GCM_ENC_DEC dec and GCM_COMPLETE in one call */
1596         FUNC_SAVE                  # macro defined outside this view; paired with FUNC_RESTORE below
1597
1598         GCM_INIT %arg6, arg7, arg8, arg9        # presumably iv, hash_subkey, aad, aad_len (order of the prototype comment above)
1599         GCM_ENC_DEC dec                         # shared enc/dec macro, invoked with the dec selector
1600         GCM_COMPLETE arg10, arg11               # presumably auth_tag, auth_tag_len
1601         FUNC_RESTORE
1602         ret
1603 SYM_FUNC_END(aesni_gcm_dec)
1604
1605
1606 /*****************************************************************************
1607 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1608 *                    struct gcm_context_data *data
1609 *                                        // Context data
1610 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1611 *                    const u8 *in,       // Plaintext input
1612 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1613 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1614 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1615 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1616 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1617 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1618 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1619 *                    u8 *auth_tag,       // Authenticated Tag output.
1620 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1621 *                                        // 12 or 8.
1622 *
1623 * Assumptions:
1624 *
1625 * keys:
1626 *       keys are pre-expanded and aligned to 16 bytes. we are using the
1627 *       first set of 11 keys in the data structure void *aes_ctx
1628 *
1629 *
1630 * iv:
1631 *       0                   1                   2                   3
1632 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1633 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634 *       |                             Salt  (From the SA)               |
1635 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636 *       |                     Initialization Vector                     |
1637 *       |         (This is the sequence number from IPSec header)       |
1638 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1639 *       |                              0x1                              |
1640 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1641 *
1642 *
1643 *
1644 * AAD:
1645 *       AAD padded to 128 bits with 0
1646 *       for example, assume AAD is a u32 vector
1647 *
1648 *       if AAD is 8 bytes:
1649 *       AAD[3] = {A0, A1};
1650 *       padded AAD in xmm register = {A1 A0 0 0}
1651 *
1652 *       0                   1                   2                   3
1653 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1654 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1655 *       |                               SPI (A1)                        |
1656 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657 *       |                     32-bit Sequence Number (A0)               |
1658 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659 *       |                              0x0                              |
1660 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661 *
1662 *                                 AAD Format with 32-bit Sequence Number
1663 *
1664 *       if AAD is 12 bytes:
1665 *       AAD[3] = {A0, A1, A2};
1666 *       padded AAD in xmm register = {A2 A1 A0 0}
1667 *
1668 *       0                   1                   2                   3
1669 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1670 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671 *       |                               SPI (A2)                        |
1672 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1674 *       |                                                               |
1675 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1676 *       |                              0x0                              |
1677 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1678 *
1679 *                         AAD Format with 64-bit Extended Sequence Number
1680 *
1681 * poly = x^128 + x^127 + x^126 + x^121 + 1
1682 ***************************************************************************/
1683 SYM_FUNC_START(aesni_gcm_enc)      /* chains GCM_INIT, GCM_ENC_DEC enc and GCM_COMPLETE in one call */
1684         FUNC_SAVE                  # macro defined outside this view; paired with FUNC_RESTORE below
1685
1686         GCM_INIT %arg6, arg7, arg8, arg9        # arguments 6..9 of the prototype above: iv, hash_subkey, aad, aad_len
1687         GCM_ENC_DEC enc                         # shared enc/dec macro, invoked with the enc selector
1688
1689         GCM_COMPLETE arg10, arg11               # arguments 10..11 of the prototype: auth_tag, auth_tag_len
1690         FUNC_RESTORE                            # NOTE(review): only arg6 has a % prefix; presumably arg7+ expand to stack operands - confirm
1691         ret
1692 SYM_FUNC_END(aesni_gcm_enc)
1693
1694 /*****************************************************************************
1695 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1696 *                     struct gcm_context_data *data,
1697 *                                         // context data
1698 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1699 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1700 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1701 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1702 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1703 *                     u64 aad_len)        // Length of AAD in bytes.
1704 */
1705 SYM_FUNC_START(aesni_gcm_init)     /* init-only entry point: runs GCM_INIT and returns */
1706         FUNC_SAVE                  # macro defined outside this view; paired with FUNC_RESTORE below
1707         GCM_INIT %arg3, %arg4,%arg5, %arg6      # arguments 3..6 of the prototype above: iv, hash_subkey, aad, aad_len
1708         FUNC_RESTORE
1709         ret
1710 SYM_FUNC_END(aesni_gcm_init)
1711
1712 /*****************************************************************************
1713 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1714 *                    struct gcm_context_data *data,
1715 *                                        // context data
1716 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1717 *                    const u8 *in,       // Plaintext input
1718 *                    u64 plaintext_len); // Length of data in bytes for encryption.
1719 */
1720 SYM_FUNC_START(aesni_gcm_enc_update)       /* update-only entry point: runs GCM_ENC_DEC enc, no init and no tag */
1721         FUNC_SAVE                          # macro defined outside this view; paired with FUNC_RESTORE below
1722         GCM_ENC_DEC enc                    # takes no explicit operands here; presumably reads the prototype arguments directly
1723         FUNC_RESTORE
1724         ret
1725 SYM_FUNC_END(aesni_gcm_enc_update)
1726
1727 /*****************************************************************************
1728 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1729 *                    struct gcm_context_data *data,
1730 *                                        // context data
1731 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1732 *                    const u8 *in,       // Ciphertext input
1733 *                    u64 plaintext_len); // Length of data in bytes for decryption.
1734 */
1735 SYM_FUNC_START(aesni_gcm_dec_update)       /* update-only entry point: runs GCM_ENC_DEC dec, no init and no tag */
1736         FUNC_SAVE                          # macro defined outside this view; paired with FUNC_RESTORE below
1737         GCM_ENC_DEC dec                    # takes no explicit operands here; presumably reads the prototype arguments directly
1738         FUNC_RESTORE
1739         ret
1740 SYM_FUNC_END(aesni_gcm_dec_update)
1741
1742 /*****************************************************************************
1743 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1744 *                    struct gcm_context_data *data,
1745 *                                        // context data
1746 *                    u8 *auth_tag,       // Authenticated Tag output.
1747 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1748 *                                        // 12 or 8.
1749 */
1750 SYM_FUNC_START(aesni_gcm_finalize) /* finalize-only entry point: runs GCM_COMPLETE and returns */
1751         FUNC_SAVE                  # macro defined outside this view; paired with FUNC_RESTORE below
1752         GCM_COMPLETE %arg3 %arg4   # arguments 3..4 of the prototype above: auth_tag, auth_tag_len (space-separated macro operands)
1753         FUNC_RESTORE
1754         ret
1755 SYM_FUNC_END(aesni_gcm_finalize)
1756
1757 #endif
1758
1759
1760 SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)     /* alias: the 128-bit schedule reuses this body */
1761 SYM_FUNC_START_LOCAL(_key_expansion_256a)          /* in: xmm0 = previous key, xmm1 = keygen assist result, xmm4 dword 0 == 0 */
1762         pshufd $0b11111111, %xmm1, %xmm1   # broadcast dword 3 of xmm1 to all four dwords
1763         shufps $0b00010000, %xmm0, %xmm4   # xmm4 = {xmm4[0], xmm4[0], xmm0[1], xmm0[0]}
1764         pxor %xmm4, %xmm0
1765         shufps $0b10001100, %xmm0, %xmm4   # xmm4 = {xmm4[0], xmm4[3], xmm0[0], xmm0[2]}
1766         pxor %xmm4, %xmm0                  # with xmm4[0] == 0, each dword of xmm0 is now XORed with all lower dwords
1767         pxor %xmm1, %xmm0                  # mix in the broadcast assist word: xmm0 = new round key
1768         movaps %xmm0, (TKEYP)              # store the round key at the schedule cursor (aligned store)
1769         add $0x10, TKEYP                   # advance the cursor by one 16-byte key
1770         ret
1771 SYM_FUNC_END(_key_expansion_256a)
1772 SYM_FUNC_END_ALIAS(_key_expansion_128)
1773
1774 SYM_FUNC_START_LOCAL(_key_expansion_192a)  /* in: xmm0/xmm2 = key state, xmm1 = assist result; writes 32 bytes at TKEYP */
1775         pshufd $0b01010101, %xmm1, %xmm1   # broadcast dword 1 of xmm1 to all four dwords
1776         shufps $0b00010000, %xmm0, %xmm4   # xmm4 = {xmm4[0], xmm4[0], xmm0[1], xmm0[0]}
1777         pxor %xmm4, %xmm0
1778         shufps $0b10001100, %xmm0, %xmm4   # xmm4 = {xmm4[0], xmm4[3], xmm0[0], xmm0[2]}
1779         pxor %xmm4, %xmm0                  # with xmm4[0] == 0, each dword of xmm0 is now XORed with all lower dwords
1780         pxor %xmm1, %xmm0                  # mix in the broadcast assist word
1781
1782         movaps %xmm2, %xmm5                # xmm5, xmm6 = copies of the old xmm2
1783         movaps %xmm2, %xmm6
1784         pslldq $4, %xmm5                   # shift the copy left by 4 bytes (one dword)
1785         pshufd $0b11111111, %xmm0, %xmm3   # broadcast dword 3 of the updated xmm0
1786         pxor %xmm3, %xmm2
1787         pxor %xmm5, %xmm2                  # xmm2 = old xmm2 ^ (old xmm2 << 32) ^ broadcast word
1788
1789         movaps %xmm0, %xmm1
1790         shufps $0b01000100, %xmm0, %xmm6   # xmm6 = {old xmm2 low qword, new xmm0 low qword}
1791         movaps %xmm6, (TKEYP)              # first 16 bytes written
1792         shufps $0b01001110, %xmm2, %xmm1   # xmm1 = {new xmm0 high qword, new xmm2 low qword}
1793         movaps %xmm1, 0x10(TKEYP)          # second 16 bytes written
1794         add $0x20, TKEYP                   # advance the cursor by two 16-byte keys
1795         ret
1796 SYM_FUNC_END(_key_expansion_192a)
1797
1798 SYM_FUNC_START_LOCAL(_key_expansion_192b)  /* in: xmm0/xmm2 = key state, xmm1 = assist result; writes 16 bytes at TKEYP */
1799         pshufd $0b01010101, %xmm1, %xmm1   # broadcast dword 1 of xmm1 to all four dwords
1800         shufps $0b00010000, %xmm0, %xmm4   # xmm4 = {xmm4[0], xmm4[0], xmm0[1], xmm0[0]}
1801         pxor %xmm4, %xmm0
1802         shufps $0b10001100, %xmm0, %xmm4   # xmm4 = {xmm4[0], xmm4[3], xmm0[0], xmm0[2]}
1803         pxor %xmm4, %xmm0                  # with xmm4[0] == 0, each dword of xmm0 is now XORed with all lower dwords
1804         pxor %xmm1, %xmm0                  # mix in the broadcast assist word
1805
1806         movaps %xmm2, %xmm5
1807         pslldq $4, %xmm5                   # copy of old xmm2 shifted left by 4 bytes
1808         pshufd $0b11111111, %xmm0, %xmm3   # broadcast dword 3 of the updated xmm0
1809         pxor %xmm3, %xmm2
1810         pxor %xmm5, %xmm2                  # xmm2 = old xmm2 ^ (old xmm2 << 32) ^ broadcast word
1811
1812         movaps %xmm0, (TKEYP)              # only xmm0 is stored here; updated xmm2 stays in the register
1813         add $0x10, TKEYP                   # advance the cursor by one 16-byte key
1814         ret
1815 SYM_FUNC_END(_key_expansion_192b)
1816
1817 SYM_FUNC_START_LOCAL(_key_expansion_256b)  /* same shape as _key_expansion_256a but updates and stores xmm2 */
1818         pshufd $0b10101010, %xmm1, %xmm1   # broadcast dword 2 of xmm1 to all four dwords
1819         shufps $0b00010000, %xmm2, %xmm4   # xmm4 = {xmm4[0], xmm4[0], xmm2[1], xmm2[0]}
1820         pxor %xmm4, %xmm2
1821         shufps $0b10001100, %xmm2, %xmm4   # xmm4 = {xmm4[0], xmm4[3], xmm2[0], xmm2[2]}
1822         pxor %xmm4, %xmm2                  # with xmm4[0] == 0, each dword of xmm2 is now XORed with all lower dwords
1823         pxor %xmm1, %xmm2                  # mix in the broadcast assist word: xmm2 = new round key
1824         movaps %xmm2, (TKEYP)              # store the round key at the schedule cursor
1825         add $0x10, TKEYP                   # advance the cursor by one 16-byte key
1826         ret
1827 SYM_FUNC_END(_key_expansion_256b)
1828
1829 /*
1830  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1831  *                   unsigned int key_len)
1832  */
1833 SYM_FUNC_START(aesni_set_key)      /* expands in_key into ctx: one schedule at +0, a second at +240, key_len at +480 */
1834         FRAME_BEGIN
1835 #ifndef __x86_64__
1836         pushl KEYP                              # 32-bit only: KEYP is saved here and restored before return
1837         movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1838         movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1839         movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1840 #endif
1841         movups (UKEYP), %xmm0           # user key (first 16 bytes)
1842         movaps %xmm0, (KEYP)            # the first 16 key bytes are round key 0
1843         lea 0x10(KEYP), TKEYP           # key addr
1844         movl %edx, 480(KEYP)            # remember key_len in the context (read back by the enc/dec entry points)
1845         pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1846         cmp $24, %dl                    # dispatch on key_len: below 24 -> 128-bit, equal -> 192-bit, above -> 256-bit
1847         jb .Lenc_key128
1848         je .Lenc_key192
1849         movups 0x10(UKEYP), %xmm2       # other user key
1850         movaps %xmm2, (TKEYP)           # second half of the 256-bit key is round key 1
1851         add $0x10, TKEYP
1852         AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
1853         call _key_expansion_256a
1854         AESKEYGENASSIST 0x1 %xmm0 %xmm1
1855         call _key_expansion_256b
1856         AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
1857         call _key_expansion_256a
1858         AESKEYGENASSIST 0x2 %xmm0 %xmm1
1859         call _key_expansion_256b
1860         AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
1861         call _key_expansion_256a
1862         AESKEYGENASSIST 0x4 %xmm0 %xmm1
1863         call _key_expansion_256b
1864         AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
1865         call _key_expansion_256a
1866         AESKEYGENASSIST 0x8 %xmm0 %xmm1
1867         call _key_expansion_256b
1868         AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
1869         call _key_expansion_256a
1870         AESKEYGENASSIST 0x10 %xmm0 %xmm1
1871         call _key_expansion_256b
1872         AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
1873         call _key_expansion_256a
1874         AESKEYGENASSIST 0x20 %xmm0 %xmm1
1875         call _key_expansion_256b
1876         AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
1877         call _key_expansion_256a        # the last step has no 256b half
1878         jmp .Ldec_key
1879 .Lenc_key192:
1880         movq 0x10(UKEYP), %xmm2         # other user key
1881         AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
1882         call _key_expansion_192a        # the 192a helper stores 32 bytes, the 192b helper stores 16
1883         AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
1884         call _key_expansion_192b
1885         AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
1886         call _key_expansion_192a
1887         AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
1888         call _key_expansion_192b
1889         AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
1890         call _key_expansion_192a
1891         AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
1892         call _key_expansion_192b
1893         AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
1894         call _key_expansion_192a
1895         AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
1896         call _key_expansion_192b
1897         jmp .Ldec_key
1898 .Lenc_key128:
1899         AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
1900         call _key_expansion_128
1901         AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
1902         call _key_expansion_128
1903         AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
1904         call _key_expansion_128
1905         AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
1906         call _key_expansion_128
1907         AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
1908         call _key_expansion_128
1909         AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
1910         call _key_expansion_128
1911         AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
1912         call _key_expansion_128
1913         AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
1914         call _key_expansion_128
1915         AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
1916         call _key_expansion_128
1917         AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
1918         call _key_expansion_128
1919 .Ldec_key:
1920         sub $0x10, TKEYP                # step back onto the last round key that was written
1921         movaps (KEYP), %xmm0            # xmm0 = first round key
1922         movaps (TKEYP), %xmm1           # xmm1 = last round key
1923         movaps %xmm0, 240(TKEYP)        # second schedule (base ctx+240) is the first one reversed:
1924         movaps %xmm1, 240(KEYP)         # its two end keys are copied unchanged, swapped
1925         add $0x10, KEYP                 # KEYP walks forward over the middle keys
1926         lea 240-16(TKEYP), UKEYP        # UKEYP walks backward through the second schedule
1927 .align 4
1928 .Ldec_key_loop:
1929         movaps (KEYP), %xmm0
1930         AESIMC %xmm0 %xmm1              # middle keys go through AESIMC before being stored
1931         movaps %xmm1, (UKEYP)
1932         add $0x10, KEYP
1933         sub $0x10, UKEYP
1934         cmp TKEYP, KEYP
1935         jb .Ldec_key_loop               # continue while KEYP is below the last round key (unsigned compare)
1936         xor AREG, AREG                  # zero AREG; presumably the int return value (AREG is defined outside this view)
1937 #ifndef __x86_64__
1938         popl KEYP
1939 #endif
1940         FRAME_END
1941         ret
1942 SYM_FUNC_END(aesni_set_key)
1943
1944 /*
1945  * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1946  */
1947 SYM_FUNC_START(aesni_enc)          /* encrypts exactly one 16-byte block from src into dst */
1948         FRAME_BEGIN
1949 #ifndef __x86_64__
1950         pushl KEYP                              # 32-bit only: saved here, restored before return
1951         pushl KLEN
1952         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1953         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1954         movl (FRAME_OFFSET+20)(%esp), INP       # src
1955 #endif
1956         movl 480(KEYP), KLEN            # key length
1957         movups (INP), STATE             # input
1958         call _aesni_enc1                # transforms STATE in place using the schedule at KEYP
1959         movups STATE, (OUTP)            # output
1960 #ifndef __x86_64__
1961         popl KLEN
1962         popl KEYP
1963 #endif
1964         FRAME_END
1965         ret
1966 SYM_FUNC_END(aesni_enc)
1967
1968 /*
1969  * _aesni_enc1:         internal ABI
1970  * input:
1971  *      KEYP:           key struct pointer
1972  *      KLEN:           key length
1973  *      STATE:          initial state (input)
1974  * output:
1975  *      STATE:          final state (output)
1976  * changed:
1977  *      KEY
1978  *      TKEYP (T1)
1979  */
1980 SYM_FUNC_START_LOCAL(_aesni_enc1)
1981         movaps (KEYP), KEY              # key
1982         mov KEYP, TKEYP
1983         pxor KEY, STATE         # round 0
1984         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed displacements
1985         cmp $24, KLEN                   # below 24 -> 128-bit path, equal -> 192-bit path, above -> 256-bit path
1986         jb .Lenc128
1987         lea 0x20(TKEYP), TKEYP          # lea leaves the flags from the cmp intact for the je below
1988         je .Lenc192
1989         add $0x20, TKEYP
1990         movaps -0x60(TKEYP), KEY        # 256-bit keys only: two extra rounds, then fall through
1991         AESENC KEY STATE
1992         movaps -0x50(TKEYP), KEY
1993         AESENC KEY STATE
1994 .align 4
1995 .Lenc192:
1996         movaps -0x40(TKEYP), KEY        # 192- and 256-bit keys: two more rounds, then fall through
1997         AESENC KEY STATE
1998         movaps -0x30(TKEYP), KEY
1999         AESENC KEY STATE
2000 .align 4
2001 .Lenc128:
2002         movaps -0x20(TKEYP), KEY        # shared tail for every key size: nine AESENC rounds plus AESENCLAST
2003         AESENC KEY STATE
2004         movaps -0x10(TKEYP), KEY
2005         AESENC KEY STATE
2006         movaps (TKEYP), KEY
2007         AESENC KEY STATE
2008         movaps 0x10(TKEYP), KEY
2009         AESENC KEY STATE
2010         movaps 0x20(TKEYP), KEY
2011         AESENC KEY STATE
2012         movaps 0x30(TKEYP), KEY
2013         AESENC KEY STATE
2014         movaps 0x40(TKEYP), KEY
2015         AESENC KEY STATE
2016         movaps 0x50(TKEYP), KEY
2017         AESENC KEY STATE
2018         movaps 0x60(TKEYP), KEY
2019         AESENC KEY STATE
2020         movaps 0x70(TKEYP), KEY
2021         AESENCLAST KEY STATE            # last round
2022         ret
2023 SYM_FUNC_END(_aesni_enc1)
2024
2025 /*
2026  * _aesni_enc4: internal ABI
2027  * input:
2028  *      KEYP:           key struct pointer
2029  *      KLEN:           key length
2030  *      STATE1:         initial state (input)
2031  *      STATE2
2032  *      STATE3
2033  *      STATE4
2034  * output:
2035  *      STATE1:         final state (output)
2036  *      STATE2
2037  *      STATE3
2038  *      STATE4
2039  * changed:
2040  *      KEY
2041  *      TKEYP (T1)
2042  */
2043 SYM_FUNC_START_LOCAL(_aesni_enc4)
2044         movaps (KEYP), KEY              # key
2045         mov KEYP, TKEYP
2046         pxor KEY, STATE1                # round 0
2047         pxor KEY, STATE2
2048         pxor KEY, STATE3
2049         pxor KEY, STATE4
2050         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed displacements
2051         cmp $24, KLEN                   # below 24 -> 128-bit path, equal -> 192-bit path, above -> 256-bit path
2052         jb .L4enc128
2053         lea 0x20(TKEYP), TKEYP          # lea leaves the flags from the cmp intact for the je below
2054         je .L4enc192
2055         add $0x20, TKEYP
2056         movaps -0x60(TKEYP), KEY        # 256-bit keys only: two extra rounds, then fall through
2057         AESENC KEY STATE1               # each round key is loaded once and applied to all four states
2058         AESENC KEY STATE2
2059         AESENC KEY STATE3
2060         AESENC KEY STATE4
2061         movaps -0x50(TKEYP), KEY
2062         AESENC KEY STATE1
2063         AESENC KEY STATE2
2064         AESENC KEY STATE3
2065         AESENC KEY STATE4
2066 #.align 4
2067 .L4enc192:
2068         movaps -0x40(TKEYP), KEY        # 192- and 256-bit keys: two more rounds, then fall through
2069         AESENC KEY STATE1
2070         AESENC KEY STATE2
2071         AESENC KEY STATE3
2072         AESENC KEY STATE4
2073         movaps -0x30(TKEYP), KEY
2074         AESENC KEY STATE1
2075         AESENC KEY STATE2
2076         AESENC KEY STATE3
2077         AESENC KEY STATE4
2078 #.align 4
2079 .L4enc128:
2080         movaps -0x20(TKEYP), KEY        # shared tail for every key size: nine AESENC rounds plus AESENCLAST
2081         AESENC KEY STATE1
2082         AESENC KEY STATE2
2083         AESENC KEY STATE3
2084         AESENC KEY STATE4
2085         movaps -0x10(TKEYP), KEY
2086         AESENC KEY STATE1
2087         AESENC KEY STATE2
2088         AESENC KEY STATE3
2089         AESENC KEY STATE4
2090         movaps (TKEYP), KEY
2091         AESENC KEY STATE1
2092         AESENC KEY STATE2
2093         AESENC KEY STATE3
2094         AESENC KEY STATE4
2095         movaps 0x10(TKEYP), KEY
2096         AESENC KEY STATE1
2097         AESENC KEY STATE2
2098         AESENC KEY STATE3
2099         AESENC KEY STATE4
2100         movaps 0x20(TKEYP), KEY
2101         AESENC KEY STATE1
2102         AESENC KEY STATE2
2103         AESENC KEY STATE3
2104         AESENC KEY STATE4
2105         movaps 0x30(TKEYP), KEY
2106         AESENC KEY STATE1
2107         AESENC KEY STATE2
2108         AESENC KEY STATE3
2109         AESENC KEY STATE4
2110         movaps 0x40(TKEYP), KEY
2111         AESENC KEY STATE1
2112         AESENC KEY STATE2
2113         AESENC KEY STATE3
2114         AESENC KEY STATE4
2115         movaps 0x50(TKEYP), KEY
2116         AESENC KEY STATE1
2117         AESENC KEY STATE2
2118         AESENC KEY STATE3
2119         AESENC KEY STATE4
2120         movaps 0x60(TKEYP), KEY
2121         AESENC KEY STATE1
2122         AESENC KEY STATE2
2123         AESENC KEY STATE3
2124         AESENC KEY STATE4
2125         movaps 0x70(TKEYP), KEY
2126         AESENCLAST KEY STATE1           # last round
2127         AESENCLAST KEY STATE2
2128         AESENCLAST KEY STATE3
2129         AESENCLAST KEY STATE4
2130         ret
2131 SYM_FUNC_END(_aesni_enc4)
2132
2133 /*
2134  * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2135  */
2136 SYM_FUNC_START(aesni_dec)          /* decrypts exactly one 16-byte block from src into dst */
2137         FRAME_BEGIN
2138 #ifndef __x86_64__
2139         pushl KEYP                              # 32-bit only: saved here, restored before return
2140         pushl KLEN
2141         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
2142         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
2143         movl (FRAME_OFFSET+20)(%esp), INP       # src
2144 #endif
2145         mov 480(KEYP), KLEN             # key length
2146         add $240, KEYP                  # switch to the second key schedule, which aesni_set_key builds at ctx+240
2147         movups (INP), STATE             # input
2148         call _aesni_dec1                # transforms STATE in place using the schedule at KEYP
2149         movups STATE, (OUTP)            #output
2150 #ifndef __x86_64__
2151         popl KLEN
2152         popl KEYP
2153 #endif
2154         FRAME_END
2155         ret
2156 SYM_FUNC_END(aesni_dec)
2157
2158 /*
2159  * _aesni_dec1:         internal ABI
2160  * input:
2161  *      KEYP:           key struct pointer
2162  *      KLEN:           key length
2163  *      STATE:          initial state (input)
2164  * output:
2165  *      STATE:          final state (output)
2166  * changed:
2167  *      KEY
2168  *      TKEYP (T1)
2169  */
2170 SYM_FUNC_START_LOCAL(_aesni_dec1)
2171         movaps (KEYP), KEY              # key
2172         mov KEYP, TKEYP
2173         pxor KEY, STATE         # round 0
2174         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed displacements
2175         cmp $24, KLEN                   # below 24 -> 128-bit path, equal -> 192-bit path, above -> 256-bit path
2176         jb .Ldec128
2177         lea 0x20(TKEYP), TKEYP          # lea leaves the flags from the cmp intact for the je below
2178         je .Ldec192
2179         add $0x20, TKEYP
2180         movaps -0x60(TKEYP), KEY        # 256-bit keys only: two extra rounds, then fall through
2181         AESDEC KEY STATE
2182         movaps -0x50(TKEYP), KEY
2183         AESDEC KEY STATE
2184 .align 4
2185 .Ldec192:
2186         movaps -0x40(TKEYP), KEY        # 192- and 256-bit keys: two more rounds, then fall through
2187         AESDEC KEY STATE
2188         movaps -0x30(TKEYP), KEY
2189         AESDEC KEY STATE
2190 .align 4
2191 .Ldec128:
2192         movaps -0x20(TKEYP), KEY        # shared tail for every key size: nine AESDEC rounds plus AESDECLAST
2193         AESDEC KEY STATE
2194         movaps -0x10(TKEYP), KEY
2195         AESDEC KEY STATE
2196         movaps (TKEYP), KEY
2197         AESDEC KEY STATE
2198         movaps 0x10(TKEYP), KEY
2199         AESDEC KEY STATE
2200         movaps 0x20(TKEYP), KEY
2201         AESDEC KEY STATE
2202         movaps 0x30(TKEYP), KEY
2203         AESDEC KEY STATE
2204         movaps 0x40(TKEYP), KEY
2205         AESDEC KEY STATE
2206         movaps 0x50(TKEYP), KEY
2207         AESDEC KEY STATE
2208         movaps 0x60(TKEYP), KEY
2209         AESDEC KEY STATE
2210         movaps 0x70(TKEYP), KEY
2211         AESDECLAST KEY STATE            # last round
2212         ret
2213 SYM_FUNC_END(_aesni_dec1)
2214
2215 /*
2216  * _aesni_dec4: internal ABI
2217  * input:
2218  *      KEYP:           key struct pointer
2219  *      KLEN:           key length
2220  *      STATE1:         initial state (input)
2221  *      STATE2
2222  *      STATE3
2223  *      STATE4
2224  * output:
2225  *      STATE1:         final state (output)
2226  *      STATE2
2227  *      STATE3
2228  *      STATE4
2229  * changed:
2230  *      KEY
2231  *      TKEYP (T1)
2232  */
2233 SYM_FUNC_START_LOCAL(_aesni_dec4)
2234         movaps (KEYP), KEY              # key
2235         mov KEYP, TKEYP
2236         pxor KEY, STATE1                # round 0
2237         pxor KEY, STATE2
2238         pxor KEY, STATE3
2239         pxor KEY, STATE4
2240         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed displacements
2241         cmp $24, KLEN                   # below 24 -> 128-bit path, equal -> 192-bit path, above -> 256-bit path
2242         jb .L4dec128
2243         lea 0x20(TKEYP), TKEYP          # lea leaves the flags from the cmp intact for the je below
2244         je .L4dec192
2245         add $0x20, TKEYP
2246         movaps -0x60(TKEYP), KEY        # 256-bit keys only: two extra rounds, then fall through
2247         AESDEC KEY STATE1               # each round key is loaded once and applied to all four states
2248         AESDEC KEY STATE2
2249         AESDEC KEY STATE3
2250         AESDEC KEY STATE4
2251         movaps -0x50(TKEYP), KEY
2252         AESDEC KEY STATE1
2253         AESDEC KEY STATE2
2254         AESDEC KEY STATE3
2255         AESDEC KEY STATE4
2256 .align 4
2257 .L4dec192:
2258         movaps -0x40(TKEYP), KEY        # 192- and 256-bit keys: two more rounds, then fall through
2259         AESDEC KEY STATE1
2260         AESDEC KEY STATE2
2261         AESDEC KEY STATE3
2262         AESDEC KEY STATE4
2263         movaps -0x30(TKEYP), KEY
2264         AESDEC KEY STATE1
2265         AESDEC KEY STATE2
2266         AESDEC KEY STATE3
2267         AESDEC KEY STATE4
2268 .align 4
2269 .L4dec128:
2270         movaps -0x20(TKEYP), KEY        # shared tail for every key size: nine AESDEC rounds plus AESDECLAST
2271         AESDEC KEY STATE1
2272         AESDEC KEY STATE2
2273         AESDEC KEY STATE3
2274         AESDEC KEY STATE4
2275         movaps -0x10(TKEYP), KEY
2276         AESDEC KEY STATE1
2277         AESDEC KEY STATE2
2278         AESDEC KEY STATE3
2279         AESDEC KEY STATE4
2280         movaps (TKEYP), KEY
2281         AESDEC KEY STATE1
2282         AESDEC KEY STATE2
2283         AESDEC KEY STATE3
2284         AESDEC KEY STATE4
2285         movaps 0x10(TKEYP), KEY
2286         AESDEC KEY STATE1
2287         AESDEC KEY STATE2
2288         AESDEC KEY STATE3
2289         AESDEC KEY STATE4
2290         movaps 0x20(TKEYP), KEY
2291         AESDEC KEY STATE1
2292         AESDEC KEY STATE2
2293         AESDEC KEY STATE3
2294         AESDEC KEY STATE4
2295         movaps 0x30(TKEYP), KEY
2296         AESDEC KEY STATE1
2297         AESDEC KEY STATE2
2298         AESDEC KEY STATE3
2299         AESDEC KEY STATE4
2300         movaps 0x40(TKEYP), KEY
2301         AESDEC KEY STATE1
2302         AESDEC KEY STATE2
2303         AESDEC KEY STATE3
2304         AESDEC KEY STATE4
2305         movaps 0x50(TKEYP), KEY
2306         AESDEC KEY STATE1
2307         AESDEC KEY STATE2
2308         AESDEC KEY STATE3
2309         AESDEC KEY STATE4
2310         movaps 0x60(TKEYP), KEY
2311         AESDEC KEY STATE1
2312         AESDEC KEY STATE2
2313         AESDEC KEY STATE3
2314         AESDEC KEY STATE4
2315         movaps 0x70(TKEYP), KEY
2316         AESDECLAST KEY STATE1           # last round
2317         AESDECLAST KEY STATE2
2318         AESDECLAST KEY STATE3
2319         AESDECLAST KEY STATE4
2320         ret
2321 SYM_FUNC_END(_aesni_dec4)
2322
2323 /*
2324  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2325  *                    size_t len)
2326  */
2327 SYM_FUNC_START(aesni_ecb_enc)      /* ECB-encrypts every whole 16-byte block of src into dst; a tail shorter than 16 bytes is left untouched */
2328         FRAME_BEGIN
2329 #ifndef __x86_64__
2330         pushl LEN                               # 32-bit only: saved here, restored at .Lecb_enc_ret
2331         pushl KEYP
2332         pushl KLEN
2333         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2334         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2335         movl (FRAME_OFFSET+24)(%esp), INP       # src
2336         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2337 #endif
2338         test LEN, LEN           # check length
2339         jz .Lecb_enc_ret
2340         mov 480(KEYP), KLEN             # key length, consumed by _aesni_enc1 and _aesni_enc4
2341         cmp $16, LEN
2342         jb .Lecb_enc_ret                # less than one block: nothing to do
2343         cmp $64, LEN
2344         jb .Lecb_enc_loop1              # fewer than four blocks: go straight to the single-block loop
2345 .align 4
2346 .Lecb_enc_loop4:
2347         movups (INP), STATE1            # invariant: LEN >= 64 here, four blocks per iteration
2348         movups 0x10(INP), STATE2
2349         movups 0x20(INP), STATE3
2350         movups 0x30(INP), STATE4
2351         call _aesni_enc4
2352         movups STATE1, (OUTP)
2353         movups STATE2, 0x10(OUTP)
2354         movups STATE3, 0x20(OUTP)
2355         movups STATE4, 0x30(OUTP)
2356         sub $64, LEN
2357         add $64, INP
2358         add $64, OUTP
2359         cmp $64, LEN
2360         jae .Lecb_enc_loop4             # unsigned: len is a size_t, matching the jb checks above
2361         cmp $16, LEN
2362         jb .Lecb_enc_ret
2363 .align 4
2364 .Lecb_enc_loop1:
2365         movups (INP), STATE1            # invariant: LEN >= 16 here, one block per iteration
2366         call _aesni_enc1                # NOTE(review): helper works on STATE; relies on STATE and STATE1 naming the same register - confirm
2367         movups STATE1, (OUTP)
2368         sub $16, LEN
2369         add $16, INP
2370         add $16, OUTP
2371         cmp $16, LEN
2372         jae .Lecb_enc_loop1             # unsigned: len is a size_t, matching the jb checks above
2373 .Lecb_enc_ret:
2374 #ifndef __x86_64__
2375         popl KLEN
2376         popl KEYP
2377         popl LEN
2378 #endif
2379         FRAME_END
2380         ret
2381 SYM_FUNC_END(aesni_ecb_enc)
2382
2383 /*
2384  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2385  *                    size_t len);
2386  */
2387 SYM_FUNC_START(aesni_ecb_dec)      /* ECB-decrypts every whole 16-byte block of src into dst; a tail shorter than 16 bytes is left untouched */
2388         FRAME_BEGIN
2389 #ifndef __x86_64__
2390         pushl LEN                               # 32-bit only: saved here, restored at .Lecb_dec_ret
2391         pushl KEYP
2392         pushl KLEN
2393         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2394         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2395         movl (FRAME_OFFSET+24)(%esp), INP       # src
2396         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2397 #endif
2398         test LEN, LEN
2399         jz .Lecb_dec_ret
2400         mov 480(KEYP), KLEN             # key length, read before KEYP is moved
2401         add $240, KEYP                  # switch to the second key schedule, which aesni_set_key builds at ctx+240
2402         cmp $16, LEN
2403         jb .Lecb_dec_ret                # less than one block: nothing to do
2404         cmp $64, LEN
2405         jb .Lecb_dec_loop1              # fewer than four blocks: go straight to the single-block loop
2406 .align 4
2407 .Lecb_dec_loop4:
2408         movups (INP), STATE1            # invariant: LEN >= 64 here, four blocks per iteration
2409         movups 0x10(INP), STATE2
2410         movups 0x20(INP), STATE3
2411         movups 0x30(INP), STATE4
2412         call _aesni_dec4
2413         movups STATE1, (OUTP)
2414         movups STATE2, 0x10(OUTP)
2415         movups STATE3, 0x20(OUTP)
2416         movups STATE4, 0x30(OUTP)
2417         sub $64, LEN
2418         add $64, INP
2419         add $64, OUTP
2420         cmp $64, LEN
2421         jae .Lecb_dec_loop4             # unsigned: len is a size_t, matching the jb checks above
2422         cmp $16, LEN
2423         jb .Lecb_dec_ret
2424 .align 4
2425 .Lecb_dec_loop1:
2426         movups (INP), STATE1            # invariant: LEN >= 16 here, one block per iteration
2427         call _aesni_dec1                # NOTE(review): helper works on STATE; relies on STATE and STATE1 naming the same register - confirm
2428         movups STATE1, (OUTP)
2429         sub $16, LEN
2430         add $16, INP
2431         add $16, OUTP
2432         cmp $16, LEN
2433         jae .Lecb_dec_loop1             # unsigned: len is a size_t, matching the jb checks above
2434 .Lecb_dec_ret:
2435 #ifndef __x86_64__
2436         popl KLEN
2437         popl KEYP
2438         popl LEN
2439 #endif
2440         FRAME_END
2441         ret
2442 SYM_FUNC_END(aesni_ecb_dec)
2443
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CBC-encrypt every whole 16-byte block of src into dst.  STATE carries the
 * chaining value: it starts as *iv, is XORed with each plaintext block and
 * encrypted by _aesni_enc1, and the result is both the ciphertext block and
 * the chaining value for the next block.  The last ciphertext block is
 * written back to *iv so a later call can continue the chain.
 *
 * When len < 16 the function returns immediately and *iv is left unchanged.
 * A trailing partial block (len % 16) is ignored.  len is unsigned, so the
 * loop condition uses jae rather than a signed compare.
 *
 * On 32-bit the arguments are read from the stack, and IVP, LEN, KEYP and
 * KLEN are saved on entry and restored on exit.
 */
SYM_FUNC_START(aesni_cbc_enc)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        /* four pushes plus the return address: first argument is at +20 */
        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+28)(%esp), INP       # src
        movl (FRAME_OFFSET+32)(%esp), LEN       # len
        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
#endif
        cmp $16, LEN            # less than one block: nothing to do
        jb .Lcbc_enc_ret
        mov 480(KEYP), KLEN     # presumably ctx->key_length, TODO confirm
        movups (IVP), STATE     # load iv as initial state
.align 4
.Lcbc_enc_loop:
        /* invariant: LEN >= 16, STATE = previous ciphertext block (or iv) */
        movups (INP), IN        # load input
        pxor IN, STATE          # chain: plaintext ^ previous ciphertext
        call _aesni_enc1
        movups STATE, (OUTP)    # store output
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jae .Lcbc_enc_loop      # unsigned: another whole block left
        movups STATE, (IVP)     # hand the last ciphertext block back as iv
.Lcbc_enc_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        FRAME_END
        ret
SYM_FUNC_END(aesni_cbc_enc)
2487
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CBC-decrypt every whole 16-byte block of src into dst.  Each plaintext
 * block is the decrypted ciphertext block XORed with the previous
 * ciphertext block (or *iv for the first one).  Four blocks are handled per
 * iteration through _aesni_dec4 while at least 64 bytes remain, then one
 * block per iteration through _aesni_dec1.  The last ciphertext block
 * consumed is written back to *iv.
 *
 * When len < 16 the function returns immediately and *iv is left unchanged.
 * A trailing partial block (len % 16) is ignored.  len is unsigned, so the
 * loop conditions use jae rather than a signed compare.
 *
 * The 64-bit build keeps all four ciphertext blocks in IN1..IN4.  The
 * 32-bit build only uses IN1 and IN2, so it reuses them for blocks 2 and 3
 * and reloads blocks 0 and 1 from src after the decryption; all reloads
 * happen before anything is stored to dst.
 *
 * On 32-bit the arguments are read from the stack, and IVP, LEN, KEYP and
 * KLEN are saved on entry and restored on exit.
 */
SYM_FUNC_START(aesni_cbc_dec)
        FRAME_BEGIN
#ifndef __x86_64__
        pushl IVP
        pushl LEN
        pushl KEYP
        pushl KLEN
        /* four pushes plus the return address: first argument is at +20 */
        movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
        movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
        movl (FRAME_OFFSET+28)(%esp), INP       # src
        movl (FRAME_OFFSET+32)(%esp), LEN       # len
        movl (FRAME_OFFSET+36)(%esp), IVP       # iv
#endif
        cmp $16, LEN            # less than one block: nothing to do
        jb .Lcbc_dec_just_ret
        mov 480(KEYP), KLEN     # presumably ctx->key_length, TODO confirm
        add $240, KEYP          # presumably the decryption key schedule, TODO confirm
        movups (IVP), IV
        cmp $64, LEN
        jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
        /* invariant: LEN >= 64, IV = ciphertext block preceding (INP) */
        movups (INP), IN1
        movaps IN1, STATE1
        movups 0x10(INP), IN2
        movaps IN2, STATE2
#ifdef __x86_64__
        movups 0x20(INP), IN3
        movaps IN3, STATE3
        movups 0x30(INP), IN4
        movaps IN4, STATE4
#else
        movups 0x20(INP), IN1   # IN1 now holds ciphertext block 2
        movaps IN1, STATE3
        movups 0x30(INP), IN2   # IN2 now holds ciphertext block 3
        movaps IN2, STATE4
#endif
        call _aesni_dec4
        pxor IV, STATE1
#ifdef __x86_64__
        pxor IN1, STATE2
        pxor IN2, STATE3
        pxor IN3, STATE4
        movaps IN4, IV          # block 3 chains into the next iteration
#else
        pxor IN1, STATE4        # block 3 ^= ciphertext block 2
        movaps IN2, IV          # block 3 chains into the next iteration
        movups (INP), IN1       # reload ciphertext block 0
        pxor IN1, STATE2
        movups 0x10(INP), IN2   # reload ciphertext block 1
        pxor IN2, STATE3
#endif
        movups STATE1, (OUTP)
        movups STATE2, 0x10(OUTP)
        movups STATE3, 0x20(OUTP)
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jae .Lcbc_dec_loop4     # unsigned: another four blocks left
        cmp $16, LEN
        jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
        /* invariant: LEN >= 16, IV = ciphertext block preceding (INP) */
        movups (INP), IN
        movaps IN, STATE
        call _aesni_dec1
        pxor IV, STATE
        movups STATE, (OUTP)
        movaps IN, IV           # this ciphertext block chains into the next
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jae .Lcbc_dec_loop1     # unsigned: another whole block left
.Lcbc_dec_ret:
        movups IV, (IVP)        # hand the last ciphertext block back as iv
.Lcbc_dec_just_ret:
#ifndef __x86_64__
        popl KLEN
        popl KEYP
        popl LEN
        popl IVP
#endif
        FRAME_END
        ret
SYM_FUNC_END(aesni_cbc_dec)
2580
2581 #ifdef __x86_64__
/*
 * pshufb control vector that reverses the order of all 16 bytes of an XMM
 * register: destination byte i is taken from source byte 15 - i.
 */
.pushsection .rodata
.align 16
.Lbswap_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
.popsection
2587
/*
 * _aesni_inc_init:     internal ABI
 *      Set up the registers used by _aesni_inc.
 * input:
 *      IV:     counter block, big endian
 * output:
 *      CTR:    == IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *      INC:    == 1, in little endian
 *      BSWAP_MASK == endian swapping mask
 *
 * The mask is loaded RIP-relative so the code does not depend on the
 * absolute address of .Lbswap_mask.  movaps needs the 16-byte alignment
 * that the definition of .Lbswap_mask provides.
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
        movaps .Lbswap_mask(%rip), BSWAP_MASK
        movaps IV, CTR
        PSHUFB_XMM BSWAP_MASK CTR       # CTR = byte-reversed IV
        mov $1, TCTR_LOW
        MOVQ_R64_XMM TCTR_LOW INC       # INC = 1, built via the scratch GPR
        MOVQ_R64_XMM CTR TCTR_LOW       # TCTR_LOW = low qword of CTR
        ret
SYM_FUNC_END(_aesni_inc_init)
2608
/*
 * _aesni_inc:          internal ABI
 *      Advance the big endian counter block in IV by one.
 * input:
 *      IV
 *      CTR:    == IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *      INC:    == 1, in little endian
 *      BSWAP_MASK == endian swapping mask
 * output:
 *      IV:     increased by 1
 * changed:
 *      CTR:    == output IV, in little endian
 *      TCTR_LOW: == lower qword of CTR
 *
 * The increment is done on the little endian copy.  paddq has no carry
 * between qwords, so the integer copy of the low qword is bumped alongside
 * and its carry flag tells whether the high qword needs a +1 as well.
 * INC is restored to 1 before returning.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
        add $1, TCTR_LOW        # CF = low qword wrapped around
        paddq INC, CTR          # low qword += 1; SSE add leaves CF alone
        jnc .Linc_no_carry
        pslldq $8, INC          # move the 1 into the high qword
        paddq INC, CTR          # propagate the carry
        psrldq $8, INC          # and put INC back to 1
.Linc_no_carry:
        movaps CTR, IV
        PSHUFB_XMM BSWAP_MASK IV        # back to big endian
        ret
SYM_FUNC_END(_aesni_inc)
2636
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                    size_t len, u8 *iv)
 *
 * CTR mode over every whole 16-byte block of src (64-bit builds only).
 * For each block the current counter value in IV is encrypted and XORed
 * with the input block, and IV is advanced by one through _aesni_inc.
 * Four blocks are handled per iteration through _aesni_enc4 while at least
 * 64 bytes remain, then one block per iteration through _aesni_enc1.  The
 * counter value following the last one used is written back to *iv.
 *
 * When len < 16 the function returns immediately and *iv is left unchanged.
 * A trailing partial block (len % 16) is ignored.  len is unsigned, so the
 * loop conditions use jae rather than a signed compare.
 */
SYM_FUNC_START(aesni_ctr_enc)
        FRAME_BEGIN
        cmp $16, LEN            # less than one block: nothing to do
        jb .Lctr_enc_just_ret
        mov 480(KEYP), KLEN     # presumably ctx->key_length, TODO confirm
        movups (IVP), IV
        call _aesni_inc_init
        cmp $64, LEN
        jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
        /* invariant: LEN >= 64, IV = counter for the block at (INP) */
        movaps IV, STATE1
        call _aesni_inc
        movups (INP), IN1
        movaps IV, STATE2
        call _aesni_inc
        movups 0x10(INP), IN2
        movaps IV, STATE3
        call _aesni_inc
        movups 0x20(INP), IN3
        movaps IV, STATE4
        call _aesni_inc
        movups 0x30(INP), IN4
        call _aesni_enc4        # STATE1..4 = encrypted counters
        pxor IN1, STATE1
        movups STATE1, (OUTP)
        pxor IN2, STATE2
        movups STATE2, 0x10(OUTP)
        pxor IN3, STATE3
        movups STATE3, 0x20(OUTP)
        pxor IN4, STATE4
        movups STATE4, 0x30(OUTP)
        sub $64, LEN
        add $64, INP
        add $64, OUTP
        cmp $64, LEN
        jae .Lctr_enc_loop4     # unsigned: another four blocks left
        cmp $16, LEN
        jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
        /* invariant: LEN >= 16, IV = counter for the block at (INP) */
        movaps IV, STATE
        call _aesni_inc
        movups (INP), IN
        call _aesni_enc1        # STATE = encrypted counter
        pxor IN, STATE
        movups STATE, (OUTP)
        sub $16, LEN
        add $16, INP
        add $16, OUTP
        cmp $16, LEN
        jae .Lctr_enc_loop1     # unsigned: another whole block left
.Lctr_enc_ret:
        movups IV, (IVP)        # hand the next counter value back to the caller
.Lctr_enc_just_ret:
        FRAME_END
        ret
SYM_FUNC_END(aesni_ctr_enc)
2699
/*
 * _aesni_gf128mul_x_ble:               internal ABI
 *      Multiply in GF(2^128) for XTS IVs
 * input:
 *      IV:     current IV
 *      GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *      IV:     next IV
 * changed:
 *      CTR:    == temporary value
 *
 * Steps:
 *      - pshufd $0x13 copies dword 3 of IV to dword 0 of CTR and dword 1 of
 *        IV to dword 2 of CTR (dwords 1 and 3 of CTR receive dword 0 of IV);
 *      - psrad $31 turns every dword of CTR into a copy of its sign bit, so
 *        dword 0 reflects bit 127 of IV and dword 2 reflects bit 63;
 *      - pand keeps only the bits selected by GF128MUL_MASK (presumably 0x87
 *        in dword 0 and 0x01 in dword 2; verify against the mask definition);
 *      - paddq doubles each qword of IV, i.e. shifts both halves left by one
 *        and drops the bits shifted out;
 *      - pxor folds the masked carries back into the shifted value.
 *
 * The CTR computation only needs the original IV for the initial pshufd, so
 * it is completed before IV is doubled.
 */
#define _aesni_gf128mul_x_ble() \
        pshufd $0x13, IV, CTR; \
        psrad $31, CTR; \
        pand GF128MUL_MASK, CTR; \
        paddq IV, IV; \
        pxor CTR, IV;
2717
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *                       bool enc, u8 *iv)
 *
 * Process exactly eight 16-byte blocks (0x80 bytes) of src into dst in XTS
 * fashion: each block is XORed with its tweak, run through the four-block
 * cipher helper, and XORed with the same tweak again.  The first tweak is
 * *iv; every following one is derived with _aesni_gf128mul_x_ble().  The
 * tweak that follows the eighth block is written back to *iv.
 *
 * enc selects the helper: _aesni_enc4 when nonzero, otherwise _aesni_dec4
 * with KEYP advanced by 240 bytes (presumably to the decryption key
 * schedule; verify against struct crypto_aes_ctx).
 *
 * Each tweak is parked in the output slot of its block while the cipher
 * helper runs and is read back from there afterwards, so no extra XMM
 * registers or stack space are needed.  The second group of four blocks is
 * loaded and whitened in between the stores of the first group.
 *
 * Code and constant addresses are formed RIP-relative so the function does
 * not depend on absolute symbol addresses.
 *
 * Scratch registers: %rax, %rcx, %r10, %r11, plus INC and CTR as XMM
 * temporaries.
 */
SYM_FUNC_START(aesni_xts_crypt8)
        FRAME_BEGIN
        cmpb $0, %cl                    # ZF = (enc == false)
        movl $0, %ecx                   # mov, not xor: flags must survive
        movl $240, %r10d
        leaq _aesni_enc4(%rip), %r11
        leaq _aesni_dec4(%rip), %rax
        cmovel %r10d, %ecx              # decrypt: key offset 240
        cmoveq %rax, %r11               # decrypt: call _aesni_dec4

        movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
        movups (IVP), IV

        mov 480(KEYP), KLEN             # presumably ctx->key_length, TODO confirm
        addq %rcx, KEYP

        /* blocks 0..3: whiten with the tweak, park the tweak in dst */
        movdqa IV, STATE1
        movdqu 0x00(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x10(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x20(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x30(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x30(OUTP)

        CALL_NOSPEC %r11

        /*
         * Finish block n (XOR with its parked tweak, store), then start
         * block n + 4 in the register that just became free.
         */
        movdqu 0x00(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x00(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE1
        movdqu 0x40(INP), INC
        pxor INC, STATE1
        movdqu IV, 0x40(OUTP)

        movdqu 0x10(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x10(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE2
        movdqu 0x50(INP), INC
        pxor INC, STATE2
        movdqu IV, 0x50(OUTP)

        movdqu 0x20(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x20(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE3
        movdqu 0x60(INP), INC
        pxor INC, STATE3
        movdqu IV, 0x60(OUTP)

        movdqu 0x30(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x30(OUTP)

        _aesni_gf128mul_x_ble()
        movdqa IV, STATE4
        movdqu 0x70(INP), INC
        pxor INC, STATE4
        movdqu IV, 0x70(OUTP)

        /* tweak for the block after this batch goes back to the caller */
        _aesni_gf128mul_x_ble()
        movups IV, (IVP)

        CALL_NOSPEC %r11

        /* blocks 4..7: XOR with the parked tweaks and store */
        movdqu 0x40(OUTP), INC
        pxor INC, STATE1
        movdqu STATE1, 0x40(OUTP)

        movdqu 0x50(OUTP), INC
        pxor INC, STATE2
        movdqu STATE2, 0x50(OUTP)

        movdqu 0x60(OUTP), INC
        pxor INC, STATE3
        movdqu STATE3, 0x60(OUTP)

        movdqu 0x70(OUTP), INC
        pxor INC, STATE4
        movdqu STATE4, 0x70(OUTP)

        FRAME_END
        ret
SYM_FUNC_END(aesni_xts_crypt8)
2827
2828 #endif