arch/x86/crypto/aesni-intel_asm.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Implement AES algorithm in Intel AES-NI instructions.
   4  *
   5  * The white paper of AES-NI instructions can be downloaded from:
   6  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   7  *
   8  * Copyright (C) 2008, Intel Corp.
   9  *    Author: Huang Ying <ying.huang@intel.com>
  10  *            Vinodh Gopal <vinodh.gopal@intel.com>
  11  *            Kahraman Akdemir
  12  *
  13  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14  * interface for 64-bit kernels.
  15  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  16  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  17  *             Adrian Hoban <adrian.hoban@intel.com>
  18  *             James Guilford (james.guilford@intel.com)
  19  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  20  *             Tadeusz Struk (tadeusz.struk@intel.com)
  21  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  22  *    Copyright (c) 2010, Intel Corporation.
  23  *
  24  * Ported x86_64 version to x86:
  25  *    Author: Mathias Krause <minipli@googlemail.com>
  26  */
  27
  28 #include <linux/linkage.h>
  29 #include <asm/frame.h>
  30 #include <asm/nospec-branch.h>
  31
  32 /*
  33  * The following macros are used to move an (un)aligned 16 byte value to/from
  34  * an XMM register.  This can be done for either FP or integer values, for FP use
  35  * movaps (move aligned packed single) or integer use movdqa (move double quad
  36  * aligned).  It doesn't make a performance difference which instruction is used
  37  * since Nehalem (original Core i7) was released.  However, the movaps is a byte
  38  * shorter, so that is the one we'll use for now. (same for unaligned).
  39  */
  40 #define MOVADQ  movaps          // 16-byte move, operand must be 16-byte aligned
  41 #define MOVUDQ  movups          // 16-byte move, no alignment requirement
  42
  43 #ifdef __x86_64__
  44
  45 # constants in mergeable sections, linker can reorder and merge
  46 .section        .rodata.cst16.POLY, "aM", @progbits, 16
  47 .align 16
  48 POLY:   .octa 0xC2000000000000000000000000000001   # reduction constant, ANDed in by PRECOMPUTE
  49 .section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  50 .align 16
  51 TWOONE: .octa 0x00000001000000000000000000000001   # pcmpeqd pattern for the carry test in PRECOMPUTE
  52
  53 .section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  54 .align 16
  55 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F   # pshufb control that reverses all 16 bytes
  56 .section        .rodata.cst16.MASK1, "aM", @progbits, 16
  57 .align 16
  58 MASK1:      .octa 0x0000000000000000ffffffffffffffff   # low 64 bits set
  59 .section        .rodata.cst16.MASK2, "aM", @progbits, 16
  60 .align 16
  61 MASK2:      .octa 0xffffffffffffffff0000000000000000   # high 64 bits set
  62 .section        .rodata.cst16.ONE, "aM", @progbits, 16
  63 .align 16
  64 ONE:        .octa 0x00000000000000000000000000000001   # counter increment, added with paddd
  65 .section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  66 .align 16
  67 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0   # NOTE(review): 31 hex digits, top nibble is implicitly 0 -- confirm intended
  68 .section        .rodata.cst16.dec, "aM", @progbits, 16
  69 .align 16
  70 dec:        .octa 0x1
  71 .section        .rodata.cst16.enc, "aM", @progbits, 16
  72 .align 16
  73 enc:        .octa 0x2
  74
  75 # order of these constants should not change.
  76 # more specifically, ALL_F should follow SHIFT_MASK,
  77 # and zero should follow ALL_F
     # The three 16-byte values below are read as one 48-byte table: loads at
     # SHIFT_MASK+k and ALL_F+k (k = 0..16) slide across the boundaries, which
     # is why they live in a plain, non-mergeable .rodata section.
  78 .section        .rodata, "a", @progbits
  79 .align 16
  80 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100   # identity byte indices 0..15 for pshufb
  81 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff   # as a pshufb control these bytes zero the lane; as a mask they keep it
  82             .octa 0x00000000000000000000000000000000   # zero tail so ALL_F+k yields a partial byte mask
  83
  84 .text
  85
  86 #define AadHash 16*0                    // running GHASH value
  87 #define AadLen 16*1                     // AAD length in bytes (8-byte field, set by GCM_INIT)
  88 #define InLen (16*1)+8                  // total input length in bytes (8-byte field)
  89 #define PBlockEncKey 16*2               // E(K, Yn) saved for an unfinished trailing block
  90 #define OrigIV 16*3                     // IV exactly as supplied by the caller
  91 #define CurCount 16*4                   // current counter block (byte-reversed IV form)
  92 #define PBlockLen 16*5                  // bytes already consumed of the unfinished block, 0 if none
  93 #define HashKey         16*6    // store HashKey <<1 mod poly here
  94 #define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
  95 #define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
  96 #define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
  97 #define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
  98                                 // bits of  HashKey <<1 mod poly here
  99                                 //(for Karatsuba purposes)
 100 #define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
 101                                 // bits of  HashKey^2 <<1 mod poly here
 102                                 // (for Karatsuba purposes)
 103 #define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
 104                                 // bits of  HashKey^3 <<1 mod poly here
 105                                 // (for Karatsuba purposes)
 106 #define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
 107                                 // bits of  HashKey^4 <<1 mod poly here
 108                                 // (for Karatsuba purposes)
 109
 110 #define arg1 rdi                        // 1st integer argument register (SysV AMD64)
 111 #define arg2 rsi                        // 2nd; base register for every context offset above
 112 #define arg3 rdx                        // 3rd
 113 #define arg4 rcx                        // 4th
 114 #define arg5 r8                         // 5th
 115 #define arg6 r9                         // 6th
 116 #define keysize 2*15*16(%arg1)          // field at byte offset 480 from arg1; NOTE(review): presumably the AES key length -- confirm against the key context layout
 117 #endif
 118
 119
 120 #define STATE1  %xmm0
 121 #define STATE2  %xmm4
 122 #define STATE3  %xmm5
 123 #define STATE4  %xmm6
 124 #define STATE   STATE1          // alias of STATE1 (%xmm0)
 125 #define IN1     %xmm1
 126 #define IN2     %xmm7
 127 #define IN3     %xmm8
 128 #define IN4     %xmm9
 129 #define IN      IN1             // alias of IN1 (%xmm1)
 130 #define KEY     %xmm2
 131 #define IV      %xmm3
 132
 133 #define BSWAP_MASK %xmm10
 134 #define CTR     %xmm11
 135 #define INC     %xmm12
 136
 137 #define GF128MUL_MASK %xmm7     // same physical register as IN2
 138
 139 #ifdef __x86_64__
 140 #define AREG    %rax
 141 #define KEYP    %rdi
 142 #define OUTP    %rsi
 143 #define UKEYP   OUTP            // shares its register with OUTP
 144 #define INP     %rdx
 145 #define LEN     %rcx
 146 #define IVP     %r8
 147 #define KLEN    %r9d
 148 #define T1      %r10
 149 #define TKEYP   T1              // shares its register with T1
 150 #define T2      %r11
 151 #define TCTR_LOW T2             // shares its register with T2; defined for 64-bit only
 152 #else
 153 #define AREG    %eax
 154 #define KEYP    %edi
 155 #define OUTP    AREG            // 32-bit build: OUTP and AREG are both %eax
 156 #define UKEYP   OUTP
 157 #define INP     %edx
 158 #define LEN     %esi
 159 #define IVP     %ebp
 160 #define KLEN    %ebx
 161 #define T1      %ecx
 162 #define TKEYP   T1              // shares its register with T1
 163 #endif
 164
 165 .macro FUNC_SAVE
 166         push    %r12                    # r12-r14 are callee-saved in the SysV ABI; saved here
 167         push    %r13                    # because the GCM macros use r12 and r13 as scratch
 168         push    %r14                    # FUNC_RESTORE pops these in the reverse order
 169 #
 170 # states of %xmm registers %xmm6:%xmm15 not saved
 171 # all %xmm registers are clobbered
 172 #
 173 .endm
 174
 175
 176 .macro FUNC_RESTORE
 177         pop     %r14                    # undo FUNC_SAVE: pops mirror its pushes in reverse order
 178         pop     %r13
 179         pop     %r12
 180 .endm
 181
 182 # Precompute hashkeys.
 183 # Input: Hash subkey.
 184 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
 185 # once per key.
 186 # clobbers r12, and tmp xmm registers.
     # SUBKEY is a pointer to the 16-byte hash subkey.  Results are stored at
     # HashKey..HashKey_4 and HashKey_k..HashKey_4_k relative to arg2.
     # TMP1-TMP7 are xmm scratch registers; all of them are overwritten.
 187 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 188         mov     \SUBKEY, %r12
 189         movdqu  (%r12), \TMP3
 190         movdqa  SHUF_MASK(%rip), \TMP2
 191         pshufb  \TMP2, \TMP3            # byte-reverse the subkey
 192
 193         # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 194
 195         movdqa  \TMP3, \TMP2
 196         psllq   $1, \TMP3               # shift each 64-bit half left by one
 197         psrlq   $63, \TMP2              # TMP2 = bit shifted out of each half
 198         movdqa  \TMP2, \TMP1
 199         pslldq  $8, \TMP2               # carry out of the low half moves into the high half
 200         psrldq  $8, \TMP1               # TMP1 bit 0 = bit shifted out of the whole 128-bit value
 201         por     \TMP2, \TMP3            # TMP3 = HashKey << 1 as a 128-bit shift
 202
 203         # reduce HashKey<<1
 204
 205         pshufd  $0x24, \TMP1, \TMP2     # copy the carry dword into dwords 0 and 3
 206         pcmpeqd TWOONE(%rip), \TMP2     # all ones when the carry is 1, dwords 0 and 3 zero otherwise
 207         pand    POLY(%rip), \TMP2       # TMP2 = POLY when the carry is 1, else 0
 208         pxor    \TMP2, \TMP3            # conditional reduction
 209         movdqu  \TMP3, HashKey(%arg2)
 210
 211         movdqa     \TMP3, \TMP5
 212         pshufd     $78, \TMP3, \TMP1    # swap the 64-bit halves
 213         pxor       \TMP3, \TMP1         # TMP1 = high half XOR low half (Karatsuba term)
 214         movdqu     \TMP1, HashKey_k(%arg2)
 215
 216         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 217 # TMP5 = HashKey^2<<1 (mod poly)
 218         movdqu     \TMP5, HashKey_2(%arg2)
 219 # HashKey_2 = HashKey^2<<1 (mod poly)
 220         pshufd     $78, \TMP5, \TMP1
 221         pxor       \TMP5, \TMP1
 222         movdqu     \TMP1, HashKey_2_k(%arg2)
 223
 224         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 225 # TMP5 = HashKey^3<<1 (mod poly)
 226         movdqu     \TMP5, HashKey_3(%arg2)
 227         pshufd     $78, \TMP5, \TMP1
 228         pxor       \TMP5, \TMP1
 229         movdqu     \TMP1, HashKey_3_k(%arg2)
 230
 231         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 232 # TMP5 = HashKey^4<<1 (mod poly)
 233         movdqu     \TMP5, HashKey_4(%arg2)
 234         pshufd     $78, \TMP5, \TMP1
 235         pxor       \TMP5, \TMP1
 236         movdqu     \TMP1, HashKey_4_k(%arg2)
 237 .endm
 238
 239 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 240 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
     # Also overwritten in the visible code: %xmm7 (handed to PRECOMPUTE as a
     # temporary) and %xmm14 (loaded by CALC_AAD_HASH).
     # Iv and SUBKEY are pointers to 16-byte values; AAD and AADLEN are the
     # pointer to and byte length of the additional authenticated data.
     # The context is addressed through arg2.
 241 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 242         mov \AADLEN, %r11
 243         mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 244         xor %r11d, %r11d
 245         mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 246         mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 247         mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 (8-byte store: low half only)
 248         mov \Iv, %rax
 249         movdqu (%rax), %xmm0
 250         movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 251
 252         movdqa  SHUF_MASK(%rip), %xmm2
 253         pshufb %xmm2, %xmm0
 254         movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv (byte-reversed)
 255
 256         PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
 257         movdqu HashKey(%arg2), %xmm13   # reload HashKey<<1 mod poly for the AAD hash
 258
 259         CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 260         %xmm4, %xmm5, %xmm6
 261 .endm
 262
 263 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 264 # struct has been initialized by GCM_INIT.
 265 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 266 # Clobbers rax, r10-r13, and xmm0-xmm15
     # Register roles in the visible code: arg2 = context, arg3 = output buffer,
     # arg4 = input buffer, arg5 = length in bytes, r11 = running byte offset.
     # The operation argument is the literal word enc or dec.
     # A trailing block shorter than 16 bytes is XORed into the hash here but
     # not multiplied; PBlockLen records it so that PARTIAL_BLOCK or
     # GCM_COMPLETE can finish it later.
 267 .macro GCM_ENC_DEC operation
 268         movdqu AadHash(%arg2), %xmm8
 269         movdqu HashKey(%arg2), %xmm13
 270         add %arg5, InLen(%arg2)         # account for this call in the total length
 271
 272         xor %r11d, %r11d # initialise the data pointer offset as zero
 273         PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 274
 275         sub %r11, %arg5         # sub partial block data used
 276         mov %arg5, %r13         # save the number of bytes
 277
 278         and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 279         mov %r13, %r12
 280         # Encrypt/Decrypt first few blocks
 281
 282         and     $(3<<4), %r12           # r12 = 16 * (number of whole blocks mod 4)
 283         jz      .L_initial_num_blocks_is_0_\@
 284         cmp     $(2<<4), %r12
 285         jb      .L_initial_num_blocks_is_1_\@
 286         je      .L_initial_num_blocks_is_2_\@
 287 .L_initial_num_blocks_is_3_\@:
 288         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 289 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 290         sub     $48, %r13
 291         jmp     .L_initial_blocks_\@
 292 .L_initial_num_blocks_is_2_\@:
 293         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 294 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 295         sub     $32, %r13
 296         jmp     .L_initial_blocks_\@
 297 .L_initial_num_blocks_is_1_\@:
 298         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 299 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 300         sub     $16, %r13
 301         jmp     .L_initial_blocks_\@
 302 .L_initial_num_blocks_is_0_\@:
 303         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 304 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 305 .L_initial_blocks_\@:
 306
 307         # Main loop - Encrypt/Decrypt remaining blocks
 308
 309         test    %r13, %r13
 310         je      .L_zero_cipher_left_\@
 311         sub     $64, %r13
 312         je      .L_four_cipher_left_\@
 313 .L_crypt_by_4_\@:
 314         GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 315         %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 316         %xmm7, %xmm8, enc
 317         add     $64, %r11
 318         sub     $64, %r13
 319         jne     .L_crypt_by_4_\@
 320 .L_four_cipher_left_\@:
 321         GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 322 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 323 .L_zero_cipher_left_\@:
 324         movdqu %xmm8, AadHash(%arg2)
 325         movdqu %xmm0, CurCount(%arg2)
 326
 327         mov     %arg5, %r13
 328         and     $15, %r13                       # %r13 = arg5 (mod 16)
 329         je      .L_multiple_of_16_bytes_\@
 330
 331         mov %r13, PBlockLen(%arg2)      # remember the unfinished block for a later call
 332
 333         # Handle the last <16 Byte block separately
 334         paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 335         movdqu %xmm0, CurCount(%arg2)
 336         movdqa SHUF_MASK(%rip), %xmm10
 337         pshufb %xmm10, %xmm0
 338
 339         ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 340         movdqu %xmm0, PBlockEncKey(%arg2)       # keep the keystream block for PARTIAL_BLOCK
 341
 342         cmp     $16, %arg5
 343         jge     .L_large_enough_update_\@
 344
 345         lea (%arg4,%r11,1), %r10        # fewer than 16 bytes in total: read byte-wise, no over-read
 346         mov %r13, %r12
 347         READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 348         jmp     .L_data_read_\@
 349
 350 .L_large_enough_update_\@:
 351         sub     $16, %r11               # temporarily point r11 at the last 16 bytes of input
 352         add     %r13, %r11
 353
 354         # receive the last <16 Byte block
 355         movdqu  (%arg4, %r11, 1), %xmm1
 356
 357         sub     %r13, %r11              # restore r11 to the start of the partial block
 358         add     $16, %r11
 359
 360         lea     SHIFT_MASK+16(%rip), %r12
 361         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 362         # (r13 is the number of bytes in plaintext mod 16)
 363         sub     %r13, %r12
 364         # get the appropriate shuffle mask
 365         movdqu  (%r12), %xmm2
 366         # shift right 16-r13 bytes
 367         pshufb  %xmm2, %xmm1
 368
 369 .L_data_read_\@:
 370         lea ALL_F+16(%rip), %r12
 371         sub %r13, %r12                  # r12 -> mask with the low r13 bytes set
 372
 373 .ifc \operation, dec
 374         movdqa  %xmm1, %xmm2            # keep the ciphertext: it is what gets hashed
 375 .endif
 376         pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 377         movdqu  (%r12), %xmm1
 378         # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 379         pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 380 .ifc \operation, dec
 381         pand    %xmm1, %xmm2
 382         movdqa SHUF_MASK(%rip), %xmm10
 383         pshufb %xmm10 ,%xmm2
 384
 385         pxor %xmm2, %xmm8               # fold the ciphertext bytes into the hash
 386 .else
 387         movdqa SHUF_MASK(%rip), %xmm10
 388         pshufb %xmm10,%xmm0
 389
 390         pxor    %xmm0, %xmm8            # fold the ciphertext bytes into the hash
 391 .endif
 392
 393         movdqu %xmm8, AadHash(%arg2)    # stored without a GHASH multiply; see PBlockLen
 394 .ifc \operation, enc
 395         # undo the byte reversal that was applied for hashing
 396         movdqa SHUF_MASK(%rip), %xmm10
 397         # shuffle xmm0 back to output as ciphertext
 398         pshufb %xmm10, %xmm0
 399 .endif
 400
 401         # Output %r13 bytes
 402         movq %xmm0, %rax
 403         cmp $8, %r13
 404         jle .L_less_than_8_bytes_left_\@
 405         mov %rax, (%arg3 , %r11, 1)     # more than 8 bytes: store the low 8 in one go
 406         add $8, %r11
 407         psrldq $8, %xmm0
 408         movq %xmm0, %rax
 409         sub $8, %r13
 410 .L_less_than_8_bytes_left_\@:
 411         mov %al,  (%arg3, %r11, 1)      # store the remaining 1..8 bytes one at a time
 412         add $1, %r11
 413         shr $8, %rax
 414         sub $1, %r13
 415         jne .L_less_than_8_bytes_left_\@
 416 .L_multiple_of_16_bytes_\@:
 417 .endm
 418
 419 # GCM_COMPLETE Finishes update of tag of last partial block
 420 # Output: Authorization Tag (AUTH_TAG)
 421 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
     # AUTHTAG is the destination pointer and AUTHTAGLEN the number of tag bytes
     # to store.  The context is read through arg2.
     # NOTE(review): lengths below 4 still take the 4-byte store at .L_T_4, so
     # the visible code assumes AUTHTAGLEN >= 4 -- confirm against the callers.
 422 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 423         movdqu AadHash(%arg2), %xmm8
 424         movdqu HashKey(%arg2), %xmm13
 425
 426         mov PBlockLen(%arg2), %r12
 427
 428         test %r12, %r12
 429         je .L_partial_done\@
 430
 431         GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6   # finish the pending partial block
 432
 433 .L_partial_done\@:
 434         mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 435         shl     $3, %r12                  # convert into number of bits
 436         movd    %r12d, %xmm15             # len(A) in %xmm15; NOTE(review): movd keeps only the low 32 bits
 437         mov InLen(%arg2), %r12
 438         shl     $3, %r12                  # len(C) in bits (*8)
 439         movq    %r12, %xmm1
 440
 441         pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 442         pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 443         pxor    %xmm15, %xmm8
 444         GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 445         # final GHASH computation
 446         movdqa SHUF_MASK(%rip), %xmm10
 447         pshufb %xmm10, %xmm8
 448
 449         movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 450         ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 451         pxor    %xmm8, %xmm0              # %xmm0 = tag = E(K, Y0) XOR GHASH
 452 .L_return_T_\@:
 453         mov     \AUTHTAG, %r10                     # %r10 = authTag
 454         mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
 455         cmp     $16, %r11
 456         je      .L_T_16_\@
 457         cmp     $8, %r11
 458         jl      .L_T_4_\@
 459 .L_T_8_\@:
 460         movq    %xmm0, %rax
 461         mov     %rax, (%r10)              # store tag bytes 0..7
 462         add     $8, %r10
 463         sub     $8, %r11
 464         psrldq  $8, %xmm0
 465         test    %r11, %r11
 466         je      .L_return_T_done_\@
 467 .L_T_4_\@:
 468         movd    %xmm0, %eax
 469         mov     %eax, (%r10)              # store the next 4 tag bytes
 470         add     $4, %r10
 471         sub     $4, %r11
 472         psrldq  $4, %xmm0
 473         test    %r11, %r11
 474         je      .L_return_T_done_\@
 475 .L_T_123_\@:
 476         movd    %xmm0, %eax
 477         cmp     $2, %r11
 478         jl      .L_T_1_\@
 479         mov     %ax, (%r10)               # store 2 bytes
 480         cmp     $2, %r11
 481         je      .L_return_T_done_\@
 482         add     $2, %r10
 483         sar     $16, %eax                 # bring the third byte down into %al
 484 .L_T_1_\@:
 485         mov     %al, (%r10)               # store the final byte
 486         jmp     .L_return_T_done_\@
 487 .L_T_16_\@:
 488         movdqu  %xmm0, (%r10)             # full 16-byte tag in one store
 489 .L_return_T_done_\@:
 490 .endm
 491
 492 #ifdef __x86_64__
 493 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 494 *
 495 *
 496 * Input: A and B (128-bits each, bit-reflected)
 497 * Output: C = A*B*x mod poly, (i.e. >>1 )
 498 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 499 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 500 *
     * GH is both an input and the output register; HK is only read.
     * TMP1-TMP5 are xmm scratch registers and are all overwritten.
     * No general purpose register is touched.
 501 */
 502 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 503         movdqa    \GH, \TMP1
 504         pshufd    $78, \GH, \TMP2
 505         pshufd    $78, \HK, \TMP3
 506         pxor      \GH, \TMP2            # TMP2 = a1+a0
 507         pxor      \HK, \TMP3            # TMP3 = b1+b0
 508         pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
 509         pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
 510         pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 511         pxor      \GH, \TMP2
 512         pxor      \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
 513         movdqa    \TMP2, \TMP3
 514         pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 515         psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 516         pxor      \TMP3, \GH
 517         pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 518
 519         # first phase of the reduction
 520
 521         movdqa    \GH, \TMP2
 522         movdqa    \GH, \TMP3
 523         movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
 524                                         # in order to perform
 525                                         # independent shifts
 526         pslld     $31, \TMP2            # packed left shift <<31
 527         pslld     $30, \TMP3            # packed left shift <<30
 528         pslld     $25, \TMP4            # packed left shift <<25
 529         pxor      \TMP3, \TMP2          # xor the shifted versions
 530         pxor      \TMP4, \TMP2
 531         movdqa    \TMP2, \TMP5
 532         psrldq    $4, \TMP5             # right shift TMP5 1 DW
 533         pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 534         pxor      \TMP2, \GH
 535
 536         # second phase of the reduction
 537
 538         movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
 539                                         # in order to perform
 540                                         # independent shifts
 541         movdqa    \GH,\TMP3
 542         movdqa    \GH,\TMP4
 543         psrld     $1,\TMP2              # packed right shift >>1
 544         psrld     $2,\TMP3              # packed right shift >>2
 545         psrld     $7,\TMP4              # packed right shift >>7
 546         pxor      \TMP3,\TMP2           # xor the shifted versions
 547         pxor      \TMP4,\TMP2
 548         pxor      \TMP5, \TMP2
 549         pxor      \TMP2, \GH
 550         pxor      \TMP1, \GH            # result is in GH
 551 .endm
 552
 553 # Reads DLEN bytes starting at DPTR and stores in XMMDst
 554 # where 0 < DLEN < 16
 555 # Clobbers %rax, DLEN and XMM1
     # Bytes beyond DLEN in XMMDst come out as zero.  Memory is only touched
     # inside DPTR .. DPTR+DLEN-1, so a short buffer is never over-read.
     # DLEN must not be 0: both byte loops decrement before they test.
 556 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 557         cmp $8, \DLEN
 558         jl .L_read_lt8_\@
 559         mov (\DPTR), %rax               # 8 or more bytes: fetch the first 8 in one load
 560         movq %rax, \XMMDst              # movq also clears the upper half of XMMDst
 561         sub $8, \DLEN
 562         jz .L_done_read_partial_block_\@
 563         xor %eax, %eax
 564 .L_read_next_byte_\@:
 565         shl $8, %rax
 566         mov 7(\DPTR, \DLEN, 1), %al     # walk from the last byte down to byte 8
 567         dec \DLEN
 568         jnz .L_read_next_byte_\@
 569         movq %rax, \XMM1
 570         pslldq $8, \XMM1                # place bytes 8 and up in the upper half
 571         por \XMM1, \XMMDst
 572         jmp .L_done_read_partial_block_\@
 573 .L_read_lt8_\@:
 574         xor %eax, %eax
 575 .L_read_next_byte_lt8_\@:
 576         shl $8, %rax
 577         mov -1(\DPTR, \DLEN, 1), %al    # walk from the last byte down to byte 0
 578         dec \DLEN
 579         jnz .L_read_next_byte_lt8_\@
 580         movq %rax, \XMMDst
 581 .L_done_read_partial_block_\@:
 582 .endm
 583
 584 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 585 # clobbers r10-11, xmm14
     # Also overwrites %rax (through READ_PARTIAL_BLOCK) and TMP1-TMP7.
     # HASHKEY is the xmm register holding HashKey<<1 mod poly; AAD and AADLEN
     # are the pointer to and byte length of the data.  The result is stored at
     # AadHash relative to arg2 and is also left in TMP6.
 586 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 587         TMP6 TMP7
 588         MOVADQ     SHUF_MASK(%rip), %xmm14
 589         mov        \AAD, %r10           # %r10 = AAD
 590         mov        \AADLEN, %r11                # %r11 = aadLen
 591         pxor       \TMP7, \TMP7
 592         pxor       \TMP6, \TMP6         # TMP6 = running hash, starts at zero
 593
 594         cmp        $16, %r11
 595         jl         .L_get_AAD_rest\@
 596 .L_get_AAD_blocks\@:
 597         movdqu     (%r10), \TMP7
 598         pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 599         pxor       \TMP7, \TMP6
 600         GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 601         add        $16, %r10
 602         sub        $16, %r11
 603         cmp        $16, %r11
 604         jge        .L_get_AAD_blocks\@
 605
 606         movdqu     \TMP6, \TMP7
 607
 608         /* read the last <16B of AAD */
 609 .L_get_AAD_rest\@:
 610         test       %r11, %r11
 611         je         .L_get_AAD_done\@
 612
 613         READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 614         pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 615         pxor       \TMP6, \TMP7
 616         GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 617         movdqu \TMP7, \TMP6             # move the final hash back into TMP6
 618
 619 .L_get_AAD_done\@:
 620         movdqu \TMP6, AadHash(%arg2)
 621 .endm
 622
 623 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 624 # between update calls.
 625 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 626 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
 627 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
     # Does nothing when PBlockLen is 0.  Otherwise it consumes up to
     # 16 - PBlockLen input bytes using the keystream saved in PBlockEncKey,
     # advances DATA_OFFSET by the number of bytes written, and either completes
     # the block (GHASH multiply, PBlockLen = 0) or adds PLAIN_CYPH_LEN to
     # PBlockLen when the input is too short to complete it.
 628 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 629         AAD_HASH operation
 630         mov     PBlockLen(%arg2), %r13
 631         test    %r13, %r13
 632         je      .L_partial_block_done_\@        # Leave Macro if no partial blocks
 633         # Read in input data without over reading
 634         cmp     $16, \PLAIN_CYPH_LEN
 635         jl      .L_fewer_than_16_bytes_\@
 636         movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
 637         jmp     .L_data_read_\@
 638
 639 .L_fewer_than_16_bytes_\@:
 640         lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 641         mov     \PLAIN_CYPH_LEN, %r12
 642         READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 643
 644         mov PBlockLen(%arg2), %r13
 645
 646 .L_data_read_\@:                                # Finished reading in data
 647
 648         movdqu  PBlockEncKey(%arg2), %xmm9
 649         movdqu  HashKey(%arg2), %xmm13
 650
 651         lea     SHIFT_MASK(%rip), %r12
 652
 653         # adjust the shuffle mask pointer to be able to shift r13 bytes
 654         # (16-r13 is the number of bytes in plaintext mod 16)
 655         add     %r13, %r12
 656         movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
 657         pshufb  %xmm2, %xmm9            # shift right r13 bytes
 658
 659 .ifc \operation, dec
 660         movdqa  %xmm1, %xmm3            # keep the ciphertext: it is what gets hashed
 661         pxor    %xmm1, %xmm9            # Ciphertext XOR E(K, Yn)
 662
 663         mov     \PLAIN_CYPH_LEN, %r10
 664         add     %r13, %r10
 665         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 666         sub     $16, %r10
 667         # Determine if partial block is not being filled and
 668         # shift mask accordingly
 669         jge     .L_no_extra_mask_1_\@
 670         sub     %r10, %r12              # r10 is negative: move the mask so only the input bytes stay
 671 .L_no_extra_mask_1_\@:
 672
 673         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 674         # get the appropriate mask to mask out top r13 bytes of xmm9
 675         pand    %xmm1, %xmm9            # mask out top r13 bytes of xmm9
 676
 677         pand    %xmm1, %xmm3
 678         movdqa  SHUF_MASK(%rip), %xmm10
 679         pshufb  %xmm10, %xmm3           # byte-reflect for GHASH
 680         pshufb  %xmm2, %xmm3            # slide below the bytes hashed by the earlier call
 681         pxor    %xmm3, \AAD_HASH
 682
 683         test    %r10, %r10
 684         jl      .L_partial_incomplete_1_\@
 685
 686         # GHASH computation for the last <16 Byte block
 687         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 688         xor     %eax, %eax
 689
 690         mov     %rax, PBlockLen(%arg2)  # block completed: nothing pending any more
 691         jmp     .L_dec_done_\@
 692 .L_partial_incomplete_1_\@:
 693         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 694 .L_dec_done_\@:
 695         movdqu  \AAD_HASH, AadHash(%arg2)
 696 .else
 697         pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
 698
 699         mov     \PLAIN_CYPH_LEN, %r10
 700         add     %r13, %r10
 701         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 702         sub     $16, %r10
 703         # Determine if partial block is not being filled and
 704         # shift mask accordingly
 705         jge     .L_no_extra_mask_2_\@
 706         sub     %r10, %r12              # r10 is negative: move the mask so only the input bytes stay
 707 .L_no_extra_mask_2_\@:
 708
 709         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 710         # get the appropriate mask to mask out top r13 bytes of xmm9
 711         pand    %xmm1, %xmm9
 712
 713         movdqa  SHUF_MASK(%rip), %xmm1
 714         pshufb  %xmm1, %xmm9            # byte-reflect for GHASH
 715         pshufb  %xmm2, %xmm9            # slide below the bytes hashed by the earlier call
 716         pxor    %xmm9, \AAD_HASH
 717
 718         test    %r10, %r10
 719         jl      .L_partial_incomplete_2_\@
 720
 721         # GHASH computation for the last <16 Byte block
 722         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 723         xor     %eax, %eax
 724
 725         mov     %rax, PBlockLen(%arg2)  # block completed: nothing pending any more
 726         jmp     .L_encode_done_\@
 727 .L_partial_incomplete_2_\@:
 728         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 729 .L_encode_done_\@:
 730         movdqu  \AAD_HASH, AadHash(%arg2)
 731
 732         movdqa  SHUF_MASK(%rip), %xmm10
 733         # shuffle xmm9 back to output as ciphertext
 734         pshufb  %xmm10, %xmm9
 735         pshufb  %xmm2, %xmm9
 736 .endif
 737         # output encrypted Bytes
 738         test    %r10, %r10
 739         jl      .L_partial_fill_\@
 740         mov     %r13, %r12
 741         mov     $16, %r13
 742         # Set r13 to be the number of bytes to write out
 743         sub     %r12, %r13
 744         jmp     .L_count_set_\@
 745 .L_partial_fill_\@:
 746         mov     \PLAIN_CYPH_LEN, %r13
 747 .L_count_set_\@:
 748         movdqa  %xmm9, %xmm0
 749         movq    %xmm0, %rax
 750         cmp     $8, %r13
 751         jle     .L_less_than_8_bytes_left_\@
 752
 753         mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 754         add     $8, \DATA_OFFSET
 755         psrldq  $8, %xmm0
 756         movq    %xmm0, %rax
 757         sub     $8, %r13
 758 .L_less_than_8_bytes_left_\@:
 759         movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 760         add     $1, \DATA_OFFSET
 761         shr     $8, %rax
 762         sub     $1, %r13
 763         jne     .L_less_than_8_bytes_left_\@
 764 .L_partial_block_done_\@:
 765 .endm # PARTIAL_BLOCK
 766
 767 /*
 768 * if a = number of total plaintext bytes
 769 * b = floor(a/16)
 770 * num_initial_blocks = b mod 4
 771 * encrypt the initial num_initial_blocks blocks and apply ghash on
 772 * the ciphertext
 773 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 774 * are clobbered
 775 * arg1, %arg2, %arg3 are used as a pointer only, not modified
 776 */
 777
 778
 779 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 780         XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 781         MOVADQ          SHUF_MASK(%rip), %xmm14
 782
 783         movdqu AadHash(%arg2), %xmm\i               # XMM0 = Y0
 784
 785         # start AES for num_initial_blocks blocks
 786
 787         movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 788
 789 .if (\i == 5) || (\i == 6) || (\i == 7)
 790
 791         MOVADQ          ONE(%RIP),\TMP1
 792         MOVADQ          0(%arg1),\TMP2
 793 .irpc index, \i_seq
 794         paddd           \TMP1, \XMM0                 # INCR Y0
 795 .ifc \operation, dec
 796         movdqa     \XMM0, %xmm\index
 797 .else
 798         MOVADQ          \XMM0, %xmm\index
 799 .endif
 800         pshufb  %xmm14, %xmm\index      # perform a 16 byte swap
 801         pxor            \TMP2, %xmm\index
 802 .endr
 803         lea     0x10(%arg1),%r10
 804         mov     keysize,%eax
 805         shr     $2,%eax                         # 128->4, 192->6, 256->8
 806         add     $5,%eax                       # 128->9, 192->11, 256->13
 807
 808 .Laes_loop_initial_\@:
 809         MOVADQ  (%r10),\TMP1
 810 .irpc   index, \i_seq
 811         aesenc  \TMP1, %xmm\index
 812 .endr
 813         add     $16,%r10
 814         sub     $1,%eax
 815         jnz     .Laes_loop_initial_\@
 816
 817         MOVADQ  (%r10), \TMP1
 818 .irpc index, \i_seq
 819         aesenclast \TMP1, %xmm\index         # Last Round
 820 .endr
 821 .irpc index, \i_seq
 822         movdqu     (%arg4 , %r11, 1), \TMP1
 823         pxor       \TMP1, %xmm\index
 824         movdqu     %xmm\index, (%arg3 , %r11, 1)
 825         # write back plaintext/ciphertext for num_initial_blocks
 826         add        $16, %r11
 827
 828 .ifc \operation, dec
 829         movdqa     \TMP1, %xmm\index
 830 .endif
 831         pshufb     %xmm14, %xmm\index
 832
 833                 # prepare plaintext/ciphertext for GHASH computation
 834 .endr
 835 .endif
 836
 837         # apply GHASH on num_initial_blocks blocks
 838
 839 .if \i == 5
 840         pxor       %xmm5, %xmm6
 841         GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 842         pxor       %xmm6, %xmm7
 843         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 844         pxor       %xmm7, %xmm8
 845         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 846 .elseif \i == 6
 847         pxor       %xmm6, %xmm7
 848         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 849         pxor       %xmm7, %xmm8
 850         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 851 .elseif \i == 7
 852         pxor       %xmm7, %xmm8
 853         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 854 .endif
 855         cmp        $64, %r13
 856         jl      .L_initial_blocks_done\@
 857         # no need for precomputed values
 858 /*
 859 *
 860 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 861 * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
 862 */
 863         MOVADQ     ONE(%RIP),\TMP1
 864         paddd      \TMP1, \XMM0              # INCR Y0
 865         MOVADQ     \XMM0, \XMM1
 866         pshufb  %xmm14, \XMM1        # perform a 16 byte swap
 867
 868         paddd      \TMP1, \XMM0              # INCR Y0
 869         MOVADQ     \XMM0, \XMM2
 870         pshufb  %xmm14, \XMM2        # perform a 16 byte swap
 871
 872         paddd      \TMP1, \XMM0              # INCR Y0
 873         MOVADQ     \XMM0, \XMM3
 874         pshufb %xmm14, \XMM3        # perform a 16 byte swap
 875
 876         paddd      \TMP1, \XMM0              # INCR Y0
 877         MOVADQ     \XMM0, \XMM4
 878         pshufb %xmm14, \XMM4        # perform a 16 byte swap
 879
 880         MOVADQ     0(%arg1),\TMP1
 881         pxor       \TMP1, \XMM1
 882         pxor       \TMP1, \XMM2
 883         pxor       \TMP1, \XMM3
 884         pxor       \TMP1, \XMM4
 885 .irpc index, 1234 # do 4 rounds
 886         movaps 0x10*\index(%arg1), \TMP1
 887         aesenc     \TMP1, \XMM1
 888         aesenc     \TMP1, \XMM2
 889         aesenc     \TMP1, \XMM3
 890         aesenc     \TMP1, \XMM4
 891 .endr
 892 .irpc index, 56789 # do next 5 rounds
 893         movaps 0x10*\index(%arg1), \TMP1
 894         aesenc     \TMP1, \XMM1
 895         aesenc     \TMP1, \XMM2
 896         aesenc     \TMP1, \XMM3
 897         aesenc     \TMP1, \XMM4
 898 .endr
 899         lea        0xa0(%arg1),%r10
 900         mov        keysize,%eax
 901         shr        $2,%eax                      # 128->4, 192->6, 256->8
 902         sub        $4,%eax                      # 128->0, 192->2, 256->4
 903         jz         .Laes_loop_pre_done\@
 904
 905 .Laes_loop_pre_\@:
 906         MOVADQ     (%r10),\TMP2
 907 .irpc   index, 1234
 908         aesenc     \TMP2, %xmm\index
 909 .endr
 910         add        $16,%r10
 911         sub        $1,%eax
 912         jnz        .Laes_loop_pre_\@
 913
 914 .Laes_loop_pre_done\@:
 915         MOVADQ     (%r10), \TMP2
 916         aesenclast \TMP2, \XMM1
 917         aesenclast \TMP2, \XMM2
 918         aesenclast \TMP2, \XMM3
 919         aesenclast \TMP2, \XMM4
 920         movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 921         pxor       \TMP1, \XMM1
 922 .ifc \operation, dec
 923         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 924         movdqa     \TMP1, \XMM1
 925 .endif
 926         movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 927         pxor       \TMP1, \XMM2
 928 .ifc \operation, dec
 929         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 930         movdqa     \TMP1, \XMM2
 931 .endif
 932         movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 933         pxor       \TMP1, \XMM3
 934 .ifc \operation, dec
 935         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 936         movdqa     \TMP1, \XMM3
 937 .endif
 938         movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 939         pxor       \TMP1, \XMM4
 940 .ifc \operation, dec
 941         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 942         movdqa     \TMP1, \XMM4
 943 .else
 944         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 945         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 946         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 947         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 948 .endif
 949
 950         add        $64, %r11
 951         pshufb %xmm14, \XMM1 # perform a 16 byte swap
 952         pxor       \XMMDst, \XMM1
 953 # combine GHASHed value with the corresponding ciphertext
 954         pshufb %xmm14, \XMM2 # perform a 16 byte swap
 955         pshufb %xmm14, \XMM3 # perform a 16 byte swap
 956         pshufb %xmm14, \XMM4 # perform a 16 byte swap
 957
 958 .L_initial_blocks_done\@:
 959
 960 .endm
 961
 962 /*
 963 * Encrypt 4 counter blocks at a time, interleaved with the GHASH of
 964 * the 4 previously encrypted ciphertext blocks (passed in XMM1..XMM4)
 965 * %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
 966 * %r11 is the data offset value (not advanced here); %eax, %r10, %xmm15 are clobbered
 967 */
 968 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
 969 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 970
 971         movdqa    \XMM1, \XMM5                  # keep the previous 4 blocks for GHASH;
 972         movdqa    \XMM2, \XMM6                  # XMM1..XMM4 are reused below for the
 973         movdqa    \XMM3, \XMM7                  # new counter blocks
 974         movdqa    \XMM4, \XMM8
 975
 976         movdqa    SHUF_MASK(%rip), %xmm15       # pshufb mask for the 16 byte swaps
 977         # multiply XMM5 * HashKey_4 using karatsuba
 978
 979         movdqa    \XMM5, \TMP4
 980         pshufd    $78, \XMM5, \TMP6             # swap the two 64-bit halves
 981         pxor      \XMM5, \TMP6                  # TMP6 = a1+a0 (in both halves)
 982         paddd     ONE(%rip), \XMM0              # INCR CNT
 983         movdqu    HashKey_4(%arg2), \TMP5
 984         pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 985         movdqa    \XMM0, \XMM1
 986         paddd     ONE(%rip), \XMM0              # INCR CNT
 987         movdqa    \XMM0, \XMM2
 988         paddd     ONE(%rip), \XMM0              # INCR CNT
 989         movdqa    \XMM0, \XMM3
 990         paddd     ONE(%rip), \XMM0              # INCR CNT
 991         movdqa    \XMM0, \XMM4
 992         pshufb %xmm15, \XMM1    # perform a 16 byte swap
 993         pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
 994         pshufb %xmm15, \XMM2    # perform a 16 byte swap
 995         pshufb %xmm15, \XMM3    # perform a 16 byte swap
 996         pshufb %xmm15, \XMM4    # perform a 16 byte swap
 997
 998         pxor      (%arg1), \XMM1                # Round 0: xor with the first round key
 999         pxor      (%arg1), \XMM2
1000         pxor      (%arg1), \XMM3
1001         pxor      (%arg1), \XMM4
1002         movdqu    HashKey_4_k(%arg2), \TMP5
1003         pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1004         movaps 0x10(%arg1), \TMP1
1005         aesenc    \TMP1, \XMM1              # Round 1
1006         aesenc    \TMP1, \XMM2
1007         aesenc    \TMP1, \XMM3
1008         aesenc    \TMP1, \XMM4
1009         movaps 0x20(%arg1), \TMP1
1010         aesenc    \TMP1, \XMM1              # Round 2
1011         aesenc    \TMP1, \XMM2
1012         aesenc    \TMP1, \XMM3
1013         aesenc    \TMP1, \XMM4
1014         movdqa    \XMM6, \TMP1              # multiply XMM6 * HashKey_3 using karatsuba
1015         pshufd    $78, \XMM6, \TMP2
1016         pxor      \XMM6, \TMP2
1017         movdqu    HashKey_3(%arg2), \TMP5
1018         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1019         movaps 0x30(%arg1), \TMP3
1020         aesenc    \TMP3, \XMM1              # Round 3
1021         aesenc    \TMP3, \XMM2
1022         aesenc    \TMP3, \XMM3
1023         aesenc    \TMP3, \XMM4
1024         pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1025         movaps 0x40(%arg1), \TMP3
1026         aesenc    \TMP3, \XMM1              # Round 4
1027         aesenc    \TMP3, \XMM2
1028         aesenc    \TMP3, \XMM3
1029         aesenc    \TMP3, \XMM4
1030         movdqu    HashKey_3_k(%arg2), \TMP5
1031         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1032         movaps 0x50(%arg1), \TMP3
1033         aesenc    \TMP3, \XMM1              # Round 5
1034         aesenc    \TMP3, \XMM2
1035         aesenc    \TMP3, \XMM3
1036         aesenc    \TMP3, \XMM4
1037         pxor      \TMP1, \TMP4
1038 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1039         pxor      \XMM6, \XMM5
1040         pxor      \TMP2, \TMP6
1041         movdqa    \XMM7, \TMP1
1042         pshufd    $78, \XMM7, \TMP2
1043         pxor      \XMM7, \TMP2
1044         movdqu    HashKey_2(%arg2), \TMP5
1045
1046         # Multiply XMM7 * HashKey_2 using karatsuba
1047
1048         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1049         movaps 0x60(%arg1), \TMP3
1050         aesenc    \TMP3, \XMM1              # Round 6
1051         aesenc    \TMP3, \XMM2
1052         aesenc    \TMP3, \XMM3
1053         aesenc    \TMP3, \XMM4
1054         pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1055         movaps 0x70(%arg1), \TMP3
1056         aesenc    \TMP3, \XMM1              # Round 7
1057         aesenc    \TMP3, \XMM2
1058         aesenc    \TMP3, \XMM3
1059         aesenc    \TMP3, \XMM4
1060         movdqu    HashKey_2_k(%arg2), \TMP5
1061         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1062         movaps 0x80(%arg1), \TMP3
1063         aesenc    \TMP3, \XMM1              # Round 8
1064         aesenc    \TMP3, \XMM2
1065         aesenc    \TMP3, \XMM3
1066         aesenc    \TMP3, \XMM4
1067         pxor      \TMP1, \TMP4
1068 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1069         pxor      \XMM7, \XMM5
1070         pxor      \TMP2, \TMP6
1071
1072         # Multiply XMM8 * HashKey
1073         # XMM8 and TMP5 hold the values for the two operands
1074
1075         movdqa    \XMM8, \TMP1
1076         pshufd    $78, \XMM8, \TMP2
1077         pxor      \XMM8, \TMP2
1078         movdqu    HashKey(%arg2), \TMP5
1079         pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1080         movaps 0x90(%arg1), \TMP3
1081         aesenc    \TMP3, \XMM1             # Round 9
1082         aesenc    \TMP3, \XMM2
1083         aesenc    \TMP3, \XMM3
1084         aesenc    \TMP3, \XMM4
1085         pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1086         lea       0xa0(%arg1),%r10              # r10 = address of the round 10 key
1087         mov       keysize,%eax
1088         shr       $2,%eax                       # 128->4, 192->6, 256->8
1089         sub       $4,%eax                       # 128->0, 192->2, 256->4
1090         jz        .Laes_loop_par_enc_done\@     # 128-bit key: no extra middle rounds
1091
1092 .Laes_loop_par_enc\@:
1093         MOVADQ    (%r10),\TMP3
1094 .irpc   index, 1234
1095         aesenc    \TMP3, %xmm\index             # NOTE: names xmm1..xmm4 directly, so callers must pass them as XMM1..XMM4
1096 .endr
1097         add       $16,%r10
1098         sub       $1,%eax
1099         jnz       .Laes_loop_par_enc\@
1100
1101 .Laes_loop_par_enc_done\@:
1102         MOVADQ    (%r10), \TMP3
1103         aesenclast \TMP3, \XMM1           # last round (10, 12 or 14 depending on key size)
1104         aesenclast \TMP3, \XMM2
1105         aesenclast \TMP3, \XMM3
1106         aesenclast \TMP3, \XMM4
1107         movdqu    HashKey_k(%arg2), \TMP5
1108         pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1109         movdqu    (%arg4,%r11,1), \TMP3
1110         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1111         movdqu    16(%arg4,%r11,1), \TMP3
1112         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1113         movdqu    32(%arg4,%r11,1), \TMP3
1114         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1115         movdqu    48(%arg4,%r11,1), \TMP3
1116         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1117         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1118         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1119         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1120         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1121         pshufb %xmm15, \XMM1        # perform a 16 byte swap
1122         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1123         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1124         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1125
1126         pxor      \TMP4, \TMP1                 # TMP1 = sum of the four a1*b1 products
1127         pxor      \XMM8, \XMM5                 # XMM5 = sum of the four a0*b0 products
1128         pxor      \TMP6, \TMP2                 # TMP2 = sum of the four (a1+a0)*(b1+b0) products
1129         pxor      \TMP1, \TMP2
1130         pxor      \XMM5, \TMP2                 # TMP2 = karatsuba middle term
1131         movdqa    \TMP2, \TMP3
1132         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1133         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1134         pxor      \TMP3, \XMM5
1135         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1136
1137         # first phase of reduction
1138
1139         movdqa    \XMM5, \TMP2
1140         movdqa    \XMM5, \TMP3
1141         movdqa    \XMM5, \TMP4
1142 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1143         pslld     $31, \TMP2                   # packed left shift << 31
1144         pslld     $30, \TMP3                   # packed left shift << 30
1145         pslld     $25, \TMP4                   # packed left shift << 25
1146         pxor      \TMP3, \TMP2                 # xor the shifted versions
1147         pxor      \TMP4, \TMP2
1148         movdqa    \TMP2, \TMP5
1149         psrldq    $4, \TMP5                    # right shift T5 1 DW
1150         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1151         pxor      \TMP2, \XMM5
1152
1153         # second phase of reduction
1154
1155         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1156         movdqa    \XMM5,\TMP3
1157         movdqa    \XMM5,\TMP4
1158         psrld     $1, \TMP2                    # packed right shift >>1
1159         psrld     $2, \TMP3                    # packed right shift >>2
1160         psrld     $7, \TMP4                    # packed right shift >>7
1161         pxor      \TMP3,\TMP2                  # xor the shifted versions
1162         pxor      \TMP4,\TMP2
1163         pxor      \TMP5, \TMP2
1164         pxor      \TMP2, \XMM5
1165         pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1166
1167         pxor      \XMM5, \XMM1                 # fold the GHASH value into the first new block
1168 .endm
1169
1170 /*
1171 * Decrypt 4 blocks at a time, interleaved with the GHASH of
1172 * the 4 previously decrypted ciphertext blocks (passed in XMM1..XMM4)
1173 * %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
1174 * %r11 is the data offset value (not advanced here); %eax, %r10, %xmm15 are clobbered
1175 */
1176 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1177 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1178
1179         movdqa    \XMM1, \XMM5                  # keep the previous 4 blocks for GHASH;
1180         movdqa    \XMM2, \XMM6                  # XMM1..XMM4 are reused below for the
1181         movdqa    \XMM3, \XMM7                  # new counter blocks
1182         movdqa    \XMM4, \XMM8
1183
1184         movdqa    SHUF_MASK(%rip), %xmm15       # pshufb mask for the 16 byte swaps
1185         # multiply XMM5 * HashKey_4 using karatsuba
1186
1187         movdqa    \XMM5, \TMP4
1188         pshufd    $78, \XMM5, \TMP6             # swap the two 64-bit halves
1189         pxor      \XMM5, \TMP6                  # TMP6 = a1+a0 (in both halves)
1190         paddd     ONE(%rip), \XMM0              # INCR CNT
1191         movdqu    HashKey_4(%arg2), \TMP5
1192         pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1193         movdqa    \XMM0, \XMM1
1194         paddd     ONE(%rip), \XMM0              # INCR CNT
1195         movdqa    \XMM0, \XMM2
1196         paddd     ONE(%rip), \XMM0              # INCR CNT
1197         movdqa    \XMM0, \XMM3
1198         paddd     ONE(%rip), \XMM0              # INCR CNT
1199         movdqa    \XMM0, \XMM4
1200         pshufb %xmm15, \XMM1    # perform a 16 byte swap
1201         pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1202         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1203         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1204         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1205
1206         pxor      (%arg1), \XMM1                # Round 0: xor with the first round key
1207         pxor      (%arg1), \XMM2
1208         pxor      (%arg1), \XMM3
1209         pxor      (%arg1), \XMM4
1210         movdqu    HashKey_4_k(%arg2), \TMP5
1211         pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1212         movaps 0x10(%arg1), \TMP1
1213         aesenc    \TMP1, \XMM1              # Round 1
1214         aesenc    \TMP1, \XMM2
1215         aesenc    \TMP1, \XMM3
1216         aesenc    \TMP1, \XMM4
1217         movaps 0x20(%arg1), \TMP1
1218         aesenc    \TMP1, \XMM1              # Round 2
1219         aesenc    \TMP1, \XMM2
1220         aesenc    \TMP1, \XMM3
1221         aesenc    \TMP1, \XMM4
1222         movdqa    \XMM6, \TMP1              # multiply XMM6 * HashKey_3 using karatsuba
1223         pshufd    $78, \XMM6, \TMP2
1224         pxor      \XMM6, \TMP2
1225         movdqu    HashKey_3(%arg2), \TMP5
1226         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1227         movaps 0x30(%arg1), \TMP3
1228         aesenc    \TMP3, \XMM1              # Round 3
1229         aesenc    \TMP3, \XMM2
1230         aesenc    \TMP3, \XMM3
1231         aesenc    \TMP3, \XMM4
1232         pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1233         movaps 0x40(%arg1), \TMP3
1234         aesenc    \TMP3, \XMM1              # Round 4
1235         aesenc    \TMP3, \XMM2
1236         aesenc    \TMP3, \XMM3
1237         aesenc    \TMP3, \XMM4
1238         movdqu    HashKey_3_k(%arg2), \TMP5
1239         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1240         movaps 0x50(%arg1), \TMP3
1241         aesenc    \TMP3, \XMM1              # Round 5
1242         aesenc    \TMP3, \XMM2
1243         aesenc    \TMP3, \XMM3
1244         aesenc    \TMP3, \XMM4
1245         pxor      \TMP1, \TMP4
1246 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1247         pxor      \XMM6, \XMM5
1248         pxor      \TMP2, \TMP6
1249         movdqa    \XMM7, \TMP1
1250         pshufd    $78, \XMM7, \TMP2
1251         pxor      \XMM7, \TMP2
1252         movdqu    HashKey_2(%arg2), \TMP5
1253
1254         # Multiply XMM7 * HashKey_2 using karatsuba
1255
1256         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1257         movaps 0x60(%arg1), \TMP3
1258         aesenc    \TMP3, \XMM1              # Round 6
1259         aesenc    \TMP3, \XMM2
1260         aesenc    \TMP3, \XMM3
1261         aesenc    \TMP3, \XMM4
1262         pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1263         movaps 0x70(%arg1), \TMP3
1264         aesenc    \TMP3, \XMM1              # Round 7
1265         aesenc    \TMP3, \XMM2
1266         aesenc    \TMP3, \XMM3
1267         aesenc    \TMP3, \XMM4
1268         movdqu    HashKey_2_k(%arg2), \TMP5
1269         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1270         movaps 0x80(%arg1), \TMP3
1271         aesenc    \TMP3, \XMM1              # Round 8
1272         aesenc    \TMP3, \XMM2
1273         aesenc    \TMP3, \XMM3
1274         aesenc    \TMP3, \XMM4
1275         pxor      \TMP1, \TMP4
1276 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1277         pxor      \XMM7, \XMM5
1278         pxor      \TMP2, \TMP6
1279
1280         # Multiply XMM8 * HashKey
1281         # XMM8 and TMP5 hold the values for the two operands
1282
1283         movdqa    \XMM8, \TMP1
1284         pshufd    $78, \XMM8, \TMP2
1285         pxor      \XMM8, \TMP2
1286         movdqu    HashKey(%arg2), \TMP5
1287         pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1288         movaps 0x90(%arg1), \TMP3
1289         aesenc    \TMP3, \XMM1             # Round 9
1290         aesenc    \TMP3, \XMM2
1291         aesenc    \TMP3, \XMM3
1292         aesenc    \TMP3, \XMM4
1293         pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1294         lea       0xa0(%arg1),%r10              # r10 = address of the round 10 key
1295         mov       keysize,%eax
1296         shr       $2,%eax                       # 128->4, 192->6, 256->8
1297         sub       $4,%eax                       # 128->0, 192->2, 256->4
1298         jz        .Laes_loop_par_dec_done\@     # 128-bit key: no extra middle rounds
1299
1300 .Laes_loop_par_dec\@:
1301         MOVADQ    (%r10),\TMP3
1302 .irpc   index, 1234
1303         aesenc    \TMP3, %xmm\index             # NOTE: names xmm1..xmm4 directly, so callers must pass them as XMM1..XMM4
1304 .endr
1305         add       $16,%r10
1306         sub       $1,%eax
1307         jnz       .Laes_loop_par_dec\@
1308
1309 .Laes_loop_par_dec_done\@:
1310         MOVADQ    (%r10), \TMP3
1311         aesenclast \TMP3, \XMM1           # last round (10, 12 or 14 depending on key size)
1312         aesenclast \TMP3, \XMM2
1313         aesenclast \TMP3, \XMM3
1314         aesenclast \TMP3, \XMM4
1315         movdqu    HashKey_k(%arg2), \TMP5
1316         pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1317         movdqu    (%arg4,%r11,1), \TMP3
1318         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1319         movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1320         movdqa    \TMP3, \XMM1                 # keep the input (ciphertext) block for GHASH
1321         movdqu    16(%arg4,%r11,1), \TMP3
1322         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1323         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1324         movdqa    \TMP3, \XMM2                 # keep the input (ciphertext) block for GHASH
1325         movdqu    32(%arg4,%r11,1), \TMP3
1326         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1327         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1328         movdqa    \TMP3, \XMM3                 # keep the input (ciphertext) block for GHASH
1329         movdqu    48(%arg4,%r11,1), \TMP3
1330         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1331         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1332         movdqa    \TMP3, \XMM4                 # keep the input (ciphertext) block for GHASH
1333         pshufb %xmm15, \XMM1        # perform a 16 byte swap
1334         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1335         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1336         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1337
1338         pxor      \TMP4, \TMP1                 # TMP1 = sum of the four a1*b1 products
1339         pxor      \XMM8, \XMM5                 # XMM5 = sum of the four a0*b0 products
1340         pxor      \TMP6, \TMP2                 # TMP2 = sum of the four (a1+a0)*(b1+b0) products
1341         pxor      \TMP1, \TMP2
1342         pxor      \XMM5, \TMP2                 # TMP2 = karatsuba middle term
1343         movdqa    \TMP2, \TMP3
1344         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1345         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1346         pxor      \TMP3, \XMM5
1347         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1348
1349         # first phase of reduction
1350
1351         movdqa    \XMM5, \TMP2
1352         movdqa    \XMM5, \TMP3
1353         movdqa    \XMM5, \TMP4
1354 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1355         pslld     $31, \TMP2                   # packed left shift << 31
1356         pslld     $30, \TMP3                   # packed left shift << 30
1357         pslld     $25, \TMP4                   # packed left shift << 25
1358         pxor      \TMP3, \TMP2                 # xor the shifted versions
1359         pxor      \TMP4, \TMP2
1360         movdqa    \TMP2, \TMP5
1361         psrldq    $4, \TMP5                    # right shift T5 1 DW
1362         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1363         pxor      \TMP2, \XMM5
1364
1365         # second phase of reduction
1366
1367         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1368         movdqa    \XMM5,\TMP3
1369         movdqa    \XMM5,\TMP4
1370         psrld     $1, \TMP2                    # packed right shift >>1
1371         psrld     $2, \TMP3                    # packed right shift >>2
1372         psrld     $7, \TMP4                    # packed right shift >>7
1373         pxor      \TMP3,\TMP2                  # xor the shifted versions
1374         pxor      \TMP4,\TMP2
1375         pxor      \TMP5, \TMP2
1376         pxor      \TMP2, \XMM5
1377         pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1378
1379         pxor      \XMM5, \XMM1                 # fold the GHASH value into the first saved block
1380 .endm
1381
1382 /* GHASH the last 4 ciphertext blocks: XMM1..XMM4 are multiplied by HashKey_4..HashKey, summed and reduced into XMMDst; XMM1..XMM4 and TMP1..TMP7 are overwritten. */
1383 .macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1384 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1385
1386         # Multiply XMM1 * HashKey_4 (using Karatsuba)
1387
1388         movdqa    \XMM1, \TMP6
1389         pshufd    $78, \XMM1, \TMP2         # swap the two 64-bit halves
1390         pxor      \XMM1, \TMP2              # TMP2 = a1+a0 (in both halves)
1391         movdqu    HashKey_4(%arg2), \TMP5
1392         pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1393         pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1394         movdqu    HashKey_4_k(%arg2), \TMP4
1395         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1396         movdqa    \XMM1, \XMMDst
1397         movdqa    \TMP2, \XMM1              # high, low, middle parts in TMP6, XMMDst, XMM1
1398
1399         # Multiply XMM2 * HashKey_3 (using Karatsuba)
1400
1401         movdqa    \XMM2, \TMP1
1402         pshufd    $78, \XMM2, \TMP2
1403         pxor      \XMM2, \TMP2
1404         movdqu    HashKey_3(%arg2), \TMP5
1405         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1406         pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1407         movdqu    HashKey_3_k(%arg2), \TMP4
1408         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1409         pxor      \TMP1, \TMP6
1410         pxor      \XMM2, \XMMDst
1411         pxor      \TMP2, \XMM1
1412 # results accumulated in TMP6, XMMDst, XMM1
1413
1414         # Multiply XMM3 * HashKey_2 (using Karatsuba)
1415
1416         movdqa    \XMM3, \TMP1
1417         pshufd    $78, \XMM3, \TMP2
1418         pxor      \XMM3, \TMP2
1419         movdqu    HashKey_2(%arg2), \TMP5
1420         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1421         pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1422         movdqu    HashKey_2_k(%arg2), \TMP4
1423         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1424         pxor      \TMP1, \TMP6
1425         pxor      \XMM3, \XMMDst
1426         pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1427
1428         # Multiply XMM4 * HashKey (using Karatsuba)
1429         movdqa    \XMM4, \TMP1
1430         pshufd    $78, \XMM4, \TMP2
1431         pxor      \XMM4, \TMP2
1432         movdqu    HashKey(%arg2), \TMP5
1433         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1434         pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1435         movdqu    HashKey_k(%arg2), \TMP4
1436         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1437         pxor      \TMP1, \TMP6              # TMP6 = sum of the four a1*b1 products
1438         pxor      \XMM4, \XMMDst            # XMMDst = sum of the four a0*b0 products
1439         pxor      \XMM1, \TMP2              # TMP2 = sum of the four (a1+a0)*(b1+b0) products
1440         pxor      \TMP6, \TMP2
1441         pxor      \XMMDst, \TMP2
1442         # middle section of the temp results combined as in karatsuba algorithm
1443         movdqa    \TMP2, \TMP4
1444         pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1445         psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1446         pxor      \TMP4, \XMMDst
1447         pxor      \TMP2, \TMP6
1448 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1449         # first phase of the reduction
1450         movdqa    \XMMDst, \TMP2
1451         movdqa    \XMMDst, \TMP3
1452         movdqa    \XMMDst, \TMP4
1453 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1454         pslld     $31, \TMP2                # packed left shifting << 31
1455         pslld     $30, \TMP3                # packed left shifting << 30
1456         pslld     $25, \TMP4                # packed left shifting << 25
1457         pxor      \TMP3, \TMP2              # xor the shifted versions
1458         pxor      \TMP4, \TMP2
1459         movdqa    \TMP2, \TMP7
1460         psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1461         pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1462         pxor      \TMP2, \XMMDst
1463
1464         # second phase of the reduction
1465         movdqa    \XMMDst, \TMP2
1466         # make 3 copies of XMMDst for doing 3 shift operations
1467         movdqa    \XMMDst, \TMP3
1468         movdqa    \XMMDst, \TMP4
1469         psrld     $1, \TMP2                 # packed right shift >> 1
1470         psrld     $2, \TMP3                 # packed right shift >> 2
1471         psrld     $7, \TMP4                 # packed right shift >> 7
1472         pxor      \TMP3, \TMP2              # xor the shifted versions
1473         pxor      \TMP4, \TMP2
1474         pxor      \TMP7, \TMP2
1475         pxor      \TMP2, \XMMDst
1476         pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1477 .endm
1478
1479
1480 /* Encrypt the single block held in XMM0, in place, with the key schedule at %arg1
1481 * (round count derived from keysize). Uses eax & r10; TMP1 and flags are clobbered.
1482 */
1483
1484 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1485
1486         pxor            (%arg1), \XMM0          # round 0: xor with the first round key
1487         mov             keysize,%eax
1488         shr             $2,%eax                 # 128->4, 192->6, 256->8
1489         add             $5,%eax                 # 128->9, 192->11, 256->13
1490         lea             16(%arg1), %r10   # get first expanded key address
1491
1492 .L_esb_loop_\@:                                  # .L prefix keeps this label out of the symbol table, like the other macro labels
1493         MOVADQ          (%r10),\TMP1
1494         aesenc          \TMP1,\XMM0
1495         add             $16,%r10
1496         sub             $1,%eax                 # eax = middle rounds still to run
1497         jnz             .L_esb_loop_\@
1498
1499         MOVADQ          (%r10),\TMP1            # r10 now points at the last round key
1500         aesenclast      \TMP1,\XMM0
1501 .endm
1502
1503 /*****************************************************************************
1504 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1505 *                     struct gcm_context_data *data,
1506 *                                         // context data
1507 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1508 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1509 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1510 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1511 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1512 *                     u64 aad_len);       // Length of AAD in bytes.
1513 */
1514 SYM_FUNC_START(aesni_gcm_init)
1515         FUNC_SAVE
1516         GCM_INIT %arg3, %arg4,%arg5, %arg6      /* iv, hash_subkey, aad, aad_len (assuming argN is the Nth C argument) */
1517         FUNC_RESTORE
1518         RET
1519 SYM_FUNC_END(aesni_gcm_init)
1520
1521 /*****************************************************************************
1522 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1523 *                    struct gcm_context_data *data,
1524 *                                        // context data
1525 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1526 *                    const u8 *in,       // Plaintext input
1527 *                    u64 plaintext_len); // Length of data in bytes for encryption.
1528 */
1529 SYM_FUNC_START(aesni_gcm_enc_update)
1530         FUNC_SAVE
1531         GCM_ENC_DEC enc                         /* shared enc/dec body, instantiated for encryption */
1532         FUNC_RESTORE
1533         RET
1534 SYM_FUNC_END(aesni_gcm_enc_update)
1535
1536 /*****************************************************************************
1537 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1538 *                    struct gcm_context_data *data,
1539 *                                        // context data
1540 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1541 *                    const u8 *in,       // Ciphertext input
1542 *                    u64 plaintext_len); // Length of data in bytes for decryption.
1543 */
1544 SYM_FUNC_START(aesni_gcm_dec_update)
1545         FUNC_SAVE
1546         GCM_ENC_DEC dec                         /* shared enc/dec body, instantiated for decryption */
1547         FUNC_RESTORE
1548         RET
1549 SYM_FUNC_END(aesni_gcm_dec_update)
1550
1551 /*****************************************************************************
1552 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1553 *                    struct gcm_context_data *data,
1554 *                                        // context data
1555 *                    u8 *auth_tag,       // Authenticated Tag output.
1556 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1557 *                                        // 12 or 8.
1558 */
1559 SYM_FUNC_START(aesni_gcm_finalize)
1560         FUNC_SAVE
1561         GCM_COMPLETE %arg3 %arg4                /* auth_tag, auth_tag_len (assuming argN is the Nth C argument) */
1562         FUNC_RESTORE
1563         RET
1564 SYM_FUNC_END(aesni_gcm_finalize)
1565
1566 #endif
1567
1568 SYM_FUNC_START_LOCAL(_key_expansion_256a)       /* in: xmm0 = key words to update, xmm1 = aeskeygenassist result, dword 0 of xmm4 == 0 */
1569         pshufd $0b11111111, %xmm1, %xmm1        # broadcast dword 3 of xmm1 to all four dwords
1570         shufps $0b00010000, %xmm0, %xmm4        # xmm4 dwords 0..3 = (xmm4[0], xmm4[0], xmm0[1], xmm0[0])
1571         pxor %xmm4, %xmm0
1572         shufps $0b10001100, %xmm0, %xmm4        # xmm4 dwords 0..3 = (xmm4[0], xmm4[3], xmm0[0], xmm0[2])
1573         pxor %xmm4, %xmm0                       # with xmm4[0] == 0, dword i of xmm0 is now the XOR of its original dwords 0..i
1574         pxor %xmm1, %xmm0                       # fold in the broadcast keygen word
1575         movaps %xmm0, (TKEYP)                   # store the new round key (aligned store)
1576         add $0x10, TKEYP                        # advance to the next 16-byte round key slot
1577         RET
1578 SYM_FUNC_END(_key_expansion_256a)
1579 SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)   /* the 128-bit path calls the same code under this name */
1580
1581 SYM_FUNC_START_LOCAL(_key_expansion_192a)       /* in: xmm0, xmm2 = key words, xmm1 = aeskeygenassist result; writes 32 bytes at (TKEYP) */
1582         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of xmm1 to all four dwords
1583         shufps $0b00010000, %xmm0, %xmm4        # xmm4 dwords 0..3 = (xmm4[0], xmm4[0], xmm0[1], xmm0[0])
1584         pxor %xmm4, %xmm0
1585         shufps $0b10001100, %xmm0, %xmm4        # xmm4 dwords 0..3 = (xmm4[0], xmm4[3], xmm0[0], xmm0[2])
1586         pxor %xmm4, %xmm0                       # with xmm4[0] == 0, dword i of xmm0 is the XOR of its original dwords 0..i
1587         pxor %xmm1, %xmm0                       # fold in the broadcast keygen word
1588
1589         movaps %xmm2, %xmm5                     # xmm5: copy of xmm2 to be byte-shifted
1590         movaps %xmm2, %xmm6                     # xmm6: keeps the incoming xmm2 for the first store below
1591         pslldq $4, %xmm5                        # shift xmm5 left by one dword
1592         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the updated xmm0
1593         pxor %xmm3, %xmm2
1594         pxor %xmm5, %xmm2                       # xmm2 ^= broadcast word ^ (old xmm2 << 32)
1595
1596         movaps %xmm0, %xmm1
1597         shufps $0b01000100, %xmm0, %xmm6        # xmm6 = (old xmm2[0], old xmm2[1], xmm0[0], xmm0[1])
1598         movaps %xmm6, (TKEYP)
1599         shufps $0b01001110, %xmm2, %xmm1        # xmm1 = (xmm0[2], xmm0[3], new xmm2[0], new xmm2[1])
1600         movaps %xmm1, 0x10(TKEYP)
1601         add $0x20, TKEYP                        # two 16-byte slots were written
1602         RET
1603 SYM_FUNC_END(_key_expansion_192a)
1604
1605 SYM_FUNC_START_LOCAL(_key_expansion_192b)       /* in: xmm0, xmm2 = key words, xmm1 = aeskeygenassist result; writes 16 bytes at (TKEYP) */
1606         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of xmm1 to all four dwords
1607         shufps $0b00010000, %xmm0, %xmm4        # xmm4 dwords 0..3 = (xmm4[0], xmm4[0], xmm0[1], xmm0[0])
1608         pxor %xmm4, %xmm0
1609         shufps $0b10001100, %xmm0, %xmm4        # xmm4 dwords 0..3 = (xmm4[0], xmm4[3], xmm0[0], xmm0[2])
1610         pxor %xmm4, %xmm0                       # with xmm4[0] == 0, dword i of xmm0 is the XOR of its original dwords 0..i
1611         pxor %xmm1, %xmm0                       # fold in the broadcast keygen word
1612
1613         movaps %xmm2, %xmm5
1614         pslldq $4, %xmm5                        # shift the xmm2 copy left by one dword
1615         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the updated xmm0
1616         pxor %xmm3, %xmm2
1617         pxor %xmm5, %xmm2                       # xmm2 ^= broadcast word ^ (old xmm2 << 32); kept in xmm2 for the next call
1618
1619         movaps %xmm0, (TKEYP)                   # only xmm0 is stored by this variant
1620         add $0x10, TKEYP
1621         RET
1622 SYM_FUNC_END(_key_expansion_192b)
1623
1624 SYM_FUNC_START_LOCAL(_key_expansion_256b)       /* in: xmm2 = key words to update, xmm1 = aeskeygenassist result, dword 0 of xmm4 == 0 */
1625         pshufd $0b10101010, %xmm1, %xmm1        # broadcast dword 2 of xmm1 to all four dwords
1626         shufps $0b00010000, %xmm2, %xmm4        # xmm4 dwords 0..3 = (xmm4[0], xmm4[0], xmm2[1], xmm2[0])
1627         pxor %xmm4, %xmm2
1628         shufps $0b10001100, %xmm2, %xmm4        # xmm4 dwords 0..3 = (xmm4[0], xmm4[3], xmm2[0], xmm2[2])
1629         pxor %xmm4, %xmm2                       # with xmm4[0] == 0, dword i of xmm2 is the XOR of its original dwords 0..i
1630         pxor %xmm1, %xmm2                       # fold in the broadcast keygen word
1631         movaps %xmm2, (TKEYP)                   # store the new round key (aligned store)
1632         add $0x10, TKEYP                        # advance to the next 16-byte round key slot
1633         RET
1634 SYM_FUNC_END(_key_expansion_256b)
1635
1636 /*
1637  * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1638  *                    unsigned int key_len)
1639  */
1640 SYM_FUNC_START(aesni_set_key)           /* writes round keys from ctx+0, key_len at ctx+480, inverse-cipher keys from ctx+240 */
1641         FRAME_BEGIN
1642 #ifndef __x86_64__
1643         pushl KEYP                      # 32-bit build: KEYP is saved here and restored before return
1644         movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1645         movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1646         movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1647 #endif
1648         movups (UKEYP), %xmm0           # user key (first 16 bytes)
1649         movaps %xmm0, (KEYP)            # first round key is the first 16 key bytes; ctx must be 16-byte aligned for movaps
1650         lea 0x10(KEYP), TKEYP           # key addr
1651         movl %edx, 480(KEYP)            # remember key_len at offset 480 of ctx
1652         pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1653         cmp $24, %dl                    # only the low byte of key_len is tested
1654         jb .Lenc_key128                 # below 24: 128-bit schedule
1655         je .Lenc_key192                 # exactly 24: 192-bit schedule; otherwise fall through to 256-bit
1656         movups 0x10(UKEYP), %xmm2       # other user key
1657         movaps %xmm2, (TKEYP)           # second 16 key bytes are stored unchanged as the next round key
1658         add $0x10, TKEYP
1659         aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1660         call _key_expansion_256a
1661         aeskeygenassist $0x1, %xmm0, %xmm1
1662         call _key_expansion_256b
1663         aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1664         call _key_expansion_256a
1665         aeskeygenassist $0x2, %xmm0, %xmm1
1666         call _key_expansion_256b
1667         aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1668         call _key_expansion_256a
1669         aeskeygenassist $0x4, %xmm0, %xmm1
1670         call _key_expansion_256b
1671         aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1672         call _key_expansion_256a
1673         aeskeygenassist $0x8, %xmm0, %xmm1
1674         call _key_expansion_256b
1675         aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1676         call _key_expansion_256a
1677         aeskeygenassist $0x10, %xmm0, %xmm1
1678         call _key_expansion_256b
1679         aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1680         call _key_expansion_256a
1681         aeskeygenassist $0x20, %xmm0, %xmm1
1682         call _key_expansion_256b
1683         aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1684         call _key_expansion_256a
1685         jmp .Ldec_key
1686 .Lenc_key192:
1687         movq 0x10(UKEYP), %xmm2         # other user key
1688         aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1689         call _key_expansion_192a
1690         aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1691         call _key_expansion_192b
1692         aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1693         call _key_expansion_192a
1694         aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1695         call _key_expansion_192b
1696         aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1697         call _key_expansion_192a
1698         aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1699         call _key_expansion_192b
1700         aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1701         call _key_expansion_192a
1702         aeskeygenassist $0x80, %xmm2, %xmm1     # round 8
1703         call _key_expansion_192b
1704         jmp .Ldec_key
1705 .Lenc_key128:
1706         aeskeygenassist $0x1, %xmm0, %xmm1      # round 1
1707         call _key_expansion_128
1708         aeskeygenassist $0x2, %xmm0, %xmm1      # round 2
1709         call _key_expansion_128
1710         aeskeygenassist $0x4, %xmm0, %xmm1      # round 3
1711         call _key_expansion_128
1712         aeskeygenassist $0x8, %xmm0, %xmm1      # round 4
1713         call _key_expansion_128
1714         aeskeygenassist $0x10, %xmm0, %xmm1     # round 5
1715         call _key_expansion_128
1716         aeskeygenassist $0x20, %xmm0, %xmm1     # round 6
1717         call _key_expansion_128
1718         aeskeygenassist $0x40, %xmm0, %xmm1     # round 7
1719         call _key_expansion_128
1720         aeskeygenassist $0x80, %xmm0, %xmm1     # round 8
1721         call _key_expansion_128
1722         aeskeygenassist $0x1b, %xmm0, %xmm1     # round 9
1723         call _key_expansion_128
1724         aeskeygenassist $0x36, %xmm0, %xmm1     # round 10
1725         call _key_expansion_128
1726 .Ldec_key:                              # build the key schedule for aesdec, 240 bytes above the encryption one
1727         sub $0x10, TKEYP                # TKEYP now points at the last encryption round key
1728         movaps (KEYP), %xmm0            # first encryption round key
1729         movaps (TKEYP), %xmm1           # last encryption round key
1730         movaps %xmm0, 240(TKEYP)        # first enc key becomes the last dec key, unchanged
1731         movaps %xmm1, 240(KEYP)         # last enc key becomes the first dec key, unchanged
1732         add $0x10, KEYP                 # source: second encryption round key, walking upwards
1733         lea 240-16(TKEYP), UKEYP        # destination: next-to-last dec slot, walking downwards (UKEYP is reused as a pointer)
1734 .align 4
1735 .Ldec_key_loop:
1736         movaps (KEYP), %xmm0
1737         aesimc %xmm0, %xmm1             # inner round keys go through aesimc before being stored in reverse order
1738         movaps %xmm1, (UKEYP)
1739         add $0x10, KEYP
1740         sub $0x10, UKEYP
1741         cmp TKEYP, KEYP
1742         jb .Ldec_key_loop               # stop once the source pointer reaches the last encryption round key
1743 #ifndef __x86_64__
1744         popl KEYP
1745 #endif
1746         FRAME_END
1747         RET
1748 SYM_FUNC_END(aesni_set_key)
1749
1750 /*
1751  * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1752  */
1753 SYM_FUNC_START(aesni_enc)               /* encrypt the single 16-byte block at src into dst */
1754         FRAME_BEGIN
1755 #ifndef __x86_64__
1756         pushl KEYP                      # 32-bit build: KEYP and KLEN are saved here and restored before return
1757         pushl KLEN
1758         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1759         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1760         movl (FRAME_OFFSET+20)(%esp), INP       # src
1761 #endif
1762         movl 480(KEYP), KLEN            # key length
1763         movups (INP), STATE             # input
1764         call _aesni_enc1                # STATE is encrypted in place
1765         movups STATE, (OUTP)            # output
1766 #ifndef __x86_64__
1767         popl KLEN
1768         popl KEYP
1769 #endif
1770         FRAME_END
1771         RET
1772 SYM_FUNC_END(aesni_enc)
1773
1774 /*
1775  * _aesni_enc1:         internal ABI
1776  * input:
1777  *      KEYP:           key struct pointer
1778  *      KLEN:           key length (below / equal to / above 24 selects 10 / 12 / 14 rounds)
1779  *      STATE:          initial state (input)
1780  * output:
1781  *      STATE:          final state (output)
1782  * changed:
1783  *      KEY
1784  *      TKEYP (T1)
1785  */
1786 SYM_FUNC_START_LOCAL(_aesni_enc1)
1787         movaps (KEYP), KEY              # key
1788         mov KEYP, TKEYP
1789         pxor KEY, STATE         # round 0
1790         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
1791         cmp $24, KLEN
1792         jb .Lenc128                     # key length below 24: only the shared rounds
1793         lea 0x20(TKEYP), TKEYP          # lea leaves the flags from cmp intact for the je below
1794         je .Lenc192                     # key length 24: two extra rounds
1795         add $0x20, TKEYP                # longer key: four extra rounds
1796         movaps -0x60(TKEYP), KEY
1797         aesenc KEY, STATE
1798         movaps -0x50(TKEYP), KEY
1799         aesenc KEY, STATE
1800 .align 4
1801 .Lenc192:
1802         movaps -0x40(TKEYP), KEY
1803         aesenc KEY, STATE
1804         movaps -0x30(TKEYP), KEY
1805         aesenc KEY, STATE
1806 .align 4
1807 .Lenc128:                               # shared tail: nine aesenc rounds plus aesenclast
1808         movaps -0x20(TKEYP), KEY
1809         aesenc KEY, STATE
1810         movaps -0x10(TKEYP), KEY
1811         aesenc KEY, STATE
1812         movaps (TKEYP), KEY
1813         aesenc KEY, STATE
1814         movaps 0x10(TKEYP), KEY
1815         aesenc KEY, STATE
1816         movaps 0x20(TKEYP), KEY
1817         aesenc KEY, STATE
1818         movaps 0x30(TKEYP), KEY
1819         aesenc KEY, STATE
1820         movaps 0x40(TKEYP), KEY
1821         aesenc KEY, STATE
1822         movaps 0x50(TKEYP), KEY
1823         aesenc KEY, STATE
1824         movaps 0x60(TKEYP), KEY
1825         aesenc KEY, STATE
1826         movaps 0x70(TKEYP), KEY
1827         aesenclast KEY, STATE           # last round
1828         RET
1829 SYM_FUNC_END(_aesni_enc1)
1830
1831 /*
1832  * _aesni_enc4: internal ABI
1833  * input:
1834  *      KEYP:           key struct pointer
1835  *      KLEN:           key length (below / equal to / above 24 selects 10 / 12 / 14 rounds)
1836  *      STATE1:         initial state (input)
1837  *      STATE2
1838  *      STATE3
1839  *      STATE4
1840  * output:
1841  *      STATE1:         final state (output)
1842  *      STATE2
1843  *      STATE3
1844  *      STATE4
1845  * changed:
1846  *      KEY
1847  *      TKEYP (T1)
1848  */
1849 SYM_FUNC_START_LOCAL(_aesni_enc4)
1850         movaps (KEYP), KEY              # key
1851         mov KEYP, TKEYP
1852         pxor KEY, STATE1                # round 0
1853         pxor KEY, STATE2
1854         pxor KEY, STATE3
1855         pxor KEY, STATE4
1856         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
1857         cmp $24, KLEN
1858         jb .L4enc128                    # key length below 24: only the shared rounds
1859         lea 0x20(TKEYP), TKEYP          # lea leaves the flags from cmp intact for the je below
1860         je .L4enc192                    # key length 24: two extra rounds
1861         add $0x20, TKEYP                # longer key: four extra rounds
1862         movaps -0x60(TKEYP), KEY
1863         aesenc KEY, STATE1
1864         aesenc KEY, STATE2
1865         aesenc KEY, STATE3
1866         aesenc KEY, STATE4
1867         movaps -0x50(TKEYP), KEY
1868         aesenc KEY, STATE1
1869         aesenc KEY, STATE2
1870         aesenc KEY, STATE3
1871         aesenc KEY, STATE4
1872 #.align 4
1873 .L4enc192:
1874         movaps -0x40(TKEYP), KEY
1875         aesenc KEY, STATE1
1876         aesenc KEY, STATE2
1877         aesenc KEY, STATE3
1878         aesenc KEY, STATE4
1879         movaps -0x30(TKEYP), KEY
1880         aesenc KEY, STATE1
1881         aesenc KEY, STATE2
1882         aesenc KEY, STATE3
1883         aesenc KEY, STATE4
1884 #.align 4
1885 .L4enc128:                              # shared tail: each round key is loaded once and applied to all four states
1886         movaps -0x20(TKEYP), KEY
1887         aesenc KEY, STATE1
1888         aesenc KEY, STATE2
1889         aesenc KEY, STATE3
1890         aesenc KEY, STATE4
1891         movaps -0x10(TKEYP), KEY
1892         aesenc KEY, STATE1
1893         aesenc KEY, STATE2
1894         aesenc KEY, STATE3
1895         aesenc KEY, STATE4
1896         movaps (TKEYP), KEY
1897         aesenc KEY, STATE1
1898         aesenc KEY, STATE2
1899         aesenc KEY, STATE3
1900         aesenc KEY, STATE4
1901         movaps 0x10(TKEYP), KEY
1902         aesenc KEY, STATE1
1903         aesenc KEY, STATE2
1904         aesenc KEY, STATE3
1905         aesenc KEY, STATE4
1906         movaps 0x20(TKEYP), KEY
1907         aesenc KEY, STATE1
1908         aesenc KEY, STATE2
1909         aesenc KEY, STATE3
1910         aesenc KEY, STATE4
1911         movaps 0x30(TKEYP), KEY
1912         aesenc KEY, STATE1
1913         aesenc KEY, STATE2
1914         aesenc KEY, STATE3
1915         aesenc KEY, STATE4
1916         movaps 0x40(TKEYP), KEY
1917         aesenc KEY, STATE1
1918         aesenc KEY, STATE2
1919         aesenc KEY, STATE3
1920         aesenc KEY, STATE4
1921         movaps 0x50(TKEYP), KEY
1922         aesenc KEY, STATE1
1923         aesenc KEY, STATE2
1924         aesenc KEY, STATE3
1925         aesenc KEY, STATE4
1926         movaps 0x60(TKEYP), KEY
1927         aesenc KEY, STATE1
1928         aesenc KEY, STATE2
1929         aesenc KEY, STATE3
1930         aesenc KEY, STATE4
1931         movaps 0x70(TKEYP), KEY
1932         aesenclast KEY, STATE1          # last round
1933         aesenclast KEY, STATE2
1934         aesenclast KEY, STATE3
1935         aesenclast KEY, STATE4
1936         RET
1937 SYM_FUNC_END(_aesni_enc4)
1938
1939 /*
1940  * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
1941  */
1942 SYM_FUNC_START(aesni_dec)               /* decrypt the single 16-byte block at src into dst */
1943         FRAME_BEGIN
1944 #ifndef __x86_64__
1945         pushl KEYP                      # 32-bit build: KEYP and KLEN are saved here and restored before return
1946         pushl KLEN
1947         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1948         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1949         movl (FRAME_OFFSET+20)(%esp), INP       # src
1950 #endif
1951         mov 480(KEYP), KLEN             # key length
1952         add $240, KEYP                  # offset 240 is where aesni_set_key writes the decryption round keys
1953         movups (INP), STATE             # input
1954         call _aesni_dec1                # STATE is decrypted in place
1955         movups STATE, (OUTP)            #output
1956 #ifndef __x86_64__
1957         popl KLEN
1958         popl KEYP
1959 #endif
1960         FRAME_END
1961         RET
1962 SYM_FUNC_END(aesni_dec)
1963
1964 /*
1965  * _aesni_dec1:         internal ABI
1966  * input:
1967  *      KEYP:           key struct pointer (first decryption round key)
1968  *      KLEN:           key length (below / equal to / above 24 selects 10 / 12 / 14 rounds)
1969  *      STATE:          initial state (input)
1970  * output:
1971  *      STATE:          final state (output)
1972  * changed:
1973  *      KEY
1974  *      TKEYP (T1)
1975  */
1976 SYM_FUNC_START_LOCAL(_aesni_dec1)
1977         movaps (KEYP), KEY              # key
1978         mov KEYP, TKEYP
1979         pxor KEY, STATE         # round 0
1980         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
1981         cmp $24, KLEN
1982         jb .Ldec128                     # key length below 24: only the shared rounds
1983         lea 0x20(TKEYP), TKEYP          # lea leaves the flags from cmp intact for the je below
1984         je .Ldec192                     # key length 24: two extra rounds
1985         add $0x20, TKEYP                # longer key: four extra rounds
1986         movaps -0x60(TKEYP), KEY
1987         aesdec KEY, STATE
1988         movaps -0x50(TKEYP), KEY
1989         aesdec KEY, STATE
1990 .align 4
1991 .Ldec192:
1992         movaps -0x40(TKEYP), KEY
1993         aesdec KEY, STATE
1994         movaps -0x30(TKEYP), KEY
1995         aesdec KEY, STATE
1996 .align 4
1997 .Ldec128:                               # shared tail: nine aesdec rounds plus aesdeclast
1998         movaps -0x20(TKEYP), KEY
1999         aesdec KEY, STATE
2000         movaps -0x10(TKEYP), KEY
2001         aesdec KEY, STATE
2002         movaps (TKEYP), KEY
2003         aesdec KEY, STATE
2004         movaps 0x10(TKEYP), KEY
2005         aesdec KEY, STATE
2006         movaps 0x20(TKEYP), KEY
2007         aesdec KEY, STATE
2008         movaps 0x30(TKEYP), KEY
2009         aesdec KEY, STATE
2010         movaps 0x40(TKEYP), KEY
2011         aesdec KEY, STATE
2012         movaps 0x50(TKEYP), KEY
2013         aesdec KEY, STATE
2014         movaps 0x60(TKEYP), KEY
2015         aesdec KEY, STATE
2016         movaps 0x70(TKEYP), KEY
2017         aesdeclast KEY, STATE           # last round
2018         RET
2019 SYM_FUNC_END(_aesni_dec1)
2020
2021 /*
2022  * _aesni_dec4: internal ABI
2023  * input:
2024  *      KEYP:           key struct pointer (first decryption round key)
2025  *      KLEN:           key length (below / equal to / above 24 selects 10 / 12 / 14 rounds)
2026  *      STATE1:         initial state (input)
2027  *      STATE2
2028  *      STATE3
2029  *      STATE4
2030  * output:
2031  *      STATE1:         final state (output)
2032  *      STATE2
2033  *      STATE3
2034  *      STATE4
2035  * changed:
2036  *      KEY
2037  *      TKEYP (T1)
2038  */
2039 SYM_FUNC_START_LOCAL(_aesni_dec4)
2040         movaps (KEYP), KEY              # key
2041         mov KEYP, TKEYP
2042         pxor KEY, STATE1                # round 0
2043         pxor KEY, STATE2
2044         pxor KEY, STATE3
2045         pxor KEY, STATE4
2046         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
2047         cmp $24, KLEN
2048         jb .L4dec128                    # key length below 24: only the shared rounds
2049         lea 0x20(TKEYP), TKEYP          # lea leaves the flags from cmp intact for the je below
2050         je .L4dec192                    # key length 24: two extra rounds
2051         add $0x20, TKEYP                # longer key: four extra rounds
2052         movaps -0x60(TKEYP), KEY
2053         aesdec KEY, STATE1
2054         aesdec KEY, STATE2
2055         aesdec KEY, STATE3
2056         aesdec KEY, STATE4
2057         movaps -0x50(TKEYP), KEY
2058         aesdec KEY, STATE1
2059         aesdec KEY, STATE2
2060         aesdec KEY, STATE3
2061         aesdec KEY, STATE4
2062 .align 4
2063 .L4dec192:
2064         movaps -0x40(TKEYP), KEY
2065         aesdec KEY, STATE1
2066         aesdec KEY, STATE2
2067         aesdec KEY, STATE3
2068         aesdec KEY, STATE4
2069         movaps -0x30(TKEYP), KEY
2070         aesdec KEY, STATE1
2071         aesdec KEY, STATE2
2072         aesdec KEY, STATE3
2073         aesdec KEY, STATE4
2074 .align 4
2075 .L4dec128:                              # shared tail: each round key is loaded once and applied to all four states
2076         movaps -0x20(TKEYP), KEY
2077         aesdec KEY, STATE1
2078         aesdec KEY, STATE2
2079         aesdec KEY, STATE3
2080         aesdec KEY, STATE4
2081         movaps -0x10(TKEYP), KEY
2082         aesdec KEY, STATE1
2083         aesdec KEY, STATE2
2084         aesdec KEY, STATE3
2085         aesdec KEY, STATE4
2086         movaps (TKEYP), KEY
2087         aesdec KEY, STATE1
2088         aesdec KEY, STATE2
2089         aesdec KEY, STATE3
2090         aesdec KEY, STATE4
2091         movaps 0x10(TKEYP), KEY
2092         aesdec KEY, STATE1
2093         aesdec KEY, STATE2
2094         aesdec KEY, STATE3
2095         aesdec KEY, STATE4
2096         movaps 0x20(TKEYP), KEY
2097         aesdec KEY, STATE1
2098         aesdec KEY, STATE2
2099         aesdec KEY, STATE3
2100         aesdec KEY, STATE4
2101         movaps 0x30(TKEYP), KEY
2102         aesdec KEY, STATE1
2103         aesdec KEY, STATE2
2104         aesdec KEY, STATE3
2105         aesdec KEY, STATE4
2106         movaps 0x40(TKEYP), KEY
2107         aesdec KEY, STATE1
2108         aesdec KEY, STATE2
2109         aesdec KEY, STATE3
2110         aesdec KEY, STATE4
2111         movaps 0x50(TKEYP), KEY
2112         aesdec KEY, STATE1
2113         aesdec KEY, STATE2
2114         aesdec KEY, STATE3
2115         aesdec KEY, STATE4
2116         movaps 0x60(TKEYP), KEY
2117         aesdec KEY, STATE1
2118         aesdec KEY, STATE2
2119         aesdec KEY, STATE3
2120         aesdec KEY, STATE4
2121         movaps 0x70(TKEYP), KEY
2122         aesdeclast KEY, STATE1          # last round
2123         aesdeclast KEY, STATE2
2124         aesdeclast KEY, STATE3
2125         aesdeclast KEY, STATE4
2126         RET
2127 SYM_FUNC_END(_aesni_dec4)
2128
2129 /*
2130  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2131  *                    size_t len)
2132  */
2133 SYM_FUNC_START(aesni_ecb_enc)           /* ECB-encrypt each whole 16-byte block of src into dst; a trailing partial block is not processed */
2134         FRAME_BEGIN
2135 #ifndef __x86_64__
2136         pushl LEN                       # 32-bit build: LEN, KEYP and KLEN are saved here and restored before return
2137         pushl KEYP
2138         pushl KLEN
2139         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2140         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2141         movl (FRAME_OFFSET+24)(%esp), INP       # src
2142         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2143 #endif
2144         test LEN, LEN           # check length
2145         jz .Lecb_enc_ret
2146         mov 480(KEYP), KLEN             # key length
2147         cmp $16, LEN
2148         jb .Lecb_enc_ret                # less than one block: nothing to do
2149         cmp $64, LEN
2150         jb .Lecb_enc_loop1              # fewer than four blocks: go straight to the single-block loop
2151 .align 4
2152 .Lecb_enc_loop4:                        # four blocks per iteration
2153         movups (INP), STATE1
2154         movups 0x10(INP), STATE2
2155         movups 0x20(INP), STATE3
2156         movups 0x30(INP), STATE4
2157         call _aesni_enc4
2158         movups STATE1, (OUTP)
2159         movups STATE2, 0x10(OUTP)
2160         movups STATE3, 0x20(OUTP)
2161         movups STATE4, 0x30(OUTP)
2162         sub $64, LEN
2163         add $64, INP
2164         add $64, OUTP
2165         cmp $64, LEN
2166         jae .Lecb_enc_loop4             # unsigned test: len is a size_t, matching the jb checks above
2167         cmp $16, LEN
2168         jb .Lecb_enc_ret
2169 .align 4
2170 .Lecb_enc_loop1:                        # one block per iteration
2171         movups (INP), STATE1
2172         call _aesni_enc1
2173         movups STATE1, (OUTP)
2174         sub $16, LEN
2175         add $16, INP
2176         add $16, OUTP
2177         cmp $16, LEN
2178         jae .Lecb_enc_loop1             # unsigned test, as above
2179 .Lecb_enc_ret:
2180 #ifndef __x86_64__
2181         popl KLEN
2182         popl KEYP
2183         popl LEN
2184 #endif
2185         FRAME_END
2186         RET
2187 SYM_FUNC_END(aesni_ecb_enc)
2188
2189 /*
2190  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2191  *                    size_t len);
2192  */
2193 SYM_FUNC_START(aesni_ecb_dec)           /* ECB-decrypt each whole 16-byte block of src into dst; a trailing partial block is not processed */
2194         FRAME_BEGIN
2195 #ifndef __x86_64__
2196         pushl LEN                       # 32-bit build: LEN, KEYP and KLEN are saved here and restored before return
2197         pushl KEYP
2198         pushl KLEN
2199         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2200         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2201         movl (FRAME_OFFSET+24)(%esp), INP       # src
2202         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2203 #endif
2204         test LEN, LEN                   # check length
2205         jz .Lecb_dec_ret
2206         mov 480(KEYP), KLEN             # key length
2207         add $240, KEYP                  # offset 240 is where aesni_set_key writes the decryption round keys
2208         cmp $16, LEN
2209         jb .Lecb_dec_ret                # less than one block: nothing to do
2210         cmp $64, LEN
2211         jb .Lecb_dec_loop1              # fewer than four blocks: go straight to the single-block loop
2212 .align 4
2213 .Lecb_dec_loop4:                        # four blocks per iteration
2214         movups (INP), STATE1
2215         movups 0x10(INP), STATE2
2216         movups 0x20(INP), STATE3
2217         movups 0x30(INP), STATE4
2218         call _aesni_dec4
2219         movups STATE1, (OUTP)
2220         movups STATE2, 0x10(OUTP)
2221         movups STATE3, 0x20(OUTP)
2222         movups STATE4, 0x30(OUTP)
2223         sub $64, LEN
2224         add $64, INP
2225         add $64, OUTP
2226         cmp $64, LEN
2227         jae .Lecb_dec_loop4             # unsigned test: len is a size_t, matching the jb checks above
2228         cmp $16, LEN
2229         jb .Lecb_dec_ret
2230 .align 4
2231 .Lecb_dec_loop1:                        # one block per iteration
2232         movups (INP), STATE1
2233         call _aesni_dec1
2234         movups STATE1, (OUTP)
2235         sub $16, LEN
2236         add $16, INP
2237         add $16, OUTP
2238         cmp $16, LEN
2239         jae .Lecb_dec_loop1             # unsigned test, as above
2240 .Lecb_dec_ret:
2241 #ifndef __x86_64__
2242         popl KLEN
2243         popl KEYP
2244         popl LEN
2245 #endif
2246         FRAME_END
2247         RET
2248 SYM_FUNC_END(aesni_ecb_dec)
2249
2250 /*
2251  * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2252  *                    size_t len, u8 *iv)
2253  */
2254 SYM_FUNC_START(aesni_cbc_enc)           /* CBC-encrypt each whole 16-byte block of src into dst; *iv receives the last ciphertext block */
2255         FRAME_BEGIN
2256 #ifndef __x86_64__
2257         pushl IVP                       # 32-bit build: IVP, LEN, KEYP and KLEN are saved here and restored before return
2258         pushl LEN
2259         pushl KEYP
2260         pushl KLEN
2261         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2262         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2263         movl (FRAME_OFFSET+28)(%esp), INP       # src
2264         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2265         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2266 #endif
2267         cmp $16, LEN
2268         jb .Lcbc_enc_ret                # less than one block: return without touching *iv
2269         mov 480(KEYP), KLEN             # key length
2270         movups (IVP), STATE     # load iv as initial state
2271 .align 4
2272 .Lcbc_enc_loop:                         # STATE carries the previous ciphertext block (initially the IV)
2273         movups (INP), IN        # load input
2274         pxor IN, STATE                  # chain: plaintext XOR previous ciphertext
2275         call _aesni_enc1
2276         movups STATE, (OUTP)    # store output
2277         sub $16, LEN
2278         add $16, INP
2279         add $16, OUTP
2280         cmp $16, LEN
2281         jae .Lcbc_enc_loop              # unsigned test: len is a size_t, matching the jb check above
2282         movups STATE, (IVP)             # write the last ciphertext block back as the next IV
2283 .Lcbc_enc_ret:
2284 #ifndef __x86_64__
2285         popl KLEN
2286         popl KEYP
2287         popl LEN
2288         popl IVP
2289 #endif
2290         FRAME_END
2291         RET
2292 SYM_FUNC_END(aesni_cbc_enc)
2293
2294 /*
2295  * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2296  *                    size_t len, u8 *iv)
2297  */
2298 SYM_FUNC_START(aesni_cbc_dec)           /* CBC-decrypt each whole 16-byte block of src into dst; *iv receives the last ciphertext block */
2299         FRAME_BEGIN
2300 #ifndef __x86_64__
2301         pushl IVP                       # 32-bit build: IVP, LEN, KEYP and KLEN are saved here and restored before return
2302         pushl LEN
2303         pushl KEYP
2304         pushl KLEN
2305         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2306         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2307         movl (FRAME_OFFSET+28)(%esp), INP       # src
2308         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2309         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2310 #endif
2311         cmp $16, LEN
2312         jb .Lcbc_dec_just_ret           # less than one block: return without touching *iv
2313         mov 480(KEYP), KLEN             # key length
2314         add $240, KEYP                  # offset 240 is where aesni_set_key writes the decryption round keys
2315         movups (IVP), IV
2316         cmp $64, LEN
2317         jb .Lcbc_dec_loop1              # fewer than four blocks: go straight to the single-block loop
2318 .align 4
2319 .Lcbc_dec_loop4:                        # four blocks per iteration; ciphertext copies are kept for the chaining XOR
2320         movups (INP), IN1
2321         movaps IN1, STATE1
2322         movups 0x10(INP), IN2
2323         movaps IN2, STATE2
2324 #ifdef __x86_64__
2325         movups 0x20(INP), IN3
2326         movaps IN3, STATE3
2327         movups 0x30(INP), IN4
2328         movaps IN4, STATE4
2329 #else
2330         movups 0x20(INP), IN1           # 32-bit build reuses IN1/IN2 for blocks 3 and 4
2331         movaps IN1, STATE3
2332         movups 0x30(INP), IN2
2333         movaps IN2, STATE4
2334 #endif
2335         call _aesni_dec4
2336         pxor IV, STATE1                 # block 1 chains with the incoming IV
2337 #ifdef __x86_64__
2338         pxor IN1, STATE2                # each later block chains with the preceding ciphertext block
2339         pxor IN2, STATE3
2340         pxor IN3, STATE4
2341         movaps IN4, IV                  # last ciphertext block becomes the next IV
2342 #else
2343         pxor IN1, STATE4                # IN1 holds ciphertext block 3 here
2344         movaps IN2, IV                  # IN2 holds ciphertext block 4: the next IV
2345         movups (INP), IN1               # re-read ciphertext blocks 1 and 2 before any output is stored
2346         pxor IN1, STATE2
2347         movups 0x10(INP), IN2
2348         pxor IN2, STATE3
2349 #endif
2350         movups STATE1, (OUTP)
2351         movups STATE2, 0x10(OUTP)
2352         movups STATE3, 0x20(OUTP)
2353         movups STATE4, 0x30(OUTP)
2354         sub $64, LEN
2355         add $64, INP
2356         add $64, OUTP
2357         cmp $64, LEN
2358         jae .Lcbc_dec_loop4             # unsigned test: len is a size_t, matching the jb checks above
2359         cmp $16, LEN
2360         jb .Lcbc_dec_ret
2361 .align 4
2362 .Lcbc_dec_loop1:                        # one block per iteration
2363         movups (INP), IN
2364         movaps IN, STATE
2365         call _aesni_dec1
2366         pxor IV, STATE
2367         movups STATE, (OUTP)
2368         movaps IN, IV                   # this ciphertext block chains into the next one
2369         sub $16, LEN
2370         add $16, INP
2371         add $16, OUTP
2372         cmp $16, LEN
2373         jae .Lcbc_dec_loop1             # unsigned test, as above
2374 .Lcbc_dec_ret:
2375         movups IV, (IVP)                # write the last ciphertext block back as the next IV
2376 .Lcbc_dec_just_ret:
2377 #ifndef __x86_64__
2378         popl KLEN
2379         popl KEYP
2380         popl LEN
2381         popl IVP
2382 #endif
2383         FRAME_END
2384         RET
2385 SYM_FUNC_END(aesni_cbc_dec)
2386
2387 /*
2388  * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2389  *                        size_t len, u8 *iv)
2390  */
2391 SYM_FUNC_START(aesni_cts_cbc_enc)       /* encrypts one full block plus a final block of len - 16 bytes; len is not range-checked here */
2392         FRAME_BEGIN
2393 #ifndef __x86_64__
2394         pushl IVP                       # 32-bit build: IVP, LEN, KEYP and KLEN are saved here and restored before return
2395         pushl LEN
2396         pushl KEYP
2397         pushl KLEN
2398         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2399         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2400         movl (FRAME_OFFSET+28)(%esp), INP       # src
2401         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2402         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2403         lea .Lcts_permute_table, T1
2404 #else
2405         lea .Lcts_permute_table(%rip), T1
2406 #endif
2407         mov 480(KEYP), KLEN             # key length
2408         movups (IVP), STATE             # IV is only read; it is never written back
2409         sub $16, LEN                    # LEN = size of the final block; NOTE(review): presumably 16 < len <= 32 on entry - confirm with callers
2410         mov T1, IVP                     # IVP is reused from here on as a second table pointer
2411         add $32, IVP
2412         add LEN, T1                     # T1  = table + LEN
2413         sub LEN, IVP                    # IVP = table + 32 - LEN
2414         movups (T1), %xmm4              # mask that moves the low LEN bytes to the top and zeroes the rest
2415         movups (IVP), %xmm5             # mask that moves the top LEN bytes to the bottom and zeroes the rest
2416
2417         movups (INP), IN1               # first 16 input bytes
2418         add LEN, INP
2419         movups (INP), IN2               # last 16 input bytes (overlaps IN1 when LEN < 16)
2420
2421         pxor IN1, STATE
2422         call _aesni_enc1                # CBC-encrypt the first block
2423
2424         pshufb %xmm5, IN2               # final block moved to the low bytes, zero padded
2425         pxor STATE, IN2                 # chain it with the first ciphertext block
2426         pshufb %xmm4, STATE             # low LEN bytes of the first ciphertext block moved to the top
2427         add OUTP, LEN                   # LEN now holds the address dst + (len - 16)
2428         movups STATE, (LEN)             # its top LEN bytes land at dst + 16; the lower bytes are overwritten below
2429
2430         movaps IN2, STATE
2431         call _aesni_enc1
2432         movups STATE, (OUTP)            # ciphertext of the padded final block goes first in dst
2433
2434 #ifndef __x86_64__
2435         popl KLEN
2436         popl KEYP
2437         popl LEN
2438         popl IVP
2439 #endif
2440         FRAME_END
2441         RET
2442 SYM_FUNC_END(aesni_cts_cbc_enc)
2443
2444 /*
2445  * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2446  *                        size_t len, u8 *iv)
2447  */
2448 SYM_FUNC_START(aesni_cts_cbc_dec)       /* decrypts one full block plus a final block of len - 16 bytes; len is not range-checked here */
2449         FRAME_BEGIN
2450 #ifndef __x86_64__
2451         pushl IVP                       # 32-bit build: IVP, LEN, KEYP and KLEN are saved here and restored before return
2452         pushl LEN
2453         pushl KEYP
2454         pushl KLEN
2455         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2456         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2457         movl (FRAME_OFFSET+28)(%esp), INP       # src
2458         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2459         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2460         lea .Lcts_permute_table, T1
2461 #else
2462         lea .Lcts_permute_table(%rip), T1
2463 #endif
2464         mov 480(KEYP), KLEN             # key length
2465         add $240, KEYP                  # offset 240 is where aesni_set_key writes the decryption round keys
2466         movups (IVP), IV                # IV is only read; it is never written back
2467         sub $16, LEN                    # LEN = size of the final block; NOTE(review): presumably 16 < len <= 32 on entry - confirm with callers
2468         mov T1, IVP                     # IVP is reused from here on as a second table pointer
2469         add $32, IVP
2470         add LEN, T1                     # T1  = table + LEN
2471         sub LEN, IVP                    # IVP = table + 32 - LEN
2472         movups (T1), %xmm4              # mask that moves the low LEN bytes to the top and zeroes the rest
2473
2474         movups (INP), STATE             # first 16 input bytes
2475         add LEN, INP
2476         movups (INP), IN1               # last 16 input bytes (overlaps the first block when LEN < 16)
2477
2478         call _aesni_dec1
2479         movaps STATE, IN2               # keep the raw decryption for the blend below
2480         pshufb %xmm4, STATE
2481         pxor IN1, STATE                 # top LEN bytes are now the final plaintext bytes
2482
2483         add OUTP, LEN                   # LEN now holds the address dst + (len - 16)
2484         movups STATE, (LEN)             # top LEN bytes land at dst + 16; the lower bytes are overwritten below
2485
2486         movups (IVP), %xmm0             # second mask, also the implicit operand of pblendvb
2487         pshufb %xmm0, IN1               # top LEN input bytes moved to the bottom, rest zeroed
2488         pblendvb IN2, IN1               # bytes whose mask entry has the top bit set are taken from IN2
2489         movaps IN1, STATE
2490         call _aesni_dec1
2491
2492         pxor IV, STATE                  # first plaintext block chains with the IV
2493         movups STATE, (OUTP)
2494
2495 #ifndef __x86_64__
2496         popl KLEN
2497         popl KEYP
2498         popl LEN
2499         popl IVP
2500 #endif
2501         FRAME_END
2502         RET
2503 SYM_FUNC_END(aesni_cts_cbc_dec)
2504
2505 .pushsection .rodata
2506 .align 16
2507 .Lcts_permute_table:                    /* pshufb masks for the CTS routines, read as 16-byte windows at unaligned offsets */
2508         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80  # 0x80: pshufb writes a zero byte
2509         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2510         .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07  # identity selectors 0..15
2511         .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2512         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80  # 0x80: pshufb writes a zero byte
2513         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2514 #ifdef __x86_64__
2515 .Lbswap_mask:                           /* pshufb mask that reverses the 16 bytes of a register */
2516         .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2517 #endif
2518 .popsection
2519
2520 #ifdef __x86_64__
2521 /*
2522  * _aesni_inc_init:     internal ABI
2523  *      setup registers used by _aesni_inc
2524  * input:
2525  *      IV
2526  * output:
2527  *      CTR:    == IV, in little endian
2528  *      TCTR_LOW: == lower qword of CTR
2529  *      INC:    == 1, in little endian
2530  *      BSWAP_MASK == endian swapping mask
2531  */
2532 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2533         movaps .Lbswap_mask(%rip), BSWAP_MASK   # 16-byte reversal mask
2534         movaps IV, CTR
2535         pshufb BSWAP_MASK, CTR                  # byte-reverse the IV into CTR
2536         mov $1, TCTR_LOW
2537         movq TCTR_LOW, INC                      # INC = 1 in the low qword, high qword zero
2538         movq CTR, TCTR_LOW                      # scalar copy of the low qword, used for carry detection
2539         RET
2540 SYM_FUNC_END(_aesni_inc_init)
2541
2542 /*
2543  * _aesni_inc:          internal ABI
2544  *      Increase IV by 1, IV is in big endian
2545  * input:
2546  *      IV
2547  *      CTR:    == IV, in little endian
2548  *      TCTR_LOW: == lower qword of CTR
2549  *      INC:    == 1, in little endian
2550  *      BSWAP_MASK == endian swapping mask
2551  * output:
2552  *      IV:     Increase by 1
2553  * changed:
2554  *      CTR:    == output IV, in little endian
2555  *      TCTR_LOW: == lower qword of CTR
2556  */
2557 SYM_FUNC_START_LOCAL(_aesni_inc)
2558         paddq INC, CTR                  # add 1 to the low qword; paddq does not carry between qwords
2559         add $1, TCTR_LOW                # same increment on the scalar copy, which does set the carry flag
2560         jnc .Linc_low                   # no wrap of the low qword: done
2561         pslldq $8, INC                  # move the 1 into the high qword
2562         paddq INC, CTR                  # propagate the carry by hand
2563         psrldq $8, INC                  # restore INC to 1 in the low qword
2564 .Linc_low:
2565         movaps CTR, IV
2566         pshufb BSWAP_MASK, IV           # byte-reverse the counter back into IV
2567         RET
2568 SYM_FUNC_END(_aesni_inc)
2569
2570 /*
2571  * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2572  *                    size_t len, u8 *iv)
2573  */
2574 SYM_FUNC_START(aesni_ctr_enc)		/* CTR mode over whole 16-byte blocks; a tail shorter than 16 bytes is not processed */
2575         FRAME_BEGIN
2576         cmp $16, LEN
2577         jb .Lctr_enc_just_ret			/* less than one block: return without touching dst or *iv */
2578         mov 480(KEYP), KLEN			/* word at ctx+480; NOTE(review): presumably the key length field - confirm */
2579         movups (IVP), IV			/* IV = counter block read from *iv */
2580         call _aesni_inc_init			/* set up CTR, TCTR_LOW, INC and BSWAP_MASK for _aesni_inc */
2581         cmp $64, LEN
2582         jb .Lctr_enc_loop1			/* fewer than four blocks: use the single-block loop only */
2583 .align 4
2584 .Lctr_enc_loop4:				/* four blocks per iteration */
2585         movaps IV, STATE1			/* STATEn = counter block n, IN n = input block n */
2586         call _aesni_inc
2587         movups (INP), IN1
2588         movaps IV, STATE2
2589         call _aesni_inc
2590         movups 0x10(INP), IN2
2591         movaps IV, STATE3
2592         call _aesni_inc
2593         movups 0x20(INP), IN3
2594         movaps IV, STATE4
2595         call _aesni_inc
2596         movups 0x30(INP), IN4
2597         call _aesni_enc4			/* cipher the four counter blocks */
2598         pxor IN1, STATE1			/* output = ciphered counter ^ input */
2599         movups STATE1, (OUTP)
2600         pxor IN2, STATE2
2601         movups STATE2, 0x10(OUTP)
2602         pxor IN3, STATE3
2603         movups STATE3, 0x20(OUTP)
2604         pxor IN4, STATE4
2605         movups STATE4, 0x30(OUTP)
2606         sub $64, LEN
2607         add $64, INP
2608         add $64, OUTP
2609         cmp $64, LEN
2610         jae .Lctr_enc_loop4			/* unsigned compare: LEN is a size_t, matching the jb tests above */
2611         cmp $16, LEN
2612         jb .Lctr_enc_ret			/* no whole block left */
2613 .align 4
2614 .Lctr_enc_loop1:				/* one block per iteration */
2615         movaps IV, STATE
2616         call _aesni_inc
2617         movups (INP), IN
2618         call _aesni_enc1
2619         pxor IN, STATE
2620         movups STATE, (OUTP)
2621         sub $16, LEN
2622         add $16, INP
2623         add $16, OUTP
2624         cmp $16, LEN
2625         jae .Lctr_enc_loop1			/* unsigned compare, as above */
2626 .Lctr_enc_ret:
2627         movups IV, (IVP)			/* write the next counter block back to *iv */
2628 .Lctr_enc_just_ret:
2629         FRAME_END
2630         RET
2631 SYM_FUNC_END(aesni_ctr_enc)
2632
2633 #endif
2634
2635 .section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2636 .align 16
2637 .Lgf128mul_x_ble_mask:		/* loaded into GF128MUL_MASK by _aesni_xts_crypt */
2638         .octa 0x00000000000000010000000000000087	/* low qword = 0x87, high qword = 0x01 */
2639 .previous
2640
2641 /*
2642  * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs (IV = IV * x)
2643  * input:
2644  *      IV:     current IV
2645  *      GF128MUL_MASK == mask with 0x87 and 0x01 (.Lgf128mul_x_ble_mask)
2646  * output:
2647  *      IV:     next IV
2648  * changed:
2649  *      KEY:    == temporary value
2650  */
2651 .macro _aesni_gf128mul_x_ble
2652         pshufd $0x13, IV, KEY			/* KEY dwords (low to high) = IV dwords 3, 0, 1, 0 */
2653         paddq IV, IV				/* shift each qword of IV left by one bit */
2654         psrad $31, KEY				/* each dword becomes all ones if its top bit was set, else zero */
2655         pand GF128MUL_MASK, KEY		/* dword 0 = 0x87 if IV bit 127 was set, dword 2 = 1 if IV bit 63 was set */
2656         pxor KEY, IV				/* apply the 0x87 reduction and the carry from the low into the high qword */
2657 .endm
2658
2659 .macro  _aesni_xts_crypt        enc		/* body shared by aesni_xts_enc (enc = 1) and aesni_xts_dec (enc = 0) */
2660         FRAME_BEGIN
2661 #ifndef __x86_64__
2662         pushl IVP				/* 32-bit: save the four registers that .Lxts_ret pops again */
2663         pushl LEN
2664         pushl KEYP
2665         pushl KLEN
2666         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2667         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2668         movl (FRAME_OFFSET+28)(%esp), INP       # src
2669         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2670         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2671         movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2672 #else
2673         movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2674 #endif
2675         movups (IVP), IV			/* IV = tweak for the first block, read from *iv */
2676
2677         mov 480(KEYP), KLEN			/* word at ctx+480; NOTE(review): presumably the key length field - confirm */
2678 .if !\enc
2679         add $240, KEYP				/* decrypt: advance KEYP by 240 bytes; presumably the decryption key schedule - confirm */
2680
2681         test $15, LEN
2682         jz .Lxts_loop4\@
2683         sub $16, LEN				/* decrypt with a partial final block: hold the last full block back for .Lxts_cts1 */
2684 .endif
2685
2686 .Lxts_loop4\@:
2687         sub $64, LEN
2688         jl .Lxts_1x\@				/* fewer than 64 bytes left; LEN is now (remaining - 64) */
2689
2690         movdqa IV, STATE1
2691         movdqu 0x00(INP), IN
2692         pxor IN, STATE1			/* STATE1 = input block 0 ^ tweak 0 */
2693         movdqu IV, 0x00(OUTP)			/* park tweak 0 in the output buffer until after the cipher call */
2694
2695         _aesni_gf128mul_x_ble			/* IV = tweak 1 */
2696         movdqa IV, STATE2
2697         movdqu 0x10(INP), IN
2698         pxor IN, STATE2
2699         movdqu IV, 0x10(OUTP)
2700
2701         _aesni_gf128mul_x_ble			/* IV = tweak 2 */
2702         movdqa IV, STATE3
2703         movdqu 0x20(INP), IN
2704         pxor IN, STATE3
2705         movdqu IV, 0x20(OUTP)
2706
2707         _aesni_gf128mul_x_ble			/* IV = tweak 3 */
2708         movdqa IV, STATE4
2709         movdqu 0x30(INP), IN
2710         pxor IN, STATE4
2711         movdqu IV, 0x30(OUTP)
2712
2713 .if \enc
2714         call _aesni_enc4
2715 .else
2716         call _aesni_dec4
2717 .endif
2718
2719         movdqu 0x00(OUTP), IN			/* reload the parked tweak 0 */
2720         pxor IN, STATE1
2721         movdqu STATE1, 0x00(OUTP)		/* overwrite it with the finished output block 0 */
2722
2723         movdqu 0x10(OUTP), IN
2724         pxor IN, STATE2
2725         movdqu STATE2, 0x10(OUTP)
2726
2727         movdqu 0x20(OUTP), IN
2728         pxor IN, STATE3
2729         movdqu STATE3, 0x20(OUTP)
2730
2731         movdqu 0x30(OUTP), IN
2732         pxor IN, STATE4
2733         movdqu STATE4, 0x30(OUTP)		/* STATE4 stays live: .Lxts_cts4 (encrypt) steals from it */
2734
2735         _aesni_gf128mul_x_ble			/* IV = tweak for the block after these four */
2736
2737         add $64, INP
2738         add $64, OUTP
2739         test LEN, LEN
2740         jnz .Lxts_loop4\@			/* bytes remain: retry the 4-way path, which falls to .Lxts_1x if under 64 */
2741
2742 .Lxts_ret_iv\@:
2743         movups IV, (IVP)			/* write the next tweak back to *iv */
2744
2745 .Lxts_ret\@:					/* exit without IV write-back; used by the stealing path, which clobbers IVP */
2746 #ifndef __x86_64__
2747         popl KLEN
2748         popl KEYP
2749         popl LEN
2750         popl IVP
2751 #endif
2752         FRAME_END
2753         RET
2754
2755 .Lxts_1x\@:
2756         add $64, LEN				/* undo the subtraction at .Lxts_loop4: LEN = bytes still to process here */
2757         jz .Lxts_ret_iv\@
2758 .if \enc
2759         sub $16, LEN				/* encrypt: from here LEN counts the bytes after the current block */
2760         jl .Lxts_cts4\@			/* under 16 bytes left: steal from the last 4-way output block */
2761 .endif
2762
2763 .Lxts_loop1\@:					/* one block per iteration */
2764         movdqu (INP), STATE
2765 .if \enc
2766         pxor IV, STATE
2767         call _aesni_enc1
2768 .else
2769         add $16, INP
2770         sub $16, LEN
2771         jl .Lxts_cts1\@			/* LEN went negative: STATE is the held-back last full block */
2772         pxor IV, STATE
2773         call _aesni_dec1
2774 .endif
2775         pxor IV, STATE				/* STATE = finished output block (not stored yet) */
2776         _aesni_gf128mul_x_ble			/* IV = tweak for the next block */
2777
2778         test LEN, LEN
2779         jz .Lxts_out\@				/* nothing left after this block */
2780
2781 .if \enc
2782         add $16, INP
2783         sub $16, LEN
2784         jl .Lxts_cts1\@			/* only a partial block follows: STATE is stored by the stealing path instead */
2785 .endif
2786
2787         movdqu STATE, (OUTP)
2788         add $16, OUTP
2789         jmp .Lxts_loop1\@
2790
2791 .Lxts_out\@:
2792         movdqu STATE, (OUTP)			/* store the final block, then write the IV back */
2793         jmp .Lxts_ret_iv\@
2794
2795 .if \enc
2796 .Lxts_cts4\@:
2797         movdqa STATE4, STATE			/* STATE = last block produced by the 4-way loop */
2798         sub $16, OUTP				/* step OUTP back to where that block was stored */
2799 .Lxts_cts1\@:
2800 .else
2801 .Lxts_cts1\@:
2802         movdqa IV, STATE4			/* keep the current tweak for the final combined block */
2803         _aesni_gf128mul_x_ble			/* IV = the following tweak */
2804
2805         pxor IV, STATE				/* decrypt the held-back block with the later tweak */
2806         call _aesni_dec1
2807         pxor IV, STATE
2808 .endif
2809 #ifndef __x86_64__
2810         lea .Lcts_permute_table, T1
2811 #else
2812         lea .Lcts_permute_table(%rip), T1
2813 #endif
2814         add LEN, INP            /* rewind input pointer */
2815         add $16, LEN            /* # bytes in final block */
2816         movups (INP), IN1			/* 16 bytes ending at the end of the input; the last n are the partial block */
2817
2818         mov T1, IVP				/* IVP becomes scratch from here on */
2819         add $32, IVP
2820         add LEN, T1				/* T1  = table + n */
2821         sub LEN, IVP				/* IVP = table + 32 - n */
2822         add OUTP, LEN				/* LEN = OUTP + n */
2823
2824         movups (T1), %xmm4			/* mask: 16-n bytes of 0x80, then indices 0..n-1 */
2825         movaps STATE, IN2			/* keep an unshifted copy of STATE */
2826         pshufb %xmm4, STATE			/* first n bytes of STATE move to the top, the rest become zero */
2827         movups STATE, (LEN)			/* they land at OUTP+16..OUTP+16+n-1; OUTP+n..OUTP+15 is rewritten below */
2828
2829         movups (IVP), %xmm0			/* mask: indices 16-n..15, then 16-n bytes of 0x80 */
2830         pshufb %xmm0, IN1			/* IN1 = the n partial input bytes at the bottom, zero above */
2831         pblendvb IN2, IN1			/* %xmm0 is the implicit selector: bytes n..15 come from IN2 */
2832         movaps IN1, STATE
2833
2834 .if \enc
2835         pxor IV, STATE				/* encrypt: the combined block uses the current tweak */
2836         call _aesni_enc1
2837         pxor IV, STATE
2838 .else
2839         pxor STATE4, STATE			/* decrypt: the combined block uses the tweak saved in STATE4 */
2840         call _aesni_dec1
2841         pxor STATE4, STATE
2842 .endif
2843
2844         movups STATE, (OUTP)			/* store the last full output block */
2845         jmp .Lxts_ret\@
2846 .endm
2847
2848 /*
2849  * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
2850  *                    const u8 *src, unsigned int len, le128 *iv)
2851  */
2852 SYM_FUNC_START(aesni_xts_enc)
2853         _aesni_xts_crypt        1		/* enc = 1: expand the encryption variant of the shared body */
2854 SYM_FUNC_END(aesni_xts_enc)
2855
2856 /*
2857  * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
2858  *                    const u8 *src, unsigned int len, le128 *iv)
2859  */
2860 SYM_FUNC_START(aesni_xts_dec)
2861         _aesni_xts_crypt        0		/* enc = 0: expand the decryption variant of the shared body */
2862 SYM_FUNC_END(aesni_xts_dec)