arch/x86/crypto/aesni-intel_asm.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Implement AES algorithm in Intel AES-NI instructions.
   4  *
   5  * The white paper of AES-NI instructions can be downloaded from:
   6  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   7  *
   8  * Copyright (C) 2008, Intel Corp.
   9  *    Author: Huang Ying <ying.huang@intel.com>
  10  *            Vinodh Gopal <vinodh.gopal@intel.com>
  11  *            Kahraman Akdemir
  12  *
  13  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  14  * interface for 64-bit kernels.
  15  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  16  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  17  *             Adrian Hoban <adrian.hoban@intel.com>
  18  *             James Guilford (james.guilford@intel.com)
  19  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  20  *             Tadeusz Struk (tadeusz.struk@intel.com)
  21  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  22  *    Copyright (c) 2010, Intel Corporation.
  23  *
  24  * Ported x86_64 version to x86:
  25  *    Author: Mathias Krause <minipli@googlemail.com>
  26  */
  27
  28 #include <linux/linkage.h>
  29 #include <asm/frame.h>
  30 #include <asm/nospec-branch.h>
  31
  32 /*
  33  * The following macros are used to move an (un)aligned 16 byte value to/from
  34  * an XMM register.  This can be done for either FP or integer values: for FP
  35  * use movaps (move aligned packed single), for integer use movdqa (move double
  36  * quad aligned).  It doesn't make a performance difference which instruction is
  37  * used since Nehalem (original Core i7) was released.  However, movaps is a
  38  * byte shorter, so that is the one we'll use for now (same for unaligned).
  39  */
  40 #define MOVADQ  movaps
  41 #define MOVUDQ  movups
  42
  43 #ifdef __x86_64__
  44
  45 # constants in mergeable sections, linker can reorder and merge
     # (every constant below sits in a separate 16-byte-entry .rodata.cst16.* section)
  46 .section        .rodata.cst16.POLY, "aM", @progbits, 16
  47 .align 16
     # POLY is ANDed into the HashKey<<1 reduction in PRECOMPUTE
  48 POLY:   .octa 0xC2000000000000000000000000000001
  49 .section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  50 .align 16
     # TWOONE is the pcmpeqd comparand PRECOMPUTE uses to detect the shifted-out bit
  51 TWOONE: .octa 0x00000001000000000000000000000001
  52
  53 .section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  54 .align 16
     # pshufb control that reverses the order of all 16 bytes
  55 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
  56 .section        .rodata.cst16.MASK1, "aM", @progbits, 16
  57 .align 16
  58 MASK1:      .octa 0x0000000000000000ffffffffffffffff
  59 .section        .rodata.cst16.MASK2, "aM", @progbits, 16
  60 .align 16
  61 MASK2:      .octa 0xffffffffffffffff0000000000000000
  62 .section        .rodata.cst16.ONE, "aM", @progbits, 16
  63 .align 16
     # ONE is added with paddd to step the counter block
  64 ONE:        .octa 0x00000000000000000000000000000001
  65 .section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  66 .align 16
     # NOTE(review): this literal has 31 hex digits, so the top nibble is zero;
     # F_MIN_MASK is not referenced in the part of the file reviewed here,
     # confirm the intended value before relying on it
  67 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
  68 .section        .rodata.cst16.dec, "aM", @progbits, 16
  69 .align 16
  70 dec:        .octa 0x1
  71 .section        .rodata.cst16.enc, "aM", @progbits, 16
  72 .align 16
  73 enc:        .octa 0x2
  74
  75 # order of these constants should not change.
  76 # more specifically, ALL_F should follow SHIFT_MASK,
  77 # and zero should follow ALL_F
     # Reason: the GCM macros load unaligned 16-byte windows at byte offsets
     # into this table (SHIFT_MASK+16-n, SHIFT_MASK+n, ALL_F+16-n and
     # ALL_F-SHIFT_MASK relative to a SHIFT_MASK based pointer), so a window
     # may straddle two neighbouring constants.  These three are therefore
     # kept in a plain (non-mergeable) .rodata section.
  78 .section        .rodata, "a", @progbits
  79 .align 16
     # SHIFT_MASK byte i holds the value i (identity pshufb control)
  80 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
  81 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
  82             .octa 0x00000000000000000000000000000000
  83
  84 .text
  85
  86
  87 #define STACK_OFFSET    8*3     // bytes pushed by FUNC_SAVE (3 regs * 8)
  88
     // Byte offsets of the fields of the GCM context block that the macros
     // address through %arg2.  AadLen and InLen are two 8-byte fields that
     // share one 16-byte slot; every other field starts on a 16-byte multiple.
  89 #define AadHash 16*0
  90 #define AadLen 16*1
  91 #define InLen (16*1)+8
  92 #define PBlockEncKey 16*2
  93 #define OrigIV 16*3
  94 #define CurCount 16*4
  95 #define PBlockLen 16*5
  96 #define HashKey         16*6    // store HashKey <<1 mod poly here
  97 #define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
  98 #define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
  99 #define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
 100 #define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
 101                                 // bits of  HashKey <<1 mod poly here
 102                                 //(for Karatsuba purposes)
 103 #define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
 104                                 // bits of  HashKey^2 <<1 mod poly here
 105                                 // (for Karatsuba purposes)
 106 #define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
 107                                 // bits of  HashKey^3 <<1 mod poly here
 108                                 // (for Karatsuba purposes)
 109 #define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
 110                                 // bits of  HashKey^4 <<1 mod poly here
 111                                 // (for Karatsuba purposes)
 112
     // arg1..arg6 are bare register names (used as %arg1 etc.), in the
     // System V AMD64 integer-argument order.
 113 #define arg1 rdi
 114 #define arg2 rsi
 115 #define arg3 rdx
 116 #define arg4 rcx
 117 #define arg5 r8
 118 #define arg6 r9
     // arg7..arg11 are complete memory operands for stack-passed arguments.
     // They are only valid while exactly STACK_OFFSET bytes have been pushed
     // since function entry; the extra +8 presumably skips the return
     // address (assumes the function was entered via call - TODO confirm).
 119 #define arg7 STACK_OFFSET+8(%rsp)
 120 #define arg8 STACK_OFFSET+16(%rsp)
 121 #define arg9 STACK_OFFSET+24(%rsp)
 122 #define arg10 STACK_OFFSET+32(%rsp)
 123 #define arg11 STACK_OFFSET+40(%rsp)
     // keysize: memory operand at byte offset 480 from the pointer in arg1.
     // INITIAL_BLOCKS_ENC_DEC reads it as a 32-bit value to derive the AES
     // round count.  Presumably the key-length field that follows two
     // 15x16-byte round-key arrays - verify against the C struct.
 124 #define keysize 2*15*16(%arg1)
 125 #endif
 126
 127
 128 #define STATE1  %xmm0
     // Symbolic names for the XMM and general-purpose registers.  Several
     // names are plain aliases of one another (STATE = STATE1, IN = IN1,
     // UKEYP = OUTP, TKEYP = T1, TCTR_LOW = T2).
 129 #define STATE2  %xmm4
 130 #define STATE3  %xmm5
 131 #define STATE4  %xmm6
 132 #define STATE   STATE1
 133 #define IN1     %xmm1
 134 #define IN2     %xmm7
 135 #define IN3     %xmm8
 136 #define IN4     %xmm9
 137 #define IN      IN1
 138 #define KEY     %xmm2
 139 #define IV      %xmm3
 140
 141 #define BSWAP_MASK %xmm10
 142 #define CTR     %xmm11
 143 #define INC     %xmm12
 144
     // NOTE: GF128MUL_MASK maps to %xmm7, the same register as IN2, so the
     // two names cannot be live at the same time.
 145 #define GF128MUL_MASK %xmm7
 146
 147 #ifdef __x86_64__
 148 #define AREG    %rax
 149 #define KEYP    %rdi
 150 #define OUTP    %rsi
 151 #define UKEYP   OUTP
 152 #define INP     %rdx
 153 #define LEN     %rcx
 154 #define IVP     %r8
 155 #define KLEN    %r9d
 156 #define T1      %r10
 157 #define TKEYP   T1
 158 #define T2      %r11
 159 #define TCTR_LOW T2
 160 #else
     // 32-bit build: OUTP shares AREG (%eax), and T2 / TCTR_LOW are not
     // defined at all.
 161 #define AREG    %eax
 162 #define KEYP    %edi
 163 #define OUTP    AREG
 164 #define UKEYP   OUTP
 165 #define INP     %edx
 166 #define LEN     %esi
 167 #define IVP     %ebp
 168 #define KLEN    %ebx
 169 #define T1      %ecx
 170 #define TKEYP   T1
 171 #endif
 172
 173 .macro FUNC_SAVE
     # Function prologue for the GCM entry points: saves %r12, %r13 and %r14.
     # The three 8-byte pushes add up to STACK_OFFSET (8*3), which the
     # arg7..arg11 stack-argument operands depend on.  FUNC_RESTORE undoes it.
 174         push    %r12
 175         push    %r13
 176         push    %r14
 177 #
 178 # states of %xmm registers %xmm6:%xmm15 not saved
 179 # all %xmm registers are clobbered
 180 #
 181 .endm
 182
 183
 184 .macro FUNC_RESTORE
     # Function epilogue: pops %r14, %r13, %r12, the reverse of the push
     # order used by FUNC_SAVE.  No XMM register is restored.
 185         pop     %r14
 186         pop     %r13
 187         pop     %r12
 188 .endm
 189
 190 # Precompute hashkeys.
 191 # Input: Hash subkey.
 192 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
 193 # once per key.
 194 # clobbers r12, and tmp xmm registers.
     #
     # SUBKEY is an operand that yields a pointer to the 16-byte hash subkey.
     # Writes HashKey .. HashKey_4 and HashKey_k .. HashKey_4_k in the context
     # block addressed by %arg2.  All seven TMP registers are overwritten;
     # TMP3 holds HashKey<<1 mod poly and TMP5 the running power throughout.
 195 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 196         mov     \SUBKEY, %r12
 197         movdqu  (%r12), \TMP3
 198         movdqa  SHUF_MASK(%rip), \TMP2
 199         pshufb  \TMP2, \TMP3
 200
 201         # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 202
     # 128-bit left shift by one, built from two 64-bit lane shifts: the bit
     # shifted out of the low lane is carried into the high lane via TMP2,
     # and the bit shifted out of bit 127 ends up in the low bits of TMP1.
 203         movdqa  \TMP3, \TMP2
 204         psllq   $1, \TMP3
 205         psrlq   $63, \TMP2
 206         movdqa  \TMP2, \TMP1
 207         pslldq  $8, \TMP2
 208         psrldq  $8, \TMP1
 209         por     \TMP2, \TMP3
 210
 211         # reduce HashKey<<1
 212
     # TMP2 becomes POLY when the bit shifted out of bit 127 was set and
     # zero otherwise, so the pxor below is a conditional reduction.
 213         pshufd  $0x24, \TMP1, \TMP2
 214         pcmpeqd TWOONE(%rip), \TMP2
 215         pand    POLY(%rip), \TMP2
 216         pxor    \TMP2, \TMP3
 217         movdqu  \TMP3, HashKey(%arg2)
 218
     # pshufd $78 swaps the two 64-bit halves; XORing with the original
     # gives (high ^ low), the Karatsuba helper stored in HashKey_k.
 219         movdqa     \TMP3, \TMP5
 220         pshufd     $78, \TMP3, \TMP1
 221         pxor       \TMP3, \TMP1
 222         movdqu     \TMP1, HashKey_k(%arg2)
 223
 224         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 225 # TMP5 = HashKey^2<<1 (mod poly)
 226         movdqu     \TMP5, HashKey_2(%arg2)
 227 # HashKey_2 = HashKey^2<<1 (mod poly)
 228         pshufd     $78, \TMP5, \TMP1
 229         pxor       \TMP5, \TMP1
 230         movdqu     \TMP1, HashKey_2_k(%arg2)
 231
 232         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 233 # TMP5 = HashKey^3<<1 (mod poly)
 234         movdqu     \TMP5, HashKey_3(%arg2)
 235         pshufd     $78, \TMP5, \TMP1
 236         pxor       \TMP5, \TMP1
 237         movdqu     \TMP1, HashKey_3_k(%arg2)
 238
 239         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 240 # TMP5 = HashKey^4<<1 (mod poly)
 241         movdqu     \TMP5, HashKey_4(%arg2)
 242         pshufd     $78, \TMP5, \TMP1
 243         pxor       \TMP5, \TMP1
 244         movdqu     \TMP1, HashKey_4_k(%arg2)
 245 .endm
 246
 247 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
 248 # Clobbers rax, r10-r13 and xmm0-xmm7, xmm13, xmm14
     # (xmm7 is passed to PRECOMPUTE as a temporary and xmm14 is loaded with
     # SHUF_MASK by CALC_AAD_HASH.)
     #
     # Iv, SUBKEY, AAD are operands that yield pointers; AADLEN yields the AAD
     # length in bytes.  The context block is addressed through %arg2.
     # On exit the context holds: aad_length, in_length = 0,
     # partial_block_length = 0, the low 8 bytes of partial_block_enc_key
     # zeroed, orig_IV, the byte-reversed IV as current_counter, the hash
     # key powers, and the GHASH of the AAD in AadHash.
 249 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 250         mov \AADLEN, %r11
 251         mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 252         xor %r11d, %r11d
 253         mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 254         mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 255         mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
 256         mov \Iv, %rax
 257         movdqu (%rax), %xmm0
 258         movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 259
 260         movdqa  SHUF_MASK(%rip), %xmm2
 261         pshufb %xmm2, %xmm0
 262         movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
 263
 264         PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
 265         movdqu HashKey(%arg2), %xmm13
 266
 267         CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 268         %xmm4, %xmm5, %xmm6
 269 .endm
 270
 271 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 272 # struct has been initialized by GCM_INIT.
 273 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 274 # Clobbers rax, r10-r13, and xmm0-xmm15
     #
     # operation is the literal enc or dec and selects code with .ifc.
     # Register roles as used below: %arg2 = context block, %arg3 = output
     # buffer, %arg4 = input buffer, %arg5 = length in bytes (modified: the
     # bytes consumed by PARTIAL_BLOCK are subtracted), %r11 = running byte
     # offset into both buffers, %xmm8 = running GHASH value, %xmm0 = counter.
 275 .macro GCM_ENC_DEC operation
 276         movdqu AadHash(%arg2), %xmm8
 277         movdqu HashKey(%arg2), %xmm13
 278         add %arg5, InLen(%arg2)
 279
 280         xor %r11d, %r11d # initialise the data pointer offset as zero
 281         PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 282
 283         sub %r11, %arg5         # sub partial block data used
 284         mov %arg5, %r13         # save the number of bytes
 285
 286         and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 287         mov %r13, %r12
 288         # Encrypt/Decrypt first few blocks
 289
     # %r12 = (number of whole blocks mod 4) * 16 selects one of four
     # INITIAL_BLOCKS_ENC_DEC expansions.  Its last two numeric arguments
     # name the xmm register that receives AadHash and the digits of the
     # xmm registers used for the 3, 2, 1 or 0 leading blocks.
 290         and     $(3<<4), %r12
 291         jz      .L_initial_num_blocks_is_0_\@
 292         cmp     $(2<<4), %r12
 293         jb      .L_initial_num_blocks_is_1_\@
 294         je      .L_initial_num_blocks_is_2_\@
 295 .L_initial_num_blocks_is_3_\@:
 296         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 297 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 298         sub     $48, %r13
 299         jmp     .L_initial_blocks_\@
 300 .L_initial_num_blocks_is_2_\@:
 301         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 303         sub     $32, %r13
 304         jmp     .L_initial_blocks_\@
 305 .L_initial_num_blocks_is_1_\@:
 306         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 308         sub     $16, %r13
 309         jmp     .L_initial_blocks_\@
 310 .L_initial_num_blocks_is_0_\@:
 311         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 313 .L_initial_blocks_\@:
 314
 315         # Main loop - Encrypt/Decrypt remaining blocks
 316
     # Here %r13 is the byte count of the remaining whole blocks, a multiple
     # of 64.  Zero skips the loop entirely; exactly 64 goes straight to
     # GHASH_LAST_4.  NOTE(review): that relies on INITIAL_BLOCKS_ENC_DEC
     # having already processed one group of four blocks when %r13 >= 64 -
     # its tail is outside the part of the file reviewed here.
 317         test    %r13, %r13
 318         je      .L_zero_cipher_left_\@
 319         sub     $64, %r13
 320         je      .L_four_cipher_left_\@
 321 .L_crypt_by_4_\@:
     # NOTE(review): the last argument is the literal enc for both the
     # ENC and the DEC variant of the macro - confirm it is unused there.
 322         GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 323         %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 324         %xmm7, %xmm8, enc
 325         add     $64, %r11
 326         sub     $64, %r13
 327         jne     .L_crypt_by_4_\@
 328 .L_four_cipher_left_\@:
 329         GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 330 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 331 .L_zero_cipher_left_\@:
 332         movdqu %xmm8, AadHash(%arg2)
 333         movdqu %xmm0, CurCount(%arg2)
 334
 335         mov     %arg5, %r13
 336         and     $15, %r13                       # %r13 = arg5 (mod 16)
 337         je      .L_multiple_of_16_bytes_\@
 338
 339         mov %r13, PBlockLen(%arg2)
 340
 341         # Handle the last <16 Byte block separately
 342         paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 343         movdqu %xmm0, CurCount(%arg2)
 344         movdqa SHUF_MASK(%rip), %xmm10
 345         pshufb %xmm10, %xmm0
 346
 347         ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 348         movdqu %xmm0, PBlockEncKey(%arg2)
 349
     # With at least 16 bytes available, the tail is fetched with one
     # 16-byte load that ends exactly at the end of the input and is then
     # shifted down; otherwise it is gathered byte-wise so that no byte
     # outside the input is read.
 350         cmp     $16, %arg5
 351         jge     .L_large_enough_update_\@
 352
 353         lea (%arg4,%r11,1), %r10
 354         mov %r13, %r12
 355         READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 356         jmp     .L_data_read_\@
 357
 358 .L_large_enough_update_\@:
 359         sub     $16, %r11
 360         add     %r13, %r11
 361
 362         # receive the last <16 Byte block
 363         movdqu  (%arg4, %r11, 1), %xmm1
 364
 365         sub     %r13, %r11
 366         add     $16, %r11
 367
 368         lea     SHIFT_MASK+16(%rip), %r12
 369         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 370         # (r13 is the number of bytes in plaintext mod 16)
 371         sub     %r13, %r12
 372         # get the appropriate shuffle mask
 373         movdqu  (%r12), %xmm2
 374         # shift right 16-r13 bytes
 375         pshufb  %xmm2, %xmm1
 376
 377 .L_data_read_\@:
 378         lea ALL_F+16(%rip), %r12
 379         sub %r13, %r12
 380
 381 .ifc \operation, dec
     # decryption hashes the ciphertext, so keep a copy of the input
 382         movdqa  %xmm1, %xmm2
 383 .endif
 384         pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 385         movdqu  (%r12), %xmm1
 386         # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 387         pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 388 .ifc \operation, dec
 389         pand    %xmm1, %xmm2
 390         movdqa SHUF_MASK(%rip), %xmm10
 391         pshufb %xmm10 ,%xmm2
 392
 393         pxor %xmm2, %xmm8
 394 .else
 395         movdqa SHUF_MASK(%rip), %xmm10
 396         pshufb %xmm10,%xmm0
 397
 398         pxor    %xmm0, %xmm8
 399 .endif
 400
     # Only the XOR into the hash happens here; no GHASH_MUL is done for
     # the partial block in this macro (GCM_COMPLETE runs one GHASH_MUL
     # when PBlockLen is non-zero).
 401         movdqu %xmm8, AadHash(%arg2)
 402 .ifc \operation, enc
 403         # the masked block was byte-reversed above for the hash update
 404         movdqa SHUF_MASK(%rip), %xmm10
 405         # shuffle xmm0 back to output as ciphertext
 406         pshufb %xmm10, %xmm0
 407 .endif
 408
 409         # Output %r13 bytes
     # More than 8 bytes: store the low quadword first.  The remaining
     # 1..8 bytes (exactly 8 included, because of jle) are stored one
     # byte at a time from %rax.
 410         movq %xmm0, %rax
 411         cmp $8, %r13
 412         jle .L_less_than_8_bytes_left_\@
 413         mov %rax, (%arg3 , %r11, 1)
 414         add $8, %r11
 415         psrldq $8, %xmm0
 416         movq %xmm0, %rax
 417         sub $8, %r13
 418 .L_less_than_8_bytes_left_\@:
 419         mov %al,  (%arg3, %r11, 1)
 420         add $1, %r11
 421         shr $8, %rax
 422         sub $1, %r13
 423         jne .L_less_than_8_bytes_left_\@
 424 .L_multiple_of_16_bytes_\@:
 425 .endm
 426
 427 # GCM_COMPLETE Finishes update of tag of last partial block
 428 # Output: Authentication Tag (AUTHTAG), AUTHTAGLEN bytes long
 429 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
     #
     # AUTHTAG yields the destination pointer, AUTHTAGLEN the tag length in
     # bytes.  The context block is addressed through %arg2.
     # NOTE(review): the store sequence below handles lengths 4..16 only; a
     # length below 4 still writes 4 bytes first - confirm callers reject it.
 430 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 431         movdqu AadHash(%arg2), %xmm8
 432         movdqu HashKey(%arg2), %xmm13
 433
 434         mov PBlockLen(%arg2), %r12
 435
     # a pending partial block was only XORed into the hash so far;
     # multiply it in now
 436         test %r12, %r12
 437         je .L_partial_done\@
 438
 439         GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 440
 441 .L_partial_done\@:
 442         mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 443         shl     $3, %r12                  # convert into number of bits
     # len(A) is a 64-bit field and AadLen is stored as 64 bits, so move
     # the whole register; a 32-bit movd would drop the upper bits of the
     # bit count once the AAD reaches 512 MiB.
 444         movq    %r12, %xmm15              # len(A) in %xmm15
 445         mov InLen(%arg2), %r12
 446         shl     $3, %r12                  # len(C) in bits (bytes * 8)
 447         movq    %r12, %xmm1
 448
 449         pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 450         pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 451         pxor    %xmm15, %xmm8
 452         GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 453         # final GHASH computation
 454         movdqa SHUF_MASK(%rip), %xmm10
 455         pshufb %xmm10, %xmm8
 456
 457         movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 458         ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 459         pxor    %xmm8, %xmm0
 460 .L_return_T_\@:
 461         mov     \AUTHTAG, %r10                     # %r10 = authTag
 462         mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
     # Store the tag in pieces of 16, or 8 / 4 / 2 / 1 bytes, shifting
     # %xmm0 down after each piece.
 463         cmp     $16, %r11
 464         je      .L_T_16_\@
 465         cmp     $8, %r11
 466         jl      .L_T_4_\@
 467 .L_T_8_\@:
 468         movq    %xmm0, %rax
 469         mov     %rax, (%r10)
 470         add     $8, %r10
 471         sub     $8, %r11
 472         psrldq  $8, %xmm0
 473         test    %r11, %r11
 474         je      .L_return_T_done_\@
 475 .L_T_4_\@:
 476         movd    %xmm0, %eax
 477         mov     %eax, (%r10)
 478         add     $4, %r10
 479         sub     $4, %r11
 480         psrldq  $4, %xmm0
 481         test    %r11, %r11
 482         je      .L_return_T_done_\@
 483 .L_T_123_\@:
 484         movd    %xmm0, %eax
 485         cmp     $2, %r11
 486         jl      .L_T_1_\@
 487         mov     %ax, (%r10)
 488         cmp     $2, %r11
 489         je      .L_return_T_done_\@
 490         add     $2, %r10
 491         sar     $16, %eax
 492 .L_T_1_\@:
 493         mov     %al, (%r10)
 494         jmp     .L_return_T_done_\@
 495 .L_T_16_\@:
 496         movdqu  %xmm0, (%r10)
 497 .L_return_T_done_\@:
 498 .endm
 499
 500 #ifdef __x86_64__
 501 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 502 *
 503 *
 504 * Input: A and B (128-bits each, bit-reflected)
 505 * Output: C = A*B*x mod poly, (i.e. >>1 )
 506 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 507 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 508 *
     * Registers: GH holds A on entry and receives the result C; HK holds B
     * and is only read; TMP1-TMP5 are scratch and are overwritten.
     * Below, a1:a0 are the high:low 64-bit halves of GH and b1:b0 those of
     * HK; the 256-bit product is formed with three carry-less multiplies
     * (Karatsuba) and then reduced in two phases.
 509 */
 510 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 511         movdqa    \GH, \TMP1
 512         pshufd    $78, \GH, \TMP2
 513         pshufd    $78, \HK, \TMP3
 514         pxor      \GH, \TMP2            # TMP2 = a1+a0
 515         pxor      \HK, \TMP3            # TMP3 = b1+b0
 516         pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
 517         pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
 518         pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 519         pxor      \GH, \TMP2
 520         pxor      \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
 521         movdqa    \TMP2, \TMP3
 522         pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 523         psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 524         pxor      \TMP3, \GH
 525         pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 526
 527         # first phase of the reduction
 528
 529         movdqa    \GH, \TMP2
 530         movdqa    \GH, \TMP3
 531         movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
 532                                         # in order to perform
 533                                         # independent shifts
 534         pslld     $31, \TMP2            # packed left shift <<31
 535         pslld     $30, \TMP3            # packed left shift <<30
 536         pslld     $25, \TMP4            # packed left shift <<25
 537         pxor      \TMP3, \TMP2          # xor the shifted versions
 538         pxor      \TMP4, \TMP2
 539         movdqa    \TMP2, \TMP5
 540         psrldq    $4, \TMP5             # right shift TMP5 1 DW
 541         pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 542         pxor      \TMP2, \GH
 543
 544         # second phase of the reduction
 545
 546         movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
 547                                         # in order to perform
 548                                         # independent shifts
 549         movdqa    \GH,\TMP3
 550         movdqa    \GH,\TMP4
 551         psrld     $1,\TMP2              # packed right shift >>1
 552         psrld     $2,\TMP3              # packed right shift >>2
 553         psrld     $7,\TMP4              # packed right shift >>7
 554         pxor      \TMP3,\TMP2           # xor the shifted versions
 555         pxor      \TMP4,\TMP2
 556         pxor      \TMP5, \TMP2
 557         pxor      \TMP2, \GH
 558         pxor      \TMP1, \GH            # result is in GH
 559 .endm
 560
 561 # Reads DLEN bytes starting at DPTR and stores in XMMDst
 562 # where 0 < DLEN < 16
 563 # Clobbers %rax, DLEN and XMM1
     #
     # Only addresses inside DPTR .. DPTR+DLEN-1 are read.  Bytes of XMMDst
     # above the DLEN bytes read are left zero.  DLEN is zero on exit.
     # DLEN = 0 is not supported: the byte loop would decrement past zero.
 564 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 565         cmp $8, \DLEN
 566         jl .L_read_lt8_\@
     # at least 8 bytes: one quadword load fills the low half
 567         mov (\DPTR), %rax
 568         movq %rax, \XMMDst
 569         sub $8, \DLEN
 570         jz .L_done_read_partial_block_\@
 571         xor %eax, %eax
     # gather the remaining 1..7 bytes from the highest address downwards,
     # so that the byte at the lowest address ends up in the low byte of rax
 572 .L_read_next_byte_\@:
 573         shl $8, %rax
 574         mov 7(\DPTR, \DLEN, 1), %al
 575         dec \DLEN
 576         jnz .L_read_next_byte_\@
 577         movq %rax, \XMM1
 578         pslldq $8, \XMM1
 579         por \XMM1, \XMMDst
 580         jmp .L_done_read_partial_block_\@
 581 .L_read_lt8_\@:
 582         xor %eax, %eax
     # fewer than 8 bytes: same downward byte gather, result goes into the
     # low half of XMMDst
 583 .L_read_next_byte_lt8_\@:
 584         shl $8, %rax
 585         mov -1(\DPTR, \DLEN, 1), %al
 586         dec \DLEN
 587         jnz .L_read_next_byte_lt8_\@
 588         movq %rax, \XMMDst
 589 .L_done_read_partial_block_\@:
 590 .endm
 591
 592 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 593 # clobbers rax, r10-11, xmm14 and TMP1-TMP7
     #
     # HASHKEY is only read.  AAD yields the data pointer and AADLEN the
     # length in bytes.  The hash is accumulated in TMP6 and stored to
     # AadHash in the context block addressed by %arg2; a zero-length AAD
     # stores zero.  A trailing piece shorter than 16 bytes is read with
     # READ_PARTIAL_BLOCK (which is what clobbers rax), zero padded.
 594 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 595         TMP6 TMP7
 596         MOVADQ     SHUF_MASK(%rip), %xmm14
 597         mov        \AAD, %r10           # %r10 = AAD
 598         mov        \AADLEN, %r11                # %r11 = aadLen
 599         pxor       \TMP7, \TMP7
 600         pxor       \TMP6, \TMP6
 601
 602         cmp        $16, %r11
 603         jl         .L_get_AAD_rest\@
 604 .L_get_AAD_blocks\@:
 605         movdqu     (%r10), \TMP7
 606         pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 607         pxor       \TMP7, \TMP6
 608         GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 609         add        $16, %r10
 610         sub        $16, %r11
 611         cmp        $16, %r11
 612         jge        .L_get_AAD_blocks\@
 613
 614         movdqu     \TMP6, \TMP7
 615
 616         /* read the last <16B of AAD */
 617 .L_get_AAD_rest\@:
 618         test       %r11, %r11
 619         je         .L_get_AAD_done\@
 620
 621         READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 622         pshufb     %xmm14, \TMP7 # byte-reflect the AAD data
 623         pxor       \TMP6, \TMP7
 624         GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 625         movdqu \TMP7, \TMP6
 626
 627 .L_get_AAD_done\@:
 628         movdqu \TMP6, AadHash(%arg2)
 629 .endm
 630
 631 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 632 # between update calls.
 633 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 634 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
 635 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
     #
     # Does nothing when PBlockLen in the context (addressed by %arg2) is zero.
     # Otherwise the new input is combined with the saved keystream block
     # PBlockEncKey to continue the block left unfinished by the previous
     # call.  DATA_OFFSET is advanced by the number of bytes written and
     # AAD_HASH is updated.  operation is the literal enc or dec.
 636 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 637         AAD_HASH operation
 638         mov     PBlockLen(%arg2), %r13
 639         test    %r13, %r13
 640         je      .L_partial_block_done_\@        # Leave Macro if no partial blocks
 641         # Read in input data without over reading
 642         cmp     $16, \PLAIN_CYPH_LEN
 643         jl      .L_fewer_than_16_bytes_\@
 644         movups  (\PLAIN_CYPH_IN), %xmm1 # At least 16 bytes, just fill xmm
 645         jmp     .L_data_read_\@
 646
 647 .L_fewer_than_16_bytes_\@:
 648         lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 649         mov     \PLAIN_CYPH_LEN, %r12
 650         READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 651
 652         mov PBlockLen(%arg2), %r13
 653
 654 .L_data_read_\@:                                # Finished reading in data
 655
 656         movdqu  PBlockEncKey(%arg2), %xmm9
 657         movdqu  HashKey(%arg2), %xmm13
 658
 659         lea     SHIFT_MASK(%rip), %r12
 660
 661         # adjust the shuffle mask pointer to be able to shift r13 bytes
 662         # (16-r13 is the number of bytes in plaintext mod 16)
 663         add     %r13, %r12
 664         movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
 665         pshufb  %xmm2, %xmm9            # shift right r13 bytes
 666
 667 .ifc \operation, dec
     # decryption hashes the ciphertext, so keep a copy of the input
 668         movdqa  %xmm1, %xmm3
 669         pxor    %xmm1, %xmm9            # Ciphertext XOR E(K, Yn)
 670
 671         mov     \PLAIN_CYPH_LEN, %r10
 672         add     %r13, %r10
 673         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 674         sub     $16, %r10
 675         # Determine if partial block is not being filled and
 676         # shift mask accordingly
 677         jge     .L_no_extra_mask_1_\@
 678         sub     %r10, %r12
 679 .L_no_extra_mask_1_\@:
 680
 681         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 682         # get the appropriate mask to mask out bottom r13 bytes of xmm9
 683         pand    %xmm1, %xmm9            # mask out bottom r13 bytes of xmm9
 684
 685         pand    %xmm1, %xmm3
 686         movdqa  SHUF_MASK(%rip), %xmm10
 687         pshufb  %xmm10, %xmm3
 688         pshufb  %xmm2, %xmm3
 689         pxor    %xmm3, \AAD_HASH
 690
     # r10 < 0 means the block is still not full after this input
 691         test    %r10, %r10
 692         jl      .L_partial_incomplete_1_\@
 693
 694         # GHASH computation for the last <16 Byte block
 695         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 696         xor     %eax, %eax
 697
 698         mov     %rax, PBlockLen(%arg2)
 699         jmp     .L_dec_done_\@
 700 .L_partial_incomplete_1_\@:
 701         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 702 .L_dec_done_\@:
 703         movdqu  \AAD_HASH, AadHash(%arg2)
 704 .else
 705         pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
 706
 707         mov     \PLAIN_CYPH_LEN, %r10
 708         add     %r13, %r10
 709         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 710         sub     $16, %r10
 711         # Determine if partial block is not being filled and
 712         # shift mask accordingly
 713         jge     .L_no_extra_mask_2_\@
 714         sub     %r10, %r12
 715 .L_no_extra_mask_2_\@:
 716
 717         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 718         # get the appropriate mask to mask out bottom r13 bytes of xmm9
 719         pand    %xmm1, %xmm9
 720
 721         movdqa  SHUF_MASK(%rip), %xmm1
 722         pshufb  %xmm1, %xmm9
 723         pshufb  %xmm2, %xmm9
 724         pxor    %xmm9, \AAD_HASH
 725
     # r10 < 0 means the block is still not full after this input
 726         test    %r10, %r10
 727         jl      .L_partial_incomplete_2_\@
 728
 729         # GHASH computation for the last <16 Byte block
 730         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 731         xor     %eax, %eax
 732
 733         mov     %rax, PBlockLen(%arg2)
 734         jmp     .L_encode_done_\@
 735 .L_partial_incomplete_2_\@:
 736         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
 737 .L_encode_done_\@:
 738         movdqu  \AAD_HASH, AadHash(%arg2)
 739
 740         movdqa  SHUF_MASK(%rip), %xmm10
 741         # shuffle xmm9 back to output as ciphertext
 742         pshufb  %xmm10, %xmm9
 743         pshufb  %xmm2, %xmm9
 744 .endif
 745         # output encrypted Bytes
     # bytes to write: 16 - r13 when the block was completed, otherwise
     # the whole input length
 746         test    %r10, %r10
 747         jl      .L_partial_fill_\@
 748         mov     %r13, %r12
 749         mov     $16, %r13
 750         # Set r13 to be the number of bytes to write out
 751         sub     %r12, %r13
 752         jmp     .L_count_set_\@
 753 .L_partial_fill_\@:
 754         mov     \PLAIN_CYPH_LEN, %r13
 755 .L_count_set_\@:
 756         movdqa  %xmm9, %xmm0
 757         movq    %xmm0, %rax
 758         cmp     $8, %r13
 759         jle     .L_less_than_8_bytes_left_\@
 760
 761         mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 762         add     $8, \DATA_OFFSET
 763         psrldq  $8, %xmm0
 764         movq    %xmm0, %rax
 765         sub     $8, %r13
 766 .L_less_than_8_bytes_left_\@:
 767         movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 768         add     $1, \DATA_OFFSET
 769         shr     $8, %rax
 770         sub     $1, %r13
 771         jne     .L_less_than_8_bytes_left_\@
 772 .L_partial_block_done_\@:
 773 .endm # PARTIAL_BLOCK
 774
 775 /*
 776 * if a = number of total plaintext bytes
 777 * b = floor(a/16)
 778 * num_initial_blocks = b mod 4
 779 * encrypt the initial num_initial_blocks blocks and apply ghash on
 780 * the ciphertext
 781 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 782 * are clobbered
 783 * arg1, %arg2, %arg3 are used as a pointer only, not modified
 784 */
 785
 786 /* Process the 1-3 leading blocks selected by i / i_seq in counter mode, GHASH them, then (when %r13 >= 64) process the next 4 blocks into XMM1-XMM4. */
 787 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 788         XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 789         MOVADQ          SHUF_MASK(%rip), %xmm14     # %xmm14 = byte-swap mask for every pshufb below
 790         # TMP6 and TMP7 are accepted as arguments but never referenced in this macro
 791         movdqu AadHash(%arg2), %xmm\i               # %xmm<i> = hash value stored in the context (AadHash)
 792         # besides its XMM arguments this writes %eax, %r10, %r11, %xmm14, TMP1, TMP2 and flags (plus whatever GHASH_MUL clobbers)
 793         # start AES for num_initial_blocks blocks
 794
 795         movdqu CurCount(%arg2), \XMM0                # XMM0 = counter block loaded from CurCount
 796
 797 .if (\i == 5) || (\i == 6) || (\i == 7)
 798         # i_seq lists the digits of the %xmm registers to process; presumably 678 / 78 / 8 to match the GHASH chain below - confirm at call sites
 799         MOVADQ          ONE(%RIP),\TMP1              # TMP1 = counter increment constant (ONE)
 800         MOVADQ          0(%arg1),\TMP2               # TMP2 = round key 0
 801 .irpc index, \i_seq
 802         paddd           \TMP1, \XMM0                 # INCR Y0
 803 .ifc \operation, dec
 804         movdqa     \XMM0, %xmm\index                 # same register copy as the .else arm (MOVADQ is movaps)
 805 .else
 806         MOVADQ          \XMM0, %xmm\index
 807 .endif
 808         pshufb  %xmm14, %xmm\index      # perform a 16 byte swap
 809         pxor            \TMP2, %xmm\index            # initial AddRoundKey (round key 0)
 810 .endr
 811         lea     0x10(%arg1),%r10                # %r10 -> round key 1
 812         mov     keysize,%eax                    # %eax becomes the number of aesenc rounds
 813         shr     $2,%eax                         # 128->4, 192->6, 256->8
 814         add     $5,%eax                       # 128->9, 192->11, 256->13
 815         # one loop iteration applies one round key to every block in i_seq
 816 .Laes_loop_initial_\@:
 817         MOVADQ  (%r10),\TMP1
 818 .irpc   index, \i_seq
 819         aesenc  \TMP1, %xmm\index
 820 .endr
 821         add     $16,%r10
 822         sub     $1,%eax
 823         jnz     .Laes_loop_initial_\@
 824
 825         MOVADQ  (%r10), \TMP1                # TMP1 = final round key
 826 .irpc index, \i_seq
 827         aesenclast \TMP1, %xmm\index         # Last Round
 828 .endr
 829 .irpc index, \i_seq
 830         movdqu     (%arg4 , %r11, 1), \TMP1          # TMP1 = 16 input bytes at offset %r11
 831         pxor       \TMP1, %xmm\index                 # input XOR encrypted counter
 832         movdqu     %xmm\index, (%arg3 , %r11, 1)
 833         # write back plaintext/ciphertext for num_initial_blocks
 834         add        $16, %r11
 835
 836 .ifc \operation, dec
 837         movdqa     \TMP1, %xmm\index                 # dec: hash the input block, not the block just written
 838 .endif
 839         pshufb     %xmm14, %xmm\index
 840
 841                 # byte-swap the block that will be fed to the GHASH computation
 842 .endr
 843 .endif
 844
 845         # apply GHASH on num_initial_blocks blocks
 846         # NOTE(review): TMP3 is handed to GHASH_MUL without being loaded here; presumably the caller preloads the hash key
 847 .if \i == 5
 848         pxor       %xmm5, %xmm6                      # fold the stored hash (%xmm5) into the first block
 849         GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 850         pxor       %xmm6, %xmm7
 851         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 852         pxor       %xmm7, %xmm8
 853         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 854 .elseif \i == 6
 855         pxor       %xmm6, %xmm7                      # stored hash is in %xmm6 here
 856         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 857         pxor       %xmm7, %xmm8
 858         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 859 .elseif \i == 7
 860         pxor       %xmm7, %xmm8                      # stored hash is in %xmm7 here
 861         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 862 .endif
 863         cmp        $64, %r13                 # %r13 presumably holds the byte count still to process - confirm at call sites
 864         jl      .L_initial_blocks_done\@     # signed compare: skip the 4-block path when %r13 < 64
 865         # no need for precomputed values
 866 /*
 867 * Encrypt the first 4 full counter blocks; the byte-swapped results are left in XMM1-XMM4.
 868 * NOTE(review): no HashKey precomputation is visible in this macro body.
 869 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
 870 */
 871         MOVADQ     ONE(%RIP),\TMP1
 872         paddd      \TMP1, \XMM0              # INCR Y0
 873         MOVADQ     \XMM0, \XMM1
 874         pshufb  %xmm14, \XMM1        # perform a 16 byte swap
 875
 876         paddd      \TMP1, \XMM0              # INCR Y0
 877         MOVADQ     \XMM0, \XMM2
 878         pshufb  %xmm14, \XMM2        # perform a 16 byte swap
 879
 880         paddd      \TMP1, \XMM0              # INCR Y0
 881         MOVADQ     \XMM0, \XMM3
 882         pshufb %xmm14, \XMM3        # perform a 16 byte swap
 883
 884         paddd      \TMP1, \XMM0              # INCR Y0
 885         MOVADQ     \XMM0, \XMM4
 886         pshufb %xmm14, \XMM4        # perform a 16 byte swap
 887
 888         MOVADQ     0(%arg1),\TMP1            # TMP1 = round key 0
 889         pxor       \TMP1, \XMM1
 890         pxor       \TMP1, \XMM2
 891         pxor       \TMP1, \XMM3
 892         pxor       \TMP1, \XMM4
 893 .irpc index, 1234 # rounds 1-4, round key at 0x10*index
 894         movaps 0x10*\index(%arg1), \TMP1
 895         aesenc     \TMP1, \XMM1
 896         aesenc     \TMP1, \XMM2
 897         aesenc     \TMP1, \XMM3
 898         aesenc     \TMP1, \XMM4
 899 .endr
 900 .irpc index, 56789 # rounds 5-9
 901         movaps 0x10*\index(%arg1), \TMP1
 902         aesenc     \TMP1, \XMM1
 903         aesenc     \TMP1, \XMM2
 904         aesenc     \TMP1, \XMM3
 905         aesenc     \TMP1, \XMM4
 906 .endr
 907         lea        0xa0(%arg1),%r10             # %r10 -> round key 10
 908         mov        keysize,%eax
 909         shr        $2,%eax                      # 128->4, 192->6, 256->8
 910         sub        $4,%eax                      # 128->0, 192->2, 256->4
 911         jz         .Laes_loop_pre_done\@        # no extra rounds needed: round key 10 is the last one
 912         # extra rounds for longer keys; NOTE(review): uses literal %xmm1-%xmm4, so XMM1-XMM4 must be those registers
 913 .Laes_loop_pre_\@:
 914         MOVADQ     (%r10),\TMP2
 915 .irpc   index, 1234
 916         aesenc     \TMP2, %xmm\index
 917 .endr
 918         add        $16,%r10
 919         sub        $1,%eax
 920         jnz        .Laes_loop_pre_\@
 921
 922 .Laes_loop_pre_done\@:
 923         MOVADQ     (%r10), \TMP2                # TMP2 = final round key
 924         aesenclast \TMP2, \XMM1
 925         aesenclast \TMP2, \XMM2
 926         aesenclast \TMP2, \XMM3
 927         aesenclast \TMP2, \XMM4
 928         movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 929         pxor       \TMP1, \XMM1                 # input XOR encrypted counter
 930 .ifc \operation, dec
 931         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 932         movdqa     \TMP1, \XMM1                 # dec: keep the input block for GHASH
 933 .endif
 934         movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 935         pxor       \TMP1, \XMM2
 936 .ifc \operation, dec
 937         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 938         movdqa     \TMP1, \XMM2
 939 .endif
 940         movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 941         pxor       \TMP1, \XMM3
 942 .ifc \operation, dec
 943         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 944         movdqa     \TMP1, \XMM3
 945 .endif
 946         movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 947         pxor       \TMP1, \XMM4
 948 .ifc \operation, dec
 949         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 950         movdqa     \TMP1, \XMM4
 951 .else                                           # non-dec: all four output blocks are stored here
 952         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 953         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 954         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 955         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 956 .endif
 957
 958         add        $64, %r11                    # advance the data offset past the four blocks
 959         pshufb %xmm14, \XMM1 # perform a 16 byte swap
 960         pxor       \XMMDst, \XMM1
 961 # combine GHASHed value with the corresponding ciphertext
 962         pshufb %xmm14, \XMM2 # perform a 16 byte swap
 963         pshufb %xmm14, \XMM3 # perform a 16 byte swap
 964         pshufb %xmm14, \XMM4 # perform a 16 byte swap
 965
 966 .L_initial_blocks_done\@:
 967
 968 .endm
 969
 970 /*
 971 * encrypt 4 blocks at a time (the new counter blocks are derived from XMM0)
 972 * ghash the 4 previously encrypted ciphertext blocks (XMM1-XMM4 on entry)
 973 * %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
 974 * %r11 is the data offset value (only read here); %eax, %r10, %xmm15, TMP1-TMP6, XMM5-XMM8 and flags are clobbered
 975 */
 976 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
 977 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 978         # the operation argument is never referenced; AES rounds and GHASH multiplies are interleaved below
 979         movdqa    \XMM1, \XMM5                  # XMM5-XMM8 = blocks to hash; XMM1-XMM4 are reused for new counters
 980         movdqa    \XMM2, \XMM6
 981         movdqa    \XMM3, \XMM7
 982         movdqa    \XMM4, \XMM8
 983
 984         movdqa    SHUF_MASK(%rip), %xmm15
 985         # multiply XMM5 * HashKey_4 using karatsuba
 986
 987         movdqa    \XMM5, \TMP4
 988         pshufd    $78, \XMM5, \TMP6             # TMP6 = XMM5 with its 64-bit halves swapped
 989         pxor      \XMM5, \TMP6                  # TMP6 = a1+a0 in both halves
 990         paddd     ONE(%rip), \XMM0              # INCR CNT
 991         movdqu    HashKey_4(%arg2), \TMP5
 992         pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
 993         movdqa    \XMM0, \XMM1
 994         paddd     ONE(%rip), \XMM0              # INCR CNT
 995         movdqa    \XMM0, \XMM2
 996         paddd     ONE(%rip), \XMM0              # INCR CNT
 997         movdqa    \XMM0, \XMM3
 998         paddd     ONE(%rip), \XMM0              # INCR CNT
 999         movdqa    \XMM0, \XMM4
1000         pshufb %xmm15, \XMM1    # perform a 16 byte swap
1001         pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1002         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1003         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1004         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1005
1006         pxor      (%arg1), \XMM1                # initial AddRoundKey (round key 0)
1007         pxor      (%arg1), \XMM2
1008         pxor      (%arg1), \XMM3
1009         pxor      (%arg1), \XMM4
1010         movdqu    HashKey_4_k(%arg2), \TMP5
1011         pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1012         movaps 0x10(%arg1), \TMP1
1013         aesenc    \TMP1, \XMM1              # Round 1
1014         aesenc    \TMP1, \XMM2
1015         aesenc    \TMP1, \XMM3
1016         aesenc    \TMP1, \XMM4
1017         movaps 0x20(%arg1), \TMP1
1018         aesenc    \TMP1, \XMM1              # Round 2
1019         aesenc    \TMP1, \XMM2
1020         aesenc    \TMP1, \XMM3
1021         aesenc    \TMP1, \XMM4
1022         movdqa    \XMM6, \TMP1              # multiply XMM6 * HashKey_3 using karatsuba
1023         pshufd    $78, \XMM6, \TMP2
1024         pxor      \XMM6, \TMP2
1025         movdqu    HashKey_3(%arg2), \TMP5
1026         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1027         movaps 0x30(%arg1), \TMP3
1028         aesenc    \TMP3, \XMM1              # Round 3
1029         aesenc    \TMP3, \XMM2
1030         aesenc    \TMP3, \XMM3
1031         aesenc    \TMP3, \XMM4
1032         pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1033         movaps 0x40(%arg1), \TMP3
1034         aesenc    \TMP3, \XMM1              # Round 4
1035         aesenc    \TMP3, \XMM2
1036         aesenc    \TMP3, \XMM3
1037         aesenc    \TMP3, \XMM4
1038         movdqu    HashKey_3_k(%arg2), \TMP5
1039         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1040         movaps 0x50(%arg1), \TMP3
1041         aesenc    \TMP3, \XMM1              # Round 5
1042         aesenc    \TMP3, \XMM2
1043         aesenc    \TMP3, \XMM3
1044         aesenc    \TMP3, \XMM4
1045         pxor      \TMP1, \TMP4
1046 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1047         pxor      \XMM6, \XMM5
1048         pxor      \TMP2, \TMP6
1049         movdqa    \XMM7, \TMP1
1050         pshufd    $78, \XMM7, \TMP2
1051         pxor      \XMM7, \TMP2
1052         movdqu    HashKey_2(%arg2), \TMP5
1053
1054         # Multiply XMM7 * HashKey_2 using karatsuba
1055
1056         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1057         movaps 0x60(%arg1), \TMP3
1058         aesenc    \TMP3, \XMM1              # Round 6
1059         aesenc    \TMP3, \XMM2
1060         aesenc    \TMP3, \XMM3
1061         aesenc    \TMP3, \XMM4
1062         pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1063         movaps 0x70(%arg1), \TMP3
1064         aesenc    \TMP3, \XMM1              # Round 7
1065         aesenc    \TMP3, \XMM2
1066         aesenc    \TMP3, \XMM3
1067         aesenc    \TMP3, \XMM4
1068         movdqu    HashKey_2_k(%arg2), \TMP5
1069         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1070         movaps 0x80(%arg1), \TMP3
1071         aesenc    \TMP3, \XMM1              # Round 8
1072         aesenc    \TMP3, \XMM2
1073         aesenc    \TMP3, \XMM3
1074         aesenc    \TMP3, \XMM4
1075         pxor      \TMP1, \TMP4
1076 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1077         pxor      \XMM7, \XMM5
1078         pxor      \TMP2, \TMP6
1079
1080         # Multiply XMM8 * HashKey
1081         # XMM8 and TMP5 hold the values for the two operands
1082
1083         movdqa    \XMM8, \TMP1
1084         pshufd    $78, \XMM8, \TMP2
1085         pxor      \XMM8, \TMP2
1086         movdqu    HashKey(%arg2), \TMP5
1087         pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1088         movaps 0x90(%arg1), \TMP3
1089         aesenc    \TMP3, \XMM1             # Round 9
1090         aesenc    \TMP3, \XMM2
1091         aesenc    \TMP3, \XMM3
1092         aesenc    \TMP3, \XMM4
1093         pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1094         lea       0xa0(%arg1),%r10              # %r10 -> round key 10
1095         mov       keysize,%eax
1096         shr       $2,%eax                       # 128->4, 192->6, 256->8
1097         sub       $4,%eax                       # 128->0, 192->2, 256->4
1098         jz        .Laes_loop_par_enc_done\@     # no extra rounds: round key 10 is the last one
1099         # extra rounds for longer keys; NOTE(review): uses literal %xmm1-%xmm4, so XMM1-XMM4 must be those registers
1100 .Laes_loop_par_enc\@:
1101         MOVADQ    (%r10),\TMP3
1102 .irpc   index, 1234
1103         aesenc    \TMP3, %xmm\index
1104 .endr
1105         add       $16,%r10
1106         sub       $1,%eax
1107         jnz       .Laes_loop_par_enc\@
1108
1109 .Laes_loop_par_enc_done\@:
1110         MOVADQ    (%r10), \TMP3
1111         aesenclast \TMP3, \XMM1           # last round (Round 10 for a 128-bit key)
1112         aesenclast \TMP3, \XMM2
1113         aesenclast \TMP3, \XMM3
1114         aesenclast \TMP3, \XMM4
1115         movdqu    HashKey_k(%arg2), \TMP5
1116         pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1117         movdqu    (%arg4,%r11,1), \TMP3
1118         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1119         movdqu    16(%arg4,%r11,1), \TMP3
1120         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1121         movdqu    32(%arg4,%r11,1), \TMP3
1122         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1123         movdqu    48(%arg4,%r11,1), \TMP3
1124         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1125         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1126         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1127         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1128         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1129         pshufb %xmm15, \XMM1        # perform a 16 byte swap
1130         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1131         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1132         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1133
1134         pxor      \TMP4, \TMP1                 # TMP1 = sum of the four a1*b1 products
1135         pxor      \XMM8, \XMM5                 # XMM5 = sum of the four a0*b0 products
1136         pxor      \TMP6, \TMP2                 # TMP2 = sum of the four (a1+a0)*(b1+b0) products
1137         pxor      \TMP1, \TMP2
1138         pxor      \XMM5, \TMP2                 # TMP2 = karatsuba middle term
1139         movdqa    \TMP2, \TMP3
1140         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1141         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1142         pxor      \TMP3, \XMM5
1143         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1144
1145         # first phase of reduction
1146
1147         movdqa    \XMM5, \TMP2
1148         movdqa    \XMM5, \TMP3
1149         movdqa    \XMM5, \TMP4
1150 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1151         pslld     $31, \TMP2                   # packed left shift << 31
1152         pslld     $30, \TMP3                   # packed left shift << 30
1153         pslld     $25, \TMP4                   # packed left shift << 25
1154         pxor      \TMP3, \TMP2                 # xor the shifted versions
1155         pxor      \TMP4, \TMP2
1156         movdqa    \TMP2, \TMP5
1157         psrldq    $4, \TMP5                    # right shift T5 1 DW
1158         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1159         pxor      \TMP2, \XMM5
1160
1161         # second phase of reduction
1162
1163         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1164         movdqa    \XMM5,\TMP3
1165         movdqa    \XMM5,\TMP4
1166         psrld     $1, \TMP2                    # packed right shift >> 1
1167         psrld     $2, \TMP3                    # packed right shift >> 2
1168         psrld     $7, \TMP4                    # packed right shift >> 7
1169         pxor      \TMP3,\TMP2                  # xor the shifted versions
1170         pxor      \TMP4,\TMP2
1171         pxor      \TMP5, \TMP2
1172         pxor      \TMP2, \XMM5
1173         pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1174
1175         pxor      \XMM5, \XMM1                 # fold the new hash value into the first fresh block
1176 .endm
1177
1178 /*
1179 * decrypt 4 blocks at a time (the new counter blocks are derived from XMM0)
1180 * ghash the 4 ciphertext blocks of the previous step (XMM1-XMM4 on entry)
1181 * %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
1182 * %r11 is the data offset value (only read here); %eax, %r10, %xmm15, TMP1-TMP6, XMM5-XMM8 and flags are clobbered
1183 */
1184 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1185 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1186         # the operation argument is never referenced; AES rounds and GHASH multiplies are interleaved below
1187         movdqa    \XMM1, \XMM5                  # XMM5-XMM8 = blocks to hash; XMM1-XMM4 are reused for new counters
1188         movdqa    \XMM2, \XMM6
1189         movdqa    \XMM3, \XMM7
1190         movdqa    \XMM4, \XMM8
1191
1192         movdqa    SHUF_MASK(%rip), %xmm15
1193         # multiply XMM5 * HashKey_4 using karatsuba
1194
1195         movdqa    \XMM5, \TMP4
1196         pshufd    $78, \XMM5, \TMP6             # TMP6 = XMM5 with its 64-bit halves swapped
1197         pxor      \XMM5, \TMP6                  # TMP6 = a1+a0 in both halves
1198         paddd     ONE(%rip), \XMM0              # INCR CNT
1199         movdqu    HashKey_4(%arg2), \TMP5
1200         pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1201         movdqa    \XMM0, \XMM1
1202         paddd     ONE(%rip), \XMM0              # INCR CNT
1203         movdqa    \XMM0, \XMM2
1204         paddd     ONE(%rip), \XMM0              # INCR CNT
1205         movdqa    \XMM0, \XMM3
1206         paddd     ONE(%rip), \XMM0              # INCR CNT
1207         movdqa    \XMM0, \XMM4
1208         pshufb %xmm15, \XMM1    # perform a 16 byte swap
1209         pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1210         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1211         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1212         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1213
1214         pxor      (%arg1), \XMM1                # initial AddRoundKey (round key 0)
1215         pxor      (%arg1), \XMM2
1216         pxor      (%arg1), \XMM3
1217         pxor      (%arg1), \XMM4
1218         movdqu    HashKey_4_k(%arg2), \TMP5
1219         pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1220         movaps 0x10(%arg1), \TMP1
1221         aesenc    \TMP1, \XMM1              # Round 1
1222         aesenc    \TMP1, \XMM2
1223         aesenc    \TMP1, \XMM3
1224         aesenc    \TMP1, \XMM4
1225         movaps 0x20(%arg1), \TMP1
1226         aesenc    \TMP1, \XMM1              # Round 2
1227         aesenc    \TMP1, \XMM2
1228         aesenc    \TMP1, \XMM3
1229         aesenc    \TMP1, \XMM4
1230         movdqa    \XMM6, \TMP1              # multiply XMM6 * HashKey_3 using karatsuba
1231         pshufd    $78, \XMM6, \TMP2
1232         pxor      \XMM6, \TMP2
1233         movdqu    HashKey_3(%arg2), \TMP5
1234         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1235         movaps 0x30(%arg1), \TMP3
1236         aesenc    \TMP3, \XMM1              # Round 3
1237         aesenc    \TMP3, \XMM2
1238         aesenc    \TMP3, \XMM3
1239         aesenc    \TMP3, \XMM4
1240         pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1241         movaps 0x40(%arg1), \TMP3
1242         aesenc    \TMP3, \XMM1              # Round 4
1243         aesenc    \TMP3, \XMM2
1244         aesenc    \TMP3, \XMM3
1245         aesenc    \TMP3, \XMM4
1246         movdqu    HashKey_3_k(%arg2), \TMP5
1247         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1248         movaps 0x50(%arg1), \TMP3
1249         aesenc    \TMP3, \XMM1              # Round 5
1250         aesenc    \TMP3, \XMM2
1251         aesenc    \TMP3, \XMM3
1252         aesenc    \TMP3, \XMM4
1253         pxor      \TMP1, \TMP4
1254 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1255         pxor      \XMM6, \XMM5
1256         pxor      \TMP2, \TMP6
1257         movdqa    \XMM7, \TMP1
1258         pshufd    $78, \XMM7, \TMP2
1259         pxor      \XMM7, \TMP2
1260         movdqu    HashKey_2(%arg2), \TMP5
1261
1262         # Multiply XMM7 * HashKey_2 using karatsuba
1263
1264         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1265         movaps 0x60(%arg1), \TMP3
1266         aesenc    \TMP3, \XMM1              # Round 6
1267         aesenc    \TMP3, \XMM2
1268         aesenc    \TMP3, \XMM3
1269         aesenc    \TMP3, \XMM4
1270         pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1271         movaps 0x70(%arg1), \TMP3
1272         aesenc    \TMP3, \XMM1              # Round 7
1273         aesenc    \TMP3, \XMM2
1274         aesenc    \TMP3, \XMM3
1275         aesenc    \TMP3, \XMM4
1276         movdqu    HashKey_2_k(%arg2), \TMP5
1277         pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1278         movaps 0x80(%arg1), \TMP3
1279         aesenc    \TMP3, \XMM1              # Round 8
1280         aesenc    \TMP3, \XMM2
1281         aesenc    \TMP3, \XMM3
1282         aesenc    \TMP3, \XMM4
1283         pxor      \TMP1, \TMP4
1284 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1285         pxor      \XMM7, \XMM5
1286         pxor      \TMP2, \TMP6
1287
1288         # Multiply XMM8 * HashKey
1289         # XMM8 and TMP5 hold the values for the two operands
1290
1291         movdqa    \XMM8, \TMP1
1292         pshufd    $78, \XMM8, \TMP2
1293         pxor      \XMM8, \TMP2
1294         movdqu    HashKey(%arg2), \TMP5
1295         pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1296         movaps 0x90(%arg1), \TMP3
1297         aesenc    \TMP3, \XMM1             # Round 9
1298         aesenc    \TMP3, \XMM2
1299         aesenc    \TMP3, \XMM3
1300         aesenc    \TMP3, \XMM4
1301         pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1302         lea       0xa0(%arg1),%r10              # %r10 -> round key 10
1303         mov       keysize,%eax
1304         shr       $2,%eax                       # 128->4, 192->6, 256->8
1305         sub       $4,%eax                       # 128->0, 192->2, 256->4
1306         jz        .Laes_loop_par_dec_done\@     # no extra rounds: round key 10 is the last one
1307         # extra rounds for longer keys; NOTE(review): uses literal %xmm1-%xmm4, so XMM1-XMM4 must be those registers
1308 .Laes_loop_par_dec\@:
1309         MOVADQ    (%r10),\TMP3
1310 .irpc   index, 1234
1311         aesenc    \TMP3, %xmm\index
1312 .endr
1313         add       $16,%r10
1314         sub       $1,%eax
1315         jnz       .Laes_loop_par_dec\@
1316
1317 .Laes_loop_par_dec_done\@:
1318         MOVADQ    (%r10), \TMP3
1319         aesenclast \TMP3, \XMM1           # last round
1320         aesenclast \TMP3, \XMM2
1321         aesenclast \TMP3, \XMM3
1322         aesenclast \TMP3, \XMM4
1323         movdqu    HashKey_k(%arg2), \TMP5
1324         pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1325         movdqu    (%arg4,%r11,1), \TMP3
1326         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1327         movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1328         movdqa    \TMP3, \XMM1                 # keep the input (ciphertext) block for the next GHASH step
1329         movdqu    16(%arg4,%r11,1), \TMP3
1330         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1331         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1332         movdqa    \TMP3, \XMM2
1333         movdqu    32(%arg4,%r11,1), \TMP3
1334         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1335         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1336         movdqa    \TMP3, \XMM3
1337         movdqu    48(%arg4,%r11,1), \TMP3
1338         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1339         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1340         movdqa    \TMP3, \XMM4
1341         pshufb %xmm15, \XMM1        # perform a 16 byte swap
1342         pshufb %xmm15, \XMM2    # perform a 16 byte swap
1343         pshufb %xmm15, \XMM3    # perform a 16 byte swap
1344         pshufb %xmm15, \XMM4    # perform a 16 byte swap
1345
1346         pxor      \TMP4, \TMP1                 # TMP1 = sum of the four a1*b1 products
1347         pxor      \XMM8, \XMM5                 # XMM5 = sum of the four a0*b0 products
1348         pxor      \TMP6, \TMP2                 # TMP2 = sum of the four (a1+a0)*(b1+b0) products
1349         pxor      \TMP1, \TMP2
1350         pxor      \XMM5, \TMP2                 # TMP2 = karatsuba middle term
1351         movdqa    \TMP2, \TMP3
1352         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1353         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1354         pxor      \TMP3, \XMM5
1355         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1356
1357         # first phase of reduction
1358
1359         movdqa    \XMM5, \TMP2
1360         movdqa    \XMM5, \TMP3
1361         movdqa    \XMM5, \TMP4
1362 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1363         pslld     $31, \TMP2                   # packed left shift << 31
1364         pslld     $30, \TMP3                   # packed left shift << 30
1365         pslld     $25, \TMP4                   # packed left shift << 25
1366         pxor      \TMP3, \TMP2                 # xor the shifted versions
1367         pxor      \TMP4, \TMP2
1368         movdqa    \TMP2, \TMP5
1369         psrldq    $4, \TMP5                    # right shift T5 1 DW
1370         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1371         pxor      \TMP2, \XMM5
1372
1373         # second phase of reduction
1374
1375         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1376         movdqa    \XMM5,\TMP3
1377         movdqa    \XMM5,\TMP4
1378         psrld     $1, \TMP2                    # packed right shift >> 1
1379         psrld     $2, \TMP3                    # packed right shift >> 2
1380         psrld     $7, \TMP4                    # packed right shift >> 7
1381         pxor      \TMP3,\TMP2                  # xor the shifted versions
1382         pxor      \TMP4,\TMP2
1383         pxor      \TMP5, \TMP2
1384         pxor      \TMP2, \XMM5
1385         pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1386
1387         pxor      \XMM5, \XMM1                 # fold the new hash value into the first fresh block
1388 .endm
1389
1390 /* GHASH the last 4 ciphertext blocks (XMM1-XMM4); the reduced result is left in XMMDst. */
1391 .macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1392 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1393         # reads HashKey .. HashKey_4 and their _k companions through %arg2; overwrites TMP1-TMP7 and XMM1-XMM4
1394         # Multiply XMM1 * HashKey_4 (using Karatsuba)
1395
1396         movdqa    \XMM1, \TMP6
1397         pshufd    $78, \XMM1, \TMP2         # TMP2 = XMM1 with its 64-bit halves swapped
1398         pxor      \XMM1, \TMP2              # TMP2 = a1+a0 in both halves
1399         movdqu    HashKey_4(%arg2), \TMP5
1400         pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1401         pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1402         movdqu    HashKey_4_k(%arg2), \TMP4
1403         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1404         movdqa    \XMM1, \XMMDst
1405         movdqa    \TMP2, \XMM1              # XMM1 now accumulates the middle terms; result in TMP6, XMMDst, XMM1
1406
1407         # Multiply XMM2 * HashKey_3 (using Karatsuba)
1408
1409         movdqa    \XMM2, \TMP1
1410         pshufd    $78, \XMM2, \TMP2
1411         pxor      \XMM2, \TMP2
1412         movdqu    HashKey_3(%arg2), \TMP5
1413         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1414         pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1415         movdqu    HashKey_3_k(%arg2), \TMP4
1416         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1417         pxor      \TMP1, \TMP6
1418         pxor      \XMM2, \XMMDst
1419         pxor      \TMP2, \XMM1
1420 # results accumulated in TMP6, XMMDst, XMM1
1421
1422         # Multiply XMM3 * HashKey_2 (using Karatsuba)
1423
1424         movdqa    \XMM3, \TMP1
1425         pshufd    $78, \XMM3, \TMP2
1426         pxor      \XMM3, \TMP2
1427         movdqu    HashKey_2(%arg2), \TMP5
1428         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1429         pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1430         movdqu    HashKey_2_k(%arg2), \TMP4
1431         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1432         pxor      \TMP1, \TMP6
1433         pxor      \XMM3, \XMMDst
1434         pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1435
1436         # Multiply XMM4 * HashKey (using Karatsuba)
1437         movdqa    \XMM4, \TMP1
1438         pshufd    $78, \XMM4, \TMP2
1439         pxor      \XMM4, \TMP2
1440         movdqu    HashKey(%arg2), \TMP5
1441         pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1442         pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1443         movdqu    HashKey_k(%arg2), \TMP4
1444         pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1445         pxor      \TMP1, \TMP6              # TMP6 = sum of the four a1*b1 products
1446         pxor      \XMM4, \XMMDst            # XMMDst = sum of the four a0*b0 products
1447         pxor      \XMM1, \TMP2              # TMP2 = sum of the four (a1+a0)*(b1+b0) products
1448         pxor      \TMP6, \TMP2
1449         pxor      \XMMDst, \TMP2
1450         # middle section of the temp results combined as in karatsuba algorithm
1451         movdqa    \TMP2, \TMP4
1452         pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1453         psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1454         pxor      \TMP4, \XMMDst
1455         pxor      \TMP2, \TMP6
1456 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1457         # first phase of the reduction
1458         movdqa    \XMMDst, \TMP2
1459         movdqa    \XMMDst, \TMP3
1460         movdqa    \XMMDst, \TMP4
1461 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1462         pslld     $31, \TMP2                # packed left shifting << 31
1463         pslld     $30, \TMP3                # packed left shifting << 30
1464         pslld     $25, \TMP4                # packed left shifting << 25
1465         pxor      \TMP3, \TMP2              # xor the shifted versions
1466         pxor      \TMP4, \TMP2
1467         movdqa    \TMP2, \TMP7
1468         psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1469         pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1470         pxor      \TMP2, \XMMDst
1471
1472         # second phase of the reduction
1473         movdqa    \XMMDst, \TMP2
1474         # make 3 copies of XMMDst for doing 3 shift operations
1475         movdqa    \XMMDst, \TMP3
1476         movdqa    \XMMDst, \TMP4
1477         psrld     $1, \TMP2                 # packed right shift >> 1
1478         psrld     $2, \TMP3                 # packed right shift >> 2
1479         psrld     $7, \TMP4                 # packed right shift >> 7
1480         pxor      \TMP3, \TMP2              # xor the shifted versions
1481         pxor      \TMP4, \TMP2
1482         pxor      \TMP7, \TMP2
1483         pxor      \TMP2, \XMMDst
1484         pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1485 .endm
1486
1487
1488 /* Encryption of a single block: XMM0 is AES-encrypted in place with the expanded key at %arg1
1489 * uses eax & r10 (both clobbered, together with TMP1 and the flags)
1490 */
1491
1492 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1493
1494         pxor            (%arg1), \XMM0          # initial AddRoundKey (round key 0)
1495         mov             keysize,%eax
1496         shr             $2,%eax                 # 128->4, 192->6, 256->8
1497         add             $5,%eax                 # 128->9, 192->11, 256->13
1498         lea             16(%arg1), %r10   # get first expanded key address
1499         # the .L prefix keeps this per-expansion loop label assembler-local, like the other loop labels in this file
1500 .L_esb_loop_\@:
1501         MOVADQ          (%r10),\TMP1
1502         aesenc          \TMP1,\XMM0
1503         add             $16,%r10
1504         sub             $1,%eax
1505         jnz             .L_esb_loop_\@
1506
1507         MOVADQ          (%r10),\TMP1            # TMP1 = final round key
1508         aesenclast      \TMP1,\XMM0
1509 .endm
1510 /*****************************************************************************
1511 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1512 *                   struct gcm_context_data *data,
1513 *                                      // Context data
1514 *                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1515 *                   const u8 *in,      // Ciphertext input
1516 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1517 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1518 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1519 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1520 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1521 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1522 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1523 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1524 *                                      // given authentication tag and only return the plaintext if they match.
1525 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1526 *                                      // (most likely), 12 or 8.
1527 *
1528 * Assumptions:
1529 *
1530 * keys:
1531 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
1532 *       set of 11 keys in the data structure void *aes_ctx
1533 *
1534 * iv:
1535 *       0                   1                   2                   3
1536 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1537 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1538 *       |                             Salt  (From the SA)               |
1539 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1540 *       |                     Initialization Vector                     |
1541 *       |         (This is the sequence number from IPSec header)       |
1542 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543 *       |                              0x1                              |
1544 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545 *
1546 *
1547 *
1548 * AAD:
1549 *       AAD padded to 128 bits with 0
1550 *       for example, assume AAD is a u32 vector
1551 *
1552 *       if AAD is 8 bytes:
1553 *       AAD[3] = {A0, A1};
1554 *       padded AAD in xmm register = {A1 A0 0 0}
1555 *
1556 *       0                   1                   2                   3
1557 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1558 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1559 *       |                               SPI (A1)                        |
1560 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1561 *       |                     32-bit Sequence Number (A0)               |
1562 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563 *       |                              0x0                              |
1564 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 *
1566 *                                       AAD Format with 32-bit Sequence Number
1567 *
1568 *       if AAD is 12 bytes:
1569 *       AAD[3] = {A0, A1, A2};
1570 *       padded AAD in xmm register = {A2 A1 A0 0}
1571 *
1572 *       0                   1                   2                   3
1573 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1574 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1575 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1576 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1577 *       |                               SPI (A2)                        |
1578 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1580 *       |                                                               |
1581 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582 *       |                              0x0                              |
1583 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 *
1585 *                        AAD Format with 64-bit Extended Sequence Number
1586 *
1587 * poly = x^128 + x^127 + x^126 + x^121 + 1
1588 *
1589 *****************************************************************************/
1590 SYM_FUNC_START(aesni_gcm_dec)
             /*
              * One-shot decrypt entry point: GCM_INIT, GCM_ENC_DEC and
              * GCM_COMPLETE (macros defined elsewhere in this file) are expanded
              * back to back between FUNC_SAVE and FUNC_RESTORE.
              */
1591         FUNC_SAVE
1592
             /*
              * Assuming argN names the Nth C parameter, arg7-arg9 are
              * hash_subkey, aad and aad_len from the prototype comment above;
              * arg6 is the parameter just before hash_subkey.
              */
1593         GCM_INIT %arg6, arg7, arg8, arg9
             /* the "dec" operand presumably selects the decrypt flavour of the macro */
1594         GCM_ENC_DEC dec
             /* arg10, arg11: auth_tag and auth_tag_len */
1595         GCM_COMPLETE arg10, arg11
1596         FUNC_RESTORE
1597         RET
1598 SYM_FUNC_END(aesni_gcm_dec)
1599
1600
1601 /*****************************************************************************
1602 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1603 *                    struct gcm_context_data *data,
1604 *                                        // Context data
1605 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1606 *                    const u8 *in,       // Plaintext input
1607 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1608 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1609 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1610 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1611 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1612 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1613 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1614 *                    u8 *auth_tag,       // Authenticated Tag output.
1615 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1616 *                                        // 12 or 8.
1617 *
1618 * Assumptions:
1619 *
1620 * keys:
1621 *       keys are pre-expanded and aligned to 16 bytes. we are using the
1622 *       first set of 11 keys in the data structure void *aes_ctx
1623 *
1624 *
1625 * iv:
1626 *       0                   1                   2                   3
1627 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1628 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1629 *       |                             Salt  (From the SA)               |
1630 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631 *       |                     Initialization Vector                     |
1632 *       |         (This is the sequence number from IPSec header)       |
1633 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634 *       |                              0x1                              |
1635 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636 *
1637 *
1638 *
1639 * AAD:
1640 *       AAD padded to 128 bits with 0
1641 *       for example, assume AAD is a u32 vector
1642 *
1643 *       if AAD is 8 bytes:
1644 *       AAD[3] = {A0, A1};
1645 *       padded AAD in xmm register = {A1 A0 0 0}
1646 *
1647 *       0                   1                   2                   3
1648 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1649 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1650 *       |                               SPI (A1)                        |
1651 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1652 *       |                     32-bit Sequence Number (A0)               |
1653 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654 *       |                              0x0                              |
1655 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 *
1657 *                                 AAD Format with 32-bit Sequence Number
1658 *
1659 *       if AAD is 12 bytes:
1660 *       AAD[3] = {A0, A1, A2};
1661 *       padded AAD in xmm register = {A2 A1 A0 0}
1662 *
1663 *       0                   1                   2                   3
1664 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1665 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666 *       |                               SPI (A2)                        |
1667 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1668 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1669 *       |                                                               |
1670 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671 *       |                              0x0                              |
1672 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673 *
1674 *                         AAD Format with 64-bit Extended Sequence Number
1675 *
1676 * poly = x^128 + x^127 + x^126 + x^121 + 1
1677 ***************************************************************************/
1678 SYM_FUNC_START(aesni_gcm_enc)
             /*
              * One-shot encrypt entry point: GCM_INIT, GCM_ENC_DEC and
              * GCM_COMPLETE (macros defined elsewhere in this file) are expanded
              * back to back between FUNC_SAVE and FUNC_RESTORE.
              */
1679         FUNC_SAVE
1680
             /*
              * Assuming argN names the Nth C parameter: iv, hash_subkey, aad
              * and aad_len from the prototype comment above.
              */
1681         GCM_INIT %arg6, arg7, arg8, arg9
             /* the "enc" operand presumably selects the encrypt flavour of the macro */
1682         GCM_ENC_DEC enc
1683
             /* arg10, arg11: auth_tag and auth_tag_len */
1684         GCM_COMPLETE arg10, arg11
1685         FUNC_RESTORE
1686         RET
1687 SYM_FUNC_END(aesni_gcm_enc)
1688
1689 /*****************************************************************************
1690 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1691 *                     struct gcm_context_data *data,
1692 *                                         // context data
1693 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1694 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1695 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1696 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1697 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1698 *                     u64 aad_len)        // Length of AAD in bytes.
     *
     * Thin wrapper: expands the GCM_INIT macro (defined elsewhere in this file)
     * on prototype arguments 3-6, i.e. iv, hash_subkey, aad and aad_len,
     * between FUNC_SAVE and FUNC_RESTORE.  No data is encrypted or decrypted
     * by this entry point itself.
1699 */
1700 SYM_FUNC_START(aesni_gcm_init)
1701         FUNC_SAVE
1702         GCM_INIT %arg3, %arg4,%arg5, %arg6
1703         FUNC_RESTORE
1704         RET
1705 SYM_FUNC_END(aesni_gcm_init)
1706
1707 /*****************************************************************************
1708 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1709 *                    struct gcm_context_data *data,
1710 *                                        // context data
1711 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1712 *                    const u8 *in,       // Plaintext input
1713 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
     *
     * Thin wrapper: expands only GCM_ENC_DEC (encrypt flavour) between
     * FUNC_SAVE and FUNC_RESTORE.  Neither GCM_INIT nor GCM_COMPLETE is run
     * here; see aesni_gcm_init and aesni_gcm_finalize for those steps.
1714 */
1715 SYM_FUNC_START(aesni_gcm_enc_update)
1716         FUNC_SAVE
1717         GCM_ENC_DEC enc
1718         FUNC_RESTORE
1719         RET
1720 SYM_FUNC_END(aesni_gcm_enc_update)
1721
1722 /*****************************************************************************
1723 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1724 *                    struct gcm_context_data *data,
1725 *                                        // context data
1726 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1727 *                    const u8 *in,       // Ciphertext input
1728 *                    u64 plaintext_len,  // Length of data in bytes for decryption.
     *
     * Thin wrapper: expands only GCM_ENC_DEC (decrypt flavour) between
     * FUNC_SAVE and FUNC_RESTORE.  Neither GCM_INIT nor GCM_COMPLETE is run
     * here; see aesni_gcm_init and aesni_gcm_finalize for those steps.
1729 */
1730 SYM_FUNC_START(aesni_gcm_dec_update)
1731         FUNC_SAVE
1732         GCM_ENC_DEC dec
1733         FUNC_RESTORE
1734         RET
1735 SYM_FUNC_END(aesni_gcm_dec_update)
1736
1737 /*****************************************************************************
1738 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1739 *                    struct gcm_context_data *data,
1740 *                                        // context data
1741 *                    u8 *auth_tag,       // Authenticated Tag output.
1742 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1743 *                                        // 12 or 8.
     *
     * Thin wrapper: expands only GCM_COMPLETE (defined elsewhere in this file)
     * on prototype arguments 3 and 4, i.e. auth_tag and auth_tag_len, between
     * FUNC_SAVE and FUNC_RESTORE.
1744 */
1745 SYM_FUNC_START(aesni_gcm_finalize)
1746         FUNC_SAVE
             /* comma-separated operands, matching every other macro call in this file */
1747         GCM_COMPLETE %arg3, %arg4
1748         FUNC_RESTORE
1749         RET
1750 SYM_FUNC_END(aesni_gcm_finalize)
1751
1752 #endif
1753
1754 SYM_FUNC_START_LOCAL(_key_expansion_256a)
             /*
              * Key-schedule step that updates xmm0 in place and appends it to
              * the round-key array.  Also reachable as _key_expansion_128 via
              * the alias below.
              *
              * In:  xmm0  = key words to update
              *      xmm1  = aeskeygenassist result (dword 3 is used)
              *      xmm4  = scratch; its low dword is expected to be zero
              *              (aesni_set_key clears xmm4 before the first call)
              *      TKEYP = address where the new round key is stored
              * Out: xmm0  = new round key, also written to (TKEYP)
              *      TKEYP advanced by 16 bytes
              * Changes xmm1 and xmm4.  Both shufps instructions copy dword 0
              * of xmm4 onto itself, so that dword is carried over unchanged
              * to the next call.
              */
1755         pshufd $0b11111111, %xmm1, %xmm1        # broadcast dword 3 of xmm1 to all lanes
1756         shufps $0b00010000, %xmm0, %xmm4        # xmm4 = { xmm4[0], xmm4[0], xmm0[1], xmm0[0] }
1757         pxor %xmm4, %xmm0
1758         shufps $0b10001100, %xmm0, %xmm4        # xmm4 = { xmm4[0], xmm4[3], xmm0[0], xmm0[2] }
1759         pxor %xmm4, %xmm0                       # each dword now holds the XOR of itself and all lower dwords
1760         pxor %xmm1, %xmm0                       # fold in the broadcast keygenassist word
1761         movaps %xmm0, (TKEYP)                   # aligned store of the new round key
1762         add $0x10, TKEYP
1763         RET
1764 SYM_FUNC_END(_key_expansion_256a)
1765 SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
1766
1767 SYM_FUNC_START_LOCAL(_key_expansion_192a)
             /*
              * AES-192 key-schedule step that emits 32 bytes of round-key
              * material.
              *
              * In:  xmm0  = key words to update
              *      xmm1  = aeskeygenassist result (dword 1 is used)
              *      xmm2  = remaining key words (low 8 bytes are stored)
              *      xmm4  = scratch; its low dword is expected to be zero
              *      TKEYP = store address
              * Out: xmm0, xmm2 updated in place
              *      (TKEYP)      = { old xmm2 low qword, new xmm0 low qword }
              *      0x10(TKEYP)  = { new xmm0 high qword, new xmm2 low qword }
              *      TKEYP advanced by 32 bytes
              * Changes xmm1, xmm3, xmm4, xmm5 and xmm6.
              */
1768         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of xmm1 to all lanes
1769         shufps $0b00010000, %xmm0, %xmm4        # xmm4 = { xmm4[0], xmm4[0], xmm0[1], xmm0[0] }
1770         pxor %xmm4, %xmm0
1771         shufps $0b10001100, %xmm0, %xmm4        # xmm4 = { xmm4[0], xmm4[3], xmm0[0], xmm0[2] }
1772         pxor %xmm4, %xmm0                       # each dword now holds the XOR of itself and all lower dwords
1773         pxor %xmm1, %xmm0                       # fold in the broadcast keygenassist word
1774
1775         movaps %xmm2, %xmm5
1776         movaps %xmm2, %xmm6                     # keep the old xmm2 for the first store below
1777         pslldq $4, %xmm5                        # old xmm2 shifted up by one dword
1778         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the new xmm0
1779         pxor %xmm3, %xmm2
1780         pxor %xmm5, %xmm2
1781
1782         movaps %xmm0, %xmm1
1783         shufps $0b01000100, %xmm0, %xmm6        # xmm6 = { old xmm2 low qword, new xmm0 low qword }
1784         movaps %xmm6, (TKEYP)
1785         shufps $0b01001110, %xmm2, %xmm1        # xmm1 = { new xmm0 high qword, new xmm2 low qword }
1786         movaps %xmm1, 0x10(TKEYP)
1787         add $0x20, TKEYP
1788         RET
1789 SYM_FUNC_END(_key_expansion_192a)
1790
1791 SYM_FUNC_START_LOCAL(_key_expansion_192b)
             /*
              * AES-192 key-schedule step that emits 16 bytes of round-key
              * material.  It updates xmm0 and xmm2 exactly as
              * _key_expansion_192a does, but stores only the new xmm0.
              *
              * In:  xmm0, xmm2 = key words to update
              *      xmm1  = aeskeygenassist result (dword 1 is used)
              *      xmm4  = scratch; its low dword is expected to be zero
              *      TKEYP = store address
              * Out: xmm0, xmm2 updated in place; (TKEYP) = new xmm0;
              *      TKEYP advanced by 16 bytes
              * Changes xmm1, xmm3, xmm4 and xmm5.
              */
1792         pshufd $0b01010101, %xmm1, %xmm1        # broadcast dword 1 of xmm1 to all lanes
1793         shufps $0b00010000, %xmm0, %xmm4        # xmm4 = { xmm4[0], xmm4[0], xmm0[1], xmm0[0] }
1794         pxor %xmm4, %xmm0
1795         shufps $0b10001100, %xmm0, %xmm4        # xmm4 = { xmm4[0], xmm4[3], xmm0[0], xmm0[2] }
1796         pxor %xmm4, %xmm0                       # each dword now holds the XOR of itself and all lower dwords
1797         pxor %xmm1, %xmm0                       # fold in the broadcast keygenassist word
1798
1799         movaps %xmm2, %xmm5
1800         pslldq $4, %xmm5                        # old xmm2 shifted up by one dword
1801         pshufd $0b11111111, %xmm0, %xmm3        # broadcast dword 3 of the new xmm0
1802         pxor %xmm3, %xmm2
1803         pxor %xmm5, %xmm2
1804
1805         movaps %xmm0, (TKEYP)
1806         add $0x10, TKEYP
1807         RET
1808 SYM_FUNC_END(_key_expansion_192b)
1809
1810 SYM_FUNC_START_LOCAL(_key_expansion_256b)
             /*
              * AES-256 key-schedule step for the second key half: same shape
              * as _key_expansion_256a, but it updates xmm2 instead of xmm0 and
              * broadcasts dword 2 (not dword 3) of the keygenassist result.
              *
              * In:  xmm2  = key words to update
              *      xmm1  = aeskeygenassist result (dword 2 is used)
              *      xmm4  = scratch; its low dword is expected to be zero
              *      TKEYP = store address
              * Out: xmm2  = new round key, also written to (TKEYP)
              *      TKEYP advanced by 16 bytes
              * Changes xmm1 and xmm4.
              */
1811         pshufd $0b10101010, %xmm1, %xmm1        # broadcast dword 2 of xmm1 to all lanes
1812         shufps $0b00010000, %xmm2, %xmm4        # xmm4 = { xmm4[0], xmm4[0], xmm2[1], xmm2[0] }
1813         pxor %xmm4, %xmm2
1814         shufps $0b10001100, %xmm2, %xmm4        # xmm4 = { xmm4[0], xmm4[3], xmm2[0], xmm2[2] }
1815         pxor %xmm4, %xmm2                       # each dword now holds the XOR of itself and all lower dwords
1816         pxor %xmm1, %xmm2                       # fold in the broadcast keygenassist word
1817         movaps %xmm2, (TKEYP)
1818         add $0x10, TKEYP
1819         RET
1820 SYM_FUNC_END(_key_expansion_256b)
1821
1822 /*
1823  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1824  *                   unsigned int key_len)
      *
      * Expands the user key into encryption round keys stored from offset 0 of
      * ctx, records key_len at offset 480 of ctx, and then derives the
      * decryption round keys, which are stored from offset 240 of ctx.
      *
      * The schedule is selected from the low byte of key_len only (%dl):
      * below 24 takes the AES-128 path, exactly 24 the AES-192 path, anything
      * else the AES-256 path.  No other validation of key_len happens here.
      *
      * Every path ends by zeroing AREG (presumably the return-value register),
      * i.e. the function reports 0.
      *
      * ctx is written with movaps, so it has to be 16-byte aligned; in_key is
      * read with unaligned loads.
1825  */
1826 SYM_FUNC_START(aesni_set_key)
1827         FRAME_BEGIN
1828 #ifndef __x86_64__
1829         pushl KEYP                      # 32-bit only: save KEYP, then load the stack arguments
1830         movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1831         movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1832         movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1833 #endif
1834         movups (UKEYP), %xmm0           # user key (first 16 bytes)
1835         movaps %xmm0, (KEYP)            # round key 0 is the raw key
1836         lea 0x10(KEYP), TKEYP           # key addr: where the next round key goes
1837         movl %edx, 480(KEYP)            # remember key_len at ctx+480 for the cipher routines
1838         pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1839         cmp $24, %dl                    # dispatch on the low byte of key_len
1840         jb .Lenc_key128
1841         je .Lenc_key192
             /* AES-256: the second 16 key bytes form round key 1; 13 more keys follow */
1842         movups 0x10(UKEYP), %xmm2       # other user key
1843         movaps %xmm2, (TKEYP)
1844         add $0x10, TKEYP
1845         aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1846         call _key_expansion_256a
1847         aeskeygenassist $0x1, %xmm0, %xmm1
1848         call _key_expansion_256b
1849         aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1850         call _key_expansion_256a
1851         aeskeygenassist $0x2, %xmm0, %xmm1
1852         call _key_expansion_256b
1853         aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1854         call _key_expansion_256a
1855         aeskeygenassist $0x4, %xmm0, %xmm1
1856         call _key_expansion_256b
1857         aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1858         call _key_expansion_256a
1859         aeskeygenassist $0x8, %xmm0, %xmm1
1860         call _key_expansion_256b
1861         aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1862         call _key_expansion_256a
1863         aeskeygenassist $0x10, %xmm0, %xmm1
1864         call _key_expansion_256b
1865         aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1866         call _key_expansion_256a
1867         aeskeygenassist $0x20, %xmm0, %xmm1
1868         call _key_expansion_256b
1869         aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1870         call _key_expansion_256a
1871         jmp .Ldec_key
1872 .Lenc_key192:
             /* AES-192: the a/b helpers alternate, storing 32 and 16 bytes per call */
1873         movq 0x10(UKEYP), %xmm2         # other user key (8 bytes; upper half of xmm2 is zeroed)
1874         aeskeygenassist $0x1, %xmm2, %xmm1      # round 1
1875         call _key_expansion_192a
1876         aeskeygenassist $0x2, %xmm2, %xmm1      # round 2
1877         call _key_expansion_192b
1878         aeskeygenassist $0x4, %xmm2, %xmm1      # round 3
1879         call _key_expansion_192a
1880         aeskeygenassist $0x8, %xmm2, %xmm1      # round 4
1881         call _key_expansion_192b
1882         aeskeygenassist $0x10, %xmm2, %xmm1     # round 5
1883         call _key_expansion_192a
1884         aeskeygenassist $0x20, %xmm2, %xmm1     # round 6
1885         call _key_expansion_192b
1886         aeskeygenassist $0x40, %xmm2, %xmm1     # round 7
1887         call _key_expansion_192a
1888         aeskeygenassist $0x80, %xmm2, %xmm1     # round 8
1889         call _key_expansion_192b
1890         jmp .Ldec_key
1891 .Lenc_key128:
             /* AES-128: ten calls, each appending one 16-byte round key */
1892         aeskeygenassist $0x1, %xmm0, %xmm1      # round 1
1893         call _key_expansion_128
1894         aeskeygenassist $0x2, %xmm0, %xmm1      # round 2
1895         call _key_expansion_128
1896         aeskeygenassist $0x4, %xmm0, %xmm1      # round 3
1897         call _key_expansion_128
1898         aeskeygenassist $0x8, %xmm0, %xmm1      # round 4
1899         call _key_expansion_128
1900         aeskeygenassist $0x10, %xmm0, %xmm1     # round 5
1901         call _key_expansion_128
1902         aeskeygenassist $0x20, %xmm0, %xmm1     # round 6
1903         call _key_expansion_128
1904         aeskeygenassist $0x40, %xmm0, %xmm1     # round 7
1905         call _key_expansion_128
1906         aeskeygenassist $0x80, %xmm0, %xmm1     # round 8
1907         call _key_expansion_128
1908         aeskeygenassist $0x1b, %xmm0, %xmm1     # round 9
1909         call _key_expansion_128
1910         aeskeygenassist $0x36, %xmm0, %xmm1     # round 10
1911         call _key_expansion_128
1912 .Ldec_key:
             /*
              * Build the decryption schedule at ctx+240: the encryption round
              * keys in reverse order, with aesimc applied to every key except
              * the first and the last.
              */
1913         sub $0x10, TKEYP                # TKEYP now points at the last encryption round key
1914         movaps (KEYP), %xmm0            # first encryption round key
1915         movaps (TKEYP), %xmm1           # last encryption round key
1916         movaps %xmm0, 240(TKEYP)        # first enc key is copied as the last dec key
1917         movaps %xmm1, 240(KEYP)         # last enc key is copied as the first dec key
1918         add $0x10, KEYP
1919         lea 240-16(TKEYP), UKEYP        # UKEYP walks the dec keys downwards (in_key is no longer needed)
1920 .align 4
1921 .Ldec_key_loop:
1922         movaps (KEYP), %xmm0
1923         aesimc %xmm0, %xmm1             # inverse MixColumns of an inner round key
1924         movaps %xmm1, (UKEYP)
1925         add $0x10, KEYP
1926         sub $0x10, UKEYP
1927         cmp TKEYP, KEYP
1928         jb .Ldec_key_loop               # stop once KEYP reaches the last encryption round key
1929         xor AREG, AREG                  # result 0
1930 #ifndef __x86_64__
1931         popl KEYP
1932 #endif
1933         FRAME_END
1934         RET
1935 SYM_FUNC_END(aesni_set_key)
1936
1937 /*
1938  * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
      *
      * Encrypts the single 16-byte block at src into dst.  The key length is
      * read from offset 480 of ctx and the round keys from offset 0 of ctx
      * (the layout written by aesni_set_key).  src and dst are accessed with
      * unaligned moves.
1939  */
1940 SYM_FUNC_START(aesni_enc)
1941         FRAME_BEGIN
1942 #ifndef __x86_64__
1943         pushl KEYP                      # 32-bit only: save the registers reused below
1944         pushl KLEN
1945         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1946         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1947         movl (FRAME_OFFSET+20)(%esp), INP       # src
1948 #endif
1949         movl 480(KEYP), KLEN            # key length
1950         movups (INP), STATE             # input
1951         call _aesni_enc1                # STATE = encrypted block
1952         movups STATE, (OUTP)            # output
1953 #ifndef __x86_64__
1954         popl KLEN
1955         popl KEYP
1956 #endif
1957         FRAME_END
1958         RET
1959 SYM_FUNC_END(aesni_enc)
1960
1961 /*
1962  * _aesni_enc1:         internal ABI
      *
      * Encrypts one block held in STATE.  Round 0 is a plain XOR with the key
      * at (KEYP); it is followed by 10, 12 or 14 aesenc/aesenclast rounds for
      * KLEN below 24, equal to 24 or above 24 respectively.
      *
1963  * input:
1964  *      KEYP:           key struct pointer
1965  *      KLEN:           key length (the value stored at offset 480 of the key struct)
1966  *      STATE:          initial state (input)
1967  * output:
1968  *      STATE:          final state (output)
1969  * changed:
1970  *      KEY
1971  *      TKEYP (T1)
1972  */
1973 SYM_FUNC_START_LOCAL(_aesni_enc1)
1974         movaps (KEYP), KEY              # key
1975         mov KEYP, TKEYP
1976         pxor KEY, STATE         # round 0
1977         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
1978         cmp $24, KLEN
1979         jb .Lenc128                     # short key: 10 rounds
1980         lea 0x20(TKEYP), TKEYP          # lea keeps the flags from cmp alive for the je below
1981         je .Lenc192                     # 12 rounds
1982         add $0x20, TKEYP                # longest key: 14 rounds, falls through all sections
1983         movaps -0x60(TKEYP), KEY
1984         aesenc KEY, STATE
1985         movaps -0x50(TKEYP), KEY
1986         aesenc KEY, STATE
1987 .align 4
1988 .Lenc192:
1989         movaps -0x40(TKEYP), KEY
1990         aesenc KEY, STATE
1991         movaps -0x30(TKEYP), KEY
1992         aesenc KEY, STATE
1993 .align 4
1994 .Lenc128:
1995         movaps -0x20(TKEYP), KEY
1996         aesenc KEY, STATE
1997         movaps -0x10(TKEYP), KEY
1998         aesenc KEY, STATE
1999         movaps (TKEYP), KEY
2000         aesenc KEY, STATE
2001         movaps 0x10(TKEYP), KEY
2002         aesenc KEY, STATE
2003         movaps 0x20(TKEYP), KEY
2004         aesenc KEY, STATE
2005         movaps 0x30(TKEYP), KEY
2006         aesenc KEY, STATE
2007         movaps 0x40(TKEYP), KEY
2008         aesenc KEY, STATE
2009         movaps 0x50(TKEYP), KEY
2010         aesenc KEY, STATE
2011         movaps 0x60(TKEYP), KEY
2012         aesenc KEY, STATE
2013         movaps 0x70(TKEYP), KEY
2014         aesenclast KEY, STATE           # last round
2015         RET
2016 SYM_FUNC_END(_aesni_enc1)
2017
2018 /*
2019  * _aesni_enc4: internal ABI
      *
      * Four-block variant of _aesni_enc1: every round key is loaded once and
      * applied to STATE1-STATE4 in turn.  Round 0 is a plain XOR with the key
      * at (KEYP); it is followed by 10, 12 or 14 aesenc/aesenclast rounds for
      * KLEN below 24, equal to 24 or above 24 respectively.
      *
2020  * input:
2021  *      KEYP:           key struct pointer
2022  *      KLEN:           key length (the value stored at offset 480 of the key struct)
2023  *      STATE1:         initial state (input)
2024  *      STATE2
2025  *      STATE3
2026  *      STATE4
2027  * output:
2028  *      STATE1:         final state (output)
2029  *      STATE2
2030  *      STATE3
2031  *      STATE4
2032  * changed:
2033  *      KEY
2034  *      TKEYP (T1)
2035  */
2036 SYM_FUNC_START_LOCAL(_aesni_enc4)
2037         movaps (KEYP), KEY              # key
2038         mov KEYP, TKEYP
2039         pxor KEY, STATE1                # round 0
2040         pxor KEY, STATE2
2041         pxor KEY, STATE3
2042         pxor KEY, STATE4
2043         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
2044         cmp $24, KLEN
2045         jb .L4enc128                    # short key: 10 rounds
2046         lea 0x20(TKEYP), TKEYP          # lea keeps the flags from cmp alive for the je below
2047         je .L4enc192                    # 12 rounds
2048         add $0x20, TKEYP                # longest key: 14 rounds, falls through all sections
2049         movaps -0x60(TKEYP), KEY
2050         aesenc KEY, STATE1
2051         aesenc KEY, STATE2
2052         aesenc KEY, STATE3
2053         aesenc KEY, STATE4
2054         movaps -0x50(TKEYP), KEY
2055         aesenc KEY, STATE1
2056         aesenc KEY, STATE2
2057         aesenc KEY, STATE3
2058         aesenc KEY, STATE4
             /* NOTE(review): the alignment directive below is disabled, unlike in _aesni_dec4 */
2059 #.align 4
2060 .L4enc192:
2061         movaps -0x40(TKEYP), KEY
2062         aesenc KEY, STATE1
2063         aesenc KEY, STATE2
2064         aesenc KEY, STATE3
2065         aesenc KEY, STATE4
2066         movaps -0x30(TKEYP), KEY
2067         aesenc KEY, STATE1
2068         aesenc KEY, STATE2
2069         aesenc KEY, STATE3
2070         aesenc KEY, STATE4
             /* NOTE(review): the alignment directive below is disabled, unlike in _aesni_dec4 */
2071 #.align 4
2072 .L4enc128:
2073         movaps -0x20(TKEYP), KEY
2074         aesenc KEY, STATE1
2075         aesenc KEY, STATE2
2076         aesenc KEY, STATE3
2077         aesenc KEY, STATE4
2078         movaps -0x10(TKEYP), KEY
2079         aesenc KEY, STATE1
2080         aesenc KEY, STATE2
2081         aesenc KEY, STATE3
2082         aesenc KEY, STATE4
2083         movaps (TKEYP), KEY
2084         aesenc KEY, STATE1
2085         aesenc KEY, STATE2
2086         aesenc KEY, STATE3
2087         aesenc KEY, STATE4
2088         movaps 0x10(TKEYP), KEY
2089         aesenc KEY, STATE1
2090         aesenc KEY, STATE2
2091         aesenc KEY, STATE3
2092         aesenc KEY, STATE4
2093         movaps 0x20(TKEYP), KEY
2094         aesenc KEY, STATE1
2095         aesenc KEY, STATE2
2096         aesenc KEY, STATE3
2097         aesenc KEY, STATE4
2098         movaps 0x30(TKEYP), KEY
2099         aesenc KEY, STATE1
2100         aesenc KEY, STATE2
2101         aesenc KEY, STATE3
2102         aesenc KEY, STATE4
2103         movaps 0x40(TKEYP), KEY
2104         aesenc KEY, STATE1
2105         aesenc KEY, STATE2
2106         aesenc KEY, STATE3
2107         aesenc KEY, STATE4
2108         movaps 0x50(TKEYP), KEY
2109         aesenc KEY, STATE1
2110         aesenc KEY, STATE2
2111         aesenc KEY, STATE3
2112         aesenc KEY, STATE4
2113         movaps 0x60(TKEYP), KEY
2114         aesenc KEY, STATE1
2115         aesenc KEY, STATE2
2116         aesenc KEY, STATE3
2117         aesenc KEY, STATE4
2118         movaps 0x70(TKEYP), KEY
2119         aesenclast KEY, STATE1          # last round
2120         aesenclast KEY, STATE2
2121         aesenclast KEY, STATE3
2122         aesenclast KEY, STATE4
2123         RET
2124 SYM_FUNC_END(_aesni_enc4)
2125
2126 /*
2127  * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
      *
      * Decrypts the single 16-byte block at src into dst.  The key length is
      * read from offset 480 of ctx and the decryption round keys from offset
      * 240 of ctx (the layout written by aesni_set_key).  src and dst are
      * accessed with unaligned moves.
2128  */
2129 SYM_FUNC_START(aesni_dec)
2130         FRAME_BEGIN
2131 #ifndef __x86_64__
2132         pushl KEYP                      # 32-bit only: save the registers reused below
2133         pushl KLEN
2134         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
2135         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
2136         movl (FRAME_OFFSET+20)(%esp), INP       # src
2137 #endif
2138         mov 480(KEYP), KLEN             # key length
2139         add $240, KEYP                  # switch to the decryption round keys at ctx+240
2140         movups (INP), STATE             # input
2141         call _aesni_dec1                # STATE = decrypted block
2142         movups STATE, (OUTP)            # output
2143 #ifndef __x86_64__
2144         popl KLEN
2145         popl KEYP
2146 #endif
2147         FRAME_END
2148         RET
2149 SYM_FUNC_END(aesni_dec)
2150
2151 /*
2152  * _aesni_dec1:         internal ABI
      *
      * Decrypts one block held in STATE.  Round 0 is a plain XOR with the key
      * at (KEYP); it is followed by 10, 12 or 14 aesdec/aesdeclast rounds for
      * KLEN below 24, equal to 24 or above 24 respectively.
      *
2153  * input:
2154  *      KEYP:           key struct pointer (the callers in this file pass the
      *                      address of the decryption keys, i.e. ctx + 240)
2155  *      KLEN:           key length
2156  *      STATE:          initial state (input)
2157  * output:
2158  *      STATE:          final state (output)
2159  * changed:
2160  *      KEY
2161  *      TKEYP (T1)
2162  */
2163 SYM_FUNC_START_LOCAL(_aesni_dec1)
2164         movaps (KEYP), KEY              # key
2165         mov KEYP, TKEYP
2166         pxor KEY, STATE         # round 0
2167         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
2168         cmp $24, KLEN
2169         jb .Ldec128                     # short key: 10 rounds
2170         lea 0x20(TKEYP), TKEYP          # lea keeps the flags from cmp alive for the je below
2171         je .Ldec192                     # 12 rounds
2172         add $0x20, TKEYP                # longest key: 14 rounds, falls through all sections
2173         movaps -0x60(TKEYP), KEY
2174         aesdec KEY, STATE
2175         movaps -0x50(TKEYP), KEY
2176         aesdec KEY, STATE
2177 .align 4
2178 .Ldec192:
2179         movaps -0x40(TKEYP), KEY
2180         aesdec KEY, STATE
2181         movaps -0x30(TKEYP), KEY
2182         aesdec KEY, STATE
2183 .align 4
2184 .Ldec128:
2185         movaps -0x20(TKEYP), KEY
2186         aesdec KEY, STATE
2187         movaps -0x10(TKEYP), KEY
2188         aesdec KEY, STATE
2189         movaps (TKEYP), KEY
2190         aesdec KEY, STATE
2191         movaps 0x10(TKEYP), KEY
2192         aesdec KEY, STATE
2193         movaps 0x20(TKEYP), KEY
2194         aesdec KEY, STATE
2195         movaps 0x30(TKEYP), KEY
2196         aesdec KEY, STATE
2197         movaps 0x40(TKEYP), KEY
2198         aesdec KEY, STATE
2199         movaps 0x50(TKEYP), KEY
2200         aesdec KEY, STATE
2201         movaps 0x60(TKEYP), KEY
2202         aesdec KEY, STATE
2203         movaps 0x70(TKEYP), KEY
2204         aesdeclast KEY, STATE           # last round
2205         RET
2206 SYM_FUNC_END(_aesni_dec1)
2207
2208 /*
2209  * _aesni_dec4: internal ABI
      *
      * Four-block variant of _aesni_dec1: every round key is loaded once and
      * applied to STATE1-STATE4 in turn.  Round 0 is a plain XOR with the key
      * at (KEYP); it is followed by 10, 12 or 14 aesdec/aesdeclast rounds for
      * KLEN below 24, equal to 24 or above 24 respectively.
      *
2210  * input:
2211  *      KEYP:           key struct pointer (the callers in this file pass the
      *                      address of the decryption keys, i.e. ctx + 240)
2212  *      KLEN:           key length
2213  *      STATE1:         initial state (input)
2214  *      STATE2
2215  *      STATE3
2216  *      STATE4
2217  * output:
2218  *      STATE1:         final state (output)
2219  *      STATE2
2220  *      STATE3
2221  *      STATE4
2222  * changed:
2223  *      KEY
2224  *      TKEYP (T1)
2225  */
2226 SYM_FUNC_START_LOCAL(_aesni_dec4)
2227         movaps (KEYP), KEY              # key
2228         mov KEYP, TKEYP
2229         pxor KEY, STATE1                # round 0
2230         pxor KEY, STATE2
2231         pxor KEY, STATE3
2232         pxor KEY, STATE4
2233         add $0x30, TKEYP                # bias TKEYP so the shared tail can use fixed offsets
2234         cmp $24, KLEN
2235         jb .L4dec128                    # short key: 10 rounds
2236         lea 0x20(TKEYP), TKEYP          # lea keeps the flags from cmp alive for the je below
2237         je .L4dec192                    # 12 rounds
2238         add $0x20, TKEYP                # longest key: 14 rounds, falls through all sections
2239         movaps -0x60(TKEYP), KEY
2240         aesdec KEY, STATE1
2241         aesdec KEY, STATE2
2242         aesdec KEY, STATE3
2243         aesdec KEY, STATE4
2244         movaps -0x50(TKEYP), KEY
2245         aesdec KEY, STATE1
2246         aesdec KEY, STATE2
2247         aesdec KEY, STATE3
2248         aesdec KEY, STATE4
2249 .align 4
2250 .L4dec192:
2251         movaps -0x40(TKEYP), KEY
2252         aesdec KEY, STATE1
2253         aesdec KEY, STATE2
2254         aesdec KEY, STATE3
2255         aesdec KEY, STATE4
2256         movaps -0x30(TKEYP), KEY
2257         aesdec KEY, STATE1
2258         aesdec KEY, STATE2
2259         aesdec KEY, STATE3
2260         aesdec KEY, STATE4
2261 .align 4
2262 .L4dec128:
2263         movaps -0x20(TKEYP), KEY
2264         aesdec KEY, STATE1
2265         aesdec KEY, STATE2
2266         aesdec KEY, STATE3
2267         aesdec KEY, STATE4
2268         movaps -0x10(TKEYP), KEY
2269         aesdec KEY, STATE1
2270         aesdec KEY, STATE2
2271         aesdec KEY, STATE3
2272         aesdec KEY, STATE4
2273         movaps (TKEYP), KEY
2274         aesdec KEY, STATE1
2275         aesdec KEY, STATE2
2276         aesdec KEY, STATE3
2277         aesdec KEY, STATE4
2278         movaps 0x10(TKEYP), KEY
2279         aesdec KEY, STATE1
2280         aesdec KEY, STATE2
2281         aesdec KEY, STATE3
2282         aesdec KEY, STATE4
2283         movaps 0x20(TKEYP), KEY
2284         aesdec KEY, STATE1
2285         aesdec KEY, STATE2
2286         aesdec KEY, STATE3
2287         aesdec KEY, STATE4
2288         movaps 0x30(TKEYP), KEY
2289         aesdec KEY, STATE1
2290         aesdec KEY, STATE2
2291         aesdec KEY, STATE3
2292         aesdec KEY, STATE4
2293         movaps 0x40(TKEYP), KEY
2294         aesdec KEY, STATE1
2295         aesdec KEY, STATE2
2296         aesdec KEY, STATE3
2297         aesdec KEY, STATE4
2298         movaps 0x50(TKEYP), KEY
2299         aesdec KEY, STATE1
2300         aesdec KEY, STATE2
2301         aesdec KEY, STATE3
2302         aesdec KEY, STATE4
2303         movaps 0x60(TKEYP), KEY
2304         aesdec KEY, STATE1
2305         aesdec KEY, STATE2
2306         aesdec KEY, STATE3
2307         aesdec KEY, STATE4
2308         movaps 0x70(TKEYP), KEY
2309         aesdeclast KEY, STATE1          # last round
2310         aesdeclast KEY, STATE2
2311         aesdeclast KEY, STATE3
2312         aesdeclast KEY, STATE4
2313         RET
2314 SYM_FUNC_END(_aesni_dec4)
2315
2316 /*
2317  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2318  *                    size_t len)
      *
      * ECB-encrypts the buffer at src into dst.  Data is handled 64 bytes (four
      * blocks through _aesni_enc4) at a time while at least 64 bytes remain,
      * then 16 bytes (one block through _aesni_enc1) at a time.  A trailing
      * partial block (len % 16 bytes) is left untouched, and nothing is done
      * when len is below 16.
      *
      * len is treated as unsigned throughout: the loop-back branches use jae,
      * matching the unsigned jb checks on entry.
2319  */
2320 SYM_FUNC_START(aesni_ecb_enc)
2321         FRAME_BEGIN
2322 #ifndef __x86_64__
2323         pushl LEN                       # 32-bit only: save the registers reused below
2324         pushl KEYP
2325         pushl KLEN
2326         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2327         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2328         movl (FRAME_OFFSET+24)(%esp), INP       # src
2329         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2330 #endif
2331         test LEN, LEN           # check length
2332         jz .Lecb_enc_ret
2333         mov 480(KEYP), KLEN             # key length
2334         cmp $16, LEN
2335         jb .Lecb_enc_ret                # less than one block: nothing to do
2336         cmp $64, LEN
2337         jb .Lecb_enc_loop1              # fewer than four blocks: single-block loop only
2338 .align 4
2339 .Lecb_enc_loop4:
2340         movups (INP), STATE1
2341         movups 0x10(INP), STATE2
2342         movups 0x20(INP), STATE3
2343         movups 0x30(INP), STATE4
2344         call _aesni_enc4
2345         movups STATE1, (OUTP)
2346         movups STATE2, 0x10(OUTP)
2347         movups STATE3, 0x20(OUTP)
2348         movups STATE4, 0x30(OUTP)
2349         sub $64, LEN
2350         add $64, INP
2351         add $64, OUTP
2352         cmp $64, LEN
2353         jae .Lecb_enc_loop4             # unsigned: at least four more blocks remain
2354         cmp $16, LEN
2355         jb .Lecb_enc_ret
2356 .align 4
2357 .Lecb_enc_loop1:
2358         movups (INP), STATE1
2359         call _aesni_enc1                # works on STATE; presumably the same register as STATE1
2360         movups STATE1, (OUTP)
2361         sub $16, LEN
2362         add $16, INP
2363         add $16, OUTP
2364         cmp $16, LEN
2365         jae .Lecb_enc_loop1             # unsigned: at least one more block remains
2366 .Lecb_enc_ret:
2367 #ifndef __x86_64__
2368         popl KLEN
2369         popl KEYP
2370         popl LEN
2371 #endif
2372         FRAME_END
2373         RET
2374 SYM_FUNC_END(aesni_ecb_enc)
2375
2376 /*
2377  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2378  *                    size_t len);
      *
      * ECB-decrypts the buffer at src into dst using the decryption round keys
      * at offset 240 of ctx.  Data is handled 64 bytes (four blocks through
      * _aesni_dec4) at a time while at least 64 bytes remain, then 16 bytes
      * (one block through _aesni_dec1) at a time.  A trailing partial block
      * (len % 16 bytes) is left untouched, and nothing is done when len is
      * below 16.
      *
      * len is treated as unsigned throughout: the loop-back branches use jae,
      * matching the unsigned jb checks on entry.
2379  */
2380 SYM_FUNC_START(aesni_ecb_dec)
2381         FRAME_BEGIN
2382 #ifndef __x86_64__
2383         pushl LEN                       # 32-bit only: save the registers reused below
2384         pushl KEYP
2385         pushl KLEN
2386         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2387         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2388         movl (FRAME_OFFSET+24)(%esp), INP       # src
2389         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2390 #endif
2391         test LEN, LEN           # check length
2392         jz .Lecb_dec_ret
2393         mov 480(KEYP), KLEN             # key length
2394         add $240, KEYP                  # switch to the decryption round keys at ctx+240
2395         cmp $16, LEN
2396         jb .Lecb_dec_ret                # less than one block: nothing to do
2397         cmp $64, LEN
2398         jb .Lecb_dec_loop1              # fewer than four blocks: single-block loop only
2399 .align 4
2400 .Lecb_dec_loop4:
2401         movups (INP), STATE1
2402         movups 0x10(INP), STATE2
2403         movups 0x20(INP), STATE3
2404         movups 0x30(INP), STATE4
2405         call _aesni_dec4
2406         movups STATE1, (OUTP)
2407         movups STATE2, 0x10(OUTP)
2408         movups STATE3, 0x20(OUTP)
2409         movups STATE4, 0x30(OUTP)
2410         sub $64, LEN
2411         add $64, INP
2412         add $64, OUTP
2413         cmp $64, LEN
2414         jae .Lecb_dec_loop4             # unsigned: at least four more blocks remain
2415         cmp $16, LEN
2416         jb .Lecb_dec_ret
2417 .align 4
2418 .Lecb_dec_loop1:
2419         movups (INP), STATE1
2420         call _aesni_dec1                # works on STATE; presumably the same register as STATE1
2421         movups STATE1, (OUTP)
2422         sub $16, LEN
2423         add $16, INP
2424         add $16, OUTP
2425         cmp $16, LEN
2426         jae .Lecb_dec_loop1             # unsigned: at least one more block remains
2427 .Lecb_dec_ret:
2428 #ifndef __x86_64__
2429         popl KLEN
2430         popl KEYP
2431         popl LEN
2432 #endif
2433         FRAME_END
2434         RET
2435 SYM_FUNC_END(aesni_ecb_dec)
2436
2437 /*
2438  * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2439  *                    size_t len, u8 *iv)
2440  */
2441 SYM_FUNC_START(aesni_cbc_enc)
             /*
              * CBC-encrypt every whole 16-byte block of src (INP) into dst (OUTP).
              * STATE carries the chaining value: it starts as *iv and, after
              * each block, holds the ciphertext just produced.  The final
              * chaining value is stored back to *iv.  When len < 16 nothing is
              * processed and *iv is left untouched.
              *
              * len is a size_t, so the length comparisons are unsigned.
              */
2442         FRAME_BEGIN
2443 #ifndef __x86_64__
             /* 32-bit: save the registers reused as IVP/LEN/KEYP/KLEN */
2444         pushl IVP
2445         pushl LEN
2446         pushl KEYP
2447         pushl KLEN
             /* four pushes plus the return address: first argument is at +20 */
2448         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2449         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2450         movl (FRAME_OFFSET+28)(%esp), INP       # src
2451         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2452         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2453 #endif
2454         cmp $16, LEN
2455         jb .Lcbc_enc_ret                /* no whole block to encrypt */
2456         mov 480(KEYP), KLEN             /* presumably the key length field of ctx -- TODO confirm offset */
2457         movups (IVP), STATE     # load iv as initial state
2458 .align 4
2459 .Lcbc_enc_loop:
2460         movups (INP), IN        # load input
2461         pxor IN, STATE                  /* plaintext XOR previous ciphertext (or iv) */
2462         call _aesni_enc1
2463         movups STATE, (OUTP)    # store output
2464         sub $16, LEN
2465         add $16, INP
2466         add $16, OUTP
2467         cmp $16, LEN
2468         jae .Lcbc_enc_loop              /* unsigned: at least one more block */
2469         movups STATE, (IVP)             /* last ciphertext block becomes the next iv */
2470 .Lcbc_enc_ret:
2471 #ifndef __x86_64__
2472         popl KLEN
2473         popl KEYP
2474         popl LEN
2475         popl IVP
2476 #endif
2477         FRAME_END
2478         RET
2479 SYM_FUNC_END(aesni_cbc_enc)
2480
2481 /*
2482  * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2483  *                    size_t len, u8 *iv)
2484  */
2485 SYM_FUNC_START(aesni_cbc_dec)
             /*
              * CBC-decrypt every whole 16-byte block of src (INP) into dst (OUTP).
              * IV holds the previous ciphertext block (initially *iv); each
              * decrypted block is XORed with it.  The last ciphertext block
              * consumed is stored back to *iv.  When len < 16 nothing is
              * processed and *iv is left untouched.
              *
              * Ciphertext needed for chaining is always captured (or re-read)
              * before the corresponding output is stored, so src == dst works.
              *
              * len is a size_t, so the length comparisons are unsigned.
              */
2486         FRAME_BEGIN
2487 #ifndef __x86_64__
             /* 32-bit: save the registers reused as IVP/LEN/KEYP/KLEN */
2488         pushl IVP
2489         pushl LEN
2490         pushl KEYP
2491         pushl KLEN
             /* four pushes plus the return address: first argument is at +20 */
2492         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2493         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2494         movl (FRAME_OFFSET+28)(%esp), INP       # src
2495         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2496         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2497 #endif
2498         cmp $16, LEN
2499         jb .Lcbc_dec_just_ret           /* no whole block: skip the iv write-back too */
2500         mov 480(KEYP), KLEN             /* presumably the key length field of ctx -- TODO confirm offset */
2501         add $240, KEYP                  /* use the key schedule at ctx+240 (presumably the decryption keys) */
2502         movups (IVP), IV
2503         cmp $64, LEN
2504         jb .Lcbc_dec_loop1
2505 .align 4
2506 .Lcbc_dec_loop4:
2507         movups (INP), IN1
2508         movaps IN1, STATE1
2509         movups 0x10(INP), IN2
2510         movaps IN2, STATE2
2511 #ifdef __x86_64__
             /* 64-bit: keep all four ciphertext blocks in IN1..IN4 */
2512         movups 0x20(INP), IN3
2513         movaps IN3, STATE3
2514         movups 0x30(INP), IN4
2515         movaps IN4, STATE4
2516 #else
             /* 32-bit: only IN1/IN2 are used; they now hold blocks 3 and 4 */
2517         movups 0x20(INP), IN1
2518         movaps IN1, STATE3
2519         movups 0x30(INP), IN2
2520         movaps IN2, STATE4
2521 #endif
2522         call _aesni_dec4
2523         pxor IV, STATE1                 /* block 1 chains from the incoming IV */
2524 #ifdef __x86_64__
2525         pxor IN1, STATE2
2526         pxor IN2, STATE3
2527         pxor IN3, STATE4
2528         movaps IN4, IV                  /* ciphertext block 4 is the next IV */
2529 #else
2530         pxor IN1, STATE4                /* IN1 == ciphertext block 3 */
2531         movaps IN2, IV                  /* IN2 == ciphertext block 4, the next IV */
             /* re-read ciphertext blocks 1 and 2 before dst is written */
2532         movups (INP), IN1
2533         pxor IN1, STATE2
2534         movups 0x10(INP), IN2
2535         pxor IN2, STATE3
2536 #endif
2537         movups STATE1, (OUTP)
2538         movups STATE2, 0x10(OUTP)
2539         movups STATE3, 0x20(OUTP)
2540         movups STATE4, 0x30(OUTP)
2541         sub $64, LEN
2542         add $64, INP
2543         add $64, OUTP
2544         cmp $64, LEN
2545         jae .Lcbc_dec_loop4             /* unsigned: at least four more blocks */
2546         cmp $16, LEN
2547         jb .Lcbc_dec_ret
2548 .align 4
2549 .Lcbc_dec_loop1:
2550         movups (INP), IN
2551         movaps IN, STATE
2552         call _aesni_dec1
2553         pxor IV, STATE
2554         movups STATE, (OUTP)
2555         movaps IN, IV                   /* this ciphertext block chains into the next one */
2556         sub $16, LEN
2557         add $16, INP
2558         add $16, OUTP
2559         cmp $16, LEN
2560         jae .Lcbc_dec_loop1             /* unsigned: at least one more block */
2561 .Lcbc_dec_ret:
2562         movups IV, (IVP)                /* hand the chaining value back to the caller */
2563 .Lcbc_dec_just_ret:
2564 #ifndef __x86_64__
2565         popl KLEN
2566         popl KEYP
2567         popl LEN
2568         popl IVP
2569 #endif
2570         FRAME_END
2571         RET
2572 SYM_FUNC_END(aesni_cbc_dec)
2573
2574 /*
2575  * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2576  *                        size_t len, u8 *iv)
2577  */
2578 SYM_FUNC_START(aesni_cts_cbc_enc)
             /*
              * Encrypt the final two blocks of a CBC message with ciphertext
              * stealing.  With t = len - 16, the code reads src[0..16) and
              * src[t..t+16) and writes dst[0..16) and dst[t..t+16).
              *
              * NOTE(review): there is no length check here; the table offsets
              * below only make sense for 16 < len <= 32 -- presumably the
              * caller guarantees that, confirm against the C glue code.
              * *iv is read but never written back.
              */
2579         FRAME_BEGIN
2580 #ifndef __x86_64__
2581         pushl IVP
2582         pushl LEN
2583         pushl KEYP
2584         pushl KLEN
             /* four pushes plus the return address: first argument is at +20 */
2585         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2586         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2587         movl (FRAME_OFFSET+28)(%esp), INP       # src
2588         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2589         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2590         lea .Lcts_permute_table, T1
2591 #else
2592         lea .Lcts_permute_table(%rip), T1
2593 #endif
2594         mov 480(KEYP), KLEN             /* presumably the key length field of ctx -- TODO confirm offset */
2595         movups (IVP), STATE             /* last use of IVP as the iv pointer */
2596         sub $16, LEN                    /* LEN = t, size of the final partial block */
2597         mov T1, IVP                     /* IVP is reused as a table pointer from here on */
2598         add $32, IVP
2599         add LEN, T1                     /* T1  = table + t */
2600         sub LEN, IVP                    /* IVP = table + 32 - t */
2601         movups (T1), %xmm4              /* mask: move low t bytes to the top, zero the rest */
2602         movups (IVP), %xmm5             /* mask: move top t bytes to the bottom, zero the rest */
2603
2604         movups (INP), IN1               /* first (full) plaintext block */
2605         add LEN, INP
2606         movups (INP), IN2               /* last 16 bytes of src; its top t bytes are the tail */
2607
2608         pxor IN1, STATE
2609         call _aesni_enc1                /* STATE = encryption of (iv XOR first block) */
2610
2611         pshufb %xmm5, IN2               /* IN2 = tail, zero-padded to 16 bytes */
2612         pxor STATE, IN2                 /* chain the padded tail with the first ciphertext */
2613         pshufb %xmm4, STATE             /* first t ciphertext bytes moved to the top */
2614         add OUTP, LEN                   /* LEN = dst + t */
2615         movups STATE, (LEN)             /* top t bytes land in dst[16..16+t); dst[t..16) is rewritten below */
2616
2617         movaps IN2, STATE
2618         call _aesni_enc1
2619         movups STATE, (OUTP)            /* full final ciphertext block goes first in dst */
2620
2621 #ifndef __x86_64__
2622         popl KLEN
2623         popl KEYP
2624         popl LEN
2625         popl IVP
2626 #endif
2627         FRAME_END
2628         RET
2629 SYM_FUNC_END(aesni_cts_cbc_enc)
2630
2631 /*
2632  * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2633  *                        size_t len, u8 *iv)
2634  */
2635 SYM_FUNC_START(aesni_cts_cbc_dec)
             /*
              * Decrypt the final two blocks of a CBC message that was encrypted
              * with ciphertext stealing.  With t = len - 16, the code reads
              * src[0..16) and src[t..t+16) and writes dst[0..16) and
              * dst[t..t+16).
              *
              * NOTE(review): there is no length check here; the table offsets
              * below only make sense for 16 < len <= 32 -- presumably the
              * caller guarantees that, confirm against the C glue code.
              * *iv is read but never written back.
              */
2636         FRAME_BEGIN
2637 #ifndef __x86_64__
2638         pushl IVP
2639         pushl LEN
2640         pushl KEYP
2641         pushl KLEN
             /* four pushes plus the return address: first argument is at +20 */
2642         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2643         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2644         movl (FRAME_OFFSET+28)(%esp), INP       # src
2645         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2646         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2647         lea .Lcts_permute_table, T1
2648 #else
2649         lea .Lcts_permute_table(%rip), T1
2650 #endif
2651         mov 480(KEYP), KLEN             /* presumably the key length field of ctx -- TODO confirm offset */
2652         add $240, KEYP                  /* use the key schedule at ctx+240 (presumably the decryption keys) */
2653         movups (IVP), IV                /* last use of IVP as the iv pointer */
2654         sub $16, LEN                    /* LEN = t, size of the final partial block */
2655         mov T1, IVP                     /* IVP is reused as a table pointer from here on */
2656         add $32, IVP
2657         add LEN, T1                     /* T1  = table + t */
2658         sub LEN, IVP                    /* IVP = table + 32 - t */
2659         movups (T1), %xmm4              /* mask: move low t bytes to the top, zero the rest */
2660
2661         movups (INP), STATE             /* first 16 bytes of src (a full ciphertext block) */
2662         add LEN, INP
2663         movups (INP), IN1               /* last 16 bytes of src; its top t bytes are the tail */
2664
2665         call _aesni_dec1
2666         movaps STATE, IN2               /* keep the raw decryption for the blend below */
2667         pshufb %xmm4, STATE
2668         pxor IN1, STATE                 /* top t bytes = final partial plaintext */
2669
2670         add OUTP, LEN                   /* LEN = dst + t */
2671         movups STATE, (LEN)             /* top t bytes land in dst[16..16+t); dst[t..16) is rewritten below */
2672
2673         movups (IVP), %xmm0             /* xmm0 is the implicit selector of pblendvb */
2674         pshufb %xmm0, IN1               /* tail ciphertext moved to the bottom, rest zeroed */
2675         pblendvb IN2, IN1               /* bytes t..15 (selector 0x80) come from IN2 */
2676         movaps IN1, STATE
2677         call _aesni_dec1
2678
2679         pxor IV, STATE                  /* the rebuilt block chains from the incoming iv */
2680         movups STATE, (OUTP)
2681
2682 #ifndef __x86_64__
2683         popl KLEN
2684         popl KEYP
2685         popl LEN
2686         popl IVP
2687 #endif
2688         FRAME_END
2689         RET
2690 SYM_FUNC_END(aesni_cts_cbc_dec)
2691
2692 .pushsection .rodata
2693 .align 16
             /*
              * pshufb control table for ciphertext stealing: 16 bytes of 0x80,
              * the identity permutation 0x00..0x0f, then 16 more bytes of 0x80.
              * A control byte with its top bit set makes pshufb write zero, so
              * a 16-byte window read at offset n (0..32) yields a mask that
              * shifts a block by |16 - n| bytes and zero-fills the remainder.
              */
2694 .Lcts_permute_table:
2695         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2696         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2697         .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
2698         .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
2699         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2700         .byte           0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
2701 #ifdef __x86_64__
             /*
              * pshufb control that reverses all 16 bytes.  It follows the 48-byte
              * table above, so it keeps the 16-byte alignment that the movaps
              * load in _aesni_inc_init needs.
              */
2702 .Lbswap_mask:
2703         .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2704 #endif
2705 .popsection
2706
2707 #ifdef __x86_64__
2708 /*
2709  * _aesni_inc_init:     internal ABI
2710  *      setup registers used by _aesni_inc
2711  * input:
2712  *      IV
2713  * output:
2714  *      CTR:    == IV, in little endian
2715  *      TCTR_LOW: == lower qword of CTR
2716  *      INC:    == 1, in little endian
2717  *      BSWAP_MASK == endian swapping mask
2718  */
2719 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2720         movaps .Lbswap_mask(%rip), BSWAP_MASK   /* aligned load of the byte-reversal mask */
2721         movaps IV, CTR
2722         pshufb BSWAP_MASK, CTR          /* CTR = IV with all 16 bytes reversed */
2723         mov $1, TCTR_LOW
2724         movq TCTR_LOW, INC              /* INC = 1 in the low qword, high qword zero */
2725         movq CTR, TCTR_LOW              /* scalar copy of the low qword of CTR */
2726         RET
2727 SYM_FUNC_END(_aesni_inc_init)
2728
2729 /*
2730  * _aesni_inc:          internal ABI
2731  *      Increase IV by 1, IV is in big endian
2732  * input:
2733  *      IV
2734  *      CTR:    == IV, in little endian
2735  *      TCTR_LOW: == lower qword of CTR
2736  *      INC:    == 1, in little endian
2737  *      BSWAP_MASK == endian swapping mask
2738  * output:
2739  *      IV:     Increase by 1
2740  * changed:
2741  *      CTR:    == output IV, in little endian
2742  *      TCTR_LOW: == lower qword of CTR
2743  */
2744 SYM_FUNC_START_LOCAL(_aesni_inc)
2745         paddq INC, CTR                  /* low qword of CTR += 1 (paddq does not carry across qwords) */
2746         add $1, TCTR_LOW                /* same increment on the scalar copy; CF set if it wrapped */
2747         jnc .Linc_low
             /* low qword wrapped: propagate the carry into the high qword */
2748         pslldq $8, INC                  /* move the 1 into the high qword */
2749         paddq INC, CTR
2750         psrldq $8, INC                  /* restore INC to 1 in the low qword */
2751 .Linc_low:
2752         movaps CTR, IV
2753         pshufb BSWAP_MASK, IV           /* back to the byte order IV is kept in */
2754         RET
2755 SYM_FUNC_END(_aesni_inc)
2756
2757 /*
2758  * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2759  *                    size_t len, u8 *iv)
2760  */
2761 SYM_FUNC_START(aesni_ctr_enc)
             /*
              * CTR mode, built for x86_64 only: XOR every whole 16-byte block of
              * src (INP) with the encryption of the current counter block and
              * store the result to dst (OUTP).  IV holds the counter block and
              * is stepped by _aesni_inc after each use.  The counter that
              * follows the last processed block is stored back to *iv.  When
              * len < 16 nothing is processed and *iv is left untouched.
              *
              * len is a size_t, so the length comparisons are unsigned.
              */
2762         FRAME_BEGIN
2763         cmp $16, LEN
2764         jb .Lctr_enc_just_ret           /* no whole block: skip the iv write-back too */
2765         mov 480(KEYP), KLEN             /* presumably the key length field of ctx -- TODO confirm offset */
2766         movups (IVP), IV
2767         call _aesni_inc_init            /* sets up CTR, TCTR_LOW, INC and BSWAP_MASK */
2768         cmp $64, LEN
2769         jb .Lctr_enc_loop1
2770 .align 4
2771 .Lctr_enc_loop4:
             /* take four consecutive counter values, loading input in between */
2772         movaps IV, STATE1
2773         call _aesni_inc
2774         movups (INP), IN1
2775         movaps IV, STATE2
2776         call _aesni_inc
2777         movups 0x10(INP), IN2
2778         movaps IV, STATE3
2779         call _aesni_inc
2780         movups 0x20(INP), IN3
2781         movaps IV, STATE4
2782         call _aesni_inc
2783         movups 0x30(INP), IN4
2784         call _aesni_enc4                /* STATE1..STATE4 become keystream */
2785         pxor IN1, STATE1
2786         movups STATE1, (OUTP)
2787         pxor IN2, STATE2
2788         movups STATE2, 0x10(OUTP)
2789         pxor IN3, STATE3
2790         movups STATE3, 0x20(OUTP)
2791         pxor IN4, STATE4
2792         movups STATE4, 0x30(OUTP)
2793         sub $64, LEN
2794         add $64, INP
2795         add $64, OUTP
2796         cmp $64, LEN
2797         jae .Lctr_enc_loop4             /* unsigned: at least four more blocks */
2798         cmp $16, LEN
2799         jb .Lctr_enc_ret
2800 .align 4
2801 .Lctr_enc_loop1:
2802         movaps IV, STATE
2803         call _aesni_inc
2804         movups (INP), IN
2805         call _aesni_enc1
2806         pxor IN, STATE
2807         movups STATE, (OUTP)
2808         sub $16, LEN
2809         add $16, INP
2810         add $16, OUTP
2811         cmp $16, LEN
2812         jae .Lctr_enc_loop1             /* unsigned: at least one more block */
2813 .Lctr_enc_ret:
2814         movups IV, (IVP)                /* next unused counter value goes back to the caller */
2815 .Lctr_enc_just_ret:
2816         FRAME_END
2817         RET
2818 SYM_FUNC_END(aesni_ctr_enc)
2819
2820 #endif
2821
2822 .section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
2823 .align 16
             /*
              * Mask consumed by _aesni_gf128mul_x_ble(): 0x87 in the low qword is
              * XORed in when bit 127 of the tweak is shifted out, and 0x01 in
              * the high qword carries bit 63 of the low half into the high half.
              */
2824 .Lgf128mul_x_ble_mask:
2825         .octa 0x00000000000000010000000000000087
2826 .previous
2827
2828 /*
2829  * _aesni_gf128mul_x_ble:               internal ABI
2830  *      Multiply in GF(2^128) for XTS IVs
2831  * input:
2832  *      IV:     current IV
2833  *      GF128MUL_MASK == mask with 0x87 and 0x01
2834  * output:
2835  *      IV:     next IV
2836  * changed:
2837  *      KEY:    == temporary value
2838  */
2839 #define _aesni_gf128mul_x_ble() \
2840         pshufd $0x13, IV, KEY;          /* KEY dwords = IV dwords 3, 0, 1, 0 */ \
2841         paddq IV, IV;                   /* shift each 64-bit half of IV left by one */ \
2842         psrad $31, KEY;                 /* each dword becomes all-ones if its sign bit was set */ \
2843         pand GF128MUL_MASK, KEY;        /* keep 0x87 for old bit 127, 0x01 for old bit 63 */ \
2844         pxor KEY, IV;                   /* apply the reduction and the cross-half carry */
2845
2846 /*
2847  * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2848  *                        const u8 *src, unsigned int len, le128 *iv)
2849  */
2850 SYM_FUNC_START(aesni_xts_encrypt)
             /*
              * XTS-encrypt len bytes from src (INP) to dst (OUTP).  IV holds the
              * current tweak and is advanced with _aesni_gf128mul_x_ble() after
              * every block.  Full blocks go four at a time, then one at a time;
              * a trailing partial block is handled by ciphertext stealing.
              *
              * When len is a multiple of 16 the next tweak is stored back to
              * *iv.  On the ciphertext-stealing paths IVP is reused as a table
              * pointer and *iv is not updated.
              *
              * NOTE(review): the stealing paths need a preceding full block, so
              * len >= 16 is presumably guaranteed by the caller -- confirm.
              */
2851         FRAME_BEGIN
2852 #ifndef __x86_64__
2853         pushl IVP
2854         pushl LEN
2855         pushl KEYP
2856         pushl KLEN
             /* four pushes plus the return address: first argument is at +20 */
2857         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
2858         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
2859         movl (FRAME_OFFSET+28)(%esp), INP       # src
2860         movl (FRAME_OFFSET+32)(%esp), LEN       # len
2861         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
2862         movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2863 #else
2864         movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
2865 #endif
2866         movups (IVP), IV
2867
2868         mov 480(KEYP), KLEN             /* presumably the key length field of ctx -- TODO confirm offset */
2869
2870 .Lxts_enc_loop4:
2871         sub $64, LEN
2872         jl .Lxts_enc_1x                 /* fewer than 64 bytes left */
2873
             /*
              * For each of the four blocks: XOR the input with its tweak and
              * park the tweak in the matching dst slot until after encryption.
              */
2874         movdqa IV, STATE1
2875         movdqu 0x00(INP), IN
2876         pxor IN, STATE1
2877         movdqu IV, 0x00(OUTP)
2878
2879         _aesni_gf128mul_x_ble()
2880         movdqa IV, STATE2
2881         movdqu 0x10(INP), IN
2882         pxor IN, STATE2
2883         movdqu IV, 0x10(OUTP)
2884
2885         _aesni_gf128mul_x_ble()
2886         movdqa IV, STATE3
2887         movdqu 0x20(INP), IN
2888         pxor IN, STATE3
2889         movdqu IV, 0x20(OUTP)
2890
2891         _aesni_gf128mul_x_ble()
2892         movdqa IV, STATE4
2893         movdqu 0x30(INP), IN
2894         pxor IN, STATE4
2895         movdqu IV, 0x30(OUTP)
2896
2897         call _aesni_enc4
2898
             /* fetch each parked tweak back, XOR it in, overwrite it with the output */
2899         movdqu 0x00(OUTP), IN
2900         pxor IN, STATE1
2901         movdqu STATE1, 0x00(OUTP)
2902
2903         movdqu 0x10(OUTP), IN
2904         pxor IN, STATE2
2905         movdqu STATE2, 0x10(OUTP)
2906
2907         movdqu 0x20(OUTP), IN
2908         pxor IN, STATE3
2909         movdqu STATE3, 0x20(OUTP)
2910
2911         movdqu 0x30(OUTP), IN
2912         pxor IN, STATE4
2913         movdqu STATE4, 0x30(OUTP)
2914
2915         _aesni_gf128mul_x_ble()         /* tweak for the block after these four */
2916
2917         add $64, INP
2918         add $64, OUTP
2919         test LEN, LEN
2920         jnz .Lxts_enc_loop4
2921
2922 .Lxts_enc_ret_iv:
2923         movups IV, (IVP)                /* hand the next tweak back to the caller */
2924
2925 .Lxts_enc_ret:
2926 #ifndef __x86_64__
2927         popl KLEN
2928         popl KEYP
2929         popl LEN
2930         popl IVP
2931 #endif
2932         FRAME_END
2933         RET
2934
2935 .Lxts_enc_1x:
2936         add $64, LEN                    /* undo the speculative subtraction */
2937         jz .Lxts_enc_ret_iv
2938         sub $16, LEN
2939         jl .Lxts_enc_cts4               /* only a partial block left after a 4-block round */
2940
2941 .Lxts_enc_loop1:
2942         movdqu (INP), STATE
2943         pxor IV, STATE
2944         call _aesni_enc1
2945         pxor IV, STATE
2946         _aesni_gf128mul_x_ble()
2947
2948         test LEN, LEN
2949         jz .Lxts_enc_out                /* that was the last block */
2950
2951         add $16, INP
2952         sub $16, LEN
2953         jl .Lxts_enc_cts1               /* partial tail follows; STATE is not stored yet */
2954
2955         movdqu STATE, (OUTP)
2956         add $16, OUTP
2957         jmp .Lxts_enc_loop1
2958
2959 .Lxts_enc_out:
2960         movdqu STATE, (OUTP)
2961         jmp .Lxts_enc_ret_iv
2962
2963 .Lxts_enc_cts4:
             /* steal from the fourth block of the previous round, already stored at OUTP-16 */
2964         movdqa STATE4, STATE
2965         sub $16, OUTP
2966
2967 .Lxts_enc_cts1:
             /*
              * Here STATE is the ciphertext of the last full block, OUTP is its
              * slot in dst, INP points at the partial tail and LEN is the tail
              * size minus 16 (negative).
              */
2968 #ifndef __x86_64__
2969         lea .Lcts_permute_table, T1
2970 #else
2971         lea .Lcts_permute_table(%rip), T1
2972 #endif
2973         add LEN, INP            /* rewind input pointer */
2974         add $16, LEN            /* # bytes in final block */
2975         movups (INP), IN1               /* last 16 bytes of src; top bytes are the tail */
2976
2977         mov T1, IVP                     /* IVP is reused as a table pointer; *iv is not updated */
2978         add $32, IVP
2979         add LEN, T1                     /* T1  = table + tail size */
2980         sub LEN, IVP                    /* IVP = table + 32 - tail size */
2981         add OUTP, LEN                   /* LEN = address of the partial output block - 16 + 16 */
2982
2983         movups (T1), %xmm4
2984         movaps STATE, IN2
2985         pshufb %xmm4, STATE             /* leading ciphertext bytes moved to the top */
2986         movups STATE, (LEN)             /* emits the truncated block after the slot at OUTP */
2987
2988         movups (IVP), %xmm0             /* xmm0 is the implicit selector of pblendvb */
2989         pshufb %xmm0, IN1               /* tail plaintext moved to the bottom, rest zeroed */
2990         pblendvb IN2, IN1               /* remaining bytes are taken from the stolen ciphertext */
2991         movaps IN1, STATE
2992
2993         pxor IV, STATE                  /* IV was already advanced past the last full block */
2994         call _aesni_enc1
2995         pxor IV, STATE
2996
2997         movups STATE, (OUTP)
2998         jmp .Lxts_enc_ret
2999 SYM_FUNC_END(aesni_xts_encrypt)
3000
3001 /*
3002  * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
3003  *                        const u8 *src, unsigned int len, le128 *iv)
3004  */
3005 SYM_FUNC_START(aesni_xts_decrypt)
             /*
              * XTS-decrypt len bytes from src (INP) to dst (OUTP).  IV holds the
              * current tweak and is advanced with _aesni_gf128mul_x_ble() after
              * every block.  Full blocks go four at a time, then one at a time.
              *
              * When len is not a multiple of 16, one full block is held back so
              * that it is processed together with the partial tail on the
              * ciphertext-stealing path; there the two final tweaks are used in
              * swapped order.
              *
              * When len is a multiple of 16 the next tweak is stored back to
              * *iv.  On the ciphertext-stealing path IVP is reused as a table
              * pointer and *iv is not updated.
              */
3006         FRAME_BEGIN
3007 #ifndef __x86_64__
3008         pushl IVP
3009         pushl LEN
3010         pushl KEYP
3011         pushl KLEN
             /* four pushes plus the return address: first argument is at +20 */
3012         movl (FRAME_OFFSET+20)(%esp), KEYP      # ctx
3013         movl (FRAME_OFFSET+24)(%esp), OUTP      # dst
3014         movl (FRAME_OFFSET+28)(%esp), INP       # src
3015         movl (FRAME_OFFSET+32)(%esp), LEN       # len
3016         movl (FRAME_OFFSET+36)(%esp), IVP       # iv
3017         movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
3018 #else
3019         movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
3020 #endif
3021         movups (IVP), IV
3022
3023         mov 480(KEYP), KLEN             /* presumably the key length field of ctx -- TODO confirm offset */
3024         add $240, KEYP                  /* use the key schedule at ctx+240 (presumably the decryption keys) */
3025
3026         test $15, LEN
3027         jz .Lxts_dec_loop4
3028         sub $16, LEN                    /* partial tail: hold back the last full block for the CTS path */
3029
3030 .Lxts_dec_loop4:
3031         sub $64, LEN
3032         jl .Lxts_dec_1x                 /* fewer than 64 bytes left */
3033
             /*
              * For each of the four blocks: XOR the input with its tweak and
              * park the tweak in the matching dst slot until after decryption.
              */
3034         movdqa IV, STATE1
3035         movdqu 0x00(INP), IN
3036         pxor IN, STATE1
3037         movdqu IV, 0x00(OUTP)
3038
3039         _aesni_gf128mul_x_ble()
3040         movdqa IV, STATE2
3041         movdqu 0x10(INP), IN
3042         pxor IN, STATE2
3043         movdqu IV, 0x10(OUTP)
3044
3045         _aesni_gf128mul_x_ble()
3046         movdqa IV, STATE3
3047         movdqu 0x20(INP), IN
3048         pxor IN, STATE3
3049         movdqu IV, 0x20(OUTP)
3050
3051         _aesni_gf128mul_x_ble()
3052         movdqa IV, STATE4
3053         movdqu 0x30(INP), IN
3054         pxor IN, STATE4
3055         movdqu IV, 0x30(OUTP)
3056
3057         call _aesni_dec4
3058
             /* fetch each parked tweak back, XOR it in, overwrite it with the output */
3059         movdqu 0x00(OUTP), IN
3060         pxor IN, STATE1
3061         movdqu STATE1, 0x00(OUTP)
3062
3063         movdqu 0x10(OUTP), IN
3064         pxor IN, STATE2
3065         movdqu STATE2, 0x10(OUTP)
3066
3067         movdqu 0x20(OUTP), IN
3068         pxor IN, STATE3
3069         movdqu STATE3, 0x20(OUTP)
3070
3071         movdqu 0x30(OUTP), IN
3072         pxor IN, STATE4
3073         movdqu STATE4, 0x30(OUTP)
3074
3075         _aesni_gf128mul_x_ble()         /* tweak for the block after these four */
3076
3077         add $64, INP
3078         add $64, OUTP
3079         test LEN, LEN
3080         jnz .Lxts_dec_loop4
3081
3082 .Lxts_dec_ret_iv:
3083         movups IV, (IVP)                /* hand the next tweak back to the caller */
3084
3085 .Lxts_dec_ret:
3086 #ifndef __x86_64__
3087         popl KLEN
3088         popl KEYP
3089         popl LEN
3090         popl IVP
3091 #endif
3092         FRAME_END
3093         RET
3094
3095 .Lxts_dec_1x:
3096         add $64, LEN                    /* undo the speculative subtraction */
3097         jz .Lxts_dec_ret_iv
3098
3099 .Lxts_dec_loop1:
3100         movdqu (INP), STATE
3101
3102         add $16, INP
3103         sub $16, LEN
3104         jl .Lxts_dec_cts1               /* this is the held-back block before a partial tail */
3105
3106         pxor IV, STATE
3107         call _aesni_dec1
3108         pxor IV, STATE
3109         _aesni_gf128mul_x_ble()
3110
3111         test LEN, LEN
3112         jz .Lxts_dec_out                /* that was the last block */
3113
3114         movdqu STATE, (OUTP)
3115         add $16, OUTP
3116         jmp .Lxts_dec_loop1
3117
3118 .Lxts_dec_out:
3119         movdqu STATE, (OUTP)
3120         jmp .Lxts_dec_ret_iv
3121
3122 .Lxts_dec_cts1:
             /*
              * The last full ciphertext block is decrypted with the later
              * tweak; the earlier tweak is saved in STATE4 for the rebuilt
              * block below.
              */
3123         movdqa IV, STATE4
3124         _aesni_gf128mul_x_ble()
3125
3126         pxor IV, STATE
3127         call _aesni_dec1
3128         pxor IV, STATE
3129
3130 #ifndef __x86_64__
3131         lea .Lcts_permute_table, T1
3132 #else
3133         lea .Lcts_permute_table(%rip), T1
3134 #endif
3135         add LEN, INP            /* rewind input pointer */
3136         add $16, LEN            /* # bytes in final block */
3137         movups (INP), IN1               /* last 16 bytes of src; top bytes are the tail */
3138
3139         mov T1, IVP                     /* IVP is reused as a table pointer; *iv is not updated */
3140         add $32, IVP
3141         add LEN, T1                     /* T1  = table + tail size */
3142         sub LEN, IVP                    /* IVP = table + 32 - tail size */
3143         add OUTP, LEN                   /* LEN = OUTP + tail size */
3144
3145         movups (T1), %xmm4
3146         movaps STATE, IN2
3147         pshufb %xmm4, STATE             /* leading plaintext bytes moved to the top */
3148         movups STATE, (LEN)             /* emits the partial plaintext after the slot at OUTP */
3149
3150         movups (IVP), %xmm0             /* xmm0 is the implicit selector of pblendvb */
3151         pshufb %xmm0, IN1               /* tail ciphertext moved to the bottom, rest zeroed */
3152         pblendvb IN2, IN1               /* remaining bytes are taken from the first decryption */
3153         movaps IN1, STATE
3154
3155         pxor STATE4, STATE              /* rebuilt block uses the earlier, saved tweak */
3156         call _aesni_dec1
3157         pxor STATE4, STATE
3158
3159         movups STATE, (OUTP)
3160         jmp .Lxts_dec_ret
3161 SYM_FUNC_END(aesni_xts_decrypt)