/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *            Vinodh Gopal <vinodh.gopal@intel.com>
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *             Adrian Hoban <adrian.hoban@intel.com>
 *             James Guilford (james.guilford@intel.com)
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Wajdi Feghali (wajdi.k.feghali@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 *    Author: Mathias Krause <minipli@googlemail.com>
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register. This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned). Which instruction is used makes no performance
 * difference on anything since Nehalem (the original Core i7) was released,
 * but movaps is one byte shorter, so that is the one we use for now
 * (likewise for the unaligned variants).
 */
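/*
 * For example, both of these load 16 aligned bytes into %xmm0, but the
 * movaps encoding (0F 28 /r) is one byte shorter than the movdqa one
 * (66 0F 6F /r):
 *
 *	movaps (%rax), %xmm0
 *	movdqa (%rax), %xmm0
 */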
# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
POLY:   .octa 0xC2000000000000000000000000000001
.section .rodata.cst16.TWOONE, "aM", @progbits, 16
TWOONE: .octa 0x00000001000000000000000000000001

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
.section .rodata.cst16.MASK1, "aM", @progbits, 16
MASK1:      .octa 0x0000000000000000ffffffffffffffff
.section .rodata.cst16.MASK2, "aM", @progbits, 16
MASK2:      .octa 0xffffffffffffffff0000000000000000
.section .rodata.cst16.ONE, "aM", @progbits, 16
ONE:        .octa 0x00000000000000000000000000000001
.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section .rodata.cst16.dec, "aM", @progbits, 16
.section .rodata.cst16.enc, "aM", @progbits, 16

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section .rodata, "a", @progbits
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
            .octa 0x00000000000000000000000000000000
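/*
 * NOTE (illustrative C sketch, not part of the build): why the
 * SHIFT_MASK/ALL_F/zero ordering above is load-bearing. The partial-block
 * code below reads 16 bytes at a variable offset into this region, so a
 * load that starts inside ALL_F spills into the zero block, yielding n
 * 0xff bytes followed by zeroes:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static const uint8_t table[48] = {
 *		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   // SHIFT_MASK
 *		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 *		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,   // ALL_F
 *		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 *		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // zero
 *	};
 *
 *	// mask keeping the low n bytes of a block, 1 <= n <= 15
 *	static void partial_mask(uint8_t mask[16], unsigned n)
 *	{
 *		memcpy(mask, table + 16 + (16 - n), 16);
 *	}
 */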
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define PBlockLen 16*5
#define HashKey		16*6	// store HashKey <<1 mod poly here
#define HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of HashKey <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)
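/*
 * NOTE (worked identity, for reference): the _k slots exist for Karatsuba.
 * A 128x128 carry-less multiply A*B splits into three 64x64 PCLMULQDQs
 * instead of four:
 *
 *	A*B = hi(A)*hi(B)*x^128
 *	    + ((hi(A)^lo(A))*(hi(B)^lo(B)) ^ hi(A)*hi(B) ^ lo(A)*lo(B))*x^64
 *	    + lo(A)*lo(B)
 *
 * Precomputing hi(H^i) ^ lo(H^i) once per key (the HashKey_i_k slots)
 * saves a PXOR on every block processed in the hot loops below.
 */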
#define keysize 2*15*16(%arg1)

#define BSWAP_MASK %xmm10
#define GF128MUL_MASK %xmm7

# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data. Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	movdqa	SHUF_MASK(%rip), \TMP2

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	movdqu	\TMP3, HashKey(%arg2)

	pshufd	$78, \TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
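/*
 * NOTE (illustrative C sketch, not part of the build) of the
 * "HashKey<<1 mod poly" computation above: shift the 128-bit subkey left
 * by one bit and, if a bit fell off the top, fold in the POLY constant
 * (0xC2...01):
 *
 *	#include <stdint.h>
 *
 *	// h[0] = low qword, h[1] = high qword
 *	static void hashkey_shl1_modp(uint64_t h[2])
 *	{
 *		uint64_t carry = h[1] >> 63;
 *
 *		h[1] = (h[1] << 1) | (h[0] >> 63);
 *		h[0] <<= 1;
 *		if (carry) {
 *			h[1] ^= 0xC200000000000000ULL;  // POLY, high qword
 *			h[0] ^= 1;                      // POLY, low qword
 *		}
 *	}
 */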
# GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	%r11, AadLen(%arg2)		# ctx_data.aad_length = aad_length
	mov	%r11, InLen(%arg2)		# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)		# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2)	# ctx_data.partial_block_enc_key = 0
	movdqu	%xmm0, OrigIV(%arg2)		# ctx_data.orig_IV = iv
	movdqa	SHUF_MASK(%rip), %xmm2
	movdqu	%xmm0, CurCount(%arg2)		# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data to be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)
	xor	%r11d, %r11d		# initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5		# sub partial block data used
	mov	%arg5, %r13		# save the number of bytes
	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)

	# Encrypt/Decrypt first few blocks
	jz	.L_initial_num_blocks_is_0_\@
	jb	.L_initial_num_blocks_is_1_\@
	je	.L_initial_num_blocks_is_2_\@
.L_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	jmp	.L_initial_blocks_\@
.L_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
.L_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks
	je	.L_zero_cipher_left_\@
	je	.L_four_cipher_left_\@
	GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
%xmm7, %xmm8, enc
.L_four_cipher_left_\@:
	GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
.L_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)
	movdqu	%xmm0, CurCount(%arg2)

	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	.L_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10

	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)

	jge	.L_large_enough_update_\@

	lea	(%arg4,%r11,1), %r10
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1

.L_large_enough_update_\@:

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	# get the appropriate shuffle mask
	# shift right 16-r13 bytes
	lea	ALL_F+16(%rip), %r12
	pxor	%xmm1, %xmm0			# XOR Encrypt(K, Yn)
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0			# mask out top 16-r13 bytes of xmm0
	movdqa	SHUF_MASK(%rip), %xmm10
	movdqa	SHUF_MASK(%rip), %xmm10
	movdqu	%xmm8, AadHash(%arg2)

	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext

	jle	.L_less_than_8_bytes_left_\@
	mov	%rax, (%arg3 , %r11, 1)
.L_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	jne	.L_less_than_8_bytes_left_\@
.L_multiple_of_16_bytes_\@:
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits
	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)

	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10

	movdqu	OrigIV(%arg2), %xmm0		# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)

	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	je	.L_return_T_done_\@
	je	.L_return_T_done_\@
	je	.L_return_T_done_\@
	jmp	.L_return_T_done_\@
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1 )
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 */
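/*
 * NOTE (illustrative C sketch, not part of the build): the bit-for-bit
 * reference for what GHASH_MUL computes is the slow GF(2^128) multiply
 * from NIST SP 800-38D. A block loaded big-endian into a 128-bit integer
 * puts the spec's "first" bit at integer bit 127:
 *
 *	typedef unsigned __int128 u128;      // GCC/Clang extension
 *
 *	static u128 ghash_mul_ref(u128 x, u128 h)
 *	{
 *		const u128 R = (u128)0xE1 << 120;  // x^128 = x^7 + x^2 + x + 1
 *		u128 z = 0, v = h;
 *
 *		for (int i = 127; i >= 0; i--) {
 *			if ((x >> i) & 1)
 *				z ^= v;
 *			v = (v & 1) ? (v >> 1) ^ R : v >> 1;
 *		}
 *		return z;
 *	}
 *
 * The macro below gets the same answer with one Karatsuba PCLMULQDQ
 * multiply plus a two-phase shift/XOR reduction.
 */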
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	pshufd	$78, \GH, \TMP2
	pshufd	$78, \HK, \TMP3
	pxor	\GH, \TMP2		# TMP2 = a1+a0
	pxor	\HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	\TMP1, \TMP2		# TMP2 = a1*b0 + a0*b1 (middle Karatsuba term)
	pslldq	$8, \TMP3		# left shift TMP3 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction
	movdqa	\GH, \TMP4		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# 3 shifts independently
	pslld	$31, \TMP2		# packed left shift <<31
	pslld	$30, \TMP3		# packed left shift <<30
	pslld	$25, \TMP4		# packed left shift <<25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP5		# right shift TMP5 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs

	# second phase of the reduction
	movdqa	\GH,\TMP2		# copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# 3 shifts independently
	psrld	$1,\TMP2		# packed right shift >>1
	psrld	$2,\TMP3		# packed right shift >>2
	psrld	$7,\TMP4		# packed right shift >>7
	pxor	\TMP3,\TMP2		# xor the shifted versions
	pxor	\TMP1, \GH		# result is in GH
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	jz	.L_done_read_partial_block_\@
.L_read_next_byte_\@:
	mov	7(\DPTR, \DLEN, 1), %al
	jnz	.L_read_next_byte_\@
	jmp	.L_done_read_partial_block_\@
.L_read_next_byte_lt8_\@:
	mov	-1(\DPTR, \DLEN, 1), %al
	jnz	.L_read_next_byte_lt8_\@
.L_done_read_partial_block_\@:
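/*
 * NOTE (illustrative C sketch, not part of the build) of what
 * READ_PARTIAL_BLOCK guarantees: a zero-padded 16-byte block built without
 * ever touching src[len] or beyond:
 *
 *	#include <stdint.h>
 *
 *	static void read_partial_block(uint8_t dst[16], const uint8_t *src,
 *				       unsigned len)   // 0 < len < 16
 *	{
 *		for (unsigned i = 0; i < 16; i++)
 *			dst[i] = i < len ? src[i] : 0;
 *	}
 */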
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	SHUF_MASK(%rip), %xmm14
	mov	\AAD, %r10		# %r10 = AAD
	mov	\AADLEN, %r11		# %r11 = aadLen
	pshufb	%xmm14, \TMP7		# byte-reflect the AAD data
	GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	jge	.L_get_AAD_blocks\@

	/* read the last <16B of AAD */
	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	%xmm14, \TMP7		# byte-reflect the AAD data
	GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	\TMP6, AadHash(%arg2)
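/*
 * NOTE (illustrative C sketch, not part of the build) of the AAD hash:
 * fold in full 16-byte blocks, then a zero-padded tail. ghash_mul_ref()
 * and read_partial_block() are the reference helpers sketched earlier;
 * load_be128() stands in for the pshufb byte-reflection above and is
 * hypothetical:
 *
 *	static u128 calc_aad_hash(const uint8_t *aad, uint64_t len, u128 h)
 *	{
 *		u128 hash = 0;
 *		uint8_t last[16];
 *
 *		for (; len >= 16; aad += 16, len -= 16)
 *			hash = ghash_mul_ref(hash ^ load_be128(aad), h);
 *		if (len) {
 *			read_partial_block(last, aad, len);
 *			hash = ghash_mul_ref(hash ^ load_be128(last), h);
 *		}
 *		return hash;
 *	}
 */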
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
	AAD_HASH operation
	mov	PBlockLen(%arg2), %r13
	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
	# Read in input data without over-reading
	cmp	$16, \PLAIN_CYPH_LEN
	jl	.L_fewer_than_16_bytes_\@
	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
.L_fewer_than_16_bytes_\@:
	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
	mov	\PLAIN_CYPH_LEN, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1

	mov	PBlockLen(%arg2), %r13

.L_data_read_\@:			# Finished reading in data

	movdqu	PBlockEncKey(%arg2), %xmm9
	movdqu	HashKey(%arg2), %xmm13

	lea	SHIFT_MASK(%rip), %r12

	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (16-r13 is the number of bytes in plaintext mod 16)
	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
	pshufb	%xmm2, %xmm9		# shift right r13 bytes
	pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10

	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	.L_no_extra_mask_1_\@
.L_no_extra_mask_1_\@:
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9
	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9

	movdqa	SHUF_MASK(%rip), %xmm10
	pxor	%xmm3, \AAD_HASH

	jl	.L_partial_incomplete_1_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	mov	%rax, PBlockLen(%arg2)
.L_partial_incomplete_1_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
	movdqu	\AAD_HASH, AadHash(%arg2)
	pxor	%xmm1, %xmm9		# Plaintext XOR E(K, Yn)

	mov	\PLAIN_CYPH_LEN, %r10

	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
	# Determine if partial block is not being filled and
	# shift mask accordingly
	jge	.L_no_extra_mask_2_\@
.L_no_extra_mask_2_\@:
	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
	# get the appropriate mask to mask out bottom r13 bytes of xmm9

	movdqa	SHUF_MASK(%rip), %xmm1
	pxor	%xmm9, \AAD_HASH

	jl	.L_partial_incomplete_2_\@

	# GHASH computation for the last <16 Byte block
	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
	mov	%rax, PBlockLen(%arg2)
	jmp	.L_encode_done_\@
.L_partial_incomplete_2_\@:
	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
	movdqu	\AAD_HASH, AadHash(%arg2)

	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm9 back to output as ciphertext

	# output encrypted bytes

	jl	.L_partial_fill_\@
	# Set r13 to be the number of bytes to write out
	mov	\PLAIN_CYPH_LEN, %r13

	jle	.L_less_than_8_bytes_left_\@

	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
.L_less_than_8_bytes_left_\@:
	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
	jne	.L_less_than_8_bytes_left_\@
.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK
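/*
 * NOTE (illustrative C sketch, not part of the build) of the PARTIAL_BLOCK
 * bookkeeping: keystream bytes of E(K, Yn) left over from the previous
 * update call are consumed before any full blocks. All names here are
 * hypothetical, and the real macro also folds completed blocks into the
 * GHASH state:
 *
 *	#include <stdint.h>
 *
 *	struct pblock {
 *		unsigned len;          // bytes of enc_key already used
 *		uint8_t enc_key[16];   // E(K, Yn) for the straddling block
 *	};
 *
 *	static unsigned partial_block(struct pblock *p, uint8_t *out,
 *				      const uint8_t *in, uint64_t len)
 *	{
 *		unsigned n = 0;
 *
 *		while (p->len && p->len < 16 && n < len) {
 *			out[n] = in[n] ^ p->enc_key[p->len];
 *			n++;
 *			p->len++;
 *		}
 *		p->len &= 15;          // 16 -> 0: block completed
 *		return n;              // data offset for the caller
 *	}
 */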
/*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
 * num_initial_blocks = b mod 4
 * encrypt the initial num_initial_blocks blocks and apply ghash on
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
 * arg1, %arg2, %arg3 are used as a pointer only, not modified
 */
.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
	MOVADQ	SHUF_MASK(%rip), %xmm14
	movdqu	AadHash(%arg2), %xmm\i		# load the current AAD hash

	# start AES for num_initial_blocks blocks
	movdqu	CurCount(%arg2), \XMM0		# XMM0 = Y0

.if (\i == 5) || (\i == 6) || (\i == 7)

	MOVADQ	ONE(%RIP),\TMP1
	MOVADQ	0(%arg1),\TMP2
	paddd	\TMP1, \XMM0			# INCR Y0
	movdqa	\XMM0, %xmm\index
	MOVADQ	\XMM0, %xmm\index
	pshufb	%xmm14, %xmm\index		# perform a 16 byte swap
	pxor	\TMP2, %xmm\index
	shr	$2,%eax				# 128->4, 192->6, 256->8
	add	$5,%eax				# 128->9, 192->11, 256->13

.Laes_loop_initial_\@:
	aesenc	\TMP1, %xmm\index
	jnz	.Laes_loop_initial_\@

	aesenclast \TMP1, %xmm\index		# Last Round
	movdqu	(%arg4 , %r11, 1), \TMP1
	pxor	\TMP1, %xmm\index
	movdqu	%xmm\index, (%arg3 , %r11, 1)
	# write back plaintext/ciphertext for num_initial_blocks
	movdqa	\TMP1, %xmm\index
	pshufb	%xmm14, %xmm\index

	# prepare plaintext/ciphertext for GHASH computation

	# apply GHASH on num_initial_blocks blocks
	GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
	GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1

	jl	.L_initial_blocks_done\@
	# no need for precomputed values
/*
 * Precomputations for HashKey parallel with encryption of first 4 blocks.
 * HashKey_i_k holds XORed values of the low and high parts of HashKey_i
 */
	MOVADQ	ONE(%RIP),\TMP1
	paddd	\TMP1, \XMM0			# INCR Y0
	pshufb	%xmm14, \XMM1			# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	pshufb	%xmm14, \XMM2			# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	pshufb	%xmm14, \XMM3			# perform a 16 byte swap

	paddd	\TMP1, \XMM0			# INCR Y0
	pshufb	%xmm14, \XMM4			# perform a 16 byte swap

	MOVADQ	0(%arg1),\TMP1
.irpc index, 1234 # do 4 rounds
	movaps	0x10*\index(%arg1), \TMP1
.irpc index, 56789 # do next 5 rounds
	movaps	0x10*\index(%arg1), \TMP1
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	.Laes_loop_pre_done\@
.Laes_loop_pre_\@:
	aesenc	\TMP2, %xmm\index
	jnz	.Laes_loop_pre_\@
.Laes_loop_pre_done\@:
	aesenclast \TMP2, \XMM1
	aesenclast \TMP2, \XMM2
	aesenclast \TMP2, \XMM3
	aesenclast \TMP2, \XMM4
	movdqu	16*0(%arg4 , %r11 , 1), \TMP1
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu	16*1(%arg4 , %r11 , 1), \TMP1
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu	16*2(%arg4 , %r11 , 1), \TMP1
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu	16*3(%arg4 , %r11 , 1), \TMP1
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)
	movdqu	\XMM1, 16*0(%arg3 , %r11 , 1)
	movdqu	\XMM2, 16*1(%arg3 , %r11 , 1)
	movdqu	\XMM3, 16*2(%arg3 , %r11 , 1)
	movdqu	\XMM4, 16*3(%arg3 , %r11 , 1)

	pshufb	%xmm14, \XMM1			# perform a 16 byte swap
	# combine GHASHed value with the corresponding ciphertext
	pshufb	%xmm14, \XMM2			# perform a 16 byte swap
	pshufb	%xmm14, \XMM3			# perform a 16 byte swap
	pshufb	%xmm14, \XMM4			# perform a 16 byte swap

.L_initial_blocks_done\@:
/*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
 * arg1, %arg3, %arg4 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using Karatsuba

	pshufd	$78, \XMM5, \TMP6
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqu	HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	paddd	ONE(%rip), \XMM0		# INCR CNT
	paddd	ONE(%rip), \XMM0		# INCR CNT
	paddd	ONE(%rip), \XMM0		# INCR CNT
	pshufb	%xmm15, \XMM1			# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	pshufb	%xmm15, \XMM2			# perform a 16 byte swap
	pshufb	%xmm15, \XMM3			# perform a 16 byte swap
	pshufb	%xmm15, \XMM4			# perform a 16 byte swap

	movdqu	HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	aesenc	\TMP1, \XMM1			# Round 1
	movaps	0x20(%arg1), \TMP1
	aesenc	\TMP1, \XMM1			# Round 2
	pshufd	$78, \XMM6, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 3
	pclmulqdq $0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 4
	movdqu	HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 5

	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part

	pshufd	$78, \XMM7, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using Karatsuba

	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 6
	pclmulqdq $0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 7
	movdqu	HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 8

	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	pshufd	$78, \XMM8, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 9
	pclmulqdq $0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	0xa0(%arg1),%r10
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	.Laes_loop_par_enc_done\@

.Laes_loop_par_enc\@:
	aesenc	\TMP3, %xmm\index
	jnz	.Laes_loop_par_enc\@

.Laes_loop_par_enc_done\@:
	MOVADQ	(%r10), \TMP3
	aesenclast \TMP3, \XMM1			# Round 10
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	16(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	32(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	48(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM2, 16(%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM3, 32(%arg3,%r11,1)		# Write to the ciphertext buffer
	movdqu	\XMM4, 48(%arg3,%r11,1)		# Write to the ciphertext buffer
	pshufb	%xmm15, \XMM1			# perform a 16 byte swap
	pshufb	%xmm15, \XMM2			# perform a 16 byte swap
	pshufb	%xmm15, \XMM3			# perform a 16 byte swap
	pshufb	%xmm15, \XMM4			# perform a 16 byte swap

	pslldq	$8, \TMP3			# left shift TMP3 2 DWs
	psrldq	$8, \TMP2			# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2			# packed left shift << 31
	pslld	$30, \TMP3			# packed left shift << 30
	pslld	$25, \TMP4			# packed left shift << 25
	pxor	\TMP3, \TMP2			# xor the shifted versions
	psrldq	$4, \TMP5			# right shift T5 1 DW
	pslldq	$12, \TMP2			# left shift T2 3 DWs

	# second phase of reduction
	movdqa	\XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	psrld	$1, \TMP2			# packed right shift >>1
	psrld	$2, \TMP3			# packed right shift >>2
	psrld	$7, \TMP4			# packed right shift >>7
	pxor	\TMP3,\TMP2			# xor the shifted versions
	pxor	\TMP1, \XMM5			# result is in XMM5
/*
 * decrypt 4 blocks at a time
 * ghash the 4 previously decrypted ciphertext blocks
 * arg1, %arg3, %arg4 are used as pointers only, not modified
 * %r11 is the data offset value
 */
.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation

	movdqa	SHUF_MASK(%rip), %xmm15
	# multiply TMP5 * HashKey using Karatsuba

	pshufd	$78, \XMM5, \TMP6
	paddd	ONE(%rip), \XMM0		# INCR CNT
	movdqu	HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP4		# TMP4 = a1*b1
	paddd	ONE(%rip), \XMM0		# INCR CNT
	paddd	ONE(%rip), \XMM0		# INCR CNT
	paddd	ONE(%rip), \XMM0		# INCR CNT
	pshufb	%xmm15, \XMM1			# perform a 16 byte swap
	pclmulqdq $0x00, \TMP5, \XMM5		# XMM5 = a0*b0
	pshufb	%xmm15, \XMM2			# perform a 16 byte swap
	pshufb	%xmm15, \XMM3			# perform a 16 byte swap
	pshufb	%xmm15, \XMM4			# perform a 16 byte swap

	movdqu	HashKey_4_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP6		# TMP6 = (a1+a0)*(b1+b0)
	movaps	0x10(%arg1), \TMP1
	aesenc	\TMP1, \XMM1			# Round 1
	movaps	0x20(%arg1), \TMP1
	aesenc	\TMP1, \XMM1			# Round 2
	pshufd	$78, \XMM6, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1 * b1
	movaps	0x30(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 3
	pclmulqdq $0x00, \TMP5, \XMM6		# XMM6 = a0*b0
	movaps	0x40(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 4
	movdqu	HashKey_3_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x50(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 5

	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part

	pshufd	$78, \XMM7, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5

	# Multiply TMP5 * HashKey using Karatsuba

	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	0x60(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 6
	pclmulqdq $0x00, \TMP5, \XMM7		# XMM7 = a0*b0
	movaps	0x70(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 7
	movdqu	HashKey_2_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movaps	0x80(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 8

	# accumulate the results in TMP4:XMM5, TMP6 holds the middle part

	# Multiply XMM8 * HashKey
	# XMM8 and TMP5 hold the values for the two operands

	pshufd	$78, \XMM8, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	movaps	0x90(%arg1), \TMP3
	aesenc	\TMP3, \XMM1			# Round 9
	pclmulqdq $0x00, \TMP5, \XMM8		# XMM8 = a0*b0
	lea	0xa0(%arg1),%r10
	shr	$2,%eax				# 128->4, 192->6, 256->8
	sub	$4,%eax				# 128->0, 192->2, 256->4
	jz	.Laes_loop_par_dec_done\@

.Laes_loop_par_dec\@:
	aesenc	\TMP3, %xmm\index
	jnz	.Laes_loop_par_dec\@

.Laes_loop_par_dec_done\@:
	MOVADQ	(%r10), \TMP3
	aesenclast \TMP3, \XMM1			# last round
	aesenclast \TMP3, \XMM2
	aesenclast \TMP3, \XMM3
	aesenclast \TMP3, \XMM4
	movdqu	HashKey_k(%arg2), \TMP5
	pclmulqdq $0x00, \TMP5, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqu	(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM1			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM1, (%arg3,%r11,1)		# Write to plaintext buffer
	movdqu	16(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM2			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM2, 16(%arg3,%r11,1)		# Write to plaintext buffer
	movdqu	32(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM3			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM3, 32(%arg3,%r11,1)		# Write to plaintext buffer
	movdqu	48(%arg4,%r11,1), \TMP3
	pxor	\TMP3, \XMM4			# Ciphertext/Plaintext XOR EK
	movdqu	\XMM4, 48(%arg3,%r11,1)		# Write to plaintext buffer

	pshufb	%xmm15, \XMM1			# perform a 16 byte swap
	pshufb	%xmm15, \XMM2			# perform a 16 byte swap
	pshufb	%xmm15, \XMM3			# perform a 16 byte swap
	pshufb	%xmm15, \XMM4			# perform a 16 byte swap

	pslldq	$8, \TMP3			# left shift TMP3 2 DWs
	psrldq	$8, \TMP2			# right shift TMP2 2 DWs
	pxor	\TMP2, \TMP1			# accumulate the results in TMP1:XMM5

	# first phase of reduction

	# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
	pslld	$31, \TMP2			# packed left shift << 31
	pslld	$30, \TMP3			# packed left shift << 30
	pslld	$25, \TMP4			# packed left shift << 25
	pxor	\TMP3, \TMP2			# xor the shifted versions
	psrldq	$4, \TMP5			# right shift T5 1 DW
	pslldq	$12, \TMP2			# left shift T2 3 DWs

	# second phase of reduction
	movdqa	\XMM5,\TMP2	# make 3 copies of XMM5 into TMP2, TMP3, TMP4
	psrld	$1, \TMP2			# packed right shift >>1
	psrld	$2, \TMP3			# packed right shift >>2
	psrld	$7, \TMP4			# packed right shift >>7
	pxor	\TMP3,\TMP2			# xor the shifted versions
	pxor	\TMP1, \XMM5			# result is in XMM5
/* GHASH the last 4 ciphertext blocks. */
.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

	# Multiply TMP6 * HashKey (using Karatsuba)

	pshufd	$78, \XMM1, \TMP2
	movdqu	HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP6	# TMP6 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM1	# XMM1 = a0*b0
	movdqu	HashKey_4_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	movdqa	\XMM1, \XMMDst
	movdqa	\TMP2, \XMM1		# result in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	pshufd	$78, \XMM2, \TMP2
	movdqu	HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM2	# XMM2 = a0*b0
	movdqu	HashKey_3_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)

	pshufd	$78, \XMM3, \TMP2
	movdqu	HashKey_2(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM3	# XMM3 = a0*b0
	movdqu	HashKey_2_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	pxor	\TMP2, \XMM1		# results accumulated in TMP6, XMMDst, XMM1

	# Multiply TMP1 * HashKey (using Karatsuba)
	pshufd	$78, \XMM4, \TMP2
	movdqu	HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM4	# XMM4 = a0*b0
	movdqu	HashKey_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2	# TMP2 = (a1+a0)*(b1+b0)
	# middle section of the temp results combined as in Karatsuba algorithm
	pslldq	$8, \TMP4		# left shift TMP4 2 DWs
	psrldq	$8, \TMP2		# right shift TMP2 2 DWs
	# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa	\XMMDst, \TMP2
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld	$31, \TMP2		# packed left shifting << 31
	pslld	$30, \TMP3		# packed left shifting << 30
	pslld	$25, \TMP4		# packed left shifting << 25
	pxor	\TMP3, \TMP2		# xor the shifted versions
	psrldq	$4, \TMP7		# right shift TMP7 1 DW
	pslldq	$12, \TMP2		# left shift TMP2 3 DWs
	# second phase of the reduction
	movdqa	\XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa	\XMMDst, \TMP3
	movdqa	\XMMDst, \TMP4
	psrld	$1, \TMP2		# packed right shift >> 1
	psrld	$2, \TMP3		# packed right shift >> 2
	psrld	$7, \TMP4		# packed right shift >> 7
	pxor	\TMP3, \TMP2		# xor the shifted versions
	pxor	\TMP6, \XMMDst		# reduced result is in XMMDst
/* Encryption of a single block
 */
.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
	shr	$2,%eax			# 128->4, 192->6, 256->8
	add	$5,%eax			# 128->9, 192->11, 256->13
	lea	16(%arg1), %r10		# get first expanded key address
	aesenclast \TMP1,\XMM0
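/*
 * NOTE (illustrative C sketch, not part of the build): the "shr $2; add $5"
 * pair above derives the aesenc count from the key length stored in the
 * context (in bytes):
 *
 *	static int aesenc_rounds(int key_len_bytes)
 *	{
 *		return key_len_bytes / 4 + 5;   // 16->9, 24->11, 32->13
 *	}
 *
 * followed by one aesenclast for the final round (10/12/14 rounds total).
 */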
/*****************************************************************************
 * void aesni_gcm_init(void *aes_ctx,	// AES Key schedule. Starts on a 16 byte boundary.
 *		       struct gcm_context_data *data,
 *		       u8 *iv,		// Pre-counter block j0: 4 byte salt (from Security Association)
 *					// concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
 *					// concatenated with 0x00000001. 16-byte aligned pointer.
 *		       u8 *hash_subkey,	// H, the Hash sub key input. Data starts on a 16-byte boundary.
 *		       const u8 *aad,	// Additional Authentication Data (AAD)
 *		       u64 aad_len)	// Length of AAD in bytes.
 */
SYM_FUNC_START(aesni_gcm_init)
	GCM_INIT %arg3, %arg4, %arg5, %arg6
SYM_FUNC_END(aesni_gcm_init)

/*****************************************************************************
 * void aesni_gcm_enc_update(void *aes_ctx,	// AES Key schedule. Starts on a 16 byte boundary.
 *			     struct gcm_context_data *data,
 *			     u8 *out,		// Ciphertext output. Encrypt in-place is allowed.
 *			     const u8 *in,	// Plaintext input
 *			     u64 plaintext_len)	// Length of data in bytes for encryption.
 */
SYM_FUNC_START(aesni_gcm_enc_update)
SYM_FUNC_END(aesni_gcm_enc_update)

/*****************************************************************************
 * void aesni_gcm_dec_update(void *aes_ctx,	// AES Key schedule. Starts on a 16 byte boundary.
 *			     struct gcm_context_data *data,
 *			     u8 *out,		// Plaintext output. Decrypt in-place is allowed.
 *			     const u8 *in,	// Ciphertext input
 *			     u64 ciphertext_len) // Length of data in bytes for decryption.
 */
SYM_FUNC_START(aesni_gcm_dec_update)
SYM_FUNC_END(aesni_gcm_dec_update)

/*****************************************************************************
 * void aesni_gcm_finalize(void *aes_ctx,	// AES Key schedule. Starts on a 16 byte boundary.
 *			   struct gcm_context_data *data,
 *			   u8 *auth_tag,	// Authenticated Tag output.
 *			   u64 auth_tag_len);	// Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
 */
SYM_FUNC_START(aesni_gcm_finalize)
	GCM_COMPLETE %arg3 %arg4
SYM_FUNC_END(aesni_gcm_finalize)
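/*
 * NOTE (illustrative C sketch, not part of the build) of the intended call
 * sequence for the four entry points above, here as a one-shot encryption;
 * FPU-context set-up and error handling are omitted:
 *
 *	typedef unsigned char u8;
 *	typedef unsigned long long u64;
 *	struct gcm_context_data;   // layout given by the offsets at the top
 *
 *	static void gcm_encrypt(void *aes_ctx, struct gcm_context_data *d,
 *				u8 *iv, u8 *hash_subkey,
 *				const u8 *aad, u64 aad_len,
 *				u8 *dst, const u8 *src, u64 len, u8 tag[16])
 *	{
 *		aesni_gcm_init(aes_ctx, d, iv, hash_subkey, aad, aad_len);
 *		aesni_gcm_enc_update(aes_ctx, d, dst, src, len); // repeatable
 *		aesni_gcm_finalize(aes_ctx, d, tag, 16);
 *	}
 */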
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	shufps $0b10001100, %xmm0, %xmm4
	movaps %xmm0, (TKEYP)
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)

SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	shufps $0b10001100, %xmm0, %xmm4
	pshufd $0b11111111, %xmm0, %xmm3
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
SYM_FUNC_END(_key_expansion_192a)

SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	shufps $0b10001100, %xmm0, %xmm4
	pshufd $0b11111111, %xmm0, %xmm3
	movaps %xmm0, (TKEYP)
SYM_FUNC_END(_key_expansion_192b)

SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	shufps $0b10001100, %xmm2, %xmm4
	movaps %xmm2, (TKEYP)
SYM_FUNC_END(_key_expansion_256b)

/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *		      unsigned int key_len)
 */
SYM_FUNC_START(aesni_set_key)
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	movq 0x10(UKEYP), %xmm2		# other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	lea 240-16(TKEYP), UKEYP
	movaps (KEYP), %xmm0
	movaps %xmm1, (UKEYP)
SYM_FUNC_END(aesni_set_key)
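/*
 * NOTE (for reference): the immediates fed to aeskeygenassist above are
 * the AES round constants rcon[i] = x^(i-1) in GF(2^8), reduced modulo
 * x^8 + x^4 + x^3 + x + 1 (hence the 0x1b and 0x36 wrap-around values
 * used by the AES-128 schedule):
 *
 *	static const unsigned char rcon[10] = {
 *		0x01, 0x02, 0x04, 0x08, 0x10,
 *		0x20, 0x40, 0x80, 0x1b, 0x36,
 *	};
 */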
/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_enc)
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	movups STATE, (OUTP)		# output
SYM_FUNC_END(aesni_enc)

/*
 * _aesni_enc1: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	pxor KEY, STATE			# round 0
	lea 0x20(TKEYP), TKEYP
	movaps -0x60(TKEYP), KEY
	movaps -0x50(TKEYP), KEY
	movaps -0x40(TKEYP), KEY
	movaps -0x30(TKEYP), KEY
	movaps -0x20(TKEYP), KEY
	movaps -0x10(TKEYP), KEY
	movaps 0x10(TKEYP), KEY
	movaps 0x20(TKEYP), KEY
	movaps 0x30(TKEYP), KEY
	movaps 0x40(TKEYP), KEY
	movaps 0x50(TKEYP), KEY
	movaps 0x60(TKEYP), KEY
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE
SYM_FUNC_END(_aesni_enc1)

/*
 * _aesni_enc4: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	STATE1:		initial state (input)
 * output:
 *	STATE1:		final state (output)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	pxor KEY, STATE1		# round 0
	lea 0x20(TKEYP), TKEYP
	movaps -0x60(TKEYP), KEY
	movaps -0x50(TKEYP), KEY
	movaps -0x40(TKEYP), KEY
	movaps -0x30(TKEYP), KEY
	movaps -0x20(TKEYP), KEY
	movaps -0x10(TKEYP), KEY
	movaps 0x10(TKEYP), KEY
	movaps 0x20(TKEYP), KEY
	movaps 0x30(TKEYP), KEY
	movaps 0x40(TKEYP), KEY
	movaps 0x50(TKEYP), KEY
	movaps 0x60(TKEYP), KEY
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
SYM_FUNC_END(_aesni_enc4)
/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 */
SYM_FUNC_START(aesni_dec)
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
	mov 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	movups STATE, (OUTP)		# output
SYM_FUNC_END(aesni_dec)

/*
 * _aesni_dec1: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# key
	pxor KEY, STATE			# round 0
	lea 0x20(TKEYP), TKEYP
	movaps -0x60(TKEYP), KEY
	movaps -0x50(TKEYP), KEY
	movaps -0x40(TKEYP), KEY
	movaps -0x30(TKEYP), KEY
	movaps -0x20(TKEYP), KEY
	movaps -0x10(TKEYP), KEY
	movaps 0x10(TKEYP), KEY
	movaps 0x20(TKEYP), KEY
	movaps 0x30(TKEYP), KEY
	movaps 0x40(TKEYP), KEY
	movaps 0x50(TKEYP), KEY
	movaps 0x60(TKEYP), KEY
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE
SYM_FUNC_END(_aesni_dec1)

/*
 * _aesni_dec4: internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	STATE1:		initial state (input)
 * output:
 *	STATE1:		final state (output)
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# key
	pxor KEY, STATE1		# round 0
	lea 0x20(TKEYP), TKEYP
	movaps -0x60(TKEYP), KEY
	movaps -0x50(TKEYP), KEY
	movaps -0x40(TKEYP), KEY
	movaps -0x30(TKEYP), KEY
	movaps -0x20(TKEYP), KEY
	movaps -0x10(TKEYP), KEY
	movaps 0x10(TKEYP), KEY
	movaps 0x20(TKEYP), KEY
	movaps 0x30(TKEYP), KEY
	movaps 0x40(TKEYP), KEY
	movaps 0x50(TKEYP), KEY
	movaps 0x60(TKEYP), KEY
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
SYM_FUNC_END(_aesni_dec4)
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
SYM_FUNC_START(aesni_ecb_enc)
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
	test LEN, LEN			# check length
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	movups (INP), STATE1
	movups STATE1, (OUTP)
SYM_FUNC_END(aesni_ecb_enc)

/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 */
SYM_FUNC_START(aesni_ecb_dec)
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	movups (INP), STATE1
	movups STATE1, (OUTP)
SYM_FUNC_END(aesni_ecb_dec)

/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_enc)
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movups (IVP), STATE		# load iv as initial state
	movups (INP), IN		# load input
	movups STATE, (OUTP)		# store output
SYM_FUNC_END(aesni_cbc_enc)

/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cbc_dec)
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	jb .Lcbc_dec_just_ret
	movups 0x10(INP), IN2
	movups 0x20(INP), IN3
	movups 0x30(INP), IN4
	movups 0x20(INP), IN1
	movups 0x30(INP), IN2
	movups 0x10(INP), IN2
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	movups STATE, (OUTP)
SYM_FUNC_END(aesni_cbc_dec)
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
	lea .Lcts_permute_table(%rip), T1
	movups STATE, (OUTP)
SYM_FUNC_END(aesni_cts_cbc_enc)

/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			  size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
	lea .Lcts_permute_table(%rip), T1
	movups STATE, (OUTP)
SYM_FUNC_END(aesni_cts_cbc_dec)

.pushsection .rodata
.Lcts_permute_table:
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * _aesni_inc_init: internal ABI
 *	setup registers used by _aesni_inc
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	pshufb BSWAP_MASK, CTR
SYM_FUNC_END(_aesni_inc_init)

/*
 * _aesni_inc: internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	pshufb BSWAP_MASK, IV
SYM_FUNC_END(_aesni_inc)
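/*
 * NOTE (illustrative C sketch, not part of the build) of the increment
 * above: the IV is big-endian on the wire, so byte-swap, add one, and
 * swap back:
 *
 *	#include <stdint.h>
 *
 *	// ctr[0] = high qword, ctr[1] = low qword (big-endian block)
 *	static void ctr_inc(uint64_t ctr[2])
 *	{
 *		uint64_t lo = __builtin_bswap64(ctr[1]) + 1;
 *
 *		ctr[1] = __builtin_bswap64(lo);
 *		if (lo == 0)   // carry into the high qword
 *			ctr[0] = __builtin_bswap64(__builtin_bswap64(ctr[0]) + 1);
 *	}
 */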
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 */
SYM_FUNC_START(aesni_ctr_enc)
	jb .Lctr_enc_just_ret
	call _aesni_inc_init
	movups 0x10(INP), IN2
	movups 0x20(INP), IN3
	movups 0x30(INP), IN4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	movups STATE, (OUTP)
SYM_FUNC_END(aesni_ctr_enc)

.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087

/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 * input:
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * changed:
 *	KEY: == temporary value
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY
	pand GF128MUL_MASK, KEY
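/*
 * NOTE (illustrative C sketch, not part of the build) of the tweak update:
 * multiply the 128-bit XTS tweak by x in GF(2^128), little-endian ("ble")
 * convention, with reduction constant 0x87 -- the same result the
 * pshufd/pand trick above computes:
 *
 *	#include <stdint.h>
 *
 *	// t[0] = low qword, t[1] = high qword
 *	static void gf128mul_x_ble(uint64_t t[2])
 *	{
 *		uint64_t carry = t[1] >> 63;
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
 *	}
 */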
.macro _aesni_xts_crypt enc
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movdqu 0x00(INP), IN
	movdqu IV, 0x00(OUTP)
	_aesni_gf128mul_x_ble
	movdqu 0x10(INP), IN
	movdqu IV, 0x10(OUTP)
	_aesni_gf128mul_x_ble
	movdqu 0x20(INP), IN
	movdqu IV, 0x20(OUTP)
	_aesni_gf128mul_x_ble
	movdqu 0x30(INP), IN
	movdqu IV, 0x30(OUTP)
	movdqu 0x00(OUTP), IN
	movdqu STATE1, 0x00(OUTP)
	movdqu 0x10(OUTP), IN
	movdqu STATE2, 0x10(OUTP)
	movdqu 0x20(OUTP), IN
	movdqu STATE3, 0x20(OUTP)
	movdqu 0x30(OUTP), IN
	movdqu STATE4, 0x30(OUTP)
	_aesni_gf128mul_x_ble
	_aesni_gf128mul_x_ble
	movdqu STATE, (OUTP)
	movdqu STATE, (OUTP)
	movdqa STATE4, STATE
	_aesni_gf128mul_x_ble
	lea .Lcts_permute_table, T1
	lea .Lcts_permute_table(%rip), T1
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups STATE, (OUTP)

/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_enc)
SYM_FUNC_END(aesni_xts_enc)

/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 */
SYM_FUNC_START(aesni_xts_dec)
SYM_FUNC_END(aesni_xts_dec)