2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
34 #include <asm/frame.h>
35 #include <asm/nospec-branch.h>
38 * The following macros are used to move an (un)aligned 16 byte value to/from
* an XMM register. This can be done for either FP or integer values, for FP use
40 * movaps (move aligned packed single) or integer use movdqa (move double quad
41 * aligned). It doesn't make a performance difference which instruction is used
42 * since Nehalem (original Core i7) was released. However, the movaps is a byte
43 * shorter, so that is the one we'll use for now. (same for unaligned).
50 # constants in mergeable sections, linker can reorder and merge
51 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
53 .Lgf128mul_x_ble_mask:
54 .octa 0x00000000000000010000000000000087
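/*
 * The two qwords of this mask (0x87 in the low half, 0x01 in the high
 * half) implement a multiply-by-x on a 128-bit little-endian XTS tweak
 * in _aesni_gf128mul_x_ble below. A minimal C sketch of the operation
 * (types and names are illustrative, not kernel code):
 *
 *	struct le128 { unsigned long long lo, hi; };
 *
 *	static struct le128 mul_x_ble(struct le128 t)
 *	{
 *		struct le128 r;
 *
 *		r.hi = (t.hi << 1) | (t.lo >> 63); // carry bit 63 into bit 64
 *		r.lo = (t.lo << 1) ^
 *		       ((t.hi >> 63) ? 0x87 : 0);  // fold bit 127 back in via
 *						   // x^7 + x^2 + x + 1
 *		return r;
 *	}
 */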
55 .section .rodata.cst16.POLY, "aM", @progbits, 16
57 POLY: .octa 0xC2000000000000000000000000000001
58 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
60 TWOONE: .octa 0x00000001000000000000000000000001
62 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
64 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
65 .section .rodata.cst16.MASK1, "aM", @progbits, 16
67 MASK1: .octa 0x0000000000000000ffffffffffffffff
68 .section .rodata.cst16.MASK2, "aM", @progbits, 16
70 MASK2: .octa 0xffffffffffffffff0000000000000000
71 .section .rodata.cst16.ONE, "aM", @progbits, 16
73 ONE: .octa 0x00000000000000000000000000000001
74 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
76 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77 .section .rodata.cst16.dec, "aM", @progbits, 16
80 .section .rodata.cst16.enc, "aM", @progbits, 16
84 # order of these constants should not change.
85 # more specifically, ALL_F should follow SHIFT_MASK,
86 # and zero should follow ALL_F
87 .section .rodata, "a", @progbits
89 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
91 .octa 0x00000000000000000000000000000000
95 .type aad_shift_arr, @object
96 .size aad_shift_arr, 272
98 .octa 0xffffffffffffffffffffffffffffffff
99 .octa 0xffffffffffffffffffffffffffffff0C
100 .octa 0xffffffffffffffffffffffffffff0D0C
101 .octa 0xffffffffffffffffffffffffff0E0D0C
102 .octa 0xffffffffffffffffffffffff0F0E0D0C
103 .octa 0xffffffffffffffffffffff0C0B0A0908
104 .octa 0xffffffffffffffffffff0D0C0B0A0908
105 .octa 0xffffffffffffffffff0E0D0C0B0A0908
106 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
107 .octa 0xffffffffffffff0C0B0A090807060504
108 .octa 0xffffffffffff0D0C0B0A090807060504
109 .octa 0xffffffffff0E0D0C0B0A090807060504
110 .octa 0xffffffff0F0E0D0C0B0A090807060504
111 .octa 0xffffff0C0B0A09080706050403020100
112 .octa 0xffff0D0C0B0A09080706050403020100
113 .octa 0xff0E0D0C0B0A09080706050403020100
114 .octa 0x0F0E0D0C0B0A09080706050403020100
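/*
 * Rough C model (illustrative, not kernel code) of the pshufb fixup
 * this table drives: the leftover AAD byte count, scaled by 16 into
 * %r11, selects one mask; pshufb zeroes any lane whose mask byte has
 * bit 7 set and otherwise pulls the source byte indexed by the low
 * nibble.
 *
 *	static void shift_aad_tail(unsigned char out[16],
 *				   const unsigned char in[16],
 *				   const unsigned char *masks,
 *				   unsigned int count)
 *	{
 *		const unsigned char *m = masks + count * 16;
 *		int lane;
 *
 *		for (lane = 0; lane < 16; lane++)
 *			out[lane] = (m[lane] & 0x80) ? 0 : in[m[lane] & 0x0f];
 *	}
 */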
120 #define STACK_OFFSET 8*3
121 #define HashKey 16*0 // store HashKey <<1 mod poly here
122 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
123 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
124 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
125 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
126 // bits of HashKey <<1 mod poly here
127 //(for Karatsuba purposes)
128 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
129 // bits of HashKey^2 <<1 mod poly here
130 // (for Karatsuba purposes)
131 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
132 // bits of HashKey^3 <<1 mod poly here
133 // (for Karatsuba purposes)
134 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
135 // bits of HashKey^4 <<1 mod poly here
136 // (for Karatsuba purposes)
137 #define VARIABLE_OFFSET 16*8
145 #define arg7 STACK_OFFSET+8(%r14)
146 #define arg8 STACK_OFFSET+16(%r14)
147 #define arg9 STACK_OFFSET+24(%r14)
148 #define arg10 STACK_OFFSET+32(%r14)
149 #define keysize 2*15*16(%arg1)
166 #define BSWAP_MASK %xmm10
170 #define GF128MUL_MASK %xmm10
200 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
203 * Input: A and B (128-bits each, bit-reflected)
204 * Output: C = A*B*x mod poly, (i.e. >>1 )
205 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
206 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
209 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
211 pshufd $78, \GH, \TMP2
212 pshufd $78, \HK, \TMP3
213 pxor \GH, \TMP2 # TMP2 = a1+a0
214 pxor \HK, \TMP3 # TMP3 = b1+b0
215 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
216 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
217 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (middle term)
221 pslldq $8, \TMP3 # left shift TMP3 2 DWs
222 psrldq $8, \TMP2 # right shift TMP2 2 DWs
pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
226 # first phase of the reduction
230 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
# in order to perform
pslld $31, \TMP2 # packed left shift <<31
pslld $30, \TMP3 # packed left shift <<30
pslld $25, \TMP4 # packed left shift <<25
236 pxor \TMP3, \TMP2 # xor the shifted versions
239 psrldq $4, \TMP5 # right shift TMP5 1 DW
240 pslldq $12, \TMP2 # left shift TMP2 3 DWs
243 # second phase of the reduction
245 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
# in order to perform
psrld $1,\TMP2 # packed right shift >>1
psrld $2,\TMP3 # packed right shift >>2
psrld $7,\TMP4 # packed right shift >>7
253 pxor \TMP3,\TMP2 # xor the shifted versions
pxor \TMP1, \GH # result is in GH
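/*
 * Worked Karatsuba identity behind the macro above ("+" is XOR, since
 * addition in GF(2^128) is carry-less). With A = a1:a0 and B = b1:b0
 * split into 64-bit halves:
 *
 *	A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
 *	    = a1*b1*x^128
 *	      + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64
 *	      + a0*b0
 *
 * so three PCLMULQDQs suffice instead of four. The 256-bit product is
 * then folded back into 128 bits by the two reduction phases above,
 * using poly = x^128 + x^127 + x^126 + x^121 + 1.
 */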
261 * if a = number of total plaintext bytes
263 * num_initial_blocks = b mod 4
264 * encrypt the initial num_initial_blocks blocks and apply ghash on
266 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
272 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
273 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
274 MOVADQ SHUF_MASK(%rip), %xmm14
275 mov arg7, %r10 # %r10 = AAD
276 mov arg8, %r12 # %r12 = aadLen
282 jl _get_AAD_rest8\num_initial_blocks\operation
283 _get_AAD_blocks\num_initial_blocks\operation:
284 movdqu (%r10), %xmm\i
285 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
287 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
292 jge _get_AAD_blocks\num_initial_blocks\operation
296 je _get_AAD_done\num_initial_blocks\operation
300 /* read the last <16B of AAD. since we have at least 4B of
301 data right after the AAD (the ICV, and maybe some CT), we can
302 read 4B/8B blocks safely, and then get rid of the extra stuff */
303 _get_AAD_rest8\num_initial_blocks\operation:
305 jle _get_AAD_rest4\num_initial_blocks\operation
312 jmp _get_AAD_rest8\num_initial_blocks\operation
313 _get_AAD_rest4\num_initial_blocks\operation:
315 jle _get_AAD_rest0\num_initial_blocks\operation
323 _get_AAD_rest0\num_initial_blocks\operation:
324 /* finalize: shift out the extra bytes we read, and align
325 left. since pslldq can only shift by an immediate, we use
326 vpshufb and an array of shuffle masks */
329 movdqu aad_shift_arr(%r11), \TMP1
330 PSHUFB_XMM \TMP1, %xmm\i
331 _get_AAD_rest_final\num_initial_blocks\operation:
332 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
334 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
336 _get_AAD_done\num_initial_blocks\operation:
337 xor %r11, %r11 # initialise the data pointer offset as zero
338 # start AES for num_initial_blocks blocks
340 mov %arg5, %rax # %rax = *Y0
341 movdqu (%rax), \XMM0 # XMM0 = Y0
342 PSHUFB_XMM %xmm14, \XMM0
344 .if (\i == 5) || (\i == 6) || (\i == 7)
MOVADQ ONE(%rip), \TMP1
348 paddd \TMP1, \XMM0 # INCR Y0
349 movdqa \XMM0, %xmm\index
350 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
351 pxor \TMP2, %xmm\index
355 shr $2,%eax # 128->4, 192->6, 256->8
356 add $5,%eax # 128->9, 192->11, 256->13
358 aes_loop_initial_dec\num_initial_blocks:
361 AESENC \TMP1, %xmm\index
365 jnz aes_loop_initial_dec\num_initial_blocks
369 AESENCLAST \TMP1, %xmm\index # Last Round
372 movdqu (%arg3 , %r11, 1), \TMP1
373 pxor \TMP1, %xmm\index
374 movdqu %xmm\index, (%arg2 , %r11, 1)
375 # write back plaintext/ciphertext for num_initial_blocks
378 movdqa \TMP1, %xmm\index
379 PSHUFB_XMM %xmm14, %xmm\index
380 # prepare plaintext/ciphertext for GHASH computation
384 # apply GHASH on num_initial_blocks blocks
388 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
390 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
392 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
395 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
397 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
400 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
403 jl _initial_blocks_done\num_initial_blocks\operation
404 # no need for precomputed values
407 * Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
410 MOVADQ ONE(%rip), \TMP1
411 paddd \TMP1, \XMM0 # INCR Y0
413 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
415 paddd \TMP1, \XMM0 # INCR Y0
417 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
419 paddd \TMP1, \XMM0 # INCR Y0
421 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
423 paddd \TMP1, \XMM0 # INCR Y0
425 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
427 MOVADQ 0(%arg1),\TMP1
433 pshufd $78, \TMP3, \TMP1
435 movdqa \TMP1, HashKey_k(%rsp)
436 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
437 # TMP5 = HashKey^2<<1 (mod poly)
438 movdqa \TMP5, HashKey_2(%rsp)
439 # HashKey_2 = HashKey^2<<1 (mod poly)
440 pshufd $78, \TMP5, \TMP1
442 movdqa \TMP1, HashKey_2_k(%rsp)
443 .irpc index, 1234 # do 4 rounds
444 movaps 0x10*\index(%arg1), \TMP1
450 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
451 # TMP5 = HashKey^3<<1 (mod poly)
452 movdqa \TMP5, HashKey_3(%rsp)
453 pshufd $78, \TMP5, \TMP1
455 movdqa \TMP1, HashKey_3_k(%rsp)
456 .irpc index, 56789 # do next 5 rounds
457 movaps 0x10*\index(%arg1), \TMP1
463 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
465 movdqa \TMP5, HashKey_4(%rsp)
466 pshufd $78, \TMP5, \TMP1
468 movdqa \TMP1, HashKey_4_k(%rsp)
471 shr $2,%eax # 128->4, 192->6, 256->8
472 sub $4,%eax # 128->0, 192->2, 256->4
473 jz aes_loop_pre_dec_done\num_initial_blocks
475 aes_loop_pre_dec\num_initial_blocks:
478 AESENC \TMP2, %xmm\index
482 jnz aes_loop_pre_dec\num_initial_blocks
484 aes_loop_pre_dec_done\num_initial_blocks:
486 AESENCLAST \TMP2, \XMM1
487 AESENCLAST \TMP2, \XMM2
488 AESENCLAST \TMP2, \XMM3
489 AESENCLAST \TMP2, \XMM4
490 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
492 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
494 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
496 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
498 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
500 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
502 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
504 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
507 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
509 # combine GHASHed value with the corresponding ciphertext
510 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
511 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
512 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
514 _initial_blocks_done\num_initial_blocks\operation:
520 * if a = number of total plaintext bytes
522 * num_initial_blocks = b mod 4
523 * encrypt the initial num_initial_blocks blocks and apply ghash on
525 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
531 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
532 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
533 MOVADQ SHUF_MASK(%rip), %xmm14
534 mov arg7, %r10 # %r10 = AAD
535 mov arg8, %r12 # %r12 = aadLen
541 jl _get_AAD_rest8\num_initial_blocks\operation
542 _get_AAD_blocks\num_initial_blocks\operation:
543 movdqu (%r10), %xmm\i
544 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
546 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 jge _get_AAD_blocks\num_initial_blocks\operation
555 je _get_AAD_done\num_initial_blocks\operation
559 /* read the last <16B of AAD. since we have at least 4B of
560 data right after the AAD (the ICV, and maybe some PT), we can
561 read 4B/8B blocks safely, and then get rid of the extra stuff */
562 _get_AAD_rest8\num_initial_blocks\operation:
564 jle _get_AAD_rest4\num_initial_blocks\operation
571 jmp _get_AAD_rest8\num_initial_blocks\operation
572 _get_AAD_rest4\num_initial_blocks\operation:
574 jle _get_AAD_rest0\num_initial_blocks\operation
582 _get_AAD_rest0\num_initial_blocks\operation:
583 /* finalize: shift out the extra bytes we read, and align
584 left. since pslldq can only shift by an immediate, we use
585 vpshufb and an array of shuffle masks */
588 movdqu aad_shift_arr(%r11), \TMP1
589 PSHUFB_XMM \TMP1, %xmm\i
590 _get_AAD_rest_final\num_initial_blocks\operation:
591 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
593 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
595 _get_AAD_done\num_initial_blocks\operation:
596 xor %r11, %r11 # initialise the data pointer offset as zero
597 # start AES for num_initial_blocks blocks
599 mov %arg5, %rax # %rax = *Y0
600 movdqu (%rax), \XMM0 # XMM0 = Y0
601 PSHUFB_XMM %xmm14, \XMM0
603 .if (\i == 5) || (\i == 6) || (\i == 7)
MOVADQ ONE(%rip), \TMP1
606 MOVADQ 0(%arg1),\TMP2
608 paddd \TMP1, \XMM0 # INCR Y0
609 MOVADQ \XMM0, %xmm\index
610 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
611 pxor \TMP2, %xmm\index
615 shr $2,%eax # 128->4, 192->6, 256->8
616 add $5,%eax # 128->9, 192->11, 256->13
618 aes_loop_initial_enc\num_initial_blocks:
621 AESENC \TMP1, %xmm\index
625 jnz aes_loop_initial_enc\num_initial_blocks
629 AESENCLAST \TMP1, %xmm\index # Last Round
632 movdqu (%arg3 , %r11, 1), \TMP1
633 pxor \TMP1, %xmm\index
634 movdqu %xmm\index, (%arg2 , %r11, 1)
635 # write back plaintext/ciphertext for num_initial_blocks
637 PSHUFB_XMM %xmm14, %xmm\index
639 # prepare plaintext/ciphertext for GHASH computation
643 # apply GHASH on num_initial_blocks blocks
647 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
649 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
651 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
654 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
656 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
659 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
662 jl _initial_blocks_done\num_initial_blocks\operation
663 # no need for precomputed values
666 * Precomputations for HashKey parallel with encryption of first 4 blocks.
* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
MOVADQ ONE(%rip), \TMP1
670 paddd \TMP1, \XMM0 # INCR Y0
672 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
674 paddd \TMP1, \XMM0 # INCR Y0
676 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
678 paddd \TMP1, \XMM0 # INCR Y0
680 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
682 paddd \TMP1, \XMM0 # INCR Y0
684 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
686 MOVADQ 0(%arg1),\TMP1
692 pshufd $78, \TMP3, \TMP1
694 movdqa \TMP1, HashKey_k(%rsp)
695 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
696 # TMP5 = HashKey^2<<1 (mod poly)
697 movdqa \TMP5, HashKey_2(%rsp)
698 # HashKey_2 = HashKey^2<<1 (mod poly)
699 pshufd $78, \TMP5, \TMP1
701 movdqa \TMP1, HashKey_2_k(%rsp)
702 .irpc index, 1234 # do 4 rounds
703 movaps 0x10*\index(%arg1), \TMP1
709 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
710 # TMP5 = HashKey^3<<1 (mod poly)
711 movdqa \TMP5, HashKey_3(%rsp)
712 pshufd $78, \TMP5, \TMP1
714 movdqa \TMP1, HashKey_3_k(%rsp)
715 .irpc index, 56789 # do next 5 rounds
716 movaps 0x10*\index(%arg1), \TMP1
722 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
724 movdqa \TMP5, HashKey_4(%rsp)
725 pshufd $78, \TMP5, \TMP1
727 movdqa \TMP1, HashKey_4_k(%rsp)
730 shr $2,%eax # 128->4, 192->6, 256->8
731 sub $4,%eax # 128->0, 192->2, 256->4
732 jz aes_loop_pre_enc_done\num_initial_blocks
734 aes_loop_pre_enc\num_initial_blocks:
737 AESENC \TMP2, %xmm\index
741 jnz aes_loop_pre_enc\num_initial_blocks
743 aes_loop_pre_enc_done\num_initial_blocks:
745 AESENCLAST \TMP2, \XMM1
746 AESENCLAST \TMP2, \XMM2
747 AESENCLAST \TMP2, \XMM3
748 AESENCLAST \TMP2, \XMM4
749 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
751 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
753 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
755 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
757 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
758 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
759 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
760 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
763 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
765 # combine GHASHed value with the corresponding ciphertext
766 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
767 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
768 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
770 _initial_blocks_done\num_initial_blocks\operation:
775 * encrypt 4 blocks at a time
776 * ghash the 4 previously encrypted ciphertext blocks
777 * arg1, %arg2, %arg3 are used as pointers only, not modified
778 * %r11 is the data offset value
780 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
781 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
788 movdqa SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using Karatsuba
792 pshufd $78, \XMM5, \TMP6
794 paddd ONE(%rip), \XMM0 # INCR CNT
795 movdqa HashKey_4(%rsp), \TMP5
796 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
798 paddd ONE(%rip), \XMM0 # INCR CNT
800 paddd ONE(%rip), \XMM0 # INCR CNT
802 paddd ONE(%rip), \XMM0 # INCR CNT
804 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
805 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
806 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
807 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
808 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814 movdqa HashKey_4_k(%rsp), \TMP5
815 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
816 movaps 0x10(%arg1), \TMP1
817 AESENC \TMP1, \XMM1 # Round 1
821 movaps 0x20(%arg1), \TMP1
822 AESENC \TMP1, \XMM1 # Round 2
827 pshufd $78, \XMM6, \TMP2
829 movdqa HashKey_3(%rsp), \TMP5
830 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
831 movaps 0x30(%arg1), \TMP3
832 AESENC \TMP3, \XMM1 # Round 3
836 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
837 movaps 0x40(%arg1), \TMP3
838 AESENC \TMP3, \XMM1 # Round 4
842 movdqa HashKey_3_k(%rsp), \TMP5
843 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
844 movaps 0x50(%arg1), \TMP3
845 AESENC \TMP3, \XMM1 # Round 5
850 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
854 pshufd $78, \XMM7, \TMP2
856 movdqa HashKey_2(%rsp ), \TMP5
# Multiply TMP5 * HashKey using Karatsuba
860 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
861 movaps 0x60(%arg1), \TMP3
862 AESENC \TMP3, \XMM1 # Round 6
866 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
867 movaps 0x70(%arg1), \TMP3
868 AESENC \TMP3, \XMM1 # Round 7
872 movdqa HashKey_2_k(%rsp), \TMP5
873 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
874 movaps 0x80(%arg1), \TMP3
875 AESENC \TMP3, \XMM1 # Round 8
880 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
884 # Multiply XMM8 * HashKey
885 # XMM8 and TMP5 hold the values for the two operands
888 pshufd $78, \XMM8, \TMP2
890 movdqa HashKey(%rsp), \TMP5
891 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
892 movaps 0x90(%arg1), \TMP3
893 AESENC \TMP3, \XMM1 # Round 9
897 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
900 shr $2,%eax # 128->4, 192->6, 256->8
901 sub $4,%eax # 128->0, 192->2, 256->4
902 jz aes_loop_par_enc_done
907 AESENC \TMP3, %xmm\index
913 aes_loop_par_enc_done:
915 AESENCLAST \TMP3, \XMM1 # Round 10
916 AESENCLAST \TMP3, \XMM2
917 AESENCLAST \TMP3, \XMM3
918 AESENCLAST \TMP3, \XMM4
919 movdqa HashKey_k(%rsp), \TMP5
920 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
921 movdqu (%arg3,%r11,1), \TMP3
922 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
923 movdqu 16(%arg3,%r11,1), \TMP3
924 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
925 movdqu 32(%arg3,%r11,1), \TMP3
926 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
927 movdqu 48(%arg3,%r11,1), \TMP3
928 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
929 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
930 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
931 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
932 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
933 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
934 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
935 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
936 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
944 pslldq $8, \TMP3 # left shift TMP3 2 DWs
945 psrldq $8, \TMP2 # right shift TMP2 2 DWs
947 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
949 # first phase of reduction
954 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
958 pxor \TMP3, \TMP2 # xor the shifted versions
961 psrldq $4, \TMP5 # right shift T5 1 DW
962 pslldq $12, \TMP2 # left shift T2 3 DWs
965 # second phase of reduction
967 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
psrld $1, \TMP2 # packed right shift >>1
psrld $2, \TMP3 # packed right shift >>2
psrld $7, \TMP4 # packed right shift >>7
973 pxor \TMP3,\TMP2 # xor the shifted versions
pxor \TMP1, \XMM5 # result is in XMM5
983 * decrypt 4 blocks at a time
984 * ghash the 4 previously decrypted ciphertext blocks
985 * arg1, %arg2, %arg3 are used as pointers only, not modified
986 * %r11 is the data offset value
988 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
989 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
996 movdqa SHUF_MASK(%rip), %xmm15
# multiply TMP5 * HashKey using Karatsuba
1000 pshufd $78, \XMM5, \TMP6
1002 paddd ONE(%rip), \XMM0 # INCR CNT
1003 movdqa HashKey_4(%rsp), \TMP5
1004 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1006 paddd ONE(%rip), \XMM0 # INCR CNT
1008 paddd ONE(%rip), \XMM0 # INCR CNT
1010 paddd ONE(%rip), \XMM0 # INCR CNT
1012 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1013 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1014 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1015 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1016 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1022 movdqa HashKey_4_k(%rsp), \TMP5
1023 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1024 movaps 0x10(%arg1), \TMP1
1025 AESENC \TMP1, \XMM1 # Round 1
1029 movaps 0x20(%arg1), \TMP1
1030 AESENC \TMP1, \XMM1 # Round 2
1035 pshufd $78, \XMM6, \TMP2
1037 movdqa HashKey_3(%rsp), \TMP5
1038 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1039 movaps 0x30(%arg1), \TMP3
1040 AESENC \TMP3, \XMM1 # Round 3
1044 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1045 movaps 0x40(%arg1), \TMP3
1046 AESENC \TMP3, \XMM1 # Round 4
1050 movdqa HashKey_3_k(%rsp), \TMP5
1051 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1052 movaps 0x50(%arg1), \TMP3
1053 AESENC \TMP3, \XMM1 # Round 5
1058 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1062 pshufd $78, \XMM7, \TMP2
1064 movdqa HashKey_2(%rsp ), \TMP5
# Multiply TMP5 * HashKey using Karatsuba
1068 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1069 movaps 0x60(%arg1), \TMP3
1070 AESENC \TMP3, \XMM1 # Round 6
1074 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1075 movaps 0x70(%arg1), \TMP3
1076 AESENC \TMP3, \XMM1 # Round 7
1080 movdqa HashKey_2_k(%rsp), \TMP5
1081 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 movaps 0x80(%arg1), \TMP3
1083 AESENC \TMP3, \XMM1 # Round 8
1088 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1092 # Multiply XMM8 * HashKey
1093 # XMM8 and TMP5 hold the values for the two operands
1096 pshufd $78, \XMM8, \TMP2
1098 movdqa HashKey(%rsp), \TMP5
1099 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1100 movaps 0x90(%arg1), \TMP3
1101 AESENC \TMP3, \XMM1 # Round 9
1105 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1106 lea 0xa0(%arg1),%r10
1108 shr $2,%eax # 128->4, 192->6, 256->8
1109 sub $4,%eax # 128->0, 192->2, 256->4
1110 jz aes_loop_par_dec_done
1115 AESENC \TMP3, %xmm\index
1119 jnz aes_loop_par_dec
1121 aes_loop_par_dec_done:
1122 MOVADQ (%r10), \TMP3
1123 AESENCLAST \TMP3, \XMM1 # last round
1124 AESENCLAST \TMP3, \XMM2
1125 AESENCLAST \TMP3, \XMM3
1126 AESENCLAST \TMP3, \XMM4
1127 movdqa HashKey_k(%rsp), \TMP5
1128 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1129 movdqu (%arg3,%r11,1), \TMP3
1130 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1131 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1133 movdqu 16(%arg3,%r11,1), \TMP3
1134 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1135 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1137 movdqu 32(%arg3,%r11,1), \TMP3
1138 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1139 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1141 movdqu 48(%arg3,%r11,1), \TMP3
1142 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1143 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1145 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1146 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1147 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1148 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1156 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1157 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1159 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1161 # first phase of reduction
1166 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
1170 pxor \TMP3, \TMP2 # xor the shifted versions
1173 psrldq $4, \TMP5 # right shift T5 1 DW
1174 pslldq $12, \TMP2 # left shift T2 3 DWs
1177 # second phase of reduction
1179 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
psrld $1, \TMP2 # packed right shift >>1
psrld $2, \TMP3 # packed right shift >>2
psrld $7, \TMP4 # packed right shift >>7
1185 pxor \TMP3,\TMP2 # xor the shifted versions
pxor \TMP1, \XMM5 # result is in XMM5
1194 /* GHASH the last 4 ciphertext blocks. */
1195 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1196 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
# Multiply XMM1 * HashKey^4 (using Karatsuba)
1201 pshufd $78, \XMM1, \TMP2
1203 movdqa HashKey_4(%rsp), \TMP5
1204 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1205 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1206 movdqa HashKey_4_k(%rsp), \TMP4
1207 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1208 movdqa \XMM1, \XMMDst
1209 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
# Multiply XMM2 * HashKey^3 (using Karatsuba)
1214 pshufd $78, \XMM2, \TMP2
1216 movdqa HashKey_3(%rsp), \TMP5
1217 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1218 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1219 movdqa HashKey_3_k(%rsp), \TMP4
1220 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1224 # results accumulated in TMP6, XMMDst, XMM1
# Multiply XMM3 * HashKey^2 (using Karatsuba)
1229 pshufd $78, \XMM3, \TMP2
1231 movdqa HashKey_2(%rsp), \TMP5
1232 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1233 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1234 movdqa HashKey_2_k(%rsp), \TMP4
1235 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1238 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
# Multiply XMM4 * HashKey (using Karatsuba)
1242 pshufd $78, \XMM4, \TMP2
1244 movdqa HashKey(%rsp), \TMP5
1245 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1246 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1247 movdqa HashKey_k(%rsp), \TMP4
1248 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
# middle section of the temp results combined as in the Karatsuba algorithm
1256 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1257 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1260 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1261 # first phase of the reduction
1262 movdqa \XMMDst, \TMP2
1263 movdqa \XMMDst, \TMP3
1264 movdqa \XMMDst, \TMP4
1265 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
pslld $31, \TMP2 # packed left shift << 31
pslld $30, \TMP3 # packed left shift << 30
pslld $25, \TMP4 # packed left shift << 25
1269 pxor \TMP3, \TMP2 # xor the shifted versions
1272 psrldq $4, \TMP7 # right shift TMP7 1 DW
1273 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1276 # second phase of the reduction
1277 movdqa \XMMDst, \TMP2
1278 # make 3 copies of XMMDst for doing 3 shift operations
1279 movdqa \XMMDst, \TMP3
1280 movdqa \XMMDst, \TMP4
psrld $1, \TMP2 # packed right shift >> 1
psrld $2, \TMP3 # packed right shift >> 2
psrld $7, \TMP4 # packed right shift >> 7
1284 pxor \TMP3, \TMP2 # xor the shifted versions
1288 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1292 /* Encryption of a single block
1296 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1300 shr $2,%eax # 128->4, 192->6, 256->8
1301 add $5,%eax # 128->9, 192->11, 256->13
1302 lea 16(%arg1), %r10 # get first expanded key address
1312 AESENCLAST \TMP1,\XMM0
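/*
 * The round count above comes straight from the key length stored in
 * the context (the keysize field holds it in bytes); a one-line C
 * sketch of the mapping:
 *
 *	// 16 -> 9, 24 -> 11, 32 -> 13 AESENC rounds, framed by the
 *	// initial whitening XOR and the final AESENCLAST
 *	static int aesenc_rounds(int key_len_bytes)
 *	{
 *		return (key_len_bytes >> 2) + 5;
 *	}
 */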
1314 /*****************************************************************************
1315 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1316 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1317 * const u8 *in, // Ciphertext input
1318 * u64 plaintext_len, // Length of data in bytes for decryption.
1319 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1320 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1321 * // concatenated with 0x00000001. 16-byte aligned pointer.
1322 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1323 * const u8 *aad, // Additional Authentication Data (AAD)
1324 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1325 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1326 * // given authentication tag and only return the plaintext if they match.
1327 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1328 * // (most likely), 12 or 8.
1333 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1334 * set of 11 keys in the data structure void *aes_ctx
1338 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1339 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1340 * | Salt (From the SA) |
1341 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1342 * | Initialization Vector |
1343 * | (This is the sequence number from IPSec header) |
1344 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1346 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1351 * AAD padded to 128 bits with 0
1352 * for example, assume AAD is a u32 vector
1354 * if AAD is 8 bytes:
* AAD[2] = {A0, A1};
1356 * padded AAD in xmm register = {A1 A0 0 0}
1359 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1360 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1362 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1363 * | 32-bit Sequence Number (A0) |
1364 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1366 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1368 * AAD Format with 32-bit Sequence Number
1370 * if AAD is 12 bytes:
1371 * AAD[3] = {A0, A1, A2};
1372 * padded AAD in xmm register = {A2 A1 A0 0}
1375 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1376 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1377 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1378 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1380 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | 64-bit Extended Sequence Number {A1,A2} |
1383 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1385 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1387 * AAD Format with 64-bit Extended Sequence Number
1390 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
* The code also supports 16, but it will fail for any other size.
1394 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1395 * For other sizes, the code will fail.
1397 * poly = x^128 + x^127 + x^126 + x^121 + 1
1399 *****************************************************************************/
1400 ENTRY(aesni_gcm_dec)
1406 * states of %xmm registers %xmm6:%xmm15 not saved
1407 * all %xmm registers are clobbered
1409 sub $VARIABLE_OFFSET, %rsp
1410 and $~63, %rsp # align rsp to 64 bytes
1412 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1413 movdqa SHUF_MASK(%rip), %xmm2
1414 PSHUFB_XMM %xmm2, %xmm13
1417 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1419 movdqa %xmm13, %xmm2
1429 pshufd $0x24, %xmm1, %xmm2
1430 pcmpeqd TWOONE(%rip), %xmm2
1431 pand POLY(%rip), %xmm2
1432 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1435 # Decrypt first few blocks
1437 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1438 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1439 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1442 jz _initial_num_blocks_is_0_decrypt
1444 jb _initial_num_blocks_is_1_decrypt
1445 je _initial_num_blocks_is_2_decrypt
1446 _initial_num_blocks_is_3_decrypt:
1447 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1448 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1450 jmp _initial_blocks_decrypted
1451 _initial_num_blocks_is_2_decrypt:
1452 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1453 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1455 jmp _initial_blocks_decrypted
1456 _initial_num_blocks_is_1_decrypt:
1457 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1458 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1460 jmp _initial_blocks_decrypted
1461 _initial_num_blocks_is_0_decrypt:
1462 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1463 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1464 _initial_blocks_decrypted:
1466 je _zero_cipher_left_decrypt
1468 je _four_cipher_left_decrypt
1470 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1471 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1475 _four_cipher_left_decrypt:
1476 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1477 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1478 _zero_cipher_left_decrypt:
1480 and $15, %r13 # %r13 = arg4 (mod 16)
1481 je _multiple_of_16_bytes_decrypt
1483 # Handle the last <16 byte block separately
1485 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1486 movdqa SHUF_MASK(%rip), %xmm10
1487 PSHUFB_XMM %xmm10, %xmm0
1489 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1492 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1493 lea SHIFT_MASK+16(%rip), %r12
1495 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1496 # (%r13 is the number of bytes in plaintext mod 16)
1497 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1501 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1502 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1503 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1504 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1506 movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm2
1510 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1511 # GHASH computation for the last <16 byte block
1516 MOVQ_R64_XMM %xmm0, %rax
1518 jle _less_than_8_bytes_left_decrypt
1519 mov %rax, (%arg2 , %r11, 1)
1522 MOVQ_R64_XMM %xmm0, %rax
1524 _less_than_8_bytes_left_decrypt:
1525 mov %al, (%arg2, %r11, 1)
1529 jne _less_than_8_bytes_left_decrypt
1530 _multiple_of_16_bytes_decrypt:
mov arg8, %r12 # %r12 = aadLen (number of bytes)
1532 shl $3, %r12 # convert into number of bits
1533 movd %r12d, %xmm15 # len(A) in %xmm15
shl $3, %arg4 # len(C) in bits (*8)
1535 MOVQ_R64_XMM %arg4, %xmm1
1536 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1537 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1539 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1540 # final GHASH computation
1541 movdqa SHUF_MASK(%rip), %xmm10
1542 PSHUFB_XMM %xmm10, %xmm8
1544 mov %arg5, %rax # %rax = *Y0
1545 movdqu (%rax), %xmm0 # %xmm0 = Y0
1546 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1549 mov arg9, %r10 # %r10 = authTag
1550 mov arg10, %r11 # %r11 = auth_tag_len
1556 MOVQ_R64_XMM %xmm0, %rax
1562 je _return_T_done_decrypt
1570 je _return_T_done_decrypt
1577 je _return_T_done_decrypt
1582 jmp _return_T_done_decrypt
1584 movdqu %xmm0, (%r10)
1585 _return_T_done_decrypt:
1591 ENDPROC(aesni_gcm_dec)
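/*
 * Tag math for reference: with S = GHASH_H(AAD || 0-pad || C || 0-pad
 * || len(AAD) || len(C)) accumulated as above, the returned tag is
 *
 *	T = E(K, Y0) XOR S
 *
 * truncated to auth_tag_len bytes.
 */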
1594 /*****************************************************************************
1595 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1596 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1597 * const u8 *in, // Plaintext input
1598 * u64 plaintext_len, // Length of data in bytes for encryption.
1599 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1600 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1601 * // concatenated with 0x00000001. 16-byte aligned pointer.
1602 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1603 * const u8 *aad, // Additional Authentication Data (AAD)
1604 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1605 * u8 *auth_tag, // Authenticated Tag output.
1606 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1612 * keys are pre-expanded and aligned to 16 bytes. we are using the
1613 * first set of 11 keys in the data structure void *aes_ctx
1618 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1619 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1620 * | Salt (From the SA) |
1621 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1622 * | Initialization Vector |
1623 * | (This is the sequence number from IPSec header) |
1624 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1626 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1631 * AAD padded to 128 bits with 0
1632 * for example, assume AAD is a u32 vector
1634 * if AAD is 8 bytes:
* AAD[2] = {A0, A1};
1636 * padded AAD in xmm register = {A1 A0 0 0}
1639 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1640 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1642 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643 * | 32-bit Sequence Number (A0) |
1644 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1646 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1648 * AAD Format with 32-bit Sequence Number
1650 * if AAD is 12 bytes:
1651 * AAD[3] = {A0, A1, A2};
1652 * padded AAD in xmm register = {A2 A1 A0 0}
1655 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1656 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | 64-bit Extended Sequence Number {A1,A2} |
1661 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1665 * AAD Format with 64-bit Extended Sequence Number
1668 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
* The code also supports 16, but it will fail for any other size.
1672 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1673 * For other sizes, the code will fail.
1675 * poly = x^128 + x^127 + x^126 + x^121 + 1
1676 ***************************************************************************/
1677 ENTRY(aesni_gcm_enc)
1683 # states of %xmm registers %xmm6:%xmm15 not saved
1684 # all %xmm registers are clobbered
1686 sub $VARIABLE_OFFSET, %rsp
1689 movdqu (%r12), %xmm13
1690 movdqa SHUF_MASK(%rip), %xmm2
1691 PSHUFB_XMM %xmm2, %xmm13
1694 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1696 movdqa %xmm13, %xmm2
1706 pshufd $0x24, %xmm1, %xmm2
1707 pcmpeqd TWOONE(%rip), %xmm2
1708 pand POLY(%rip), %xmm2
movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1715 # Encrypt first few blocks
1718 jz _initial_num_blocks_is_0_encrypt
1720 jb _initial_num_blocks_is_1_encrypt
1721 je _initial_num_blocks_is_2_encrypt
1722 _initial_num_blocks_is_3_encrypt:
1723 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1724 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1726 jmp _initial_blocks_encrypted
1727 _initial_num_blocks_is_2_encrypt:
1728 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1729 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1731 jmp _initial_blocks_encrypted
1732 _initial_num_blocks_is_1_encrypt:
1733 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1734 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1736 jmp _initial_blocks_encrypted
1737 _initial_num_blocks_is_0_encrypt:
1738 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1739 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1740 _initial_blocks_encrypted:
1742 # Main loop - Encrypt remaining blocks
1745 je _zero_cipher_left_encrypt
1747 je _four_cipher_left_encrypt
1748 _encrypt_by_4_encrypt:
1749 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1750 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1753 jne _encrypt_by_4_encrypt
1754 _four_cipher_left_encrypt:
1755 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1756 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1757 _zero_cipher_left_encrypt:
1759 and $15, %r13 # %r13 = arg4 (mod 16)
1760 je _multiple_of_16_bytes_encrypt
1762 # Handle the last <16 Byte block separately
1763 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1764 movdqa SHUF_MASK(%rip), %xmm10
1765 PSHUFB_XMM %xmm10, %xmm0
1768 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1772 lea SHIFT_MASK+16(%rip), %r12
1774 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1775 # (%r13 is the number of bytes in plaintext mod 16)
1776 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
1778 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1779 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1780 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1781 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1782 movdqa SHUF_MASK(%rip), %xmm10
1783 PSHUFB_XMM %xmm10,%xmm0
1786 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1787 # GHASH computation for the last <16 byte block
1791 movdqa SHUF_MASK(%rip), %xmm10
1792 PSHUFB_XMM %xmm10, %xmm0
1794 # shuffle xmm0 back to output as ciphertext
1797 MOVQ_R64_XMM %xmm0, %rax
1799 jle _less_than_8_bytes_left_encrypt
1800 mov %rax, (%arg2 , %r11, 1)
1803 MOVQ_R64_XMM %xmm0, %rax
1805 _less_than_8_bytes_left_encrypt:
1806 mov %al, (%arg2, %r11, 1)
1810 jne _less_than_8_bytes_left_encrypt
1811 _multiple_of_16_bytes_encrypt:
mov arg8, %r12 # %r12 = aadLen (number of bytes)
1814 movd %r12d, %xmm15 # len(A) in %xmm15
shl $3, %arg4 # len(C) in bits (*8)
1816 MOVQ_R64_XMM %arg4, %xmm1
1817 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1818 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1820 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1821 # final GHASH computation
1822 movdqa SHUF_MASK(%rip), %xmm10
1823 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1825 mov %arg5, %rax # %rax = *Y0
1826 movdqu (%rax), %xmm0 # %xmm0 = Y0
1827 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1830 mov arg9, %r10 # %r10 = authTag
1831 mov arg10, %r11 # %r11 = auth_tag_len
1837 MOVQ_R64_XMM %xmm0, %rax
1843 je _return_T_done_encrypt
1851 je _return_T_done_encrypt
1858 je _return_T_done_encrypt
1863 jmp _return_T_done_encrypt
1865 movdqu %xmm0, (%r10)
1866 _return_T_done_encrypt:
1872 ENDPROC(aesni_gcm_enc)
1879 _key_expansion_256a:
1880 pshufd $0b11111111, %xmm1, %xmm1
1881 shufps $0b00010000, %xmm0, %xmm4
1883 shufps $0b10001100, %xmm0, %xmm4
1886 movaps %xmm0, (TKEYP)
1889 ENDPROC(_key_expansion_128)
1890 ENDPROC(_key_expansion_256a)
1893 _key_expansion_192a:
1894 pshufd $0b01010101, %xmm1, %xmm1
1895 shufps $0b00010000, %xmm0, %xmm4
1897 shufps $0b10001100, %xmm0, %xmm4
1904 pshufd $0b11111111, %xmm0, %xmm3
1909 shufps $0b01000100, %xmm0, %xmm6
1910 movaps %xmm6, (TKEYP)
1911 shufps $0b01001110, %xmm2, %xmm1
1912 movaps %xmm1, 0x10(TKEYP)
1915 ENDPROC(_key_expansion_192a)
1918 _key_expansion_192b:
1919 pshufd $0b01010101, %xmm1, %xmm1
1920 shufps $0b00010000, %xmm0, %xmm4
1922 shufps $0b10001100, %xmm0, %xmm4
1928 pshufd $0b11111111, %xmm0, %xmm3
1932 movaps %xmm0, (TKEYP)
1935 ENDPROC(_key_expansion_192b)
1938 _key_expansion_256b:
1939 pshufd $0b10101010, %xmm1, %xmm1
1940 shufps $0b00010000, %xmm2, %xmm4
1942 shufps $0b10001100, %xmm2, %xmm4
1945 movaps %xmm2, (TKEYP)
1948 ENDPROC(_key_expansion_256b)
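/*
 * The shufps/pxor pairs in the expansion helpers above build a
 * prefix-XOR of the previous round key. A rough C model of one AES-128
 * schedule step (w[] is the previous round key as four 32-bit words; t
 * is lane 3 of the AESKEYGENASSIST result, i.e.
 * SubWord(RotWord(w[3])) ^ rcon):
 *
 *	out[0] = w[0] ^ t;
 *	out[1] = w[1] ^ w[0] ^ t;
 *	out[2] = w[2] ^ w[1] ^ w[0] ^ t;
 *	out[3] = w[3] ^ w[2] ^ w[1] ^ w[0] ^ t;
 */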
1951 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1952 * unsigned int key_len)
1954 ENTRY(aesni_set_key)
1958 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1959 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1960 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1962 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1963 movaps %xmm0, (KEYP)
1964 lea 0x10(KEYP), TKEYP # key addr
1965 movl %edx, 480(KEYP)
1966 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1970 movups 0x10(UKEYP), %xmm2 # other user key
1971 movaps %xmm2, (TKEYP)
1973 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1974 call _key_expansion_256a
1975 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1976 call _key_expansion_256b
1977 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1978 call _key_expansion_256a
1979 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1980 call _key_expansion_256b
1981 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1982 call _key_expansion_256a
1983 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1984 call _key_expansion_256b
1985 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1986 call _key_expansion_256a
1987 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1988 call _key_expansion_256b
1989 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1990 call _key_expansion_256a
1991 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1992 call _key_expansion_256b
1993 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1994 call _key_expansion_256a
1995 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1996 call _key_expansion_256b
1997 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1998 call _key_expansion_256a
2001 movq 0x10(UKEYP), %xmm2 # other user key
2002 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
2003 call _key_expansion_192a
2004 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
2005 call _key_expansion_192b
2006 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
2007 call _key_expansion_192a
2008 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
2009 call _key_expansion_192b
2010 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
2011 call _key_expansion_192a
2012 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
2013 call _key_expansion_192b
2014 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
2015 call _key_expansion_192a
2016 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
2017 call _key_expansion_192b
2020 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
2021 call _key_expansion_128
2022 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
2023 call _key_expansion_128
2024 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
2025 call _key_expansion_128
2026 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
2027 call _key_expansion_128
2028 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
2029 call _key_expansion_128
2030 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
2031 call _key_expansion_128
2032 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
2033 call _key_expansion_128
2034 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
2035 call _key_expansion_128
2036 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
2037 call _key_expansion_128
2038 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
2039 call _key_expansion_128
2042 movaps (KEYP), %xmm0
2043 movaps (TKEYP), %xmm1
2044 movaps %xmm0, 240(TKEYP)
2045 movaps %xmm1, 240(KEYP)
2047 lea 240-16(TKEYP), UKEYP
2050 movaps (KEYP), %xmm0
2052 movaps %xmm1, (UKEYP)
2063 ENDPROC(aesni_set_key)
2066 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2073 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2074 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2075 movl (FRAME_OFFSET+20)(%esp), INP # src
2077 movl 480(KEYP), KLEN # key length
2078 movups (INP), STATE # input
2080 movups STATE, (OUTP) # output
2090 * _aesni_enc1: internal ABI
2092 * KEYP: key struct pointer
2094 * STATE: initial state (input)
* STATE: final state (output)
2103 movaps (KEYP), KEY # key
2105 pxor KEY, STATE # round 0
2109 lea 0x20(TKEYP), TKEYP
2112 movaps -0x60(TKEYP), KEY
2114 movaps -0x50(TKEYP), KEY
2118 movaps -0x40(TKEYP), KEY
2120 movaps -0x30(TKEYP), KEY
2124 movaps -0x20(TKEYP), KEY
2126 movaps -0x10(TKEYP), KEY
2130 movaps 0x10(TKEYP), KEY
2132 movaps 0x20(TKEYP), KEY
2134 movaps 0x30(TKEYP), KEY
2136 movaps 0x40(TKEYP), KEY
2138 movaps 0x50(TKEYP), KEY
2140 movaps 0x60(TKEYP), KEY
2142 movaps 0x70(TKEYP), KEY
2143 AESENCLAST KEY STATE
2145 ENDPROC(_aesni_enc1)
2148 * _aesni_enc4: internal ABI
2150 * KEYP: key struct pointer
2152 * STATE1: initial state (input)
* STATE1: final state (output)
2167 movaps (KEYP), KEY # key
2169 pxor KEY, STATE1 # round 0
2176 lea 0x20(TKEYP), TKEYP
2179 movaps -0x60(TKEYP), KEY
2184 movaps -0x50(TKEYP), KEY
2191 movaps -0x40(TKEYP), KEY
2196 movaps -0x30(TKEYP), KEY
2203 movaps -0x20(TKEYP), KEY
2208 movaps -0x10(TKEYP), KEY
2218 movaps 0x10(TKEYP), KEY
2223 movaps 0x20(TKEYP), KEY
2228 movaps 0x30(TKEYP), KEY
2233 movaps 0x40(TKEYP), KEY
2238 movaps 0x50(TKEYP), KEY
2243 movaps 0x60(TKEYP), KEY
2248 movaps 0x70(TKEYP), KEY
2249 AESENCLAST KEY STATE1 # last round
2250 AESENCLAST KEY STATE2
2251 AESENCLAST KEY STATE3
2252 AESENCLAST KEY STATE4
2254 ENDPROC(_aesni_enc4)
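/*
 * The four states are interleaved purely for instruction-level
 * parallelism: each AESENC depends only on its own stream, so the
 * pipeline can overlap them. A minimal intrinsics sketch of the same
 * idea (illustrative, not kernel code):
 *
 *	#include <wmmintrin.h>	// AES-NI intrinsics
 *
 *	static void enc4(__m128i s[4], const __m128i *rk, int rounds)
 *	{
 *		int i, r;
 *
 *		for (i = 0; i < 4; i++)			// round 0 whitening
 *			s[i] = _mm_xor_si128(s[i], rk[0]);
 *		for (r = 1; r < rounds; r++)
 *			for (i = 0; i < 4; i++)		// independent streams
 *				s[i] = _mm_aesenc_si128(s[i], rk[r]);
 *		for (i = 0; i < 4; i++)
 *			s[i] = _mm_aesenclast_si128(s[i], rk[rounds]);
 *	}
 */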
* void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2264 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2265 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2266 movl (FRAME_OFFSET+20)(%esp), INP # src
2268 mov 480(KEYP), KLEN # key length
2270 movups (INP), STATE # input
movups STATE, (OUTP) # output
2282 * _aesni_dec1: internal ABI
2284 * KEYP: key struct pointer
2286 * STATE: initial state (input)
* STATE: final state (output)
2295 movaps (KEYP), KEY # key
2297 pxor KEY, STATE # round 0
2301 lea 0x20(TKEYP), TKEYP
2304 movaps -0x60(TKEYP), KEY
2306 movaps -0x50(TKEYP), KEY
2310 movaps -0x40(TKEYP), KEY
2312 movaps -0x30(TKEYP), KEY
2316 movaps -0x20(TKEYP), KEY
2318 movaps -0x10(TKEYP), KEY
2322 movaps 0x10(TKEYP), KEY
2324 movaps 0x20(TKEYP), KEY
2326 movaps 0x30(TKEYP), KEY
2328 movaps 0x40(TKEYP), KEY
2330 movaps 0x50(TKEYP), KEY
2332 movaps 0x60(TKEYP), KEY
2334 movaps 0x70(TKEYP), KEY
2335 AESDECLAST KEY STATE
2337 ENDPROC(_aesni_dec1)
2340 * _aesni_dec4: internal ABI
2342 * KEYP: key struct pointer
2344 * STATE1: initial state (input)
* STATE1: final state (output)
2359 movaps (KEYP), KEY # key
2361 pxor KEY, STATE1 # round 0
2368 lea 0x20(TKEYP), TKEYP
2371 movaps -0x60(TKEYP), KEY
2376 movaps -0x50(TKEYP), KEY
2383 movaps -0x40(TKEYP), KEY
2388 movaps -0x30(TKEYP), KEY
2395 movaps -0x20(TKEYP), KEY
2400 movaps -0x10(TKEYP), KEY
2410 movaps 0x10(TKEYP), KEY
2415 movaps 0x20(TKEYP), KEY
2420 movaps 0x30(TKEYP), KEY
2425 movaps 0x40(TKEYP), KEY
2430 movaps 0x50(TKEYP), KEY
2435 movaps 0x60(TKEYP), KEY
2440 movaps 0x70(TKEYP), KEY
2441 AESDECLAST KEY STATE1 # last round
2442 AESDECLAST KEY STATE2
2443 AESDECLAST KEY STATE3
2444 AESDECLAST KEY STATE4
2446 ENDPROC(_aesni_dec4)
* void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2452 ENTRY(aesni_ecb_enc)
2458 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2459 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2460 movl (FRAME_OFFSET+24)(%esp), INP # src
2461 movl (FRAME_OFFSET+28)(%esp), LEN # len
2463 test LEN, LEN # check length
2472 movups (INP), STATE1
2473 movups 0x10(INP), STATE2
2474 movups 0x20(INP), STATE3
2475 movups 0x30(INP), STATE4
2477 movups STATE1, (OUTP)
2478 movups STATE2, 0x10(OUTP)
2479 movups STATE3, 0x20(OUTP)
2480 movups STATE4, 0x30(OUTP)
2490 movups (INP), STATE1
2492 movups STATE1, (OUTP)
2506 ENDPROC(aesni_ecb_enc)
* void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2512 ENTRY(aesni_ecb_dec)
2518 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2519 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2520 movl (FRAME_OFFSET+24)(%esp), INP # src
2521 movl (FRAME_OFFSET+28)(%esp), LEN # len
2533 movups (INP), STATE1
2534 movups 0x10(INP), STATE2
2535 movups 0x20(INP), STATE3
2536 movups 0x30(INP), STATE4
2538 movups STATE1, (OUTP)
2539 movups STATE2, 0x10(OUTP)
2540 movups STATE3, 0x20(OUTP)
2541 movups STATE4, 0x30(OUTP)
2551 movups (INP), STATE1
2553 movups STATE1, (OUTP)
2567 ENDPROC(aesni_ecb_dec)
* void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2571 * size_t len, u8 *iv)
2573 ENTRY(aesni_cbc_enc)
2580 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2581 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2582 movl (FRAME_OFFSET+28)(%esp), INP # src
2583 movl (FRAME_OFFSET+32)(%esp), LEN # len
2584 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2589 movups (IVP), STATE # load iv as initial state
2592 movups (INP), IN # load input
2595 movups STATE, (OUTP) # store output
2611 ENDPROC(aesni_cbc_enc)
* void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2615 * size_t len, u8 *iv)
2617 ENTRY(aesni_cbc_dec)
2624 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2625 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2626 movl (FRAME_OFFSET+28)(%esp), INP # src
2627 movl (FRAME_OFFSET+32)(%esp), LEN # len
2628 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2631 jb .Lcbc_dec_just_ret
2641 movups 0x10(INP), IN2
2644 movups 0x20(INP), IN3
2646 movups 0x30(INP), IN4
2649 movups 0x20(INP), IN1
2651 movups 0x30(INP), IN2
2666 movups 0x10(INP), IN2
2669 movups STATE1, (OUTP)
2670 movups STATE2, 0x10(OUTP)
2671 movups STATE3, 0x20(OUTP)
2672 movups STATE4, 0x30(OUTP)
2686 movups STATE, (OUTP)
2704 ENDPROC(aesni_cbc_dec)
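/*
 * Unlike CBC encryption above, which is inherently serial (each
 * block's input depends on the previous ciphertext), CBC decryption is
 * parallel:
 *
 *	P[i] = Dec(K, C[i]) XOR C[i-1],  with C[-1] = IV
 *
 * so the block decryptions are independent and can run four at a time
 * through _aesni_dec4.
 */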
2707 .pushsection .rodata
2710 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2714 * _aesni_inc_init: internal ABI
2715 * setup registers used by _aesni_inc
2719 * CTR: == IV, in little endian
2720 * TCTR_LOW: == lower qword of CTR
2721 * INC: == 1, in little endian
2722 * BSWAP_MASK == endian swapping mask
2726 movaps .Lbswap_mask, BSWAP_MASK
2728 PSHUFB_XMM BSWAP_MASK CTR
2730 MOVQ_R64_XMM TCTR_LOW INC
2731 MOVQ_R64_XMM CTR TCTR_LOW
2733 ENDPROC(_aesni_inc_init)
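/*
 * Rough C model (illustrative names) of the increment _aesni_inc
 * performs on the byte-swapped counter kept in CTR:
 *
 *	ctr.lo++;
 *	if (ctr.lo == 0)	// carry out of the low quadword
 *		ctr.hi++;
 *	iv = byteswap16(ctr);	// back to big endian for encryption
 */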
2736 * _aesni_inc: internal ABI
2737 * Increase IV by 1, IV is in big endian
2740 * CTR: == IV, in little endian
2741 * TCTR_LOW: == lower qword of CTR
2742 * INC: == 1, in little endian
2743 * BSWAP_MASK == endian swapping mask
2747 * CTR: == output IV, in little endian
2748 * TCTR_LOW: == lower qword of CTR
2760 PSHUFB_XMM BSWAP_MASK IV
* void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2766 * size_t len, u8 *iv)
2768 ENTRY(aesni_ctr_enc)
2771 jb .Lctr_enc_just_ret
2774 call _aesni_inc_init
2784 movups 0x10(INP), IN2
2787 movups 0x20(INP), IN3
2790 movups 0x30(INP), IN4
2793 movups STATE1, (OUTP)
2795 movups STATE2, 0x10(OUTP)
2797 movups STATE3, 0x20(OUTP)
2799 movups STATE4, 0x30(OUTP)
2814 movups STATE, (OUTP)
2825 ENDPROC(aesni_ctr_enc)
2828 * _aesni_gf128mul_x_ble: internal ABI
2829 * Multiply in GF(2^128) for XTS IVs
2832 * GF128MUL_MASK == mask with 0x87 and 0x01
2836 * CTR: == temporary value
2838 #define _aesni_gf128mul_x_ble() \
2839 pshufd $0x13, IV, CTR; \
2842 pand GF128MUL_MASK, CTR; \
* void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2849 ENTRY(aesni_xts_crypt8)
2854 leaq _aesni_enc4, %r11
2855 leaq _aesni_dec4, %rax
2859 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2866 movdqu 0x00(INP), INC
2868 movdqu IV, 0x00(OUTP)
2870 _aesni_gf128mul_x_ble()
2872 movdqu 0x10(INP), INC
2874 movdqu IV, 0x10(OUTP)
2876 _aesni_gf128mul_x_ble()
2878 movdqu 0x20(INP), INC
2880 movdqu IV, 0x20(OUTP)
2882 _aesni_gf128mul_x_ble()
2884 movdqu 0x30(INP), INC
2886 movdqu IV, 0x30(OUTP)
2890 movdqu 0x00(OUTP), INC
2892 movdqu STATE1, 0x00(OUTP)
2894 _aesni_gf128mul_x_ble()
2896 movdqu 0x40(INP), INC
2898 movdqu IV, 0x40(OUTP)
2900 movdqu 0x10(OUTP), INC
2902 movdqu STATE2, 0x10(OUTP)
2904 _aesni_gf128mul_x_ble()
2906 movdqu 0x50(INP), INC
2908 movdqu IV, 0x50(OUTP)
2910 movdqu 0x20(OUTP), INC
2912 movdqu STATE3, 0x20(OUTP)
2914 _aesni_gf128mul_x_ble()
2916 movdqu 0x60(INP), INC
2918 movdqu IV, 0x60(OUTP)
2920 movdqu 0x30(OUTP), INC
2922 movdqu STATE4, 0x30(OUTP)
2924 _aesni_gf128mul_x_ble()
2926 movdqu 0x70(INP), INC
2928 movdqu IV, 0x70(OUTP)
2930 _aesni_gf128mul_x_ble()
2935 movdqu 0x40(OUTP), INC
2937 movdqu STATE1, 0x40(OUTP)
2939 movdqu 0x50(OUTP), INC
2941 movdqu STATE2, 0x50(OUTP)
2943 movdqu 0x60(OUTP), INC
2945 movdqu STATE3, 0x60(OUTP)
2947 movdqu 0x70(OUTP), INC
2949 movdqu STATE4, 0x70(OUTP)
2953 ENDPROC(aesni_xts_crypt8)