/*
 * Merge tag 'xtensa-20181115' of git://github.com/jcmvbkbc/linux-xtensa
 * [sfrench/cifs-2.6.git] / arch / arm64 / crypto / aes-neon.S
 */
/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

14 #define AES_ENTRY(func)         ENTRY(neon_ ## func)
15 #define AES_ENDPROC(func)       ENDPROC(neon_ ## func)
16
17         xtsmask         .req    v7
18
19         .macro          xts_reload_mask, tmp
20         xts_load_mask   \tmp
21         .endm
22
23         /* multiply by polynomial 'x' in GF(2^8) */
24         .macro          mul_by_x, out, in, temp, const
25         sshr            \temp, \in, #7
26         shl             \out, \in, #1
27         and             \temp, \temp, \const
28         eor             \out, \out, \temp
29         .endm
30
31         /* multiply by polynomial 'x^2' in GF(2^8) */
32         .macro          mul_by_x2, out, in, temp, const
33         ushr            \temp, \in, #6
34         shl             \out, \in, #2
35         pmul            \temp, \temp, \const
36         eor             \out, \out, \temp
37         .endm
38
39         /* preload the entire Sbox */
40         .macro          prepare, sbox, shiftrows, temp
41         movi            v12.16b, #0x1b
42         ldr_l           q13, \shiftrows, \temp
43         ldr_l           q14, .Lror32by8, \temp
44         adr_l           \temp, \sbox
45         ld1             {v16.16b-v19.16b}, [\temp], #64
46         ld1             {v20.16b-v23.16b}, [\temp], #64
47         ld1             {v24.16b-v27.16b}, [\temp], #64
48         ld1             {v28.16b-v31.16b}, [\temp]
49         .endm
50
51         /* do preload for encryption */
52         .macro          enc_prepare, ignore0, ignore1, temp
53         prepare         .LForward_Sbox, .LForward_ShiftRows, \temp
54         .endm
55
56         .macro          enc_switch_key, ignore0, ignore1, temp
57         /* do nothing */
58         .endm
59
60         /* do preload for decryption */
61         .macro          dec_prepare, ignore0, ignore1, temp
62         prepare         .LReverse_Sbox, .LReverse_ShiftRows, \temp
63         .endm
64
65         /* apply SubBytes transformation using the the preloaded Sbox */
66         .macro          sub_bytes, in
67         sub             v9.16b, \in\().16b, v15.16b
68         tbl             \in\().16b, {v16.16b-v19.16b}, \in\().16b
69         sub             v10.16b, v9.16b, v15.16b
70         tbx             \in\().16b, {v20.16b-v23.16b}, v9.16b
71         sub             v11.16b, v10.16b, v15.16b
72         tbx             \in\().16b, {v24.16b-v27.16b}, v10.16b
73         tbx             \in\().16b, {v28.16b-v31.16b}, v11.16b
74         .endm
75
76         /* apply MixColumns transformation */
77         .macro          mix_columns, in, enc
78         .if             \enc == 0
79         /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
80         mul_by_x2       v8.16b, \in\().16b, v9.16b, v12.16b
81         eor             \in\().16b, \in\().16b, v8.16b
82         rev32           v8.8h, v8.8h
83         eor             \in\().16b, \in\().16b, v8.16b
84         .endif
85
86         mul_by_x        v9.16b, \in\().16b, v8.16b, v12.16b
87         rev32           v8.8h, \in\().8h
88         eor             v8.16b, v8.16b, v9.16b
89         eor             \in\().16b, \in\().16b, v8.16b
90         tbl             \in\().16b, {\in\().16b}, v14.16b
91         eor             \in\().16b, \in\().16b, v8.16b
92         .endm
93
94         .macro          do_block, enc, in, rounds, rk, rkp, i
95         ld1             {v15.4s}, [\rk]
96         add             \rkp, \rk, #16
97         mov             \i, \rounds
98 1111:   eor             \in\().16b, \in\().16b, v15.16b         /* ^round key */
99         movi            v15.16b, #0x40
100         tbl             \in\().16b, {\in\().16b}, v13.16b       /* ShiftRows */
101         sub_bytes       \in
102         subs            \i, \i, #1
103         ld1             {v15.4s}, [\rkp], #16
104         beq             2222f
105         mix_columns     \in, \enc
106         b               1111b
107 2222:   eor             \in\().16b, \in\().16b, v15.16b         /* ^round key */
108         .endm
109
110         .macro          encrypt_block, in, rounds, rk, rkp, i
111         do_block        1, \in, \rounds, \rk, \rkp, \i
112         .endm
113
114         .macro          decrypt_block, in, rounds, rk, rkp, i
115         do_block        0, \in, \rounds, \rk, \rkp, \i
116         .endm
117
118         /*
119          * Interleaved versions: functionally equivalent to the
120          * ones above, but applied to 2 or 4 AES states in parallel.
121          */
122
123         .macro          sub_bytes_2x, in0, in1
124         sub             v8.16b, \in0\().16b, v15.16b
125         tbl             \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
126         sub             v9.16b, \in1\().16b, v15.16b
127         tbl             \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
128         sub             v10.16b, v8.16b, v15.16b
129         tbx             \in0\().16b, {v20.16b-v23.16b}, v8.16b
130         sub             v11.16b, v9.16b, v15.16b
131         tbx             \in1\().16b, {v20.16b-v23.16b}, v9.16b
132         sub             v8.16b, v10.16b, v15.16b
133         tbx             \in0\().16b, {v24.16b-v27.16b}, v10.16b
134         sub             v9.16b, v11.16b, v15.16b
135         tbx             \in1\().16b, {v24.16b-v27.16b}, v11.16b
136         tbx             \in0\().16b, {v28.16b-v31.16b}, v8.16b
137         tbx             \in1\().16b, {v28.16b-v31.16b}, v9.16b
138         .endm
139
140         .macro          sub_bytes_4x, in0, in1, in2, in3
141         sub             v8.16b, \in0\().16b, v15.16b
142         tbl             \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
143         sub             v9.16b, \in1\().16b, v15.16b
144         tbl             \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
145         sub             v10.16b, \in2\().16b, v15.16b
146         tbl             \in2\().16b, {v16.16b-v19.16b}, \in2\().16b
147         sub             v11.16b, \in3\().16b, v15.16b
148         tbl             \in3\().16b, {v16.16b-v19.16b}, \in3\().16b
149         tbx             \in0\().16b, {v20.16b-v23.16b}, v8.16b
150         tbx             \in1\().16b, {v20.16b-v23.16b}, v9.16b
151         sub             v8.16b, v8.16b, v15.16b
152         tbx             \in2\().16b, {v20.16b-v23.16b}, v10.16b
153         sub             v9.16b, v9.16b, v15.16b
154         tbx             \in3\().16b, {v20.16b-v23.16b}, v11.16b
155         sub             v10.16b, v10.16b, v15.16b
156         tbx             \in0\().16b, {v24.16b-v27.16b}, v8.16b
157         sub             v11.16b, v11.16b, v15.16b
158         tbx             \in1\().16b, {v24.16b-v27.16b}, v9.16b
159         sub             v8.16b, v8.16b, v15.16b
160         tbx             \in2\().16b, {v24.16b-v27.16b}, v10.16b
161         sub             v9.16b, v9.16b, v15.16b
162         tbx             \in3\().16b, {v24.16b-v27.16b}, v11.16b
163         sub             v10.16b, v10.16b, v15.16b
164         tbx             \in0\().16b, {v28.16b-v31.16b}, v8.16b
165         sub             v11.16b, v11.16b, v15.16b
166         tbx             \in1\().16b, {v28.16b-v31.16b}, v9.16b
167         tbx             \in2\().16b, {v28.16b-v31.16b}, v10.16b
168         tbx             \in3\().16b, {v28.16b-v31.16b}, v11.16b
169         .endm
170
171         .macro          mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
172         sshr            \tmp0\().16b, \in0\().16b, #7
173         shl             \out0\().16b, \in0\().16b, #1
174         sshr            \tmp1\().16b, \in1\().16b, #7
175         and             \tmp0\().16b, \tmp0\().16b, \const\().16b
176         shl             \out1\().16b, \in1\().16b, #1
177         and             \tmp1\().16b, \tmp1\().16b, \const\().16b
178         eor             \out0\().16b, \out0\().16b, \tmp0\().16b
179         eor             \out1\().16b, \out1\().16b, \tmp1\().16b
180         .endm
181
182         .macro          mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
183         ushr            \tmp0\().16b, \in0\().16b, #6
184         shl             \out0\().16b, \in0\().16b, #2
185         ushr            \tmp1\().16b, \in1\().16b, #6
186         pmul            \tmp0\().16b, \tmp0\().16b, \const\().16b
187         shl             \out1\().16b, \in1\().16b, #2
188         pmul            \tmp1\().16b, \tmp1\().16b, \const\().16b
189         eor             \out0\().16b, \out0\().16b, \tmp0\().16b
190         eor             \out1\().16b, \out1\().16b, \tmp1\().16b
191         .endm
192
193         .macro          mix_columns_2x, in0, in1, enc
194         .if             \enc == 0
195         /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
196         mul_by_x2_2x    v8, v9, \in0, \in1, v10, v11, v12
197         eor             \in0\().16b, \in0\().16b, v8.16b
198         rev32           v8.8h, v8.8h
199         eor             \in1\().16b, \in1\().16b, v9.16b
200         rev32           v9.8h, v9.8h
201         eor             \in0\().16b, \in0\().16b, v8.16b
202         eor             \in1\().16b, \in1\().16b, v9.16b
203         .endif
204
205         mul_by_x_2x     v8, v9, \in0, \in1, v10, v11, v12
206         rev32           v10.8h, \in0\().8h
207         rev32           v11.8h, \in1\().8h
208         eor             v10.16b, v10.16b, v8.16b
209         eor             v11.16b, v11.16b, v9.16b
210         eor             \in0\().16b, \in0\().16b, v10.16b
211         eor             \in1\().16b, \in1\().16b, v11.16b
212         tbl             \in0\().16b, {\in0\().16b}, v14.16b
213         tbl             \in1\().16b, {\in1\().16b}, v14.16b
214         eor             \in0\().16b, \in0\().16b, v10.16b
215         eor             \in1\().16b, \in1\().16b, v11.16b
216         .endm
217
218         .macro          do_block_2x, enc, in0, in1, rounds, rk, rkp, i
219         ld1             {v15.4s}, [\rk]
220         add             \rkp, \rk, #16
221         mov             \i, \rounds
222 1111:   eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
223         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
224         movi            v15.16b, #0x40
225         tbl             \in0\().16b, {\in0\().16b}, v13.16b     /* ShiftRows */
226         tbl             \in1\().16b, {\in1\().16b}, v13.16b     /* ShiftRows */
227         sub_bytes_2x    \in0, \in1
228         subs            \i, \i, #1
229         ld1             {v15.4s}, [\rkp], #16
230         beq             2222f
231         mix_columns_2x  \in0, \in1, \enc
232         b               1111b
233 2222:   eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
234         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
235         .endm
236
237         .macro          do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
238         ld1             {v15.4s}, [\rk]
239         add             \rkp, \rk, #16
240         mov             \i, \rounds
241 1111:   eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
242         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
243         eor             \in2\().16b, \in2\().16b, v15.16b       /* ^round key */
244         eor             \in3\().16b, \in3\().16b, v15.16b       /* ^round key */
245         movi            v15.16b, #0x40
246         tbl             \in0\().16b, {\in0\().16b}, v13.16b     /* ShiftRows */
247         tbl             \in1\().16b, {\in1\().16b}, v13.16b     /* ShiftRows */
248         tbl             \in2\().16b, {\in2\().16b}, v13.16b     /* ShiftRows */
249         tbl             \in3\().16b, {\in3\().16b}, v13.16b     /* ShiftRows */
250         sub_bytes_4x    \in0, \in1, \in2, \in3
251         subs            \i, \i, #1
252         ld1             {v15.4s}, [\rkp], #16
253         beq             2222f
254         mix_columns_2x  \in0, \in1, \enc
255         mix_columns_2x  \in2, \in3, \enc
256         b               1111b
257 2222:   eor             \in0\().16b, \in0\().16b, v15.16b       /* ^round key */
258         eor             \in1\().16b, \in1\().16b, v15.16b       /* ^round key */
259         eor             \in2\().16b, \in2\().16b, v15.16b       /* ^round key */
260         eor             \in3\().16b, \in3\().16b, v15.16b       /* ^round key */
261         .endm
262
263         .macro          encrypt_block2x, in0, in1, rounds, rk, rkp, i
264         do_block_2x     1, \in0, \in1, \rounds, \rk, \rkp, \i
265         .endm
266
267         .macro          decrypt_block2x, in0, in1, rounds, rk, rkp, i
268         do_block_2x     0, \in0, \in1, \rounds, \rk, \rkp, \i
269         .endm
270
271         .macro          encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
272         do_block_4x     1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
273         .endm
274
275         .macro          decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
276         do_block_4x     0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
277         .endm
278
#include "aes-modes.S"

281         .section        ".rodata", "a"
282         .align          6
283 .LForward_Sbox:
284         .byte           0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
285         .byte           0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
286         .byte           0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
287         .byte           0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
288         .byte           0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
289         .byte           0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
290         .byte           0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
291         .byte           0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
292         .byte           0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
293         .byte           0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
294         .byte           0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
295         .byte           0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
296         .byte           0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
297         .byte           0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
298         .byte           0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
299         .byte           0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
300         .byte           0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
301         .byte           0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
302         .byte           0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
303         .byte           0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
304         .byte           0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
305         .byte           0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
306         .byte           0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
307         .byte           0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
308         .byte           0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
309         .byte           0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
310         .byte           0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
311         .byte           0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
312         .byte           0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
313         .byte           0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
314         .byte           0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
315         .byte           0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
316
317 .LReverse_Sbox:
318         .byte           0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
319         .byte           0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
320         .byte           0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
321         .byte           0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
322         .byte           0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
323         .byte           0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
324         .byte           0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
325         .byte           0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
326         .byte           0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
327         .byte           0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
328         .byte           0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
329         .byte           0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
330         .byte           0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
331         .byte           0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
332         .byte           0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
333         .byte           0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
334         .byte           0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
335         .byte           0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
336         .byte           0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
337         .byte           0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
338         .byte           0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
339         .byte           0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
340         .byte           0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
341         .byte           0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
342         .byte           0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
343         .byte           0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
344         .byte           0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
345         .byte           0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
346         .byte           0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
347         .byte           0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
348         .byte           0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
349         .byte           0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
350
351 .LForward_ShiftRows:
352         .octa           0x0b06010c07020d08030e09040f0a0500
353
354 .LReverse_ShiftRows:
355         .octa           0x0306090c0f0205080b0e0104070a0d00
356
357 .Lror32by8:
358         .octa           0x0c0f0e0d080b0a090407060500030201