/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    v0
        SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
        MASK            .req    v4
        XL              .req    v5
        XM              .req    v6
        XH              .req    v7
        IN1             .req    v7

        k00_16          .req    v8
        k32_48          .req    v9

        t3              .req    v10
        t4              .req    v11
        t5              .req    v12
        t6              .req    v13
        t7              .req    v14
        t8              .req    v15
        t9              .req    v16

        perm1           .req    v17
        perm2           .req    v18
        perm3           .req    v19

        sh1             .req    v20
        sh2             .req    v21
        sh3             .req    v22
        sh4             .req    v23

        ss1             .req    v24
        ss2             .req    v25
        ss3             .req    v26
        ss4             .req    v27

        .text
        .arch           armv8-a+crypto

        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm

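        //
        // Fallback 64x64->128 carry-less multiply for CPUs without the
        // 64-bit polynomial PMULL: the product is built from 8-bit PMULL
        // partial products of byte-rotated copies of the operands (A1..A3
        // here, B1..B4 prepared by __pmull_pre_p8 below), which are then
        // shifted back into place and combined in __pmull_p8_tail.
        //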
        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm

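        // Set up MASK with 0xe1 shifted left by 57 in each 64-bit lane;
        // 0xe1 is the GCM reduction constant derived from the GHASH field
        // polynomial x^128 + x^7 + x^2 + x + 1, and __pmull_reduce_p64
        // below multiplies by it to fold the product back into GF(2^128).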
        .macro          __pmull_pre_p64
        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm

        .macro          __pmull_pre_p8
        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
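        // The wide product accumulated in XH:XM:XL is folded back into a
        // 128-bit GHASH value with two PMULLs by the reduction constant in
        // MASK, interleaved with moves that realign the 64-bit halves.
        //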
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
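        // Multiplying by the reduction constant would itself require a
        // 64x64 PMULL, so the folding is done with plain shifts and XORs
        // instead, spreading the x^7 + x^2 + x + 1 terms of the field
        // polynomial over the shl/ushr sequences below.
        //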
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm

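        // Argument registers, matching the pmull_ghash_update() prototype
        // further down: w0 = blocks, x1 = dg (the running GHASH state),
        // x2 = src, x3 = key (hash key loaded into SHASH), x4 = optional
        // head block (may be NULL).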
        .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
        b               1f

0:      ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1

1:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

        st1             {XL.2d}, [x1]
        ret
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
ENTRY(pmull_ghash_update_p64)
        __pmull_ghash   p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        __pmull_ghash   p8
ENDPROC(pmull_ghash_update_p8)

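        /*
         * Illustrative C-side call, a minimal sketch only (the in-kernel
         * caller is the ghash-ce glue code); the declaration simply mirrors
         * the prototype above, and the variable names are made up for the
         * example:
         *
         *      asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[],
         *                                             const char *src,
         *                                             struct ghash_key const *k,
         *                                             const char *head);
         *
         *      // hash 'blocks' 16-byte blocks from src into the running
         *      // digest dg[], optionally preceded by a buffered head block
         *      pmull_ghash_update_p64(blocks, dg, src, key, head);
         */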
        KS              .req    v8
        CTR             .req    v9
        INP             .req    v10

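        // Load the expanded AES round keys into NEON registers so that the
        // final eleven keys always sit in v21-v31: AES-192 additionally
        // uses v19-v20, and AES-256 uses v17-v18 as well (enc_block below
        // relies on this layout).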
        .macro          load_round_keys, rounds, rk
        cmp             \rounds, #12
        blo             2222f           /* 128 bits */
        beq             1111f           /* 192 bits */
        ld1             {v17.4s-v18.4s}, [\rk], #32
1111:   ld1             {v19.4s-v20.4s}, [\rk], #32
2222:   ld1             {v21.4s-v24.4s}, [\rk], #64
        ld1             {v25.4s-v28.4s}, [\rk], #64
        ld1             {v29.4s-v31.4s}, [\rk]
        .endm

        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm

        .macro          enc_block, state, rounds
        cmp             \rounds, #12
        b.lo            2222f           /* 128 bits */
        b.eq            1111f           /* 192 bits */
        enc_round       \state, v17
        enc_round       \state, v18
1111:   enc_round       \state, v19
        enc_round       \state, v20
2222:   .irp            key, v21, v22, v23, v24, v25, v26, v27, v28, v29
        enc_round       \state, \key
        .endr
        aese            \state\().16b, v30.16b
        eor             \state\().16b, \state\().16b, v31.16b
        .endm

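        // Shared body of pmull_gcm_encrypt/decrypt. Argument registers
        // match the prototypes below: w0 = blocks, x1 = dg, x2 = dst,
        // x3 = src, x4 = key, x5 = ctr, w6 = rounds, x7 = ks (encrypt only).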
        .macro          pmull_gcm_do_crypt, enc
        ld1             {SHASH.2d}, [x4]
        ld1             {XL.2d}, [x1]
        ldr             x8, [x5, #8]                    // load lower counter

        movi            MASK.16b, #0xe1
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE( rev             x8, x8          )
        shl             MASK.2d, MASK.2d, #57
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        .if             \enc == 1
        ld1             {KS.16b}, [x7]
        .endif

0:      ld1             {CTR.8b}, [x5]                  // load upper counter
        ld1             {INP.16b}, [x3], #16
        rev             x9, x8
        add             x8, x8, #1
        sub             w0, w0, #1
        ins             CTR.d[1], x9                    // set lower counter

        .if             \enc == 1
        eor             INP.16b, INP.16b, KS.16b        // encrypt input
        st1             {INP.16b}, [x2], #16
        .endif

        rev64           T1.16b, INP.16b

        cmp             w6, #12
        b.ge            2f                              // AES-192/256?

1:      enc_round       CTR, v21

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8

        enc_round       CTR, v22

        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        enc_round       CTR, v23

        pmull2          XH.1q, SHASH.2d, XL.2d          // a1 * b1
        eor             T1.16b, T1.16b, XL.16b

        enc_round       CTR, v24

        pmull           XL.1q, SHASH.1d, XL.1d          // a0 * b0
        pmull           XM.1q, SHASH2.1d, T1.1d         // (a1 + a0)(b1 + b0)

        enc_round       CTR, v25

        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, XL.16b, XH.16b
        eor             XM.16b, XM.16b, T1.16b

        enc_round       CTR, v26

        eor             XM.16b, XM.16b, T2.16b
        pmull           T2.1q, XL.1d, MASK.1d

        enc_round       CTR, v27

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        enc_round       CTR, v28

        eor             XL.16b, XM.16b, T2.16b

        enc_round       CTR, v29

        ext             T2.16b, XL.16b, XL.16b, #8

        aese            CTR.16b, v30.16b

        pmull           XL.1q, XL.1d, MASK.1d
        eor             T2.16b, T2.16b, XH.16b

        eor             KS.16b, CTR.16b, v31.16b

        eor             XL.16b, XL.16b, T2.16b

        .if             \enc == 0
        eor             INP.16b, INP.16b, KS.16b
        st1             {INP.16b}, [x2], #16
        .endif

        cbnz            w0, 0b

CPU_LE( rev             x8, x8          )
        st1             {XL.2d}, [x1]
        str             x8, [x5, #8]                    // store lower counter

        .if             \enc == 1
        st1             {KS.16b}, [x7]
        .endif

        ret

2:      b.eq            3f                              // AES-192?
        enc_round       CTR, v17
        enc_round       CTR, v18
3:      enc_round       CTR, v19
        enc_round       CTR, v20
        b               1b
        .endm

        /*
         * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        int rounds, u8 ks[])
         */
ENTRY(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
ENDPROC(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        int rounds)
         */
ENTRY(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
ENDPROC(pmull_gcm_decrypt)

        /*
         * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
         */
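        /*
         * If rk is NULL the round keys are assumed to already be loaded in
         * v17-v31; otherwise they are loaded from rk for the given number
         * of rounds before the block is encrypted.
         */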
ENTRY(pmull_gcm_encrypt_block)
        cbz             x2, 0f
        load_round_keys w3, x2
0:      ld1             {v0.16b}, [x1]
        enc_block       v0, w3
        st1             {v0.16b}, [x0]
        ret
ENDPROC(pmull_gcm_encrypt_block)