/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    v0
        SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
        MASK            .req    v4
        XL              .req    v5
        XM              .req    v6
        XH              .req    v7
        IN1             .req    v7

        k00_16          .req    v8
        k32_48          .req    v9

        t3              .req    v10
        t4              .req    v11
        t5              .req    v12
        t6              .req    v13
        t7              .req    v14
        t8              .req    v15
        t9              .req    v16

        perm1           .req    v17
        perm2           .req    v18
        perm3           .req    v19

        sh1             .req    v20
        sh2             .req    v21
        sh3             .req    v22
        sh4             .req    v23

        ss1             .req    v24
        ss2             .req    v25
        ss3             .req    v26
        ss4             .req    v27

        XL2             .req    v8
        XM2             .req    v9
        XH2             .req    v10
        XL3             .req    v11
        XM3             .req    v12
        XH3             .req    v13
        TT3             .req    v14
        TT4             .req    v15
        HH              .req    v16
        HH3             .req    v17
        HH4             .req    v18
        HH34            .req    v19

        .text
        .arch           armv8-a+crypto

        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm

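        //
        // Fallback 64x64 -> 128 bit polynomial multiply for CPUs that only
        // implement the 8x8 -> 16 bit form of PMULL.  In rough terms, the
        // wide product is assembled from PMULL.P8 partial products of one
        // operand against byte-rotated copies of the other (prepared here
        // and by __pmull_pre_p8 below); the paired partial products are
        // masked and shifted into place by 8, 16, 24 and 32 bits before
        // being folded into the result.
        //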
        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm

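        //
        // Per-call setup for the p64 code path.  x3 is expected to point
        // to the hash key structure prepared by the C glue code, with H at
        // offset 0 and the additional powers H^2, H^3 and H^4 at offset 16
        // (needed by the 4-way aggregated loop in __pmull_ghash).  SHASH2
        // and HH34 hold the folded (upper ^ lower) halves of the key
        // powers, precomputed for the Karatsuba middle term.
        //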
        .macro          __pmull_pre_p64
        add             x8, x3, #16
        ld1             {HH.2d-HH4.2d}, [x8]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm

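        //
        // Per-call setup for the p8 fallback: precompute the masks and the
        // byte-rotated copies of SHASH and SHASH2 that are loop invariant,
        // so that only the rotations of the data operand are left in the
        // multiply macros above.
        //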
        .macro          __pmull_pre_p8
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
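        //
        // The 256-bit product (XH:XL, with the Karatsuba middle term in
        // XM) is folded back to 128 bits modulo the GHASH polynomial
        // x^128 + x^7 + x^2 + x + 1.  MASK is assumed to hold
        // 0xc200000000000000 (0xe1 << 57) in both 64-bit lanes, the
        // reduction constant for the representation used here, and the
        // two PMULLs by MASK perform the two folding steps.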
        //
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
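        // (the multiplications by the reduction constant are replaced by
        // equivalent shift-and-XOR sequences)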
        //
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm

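        //
        // GHASH update of one or more 16-byte blocks:
        //   w0 = number of blocks, x1 -> 128-bit digest, x2 -> source
        //   data, x3 -> hash key structure, x4 -> optional partial "head"
        //   block, which is processed first (see the C prototype below).
        // On the p64 path, single blocks are consumed until the remaining
        // count is a multiple of 4, after which the 4-way aggregated loop
        // using H^2..H^4 takes over.
        //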
        .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
        mov             x4, xzr
        b               3f

0:      .ifc            \pn, p64
        tbnz            w0, #0, 2f              // skip until #blocks is a
        tbnz            w0, #1, 2f              // round multiple of 4

1:      ld1             {XM3.16b-TT4.16b}, [x2], #64

        sub             w0, w0, #4

        rev64           T1.16b, XM3.16b
        rev64           T2.16b, XH3.16b
        rev64           TT4.16b, TT4.16b
        rev64           TT3.16b, TT3.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
        ext             XL3.16b, TT3.16b, TT3.16b, #8

        eor             TT4.16b, TT4.16b, IN1.16b
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0
        pmull           XM2.1q, SHASH2.1d, TT4.1d       // (a1 + a0)(b1 + b0)

        eor             TT3.16b, TT3.16b, XL3.16b
        pmull2          XH3.1q, HH.2d, XL3.2d           // a1 * b1
        pmull           XL3.1q, HH.1d, XL3.1d           // a0 * b0
        pmull2          XM3.1q, SHASH2.2d, TT3.2d       // (a1 + a0)(b1 + b0)

        ext             IN1.16b, T2.16b, T2.16b, #8
        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        eor             T2.16b, T2.16b, IN1.16b
        pmull2          XH3.1q, HH3.2d, IN1.2d          // a1 * b1
        pmull           XL3.1q, HH3.1d, IN1.1d          // a0 * b0
        pmull           XM3.1q, HH34.1d, T2.1d          // (a1 + a0)(b1 + b0)

        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        ext             IN1.16b, T1.16b, T1.16b, #8
        ext             TT3.16b, XL.16b, XL.16b, #8
        eor             XL.16b, XL.16b, IN1.16b
        eor             T1.16b, T1.16b, TT3.16b

        pmull2          XH.1q, HH4.2d, XL.2d            // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        pmull           XL.1q, HH4.1d, XL.1d            // a0 * b0
        pmull2          XM.1q, HH34.2d, T1.2d           // (a1 + a0)(b1 + b0)

        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbz             w0, 5f
        b               1b
        .endif

2:      ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

4:      eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

5:      st1             {XL.2d}, [x1]
        ret
        .endm

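        /*
         * A rough C-level model of the update step implemented by the two
         * entry points below (gf128_mul() and be128_load() are illustrative
         * placeholders, not kernel helpers):
         *
         *      x = dg;                                 // current digest
         *      if (head)
         *              x = gf128_mul(x ^ be128_load(head), H);
         *      while (blocks--) {
         *              x = gf128_mul(x ^ be128_load(src), H);
         *              src += 16;
         *      }
         *      dg = x;
         *
         * The p64 and p8 versions differ only in how the 64x64 polynomial
         * multiplications and the final reduction are carried out.
         */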
        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
ENTRY(pmull_ghash_update_p64)
        __pmull_ghash   p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        __pmull_ghash   p8
ENDPROC(pmull_ghash_update_p8)

        KS0             .req    v12
        KS1              .req   v13
        INP0            .req    v14
        INP1            .req    v15

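        //
        // The AES round keys live in v17-v31: AES-256 uses all of v17-v31,
        // AES-192 starts at v19 and AES-128 at v21, so the last two round
        // keys are always in v30/v31 regardless of key size.
        //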
        .macro          load_round_keys, rounds, rk
        cmp             \rounds, #12
        blo             2222f           /* 128 bits */
        beq             1111f           /* 192 bits */
        ld1             {v17.4s-v18.4s}, [\rk], #32
1111:   ld1             {v19.4s-v20.4s}, [\rk], #32
2222:   ld1             {v21.4s-v24.4s}, [\rk], #64
        ld1             {v25.4s-v28.4s}, [\rk], #64
        ld1             {v29.4s-v31.4s}, [\rk]
        .endm

        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm

        .macro          enc_block, state, rounds
        cmp             \rounds, #12
        b.lo            2222f           /* 128 bits */
        b.eq            1111f           /* 192 bits */
        enc_round       \state, v17
        enc_round       \state, v18
1111:   enc_round       \state, v19
        enc_round       \state, v20
2222:   .irp            key, v21, v22, v23, v24, v25, v26, v27, v28, v29
        enc_round       \state, \key
        .endr
        aese            \state\().16b, v30.16b
        eor             \state\().16b, \state\().16b, v31.16b
        .endm

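        //
        // GCM en/decryption of two blocks per iteration: the AES rounds
        // that produce the two counter-mode keystream blocks (KS0, KS1)
        // are interleaved with a 2-way aggregated GHASH multiply of the
        // two input blocks.  Arguments follow the C prototypes below:
        //   w0 = #blocks, x1 -> digest, x2 -> dst, x3 -> src,
        //   x4 -> hash key powers, x5 -> counter, x6 -> AES round keys
        //   (NULL if they are already loaded), w7 = #rounds.
        // The encrypt path additionally takes a stack argument pointing at
        // two preloaded keystream blocks, which are consumed for the first
        // two blocks of input and replaced with the last generated pair on
        // return.
        //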
        .macro          pmull_gcm_do_crypt, enc
        ld1             {SHASH.2d}, [x4], #16
        ld1             {HH.2d}, [x4]
        ld1             {XL.2d}, [x1]
        ldr             x8, [x5, #8]                    // load lower counter

        movi            MASK.16b, #0xe1
        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
CPU_LE( rev             x8, x8          )
        shl             MASK.2d, MASK.2d, #57
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        .if             \enc == 1
        ldr             x10, [sp]
        ld1             {KS0.16b-KS1.16b}, [x10]
        .endif

        cbnz            x6, 4f

0:      ld1             {INP0.16b-INP1.16b}, [x3], #32

        rev             x9, x8
        add             x11, x8, #1
        add             x8, x8, #2

        .if             \enc == 1
        eor             INP0.16b, INP0.16b, KS0.16b     // encrypt input
        eor             INP1.16b, INP1.16b, KS1.16b
        .endif

        ld1             {KS0.8b}, [x5]                  // load upper counter
        rev             x11, x11
        sub             w0, w0, #2
        mov             KS1.8b, KS0.8b
        ins             KS0.d[1], x9                    // set lower counter
        ins             KS1.d[1], x11

        rev64           T1.16b, INP1.16b

        cmp             w7, #12
        b.ge            2f                              // AES-192/256?

1:      enc_round       KS0, v21
        ext             IN1.16b, T1.16b, T1.16b, #8

        enc_round       KS1, v21
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1

        enc_round       KS0, v22
        eor             T1.16b, T1.16b, IN1.16b

        enc_round       KS1, v22
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0

        enc_round       KS0, v23
        pmull           XM2.1q, SHASH2.1d, T1.1d        // (a1 + a0)(b1 + b0)

        enc_round       KS1, v23
        rev64           T1.16b, INP0.16b
        ext             T2.16b, XL.16b, XL.16b, #8

        enc_round       KS0, v24
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b

        enc_round       KS1, v24
        eor             XL.16b, XL.16b, IN1.16b

        enc_round       KS0, v25
        eor             T1.16b, T1.16b, XL.16b

        enc_round       KS1, v25
        pmull2          XH.1q, HH.2d, XL.2d             // a1 * b1

        enc_round       KS0, v26
        pmull           XL.1q, HH.1d, XL.1d             // a0 * b0

        enc_round       KS1, v26
        pmull2          XM.1q, SHASH2.2d, T1.2d         // (a1 + a0)(b1 + b0)

        enc_round       KS0, v27
        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b

        enc_round       KS1, v27
        eor             XM.16b, XM.16b, XM2.16b
        ext             T1.16b, XL.16b, XH.16b, #8

        enc_round       KS0, v28
        eor             T2.16b, XL.16b, XH.16b
        eor             XM.16b, XM.16b, T1.16b

        enc_round       KS1, v28
        eor             XM.16b, XM.16b, T2.16b

        enc_round       KS0, v29
        pmull           T2.1q, XL.1d, MASK.1d

        enc_round       KS1, v29
        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        aese            KS0.16b, v30.16b
        eor             XL.16b, XM.16b, T2.16b

        aese            KS1.16b, v30.16b
        ext             T2.16b, XL.16b, XL.16b, #8

        eor             KS0.16b, KS0.16b, v31.16b
        pmull           XL.1q, XL.1d, MASK.1d
        eor             T2.16b, T2.16b, XH.16b

        eor             KS1.16b, KS1.16b, v31.16b
        eor             XL.16b, XL.16b, T2.16b

        .if             \enc == 0
        eor             INP0.16b, INP0.16b, KS0.16b
        eor             INP1.16b, INP1.16b, KS1.16b
        .endif

        st1             {INP0.16b-INP1.16b}, [x2], #32

        cbnz            w0, 0b

CPU_LE( rev             x8, x8          )
        st1             {XL.2d}, [x1]
        str             x8, [x5, #8]                    // store lower counter

        .if             \enc == 1
        st1             {KS0.16b-KS1.16b}, [x10]
        .endif

        ret

2:      b.eq            3f                              // AES-192?
        enc_round       KS0, v17
        enc_round       KS1, v17
        enc_round       KS0, v18
        enc_round       KS1, v18
3:      enc_round       KS0, v19
        enc_round       KS1, v19
        enc_round       KS0, v20
        enc_round       KS1, v20
        b               1b

4:      load_round_keys w7, x6
        b               0b
        .endm

        /*
         * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        u32 const rk[], int rounds, u8 ks[])
         */
ENTRY(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
ENDPROC(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u8 ctr[],
         *                        u32 const rk[], int rounds)
         */
ENTRY(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
ENDPROC(pmull_gcm_decrypt)

        /*
         * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
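         *
         * If rk is NULL, the AES round keys already loaded in v17-v31 are
         * reused rather than reloaded.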
         */
ENTRY(pmull_gcm_encrypt_block)
        cbz             x2, 0f
        load_round_keys w3, x2
0:      ld1             {v0.16b}, [x1]
        enc_block       v0, w3
        st1             {v0.16b}, [x0]
        ret
ENDPROC(pmull_gcm_encrypt_block)