/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        T2              .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        HH              .req    q10
        HH3             .req    q11
        HH4             .req    q12
        HH34            .req    q13

        HH_L            .req    d20
        HH_H            .req    d21
        HH3_L           .req    d22
        HH3_H           .req    d23
        HH4_L           .req    d24
        HH4_H           .req    d25
        HH34_L          .req    d26
        HH34_H          .req    d27
        SHASH2_H        .req    d29

        XL2             .req    q5
        XM2             .req    q6
        XH2             .req    q7
        T3              .req    q8

        XL2_L           .req    d10
        XL2_H           .req    d11
        XM2_L           .req    d12
        XM2_H           .req    d13
        T3_L            .req    d16
        T3_H            .req    d17

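        // Note that several register names are aliases shared between the
        // p8 and the p64 code paths (e.g. d20-d27, d28, d29 and d31): the
        // two paths are exposed as separate entry points and never execute
        // together, so they are free to reuse each other's registers.
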
        .text
        .fpu            crypto-neon-fp-armv8
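
        // The .fpu directive above lets the assembler accept the vmull.p64
        // instructions provided by the ARMv8 Crypto Extensions; the p8 code
        // path below avoids those instructions so that it can run on CPUs
        // that only implement plain NEON.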

        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm
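
        //
        // The __pmull_p64 wrapper above takes the same argument list as
        // __pmull_p8 below (the b1..b4 arguments are simply ignored), so
        // that the ghash_update macro can expand either variant without
        // special casing.
        //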

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
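        //
        // Roughly: each vmull.p8 performs eight independent 8x8 -> 16 bit
        // carryless multiplications in parallel, and the byte-rotated
        // copies of the operands (A1..A3, B1..B4) supply the cross terms
        // of the schoolbook multiplication. The k16/k32/k48 masks and the
        // vext.8 #15/#14/#13/#12 steps then align the partial sums before
        // they are XORed into the final 128-bit result; see the paper
        // referenced above for the exact derivation.
        //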
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
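        // The 256-bit product accumulated in XL:XM:XH is folded back to
        // 128 bits modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1.
        // MASK is preloaded with 0xe1 << 57 by the entry point, the
        // customary constant for this reduction on bit-reflected data, so
        // the two vmull.p64 multiplications by MASK perform the folding.
        //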
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
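        // The multiplications by the reduction constant are replaced with
        // shifts and XORs, exploiting the fact that, apart from x^128, the
        // GHASH polynomial only contains the low-order terms
        // x^7 + x^2 + x + 1.
        //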
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm

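        //
        // GHASH core loop: r0 = number of blocks, r1 = digest (dg),
        // r2 = input, and an optional pointer to a partial 'head' block is
        // passed on the stack (see the C prototype further down). The key
        // material is expected to have been loaded into SHASH (and, for
        // the p64 variant, HH/HH3/HH4/HH34) by the entry points below.
        //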
        .macro          ghash_update, pn
        vld1.64         {XL}, [r1]

        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               3f

0:      .ifc            \pn, p64
        tst             r0, #3                  // skip until #blocks is a
        bne             2f                      // round multiple of 4

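        // 4-way aggregation: four blocks are multiplied by the powers of H
        // held in HH4, HH3, HH and SHASH (H^4, H^3, H^2 and H) and the
        // partial products are summed, so only one reduction is needed for
        // every four blocks of input.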
        vld1.8          {XL2-XM2}, [r2]!
1:      vld1.8          {T3-T2}, [r2]!
        vrev64.8        XL2, XL2
        vrev64.8        XM2, XM2

        subs            r0, r0, #4

        vext.8          T1, XL2, XL2, #8
        veor            XL2_H, XL2_H, XL_L
        veor            XL, XL, T1

        vrev64.8        T3, T3
        vrev64.8        T1, T2

        vmull.p64       XH, HH4_H, XL_H                 // a1 * b1
        veor            XL2_H, XL2_H, XL_H
        vmull.p64       XL, HH4_L, XL_L                 // a0 * b0
        vmull.p64       XM, HH34_H, XL2_H               // (a1 + a0)(b1 + b0)

        vmull.p64       XH2, HH3_H, XM2_L               // a1 * b1
        veor            XM2_L, XM2_L, XM2_H
        vmull.p64       XL2, HH3_L, XM2_H               // a0 * b0
        vmull.p64       XM2, HH34_L, XM2_L              // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, HH_H, T3_L                 // a1 * b1
        veor            T3_L, T3_L, T3_H
        vmull.p64       XL2, HH_L, T3_H                 // a0 * b0
        vmull.p64       XM2, SHASH2_H, T3_L             // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, SHASH_H, T1_L              // a1 * b1
        veor            T1_L, T1_L, T1_H
        vmull.p64       XL2, SHASH_L, T1_H              // a0 * b0
        vmull.p64       XM2, SHASH2_p64, T1_L           // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        beq             4f

        vld1.8          {XL2-XM2}, [r2]!

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_p64

        veor            T1, T1, XH
        veor            XL, XL, T1

        b               1b
        .endif

2:      vld1.64         {T1}, [r2]!
        subs            r0, r0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
        vrev64.8        T1, T1
#endif
        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

4:      veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b

        vst1.64         {XL}, [r1]
        bx              lr
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
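        //
        // The p64 variant loads four values from the key schedule at r3
        // into SHASH, HH, HH3 and HH4; these hold H and its higher powers
        // H^2, H^3 and H^4 as consumed by the 4-way loop above. The
        // SHASH2_*/HH34 values are the XORs of the high and low halves of
        // those powers, precomputed for the (a1 + a0)(b1 + b0) middle
        // products.
        //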
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]!
        vld1.64         {HH}, [r3]!
        vld1.64         {HH3-HH4}, [r3]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
ENDPROC(pmull_ghash_update_p64)

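        //
        // The p8 variant only needs H itself. The byte-rotated copies
        // s1l..s4h and the k16/k32/k48 masks set up below are the
        // key-invariant operands consumed by __pmull_p8; precomputing them
        // here means the rotations of the fixed operand do not have to be
        // redone for every block.
        //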
ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
ENDPROC(pmull_ghash_update_p8)