/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        T2              .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        HH              .req    q10
        HH3             .req    q11
        HH4             .req    q12
        HH34            .req    q13

        HH_L            .req    d20
        HH_H            .req    d21
        HH3_L           .req    d22
        HH3_H           .req    d23
        HH4_L           .req    d24
        HH4_H           .req    d25
        HH34_L          .req    d26
        HH34_H          .req    d27
        SHASH2_H        .req    d29

        XL2             .req    q5
        XM2             .req    q6
        XH2             .req    q7
        T3              .req    q8

        XL2_L           .req    d10
        XL2_H           .req    d11
        XM2_L           .req    d12
        XM2_H           .req    d13
        T3_L            .req    d16
        T3_H            .req    d17

        .text
        .fpu            crypto-neon-fp-armv8

        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm
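
        // Note: the b1-b4 arguments of __pmull_p64 are unused; they exist
        // only so that the p64 and p8 variants can be invoked with the same
        // argument list from the shared ghash_update macro below.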

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
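
        /*
         * For reference, a minimal C sketch of the plain 64x64 -> 128 bit
         * carry-less multiplication that both __pmull_p64 and __pmull_p8
         * compute (illustrative only, not part of the build; the helper name
         * and prototype are made up here):
         *
         *      static void clmul_64x64(u64 a, u64 b, u64 res[2])
         *      {
         *              u64 lo = 0, hi = 0;
         *              int i;
         *
         *              for (i = 0; i < 64; i++) {
         *                      if (b & (1ULL << i)) {
         *                              lo ^= a << i;           // bits 0..63
         *                              if (i)
         *                                      hi ^= a >> (64 - i); // bits 64..127
         *                      }
         *              }
         *              res[0] = lo;
         *              res[1] = hi;
         *      }
         */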
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
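        // The multiply steps leave a Karatsuba-style 256-bit product spread
        // across XL, XM and XH; the reduce macros (together with a few veors
        // in their caller) fold it back to 128 bits modulo the GHASH field
        // polynomial x^128 + x^7 + x^2 + x + 1.  This variant relies on the
        // reduction constant that pmull_ghash_update_p64 precomputes into
        // MASK; the p8 variant below expands the same reduction into shifts
        // and XORs.
        //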
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm

        .macro          ghash_update, pn
        vld1.64         {XL}, [r1]

        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               3f

0:      .ifc            \pn, p64
        tst             r0, #3                  // skip until #blocks is a
        bne             2f                      // round multiple of 4

        vld1.8          {XL2-XM2}, [r2]!
1:      vld1.8          {T3-T2}, [r2]!
        vrev64.8        XL2, XL2
        vrev64.8        XM2, XM2

        subs            r0, r0, #4

        vext.8          T1, XL2, XL2, #8
        veor            XL2_H, XL2_H, XL_L
        veor            XL, XL, T1

        vrev64.8        T3, T3
        vrev64.8        T1, T2

        vmull.p64       XH, HH4_H, XL_H                 // a1 * b1
        veor            XL2_H, XL2_H, XL_H
        vmull.p64       XL, HH4_L, XL_L                 // a0 * b0
        vmull.p64       XM, HH34_H, XL2_H               // (a1 + a0)(b1 + b0)

        vmull.p64       XH2, HH3_H, XM2_L               // a1 * b1
        veor            XM2_L, XM2_L, XM2_H
        vmull.p64       XL2, HH3_L, XM2_H               // a0 * b0
        vmull.p64       XM2, HH34_L, XM2_L              // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, HH_H, T3_L                 // a1 * b1
        veor            T3_L, T3_L, T3_H
        vmull.p64       XL2, HH_L, T3_H                 // a0 * b0
        vmull.p64       XM2, SHASH2_H, T3_L             // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, SHASH_H, T1_L              // a1 * b1
        veor            T1_L, T1_L, T1_H
        vmull.p64       XL2, SHASH_L, T1_H              // a0 * b0
        vmull.p64       XM2, SHASH2_p64, T1_L           // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        beq             4f

        vld1.8          {XL2-XM2}, [r2]!

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_p64

        veor            T1, T1, XH
        veor            XL, XL, T1

        b               1b
        .endif

2:      vld1.64         {T1}, [r2]!
        subs            r0, r0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
        vrev64.8        T1, T1
#endif
        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

4:      veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b

        vst1.64         {XL}, [r1]
        bx              lr
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
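        //
        // AAPCS passes the first four arguments in r0 (blocks), r1 (dg),
        // r2 (src) and r3 (k); the fifth argument, 'head', is passed on the
        // stack, which is why ghash_update loads it with 'ldr ip, [sp]'.
        //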
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]!
        vld1.64         {HH}, [r3]!
        vld1.64         {HH3-HH4}, [r3]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
ENDPROC(pmull_ghash_update_p8)