arch/x86/crypto/poly1305-avx2-x86_64.S
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section        .rodata.cst32.ANMASK, "aM", @progbits, 32
.align 32
ANMASK: .octa 0x0000000003ffffff0000000003ffffff
        .octa 0x0000000003ffffff0000000003ffffff

.section        .rodata.cst32.ORMASK, "aM", @progbits, 32
.align 32
ORMASK: .octa 0x00000000010000000000000001000000
        .octa 0x00000000010000000000000001000000
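# ANMASK masks each 64-bit lane down to its low 26 bits (one radix-2^26
# limb); ORMASK sets bit 24 of each lane, which is the 2^128 "1" pad bit
# of a full 16-byte block once it ends up in limb 4 after the shift by 8.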

.text

#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
#define w0 0x14(%r8)
#define w1 0x18(%r8)
#define w2 0x1c(%r8)
#define w3 0x20(%r8)
#define w4 0x24(%r8)
#define y0 0x28(%r8)
#define y1 0x2c(%r8)
#define y2 0x30(%r8)
#define y3 0x34(%r8)
#define y4 0x38(%r8)
#define m %rsi
#define hc0 %ymm0
#define hc1 %ymm1
#define hc2 %ymm2
#define hc3 %ymm3
#define hc4 %ymm4
#define hc0x %xmm0
#define hc1x %xmm1
#define hc2x %xmm2
#define hc3x %xmm3
#define hc4x %xmm4
#define t1 %ymm5
#define t2 %ymm6
#define t1x %xmm5
#define t2x %xmm6
#define ruwy0 %ymm7
#define ruwy1 %ymm8
#define ruwy2 %ymm9
#define ruwy3 %ymm10
#define ruwy4 %ymm11
#define ruwy0x %xmm7
#define ruwy1x %xmm8
#define ruwy2x %xmm9
#define ruwy3x %xmm10
#define ruwy4x %xmm11
#define svxz1 %ymm12
#define svxz2 %ymm13
#define svxz3 %ymm14
#define svxz4 %ymm15
#define d0 %r9
#define d1 %r10
#define d2 %r11
#define d3 %r12
#define d4 %r13

ENTRY(poly1305_4block_avx2)
        # %rdi: Accumulator h[5]
        # %rsi: 64-byte input block m
        # %rdx: Poly1305 key r[5]
        # %rcx: Quadblock count
        # %r8:  Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5]

        # This four-block variant processes four message blocks per loop
        # iteration. It requires four Poly1305 keys: r, r^2, r^3 and r^4:
        # h = (h + m) * r  =>  h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
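        # Applying h = (h + m) * r four times and expanding gives
        #   ((((h + m1)*r + m2)*r + m3)*r + m4)*r
        #     = (h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r   (mod 2^130 - 5),
        # so the four products can be computed independently in the four
        # 64-bit lanes of a YMM register and summed afterwards.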

        vzeroupper
        push            %rbx
        push            %r12
        push            %r13

        # combine r0,u0,w0,y0
        vmovd           y0,ruwy0x
        vmovd           w0,t1x
        vpunpcklqdq     t1,ruwy0,ruwy0
        vmovd           u0,t1x
        vmovd           r0,t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,ruwy0,ruwy0
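        # ruwy0 now holds {y0, w0, u0, r0} in its four 64-bit lanes (low to
        # high), so a single vpmuludq multiplies limb 0 of r^4, r^3, r^2 and
        # r against the four interleaved message blocks at once.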

        # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
        vmovd           y1,ruwy1x
        vmovd           w1,t1x
        vpunpcklqdq     t1,ruwy1,ruwy1
        vmovd           u1,t1x
        vmovd           r1,t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,ruwy1,ruwy1
        vpslld          $2,ruwy1,svxz1
        vpaddd          ruwy1,svxz1,svxz1
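        # The *5 copies (x*4 + x via shift and add) implement the modular
        # wrap-around: since 2^130 == 5 (mod 2^130 - 5), products that spill
        # past limb 4 re-enter the low limbs scaled by 5.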

        # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
        vmovd           y2,ruwy2x
        vmovd           w2,t1x
        vpunpcklqdq     t1,ruwy2,ruwy2
        vmovd           u2,t1x
        vmovd           r2,t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,ruwy2,ruwy2
        vpslld          $2,ruwy2,svxz2
        vpaddd          ruwy2,svxz2,svxz2

        # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
        vmovd           y3,ruwy3x
        vmovd           w3,t1x
        vpunpcklqdq     t1,ruwy3,ruwy3
        vmovd           u3,t1x
        vmovd           r3,t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,ruwy3,ruwy3
        vpslld          $2,ruwy3,svxz3
        vpaddd          ruwy3,svxz3,svxz3

        # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
        vmovd           y4,ruwy4x
        vmovd           w4,t1x
        vpunpcklqdq     t1,ruwy4,ruwy4
        vmovd           u4,t1x
        vmovd           r4,t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,ruwy4,ruwy4
        vpslld          $2,ruwy4,svxz4
        vpaddd          ruwy4,svxz4,svxz4
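        # No svxz0 is needed: in the 5x5 limb product the wrapped terms
        # (limb indices summing to 5 or more) never involve limb 0 of the
        # key, so only limbs 1-4 need the pre-scaled *5 copies.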

.Ldoblock4:
        # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
        #        m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
        vmovd           0x00(m),hc0x
        vmovd           0x10(m),t1x
        vpunpcklqdq     t1,hc0,hc0
        vmovd           0x20(m),t1x
        vmovd           0x30(m),t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,hc0,hc0
        vpand           ANMASK(%rip),hc0,hc0
        vmovd           h0,t1x
        vpaddd          t1,hc0,hc0
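        # The five 26-bit limbs of each 16-byte block come from dword loads
        # at byte offsets 0, 3, 6, 9 and 12, shifted right by 0, 2, 4, 6 and
        # 8 bits respectively, then masked (or padded, for limb 4).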
        # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
        #        (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
        vmovd           0x03(m),hc1x
        vmovd           0x13(m),t1x
        vpunpcklqdq     t1,hc1,hc1
        vmovd           0x23(m),t1x
        vmovd           0x33(m),t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,hc1,hc1
        vpsrld          $2,hc1,hc1
        vpand           ANMASK(%rip),hc1,hc1
        vmovd           h1,t1x
        vpaddd          t1,hc1,hc1
        # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
        #        (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
        vmovd           0x06(m),hc2x
        vmovd           0x16(m),t1x
        vpunpcklqdq     t1,hc2,hc2
        vmovd           0x26(m),t1x
        vmovd           0x36(m),t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,hc2,hc2
        vpsrld          $4,hc2,hc2
        vpand           ANMASK(%rip),hc2,hc2
        vmovd           h2,t1x
        vpaddd          t1,hc2,hc2
        # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
        #        (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
        vmovd           0x09(m),hc3x
        vmovd           0x19(m),t1x
        vpunpcklqdq     t1,hc3,hc3
        vmovd           0x29(m),t1x
        vmovd           0x39(m),t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,hc3,hc3
        vpsrld          $6,hc3,hc3
        vpand           ANMASK(%rip),hc3,hc3
        vmovd           h3,t1x
        vpaddd          t1,hc3,hc3
        # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
        #        (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
        vmovd           0x0c(m),hc4x
        vmovd           0x1c(m),t1x
        vpunpcklqdq     t1,hc4,hc4
        vmovd           0x2c(m),t1x
        vmovd           0x3c(m),t2x
        vpunpcklqdq     t2,t1,t1
        vperm2i128      $0x20,t1,hc4,hc4
        vpsrld          $8,hc4,hc4
        vpor            ORMASK(%rip),hc4,hc4
        vmovd           h4,t1x
        vpaddd          t1,hc4,hc4
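        # The running accumulator h is added only into lane 0, the oldest of
        # the four blocks; that lane is the one multiplied by r^4 below,
        # matching the (h + m1) * r^4 term of the formula above.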

        # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
        vpmuludq        hc0,ruwy0,t1
        # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
        vpmuludq        hc1,svxz4,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
        vpmuludq        hc2,svxz3,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
        vpmuludq        hc3,svxz2,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
        vpmuludq        hc4,svxz1,t2
        vpaddq          t2,t1,t1
        # d0 = t1[0] + t1[1] + t1[2] + t1[3]
        vpermq          $0xee,t1,t2
        vpaddq          t2,t1,t1
        vpsrldq         $8,t1,t2
        vpaddq          t2,t1,t1
        vmovq           t1x,d0
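        # Horizontal sum: vpermq $0xee copies the upper 128-bit half onto
        # the lower half, vpsrldq $8 then folds in the remaining odd lane,
        # so after the two vpaddq the full 64-bit sum sits in lane 0.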

        # t1 = [ hc0[3] * r1, hc0[2] * u1, hc0[1] * w1, hc0[0] * y1 ]
        vpmuludq        hc0,ruwy1,t1
        # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
        vpmuludq        hc1,ruwy0,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
        vpmuludq        hc2,svxz4,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
        vpmuludq        hc3,svxz3,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
        vpmuludq        hc4,svxz2,t2
        vpaddq          t2,t1,t1
        # d1 = t1[0] + t1[1] + t1[2] + t1[3]
        vpermq          $0xee,t1,t2
        vpaddq          t2,t1,t1
        vpsrldq         $8,t1,t2
        vpaddq          t2,t1,t1
        vmovq           t1x,d1

        # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
        vpmuludq        hc0,ruwy2,t1
        # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
        vpmuludq        hc1,ruwy1,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
        vpmuludq        hc2,ruwy0,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
        vpmuludq        hc3,svxz4,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
        vpmuludq        hc4,svxz3,t2
        vpaddq          t2,t1,t1
        # d2 = t1[0] + t1[1] + t1[2] + t1[3]
        vpermq          $0xee,t1,t2
        vpaddq          t2,t1,t1
        vpsrldq         $8,t1,t2
        vpaddq          t2,t1,t1
        vmovq           t1x,d2

        # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
        vpmuludq        hc0,ruwy3,t1
        # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
        vpmuludq        hc1,ruwy2,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
        vpmuludq        hc2,ruwy1,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
        vpmuludq        hc3,ruwy0,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
        vpmuludq        hc4,svxz4,t2
        vpaddq          t2,t1,t1
        # d3 = t1[0] + t1[1] + t1[2] + t1[3]
        vpermq          $0xee,t1,t2
        vpaddq          t2,t1,t1
        vpsrldq         $8,t1,t2
        vpaddq          t2,t1,t1
        vmovq           t1x,d3

        # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
        vpmuludq        hc0,ruwy4,t1
        # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
        vpmuludq        hc1,ruwy3,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
        vpmuludq        hc2,ruwy2,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
        vpmuludq        hc3,ruwy1,t2
        vpaddq          t2,t1,t1
        # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
        vpmuludq        hc4,ruwy0,t2
        vpaddq          t2,t1,t1
        # d4 = t1[0] + t1[1] + t1[2] + t1[3]
        vpermq          $0xee,t1,t2
        vpaddq          t2,t1,t1
        vpsrldq         $8,t1,t2
        vpaddq          t2,t1,t1
        vmovq           t1x,d4

        # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
        # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
        # amount.  Careful: we must not assume the carry bits 'd0 >> 26',
        # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
        # integers.  It's true in a single-block implementation, but not here.
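        # (Each d above is the sum of 20 partial products of 26-bit limbs,
        # some pre-scaled by 5, so it can approach roughly 2^60 and its
        # carry no longer fits in 32 bits; hence the 64-bit chain below.)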

        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
        add             %rax,d1
        # h0 = d0 & 0x3ffffff
        mov             d0,%rbx
        and             $0x3ffffff,%ebx

        # d2 += d1 >> 26
        mov             d1,%rax
        shr             $26,%rax
        add             %rax,d2
        # h1 = d1 & 0x3ffffff
        mov             d1,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h1

        # d3 += d2 >> 26
        mov             d2,%rax
        shr             $26,%rax
        add             %rax,d3
        # h2 = d2 & 0x3ffffff
        mov             d2,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h2

        # d4 += d3 >> 26
        mov             d3,%rax
        shr             $26,%rax
        add             %rax,d4
        # h3 = d3 & 0x3ffffff
        mov             d3,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h3

        # h0 += (d4 >> 26) * 5
        mov             d4,%rax
        shr             $26,%rax
        lea             (%rax,%rax,4),%rax
        add             %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4

        # h1 += h0 >> 26
        mov             %rbx,%rax
        shr             $26,%rax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        andl            $0x3ffffff,%ebx
        mov             %ebx,h0

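        # Advance to the next 64-byte quadblock and loop until the quadblock
        # count in %rcx is exhausted.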
        add             $0x40,m
        dec             %rcx
        jnz             .Ldoblock4

        vzeroupper
        pop             %r13
        pop             %r12
        pop             %rbx
        ret
ENDPROC(poly1305_4block_avx2)