1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3  * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
4  *
5  * Copyright (C) 2015 Martin Willi
6  */
7
8 #include <linux/linkage.h>
9
10 .section        .rodata.cst32.ROT8, "aM", @progbits, 32
11 .align 32
12 ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
13         .octa 0x0e0d0c0f0a09080b0605040702010003
14
15 .section        .rodata.cst32.ROT16, "aM", @progbits, 32
16 .align 32
17 ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
18         .octa 0x0d0c0f0e09080b0a0504070601000302
19
20 .section        .rodata.cst32.CTRINC, "aM", @progbits, 32
21 .align 32
22 CTRINC: .octa 0x00000003000000020000000100000000
23         .octa 0x00000007000000060000000500000004
24
25 .section        .rodata.cst32.CTR2BL, "aM", @progbits, 32
26 .align 32
27 CTR2BL: .octa 0x00000000000000000000000000000000
28         .octa 0x00000000000000000000000000000001
29
30 .section        .rodata.cst32.CTR4BL, "aM", @progbits, 32
31 .align 32
32 CTR4BL: .octa 0x00000000000000000000000000000002
33         .octa 0x00000000000000000000000000000003
34
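The ROT16 and ROT8 tables above are vpshufb masks: within each 128-bit lane they permute the bytes of every 32-bit word so that the shuffle acts as a rotate-left by 16 or 8 bits, which is cheaper than a shift/shift/or sequence. CTRINC supplies the per-block counter offsets 0-7 used by the eight-block routine, while CTR2BL and CTR4BL add 0/1 and 2/3 to the counter word of the broadcast state copies in the two- and four-block routines. The following standalone C sketch (not part of the kernel, little-endian host assumed) checks that the byte masks really implement rotl32:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* One 128-bit lane of ROT16/ROT8, least significant byte first
 * (i.e. the .octa constants above read from right to left). */
static const uint8_t rot16_mask[16] = {
	0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
	0x0a, 0x0b, 0x08, 0x09, 0x0e, 0x0f, 0x0c, 0x0d,
};
static const uint8_t rot8_mask[16] = {
	0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06,
	0x0b, 0x08, 0x09, 0x0a, 0x0f, 0x0c, 0x0d, 0x0e,
};

/* Emulate vpshufb on a single 16-byte lane (no zeroing bits are set
 * in these masks, so the high-bit case is ignored). */
static void pshufb128(uint8_t dst[16], const uint8_t src[16],
		      const uint8_t mask[16])
{
	for (int i = 0; i < 16; i++)
		dst[i] = src[mask[i] & 0x0f];
}

static uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

int main(void)
{
	uint32_t in[4] = { 0x01234567, 0x89abcdef, 0xdeadbeef, 0x00c0ffee };
	uint32_t out[4];
	uint8_t src[16], dst[16];

	memcpy(src, in, 16);	/* little-endian dword layout, as in a lane */

	pshufb128(dst, src, rot16_mask);
	memcpy(out, dst, 16);
	for (int i = 0; i < 4; i++)
		assert(out[i] == rotl32(in[i], 16));

	pshufb128(dst, src, rot8_mask);
	memcpy(out, dst, 16);
	for (int i = 0; i < 4; i++)
		assert(out[i] == rotl32(in[i], 8));

	puts("ROT16/ROT8 masks implement rotl32 by 16 and 8");
	return 0;
}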
35 .text
36
37 ENTRY(chacha_2block_xor_avx2)
38         # %rdi: Input state matrix, s
39         # %rsi: up to 2 data blocks output, o
40         # %rdx: up to 2 data blocks input, i
41         # %rcx: input/output length in bytes
42         # %r8d: nrounds
43
44         # This function encrypts two ChaCha blocks by loading the state
45         # matrix twice across four AVX registers. It performs matrix operations
46         # on four words in each matrix in parallel, but requires shuffling to
47         # rearrange the words after each round.
48
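For reference, the scalar ChaCha double round that the code below vectorizes looks as follows: ymm0..ymm3 each hold one row of the 4x4 state (duplicated once per block by vbroadcasti128), so a column quarter-round becomes a handful of whole-register instructions, and the vpshufd rotations of rows 1-3 between the two half-rounds bring the diagonals into column position and back. A plain C sketch of the standard reference rounds, not kernel code:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter-round on four 32-bit state words. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
	*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}

/* One double round over the 4x4 state: column round, then diagonal round. */
static void chacha_doubleround(uint32_t x[16])
{
	/* columns */
	chacha_qr(&x[0], &x[4], &x[8],  &x[12]);
	chacha_qr(&x[1], &x[5], &x[9],  &x[13]);
	chacha_qr(&x[2], &x[6], &x[10], &x[14]);
	chacha_qr(&x[3], &x[7], &x[11], &x[15]);
	/* diagonals */
	chacha_qr(&x[0], &x[5], &x[10], &x[15]);
	chacha_qr(&x[1], &x[6], &x[11], &x[12]);
	chacha_qr(&x[2], &x[7], &x[8],  &x[13]);
	chacha_qr(&x[3], &x[4], &x[9],  &x[14]);
}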
49         vzeroupper
50
51         # x0..3[0-1] = s0..3
52         vbroadcasti128  0x00(%rdi),%ymm0
53         vbroadcasti128  0x10(%rdi),%ymm1
54         vbroadcasti128  0x20(%rdi),%ymm2
55         vbroadcasti128  0x30(%rdi),%ymm3
56
57         vpaddd          CTR2BL(%rip),%ymm3,%ymm3
58
59         vmovdqa         %ymm0,%ymm8
60         vmovdqa         %ymm1,%ymm9
61         vmovdqa         %ymm2,%ymm10
62         vmovdqa         %ymm3,%ymm11
63
64         vmovdqa         ROT8(%rip),%ymm4
65         vmovdqa         ROT16(%rip),%ymm5
66
67         mov             %rcx,%rax
68
69 .Ldoubleround:
70
71         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
72         vpaddd          %ymm1,%ymm0,%ymm0
73         vpxor           %ymm0,%ymm3,%ymm3
74         vpshufb         %ymm5,%ymm3,%ymm3
75
76         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
77         vpaddd          %ymm3,%ymm2,%ymm2
78         vpxor           %ymm2,%ymm1,%ymm1
79         vmovdqa         %ymm1,%ymm6
80         vpslld          $12,%ymm6,%ymm6
81         vpsrld          $20,%ymm1,%ymm1
82         vpor            %ymm6,%ymm1,%ymm1
83
84         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
85         vpaddd          %ymm1,%ymm0,%ymm0
86         vpxor           %ymm0,%ymm3,%ymm3
87         vpshufb         %ymm4,%ymm3,%ymm3
88
89         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
90         vpaddd          %ymm3,%ymm2,%ymm2
91         vpxor           %ymm2,%ymm1,%ymm1
92         vmovdqa         %ymm1,%ymm7
93         vpslld          $7,%ymm7,%ymm7
94         vpsrld          $25,%ymm1,%ymm1
95         vpor            %ymm7,%ymm1,%ymm1
96
97         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
98         vpshufd         $0x39,%ymm1,%ymm1
99         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
100         vpshufd         $0x4e,%ymm2,%ymm2
101         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
102         vpshufd         $0x93,%ymm3,%ymm3
103
104         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
105         vpaddd          %ymm1,%ymm0,%ymm0
106         vpxor           %ymm0,%ymm3,%ymm3
107         vpshufb         %ymm5,%ymm3,%ymm3
108
109         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
110         vpaddd          %ymm3,%ymm2,%ymm2
111         vpxor           %ymm2,%ymm1,%ymm1
112         vmovdqa         %ymm1,%ymm6
113         vpslld          $12,%ymm6,%ymm6
114         vpsrld          $20,%ymm1,%ymm1
115         vpor            %ymm6,%ymm1,%ymm1
116
117         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
118         vpaddd          %ymm1,%ymm0,%ymm0
119         vpxor           %ymm0,%ymm3,%ymm3
120         vpshufb         %ymm4,%ymm3,%ymm3
121
122         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
123         vpaddd          %ymm3,%ymm2,%ymm2
124         vpxor           %ymm2,%ymm1,%ymm1
125         vmovdqa         %ymm1,%ymm7
126         vpslld          $7,%ymm7,%ymm7
127         vpsrld          $25,%ymm1,%ymm1
128         vpor            %ymm7,%ymm1,%ymm1
129
130         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
131         vpshufd         $0x93,%ymm1,%ymm1
132         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
133         vpshufd         $0x4e,%ymm2,%ymm2
134         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
135         vpshufd         $0x39,%ymm3,%ymm3
136
137         sub             $2,%r8d
138         jnz             .Ldoubleround
139
140         # o0 = i0 ^ (x0 + s0)
141         vpaddd          %ymm8,%ymm0,%ymm7
142         cmp             $0x10,%rax
143         jl              .Lxorpart2
144         vpxor           0x00(%rdx),%xmm7,%xmm6
145         vmovdqu         %xmm6,0x00(%rsi)
146         vextracti128    $1,%ymm7,%xmm0
147         # o1 = i1 ^ (x1 + s1)
148         vpaddd          %ymm9,%ymm1,%ymm7
149         cmp             $0x20,%rax
150         jl              .Lxorpart2
151         vpxor           0x10(%rdx),%xmm7,%xmm6
152         vmovdqu         %xmm6,0x10(%rsi)
153         vextracti128    $1,%ymm7,%xmm1
154         # o2 = i2 ^ (x2 + s2)
155         vpaddd          %ymm10,%ymm2,%ymm7
156         cmp             $0x30,%rax
157         jl              .Lxorpart2
158         vpxor           0x20(%rdx),%xmm7,%xmm6
159         vmovdqu         %xmm6,0x20(%rsi)
160         vextracti128    $1,%ymm7,%xmm2
161         # o3 = i3 ^ (x3 + s3)
162         vpaddd          %ymm11,%ymm3,%ymm7
163         cmp             $0x40,%rax
164         jl              .Lxorpart2
165         vpxor           0x30(%rdx),%xmm7,%xmm6
166         vmovdqu         %xmm6,0x30(%rsi)
167         vextracti128    $1,%ymm7,%xmm3
168
169         # xor and write second block
170         vmovdqa         %xmm0,%xmm7
171         cmp             $0x50,%rax
172         jl              .Lxorpart2
173         vpxor           0x40(%rdx),%xmm7,%xmm6
174         vmovdqu         %xmm6,0x40(%rsi)
175
176         vmovdqa         %xmm1,%xmm7
177         cmp             $0x60,%rax
178         jl              .Lxorpart2
179         vpxor           0x50(%rdx),%xmm7,%xmm6
180         vmovdqu         %xmm6,0x50(%rsi)
181
182         vmovdqa         %xmm2,%xmm7
183         cmp             $0x70,%rax
184         jl              .Lxorpart2
185         vpxor           0x60(%rdx),%xmm7,%xmm6
186         vmovdqu         %xmm6,0x60(%rsi)
187
188         vmovdqa         %xmm3,%xmm7
189         cmp             $0x80,%rax
190         jl              .Lxorpart2
191         vpxor           0x70(%rdx),%xmm7,%xmm6
192         vmovdqu         %xmm6,0x70(%rsi)
193
194 .Ldone2:
195         vzeroupper
196         ret
197
198 .Lxorpart2:
199         # xor remaining bytes from partial register into output
200         mov             %rax,%r9
201         and             $0x0f,%r9
202         jz              .Ldone2
203         and             $~0x0f,%rax
204
205         mov             %rsi,%r11
206
207         lea             8(%rsp),%r10
208         sub             $0x10,%rsp
209         and             $~31,%rsp
210
211         lea             (%rdx,%rax),%rsi
212         mov             %rsp,%rdi
213         mov             %r9,%rcx
214         rep movsb
215
216         vpxor           0x00(%rsp),%xmm7,%xmm7
217         vmovdqa         %xmm7,0x00(%rsp)
218
219         mov             %rsp,%rsi
220         lea             (%r11,%rax),%rdi
221         mov             %r9,%rcx
222         rep movsb
223
224         lea             -8(%r10),%rsp
225         jmp             .Ldone2
226
227 ENDPROC(chacha_2block_xor_avx2)
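The .Lxorpart2 tail above deals with a length that is not a multiple of the 16-byte register width: it rounds the length down, bounces the remaining 1-15 input bytes through an aligned stack slot, XORs them with the keystream chunk left in %xmm7, and copies the result back with rep movsb. The four- and eight-block routines do the same with %xmm10 and a 32-byte %ymm0 chunk respectively. The C sketch below models this tail handling together with the cmp/jl ladder that precedes it; the helpers are hypothetical and purely illustrative:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* XOR one full 16-byte keystream chunk into the output. */
static void xor_full_chunk(uint8_t *dst, const uint8_t *src, size_t off,
			   const uint8_t ks[16])
{
	for (size_t i = 0; i < 16; i++)
		dst[off + i] = src[off + i] ^ ks[i];
}

/* XOR a trailing partial chunk (0..15 bytes) via a bounce buffer,
 * mirroring the rep movsb / vpxor / rep movsb sequence above. */
static void xor_partial_chunk(uint8_t *dst, const uint8_t *src, size_t off,
			      size_t rem, const uint8_t ks[16])
{
	uint8_t buf[16] = { 0 };

	memcpy(buf, src + off, rem);	/* input tail -> stack buffer */
	for (size_t i = 0; i < 16; i++)
		buf[i] ^= ks[i];	/* vpxor with the keystream   */
	memcpy(dst + off, buf, rem);	/* stack buffer -> output     */
}

/* Model of the cmp/jl ladder: emit whole chunks while the requested
 * length allows, then hand the rest to the partial handler. */
static void xor_stream(uint8_t *dst, const uint8_t *src, size_t len,
		       const uint8_t keystream[][16], size_t nchunks)
{
	for (size_t i = 0; i < nchunks; i++) {
		size_t off = i * 16;

		if (len < off + 16) {
			xor_partial_chunk(dst, src, off, len - off,
					  keystream[i]);
			return;
		}
		xor_full_chunk(dst, src, off, keystream[i]);
	}
}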
228
229 ENTRY(chacha_4block_xor_avx2)
230         # %rdi: Input state matrix, s
231         # %rsi: up to 4 data blocks output, o
232         # %rdx: up to 4 data blocks input, i
233         # %rcx: input/output length in bytes
234         # %r8d: nrounds
235
236         # This function encrypts four ChaCha blocks by loading the state
237         # matrix four times across eight AVX registers. It performs matrix
238         # operations on four words in two matrices in parallel, followed by
239         # the same operations on the four words of the other two matrices. As
240         # the required word shuffling has a rather high latency, we can do the
241         # arithmetic on two matrix-pairs without much slowdown.
242
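Concretely, the four blocks get their counters by adding CTR2BL to one pair of broadcast state copies (blocks 0 and 1) and CTR4BL to the other pair (blocks 2 and 3); only the low 32-bit word of row 3, the block counter, is affected. A hypothetical C sketch of the equivalent setup, for illustration only:

#include <stdint.h>

/*
 * Row 3 of the ChaCha state is { counter, nonce0, nonce1, nonce2 }.
 * Each of the two ymm register sets holds two copies of the state,
 * so the four copies end up with counters base+0 .. base+3.
 */
static void set_block_counters(uint32_t row3[4][4], const uint32_t base_row3[4])
{
	for (int blk = 0; blk < 4; blk++) {
		for (int w = 0; w < 4; w++)
			row3[blk][w] = base_row3[w];
		/* CTR2BL adds 0 and 1, CTR4BL adds 2 and 3 */
		row3[blk][0] += (uint32_t)blk;
	}
}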
243         vzeroupper
244
245         # x0..3[0-3] = s0..3
246         vbroadcasti128  0x00(%rdi),%ymm0
247         vbroadcasti128  0x10(%rdi),%ymm1
248         vbroadcasti128  0x20(%rdi),%ymm2
249         vbroadcasti128  0x30(%rdi),%ymm3
250
251         vmovdqa         %ymm0,%ymm4
252         vmovdqa         %ymm1,%ymm5
253         vmovdqa         %ymm2,%ymm6
254         vmovdqa         %ymm3,%ymm7
255
256         vpaddd          CTR2BL(%rip),%ymm3,%ymm3
257         vpaddd          CTR4BL(%rip),%ymm7,%ymm7
258
259         vmovdqa         %ymm0,%ymm11
260         vmovdqa         %ymm1,%ymm12
261         vmovdqa         %ymm2,%ymm13
262         vmovdqa         %ymm3,%ymm14
263         vmovdqa         %ymm7,%ymm15
264
265         vmovdqa         ROT8(%rip),%ymm8
266         vmovdqa         ROT16(%rip),%ymm9
267
268         mov             %rcx,%rax
269
270 .Ldoubleround4:
271
272         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
273         vpaddd          %ymm1,%ymm0,%ymm0
274         vpxor           %ymm0,%ymm3,%ymm3
275         vpshufb         %ymm9,%ymm3,%ymm3
276
277         vpaddd          %ymm5,%ymm4,%ymm4
278         vpxor           %ymm4,%ymm7,%ymm7
279         vpshufb         %ymm9,%ymm7,%ymm7
280
281         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
282         vpaddd          %ymm3,%ymm2,%ymm2
283         vpxor           %ymm2,%ymm1,%ymm1
284         vmovdqa         %ymm1,%ymm10
285         vpslld          $12,%ymm10,%ymm10
286         vpsrld          $20,%ymm1,%ymm1
287         vpor            %ymm10,%ymm1,%ymm1
288
289         vpaddd          %ymm7,%ymm6,%ymm6
290         vpxor           %ymm6,%ymm5,%ymm5
291         vmovdqa         %ymm5,%ymm10
292         vpslld          $12,%ymm10,%ymm10
293         vpsrld          $20,%ymm5,%ymm5
294         vpor            %ymm10,%ymm5,%ymm5
295
296         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
297         vpaddd          %ymm1,%ymm0,%ymm0
298         vpxor           %ymm0,%ymm3,%ymm3
299         vpshufb         %ymm8,%ymm3,%ymm3
300
301         vpaddd          %ymm5,%ymm4,%ymm4
302         vpxor           %ymm4,%ymm7,%ymm7
303         vpshufb         %ymm8,%ymm7,%ymm7
304
305         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
306         vpaddd          %ymm3,%ymm2,%ymm2
307         vpxor           %ymm2,%ymm1,%ymm1
308         vmovdqa         %ymm1,%ymm10
309         vpslld          $7,%ymm10,%ymm10
310         vpsrld          $25,%ymm1,%ymm1
311         vpor            %ymm10,%ymm1,%ymm1
312
313         vpaddd          %ymm7,%ymm6,%ymm6
314         vpxor           %ymm6,%ymm5,%ymm5
315         vmovdqa         %ymm5,%ymm10
316         vpslld          $7,%ymm10,%ymm10
317         vpsrld          $25,%ymm5,%ymm5
318         vpor            %ymm10,%ymm5,%ymm5
319
320         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
321         vpshufd         $0x39,%ymm1,%ymm1
322         vpshufd         $0x39,%ymm5,%ymm5
323         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
324         vpshufd         $0x4e,%ymm2,%ymm2
325         vpshufd         $0x4e,%ymm6,%ymm6
326         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
327         vpshufd         $0x93,%ymm3,%ymm3
328         vpshufd         $0x93,%ymm7,%ymm7
329
330         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
331         vpaddd          %ymm1,%ymm0,%ymm0
332         vpxor           %ymm0,%ymm3,%ymm3
333         vpshufb         %ymm9,%ymm3,%ymm3
334
335         vpaddd          %ymm5,%ymm4,%ymm4
336         vpxor           %ymm4,%ymm7,%ymm7
337         vpshufb         %ymm9,%ymm7,%ymm7
338
339         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
340         vpaddd          %ymm3,%ymm2,%ymm2
341         vpxor           %ymm2,%ymm1,%ymm1
342         vmovdqa         %ymm1,%ymm10
343         vpslld          $12,%ymm10,%ymm10
344         vpsrld          $20,%ymm1,%ymm1
345         vpor            %ymm10,%ymm1,%ymm1
346
347         vpaddd          %ymm7,%ymm6,%ymm6
348         vpxor           %ymm6,%ymm5,%ymm5
349         vmovdqa         %ymm5,%ymm10
350         vpslld          $12,%ymm10,%ymm10
351         vpsrld          $20,%ymm5,%ymm5
352         vpor            %ymm10,%ymm5,%ymm5
353
354         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
355         vpaddd          %ymm1,%ymm0,%ymm0
356         vpxor           %ymm0,%ymm3,%ymm3
357         vpshufb         %ymm8,%ymm3,%ymm3
358
359         vpaddd          %ymm5,%ymm4,%ymm4
360         vpxor           %ymm4,%ymm7,%ymm7
361         vpshufb         %ymm8,%ymm7,%ymm7
362
363         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
364         vpaddd          %ymm3,%ymm2,%ymm2
365         vpxor           %ymm2,%ymm1,%ymm1
366         vmovdqa         %ymm1,%ymm10
367         vpslld          $7,%ymm10,%ymm10
368         vpsrld          $25,%ymm1,%ymm1
369         vpor            %ymm10,%ymm1,%ymm1
370
371         vpaddd          %ymm7,%ymm6,%ymm6
372         vpxor           %ymm6,%ymm5,%ymm5
373         vmovdqa         %ymm5,%ymm10
374         vpslld          $7,%ymm10,%ymm10
375         vpsrld          $25,%ymm5,%ymm5
376         vpor            %ymm10,%ymm5,%ymm5
377
378         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
379         vpshufd         $0x93,%ymm1,%ymm1
380         vpshufd         $0x93,%ymm5,%ymm5
381         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
382         vpshufd         $0x4e,%ymm2,%ymm2
383         vpshufd         $0x4e,%ymm6,%ymm6
384         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
385         vpshufd         $0x39,%ymm3,%ymm3
386         vpshufd         $0x39,%ymm7,%ymm7
387
388         sub             $2,%r8d
389         jnz             .Ldoubleround4
390
391         # o0 = i0 ^ (x0 + s0), first block
392         vpaddd          %ymm11,%ymm0,%ymm10
393         cmp             $0x10,%rax
394         jl              .Lxorpart4
395         vpxor           0x00(%rdx),%xmm10,%xmm9
396         vmovdqu         %xmm9,0x00(%rsi)
397         vextracti128    $1,%ymm10,%xmm0
398         # o1 = i1 ^ (x1 + s1), first block
399         vpaddd          %ymm12,%ymm1,%ymm10
400         cmp             $0x20,%rax
401         jl              .Lxorpart4
402         vpxor           0x10(%rdx),%xmm10,%xmm9
403         vmovdqu         %xmm9,0x10(%rsi)
404         vextracti128    $1,%ymm10,%xmm1
405         # o2 = i2 ^ (x2 + s2), first block
406         vpaddd          %ymm13,%ymm2,%ymm10
407         cmp             $0x30,%rax
408         jl              .Lxorpart4
409         vpxor           0x20(%rdx),%xmm10,%xmm9
410         vmovdqu         %xmm9,0x20(%rsi)
411         vextracti128    $1,%ymm10,%xmm2
412         # o3 = i3 ^ (x3 + s3), first block
413         vpaddd          %ymm14,%ymm3,%ymm10
414         cmp             $0x40,%rax
415         jl              .Lxorpart4
416         vpxor           0x30(%rdx),%xmm10,%xmm9
417         vmovdqu         %xmm9,0x30(%rsi)
418         vextracti128    $1,%ymm10,%xmm3
419
420         # xor and write second block
421         vmovdqa         %xmm0,%xmm10
422         cmp             $0x50,%rax
423         jl              .Lxorpart4
424         vpxor           0x40(%rdx),%xmm10,%xmm9
425         vmovdqu         %xmm9,0x40(%rsi)
426
427         vmovdqa         %xmm1,%xmm10
428         cmp             $0x60,%rax
429         jl              .Lxorpart4
430         vpxor           0x50(%rdx),%xmm10,%xmm9
431         vmovdqu         %xmm9,0x50(%rsi)
432
433         vmovdqa         %xmm2,%xmm10
434         cmp             $0x70,%rax
435         jl              .Lxorpart4
436         vpxor           0x60(%rdx),%xmm10,%xmm9
437         vmovdqu         %xmm9,0x60(%rsi)
438
439         vmovdqa         %xmm3,%xmm10
440         cmp             $0x80,%rax
441         jl              .Lxorpart4
442         vpxor           0x70(%rdx),%xmm10,%xmm9
443         vmovdqu         %xmm9,0x70(%rsi)
444
445         # o0 = i0 ^ (x0 + s0), third block
446         vpaddd          %ymm11,%ymm4,%ymm10
447         cmp             $0x90,%rax
448         jl              .Lxorpart4
449         vpxor           0x80(%rdx),%xmm10,%xmm9
450         vmovdqu         %xmm9,0x80(%rsi)
451         vextracti128    $1,%ymm10,%xmm4
452         # o1 = i1 ^ (x1 + s1), third block
453         vpaddd          %ymm12,%ymm5,%ymm10
454         cmp             $0xa0,%rax
455         jl              .Lxorpart4
456         vpxor           0x90(%rdx),%xmm10,%xmm9
457         vmovdqu         %xmm9,0x90(%rsi)
458         vextracti128    $1,%ymm10,%xmm5
459         # o2 = i2 ^ (x2 + s2), third block
460         vpaddd          %ymm13,%ymm6,%ymm10
461         cmp             $0xb0,%rax
462         jl              .Lxorpart4
463         vpxor           0xa0(%rdx),%xmm10,%xmm9
464         vmovdqu         %xmm9,0xa0(%rsi)
465         vextracti128    $1,%ymm10,%xmm6
466         # o3 = i3 ^ (x3 + s3), third block
467         vpaddd          %ymm15,%ymm7,%ymm10
468         cmp             $0xc0,%rax
469         jl              .Lxorpart4
470         vpxor           0xb0(%rdx),%xmm10,%xmm9
471         vmovdqu         %xmm9,0xb0(%rsi)
472         vextracti128    $1,%ymm10,%xmm7
473
474         # xor and write fourth block
475         vmovdqa         %xmm4,%xmm10
476         cmp             $0xd0,%rax
477         jl              .Lxorpart4
478         vpxor           0xc0(%rdx),%xmm10,%xmm9
479         vmovdqu         %xmm9,0xc0(%rsi)
480
481         vmovdqa         %xmm5,%xmm10
482         cmp             $0xe0,%rax
483         jl              .Lxorpart4
484         vpxor           0xd0(%rdx),%xmm10,%xmm9
485         vmovdqu         %xmm9,0xd0(%rsi)
486
487         vmovdqa         %xmm6,%xmm10
488         cmp             $0xf0,%rax
489         jl              .Lxorpart4
490         vpxor           0xe0(%rdx),%xmm10,%xmm9
491         vmovdqu         %xmm9,0xe0(%rsi)
492
493         vmovdqa         %xmm7,%xmm10
494         cmp             $0x100,%rax
495         jl              .Lxorpart4
496         vpxor           0xf0(%rdx),%xmm10,%xmm9
497         vmovdqu         %xmm9,0xf0(%rsi)
498
499 .Ldone4:
500         vzeroupper
501         ret
502
503 .Lxorpart4:
504         # xor remaining bytes from partial register into output
505         mov             %rax,%r9
506         and             $0x0f,%r9
507         jz              .Ldone4
508         and             $~0x0f,%rax
509
510         mov             %rsi,%r11
511
512         lea             8(%rsp),%r10
513         sub             $0x10,%rsp
514         and             $~31,%rsp
515
516         lea             (%rdx,%rax),%rsi
517         mov             %rsp,%rdi
518         mov             %r9,%rcx
519         rep movsb
520
521         vpxor           0x00(%rsp),%xmm10,%xmm10
522         vmovdqa         %xmm10,0x00(%rsp)
523
524         mov             %rsp,%rsi
525         lea             (%r11,%rax),%rdi
526         mov             %r9,%rcx
527         rep movsb
528
529         lea             -8(%r10),%rsp
530         jmp             .Ldone4
531
532 ENDPROC(chacha_4block_xor_avx2)
533
534 ENTRY(chacha_8block_xor_avx2)
535         # %rdi: Input state matrix, s
536         # %rsi: up to 8 data blocks output, o
537         # %rdx: up to 8 data blocks input, i
538         # %rcx: input/output length in bytes
539         # %r8d: nrounds
540
541         # This function encrypts eight consecutive ChaCha blocks by loading
542         # the state matrix in AVX registers eight times. As we need some
543         # scratch registers, we save the first four registers on the stack. The
544         # algorithm performs each operation on the corresponding word of each
545         # state matrix, hence requires no word shuffling. For the final XORing
546         # step we transpose the matrix by interleaving 32-, 64- and then 128-bit
547         # words, which allows us to do the XOR in AVX registers. 8/16-bit word
548         # rotation is done with the slightly better performing byte shuffling,
549         # while 7/12-bit word rotation uses the traditional shift+OR.
550
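With this layout, vector register j holds word j of the state for all eight blocks, one block per 32-bit lane, so every scalar operation of the reference implementation maps to exactly one vector instruction and no in-round shuffling is needed; only at the end must the resulting 16x8 word matrix be transposed (the vpunpck*dq/vpunpck*qdq/vperm2i128 sequence further down) so that each block's 64 bytes become contiguous. A rough C model of that data layout, purely for illustration:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/*
 * x[j][lane] models lane 'lane' of vector register j: word j of block
 * 'lane'. One vectorized quarter-round touches the same four words in
 * all eight blocks, just like the ymm code above.
 */
static void qr_lanes(uint32_t x[16][8], int a, int b, int c, int d)
{
	for (int lane = 0; lane < 8; lane++) {
		x[a][lane] += x[b][lane];
		x[d][lane] ^= x[a][lane];
		x[d][lane] = ROTL32(x[d][lane], 16);

		x[c][lane] += x[d][lane];
		x[b][lane] ^= x[c][lane];
		x[b][lane] = ROTL32(x[b][lane], 12);

		x[a][lane] += x[b][lane];
		x[d][lane] ^= x[a][lane];
		x[d][lane] = ROTL32(x[d][lane], 8);

		x[c][lane] += x[d][lane];
		x[b][lane] ^= x[c][lane];
		x[b][lane] = ROTL32(x[b][lane], 7);
	}
}

/* Gathering block 'b' out of the word-sliced layout is what the
 * unpack/permute transpose at the end of this function achieves
 * entirely in registers. */
static void extract_block(uint32_t out[16], const uint32_t x[16][8], int b)
{
	for (int j = 0; j < 16; j++)
		out[j] = x[j][b];
}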
551         vzeroupper
552         # 4 * 32 byte stack, 32-byte aligned
553         lea             8(%rsp),%r10
554         and             $~31, %rsp
555         sub             $0x80, %rsp
556         mov             %rcx,%rax
557
558         # x0..15[0-7] = s[0..15]
559         vpbroadcastd    0x00(%rdi),%ymm0
560         vpbroadcastd    0x04(%rdi),%ymm1
561         vpbroadcastd    0x08(%rdi),%ymm2
562         vpbroadcastd    0x0c(%rdi),%ymm3
563         vpbroadcastd    0x10(%rdi),%ymm4
564         vpbroadcastd    0x14(%rdi),%ymm5
565         vpbroadcastd    0x18(%rdi),%ymm6
566         vpbroadcastd    0x1c(%rdi),%ymm7
567         vpbroadcastd    0x20(%rdi),%ymm8
568         vpbroadcastd    0x24(%rdi),%ymm9
569         vpbroadcastd    0x28(%rdi),%ymm10
570         vpbroadcastd    0x2c(%rdi),%ymm11
571         vpbroadcastd    0x30(%rdi),%ymm12
572         vpbroadcastd    0x34(%rdi),%ymm13
573         vpbroadcastd    0x38(%rdi),%ymm14
574         vpbroadcastd    0x3c(%rdi),%ymm15
575         # x0..3 on stack
576         vmovdqa         %ymm0,0x00(%rsp)
577         vmovdqa         %ymm1,0x20(%rsp)
578         vmovdqa         %ymm2,0x40(%rsp)
579         vmovdqa         %ymm3,0x60(%rsp)
580
581         vmovdqa         CTRINC(%rip),%ymm1
582         vmovdqa         ROT8(%rip),%ymm2
583         vmovdqa         ROT16(%rip),%ymm3
584
585         # x12 += counter values 0-7
586         vpaddd          %ymm1,%ymm12,%ymm12
587
588 .Ldoubleround8:
589         # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
590         vpaddd          0x00(%rsp),%ymm4,%ymm0
591         vmovdqa         %ymm0,0x00(%rsp)
592         vpxor           %ymm0,%ymm12,%ymm12
593         vpshufb         %ymm3,%ymm12,%ymm12
594         # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
595         vpaddd          0x20(%rsp),%ymm5,%ymm0
596         vmovdqa         %ymm0,0x20(%rsp)
597         vpxor           %ymm0,%ymm13,%ymm13
598         vpshufb         %ymm3,%ymm13,%ymm13
599         # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
600         vpaddd          0x40(%rsp),%ymm6,%ymm0
601         vmovdqa         %ymm0,0x40(%rsp)
602         vpxor           %ymm0,%ymm14,%ymm14
603         vpshufb         %ymm3,%ymm14,%ymm14
604         # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
605         vpaddd          0x60(%rsp),%ymm7,%ymm0
606         vmovdqa         %ymm0,0x60(%rsp)
607         vpxor           %ymm0,%ymm15,%ymm15
608         vpshufb         %ymm3,%ymm15,%ymm15
609
610         # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
611         vpaddd          %ymm12,%ymm8,%ymm8
612         vpxor           %ymm8,%ymm4,%ymm4
613         vpslld          $12,%ymm4,%ymm0
614         vpsrld          $20,%ymm4,%ymm4
615         vpor            %ymm0,%ymm4,%ymm4
616         # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
617         vpaddd          %ymm13,%ymm9,%ymm9
618         vpxor           %ymm9,%ymm5,%ymm5
619         vpslld          $12,%ymm5,%ymm0
620         vpsrld          $20,%ymm5,%ymm5
621         vpor            %ymm0,%ymm5,%ymm5
622         # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
623         vpaddd          %ymm14,%ymm10,%ymm10
624         vpxor           %ymm10,%ymm6,%ymm6
625         vpslld          $12,%ymm6,%ymm0
626         vpsrld          $20,%ymm6,%ymm6
627         vpor            %ymm0,%ymm6,%ymm6
628         # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
629         vpaddd          %ymm15,%ymm11,%ymm11
630         vpxor           %ymm11,%ymm7,%ymm7
631         vpslld          $12,%ymm7,%ymm0
632         vpsrld          $20,%ymm7,%ymm7
633         vpor            %ymm0,%ymm7,%ymm7
634
635         # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
636         vpaddd          0x00(%rsp),%ymm4,%ymm0
637         vmovdqa         %ymm0,0x00(%rsp)
638         vpxor           %ymm0,%ymm12,%ymm12
639         vpshufb         %ymm2,%ymm12,%ymm12
640         # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
641         vpaddd          0x20(%rsp),%ymm5,%ymm0
642         vmovdqa         %ymm0,0x20(%rsp)
643         vpxor           %ymm0,%ymm13,%ymm13
644         vpshufb         %ymm2,%ymm13,%ymm13
645         # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
646         vpaddd          0x40(%rsp),%ymm6,%ymm0
647         vmovdqa         %ymm0,0x40(%rsp)
648         vpxor           %ymm0,%ymm14,%ymm14
649         vpshufb         %ymm2,%ymm14,%ymm14
650         # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
651         vpaddd          0x60(%rsp),%ymm7,%ymm0
652         vmovdqa         %ymm0,0x60(%rsp)
653         vpxor           %ymm0,%ymm15,%ymm15
654         vpshufb         %ymm2,%ymm15,%ymm15
655
656         # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
657         vpaddd          %ymm12,%ymm8,%ymm8
658         vpxor           %ymm8,%ymm4,%ymm4
659         vpslld          $7,%ymm4,%ymm0
660         vpsrld          $25,%ymm4,%ymm4
661         vpor            %ymm0,%ymm4,%ymm4
662         # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
663         vpaddd          %ymm13,%ymm9,%ymm9
664         vpxor           %ymm9,%ymm5,%ymm5
665         vpslld          $7,%ymm5,%ymm0
666         vpsrld          $25,%ymm5,%ymm5
667         vpor            %ymm0,%ymm5,%ymm5
668         # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
669         vpaddd          %ymm14,%ymm10,%ymm10
670         vpxor           %ymm10,%ymm6,%ymm6
671         vpslld          $7,%ymm6,%ymm0
672         vpsrld          $25,%ymm6,%ymm6
673         vpor            %ymm0,%ymm6,%ymm6
674         # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
675         vpaddd          %ymm15,%ymm11,%ymm11
676         vpxor           %ymm11,%ymm7,%ymm7
677         vpslld          $7,%ymm7,%ymm0
678         vpsrld          $25,%ymm7,%ymm7
679         vpor            %ymm0,%ymm7,%ymm7
680
681         # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
682         vpaddd          0x00(%rsp),%ymm5,%ymm0
683         vmovdqa         %ymm0,0x00(%rsp)
684         vpxor           %ymm0,%ymm15,%ymm15
685         vpshufb         %ymm3,%ymm15,%ymm15
686         # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
687         vpaddd          0x20(%rsp),%ymm6,%ymm0
688         vmovdqa         %ymm0,0x20(%rsp)
689         vpxor           %ymm0,%ymm12,%ymm12
690         vpshufb         %ymm3,%ymm12,%ymm12
691         # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
692         vpaddd          0x40(%rsp),%ymm7,%ymm0
693         vmovdqa         %ymm0,0x40(%rsp)
694         vpxor           %ymm0,%ymm13,%ymm13
695         vpshufb         %ymm3,%ymm13,%ymm13
696         # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
697         vpaddd          0x60(%rsp),%ymm4,%ymm0
698         vmovdqa         %ymm0,0x60(%rsp)
699         vpxor           %ymm0,%ymm14,%ymm14
700         vpshufb         %ymm3,%ymm14,%ymm14
701
702         # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
703         vpaddd          %ymm15,%ymm10,%ymm10
704         vpxor           %ymm10,%ymm5,%ymm5
705         vpslld          $12,%ymm5,%ymm0
706         vpsrld          $20,%ymm5,%ymm5
707         vpor            %ymm0,%ymm5,%ymm5
708         # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
709         vpaddd          %ymm12,%ymm11,%ymm11
710         vpxor           %ymm11,%ymm6,%ymm6
711         vpslld          $12,%ymm6,%ymm0
712         vpsrld          $20,%ymm6,%ymm6
713         vpor            %ymm0,%ymm6,%ymm6
714         # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
715         vpaddd          %ymm13,%ymm8,%ymm8
716         vpxor           %ymm8,%ymm7,%ymm7
717         vpslld          $12,%ymm7,%ymm0
718         vpsrld          $20,%ymm7,%ymm7
719         vpor            %ymm0,%ymm7,%ymm7
720         # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
721         vpaddd          %ymm14,%ymm9,%ymm9
722         vpxor           %ymm9,%ymm4,%ymm4
723         vpslld          $12,%ymm4,%ymm0
724         vpsrld          $20,%ymm4,%ymm4
725         vpor            %ymm0,%ymm4,%ymm4
726
727         # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
728         vpaddd          0x00(%rsp),%ymm5,%ymm0
729         vmovdqa         %ymm0,0x00(%rsp)
730         vpxor           %ymm0,%ymm15,%ymm15
731         vpshufb         %ymm2,%ymm15,%ymm15
732         # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
733         vpaddd          0x20(%rsp),%ymm6,%ymm0
734         vmovdqa         %ymm0,0x20(%rsp)
735         vpxor           %ymm0,%ymm12,%ymm12
736         vpshufb         %ymm2,%ymm12,%ymm12
737         # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
738         vpaddd          0x40(%rsp),%ymm7,%ymm0
739         vmovdqa         %ymm0,0x40(%rsp)
740         vpxor           %ymm0,%ymm13,%ymm13
741         vpshufb         %ymm2,%ymm13,%ymm13
742         # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
743         vpaddd          0x60(%rsp),%ymm4,%ymm0
744         vmovdqa         %ymm0,0x60(%rsp)
745         vpxor           %ymm0,%ymm14,%ymm14
746         vpshufb         %ymm2,%ymm14,%ymm14
747
748         # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
749         vpaddd          %ymm15,%ymm10,%ymm10
750         vpxor           %ymm10,%ymm5,%ymm5
751         vpslld          $7,%ymm5,%ymm0
752         vpsrld          $25,%ymm5,%ymm5
753         vpor            %ymm0,%ymm5,%ymm5
754         # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
755         vpaddd          %ymm12,%ymm11,%ymm11
756         vpxor           %ymm11,%ymm6,%ymm6
757         vpslld          $7,%ymm6,%ymm0
758         vpsrld          $25,%ymm6,%ymm6
759         vpor            %ymm0,%ymm6,%ymm6
760         # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
761         vpaddd          %ymm13,%ymm8,%ymm8
762         vpxor           %ymm8,%ymm7,%ymm7
763         vpslld          $7,%ymm7,%ymm0
764         vpsrld          $25,%ymm7,%ymm7
765         vpor            %ymm0,%ymm7,%ymm7
766         # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
767         vpaddd          %ymm14,%ymm9,%ymm9
768         vpxor           %ymm9,%ymm4,%ymm4
769         vpslld          $7,%ymm4,%ymm0
770         vpsrld          $25,%ymm4,%ymm4
771         vpor            %ymm0,%ymm4,%ymm4
772
773         sub             $2,%r8d
774         jnz             .Ldoubleround8
775
776         # x0..15[0-7] += s[0..15]
777         vpbroadcastd    0x00(%rdi),%ymm0
778         vpaddd          0x00(%rsp),%ymm0,%ymm0
779         vmovdqa         %ymm0,0x00(%rsp)
780         vpbroadcastd    0x04(%rdi),%ymm0
781         vpaddd          0x20(%rsp),%ymm0,%ymm0
782         vmovdqa         %ymm0,0x20(%rsp)
783         vpbroadcastd    0x08(%rdi),%ymm0
784         vpaddd          0x40(%rsp),%ymm0,%ymm0
785         vmovdqa         %ymm0,0x40(%rsp)
786         vpbroadcastd    0x0c(%rdi),%ymm0
787         vpaddd          0x60(%rsp),%ymm0,%ymm0
788         vmovdqa         %ymm0,0x60(%rsp)
789         vpbroadcastd    0x10(%rdi),%ymm0
790         vpaddd          %ymm0,%ymm4,%ymm4
791         vpbroadcastd    0x14(%rdi),%ymm0
792         vpaddd          %ymm0,%ymm5,%ymm5
793         vpbroadcastd    0x18(%rdi),%ymm0
794         vpaddd          %ymm0,%ymm6,%ymm6
795         vpbroadcastd    0x1c(%rdi),%ymm0
796         vpaddd          %ymm0,%ymm7,%ymm7
797         vpbroadcastd    0x20(%rdi),%ymm0
798         vpaddd          %ymm0,%ymm8,%ymm8
799         vpbroadcastd    0x24(%rdi),%ymm0
800         vpaddd          %ymm0,%ymm9,%ymm9
801         vpbroadcastd    0x28(%rdi),%ymm0
802         vpaddd          %ymm0,%ymm10,%ymm10
803         vpbroadcastd    0x2c(%rdi),%ymm0
804         vpaddd          %ymm0,%ymm11,%ymm11
805         vpbroadcastd    0x30(%rdi),%ymm0
806         vpaddd          %ymm0,%ymm12,%ymm12
807         vpbroadcastd    0x34(%rdi),%ymm0
808         vpaddd          %ymm0,%ymm13,%ymm13
809         vpbroadcastd    0x38(%rdi),%ymm0
810         vpaddd          %ymm0,%ymm14,%ymm14
811         vpbroadcastd    0x3c(%rdi),%ymm0
812         vpaddd          %ymm0,%ymm15,%ymm15
813
814         # x12 += counter values 0-7
815         vpaddd          %ymm1,%ymm12,%ymm12
816
817         # interleave 32-bit words in state n, n+1
818         vmovdqa         0x00(%rsp),%ymm0
819         vmovdqa         0x20(%rsp),%ymm1
820         vpunpckldq      %ymm1,%ymm0,%ymm2
821         vpunpckhdq      %ymm1,%ymm0,%ymm1
822         vmovdqa         %ymm2,0x00(%rsp)
823         vmovdqa         %ymm1,0x20(%rsp)
824         vmovdqa         0x40(%rsp),%ymm0
825         vmovdqa         0x60(%rsp),%ymm1
826         vpunpckldq      %ymm1,%ymm0,%ymm2
827         vpunpckhdq      %ymm1,%ymm0,%ymm1
828         vmovdqa         %ymm2,0x40(%rsp)
829         vmovdqa         %ymm1,0x60(%rsp)
830         vmovdqa         %ymm4,%ymm0
831         vpunpckldq      %ymm5,%ymm0,%ymm4
832         vpunpckhdq      %ymm5,%ymm0,%ymm5
833         vmovdqa         %ymm6,%ymm0
834         vpunpckldq      %ymm7,%ymm0,%ymm6
835         vpunpckhdq      %ymm7,%ymm0,%ymm7
836         vmovdqa         %ymm8,%ymm0
837         vpunpckldq      %ymm9,%ymm0,%ymm8
838         vpunpckhdq      %ymm9,%ymm0,%ymm9
839         vmovdqa         %ymm10,%ymm0
840         vpunpckldq      %ymm11,%ymm0,%ymm10
841         vpunpckhdq      %ymm11,%ymm0,%ymm11
842         vmovdqa         %ymm12,%ymm0
843         vpunpckldq      %ymm13,%ymm0,%ymm12
844         vpunpckhdq      %ymm13,%ymm0,%ymm13
845         vmovdqa         %ymm14,%ymm0
846         vpunpckldq      %ymm15,%ymm0,%ymm14
847         vpunpckhdq      %ymm15,%ymm0,%ymm15
848
849         # interleave 64-bit words in state n, n+2
850         vmovdqa         0x00(%rsp),%ymm0
851         vmovdqa         0x40(%rsp),%ymm2
852         vpunpcklqdq     %ymm2,%ymm0,%ymm1
853         vpunpckhqdq     %ymm2,%ymm0,%ymm2
854         vmovdqa         %ymm1,0x00(%rsp)
855         vmovdqa         %ymm2,0x40(%rsp)
856         vmovdqa         0x20(%rsp),%ymm0
857         vmovdqa         0x60(%rsp),%ymm2
858         vpunpcklqdq     %ymm2,%ymm0,%ymm1
859         vpunpckhqdq     %ymm2,%ymm0,%ymm2
860         vmovdqa         %ymm1,0x20(%rsp)
861         vmovdqa         %ymm2,0x60(%rsp)
862         vmovdqa         %ymm4,%ymm0
863         vpunpcklqdq     %ymm6,%ymm0,%ymm4
864         vpunpckhqdq     %ymm6,%ymm0,%ymm6
865         vmovdqa         %ymm5,%ymm0
866         vpunpcklqdq     %ymm7,%ymm0,%ymm5
867         vpunpckhqdq     %ymm7,%ymm0,%ymm7
868         vmovdqa         %ymm8,%ymm0
869         vpunpcklqdq     %ymm10,%ymm0,%ymm8
870         vpunpckhqdq     %ymm10,%ymm0,%ymm10
871         vmovdqa         %ymm9,%ymm0
872         vpunpcklqdq     %ymm11,%ymm0,%ymm9
873         vpunpckhqdq     %ymm11,%ymm0,%ymm11
874         vmovdqa         %ymm12,%ymm0
875         vpunpcklqdq     %ymm14,%ymm0,%ymm12
876         vpunpckhqdq     %ymm14,%ymm0,%ymm14
877         vmovdqa         %ymm13,%ymm0
878         vpunpcklqdq     %ymm15,%ymm0,%ymm13
879         vpunpckhqdq     %ymm15,%ymm0,%ymm15
880
881         # interleave 128-bit words in state n, n+4
882         # xor/write first four blocks
883         vmovdqa         0x00(%rsp),%ymm1
884         vperm2i128      $0x20,%ymm4,%ymm1,%ymm0
885         cmp             $0x0020,%rax
886         jl              .Lxorpart8
887         vpxor           0x0000(%rdx),%ymm0,%ymm0
888         vmovdqu         %ymm0,0x0000(%rsi)
889         vperm2i128      $0x31,%ymm4,%ymm1,%ymm4
890
891         vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
892         cmp             $0x0040,%rax
893         jl              .Lxorpart8
894         vpxor           0x0020(%rdx),%ymm0,%ymm0
895         vmovdqu         %ymm0,0x0020(%rsi)
896         vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
897
898         vmovdqa         0x40(%rsp),%ymm1
899         vperm2i128      $0x20,%ymm6,%ymm1,%ymm0
900         cmp             $0x0060,%rax
901         jl              .Lxorpart8
902         vpxor           0x0040(%rdx),%ymm0,%ymm0
903         vmovdqu         %ymm0,0x0040(%rsi)
904         vperm2i128      $0x31,%ymm6,%ymm1,%ymm6
905
906         vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
907         cmp             $0x0080,%rax
908         jl              .Lxorpart8
909         vpxor           0x0060(%rdx),%ymm0,%ymm0
910         vmovdqu         %ymm0,0x0060(%rsi)
911         vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
912
913         vmovdqa         0x20(%rsp),%ymm1
914         vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
915         cmp             $0x00a0,%rax
916         jl              .Lxorpart8
917         vpxor           0x0080(%rdx),%ymm0,%ymm0
918         vmovdqu         %ymm0,0x0080(%rsi)
919         vperm2i128      $0x31,%ymm5,%ymm1,%ymm5
920
921         vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
922         cmp             $0x00c0,%rax
923         jl              .Lxorpart8
924         vpxor           0x00a0(%rdx),%ymm0,%ymm0
925         vmovdqu         %ymm0,0x00a0(%rsi)
926         vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
927
928         vmovdqa         0x60(%rsp),%ymm1
929         vperm2i128      $0x20,%ymm7,%ymm1,%ymm0
930         cmp             $0x00e0,%rax
931         jl              .Lxorpart8
932         vpxor           0x00c0(%rdx),%ymm0,%ymm0
933         vmovdqu         %ymm0,0x00c0(%rsi)
934         vperm2i128      $0x31,%ymm7,%ymm1,%ymm7
935
936         vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
937         cmp             $0x0100,%rax
938         jl              .Lxorpart8
939         vpxor           0x00e0(%rdx),%ymm0,%ymm0
940         vmovdqu         %ymm0,0x00e0(%rsi)
941         vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
942
943         # xor remaining blocks, write to output
944         vmovdqa         %ymm4,%ymm0
945         cmp             $0x0120,%rax
946         jl              .Lxorpart8
947         vpxor           0x0100(%rdx),%ymm0,%ymm0
948         vmovdqu         %ymm0,0x0100(%rsi)
949
950         vmovdqa         %ymm12,%ymm0
951         cmp             $0x0140,%rax
952         jl              .Lxorpart8
953         vpxor           0x0120(%rdx),%ymm0,%ymm0
954         vmovdqu         %ymm0,0x0120(%rsi)
955
956         vmovdqa         %ymm6,%ymm0
957         cmp             $0x0160,%rax
958         jl              .Lxorpart8
959         vpxor           0x0140(%rdx),%ymm0,%ymm0
960         vmovdqu         %ymm0,0x0140(%rsi)
961
962         vmovdqa         %ymm14,%ymm0
963         cmp             $0x0180,%rax
964         jl              .Lxorpart8
965         vpxor           0x0160(%rdx),%ymm0,%ymm0
966         vmovdqu         %ymm0,0x0160(%rsi)
967
968         vmovdqa         %ymm5,%ymm0
969         cmp             $0x01a0,%rax
970         jl              .Lxorpart8
971         vpxor           0x0180(%rdx),%ymm0,%ymm0
972         vmovdqu         %ymm0,0x0180(%rsi)
973
974         vmovdqa         %ymm13,%ymm0
975         cmp             $0x01c0,%rax
976         jl              .Lxorpart8
977         vpxor           0x01a0(%rdx),%ymm0,%ymm0
978         vmovdqu         %ymm0,0x01a0(%rsi)
979
980         vmovdqa         %ymm7,%ymm0
981         cmp             $0x01e0,%rax
982         jl              .Lxorpart8
983         vpxor           0x01c0(%rdx),%ymm0,%ymm0
984         vmovdqu         %ymm0,0x01c0(%rsi)
985
986         vmovdqa         %ymm15,%ymm0
987         cmp             $0x0200,%rax
988         jl              .Lxorpart8
989         vpxor           0x01e0(%rdx),%ymm0,%ymm0
990         vmovdqu         %ymm0,0x01e0(%rsi)
991
992 .Ldone8:
993         vzeroupper
994         lea             -8(%r10),%rsp
995         ret
996
997 .Lxorpart8:
998         # xor remaining bytes from partial register into output
999         mov             %rax,%r9
1000         and             $0x1f,%r9
1001         jz              .Ldone8
1002         and             $~0x1f,%rax
1003
1004         mov             %rsi,%r11
1005
1006         lea             (%rdx,%rax),%rsi
1007         mov             %rsp,%rdi
1008         mov             %r9,%rcx
1009         rep movsb
1010
1011         vpxor           0x00(%rsp),%ymm0,%ymm0
1012         vmovdqa         %ymm0,0x00(%rsp)
1013
1014         mov             %rsp,%rsi
1015         lea             (%r11,%rax),%rdi
1016         mov             %r9,%rcx
1017         rep movsb
1018
1019         jmp             .Ldone8
1020
1021 ENDPROC(chacha_8block_xor_avx2)
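For completeness, callers are expected to declare these entry points with prototypes matching the register comments at the top of each function; the sketch below shows one plausible way the three variants fit together. Both the prototypes and the dispatch helper are illustrative assumptions here, not the kernel's actual glue code (which also manages the kernel-FPU/SIMD sections):

#include <linux/kernel.h>	/* DIV_ROUND_UP() */
#include <linux/linkage.h>	/* asmlinkage */
#include <linux/types.h>	/* u8, u32 */

/* Assumed prototypes: %rdi=state, %rsi=dst, %rdx=src, %rcx=len, %r8d=nrounds. */
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
				       unsigned int len, int nrounds);

/*
 * Hypothetical dispatch: use the widest routine while full 8-block batches
 * remain, then let one "up to N blocks" call absorb the tail. state[12] is
 * the 32-bit block counter; each block is 64 bytes.
 */
static void chacha_avx2_sketch(u32 *state, u8 *dst, const u8 *src,
			       unsigned int bytes, int nrounds)
{
	while (bytes >= 8 * 64) {
		chacha_8block_xor_avx2(state, dst, src, 8 * 64, nrounds);
		bytes -= 8 * 64;
		src += 8 * 64;
		dst += 8 * 64;
		state[12] += 8;
	}
	if (bytes > 4 * 64) {
		chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
		state[12] += DIV_ROUND_UP(bytes, 64);
	} else if (bytes > 2 * 64) {
		chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
		state[12] += DIV_ROUND_UP(bytes, 64);
	} else if (bytes) {
		chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
		state[12] += DIV_ROUND_UP(bytes, 64);
	}
}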