1 @ SPDX-License-Identifier: GPL-2.0
2
3 @ This code is taken from the OpenSSL project but the author (Andy Polyakov)
4 @ has relicensed it under the GPLv2. Therefore this program is free software;
5 @ you can redistribute it and/or modify it under the terms of the GNU General
6 @ Public License version 2 as published by the Free Software Foundation.
7 @
8 @ The original headers, including the original license headers, are
9 @ included below for completeness.
10
11 @ ====================================================================
12 @ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
13 @ project. The module is, however, dual licensed under OpenSSL and
14 @ CRYPTOGAMS licenses depending on where you obtain it. For further
15 @ details see http://www.openssl.org/~appro/cryptogams/.
16 @ ====================================================================
17
18 @ SHA512 block procedure for ARMv4. September 2007.
19
20 @ This code is ~4.5 (four and a half) times faster than code generated
21 @ by gcc 3.4 and spends ~72 clock cycles per byte [on a single-issue
22 @ Xscale PXA250 core].
23 @
24 @ July 2010.
25 @
26 @ Rescheduling for the dual-issue pipeline resulted in a 6% improvement
27 @ on the Cortex-A8 core and ~40 cycles per processed byte.
28
29 @ February 2011.
30 @
31 @ Profiler-assisted and platform-specific optimization resulted in a 7%
32 @ improvement on the Cortex-A8 core and ~38 cycles per byte.
33
34 @ March 2011.
35 @
36 @ Add NEON implementation. On Cortex-A8 it was measured to process
37 @ one byte in 23.3 cycles, or ~60% faster than the integer-only code.
38
39 @ August 2012.
40 @
41 @ Improve NEON performance by 12% on Snapdragon S4. In absolute
42 @ terms it's 22.6 cycles per byte, which is a disappointing result.
43 @ Technical writers asserted that the 3-way S4 pipeline can sustain
44 @ multiple NEON instructions per cycle, but dual NEON issue could not
45 @ be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
46 @ for further details. As a side note, the Cortex-A15 processes one
47 @ byte in 16 cycles.
48
49 @ Byte order [in]dependence. =========================================
50 @
51 @ Originally the caller was expected to maintain a specific *dword* order
52 @ in h[0-7], namely with the most significant dword at the *lower* address,
53 @ which was reflected in the two parameters below as 0 and 4. Now the
54 @ caller is expected to maintain native byte order for whole 64-bit values.
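@
@ Concretely, with the LO/HI offsets defined below, the two 32-bit halves
@ of each 64-bit state word h[i] are addressed as [r0,#8*i+LO] and
@ [r0,#8*i+HI], i.e. in the platform's native little- or big-endian
@ layout rather than in a fixed hi/lo order.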
55 #ifndef __KERNEL__
56 # include "arm_arch.h"
57 # define VFP_ABI_PUSH   vstmdb  sp!,{d8-d15}
58 # define VFP_ABI_POP    vldmia  sp!,{d8-d15}
59 #else
60 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
61 # define __ARM_MAX_ARCH__ 7
62 # define VFP_ABI_PUSH
63 # define VFP_ABI_POP
64 #endif
65
66 #ifdef __ARMEL__
67 # define LO 0
68 # define HI 4
69 # define WORD64(hi0,lo0,hi1,lo1)        .word   lo0,hi0, lo1,hi1
70 #else
71 # define HI 0
72 # define LO 4
73 # define WORD64(hi0,lo0,hi1,lo1)        .word   hi0,lo0, hi1,lo1
74 #endif
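@ WORD64() emits each 64-bit K512 constant as two 32-bit .word values in
@ native endianness, so the table entries can be fetched with the same
@ LO/HI offsets as the state and message words (ldr ...,[r14,#LO] below).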
75
76 .text
77 #if __ARM_ARCH__<7
78 .code   32
79 #else
80 .syntax unified
81 # ifdef __thumb2__
82 #  define adrl adr
83 .thumb
84 # else
85 .code   32
86 # endif
87 #endif
88
89 .type   K512,%object
90 .align  5
91 K512:
92 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
93 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
94 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
95 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
96 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
97 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
98 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
99 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
100 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
101 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
102 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
103 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
104 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
105 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
106 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
107 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
108 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
109 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
110 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
111 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
112 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
113 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
114 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
115 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
116 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
117 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
118 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
119 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
120 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
121 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
122 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
123 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
124 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
125 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
126 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
127 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
128 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
129 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
130 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
131 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
132 .size   K512,.-K512
133 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
134 .LOPENSSL_armcap:
135 .word   OPENSSL_armcap_P-sha512_block_data_order
136 .skip   32-4
137 #else
138 .skip   32
139 #endif
140
141 .global sha512_block_data_order
142 .type   sha512_block_data_order,%function
143 sha512_block_data_order:
144 .Lsha512_block_data_order:
145 #if __ARM_ARCH__<7
146         sub     r3,pc,#8                @ sha512_block_data_order
147 #else
148         adr     r3,.Lsha512_block_data_order
149 #endif
150 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
151         ldr     r12,.LOPENSSL_armcap
152         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
153         tst     r12,#1
154         bne     .LNEON
155 #endif
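@ Non-kernel builds select the implementation at run time: if bit 0 of
@ OPENSSL_armcap_P (presumably the NEON capability flag) is set, control
@ branches to the NEON code at .LNEON, otherwise the integer code below
@ is used.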
156         add     r2,r1,r2,lsl#7  @ len to point at the end of inp
157         stmdb   sp!,{r4-r12,lr}
158         sub     r14,r3,#672             @ K512
159         sub     sp,sp,#9*8
160
161         ldr     r7,[r0,#32+LO]
162         ldr     r8,[r0,#32+HI]
163         ldr     r9, [r0,#48+LO]
164         ldr     r10, [r0,#48+HI]
165         ldr     r11, [r0,#56+LO]
166         ldr     r12, [r0,#56+HI]
167 .Loop:
168         str     r9, [sp,#48+0]
169         str     r10, [sp,#48+4]
170         str     r11, [sp,#56+0]
171         str     r12, [sp,#56+4]
172         ldr     r5,[r0,#0+LO]
173         ldr     r6,[r0,#0+HI]
174         ldr     r3,[r0,#8+LO]
175         ldr     r4,[r0,#8+HI]
176         ldr     r9, [r0,#16+LO]
177         ldr     r10, [r0,#16+HI]
178         ldr     r11, [r0,#24+LO]
179         ldr     r12, [r0,#24+HI]
180         str     r3,[sp,#8+0]
181         str     r4,[sp,#8+4]
182         str     r9, [sp,#16+0]
183         str     r10, [sp,#16+4]
184         str     r11, [sp,#24+0]
185         str     r12, [sp,#24+4]
186         ldr     r3,[r0,#40+LO]
187         ldr     r4,[r0,#40+HI]
188         str     r3,[sp,#40+0]
189         str     r4,[sp,#40+4]
190
191 .L00_15:
192 #if __ARM_ARCH__<7
193         ldrb    r3,[r1,#7]
194         ldrb    r9, [r1,#6]
195         ldrb    r10, [r1,#5]
196         ldrb    r11, [r1,#4]
197         ldrb    r4,[r1,#3]
198         ldrb    r12, [r1,#2]
199         orr     r3,r3,r9,lsl#8
200         ldrb    r9, [r1,#1]
201         orr     r3,r3,r10,lsl#16
202         ldrb    r10, [r1],#8
203         orr     r3,r3,r11,lsl#24
204         orr     r4,r4,r12,lsl#8
205         orr     r4,r4,r9,lsl#16
206         orr     r4,r4,r10,lsl#24
207 #else
208         ldr     r3,[r1,#4]
209         ldr     r4,[r1],#8
210 #ifdef __ARMEL__
211         rev     r3,r3
212         rev     r4,r4
213 #endif
214 #endif
215         @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
216         @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
217         @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
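        @ In effect each 64-bit ROTR is assembled from 32-bit shifts on the
        @ lo/hi halves; e.g. for ROTR(x,14) = (x>>14)|(x<<50):
        @   result.lo = (lo>>14) | (hi<<18)         @ 18 = 32-14
        @   result.hi = (hi>>14) | (lo<<18)
        @ and the three rotations are then XORed together below.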
218         mov     r9,r7,lsr#14
219         str     r3,[sp,#64+0]
220         mov     r10,r8,lsr#14
221         str     r4,[sp,#64+4]
222         eor     r9,r9,r8,lsl#18
223         ldr     r11,[sp,#56+0]  @ h.lo
224         eor     r10,r10,r7,lsl#18
225         ldr     r12,[sp,#56+4]  @ h.hi
226         eor     r9,r9,r7,lsr#18
227         eor     r10,r10,r8,lsr#18
228         eor     r9,r9,r8,lsl#14
229         eor     r10,r10,r7,lsl#14
230         eor     r9,r9,r8,lsr#9
231         eor     r10,r10,r7,lsr#9
232         eor     r9,r9,r7,lsl#23
233         eor     r10,r10,r8,lsl#23       @ Sigma1(e)
234         adds    r3,r3,r9
235         ldr     r9,[sp,#40+0]   @ f.lo
236         adc     r4,r4,r10               @ T += Sigma1(e)
237         ldr     r10,[sp,#40+4]  @ f.hi
238         adds    r3,r3,r11
239         ldr     r11,[sp,#48+0]  @ g.lo
240         adc     r4,r4,r12               @ T += h
241         ldr     r12,[sp,#48+4]  @ g.hi
242
243         eor     r9,r9,r11
244         str     r7,[sp,#32+0]
245         eor     r10,r10,r12
246         str     r8,[sp,#32+4]
247         and     r9,r9,r7
248         str     r5,[sp,#0+0]
249         and     r10,r10,r8
250         str     r6,[sp,#0+4]
251         eor     r9,r9,r11
252         ldr     r11,[r14,#LO]   @ K[i].lo
253         eor     r10,r10,r12             @ Ch(e,f,g)
254         ldr     r12,[r14,#HI]   @ K[i].hi
255
256         adds    r3,r3,r9
257         ldr     r7,[sp,#24+0]   @ d.lo
258         adc     r4,r4,r10               @ T += Ch(e,f,g)
259         ldr     r8,[sp,#24+4]   @ d.hi
260         adds    r3,r3,r11
261         and     r9,r11,#0xff
262         adc     r4,r4,r12               @ T += K[i]
263         adds    r7,r7,r3
264         ldr     r11,[sp,#8+0]   @ b.lo
265         adc     r8,r8,r4                @ d += T
266         teq     r9,#148
267
268         ldr     r12,[sp,#16+0]  @ c.lo
269 #if __ARM_ARCH__>=7
270         it      eq                      @ Thumb2 thing, sanity check in ARM
271 #endif
272         orreq   r14,r14,#1
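        @ Loop-control trick: r9 holds the low byte of K[i].lo, and 148 (0x94)
        @ matches K[15] (0xcf692694), so bit 0 of the K512 pointer in r14 is
        @ set exactly on the last of the first 16 rounds; the "tst r14,#1"
        @ below then falls through from .L00_15 into .L16_79. The analogous
        @ "teq r9,#23" there matches K[79] (0x4a475817) and ends the 80 rounds.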
273         @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
274         @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
275         @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
276         mov     r9,r5,lsr#28
277         mov     r10,r6,lsr#28
278         eor     r9,r9,r6,lsl#4
279         eor     r10,r10,r5,lsl#4
280         eor     r9,r9,r6,lsr#2
281         eor     r10,r10,r5,lsr#2
282         eor     r9,r9,r5,lsl#30
283         eor     r10,r10,r6,lsl#30
284         eor     r9,r9,r6,lsr#7
285         eor     r10,r10,r5,lsr#7
286         eor     r9,r9,r5,lsl#25
287         eor     r10,r10,r6,lsl#25       @ Sigma0(a)
288         adds    r3,r3,r9
289         and     r9,r5,r11
290         adc     r4,r4,r10               @ T += Sigma0(a)
291
292         ldr     r10,[sp,#8+4]   @ b.hi
293         orr     r5,r5,r11
294         ldr     r11,[sp,#16+4]  @ c.hi
295         and     r5,r5,r12
296         and     r12,r6,r10
297         orr     r6,r6,r10
298         orr     r5,r5,r9                @ Maj(a,b,c).lo
299         and     r6,r6,r11
300         adds    r5,r5,r3
301         orr     r6,r6,r12               @ Maj(a,b,c).hi
302         sub     sp,sp,#8
303         adc     r6,r6,r4                @ h += T
304         tst     r14,#1
305         add     r14,r14,#8
306         tst     r14,#1
307         beq     .L00_15
308         ldr     r9,[sp,#184+0]
309         ldr     r10,[sp,#184+4]
310         bic     r14,r14,#1
311 .L16_79:
312         @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
313         @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
314         @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
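        @ Note that the plain shift (x)>>7 does not wrap, so its high half is
        @ just hi>>7 with no lo<<25 term; the sigma1 split below has the same
        @ asymmetry ((x)>>6 contributes only hi>>6 to HI).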
315         mov     r3,r9,lsr#1
316         ldr     r11,[sp,#80+0]
317         mov     r4,r10,lsr#1
318         ldr     r12,[sp,#80+4]
319         eor     r3,r3,r10,lsl#31
320         eor     r4,r4,r9,lsl#31
321         eor     r3,r3,r9,lsr#8
322         eor     r4,r4,r10,lsr#8
323         eor     r3,r3,r10,lsl#24
324         eor     r4,r4,r9,lsl#24
325         eor     r3,r3,r9,lsr#7
326         eor     r4,r4,r10,lsr#7
327         eor     r3,r3,r10,lsl#25
328
329         @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
330         @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
331         @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
332         mov     r9,r11,lsr#19
333         mov     r10,r12,lsr#19
334         eor     r9,r9,r12,lsl#13
335         eor     r10,r10,r11,lsl#13
336         eor     r9,r9,r12,lsr#29
337         eor     r10,r10,r11,lsr#29
338         eor     r9,r9,r11,lsl#3
339         eor     r10,r10,r12,lsl#3
340         eor     r9,r9,r11,lsr#6
341         eor     r10,r10,r12,lsr#6
342         ldr     r11,[sp,#120+0]
343         eor     r9,r9,r12,lsl#26
344
345         ldr     r12,[sp,#120+4]
346         adds    r3,r3,r9
347         ldr     r9,[sp,#192+0]
348         adc     r4,r4,r10
349
350         ldr     r10,[sp,#192+4]
351         adds    r3,r3,r11
352         adc     r4,r4,r12
353         adds    r3,r3,r9
354         adc     r4,r4,r10
355         @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
356         @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
357         @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
358         mov     r9,r7,lsr#14
359         str     r3,[sp,#64+0]
360         mov     r10,r8,lsr#14
361         str     r4,[sp,#64+4]
362         eor     r9,r9,r8,lsl#18
363         ldr     r11,[sp,#56+0]  @ h.lo
364         eor     r10,r10,r7,lsl#18
365         ldr     r12,[sp,#56+4]  @ h.hi
366         eor     r9,r9,r7,lsr#18
367         eor     r10,r10,r8,lsr#18
368         eor     r9,r9,r8,lsl#14
369         eor     r10,r10,r7,lsl#14
370         eor     r9,r9,r8,lsr#9
371         eor     r10,r10,r7,lsr#9
372         eor     r9,r9,r7,lsl#23
373         eor     r10,r10,r8,lsl#23       @ Sigma1(e)
374         adds    r3,r3,r9
375         ldr     r9,[sp,#40+0]   @ f.lo
376         adc     r4,r4,r10               @ T += Sigma1(e)
377         ldr     r10,[sp,#40+4]  @ f.hi
378         adds    r3,r3,r11
379         ldr     r11,[sp,#48+0]  @ g.lo
380         adc     r4,r4,r12               @ T += h
381         ldr     r12,[sp,#48+4]  @ g.hi
382
383         eor     r9,r9,r11
384         str     r7,[sp,#32+0]
385         eor     r10,r10,r12
386         str     r8,[sp,#32+4]
387         and     r9,r9,r7
388         str     r5,[sp,#0+0]
389         and     r10,r10,r8
390         str     r6,[sp,#0+4]
391         eor     r9,r9,r11
392         ldr     r11,[r14,#LO]   @ K[i].lo
393         eor     r10,r10,r12             @ Ch(e,f,g)
394         ldr     r12,[r14,#HI]   @ K[i].hi
395
396         adds    r3,r3,r9
397         ldr     r7,[sp,#24+0]   @ d.lo
398         adc     r4,r4,r10               @ T += Ch(e,f,g)
399         ldr     r8,[sp,#24+4]   @ d.hi
400         adds    r3,r3,r11
401         and     r9,r11,#0xff
402         adc     r4,r4,r12               @ T += K[i]
403         adds    r7,r7,r3
404         ldr     r11,[sp,#8+0]   @ b.lo
405         adc     r8,r8,r4                @ d += T
406         teq     r9,#23
407
408         ldr     r12,[sp,#16+0]  @ c.lo
409 #if __ARM_ARCH__>=7
410         it      eq                      @ Thumb2 thing, sanity check in ARM
411 #endif
412         orreq   r14,r14,#1
413         @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
414         @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
415         @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
416         mov     r9,r5,lsr#28
417         mov     r10,r6,lsr#28
418         eor     r9,r9,r6,lsl#4
419         eor     r10,r10,r5,lsl#4
420         eor     r9,r9,r6,lsr#2
421         eor     r10,r10,r5,lsr#2
422         eor     r9,r9,r5,lsl#30
423         eor     r10,r10,r6,lsl#30
424         eor     r9,r9,r6,lsr#7
425         eor     r10,r10,r5,lsr#7
426         eor     r9,r9,r5,lsl#25
427         eor     r10,r10,r6,lsl#25       @ Sigma0(a)
428         adds    r3,r3,r9
429         and     r9,r5,r11
430         adc     r4,r4,r10               @ T += Sigma0(a)
431
432         ldr     r10,[sp,#8+4]   @ b.hi
433         orr     r5,r5,r11
434         ldr     r11,[sp,#16+4]  @ c.hi
435         and     r5,r5,r12
436         and     r12,r6,r10
437         orr     r6,r6,r10
438         orr     r5,r5,r9                @ Maj(a,b,c).lo
439         and     r6,r6,r11
440         adds    r5,r5,r3
441         orr     r6,r6,r12               @ Maj(a,b,c).hi
442         sub     sp,sp,#8
443         adc     r6,r6,r4                @ h += T
444         tst     r14,#1
445         add     r14,r14,#8
446 #if __ARM_ARCH__>=7
447         ittt    eq                      @ Thumb2 thing, sanity check in ARM
448 #endif
449         ldreq   r9,[sp,#184+0]
450         ldreq   r10,[sp,#184+4]
451         beq     .L16_79
452         bic     r14,r14,#1
453
454         ldr     r3,[sp,#8+0]
455         ldr     r4,[sp,#8+4]
456         ldr     r9, [r0,#0+LO]
457         ldr     r10, [r0,#0+HI]
458         ldr     r11, [r0,#8+LO]
459         ldr     r12, [r0,#8+HI]
460         adds    r9,r5,r9
461         str     r9, [r0,#0+LO]
462         adc     r10,r6,r10
463         str     r10, [r0,#0+HI]
464         adds    r11,r3,r11
465         str     r11, [r0,#8+LO]
466         adc     r12,r4,r12
467         str     r12, [r0,#8+HI]
468
469         ldr     r5,[sp,#16+0]
470         ldr     r6,[sp,#16+4]
471         ldr     r3,[sp,#24+0]
472         ldr     r4,[sp,#24+4]
473         ldr     r9, [r0,#16+LO]
474         ldr     r10, [r0,#16+HI]
475         ldr     r11, [r0,#24+LO]
476         ldr     r12, [r0,#24+HI]
477         adds    r9,r5,r9
478         str     r9, [r0,#16+LO]
479         adc     r10,r6,r10
480         str     r10, [r0,#16+HI]
481         adds    r11,r3,r11
482         str     r11, [r0,#24+LO]
483         adc     r12,r4,r12
484         str     r12, [r0,#24+HI]
485
486         ldr     r3,[sp,#40+0]
487         ldr     r4,[sp,#40+4]
488         ldr     r9, [r0,#32+LO]
489         ldr     r10, [r0,#32+HI]
490         ldr     r11, [r0,#40+LO]
491         ldr     r12, [r0,#40+HI]
492         adds    r7,r7,r9
493         str     r7,[r0,#32+LO]
494         adc     r8,r8,r10
495         str     r8,[r0,#32+HI]
496         adds    r11,r3,r11
497         str     r11, [r0,#40+LO]
498         adc     r12,r4,r12
499         str     r12, [r0,#40+HI]
500
501         ldr     r5,[sp,#48+0]
502         ldr     r6,[sp,#48+4]
503         ldr     r3,[sp,#56+0]
504         ldr     r4,[sp,#56+4]
505         ldr     r9, [r0,#48+LO]
506         ldr     r10, [r0,#48+HI]
507         ldr     r11, [r0,#56+LO]
508         ldr     r12, [r0,#56+HI]
509         adds    r9,r5,r9
510         str     r9, [r0,#48+LO]
511         adc     r10,r6,r10
512         str     r10, [r0,#48+HI]
513         adds    r11,r3,r11
514         str     r11, [r0,#56+LO]
515         adc     r12,r4,r12
516         str     r12, [r0,#56+HI]
517
518         add     sp,sp,#640
519         sub     r14,r14,#640
520
521         teq     r1,r2
522         bne     .Loop
523
524         add     sp,sp,#8*9              @ destroy frame
525 #if __ARM_ARCH__>=5
526         ldmia   sp!,{r4-r12,pc}
527 #else
528         ldmia   sp!,{r4-r12,lr}
529         tst     lr,#1
530         moveq   pc,lr                   @ be binary compatible with V4, yet
531         .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
532 #endif
533 .size   sha512_block_data_order,.-sha512_block_data_order
534 #if __ARM_MAX_ARCH__>=7
535 .arch   armv7-a
536 .fpu    neon
537
538 .global sha512_block_data_order_neon
539 .type   sha512_block_data_order_neon,%function
540 .align  4
541 sha512_block_data_order_neon:
542 .LNEON:
543         dmb                             @ errata #451034 on early Cortex A8
544         add     r2,r1,r2,lsl#7  @ len to point at the end of inp
545         VFP_ABI_PUSH
546         adrl    r3,K512
547         vldmia  r0,{d16-d23}            @ load context
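@ Each NEON round forms the 64-bit rotations of Sigma1(e) and Sigma0(a)
@ with vshr.u64/vsli.64 pairs (e.g. a right shift by 14 followed by a
@ shift-left-insert by 50 yields ROTR(e,14)), computes Ch() and Maj()
@ with vbsl, and defers the final "h+=Maj" of each round to the start of
@ the next one (the "h+=Maj from the past" additions below).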
548 .Loop_neon:
549         vshr.u64        d24,d20,#14     @ 0
550 #if 0<16
551         vld1.64         {d0},[r1]!      @ handles unaligned
552 #endif
553         vshr.u64        d25,d20,#18
554 #if 0>0
555          vadd.i64       d16,d30                 @ h+=Maj from the past
556 #endif
557         vshr.u64        d26,d20,#41
558         vld1.64         {d28},[r3,:64]! @ K[i++]
559         vsli.64         d24,d20,#50
560         vsli.64         d25,d20,#46
561         vmov            d29,d20
562         vsli.64         d26,d20,#23
563 #if 0<16 && defined(__ARMEL__)
564         vrev64.8        d0,d0
565 #endif
566         veor            d25,d24
567         vbsl            d29,d21,d22             @ Ch(e,f,g)
568         vshr.u64        d24,d16,#28
569         veor            d26,d25                 @ Sigma1(e)
570         vadd.i64        d27,d29,d23
571         vshr.u64        d25,d16,#34
572         vsli.64         d24,d16,#36
573         vadd.i64        d27,d26
574         vshr.u64        d26,d16,#39
575         vadd.i64        d28,d0
576         vsli.64         d25,d16,#30
577         veor            d30,d16,d17
578         vsli.64         d26,d16,#25
579         veor            d23,d24,d25
580         vadd.i64        d27,d28
581         vbsl            d30,d18,d17             @ Maj(a,b,c)
582         veor            d23,d26                 @ Sigma0(a)
583         vadd.i64        d19,d27
584         vadd.i64        d30,d27
585         @ vadd.i64      d23,d30
586         vshr.u64        d24,d19,#14     @ 1
587 #if 1<16
588         vld1.64         {d1},[r1]!      @ handles unaligned
589 #endif
590         vshr.u64        d25,d19,#18
591 #if 1>0
592          vadd.i64       d23,d30                 @ h+=Maj from the past
593 #endif
594         vshr.u64        d26,d19,#41
595         vld1.64         {d28},[r3,:64]! @ K[i++]
596         vsli.64         d24,d19,#50
597         vsli.64         d25,d19,#46
598         vmov            d29,d19
599         vsli.64         d26,d19,#23
600 #if 1<16 && defined(__ARMEL__)
601         vrev64.8        d1,d1
602 #endif
603         veor            d25,d24
604         vbsl            d29,d20,d21             @ Ch(e,f,g)
605         vshr.u64        d24,d23,#28
606         veor            d26,d25                 @ Sigma1(e)
607         vadd.i64        d27,d29,d22
608         vshr.u64        d25,d23,#34
609         vsli.64         d24,d23,#36
610         vadd.i64        d27,d26
611         vshr.u64        d26,d23,#39
612         vadd.i64        d28,d1
613         vsli.64         d25,d23,#30
614         veor            d30,d23,d16
615         vsli.64         d26,d23,#25
616         veor            d22,d24,d25
617         vadd.i64        d27,d28
618         vbsl            d30,d17,d16             @ Maj(a,b,c)
619         veor            d22,d26                 @ Sigma0(a)
620         vadd.i64        d18,d27
621         vadd.i64        d30,d27
622         @ vadd.i64      d22,d30
623         vshr.u64        d24,d18,#14     @ 2
624 #if 2<16
625         vld1.64         {d2},[r1]!      @ handles unaligned
626 #endif
627         vshr.u64        d25,d18,#18
628 #if 2>0
629          vadd.i64       d22,d30                 @ h+=Maj from the past
630 #endif
631         vshr.u64        d26,d18,#41
632         vld1.64         {d28},[r3,:64]! @ K[i++]
633         vsli.64         d24,d18,#50
634         vsli.64         d25,d18,#46
635         vmov            d29,d18
636         vsli.64         d26,d18,#23
637 #if 2<16 && defined(__ARMEL__)
638         vrev64.8        d2,d2
639 #endif
640         veor            d25,d24
641         vbsl            d29,d19,d20             @ Ch(e,f,g)
642         vshr.u64        d24,d22,#28
643         veor            d26,d25                 @ Sigma1(e)
644         vadd.i64        d27,d29,d21
645         vshr.u64        d25,d22,#34
646         vsli.64         d24,d22,#36
647         vadd.i64        d27,d26
648         vshr.u64        d26,d22,#39
649         vadd.i64        d28,d2
650         vsli.64         d25,d22,#30
651         veor            d30,d22,d23
652         vsli.64         d26,d22,#25
653         veor            d21,d24,d25
654         vadd.i64        d27,d28
655         vbsl            d30,d16,d23             @ Maj(a,b,c)
656         veor            d21,d26                 @ Sigma0(a)
657         vadd.i64        d17,d27
658         vadd.i64        d30,d27
659         @ vadd.i64      d21,d30
660         vshr.u64        d24,d17,#14     @ 3
661 #if 3<16
662         vld1.64         {d3},[r1]!      @ handles unaligned
663 #endif
664         vshr.u64        d25,d17,#18
665 #if 3>0
666          vadd.i64       d21,d30                 @ h+=Maj from the past
667 #endif
668         vshr.u64        d26,d17,#41
669         vld1.64         {d28},[r3,:64]! @ K[i++]
670         vsli.64         d24,d17,#50
671         vsli.64         d25,d17,#46
672         vmov            d29,d17
673         vsli.64         d26,d17,#23
674 #if 3<16 && defined(__ARMEL__)
675         vrev64.8        d3,d3
676 #endif
677         veor            d25,d24
678         vbsl            d29,d18,d19             @ Ch(e,f,g)
679         vshr.u64        d24,d21,#28
680         veor            d26,d25                 @ Sigma1(e)
681         vadd.i64        d27,d29,d20
682         vshr.u64        d25,d21,#34
683         vsli.64         d24,d21,#36
684         vadd.i64        d27,d26
685         vshr.u64        d26,d21,#39
686         vadd.i64        d28,d3
687         vsli.64         d25,d21,#30
688         veor            d30,d21,d22
689         vsli.64         d26,d21,#25
690         veor            d20,d24,d25
691         vadd.i64        d27,d28
692         vbsl            d30,d23,d22             @ Maj(a,b,c)
693         veor            d20,d26                 @ Sigma0(a)
694         vadd.i64        d16,d27
695         vadd.i64        d30,d27
696         @ vadd.i64      d20,d30
697         vshr.u64        d24,d16,#14     @ 4
698 #if 4<16
699         vld1.64         {d4},[r1]!      @ handles unaligned
700 #endif
701         vshr.u64        d25,d16,#18
702 #if 4>0
703          vadd.i64       d20,d30                 @ h+=Maj from the past
704 #endif
705         vshr.u64        d26,d16,#41
706         vld1.64         {d28},[r3,:64]! @ K[i++]
707         vsli.64         d24,d16,#50
708         vsli.64         d25,d16,#46
709         vmov            d29,d16
710         vsli.64         d26,d16,#23
711 #if 4<16 && defined(__ARMEL__)
712         vrev64.8        d4,d4
713 #endif
714         veor            d25,d24
715         vbsl            d29,d17,d18             @ Ch(e,f,g)
716         vshr.u64        d24,d20,#28
717         veor            d26,d25                 @ Sigma1(e)
718         vadd.i64        d27,d29,d19
719         vshr.u64        d25,d20,#34
720         vsli.64         d24,d20,#36
721         vadd.i64        d27,d26
722         vshr.u64        d26,d20,#39
723         vadd.i64        d28,d4
724         vsli.64         d25,d20,#30
725         veor            d30,d20,d21
726         vsli.64         d26,d20,#25
727         veor            d19,d24,d25
728         vadd.i64        d27,d28
729         vbsl            d30,d22,d21             @ Maj(a,b,c)
730         veor            d19,d26                 @ Sigma0(a)
731         vadd.i64        d23,d27
732         vadd.i64        d30,d27
733         @ vadd.i64      d19,d30
734         vshr.u64        d24,d23,#14     @ 5
735 #if 5<16
736         vld1.64         {d5},[r1]!      @ handles unaligned
737 #endif
738         vshr.u64        d25,d23,#18
739 #if 5>0
740          vadd.i64       d19,d30                 @ h+=Maj from the past
741 #endif
742         vshr.u64        d26,d23,#41
743         vld1.64         {d28},[r3,:64]! @ K[i++]
744         vsli.64         d24,d23,#50
745         vsli.64         d25,d23,#46
746         vmov            d29,d23
747         vsli.64         d26,d23,#23
748 #if 5<16 && defined(__ARMEL__)
749         vrev64.8        d5,d5
750 #endif
751         veor            d25,d24
752         vbsl            d29,d16,d17             @ Ch(e,f,g)
753         vshr.u64        d24,d19,#28
754         veor            d26,d25                 @ Sigma1(e)
755         vadd.i64        d27,d29,d18
756         vshr.u64        d25,d19,#34
757         vsli.64         d24,d19,#36
758         vadd.i64        d27,d26
759         vshr.u64        d26,d19,#39
760         vadd.i64        d28,d5
761         vsli.64         d25,d19,#30
762         veor            d30,d19,d20
763         vsli.64         d26,d19,#25
764         veor            d18,d24,d25
765         vadd.i64        d27,d28
766         vbsl            d30,d21,d20             @ Maj(a,b,c)
767         veor            d18,d26                 @ Sigma0(a)
768         vadd.i64        d22,d27
769         vadd.i64        d30,d27
770         @ vadd.i64      d18,d30
771         vshr.u64        d24,d22,#14     @ 6
772 #if 6<16
773         vld1.64         {d6},[r1]!      @ handles unaligned
774 #endif
775         vshr.u64        d25,d22,#18
776 #if 6>0
777          vadd.i64       d18,d30                 @ h+=Maj from the past
778 #endif
779         vshr.u64        d26,d22,#41
780         vld1.64         {d28},[r3,:64]! @ K[i++]
781         vsli.64         d24,d22,#50
782         vsli.64         d25,d22,#46
783         vmov            d29,d22
784         vsli.64         d26,d22,#23
785 #if 6<16 && defined(__ARMEL__)
786         vrev64.8        d6,d6
787 #endif
788         veor            d25,d24
789         vbsl            d29,d23,d16             @ Ch(e,f,g)
790         vshr.u64        d24,d18,#28
791         veor            d26,d25                 @ Sigma1(e)
792         vadd.i64        d27,d29,d17
793         vshr.u64        d25,d18,#34
794         vsli.64         d24,d18,#36
795         vadd.i64        d27,d26
796         vshr.u64        d26,d18,#39
797         vadd.i64        d28,d6
798         vsli.64         d25,d18,#30
799         veor            d30,d18,d19
800         vsli.64         d26,d18,#25
801         veor            d17,d24,d25
802         vadd.i64        d27,d28
803         vbsl            d30,d20,d19             @ Maj(a,b,c)
804         veor            d17,d26                 @ Sigma0(a)
805         vadd.i64        d21,d27
806         vadd.i64        d30,d27
807         @ vadd.i64      d17,d30
808         vshr.u64        d24,d21,#14     @ 7
809 #if 7<16
810         vld1.64         {d7},[r1]!      @ handles unaligned
811 #endif
812         vshr.u64        d25,d21,#18
813 #if 7>0
814          vadd.i64       d17,d30                 @ h+=Maj from the past
815 #endif
816         vshr.u64        d26,d21,#41
817         vld1.64         {d28},[r3,:64]! @ K[i++]
818         vsli.64         d24,d21,#50
819         vsli.64         d25,d21,#46
820         vmov            d29,d21
821         vsli.64         d26,d21,#23
822 #if 7<16 && defined(__ARMEL__)
823         vrev64.8        d7,d7
824 #endif
825         veor            d25,d24
826         vbsl            d29,d22,d23             @ Ch(e,f,g)
827         vshr.u64        d24,d17,#28
828         veor            d26,d25                 @ Sigma1(e)
829         vadd.i64        d27,d29,d16
830         vshr.u64        d25,d17,#34
831         vsli.64         d24,d17,#36
832         vadd.i64        d27,d26
833         vshr.u64        d26,d17,#39
834         vadd.i64        d28,d7
835         vsli.64         d25,d17,#30
836         veor            d30,d17,d18
837         vsli.64         d26,d17,#25
838         veor            d16,d24,d25
839         vadd.i64        d27,d28
840         vbsl            d30,d19,d18             @ Maj(a,b,c)
841         veor            d16,d26                 @ Sigma0(a)
842         vadd.i64        d20,d27
843         vadd.i64        d30,d27
844         @ vadd.i64      d16,d30
845         vshr.u64        d24,d20,#14     @ 8
846 #if 8<16
847         vld1.64         {d8},[r1]!      @ handles unaligned
848 #endif
849         vshr.u64        d25,d20,#18
850 #if 8>0
851          vadd.i64       d16,d30                 @ h+=Maj from the past
852 #endif
853         vshr.u64        d26,d20,#41
854         vld1.64         {d28},[r3,:64]! @ K[i++]
855         vsli.64         d24,d20,#50
856         vsli.64         d25,d20,#46
857         vmov            d29,d20
858         vsli.64         d26,d20,#23
859 #if 8<16 && defined(__ARMEL__)
860         vrev64.8        d8,d8
861 #endif
862         veor            d25,d24
863         vbsl            d29,d21,d22             @ Ch(e,f,g)
864         vshr.u64        d24,d16,#28
865         veor            d26,d25                 @ Sigma1(e)
866         vadd.i64        d27,d29,d23
867         vshr.u64        d25,d16,#34
868         vsli.64         d24,d16,#36
869         vadd.i64        d27,d26
870         vshr.u64        d26,d16,#39
871         vadd.i64        d28,d8
872         vsli.64         d25,d16,#30
873         veor            d30,d16,d17
874         vsli.64         d26,d16,#25
875         veor            d23,d24,d25
876         vadd.i64        d27,d28
877         vbsl            d30,d18,d17             @ Maj(a,b,c)
878         veor            d23,d26                 @ Sigma0(a)
879         vadd.i64        d19,d27
880         vadd.i64        d30,d27
881         @ vadd.i64      d23,d30
882         vshr.u64        d24,d19,#14     @ 9
883 #if 9<16
884         vld1.64         {d9},[r1]!      @ handles unaligned
885 #endif
886         vshr.u64        d25,d19,#18
887 #if 9>0
888          vadd.i64       d23,d30                 @ h+=Maj from the past
889 #endif
890         vshr.u64        d26,d19,#41
891         vld1.64         {d28},[r3,:64]! @ K[i++]
892         vsli.64         d24,d19,#50
893         vsli.64         d25,d19,#46
894         vmov            d29,d19
895         vsli.64         d26,d19,#23
896 #if 9<16 && defined(__ARMEL__)
897         vrev64.8        d9,d9
898 #endif
899         veor            d25,d24
900         vbsl            d29,d20,d21             @ Ch(e,f,g)
901         vshr.u64        d24,d23,#28
902         veor            d26,d25                 @ Sigma1(e)
903         vadd.i64        d27,d29,d22
904         vshr.u64        d25,d23,#34
905         vsli.64         d24,d23,#36
906         vadd.i64        d27,d26
907         vshr.u64        d26,d23,#39
908         vadd.i64        d28,d9
909         vsli.64         d25,d23,#30
910         veor            d30,d23,d16
911         vsli.64         d26,d23,#25
912         veor            d22,d24,d25
913         vadd.i64        d27,d28
914         vbsl            d30,d17,d16             @ Maj(a,b,c)
915         veor            d22,d26                 @ Sigma0(a)
916         vadd.i64        d18,d27
917         vadd.i64        d30,d27
918         @ vadd.i64      d22,d30
919         vshr.u64        d24,d18,#14     @ 10
920 #if 10<16
921         vld1.64         {d10},[r1]!     @ handles unaligned
922 #endif
923         vshr.u64        d25,d18,#18
924 #if 10>0
925          vadd.i64       d22,d30                 @ h+=Maj from the past
926 #endif
927         vshr.u64        d26,d18,#41
928         vld1.64         {d28},[r3,:64]! @ K[i++]
929         vsli.64         d24,d18,#50
930         vsli.64         d25,d18,#46
931         vmov            d29,d18
932         vsli.64         d26,d18,#23
933 #if 10<16 && defined(__ARMEL__)
934         vrev64.8        d10,d10
935 #endif
936         veor            d25,d24
937         vbsl            d29,d19,d20             @ Ch(e,f,g)
938         vshr.u64        d24,d22,#28
939         veor            d26,d25                 @ Sigma1(e)
940         vadd.i64        d27,d29,d21
941         vshr.u64        d25,d22,#34
942         vsli.64         d24,d22,#36
943         vadd.i64        d27,d26
944         vshr.u64        d26,d22,#39
945         vadd.i64        d28,d10
946         vsli.64         d25,d22,#30
947         veor            d30,d22,d23
948         vsli.64         d26,d22,#25
949         veor            d21,d24,d25
950         vadd.i64        d27,d28
951         vbsl            d30,d16,d23             @ Maj(a,b,c)
952         veor            d21,d26                 @ Sigma0(a)
953         vadd.i64        d17,d27
954         vadd.i64        d30,d27
955         @ vadd.i64      d21,d30
956         vshr.u64        d24,d17,#14     @ 11
957 #if 11<16
958         vld1.64         {d11},[r1]!     @ handles unaligned
959 #endif
960         vshr.u64        d25,d17,#18
961 #if 11>0
962          vadd.i64       d21,d30                 @ h+=Maj from the past
963 #endif
964         vshr.u64        d26,d17,#41
965         vld1.64         {d28},[r3,:64]! @ K[i++]
966         vsli.64         d24,d17,#50
967         vsli.64         d25,d17,#46
968         vmov            d29,d17
969         vsli.64         d26,d17,#23
970 #if 11<16 && defined(__ARMEL__)
971         vrev64.8        d11,d11
972 #endif
973         veor            d25,d24
974         vbsl            d29,d18,d19             @ Ch(e,f,g)
975         vshr.u64        d24,d21,#28
976         veor            d26,d25                 @ Sigma1(e)
977         vadd.i64        d27,d29,d20
978         vshr.u64        d25,d21,#34
979         vsli.64         d24,d21,#36
980         vadd.i64        d27,d26
981         vshr.u64        d26,d21,#39
982         vadd.i64        d28,d11
983         vsli.64         d25,d21,#30
984         veor            d30,d21,d22
985         vsli.64         d26,d21,#25
986         veor            d20,d24,d25
987         vadd.i64        d27,d28
988         vbsl            d30,d23,d22             @ Maj(a,b,c)
989         veor            d20,d26                 @ Sigma0(a)
990         vadd.i64        d16,d27
991         vadd.i64        d30,d27
992         @ vadd.i64      d20,d30
993         vshr.u64        d24,d16,#14     @ 12
994 #if 12<16
995         vld1.64         {d12},[r1]!     @ handles unaligned
996 #endif
997         vshr.u64        d25,d16,#18
998 #if 12>0
999          vadd.i64       d20,d30                 @ h+=Maj from the past
1000 #endif
1001         vshr.u64        d26,d16,#41
1002         vld1.64         {d28},[r3,:64]! @ K[i++]
1003         vsli.64         d24,d16,#50
1004         vsli.64         d25,d16,#46
1005         vmov            d29,d16
1006         vsli.64         d26,d16,#23
1007 #if 12<16 && defined(__ARMEL__)
1008         vrev64.8        d12,d12
1009 #endif
1010         veor            d25,d24
1011         vbsl            d29,d17,d18             @ Ch(e,f,g)
1012         vshr.u64        d24,d20,#28
1013         veor            d26,d25                 @ Sigma1(e)
1014         vadd.i64        d27,d29,d19
1015         vshr.u64        d25,d20,#34
1016         vsli.64         d24,d20,#36
1017         vadd.i64        d27,d26
1018         vshr.u64        d26,d20,#39
1019         vadd.i64        d28,d12
1020         vsli.64         d25,d20,#30
1021         veor            d30,d20,d21
1022         vsli.64         d26,d20,#25
1023         veor            d19,d24,d25
1024         vadd.i64        d27,d28
1025         vbsl            d30,d22,d21             @ Maj(a,b,c)
1026         veor            d19,d26                 @ Sigma0(a)
1027         vadd.i64        d23,d27
1028         vadd.i64        d30,d27
1029         @ vadd.i64      d19,d30
1030         vshr.u64        d24,d23,#14     @ 13
1031 #if 13<16
1032         vld1.64         {d13},[r1]!     @ handles unaligned
1033 #endif
1034         vshr.u64        d25,d23,#18
1035 #if 13>0
1036          vadd.i64       d19,d30                 @ h+=Maj from the past
1037 #endif
1038         vshr.u64        d26,d23,#41
1039         vld1.64         {d28},[r3,:64]! @ K[i++]
1040         vsli.64         d24,d23,#50
1041         vsli.64         d25,d23,#46
1042         vmov            d29,d23
1043         vsli.64         d26,d23,#23
1044 #if 13<16 && defined(__ARMEL__)
1045         vrev64.8        d13,d13
1046 #endif
1047         veor            d25,d24
1048         vbsl            d29,d16,d17             @ Ch(e,f,g)
1049         vshr.u64        d24,d19,#28
1050         veor            d26,d25                 @ Sigma1(e)
1051         vadd.i64        d27,d29,d18
1052         vshr.u64        d25,d19,#34
1053         vsli.64         d24,d19,#36
1054         vadd.i64        d27,d26
1055         vshr.u64        d26,d19,#39
1056         vadd.i64        d28,d13
1057         vsli.64         d25,d19,#30
1058         veor            d30,d19,d20
1059         vsli.64         d26,d19,#25
1060         veor            d18,d24,d25
1061         vadd.i64        d27,d28
1062         vbsl            d30,d21,d20             @ Maj(a,b,c)
1063         veor            d18,d26                 @ Sigma0(a)
1064         vadd.i64        d22,d27
1065         vadd.i64        d30,d27
1066         @ vadd.i64      d18,d30
1067         vshr.u64        d24,d22,#14     @ 14
1068 #if 14<16
1069         vld1.64         {d14},[r1]!     @ handles unaligned
1070 #endif
1071         vshr.u64        d25,d22,#18
1072 #if 14>0
1073          vadd.i64       d18,d30                 @ h+=Maj from the past
1074 #endif
1075         vshr.u64        d26,d22,#41
1076         vld1.64         {d28},[r3,:64]! @ K[i++]
1077         vsli.64         d24,d22,#50
1078         vsli.64         d25,d22,#46
1079         vmov            d29,d22
1080         vsli.64         d26,d22,#23
1081 #if 14<16 && defined(__ARMEL__)
1082         vrev64.8        d14,d14
1083 #endif
1084         veor            d25,d24
1085         vbsl            d29,d23,d16             @ Ch(e,f,g)
1086         vshr.u64        d24,d18,#28
1087         veor            d26,d25                 @ Sigma1(e)
1088         vadd.i64        d27,d29,d17
1089         vshr.u64        d25,d18,#34
1090         vsli.64         d24,d18,#36
1091         vadd.i64        d27,d26
1092         vshr.u64        d26,d18,#39
1093         vadd.i64        d28,d14
1094         vsli.64         d25,d18,#30
1095         veor            d30,d18,d19
1096         vsli.64         d26,d18,#25
1097         veor            d17,d24,d25
1098         vadd.i64        d27,d28
1099         vbsl            d30,d20,d19             @ Maj(a,b,c)
1100         veor            d17,d26                 @ Sigma0(a)
1101         vadd.i64        d21,d27
1102         vadd.i64        d30,d27
1103         @ vadd.i64      d17,d30
1104         vshr.u64        d24,d21,#14     @ 15
1105 #if 15<16
1106         vld1.64         {d15},[r1]!     @ handles unaligned
1107 #endif
1108         vshr.u64        d25,d21,#18
1109 #if 15>0
1110          vadd.i64       d17,d30                 @ h+=Maj from the past
1111 #endif
1112         vshr.u64        d26,d21,#41
1113         vld1.64         {d28},[r3,:64]! @ K[i++]
1114         vsli.64         d24,d21,#50
1115         vsli.64         d25,d21,#46
1116         vmov            d29,d21
1117         vsli.64         d26,d21,#23
1118 #if 15<16 && defined(__ARMEL__)
1119         vrev64.8        d15,d15
1120 #endif
1121         veor            d25,d24
1122         vbsl            d29,d22,d23             @ Ch(e,f,g)
1123         vshr.u64        d24,d17,#28
1124         veor            d26,d25                 @ Sigma1(e)
1125         vadd.i64        d27,d29,d16
1126         vshr.u64        d25,d17,#34
1127         vsli.64         d24,d17,#36
1128         vadd.i64        d27,d26
1129         vshr.u64        d26,d17,#39
1130         vadd.i64        d28,d15
1131         vsli.64         d25,d17,#30
1132         veor            d30,d17,d18
1133         vsli.64         d26,d17,#25
1134         veor            d16,d24,d25
1135         vadd.i64        d27,d28
1136         vbsl            d30,d19,d18             @ Maj(a,b,c)
1137         veor            d16,d26                 @ Sigma0(a)
1138         vadd.i64        d20,d27
1139         vadd.i64        d30,d27
1140         @ vadd.i64      d16,d30
1141         mov             r12,#4
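        @ Rounds 16..79: r12 counts four passes of 16 rounds each. The message
        @ schedule lives in q0-q7 and is updated two words at a time, with
        @ vext.8 extracting X[i+1] and X[i+9] and vshr.u64/vsli.64 pairs
        @ forming sigma0() and sigma1().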
1142 .L16_79_neon:
1143         subs            r12,#1
1144         vshr.u64        q12,q7,#19
1145         vshr.u64        q13,q7,#61
1146          vadd.i64       d16,d30                 @ h+=Maj from the past
1147         vshr.u64        q15,q7,#6
1148         vsli.64         q12,q7,#45
1149         vext.8          q14,q0,q1,#8    @ X[i+1]
1150         vsli.64         q13,q7,#3
1151         veor            q15,q12
1152         vshr.u64        q12,q14,#1
1153         veor            q15,q13                         @ sigma1(X[i+14])
1154         vshr.u64        q13,q14,#8
1155         vadd.i64        q0,q15
1156         vshr.u64        q15,q14,#7
1157         vsli.64         q12,q14,#63
1158         vsli.64         q13,q14,#56
1159         vext.8          q14,q4,q5,#8    @ X[i+9]
1160         veor            q15,q12
1161         vshr.u64        d24,d20,#14             @ from NEON_00_15
1162         vadd.i64        q0,q14
1163         vshr.u64        d25,d20,#18             @ from NEON_00_15
1164         veor            q15,q13                         @ sigma0(X[i+1])
1165         vshr.u64        d26,d20,#41             @ from NEON_00_15
1166         vadd.i64        q0,q15
1167         vld1.64         {d28},[r3,:64]! @ K[i++]
1168         vsli.64         d24,d20,#50
1169         vsli.64         d25,d20,#46
1170         vmov            d29,d20
1171         vsli.64         d26,d20,#23
1172 #if 16<16 && defined(__ARMEL__)
1173         vrev64.8        ,
1174 #endif
1175         veor            d25,d24
1176         vbsl            d29,d21,d22             @ Ch(e,f,g)
1177         vshr.u64        d24,d16,#28
1178         veor            d26,d25                 @ Sigma1(e)
1179         vadd.i64        d27,d29,d23
1180         vshr.u64        d25,d16,#34
1181         vsli.64         d24,d16,#36
1182         vadd.i64        d27,d26
1183         vshr.u64        d26,d16,#39
1184         vadd.i64        d28,d0
1185         vsli.64         d25,d16,#30
1186         veor            d30,d16,d17
1187         vsli.64         d26,d16,#25
1188         veor            d23,d24,d25
1189         vadd.i64        d27,d28
1190         vbsl            d30,d18,d17             @ Maj(a,b,c)
1191         veor            d23,d26                 @ Sigma0(a)
1192         vadd.i64        d19,d27
1193         vadd.i64        d30,d27
1194         @ vadd.i64      d23,d30
1195         vshr.u64        d24,d19,#14     @ 17
1196 #if 17<16
1197         vld1.64         {d1},[r1]!      @ handles unaligned
1198 #endif
1199         vshr.u64        d25,d19,#18
1200 #if 17>0
1201          vadd.i64       d23,d30                 @ h+=Maj from the past
1202 #endif
1203         vshr.u64        d26,d19,#41
1204         vld1.64         {d28},[r3,:64]! @ K[i++]
1205         vsli.64         d24,d19,#50
1206         vsli.64         d25,d19,#46
1207         vmov            d29,d19
1208         vsli.64         d26,d19,#23
1209 #if 17<16 && defined(__ARMEL__)
1210         vrev64.8        ,
1211 #endif
1212         veor            d25,d24
1213         vbsl            d29,d20,d21             @ Ch(e,f,g)
1214         vshr.u64        d24,d23,#28
1215         veor            d26,d25                 @ Sigma1(e)
1216         vadd.i64        d27,d29,d22
1217         vshr.u64        d25,d23,#34
1218         vsli.64         d24,d23,#36
1219         vadd.i64        d27,d26
1220         vshr.u64        d26,d23,#39
1221         vadd.i64        d28,d1
1222         vsli.64         d25,d23,#30
1223         veor            d30,d23,d16
1224         vsli.64         d26,d23,#25
1225         veor            d22,d24,d25
1226         vadd.i64        d27,d28
1227         vbsl            d30,d17,d16             @ Maj(a,b,c)
1228         veor            d22,d26                 @ Sigma0(a)
1229         vadd.i64        d18,d27
1230         vadd.i64        d30,d27
1231         @ vadd.i64      d22,d30
1232         vshr.u64        q12,q0,#19
1233         vshr.u64        q13,q0,#61
1234          vadd.i64       d22,d30                 @ h+=Maj from the past
1235         vshr.u64        q15,q0,#6
1236         vsli.64         q12,q0,#45
1237         vext.8          q14,q1,q2,#8    @ X[i+1]
1238         vsli.64         q13,q0,#3
1239         veor            q15,q12
1240         vshr.u64        q12,q14,#1
1241         veor            q15,q13                         @ sigma1(X[i+14])
1242         vshr.u64        q13,q14,#8
1243         vadd.i64        q1,q15
1244         vshr.u64        q15,q14,#7
1245         vsli.64         q12,q14,#63
1246         vsli.64         q13,q14,#56
1247         vext.8          q14,q5,q6,#8    @ X[i+9]
1248         veor            q15,q12
1249         vshr.u64        d24,d18,#14             @ from NEON_00_15
1250         vadd.i64        q1,q14
1251         vshr.u64        d25,d18,#18             @ from NEON_00_15
1252         veor            q15,q13                         @ sigma0(X[i+1])
1253         vshr.u64        d26,d18,#41             @ from NEON_00_15
1254         vadd.i64        q1,q15
1255         vld1.64         {d28},[r3,:64]! @ K[i++]
1256         vsli.64         d24,d18,#50
1257         vsli.64         d25,d18,#46
1258         vmov            d29,d18
1259         vsli.64         d26,d18,#23
1260 #if 18<16 && defined(__ARMEL__)
1261         vrev64.8        ,
1262 #endif
1263         veor            d25,d24
1264         vbsl            d29,d19,d20             @ Ch(e,f,g)
1265         vshr.u64        d24,d22,#28
1266         veor            d26,d25                 @ Sigma1(e)
1267         vadd.i64        d27,d29,d21
1268         vshr.u64        d25,d22,#34
1269         vsli.64         d24,d22,#36
1270         vadd.i64        d27,d26
1271         vshr.u64        d26,d22,#39
1272         vadd.i64        d28,d2
1273         vsli.64         d25,d22,#30
1274         veor            d30,d22,d23
1275         vsli.64         d26,d22,#25
1276         veor            d21,d24,d25
1277         vadd.i64        d27,d28
1278         vbsl            d30,d16,d23             @ Maj(a,b,c)
1279         veor            d21,d26                 @ Sigma0(a)
1280         vadd.i64        d17,d27
1281         vadd.i64        d30,d27
1282         @ vadd.i64      d21,d30
1283         vshr.u64        d24,d17,#14     @ 19
1284 #if 19<16
1285         vld1.64         {d3},[r1]!      @ handles unaligned
1286 #endif
1287         vshr.u64        d25,d17,#18
1288 #if 19>0
1289          vadd.i64       d21,d30                 @ h+=Maj from the past
1290 #endif
1291         vshr.u64        d26,d17,#41
1292         vld1.64         {d28},[r3,:64]! @ K[i++]
1293         vsli.64         d24,d17,#50
1294         vsli.64         d25,d17,#46
1295         vmov            d29,d17
1296         vsli.64         d26,d17,#23
1297 #if 19<16 && defined(__ARMEL__)
1298         vrev64.8        ,
1299 #endif
1300         veor            d25,d24
1301         vbsl            d29,d18,d19             @ Ch(e,f,g)
1302         vshr.u64        d24,d21,#28
1303         veor            d26,d25                 @ Sigma1(e)
1304         vadd.i64        d27,d29,d20
1305         vshr.u64        d25,d21,#34
1306         vsli.64         d24,d21,#36
1307         vadd.i64        d27,d26
1308         vshr.u64        d26,d21,#39
1309         vadd.i64        d28,d3
1310         vsli.64         d25,d21,#30
1311         veor            d30,d21,d22
1312         vsli.64         d26,d21,#25
1313         veor            d20,d24,d25
1314         vadd.i64        d27,d28
1315         vbsl            d30,d23,d22             @ Maj(a,b,c)
1316         veor            d20,d26                 @ Sigma0(a)
1317         vadd.i64        d16,d27
1318         vadd.i64        d30,d27
1319         @ vadd.i64      d20,d30
1320         vshr.u64        q12,q1,#19
1321         vshr.u64        q13,q1,#61
1322          vadd.i64       d20,d30                 @ h+=Maj from the past
1323         vshr.u64        q15,q1,#6
1324         vsli.64         q12,q1,#45
1325         vext.8          q14,q2,q3,#8    @ X[i+1]
1326         vsli.64         q13,q1,#3
1327         veor            q15,q12
1328         vshr.u64        q12,q14,#1
1329         veor            q15,q13                         @ sigma1(X[i+14])
1330         vshr.u64        q13,q14,#8
1331         vadd.i64        q2,q15
1332         vshr.u64        q15,q14,#7
1333         vsli.64         q12,q14,#63
1334         vsli.64         q13,q14,#56
1335         vext.8          q14,q6,q7,#8    @ X[i+9]
1336         veor            q15,q12
1337         vshr.u64        d24,d16,#14             @ from NEON_00_15
1338         vadd.i64        q2,q14
1339         vshr.u64        d25,d16,#18             @ from NEON_00_15
1340         veor            q15,q13                         @ sigma0(X[i+1])
1341         vshr.u64        d26,d16,#41             @ from NEON_00_15
1342         vadd.i64        q2,q15
1343         vld1.64         {d28},[r3,:64]! @ K[i++]
1344         vsli.64         d24,d16,#50
1345         vsli.64         d25,d16,#46
1346         vmov            d29,d16
1347         vsli.64         d26,d16,#23
1348 #if 20<16 && defined(__ARMEL__)
1349         vrev64.8        ,
1350 #endif
1351         veor            d25,d24
1352         vbsl            d29,d17,d18             @ Ch(e,f,g)
1353         vshr.u64        d24,d20,#28
1354         veor            d26,d25                 @ Sigma1(e)
1355         vadd.i64        d27,d29,d19
1356         vshr.u64        d25,d20,#34
1357         vsli.64         d24,d20,#36
1358         vadd.i64        d27,d26
1359         vshr.u64        d26,d20,#39
1360         vadd.i64        d28,d4
1361         vsli.64         d25,d20,#30
1362         veor            d30,d20,d21
1363         vsli.64         d26,d20,#25
1364         veor            d19,d24,d25
1365         vadd.i64        d27,d28
1366         vbsl            d30,d22,d21             @ Maj(a,b,c)
1367         veor            d19,d26                 @ Sigma0(a)
1368         vadd.i64        d23,d27
1369         vadd.i64        d30,d27
1370         @ vadd.i64      d19,d30
1371         vshr.u64        d24,d23,#14     @ 21
1372 #if 21<16
1373         vld1.64         {d5},[r1]!      @ handles unaligned
1374 #endif
1375         vshr.u64        d25,d23,#18
1376 #if 21>0
1377          vadd.i64       d19,d30                 @ h+=Maj from the past
1378 #endif
1379         vshr.u64        d26,d23,#41
1380         vld1.64         {d28},[r3,:64]! @ K[i++]
1381         vsli.64         d24,d23,#50
1382         vsli.64         d25,d23,#46
1383         vmov            d29,d23
1384         vsli.64         d26,d23,#23
1385 #if 21<16 && defined(__ARMEL__)
1386         vrev64.8        ,
1387 #endif
1388         veor            d25,d24
1389         vbsl            d29,d16,d17             @ Ch(e,f,g)
1390         vshr.u64        d24,d19,#28
1391         veor            d26,d25                 @ Sigma1(e)
1392         vadd.i64        d27,d29,d18
1393         vshr.u64        d25,d19,#34
1394         vsli.64         d24,d19,#36
1395         vadd.i64        d27,d26
1396         vshr.u64        d26,d19,#39
1397         vadd.i64        d28,d5
1398         vsli.64         d25,d19,#30
1399         veor            d30,d19,d20
1400         vsli.64         d26,d19,#25
1401         veor            d18,d24,d25
1402         vadd.i64        d27,d28
1403         vbsl            d30,d21,d20             @ Maj(a,b,c)
1404         veor            d18,d26                 @ Sigma0(a)
1405         vadd.i64        d22,d27
1406         vadd.i64        d30,d27
1407         @ vadd.i64      d18,d30
	vshr.u64	q12,q2,#19
	vshr.u64	q13,q2,#61
	 vadd.i64	d18,d30			@ h+=Maj from the past
	vshr.u64	q15,q2,#6
	vsli.64		q12,q2,#45
	vext.8		q14,q3,q4,#8	@ X[i+1]
	vsli.64		q13,q2,#3
	veor		q15,q12
	vshr.u64	q12,q14,#1
	veor		q15,q13				@ sigma1(X[i+14])
	vshr.u64	q13,q14,#8
	vadd.i64	q3,q15
	vshr.u64	q15,q14,#7
	vsli.64		q12,q14,#63
	vsli.64		q13,q14,#56
	vext.8		q14,q7,q0,#8	@ X[i+9]
	veor		q15,q12
	vshr.u64	d24,d22,#14		@ from NEON_00_15
	vadd.i64	q3,q14
	vshr.u64	d25,d22,#18		@ from NEON_00_15
	veor		q15,q13				@ sigma0(X[i+1])
	vshr.u64	d26,d22,#41		@ from NEON_00_15
	vadd.i64	q3,q15
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d22,#50
	vsli.64		d25,d22,#46
	vmov		d29,d22
	vsli.64		d26,d22,#23
#if 22<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d23,d16		@ Ch(e,f,g)
	vshr.u64	d24,d18,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d17
	vshr.u64	d25,d18,#34
	vsli.64		d24,d18,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d18,#39
	vadd.i64	d28,d6
	vsli.64		d25,d18,#30
	veor		d30,d18,d19
	vsli.64		d26,d18,#25
	veor		d17,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d20,d19		@ Maj(a,b,c)
	veor		d17,d26			@ Sigma0(a)
	vadd.i64	d21,d27
	vadd.i64	d30,d27
	@ vadd.i64	d17,d30
	vshr.u64	d24,d21,#14	@ 23
#if 23<16
	vld1.64		{d7},[r1]!	@ handles unaligned
#endif
	vshr.u64	d25,d21,#18
#if 23>0
	 vadd.i64	d17,d30			@ h+=Maj from the past
#endif
	vshr.u64	d26,d21,#41
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d21,#50
	vsli.64		d25,d21,#46
	vmov		d29,d21
	vsli.64		d26,d21,#23
#if 23<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d22,d23		@ Ch(e,f,g)
	vshr.u64	d24,d17,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d16
	vshr.u64	d25,d17,#34
	vsli.64		d24,d17,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d17,#39
	vadd.i64	d28,d7
	vsli.64		d25,d17,#30
	veor		d30,d17,d18
	vsli.64		d26,d17,#25
	veor		d16,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d19,d18		@ Maj(a,b,c)
	veor		d16,d26			@ Sigma0(a)
	vadd.i64	d20,d27
	vadd.i64	d30,d27
	@ vadd.i64	d16,d30
	vshr.u64	q12,q3,#19
	vshr.u64	q13,q3,#61
	 vadd.i64	d16,d30			@ h+=Maj from the past
	vshr.u64	q15,q3,#6
	vsli.64		q12,q3,#45
	vext.8		q14,q4,q5,#8	@ X[i+1]
	vsli.64		q13,q3,#3
	veor		q15,q12
	vshr.u64	q12,q14,#1
	veor		q15,q13				@ sigma1(X[i+14])
	vshr.u64	q13,q14,#8
	vadd.i64	q4,q15
	vshr.u64	q15,q14,#7
	vsli.64		q12,q14,#63
	vsli.64		q13,q14,#56
	vext.8		q14,q0,q1,#8	@ X[i+9]
	veor		q15,q12
	vshr.u64	d24,d20,#14		@ from NEON_00_15
	vadd.i64	q4,q14
	vshr.u64	d25,d20,#18		@ from NEON_00_15
	veor		q15,q13				@ sigma0(X[i+1])
	vshr.u64	d26,d20,#41		@ from NEON_00_15
	vadd.i64	q4,q15
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d20,#50
	vsli.64		d25,d20,#46
	vmov		d29,d20
	vsli.64		d26,d20,#23
#if 24<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d21,d22		@ Ch(e,f,g)
	vshr.u64	d24,d16,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d23
	vshr.u64	d25,d16,#34
	vsli.64		d24,d16,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d16,#39
	vadd.i64	d28,d8
	vsli.64		d25,d16,#30
	veor		d30,d16,d17
	vsli.64		d26,d16,#25
	veor		d23,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d18,d17		@ Maj(a,b,c)
	veor		d23,d26			@ Sigma0(a)
	vadd.i64	d19,d27
	vadd.i64	d30,d27
	@ vadd.i64	d23,d30
	vshr.u64	d24,d19,#14	@ 25
#if 25<16
	vld1.64		{d9},[r1]!	@ handles unaligned
#endif
	vshr.u64	d25,d19,#18
#if 25>0
	 vadd.i64	d23,d30			@ h+=Maj from the past
#endif
	vshr.u64	d26,d19,#41
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d19,#50
	vsli.64		d25,d19,#46
	vmov		d29,d19
	vsli.64		d26,d19,#23
#if 25<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d20,d21		@ Ch(e,f,g)
	vshr.u64	d24,d23,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d22
	vshr.u64	d25,d23,#34
	vsli.64		d24,d23,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d23,#39
	vadd.i64	d28,d9
	vsli.64		d25,d23,#30
	veor		d30,d23,d16
	vsli.64		d26,d23,#25
	veor		d22,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d17,d16		@ Maj(a,b,c)
	veor		d22,d26			@ Sigma0(a)
	vadd.i64	d18,d27
	vadd.i64	d30,d27
	@ vadd.i64	d22,d30
	vshr.u64	q12,q4,#19
	vshr.u64	q13,q4,#61
	 vadd.i64	d22,d30			@ h+=Maj from the past
	vshr.u64	q15,q4,#6
	vsli.64		q12,q4,#45
	vext.8		q14,q5,q6,#8	@ X[i+1]
	vsli.64		q13,q4,#3
	veor		q15,q12
	vshr.u64	q12,q14,#1
	veor		q15,q13				@ sigma1(X[i+14])
	vshr.u64	q13,q14,#8
	vadd.i64	q5,q15
	vshr.u64	q15,q14,#7
	vsli.64		q12,q14,#63
	vsli.64		q13,q14,#56
	vext.8		q14,q1,q2,#8	@ X[i+9]
	veor		q15,q12
	vshr.u64	d24,d18,#14		@ from NEON_00_15
	vadd.i64	q5,q14
	vshr.u64	d25,d18,#18		@ from NEON_00_15
	veor		q15,q13				@ sigma0(X[i+1])
	vshr.u64	d26,d18,#41		@ from NEON_00_15
	vadd.i64	q5,q15
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d18,#50
	vsli.64		d25,d18,#46
	vmov		d29,d18
	vsli.64		d26,d18,#23
#if 26<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d19,d20		@ Ch(e,f,g)
	vshr.u64	d24,d22,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d21
	vshr.u64	d25,d22,#34
	vsli.64		d24,d22,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d22,#39
	vadd.i64	d28,d10
	vsli.64		d25,d22,#30
	veor		d30,d22,d23
	vsli.64		d26,d22,#25
	veor		d21,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d16,d23		@ Maj(a,b,c)
	veor		d21,d26			@ Sigma0(a)
	vadd.i64	d17,d27
	vadd.i64	d30,d27
	@ vadd.i64	d21,d30
	vshr.u64	d24,d17,#14	@ 27
#if 27<16
	vld1.64		{d11},[r1]!	@ handles unaligned
#endif
	vshr.u64	d25,d17,#18
#if 27>0
	 vadd.i64	d21,d30			@ h+=Maj from the past
#endif
	vshr.u64	d26,d17,#41
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d17,#50
	vsli.64		d25,d17,#46
	vmov		d29,d17
	vsli.64		d26,d17,#23
#if 27<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d18,d19		@ Ch(e,f,g)
	vshr.u64	d24,d21,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d20
	vshr.u64	d25,d21,#34
	vsli.64		d24,d21,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d21,#39
	vadd.i64	d28,d11
	vsli.64		d25,d21,#30
	veor		d30,d21,d22
	vsli.64		d26,d21,#25
	veor		d20,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d23,d22		@ Maj(a,b,c)
	veor		d20,d26			@ Sigma0(a)
	vadd.i64	d16,d27
	vadd.i64	d30,d27
	@ vadd.i64	d20,d30
	vshr.u64	q12,q5,#19
	vshr.u64	q13,q5,#61
	 vadd.i64	d20,d30			@ h+=Maj from the past
	vshr.u64	q15,q5,#6
	vsli.64		q12,q5,#45
	vext.8		q14,q6,q7,#8	@ X[i+1]
	vsli.64		q13,q5,#3
	veor		q15,q12
	vshr.u64	q12,q14,#1
	veor		q15,q13				@ sigma1(X[i+14])
	vshr.u64	q13,q14,#8
	vadd.i64	q6,q15
	vshr.u64	q15,q14,#7
	vsli.64		q12,q14,#63
	vsli.64		q13,q14,#56
	vext.8		q14,q2,q3,#8	@ X[i+9]
	veor		q15,q12
	vshr.u64	d24,d16,#14		@ from NEON_00_15
	vadd.i64	q6,q14
	vshr.u64	d25,d16,#18		@ from NEON_00_15
	veor		q15,q13				@ sigma0(X[i+1])
	vshr.u64	d26,d16,#41		@ from NEON_00_15
	vadd.i64	q6,q15
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d16,#50
	vsli.64		d25,d16,#46
	vmov		d29,d16
	vsli.64		d26,d16,#23
#if 28<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d17,d18		@ Ch(e,f,g)
	vshr.u64	d24,d20,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d19
	vshr.u64	d25,d20,#34
	vsli.64		d24,d20,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d20,#39
	vadd.i64	d28,d12
	vsli.64		d25,d20,#30
	veor		d30,d20,d21
	vsli.64		d26,d20,#25
	veor		d19,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d22,d21		@ Maj(a,b,c)
	veor		d19,d26			@ Sigma0(a)
	vadd.i64	d23,d27
	vadd.i64	d30,d27
	@ vadd.i64	d19,d30
	vshr.u64	d24,d23,#14	@ 29
#if 29<16
	vld1.64		{d13},[r1]!	@ handles unaligned
#endif
	vshr.u64	d25,d23,#18
#if 29>0
	 vadd.i64	d19,d30			@ h+=Maj from the past
#endif
	vshr.u64	d26,d23,#41
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d23,#50
	vsli.64		d25,d23,#46
	vmov		d29,d23
	vsli.64		d26,d23,#23
#if 29<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d16,d17		@ Ch(e,f,g)
	vshr.u64	d24,d19,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d18
	vshr.u64	d25,d19,#34
	vsli.64		d24,d19,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d19,#39
	vadd.i64	d28,d13
	vsli.64		d25,d19,#30
	veor		d30,d19,d20
	vsli.64		d26,d19,#25
	veor		d18,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d21,d20		@ Maj(a,b,c)
	veor		d18,d26			@ Sigma0(a)
	vadd.i64	d22,d27
	vadd.i64	d30,d27
	@ vadd.i64	d18,d30
	vshr.u64	q12,q6,#19
	vshr.u64	q13,q6,#61
	 vadd.i64	d18,d30			@ h+=Maj from the past
	vshr.u64	q15,q6,#6
	vsli.64		q12,q6,#45
	vext.8		q14,q7,q0,#8	@ X[i+1]
	vsli.64		q13,q6,#3
	veor		q15,q12
	vshr.u64	q12,q14,#1
	veor		q15,q13				@ sigma1(X[i+14])
	vshr.u64	q13,q14,#8
	vadd.i64	q7,q15
	vshr.u64	q15,q14,#7
	vsli.64		q12,q14,#63
	vsli.64		q13,q14,#56
	vext.8		q14,q3,q4,#8	@ X[i+9]
	veor		q15,q12
	vshr.u64	d24,d22,#14		@ from NEON_00_15
	vadd.i64	q7,q14
	vshr.u64	d25,d22,#18		@ from NEON_00_15
	veor		q15,q13				@ sigma0(X[i+1])
	vshr.u64	d26,d22,#41		@ from NEON_00_15
	vadd.i64	q7,q15
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d22,#50
	vsli.64		d25,d22,#46
	vmov		d29,d22
	vsli.64		d26,d22,#23
#if 30<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d23,d16		@ Ch(e,f,g)
	vshr.u64	d24,d18,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d17
	vshr.u64	d25,d18,#34
	vsli.64		d24,d18,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d18,#39
	vadd.i64	d28,d14
	vsli.64		d25,d18,#30
	veor		d30,d18,d19
	vsli.64		d26,d18,#25
	veor		d17,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d20,d19		@ Maj(a,b,c)
	veor		d17,d26			@ Sigma0(a)
	vadd.i64	d21,d27
	vadd.i64	d30,d27
	@ vadd.i64	d17,d30
	vshr.u64	d24,d21,#14	@ 31
#if 31<16
	vld1.64		{d15},[r1]!	@ handles unaligned
#endif
	vshr.u64	d25,d21,#18
#if 31>0
	 vadd.i64	d17,d30			@ h+=Maj from the past
#endif
	vshr.u64	d26,d21,#41
	vld1.64		{d28},[r3,:64]!	@ K[i++]
	vsli.64		d24,d21,#50
	vsli.64		d25,d21,#46
	vmov		d29,d21
	vsli.64		d26,d21,#23
#if 31<16 && defined(__ARMEL__)
	vrev64.8	,
#endif
	veor		d25,d24
	vbsl		d29,d22,d23		@ Ch(e,f,g)
	vshr.u64	d24,d17,#28
	veor		d26,d25			@ Sigma1(e)
	vadd.i64	d27,d29,d16
	vshr.u64	d25,d17,#34
	vsli.64		d24,d17,#36
	vadd.i64	d27,d26
	vshr.u64	d26,d17,#39
	vadd.i64	d28,d15
	vsli.64		d25,d17,#30
	veor		d30,d17,d18
	vsli.64		d26,d17,#25
	veor		d16,d24,d25
	vadd.i64	d27,d28
	vbsl		d30,d19,d18		@ Maj(a,b,c)
	veor		d16,d26			@ Sigma0(a)
	vadd.i64	d20,d27
	vadd.i64	d30,d27
	@ vadd.i64	d16,d30
	bne		.L16_79_neon

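@ End of one 128-byte block: fold the deferred Maj add into the last
@ working variable, add the eight working variables (d16-d23) into the
@ hash state pointed to by r0, compare the input pointer r1 against the
@ end pointer r2, rewind the K512 pointer r3 by 80*8 = 640 bytes, and
@ loop for the next block if any input remains.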
	 vadd.i64	d16,d30		@ h+=Maj from the past
	vldmia		r0,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		r0,{d16-d23}	@ save context
	teq		r1,r2
	sub		r3,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	bx	lr				@ .word 0xe12fff1e
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	2
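@ OPENSSL_armcap_P is OpenSSL's run-time CPU-capability word; it is only
@ referenced by non-kernel builds, hence the !__KERNEL__ guard below.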
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif