/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>
#include <asm/kasan.h>

#ifndef SELFTEST_CASE
/* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
#define SELFTEST_CASE	0
#endif
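/*
 * The "test_feature = (SELFTEST_CASE == n)" assignments below are inert
 * as far as the kernel build is concerned; they appear to exist so the
 * powerpc copyloops selftests can rebuild this file once per case and
 * exercise each feature-fixup variant.  In the kernel itself the variant
 * is chosen by runtime code patching.
 */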

	.align	7
_GLOBAL_TOC_KASAN(memcpy)
BEGIN_FTR_SECTION
#ifdef __LITTLE_ENDIAN__
	cmpdi	cr7,r5,0
#else
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* save destination pointer for return value */
#endif
FTR_SECTION_ELSE
#ifdef CONFIG_PPC_BOOK3S_64
	b	memcpy_power7
#endif
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
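/*
 * The section above is resolved at boot: CPUs without CPU_FTR_VMX_COPY
 * keep the first section and fall through to the generic code below,
 * while CPUs that have it (Book3S 64 only) branch straight to the
 * VMX-capable memcpy_power7.
 */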
#ifdef __LITTLE_ENDIAN__
	/* dumb little-endian memcpy that will get replaced at runtime */
	addi r9,r3,-1
	addi r4,r4,-1
	beqlr cr7
	mtctr r5
1:	lbzu r10,1(r4)
	stbu r10,1(r9)
	bdnz 1b
	blr
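/*
 * Roughly the following C, kept deliberately simple; r3 is never
 * modified, so it still holds the destination pointer for the return:
 *
 *	void *memcpy(void *dest, const void *src, size_t n)
 *	{
 *		char *d = dest;
 *		const char *s = src;
 *
 *		while (n--)
 *			*d++ = *s++;
 *		return dest;
 *	}
 */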
#else
	PPC_MTOCRF(0x01,r5)
	cmpldi	cr1,r5,16
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7
	dcbt	0,r4
	blt	cr1,.Lshort_copy
/*
 * Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.  At the time of writing the only CPU that has this
 * combination of bits set is Power6.
 */
test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)
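/*
 * From .Ldst_aligned on, the destination is 8-byte aligned (or this CPU
 * copes well with unaligned std).  cr7 holds the low four bits of the
 * remaining length and cr1 holds the comparison of the length with 16.
 */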
.Ldst_aligned:
	addi	r3,r3,-16
test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	srdi	r7,r5,4
	ld	r9,0(r4)
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7
	bf	cr7*4+0,2f
	addi	r3,r3,8
	addi	r4,r4,8
	mr	r8,r9
	blt	cr1,3f
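/*
 * Main aligned loop: 16 bytes per iteration, software pipelined so the
 * loads run one doubleword ahead of the stores.
 */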
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)
	beq	3f
	addi	r3,r3,16
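/*
 * Copy the remaining 0-7 bytes using the low bits of the length still
 * sitting in cr7: bits 1, 2 and 3 of the field select a 4-, 2- and
 * 1-byte copy respectively.
 */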
.Ldo_tail:
	bf	cr7*4+1,1f
	lwz	r9,8(r4)
	addi	r4,r4,4
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	lhz	r9,8(r4)
	addi	r4,r4,2
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	lbz	r9,8(r4)
	stb	r9,0(r3)
3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr

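/*
 * Unaligned source: round r4 down to a doubleword boundary and build
 * each destination doubleword from two consecutive source doublewords.
 * With off = original source offset (r10 = 8*off, r11 = 64 - 8*off),
 * each store is, in big-endian terms:
 *
 *	d = (s0 << 8*off) | (s1 >> (64 - 8*off))
 *
 * e.g. for off == 3 a destination doubleword takes the last five bytes
 * of one source doubleword and the first three of the next.
 */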
.Lsrc_unaligned:
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpdi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0

	bt	cr7*4+0,0f

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	beq	4f
	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,6f
	ld	r0,8(r4)
	srd	r7,r0,r11
	or	r9,r7,r9
6:
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr

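/*
 * Unaligned destination: copy 1, 2 and/or 4 bytes (selected by the low
 * bits of r6, the distance to the next 8-byte boundary) until the
 * destination is aligned, then rejoin the aligned path with the
 * adjusted length.
 */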
.Ldst_unaligned:
	PPC_MTOCRF(0x01,r6)		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

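/*
 * Fewer than 16 bytes in total: break the length, whose low bits are in
 * cr7, into 8-, 4-, 2- and 1-byte pieces, in that order.
 */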
.Lshort_copy:
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)	/* return dest pointer */
	blr
#endif
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL_KASAN(memcpy)