/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE   0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
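
/*
 * vperm reads its two source registers as a single 32-byte value in
 * big-endian byte order.  On little-endian the bytes within each 16B
 * load are reversed, so the source operands are swapped and lvsr is
 * used in place of lvsl so that the generated control vector still
 * selects the same logical bytes.
 */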

_GLOBAL(memcpy_power7)
        cmpldi  r5,16
        cmpldi  cr1,r5,4096
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        blt     .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
        bgt     cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)
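
        /*
         * r6 = (-src) & 7 is the byte count needed to reach the next
         * 8B boundary.  mtocrf 0x01 places the low four bits of r6 in
         * cr7, so cr7*4+3, +2 and +1 test the 1B, 2B and 4B components
         * of that count.  Roughly equivalent C:
         *
         *      pad = -(unsigned long)src & 7;
         *      if (pad & 1) copy one byte;
         *      if (pad & 2) copy a halfword;
         *      if (pad & 4) copy a word;
         */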

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)
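
        /*
         * r14-r22 are nonvolatile under the ppc64 ELF ABI, so they are
         * saved before the unrolled loop below uses them as copy
         * temporaries; LR is stored in its conventional slot in the
         * caller's frame.
         */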

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        ld      r14,64(r4)
        ld      r15,72(r4)
        ld      r16,80(r4)
        ld      r17,88(r4)
        ld      r18,96(r4)
        ld      r19,104(r4)
        ld      r20,112(r4)
        ld      r21,120(r4)
        addi    r4,r4,128
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        std     r14,64(r3)
        std     r15,72(r3)
        std     r16,80(r3)
        std     r17,88(r3)
        std     r18,96(r3)
        std     r19,104(r3)
        std     r20,112(r3)
        std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6
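
        /*
         * r6 = remaining length / 16, so after mtocrf the cr7*4+1, +2
         * and +3 bits test the 64B, 32B and 16B components of the
         * remainder and each block below executes at most once.
         */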

6:      bf      cr7*4+1,7f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        addi    r4,r4,64
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        addi    r4,r4,32
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
        ld      r0,0(r4)
        ld      r6,8(r4)
        addi    r4,r4,16
        std     r0,0(r3)
        std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
        mflr    r0
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      CFUNC(enter_vmx_ops)
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0
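
        /*
         * enter_vmx_ops returns 0 when the vector unit cannot be used
         * here; cr1 keeps that result across the stream setup below so
         * we can fall back to the integer copy.  r3-r5 are volatile
         * across the call, hence the save and reload around it.
         */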

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)
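
        /*
         * This encodes, for each stream, the 128B-aligned start
         * address (r6/r9), the transfer size in cachelines (capped at
         * the field maximum of 0x3FF), a prefetch depth of 7 and the
         * stream ID in the low bits.
         */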

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy
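
        /*
         * (src ^ dst) & 0xf is zero only when source and destination
         * share the same offset within a 16B quadword; only then can
         * plain lvx/stvx copy the data without realignment.
         */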

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)
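
        /*
         * The same CR trick at vector granularity: r7 = pad/16, so
         * cr7*4+3, +2 and +1 select one 16B, one 32B and one 64B
         * vector copy to reach the next 128B (cacheline) boundary.
         */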

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
        lvx     v1,0,r4
        addi    r4,r4,16
        stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
        lvx     v1,0,r4
        lvx     v0,r4,r9
        addi    r4,r4,32
        stvx    v1,0,r3
        stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     v3,0,r4
        lvx     v2,r4,r9
        lvx     v1,r4,r10
        lvx     v0,r4,r11
        addi    r4,r4,64
        stvx    v3,0,r3
        stvx    v2,r3,r9
        stvx    v1,r3,r10
        stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     v7,0,r4
        lvx     v6,r4,r9
        lvx     v5,r4,r10
        lvx     v4,r4,r11
        lvx     v3,r4,r12
        lvx     v2,r4,r14
        lvx     v1,r4,r15
        lvx     v0,r4,r16
        addi    r4,r4,128
        stvx    v7,0,r3
        stvx    v6,r3,r9
        stvx    v5,r3,r10
        stvx    v4,r3,r11
        stvx    v3,r3,r12
        stvx    v2,r3,r14
        stvx    v1,r3,r15
        stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     v3,0,r4
        lvx     v2,r4,r9
        lvx     v1,r4,r10
        lvx     v0,r4,r11
        addi    r4,r4,64
        stvx    v3,0,r3
        stvx    v2,r3,r9
        stvx    v1,r3,r10
        stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     v1,0,r4
        lvx     v0,r4,r9
        addi    r4,r4,32
        stvx    v1,0,r3
        stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     v1,0,r4
        addi    r4,r4,16
        stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       CFUNC(exit_vmx_ops)             /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r7,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        LVS(v16,0,r4)           /* Set up the permute control vector */
        lvx     v0,0,r4
        addi    r4,r4,16
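
        /*
         * lvx ignores the low four address bits, so every load fetches
         * an aligned quadword.  v16 (from lvsl/lvsr) steers VPERM to
         * assemble the wanted unaligned 16 bytes from each pair of
         * adjacent aligned loads, and v0 carries the most recent load
         * forward, keeping the loop one quadword ahead of the bytes it
         * has stored.
         */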

        bf      cr7*4+3,5f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
        stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
        stvx    v8,0,r3
        stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
        lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
        lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
        lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
        lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
        lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
        lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
        lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
        lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
        lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
        lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        stvx    v12,r3,r12
        stvx    v13,r3,r14
        stvx    v14,r3,r15
        stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
        lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
        lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
        lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
        stvx    v8,0,r3
        stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
        stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
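        /*
         * The permute pipeline always loads one quadword beyond the
         * bytes it has consumed, so step r4 back before the final
         * scalar copies.
         */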
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       CFUNC(exit_vmx_ops)             /* tail call optimise */
#endif /* CONFIG_ALTIVEC */