sysdeps/powerpc/powerpc32/a2/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC A2.
   2    Copyright (C) 2010-2013 Free Software Foundation, Inc.
   3    Contributed by Michael Brutman <brutman@us.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 #define PREFETCH_AHEAD 4        /* Number of SRC cache lines to prefetch ahead.  */
  25 #define ZERO_AHEAD 2            /* Number of DST cache lines to zero (dcbz) ahead.  */
  26
  27         .machine  a2
  28 EALIGN (BP_SYM (memcpy), 5, 0)
  29         CALL_MCOUNT
  30         /* Entry: r3 = dest, r4 = src, r5 = length.  r3 is returned unchanged.  */
  31         dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
  32         cmplwi  cr1,r5,16       /* is size < 16 ?  */
  33         mr      r6,r3           /* r6 = working dest pointer; r3 is left untouched.  */
  34         blt+    cr1,L(shortcopy) /* Yes: only the 0-15 byte tail code is needed.  */
  35
  36
  37         /* Big copy (16 bytes or more)
  38
  39            Figure out how far to the nearest quadword boundary, or if we are
  40            on one already.
  41
  42            r3 - return value (always)
  43            r4 - current source addr
  44            r5 - copy length
  45            r6 - current dest addr
  46         */
  47
  48         neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
  49         clrlwi  r8,r8,32-4      /* keep only those 4 bits: r8 = 0..15  */
  50         sub     r7,r4,r3        /* r7 = src - dest, so src addr = r7 + dest addr  */
  51         cmplwi  cr0,r8,0        /* Were we aligned on a 16 byte bdy? */
  52         beq+    L(dst_aligned)  /* Yes: skip the alignment steps.  */
  53
  54
  55
  56         /* Destination is not aligned on quadword boundary.  Get us to one.
  57
  58            r3 - return value (always)
  59            r4 - source addr on entry (stale until recomputed at label 8)
  60            r5 - copy length
  61            r6 - current dest addr
  62            r7 - offset to src from dest
  63            r8 - number of bytes to quadword boundary (1-15 here)
  64         */
  65
  66         mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
  67         subf    r5,r8,r5        /* adjust remaining len */
  68         /* cr7 bits 3, 2, 1 and 0 now select the 1, 2, 4 and 8 byte steps.  */
  69         bf      cr7*4+3,1f
  70         lbzx    r0,r7,r6        /* copy 1 byte; src addr = r7 + r6  */
  71         stb     r0,0(r6)
  72         addi    r6,r6,1
  73 1:
  74         bf      cr7*4+2,2f
  75         lhzx    r0,r7,r6        /* copy 2 bytes  */
  76         sth     r0,0(r6)
  77         addi    r6,r6,2
  78 2:
  79         bf      cr7*4+1,4f
  80         lwzx    r0,r7,r6        /* copy 4 bytes  */
  81         stw     r0,0(r6)
  82         addi    r6,r6,4
  83 4:
  84         bf      cr7*4+0,8f
  85         lfdx    fp9,r7,r6       /* copy 8 bytes through an FPR; fp9 is the scratch FPR used by the other 8 byte copies in this routine  */
  86         stfd    fp9,0(r6)
  87         addi    r6,r6,8
  88 8:
  89         add     r4,r7,r6        /* update src addr: r4 = dest addr + (src - dest)  */
  90
  91
  92
  93         /* Dest is quadword aligned now.
  94
  95            Lots of decisions to make.  If we are copying less than a cache
  96            line we won't be here long.  If we are not on a cache line
  97            boundary we need to get there.  And then we need to figure out
  98            how many cache lines ahead to pre-touch.
  99
 100            r3 - return value (always)
 101            r4 - current source addr
 102            r5 - copy length
 103            r6 - current dest addr
 104         */
 105
 106
 107         .align  4
 108 L(dst_aligned):
 109         /* Fetch __cache_line_size into r9; it selects the copy strategy
 110            used below (0 means the value was not set).  */
 111 #ifdef SHARED
 112         mflr    r0              /* Save LR in r0; presumably SETUP_GOT_ACCESS clobbers it.  */
 113 /* Establishes GOT addressability so we can load __cache_line_size
 114    from static. This value was set from the aux vector during startup.  */
 115         SETUP_GOT_ACCESS(r9,got_label)
 116         addis   r9,r9,__cache_line_size-got_label@ha
 117         lwz     r9,__cache_line_size-got_label@l(r9)
 118         mtlr    r0              /* Restore the LR saved above.  */
 119 #else
 120 /* Load __cache_line_size from static. This value was set from the
 121    aux vector during startup.  */
 122         lis     r9,__cache_line_size@ha
 123         lwz     r9,__cache_line_size@l(r9)
 124 #endif
 125
 126         cmplwi  cr5, r9, 0      /* Is the cache line size unknown (0)?  */
 127         bne+    cr5,L(cachelineset) /* Known: use the cache-line-aware code.  */
 128
 129 /* __cache_line_size not set: generic byte copy without much optimization.  Uses r4/r6 as src/dest pointers and r5 as the remaining length, and returns directly to the caller.  */
 130         andi.   r0,r5,1         /* If length is odd copy one byte.  */
 131         beq     L(cachelinenotset_align)
 132         lbz     r7,0(r4)        /* Read one byte from source.  */
 133         addi    r5,r5,-1        /* Update length.  */
 134         addi    r4,r4,1         /* Update source pointer address.  */
 135         stb     r7,0(r6)        /* Store one byte on dest.  */
 136         addi    r6,r6,1         /* Update dest pointer address.  */
 137 L(cachelinenotset_align):       /* Length is even from here on.  */
 138         cmpwi   cr7,r5,0        /* If length is 0 return.  */
 139         beqlr   cr7
 140         ori     r2,r2,0         /* Force a new dispatch group.  */
 141 L(cachelinenotset_loop):        /* Copy two bytes per iteration.  */
 142         addic.  r5,r5,-2        /* Update length; sets cr0 for the bne below.  */
 143         lbz     r7,0(r4)        /* Load 2 bytes from source.  */
 144         lbz     r8,1(r4)
 145         addi    r4,r4,2         /* Update source pointer address.  */
 146         stb     r7,0(r6)        /* Store 2 bytes on dest.  */
 147         stb     r8,1(r6)
 148         addi    r6,r6,2         /* Update dest pointer address.  */
 149         bne     L(cachelinenotset_loop) /* Loop until the length reaches 0.  */
 150         blr
 151
 152
 153 L(cachelineset):
 154         /* r9 = cache line size (non-zero), r5 = remaining length.  */
 155         addi   r10,r9,-1
 156
 157         cmpw   cr5,r5,r10       /* Less than a cacheline to go? */
 158
 159         neg     r7,r6           /* r7 = -dest; masked later to give bytes to the next cacheline bdy  */
 160
 161         addi    r6,r6,-8        /* bias dest by -8 for the stfd/stfdu 0x08.. offsets  */
 162         cmpwi   cr0,r9,128      /* 128 byte cache lines?  */
 163         addi    r4,r4,-8        /* bias src by -8 for the lfd/lfdu 0x08.. offsets  */
 164
 165
 166         ble+    cr5,L(lessthancacheline) /* length <= line size - 1  */
 167
 168         beq-    cr0,L(big_lines) /* 128 byte line code */
 169
 170
 171
 172
 173         /* More than a cacheline left to go, and using 64 byte cachelines */
 174         /* NOTE(review): reached for every non-zero line size other than 128; the 64s below presumably assume a 64 byte line -- confirm.  */
 175         clrlwi  r7,r7,32-6      /* How far to next cacheline bdy? */
 176
 177         cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already? */
 178
 179         /* Reduce total len by what it takes to get to the next cache line */
 180         subf    r5,r7,r5
 181         srwi    r7,r7,4         /* How many qws to get to the line bdy? */
 182
 183         /* How many full cache lines to copy after getting to a line bdy? */
 184         srwi    r10,r5,6
 185
 186         cmplwi  r10,0           /* If no full cache lines to copy ... */
 187         li      r11,0           /* number cachelines to copy with prefetch  */
 188         beq     L(nocacheprefetch) /* ... skip the prefetch setup.  */
 189
 190
 191         /* We are here because we have at least one full cache line to copy,
 192            and therefore some pre-touching to do. */
 193
 194         cmplwi  r10,PREFETCH_AHEAD
 195         li      r12,64+8        /* prefetch distance; +8 undoes the -8 bias on r4  */
 196         ble     L(lessthanmaxprefetch)
 197
 198         /* We can only do so much pre-fetching.  R11 will have the count of
 199            lines left to prefetch after the initial batch of prefetches
 200            are executed. */
 201
 202         subi    r11,r10,PREFETCH_AHEAD
 203         li      r10,PREFETCH_AHEAD
 204
 205 L(lessthanmaxprefetch):
 206         mtctr   r10
 207
 208         /* At this point r10/ctr hold the number of lines to prefetch in this
 209            initial batch, and r11 holds any remainder. */
 210
 211 L(prefetchSRC):
 212         dcbt    r12,r4          /* touch the SRC line at r4 + r12  */
 213         addi    r12,r12,64      /* advance the distance by one line  */
 214         bdnz    L(prefetchSRC)
 215
 216
 217         /* Prefetching is done, or was not needed.
 218
 219            cr6 - are we on a cacheline boundary already?
 220            r7  - number of quadwords to the next cacheline boundary
 221         */
 222
 223 L(nocacheprefetch):
 224         mtctr   r7              /* ctr = quadwords to copy up to the line bdy  */
 225
 226         cmplwi  cr1,r5,64   /* Less than a cache line to copy? */
 227
 228         /* How many bytes are left after we copy whatever full
 229            cache lines we can get? */
 230         clrlwi  r5,r5,32-6
 231
 232         beq     cr6,L(cachelinealigned) /* Already on a line bdy: skip the qw loop.  */
 233
 234
 235         /* Copy quadwords up to the next cacheline boundary */
 236
 237 L(aligntocacheline):
 238         lfd     fp9,0x08(r4)
 239         lfdu    fp10,0x10(r4)   /* update form: r4 += 16  */
 240         stfd    fp9,0x08(r6)
 241         stfdu   fp10,0x10(r6)   /* update form: r6 += 16  */
 242         bdnz    L(aligntocacheline)
 243
 244
 245         .align 4
 246 L(cachelinealigned):            /* copy while cache lines  */
 247         /* cr1 holds the earlier "length < 64" compare.  */
 248         blt-    cr1,L(lessthancacheline) /* size <64  */
 249
 250 L(outerloop):
 251         cmpwi   r11,0           /* Any lines to copy with prefetch?  */
 252         mtctr   r11
 253         beq-    L(endloop)
 254         /* r11 was the loop count (now in ctr); reuse it as the dcbz offset.  */
 255         li      r11,64*ZERO_AHEAD +8    /* DCBZ dist  */
 256
 257         .align  4
 258         /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
 259 L(loop):                        /* Copy aligned body  */
 260         dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
 261         lfd     fp9,  0x08(r4)
 262         dcbz    r11,r6          /* zero the DST line at r6 + r11  */
 263         lfd     fp10, 0x10(r4)
 264         lfd     fp11, 0x18(r4)
 265         lfd     fp12, 0x20(r4)
 266         stfd    fp9,  0x08(r6)
 267         stfd    fp10, 0x10(r6)
 268         stfd    fp11, 0x18(r6)
 269         stfd    fp12, 0x20(r6)
 270         lfd     fp9,  0x28(r4)
 271         lfd     fp10, 0x30(r4)
 272         lfd     fp11, 0x38(r4)
 273         lfdu    fp12, 0x40(r4)  /* update form: r4 += 64  */
 274         stfd    fp9,  0x28(r6)
 275         stfd    fp10, 0x30(r6)
 276         stfd    fp11, 0x38(r6)
 277         stfdu   fp12, 0x40(r6)  /* update form: r6 += 64  */
 278
 279         bdnz    L(loop)
 280
 281
 282 L(endloop):
 283         cmpwi   r10,0           /* Any lines left to copy without prefetch/dcbz?  */
 284         beq-    L(endloop2)
 285         mtctr   r10
 286
 287 L(loop2):                       /* Copy aligned body  */
 288         lfd     fp9,  0x08(r4)
 289         lfd     fp10, 0x10(r4)
 290         lfd     fp11, 0x18(r4)
 291         lfd     fp12, 0x20(r4)
 292         stfd    fp9,  0x08(r6)
 293         stfd    fp10, 0x10(r6)
 294         stfd    fp11, 0x18(r6)
 295         stfd    fp12, 0x20(r6)
 296         lfd     fp9,  0x28(r4)
 297         lfd     fp10, 0x30(r4)
 298         lfd     fp11, 0x38(r4)
 299         lfdu    fp12, 0x40(r4)  /* update form: r4 += 64  */
 300         stfd    fp9,  0x28(r6)
 301         stfd    fp10, 0x30(r6)
 302         stfd    fp11, 0x38(r6)
 303         stfdu   fp12, 0x40(r6)  /* update form: r6 += 64  */
 304
 305         bdnz    L(loop2)
 306 L(endloop2):                    /* Falls through to L(lessthancacheline).  */
 307
 308
 309         .align  4
 310 L(lessthancacheline):           /* Was there less than cache to do ?  */
 311         cmplwi  cr0,r5,16       /* r5 = bytes left; r4/r6 still carry the -8 bias  */
 312         srwi    r7,r5,4         /* divide size by 16  */
 313         blt-    L(do_lt16)
 314         mtctr   r7              /* ctr = number of whole quadwords  */
 315
 316 L(copy_remaining):
 317         lfd     fp9,  0x08(r4)
 318         lfdu    fp10, 0x10(r4)  /* update form: r4 += 16  */
 319         stfd    fp9,  0x08(r6)
 320         stfdu   fp10, 0x10(r6)  /* update form: r6 += 16  */
 321         bdnz    L(copy_remaining)
 322
 323 L(do_lt16):                     /* less than 16 ?  */
 324         cmplwi  cr0,r5,0        /* copy remaining bytes (0-15)  */
 325         beqlr+                  /* no rest to copy  */
 326         addi    r4,r4,8         /* remove the -8 bias before the tail copy  */
 327         addi    r6,r6,8
 328
 329 L(shortcopy):                   /* SIMPLE COPY to handle size =< 15 bytes  */
 330         mtcrf   0x01,r5         /* low 4 bits of the length into cr7  */
 331         sub     r7,r4,r6        /* r7 = src - dest, so src addr = r7 + r6  */
 332         bf-     cr7*4+0,8f
 333         lfdx    fp9,r7,r6       /* copy 8 byte  */
 334         stfd    fp9,0(r6)
 335         addi    r6,r6,8
 336 8:
 337         bf      cr7*4+1,4f
 338         lwzx    r0,r7,r6        /* copy 4 byte  */
 339         stw     r0,0(r6)
 340         addi    r6,r6,4
 341 4:
 342         bf      cr7*4+2,2f
 343         lhzx    r0,r7,r6        /* copy 2 byte  */
 344         sth     r0,0(r6)
 345         addi    r6,r6,2
 346 2:
 347         bf      cr7*4+3,1f
 348         lbzx    r0,r7,r6        /* copy 1 byte  */
 349         stb     r0,0(r6)
 350 1:
 351         blr                     /* r3 still holds the original dest  */
 352
 353
 354
 355
 356
 357         /* Similar to above, but for use with 128 byte lines. */
 358
 359
 360 L(big_lines):
 361
 362         clrlwi  r7,r7,32-7      /* How far to next cacheline bdy? */
 363
 364         cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already? */
 365
 366         /* Reduce total len by what it takes to get to the next cache line */
 367         subf    r5,r7,r5
 368         srwi    r7,r7,4         /* How many qw to get to the line bdy? */
 369
 370         /* How many full cache lines to copy after getting to a line bdy? */
 371         srwi    r10,r5,7
 372
 373         cmplwi  r10,0           /* If no full cache lines to copy ... */
 374         li      r11,0           /* number cachelines to copy with prefetch  */
 375         beq     L(nocacheprefetch_128) /* ... skip the prefetch setup.  */
 376
 377
 378         /* We are here because we have at least one full cache line to copy,
 379            and therefore some pre-touching to do. */
 380
 381         cmplwi  r10,PREFETCH_AHEAD
 382         li      r12,128+8       /* prefetch distance; +8 undoes the -8 bias on r4  */
 383         ble     L(lessthanmaxprefetch_128)
 384
 385         /* We can only do so much pre-fetching.  R11 will have the count of
 386            lines left to prefetch after the initial batch of prefetches
 387            are executed. */
 388
 389         subi    r11,r10,PREFETCH_AHEAD
 390         li      r10,PREFETCH_AHEAD
 391
 392 L(lessthanmaxprefetch_128):
 393         mtctr   r10
 394
 395         /* At this point r10/ctr hold the number of lines to prefetch in this
 396            initial batch, and r11 holds any remainder. */
 397
 398 L(prefetchSRC_128):
 399         dcbt    r12,r4          /* touch the SRC line at r4 + r12  */
 400         addi    r12,r12,128     /* advance the distance by one line  */
 401         bdnz    L(prefetchSRC_128)
 402
 403
 404         /* Prefetching is done, or was not needed.
 405
 406            cr6 - are we on a cacheline boundary already?
 407            r7  - number of quadwords to the next cacheline boundary
 408         */
 409
 410 L(nocacheprefetch_128):
 411         mtctr   r7              /* ctr = quadwords to copy up to the line bdy  */
 412
 413         cmplwi  cr1,r5,128  /* Less than a cache line to copy? */
 414
 415         /* How many bytes are left after we copy whatever full
 416            cache lines we can get? */
 417         clrlwi  r5,r5,32-7
 418
 419         beq     cr6,L(cachelinealigned_128) /* Already on a line bdy: skip the qw loop.  */
 420
 421
 422         /* Copy quadwords up to the next cacheline boundary */
 423
 424 L(aligntocacheline_128):
 425         lfd     fp9,0x08(r4)
 426         lfdu    fp10,0x10(r4)   /* update form: r4 += 16  */
 427         stfd    fp9,0x08(r6)
 428         stfdu   fp10,0x10(r6)   /* update form: r6 += 16  */
 429         bdnz    L(aligntocacheline_128)
 430
 431
 432 L(cachelinealigned_128):        /* copy while cache lines  */
 433         /* cr1 holds the earlier "length < 128" compare.  */
 434         blt-    cr1,L(lessthancacheline) /* size <128  */
 435
 436 L(outerloop_128):
 437         cmpwi   r11,0           /* Any lines to copy with prefetch?  */
 438         mtctr   r11
 439         beq-    L(endloop_128)
 440         /* r11 was the loop count (now in ctr); reuse it as the dcbz offset.  */
 441         li      r11,128*ZERO_AHEAD +8    /* DCBZ dist  */
 442
 443         .align  4
 444         /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
 445 L(loop_128):                    /* Copy aligned body  */
 446         dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
 447         lfd     fp9,  0x08(r4)
 448         dcbz    r11,r6          /* zero the DST line at r6 + r11  */
 449         lfd     fp10, 0x10(r4)
 450         lfd     fp11, 0x18(r4)
 451         lfd     fp12, 0x20(r4)
 452         stfd    fp9,  0x08(r6)
 453         stfd    fp10, 0x10(r6)
 454         stfd    fp11, 0x18(r6)
 455         stfd    fp12, 0x20(r6)
 456         lfd     fp9,  0x28(r4)
 457         lfd     fp10, 0x30(r4)
 458         lfd     fp11, 0x38(r4)
 459         lfd     fp12, 0x40(r4)
 460         stfd    fp9,  0x28(r6)
 461         stfd    fp10, 0x30(r6)
 462         stfd    fp11, 0x38(r6)
 463         stfd    fp12, 0x40(r6)
 464         lfd     fp9,  0x48(r4)
 465         lfd     fp10, 0x50(r4)
 466         lfd     fp11, 0x58(r4)
 467         lfd     fp12, 0x60(r4)
 468         stfd    fp9,  0x48(r6)
 469         stfd    fp10, 0x50(r6)
 470         stfd    fp11, 0x58(r6)
 471         stfd    fp12, 0x60(r6)
 472         lfd     fp9,  0x68(r4)
 473         lfd     fp10, 0x70(r4)
 474         lfd     fp11, 0x78(r4)
 475         lfdu    fp12, 0x80(r4)  /* update form: r4 += 128  */
 476         stfd    fp9,  0x68(r6)
 477         stfd    fp10, 0x70(r6)
 478         stfd    fp11, 0x78(r6)
 479         stfdu   fp12, 0x80(r6)  /* update form: r6 += 128  */
 480
 481         bdnz    L(loop_128)
 482
 483
 484 L(endloop_128):
 485         cmpwi   r10,0           /* Any lines left to copy without prefetch/dcbz?  */
 486         beq-    L(endloop2_128)
 487         mtctr   r10
 488
 489 L(loop2_128):                   /* Copy aligned body  */
 490         lfd     fp9,  0x08(r4)
 491         lfd     fp10, 0x10(r4)
 492         lfd     fp11, 0x18(r4)
 493         lfd     fp12, 0x20(r4)
 494         stfd    fp9,  0x08(r6)
 495         stfd    fp10, 0x10(r6)
 496         stfd    fp11, 0x18(r6)
 497         stfd    fp12, 0x20(r6)
 498         lfd     fp9,  0x28(r4)
 499         lfd     fp10, 0x30(r4)
 500         lfd     fp11, 0x38(r4)
 501         lfd     fp12, 0x40(r4)
 502         stfd    fp9,  0x28(r6)
 503         stfd    fp10, 0x30(r6)
 504         stfd    fp11, 0x38(r6)
 505         stfd    fp12, 0x40(r6)
 506         lfd     fp9,  0x48(r4)
 507         lfd     fp10, 0x50(r4)
 508         lfd     fp11, 0x58(r4)
 509         lfd     fp12, 0x60(r4)
 510         stfd    fp9,  0x48(r6)
 511         stfd    fp10, 0x50(r6)
 512         stfd    fp11, 0x58(r6)
 513         stfd    fp12, 0x60(r6)
 514         lfd     fp9,  0x68(r4)
 515         lfd     fp10, 0x70(r4)
 516         lfd     fp11, 0x78(r4)
 517         lfdu    fp12, 0x80(r4)  /* update form: r4 += 128  */
 518         stfd    fp9,  0x68(r6)
 519         stfd    fp10, 0x70(r6)
 520         stfd    fp11, 0x78(r6)
 521         stfdu   fp12, 0x80(r6)  /* update form: r6 += 128  */
 522         bdnz    L(loop2_128)
 523 L(endloop2_128):
 524         /* Finish the remaining (< 128) bytes with the shared tail code.  */
 525         b       L(lessthancacheline)
 526
 527
 528 END (BP_SYM (memcpy))
 529 libc_hidden_builtin_def (memcpy)