sysdeps/powerpc/powerpc32/power7/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC32/POWER7.
   2    Copyright (C) 2010-2014 Free Software Foundation, Inc.
   3    Contributed by Luis Machado <luisgpm@br.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  23    Returns 'dst'.
        Register use established by the entry code:
          r30  copy of the incoming DST; moved back into r3 before every blr.
          r3   running DST pointer.
          r12  running SRC pointer.
          r31  bytes still to copy (written only on the 32+ byte paths).
          r0   -DST; its low bits are the distance to the next DST
               alignment boundary.
        Dispatch on entry:
          len <= 31                 -> L(copy_LT_32)
          (SRC & 7) != (DST & 7)    -> L(copy_GE_32_unaligned)
          otherwise                 -> the doubleword path that follows.  */
  24
  25         .machine  power7
  26 EALIGN (memcpy, 5, 0)
  27         CALL_MCOUNT
  28
  29         stwu    1,-32(1)      /* Allocate a 32-byte stack frame.  */
  30         cfi_adjust_cfa_offset(32)
  31         stw     30,20(1)      /* Save r30; it is used to keep DST.  */
  32         cfi_offset(30,(20-32))
  33         stw     31,24(1)      /* Save r31; it is used as byte counter.  */
  34         mr      30,3          /* r30 = original DST (the return value).  */
  35         cmplwi  cr1,5,31      /* cr1 = len compared (unsigned) with 31.  */
  36         neg     0,3           /* r0 = -DST.  */
  37         cfi_offset(31,-8)
  38         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  39                                     code.  */
  40
  41         andi.   11,3,7        /* Check alignment of DST (also sets cr0).  */
  42         clrlwi  10,4,29       /* Check alignment of SRC.  */
  43         cmplw   cr6,10,11     /* SRC and DST alignments match?  */
  44         mr      12,4          /* r12 = SRC cursor.  */
  45         mr      31,5          /* r31 = len.  */
  46         bne     cr6,L(copy_GE_32_unaligned)
  47
  48         srwi    9,5,3         /* Number of full doublewords remaining.  */
  49
  50         beq     L(copy_GE_32_aligned_cont)  /* cr0: DST & 7 == 0.  */
  51
  52         clrlwi  0,0,29        /* r0 = (-DST) & 7, bytes to the boundary.  */
  53         mtcrf   0x01,0        /* Low bits of r0 -> CR bits 28..31.  */
  54         subf    31,0,5        /* r31 = len - r0.  */
  55
  56         /* Get DST aligned to 8 bytes by copying 1, 2 and/or 4 bytes as
                selected by the bits of r0.  SRC has the same low three
                address bits on this path, so it becomes aligned as well.  */
  57
  58 1:      bf      31,2f         /* 1's bit clear: no byte to copy.  */
  59         lbz     6,0(12)
  60         addi    12,12,1
  61         stb     6,0(3)
  62         addi    3,3,1
  63 2:      bf      30,4f         /* 2's bit clear: no halfword to copy.  */
  64         lhz     6,0(12)
  65         addi    12,12,2
  66         sth     6,0(3)
  67         addi    3,3,2
  68 4:      bf      29,0f         /* 4's bit clear: no word to copy.  */
  69         lwz     6,0(12)
  70         addi    12,12,4
  71         stw     6,0(3)
  72         addi    3,3,4
  73 0:
  74         clrlwi  10,12,29      /* Check alignment of SRC again.  */
  75         srwi    9,31,3        /* Number of full doublewords remaining.  */
  76
  77 L(copy_GE_32_aligned_cont):
             /* Here r3/r12 are the DST/SRC cursors, r31 is the number of bytes
                left and r9 = r31 / 8.  The code below sets up:
                  cr7  low bits of r9 (bit 30 = two doublewords, bit 31 = one),
                  cr1  r9 compared with 4,
                  cr6  (r31 & 7) compared with 0, tested after the bulk copy,
                  r8   r31 / 32, the number of 32-byte lumps,
                  r11  SRC cursor for the bulk copy (r10 becomes the DST one).  */
  78
  79         clrlwi  11,31,29      /* r11 = r31 & 7, the tail byte count.  */
  80         mtcrf   0x01,9        /* Low bits of r9 -> CR bits 28..31.  */
  81
  82         srwi    8,31,5        /* r8 = number of 32-byte lumps.  */
  83         cmplwi  cr1,9,4       /* cr1: fewer than 4 doublewords in total?  */
  84         cmplwi  cr6,11,0      /* cr6: any tail bytes?  */
  85         mr      11,12         /* r11 = SRC cursor.  */
  86
  87         /* Copy 1~3 doublewords so the main loop starts
  88         at a multiple of 32 bytes.  */
  89
  90         bf      30,1f         /* (r9 & 2) == 0: at most one doubleword.  */
  91         lfd     6,0(12)
  92         lfd     7,8(12)
  93         addi    11,12,16
  94         mtctr   8             /* NOTE(review): CTR is written again at
                                      L(aligned_128setup) before the bdnz.  */
  95         stfd    6,0(3)
  96         stfd    7,8(3)
  97         addi    10,3,16
  98         bf      31,4f         /* (r9 & 1) == 0: two doublewords were enough.  */
  99         lfd     0,16(12)
 100         stfd    0,16(3)
 101         blt     cr1,3f        /* r9 < 4: no 32-byte lump left, go to tail.  */
 102         addi    11,12,24
 103         addi    10,3,24
 104         b       4f
 105
 106         .align  4
 107 1:      /* Copy 1 doubleword and set the counter.  */
 108         mr      10,3          /* r10 = DST cursor.  */
 109         mtctr   8
 110         bf      31,4f         /* (r9 & 1) == 0: nothing to copy here.  */
 111         lfd     6,0(12)
 112         addi    11,12,8
 113         stfd    6,0(3)
 114         addi    10,3,8
 115
 116 L(aligned_copy):
 117         /* Main aligned copy loop. Copies up to 128-bytes at a time. */
             /* On entry: r11/r10 = SRC/DST cursors, r8 = number of 32-byte
                lumps, r31 = byte count that r8 was derived from.  r6, r7 and
                r8 are then loaded with the 16/32/48 byte offsets used by the
                indexed VSX loads and stores.  */
 118         .align  4
 119 4:
 120         /* check for any 32-byte or 64-byte lumps that are outside of a
 121            nice 128-byte range.  R8 contains the number of 32-byte
 122            lumps, so drop this into the CR, and use the SO/EQ bits to help
 123            handle the 32- or 64- byte lumps.  Then handle the rest with an
 124            unrolled 128-bytes-at-a-time copy loop. */
 125         mtocrf  1,8
 126         li      6,16    # 16() index
 127         li      7,32    # 32() index
 128         li      8,48    # 48() index
 129
 130 L(aligned_32byte):
 131         /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
 132         bns     cr7,L(aligned_64byte)
 133         lxvd2x  6,0,11
 134         lxvd2x  7,11,6
 135         addi    11,11,32
 136         stxvd2x 6,0,10
 137         stxvd2x 7,10,6
 138         addi    10,10,32
 139
 140 L(aligned_64byte):
 141         /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
 142         bne     cr7,L(aligned_128setup)
 143         lxvd2x  6,0,11
 144         lxvd2x  7,11,6
 145         lxvd2x  8,11,7
 146         lxvd2x  9,11,8
 147         addi    11,11,64
 148         stxvd2x 6,0,10
 149         stxvd2x 7,10,6
 150         stxvd2x 8,10,7
 151         stxvd2x 9,10,8
 152         addi    10,10,64
 153
 154 L(aligned_128setup):
 155         /* Set up for the 128-byte at a time copy loop.  r8 is reused as
                the iteration count here, so the 48-byte index is reloaded
                into it once CTR has been written.  */
 156         srwi    8,31,7        /* r8 = number of 128-byte chunks.  */
 157         cmpwi   8,0     # Any 4x lumps left?
 158         beq     3f      # if not, move along.
 159         lxvd2x  6,0,11
 160         lxvd2x  7,11,6
 161         mtctr   8       # otherwise, load the ctr and begin.
 162         li      8,48    # 48() index
 163         b       L(aligned_128loop)
 164
 165 L(aligned_128head):
 166         /* for the 2nd + iteration of this loop. */
 167         lxvd2x  6,0,11
 168         lxvd2x  7,11,6
 169 L(aligned_128loop):
             /* One pass moves 128 bytes as two 64-byte halves.  The first two
                loads of the pass were issued just before reaching this label,
                either at L(aligned_128setup) or at L(aligned_128head).  */
 170         lxvd2x  8,11,7
 171         lxvd2x  9,11,8
 172         stxvd2x 6,0,10
 173         addi    11,11,64
 174         stxvd2x 7,10,6
 175         stxvd2x 8,10,7
 176         stxvd2x 9,10,8
 177         lxvd2x  6,0,11
 178         lxvd2x  7,11,6
 179         addi    10,10,64
 180         lxvd2x  8,11,7
 181         lxvd2x  9,11,8
 182         addi    11,11,64
 183         stxvd2x 6,0,10
 184         stxvd2x 7,10,6
 185         stxvd2x 8,10,7
 186         stxvd2x 9,10,8
 187         addi    10,10,64
 188         bdnz    L(aligned_128head)
 189
 190 3:
 191         /* Check for tail bytes.  cr6 was set in L(copy_GE_32_aligned_cont)
                from (r31 & 7) compared with 0.  */
 192         clrrwi  0,31,3        /* r0 = r31 & ~7, bytes moved as doublewords.  */
 193         mtcrf   0x01,31       /* Low bits of r31 -> CR bits 28..31.  */
 194         beq     cr6,0f        /* No tail bytes: just return.  */
 195
 196 .L9:                          /* NOTE(review): nothing in this function
                                      branches to .L9.  */
 197         add     3,3,0         /* Advance DST past the doubleword part.  */
 198         add     12,12,0       /* Advance SRC past the doubleword part.  */
 199
 200         /*  At this point we have a tail of 1-7 bytes and we know that the
 201         destination is doubleword-aligned.  */
 202 4:      /* Copy 4 bytes.  */
 203         bf      29,2f
 204
 205         lwz     6,0(12)
 206         addi    12,12,4
 207         stw     6,0(3)
 208         addi    3,3,4
 209 2:      /* Copy 2 bytes.  */
 210         bf      30,1f
 211
 212         lhz     6,0(12)
 213         addi    12,12,2
 214         sth     6,0(3)
 215         addi    3,3,2
 216 1:      /* Copy 1 byte.  */
 217         bf      31,0f
 218
 219         lbz     6,0(12)
 220         stb     6,0(3)
 221 0:      /* Return original DST pointer.  */
 222         mr      3,30          /* r3 = DST as passed in.  */
 223         lwz     30,20(1)      /* Restore the saved r30.  */
 224         lwz     31,24(1)      /* Restore the saved r31.  */
 225         addi    1,1,32        /* Release the 32-byte stack frame.  */
 226         blr
 227
 228         /* Handle copies of 0~31 bytes.  */
             /* Entered straight from the entry code, so r3 = DST, r4 = SRC,
                r5 = len and r30 = DST.  r31 is not written on this path, which
                is why the returns below reload only r30.  The pieces to copy
                are selected by the low four bits of the length held in cr7:
                bit 28 = 8 bytes, 29 = 4 bytes, 30 = 2 bytes, 31 = 1 byte;
                cr1 tells whether a 16-byte piece is needed.  */
 229         .align  4
 230 L(copy_LT_32):
 231         cmplwi  cr6,5,8       /* cr6 = len compared (unsigned) with 8.  */
 232         mr      12,4          /* r12 = SRC cursor.  */
 233         mtcrf   0x01,5        /* Low bits of len -> CR bits 28..31.  */
 234         ble     cr6,L(copy_LE_8)
 235
 236         /* At least 9 bytes to go.  */
 237         neg     8,4           /* r8 = -SRC.  */
 238         clrrwi  11,4,2        /* r11 = SRC & ~3.  NOTE(review): r11 is not
                                      read again before the returns below.  */
 239         andi.   0,8,3         /* r0 = bytes until SRC is word-aligned.  */
 240         cmplwi  cr1,5,16      /* cr1: len < 16?  */
 241         mr      10,5          /* r10 = len.  */
 242         beq     L(copy_LT_32_aligned)  /* cr0: SRC already word-aligned.  */
 243
 244         /* Force 4-bytes alignment for SRC.  */
 245         mtocrf  0x01,0        /* Low bits of r0 -> CR bits 28..31.  */
 246         subf    10,0,5        /* r10 = len - r0.  */
 247 2:      bf      30,1f
 248
 249         lhz     6,0(12)
 250         addi    12,12,2
 251         sth     6,0(3)
 252         addi    3,3,2
 253 1:      bf      31,L(end_4bytes_alignment)
 254
 255         lbz     6,0(12)
 256         addi    12,12,1
 257         stb     6,0(3)
 258         addi    3,3,1
 259
 260         .align  4
 261 L(end_4bytes_alignment):
             /* cr1 and cr7 described the original length; recompute both for
                the r10 bytes that are left.  */
 262         cmplwi  cr1,10,16
 263         mtcrf   0x01,10
 264
 265 L(copy_LT_32_aligned):
 266         /* At least 6 bytes to go, and SRC is word-aligned.  */
 267         blt     cr1,8f        /* Fewer than 16 bytes left.  */
 268
 269         /* Copy 16 bytes.  */
 270         lwz     6,0(12)
 271         lwz     7,4(12)
 272         stw     6,0(3)
 273         lwz     8,8(12)
 274         stw     7,4(3)
 275         lwz     6,12(12)
 276         addi    12,12,16
 277         stw     8,8(3)
 278         stw     6,12(3)
 279         addi    3,3,16
 280 8:      /* Copy 8 bytes.  */
 281         bf      28,4f
 282
 283         lwz     6,0(12)
 284         lwz     7,4(12)
 285         addi    12,12,8
 286         stw     6,0(3)
 287         stw     7,4(3)
 288         addi    3,3,8
 289 4:      /* Copy 4 bytes.  */
 290         bf      29,2f
 291
 292         lwz     6,0(12)
 293         addi    12,12,4
 294         stw     6,0(3)
 295         addi    3,3,4
 296 2:      /* Copy 2-3 bytes.  Also the target of the "2b" branch taken from
                L(copy_LE_8) for lengths below 4.  */
 297         bf      30,1f
 298
 299         lhz     6,0(12)
 300         sth     6,0(3)
 301         bf      31,0f
 302         lbz     7,2(12)
 303         stb     7,2(3)
 304
 305         /* Return original DST pointer.  */
 306         mr      3,30
 307         lwz     30,20(1)
 308         addi    1,1,32
 309         blr
 310
 311         .align  4
 312 1:      /* Copy 1 byte.  */
 313         bf      31,0f
 314
 315         lbz     6,0(12)
 316         stb     6,0(3)
 317 0:      /* Return original DST pointer.  */
 318         mr      3,30
 319         lwz     30,20(1)
 320         addi    1,1,32
 321         blr
 322
 323         /* Handles copies of 0~8 bytes.  */
             /* Reached from L(copy_LT_32) with cr6 = len compared with 8 and
                cr7 holding the low four bits of len.  SRC is addressed through
                r4 and DST through r3, using fixed offsets.  */
 324         .align  4
 325 L(copy_LE_8):
 326         bne     cr6,4f        /* len != 8: handle 0~7 bytes at 4: below.  */
 327
 328         /* Though we could've used lfd/stfd here, they are still
 329         slow for unaligned cases.  */
 330
 331         lwz     6,0(4)
 332         lwz     7,4(4)
 333         stw     6,0(3)
 334         stw     7,4(3)
 335
 336         /* Return original DST pointer.  */
 337         mr      3,30
 338         lwz     30,20(1)
 339         addi    1,1,32
 340         blr
 341
 342         .align  4
 343 4:      /* Copies 4~7 bytes.  */
 344         bf      29,2b         /* len < 4: reuse the 2-3/1 byte code that
                                      precedes this label (it reads via r12).  */
 345
 346         lwz     6,0(4)
 347         stw     6,0(3)
 348         bf      30,5f         /* No halfword: maybe one more byte at 5:.  */
 349         lhz     7,4(4)
 350         sth     7,4(3)
 351         bf      31,0f
 352         lbz     8,6(4)
 353         stb     8,6(3)
 354
 355         /* Return original DST pointer.  */
 356         mr      3,30
 357         lwz     30,20(1)
 358         addi    1,1,32
 359         blr
 360
 361         .align  4
 362 5:      /* Copy 1 byte.  */
 363         bf      31,0f
 364
 365         lbz     6,4(4)
 366         stb     6,4(3)
 367
 368 0:      /* Return original DST pointer.  */
 369         mr      3,30
 370         lwz     30,20(1)
 371         addi    1,1,32
 372         blr
 373
 374         /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
 375         SRC is not. Use aligned quadword loads from SRC, shifted to realign
 376         the data, allowing for aligned DST stores.  */
             /* Reached from the entry code when (SRC & 7) != (DST & 7), with
                r12 = SRC, r31 = len and r0 = -DST.  This first part copies
                0~15 bytes so that DST becomes quadword aligned.  */
 377         .align  4
 378 L(copy_GE_32_unaligned):
 379         andi.   11,3,15       /* Check alignment of DST (also sets cr0).  */
 380         clrlwi  0,0,28        /* Number of bytes until the 1st
 381                               quadword of DST.  */
 382         srwi    9,5,4         /* Number of full quadwords remaining.  */
 383
 384         beq    L(copy_GE_32_unaligned_cont)  /* cr0: DST & 15 == 0.  */
 385
 386         /* DST is not quadword aligned, get it aligned.  */
 387
 388         mtcrf   0x01,0        /* Low bits of r0 -> CR bits 28..31.  */
 389         subf    31,0,5        /* r31 = len - r0.  */
 390
 391         /* Vector instructions work best when proper alignment (16-bytes)
 392         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 393 1:      /* Copy 1 byte.  */
 394         bf      31,2f
 395
 396         lbz     6,0(12)
 397         addi    12,12,1
 398         stb     6,0(3)
 399         addi    3,3,1
 400 2:      /* Copy 2 bytes.  */
 401         bf          30,4f
 402
 403         lhz     6,0(12)
 404         addi    12,12,2
 405         sth     6,0(3)
 406         addi    3,3,2
 407 4:      /* Copy 4 bytes.  */
 408         bf      29,8f
 409
 410         lwz     6,0(12)
 411         addi    12,12,4
 412         stw     6,0(3)
 413         addi    3,3,4
 414 8:      /* Copy 8 bytes.  */
 415         bf      28,0f
 416
 417         lfd     6,0(12)
 418         addi    12,12,8
 419         stfd    6,0(3)
 420         addi    3,3,8
 421 0:
 422         clrlwi  10,12,28      /* Check alignment of SRC.  */
 423         srwi    9,31,4        /* Number of full quadwords remaining.  */
 424
 425         /* The proper alignment is present, it is OK to copy the bytes now.  */
 426 L(copy_GE_32_unaligned_cont):
             /* Here r3/r12 are the DST/SRC cursors, r31 is the number of bytes
                left and r9 = r31 / 16.  The code below sets up:
                  cr1  (r31 & 15) compared with 0, tested after the loop,
                  cr7  low bits of r9 (bit 31 = odd number of quadwords),
                  cr6  r9 compared with 1,
                  r8   r31 / 32, the loop count,
                  r10/r11  DST/SRC cursors used by the vector code,
                  vr5  permute control derived from the SRC address,
                  vr3  the quadword loaded for the current SRC position.  */
 427
 428         /* Setup two indexes to speed up the indexed vector operations.  */
 429         clrlwi  11,31,28      /* r11 = r31 & 15, the tail byte count.  */
 430         li      6,16          /* Index for 16-bytes offsets.  */
 431         li      7,32          /* Index for 32-bytes offsets.  */
 432         cmplwi  cr1,11,0      /* cr1: any tail bytes?  */
 433         srwi    8,31,5        /* Setup the loop counter.  */
 434         mr      10,3          /* r10 = DST cursor.  */
 435         mr      11,12         /* r11 = SRC cursor.  */
 436         mtcrf   0x01,9        /* Low bits of r9 -> CR bits 28..31.  */
 437         cmplwi  cr6,9,1       /* cr6: at most one quadword?  */
 438 #ifdef __LITTLE_ENDIAN__
 439         lvsr    5,0,12
 440 #else
 441         lvsl    5,0,12
 442 #endif
 443         lvx     3,0,12
 444         bf      31,L(setup_unaligned_loop)  /* Even quadword count.  */
 445
 446         /* Odd number of quadwords: copy one 16-byte quadword here so that
                the loop below only has whole 32-byte blocks left.  */
 447         lvx     4,12,6
 448 #ifdef __LITTLE_ENDIAN__
 449         vperm   6,4,3,5
 450 #else
 451         vperm   6,3,4,5
 452 #endif
 453         addi    11,12,16
 454         addi    10,3,16
 455         stvx    6,0,3
 456         vor     3,4,4         /* vr3 = vr4: keep the last quadword loaded.  */
 457
 458 L(setup_unaligned_loop):
 459         mtctr   8
 460         ble     cr6,L(end_unaligned_loop)  /* r9 <= 1: r8 is 0, skip loop.  */
 461
 462         /* Copy 32 bytes at a time using vector instructions.  */
 463         .align  4
 464 L(unaligned_loop):
 465
 466         /* Note: vr6/vr10 may contain data that was already copied,
 467         but in order to get proper alignment, we may have to copy
 468         some portions again. This is faster than having unaligned
 469         vector instructions though.  */
 470
 471         lvx     4,11,6        /* vr4 = r11+16.  */
 472 #ifdef __LITTLE_ENDIAN__
 473         vperm   6,4,3,5
 474 #else
 475         vperm   6,3,4,5
 476 #endif
 477         lvx     3,11,7        /* vr3 = r11+32.  */
 478 #ifdef __LITTLE_ENDIAN__
 479         vperm   10,3,4,5
 480 #else
 481         vperm   10,4,3,5
 482 #endif
 483         addi    11,11,32
 484         stvx    6,0,10
 485         stvx    10,10,6
 486         addi    10,10,32
 487
 488         bdnz    L(unaligned_loop)
 489
 490         .align  4
 491 L(end_unaligned_loop):
 492
 493         /* Check for tail bytes.  cr1 was set in
                L(copy_GE_32_unaligned_cont) from (r31 & 15) compared with 0.  */
 494         clrrwi  0,31,4        /* r0 = r31 & ~15, bytes moved as quadwords.  */
 495         mtcrf   0x01,31       /* Low bits of r31 -> CR bits 28..31.  */
 496         beq     cr1,0f        /* No tail bytes: just return.  */
 497
 498         add     3,3,0         /* Advance DST past the quadword part.  */
 499         add     12,12,0       /* Advance SRC past the quadword part.  */
 500
 501         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 502 8:      /* Copy 8 bytes.  */
 503         bf      28,4f
 504
 505         lwz     6,0(12)
 506         lwz     7,4(12)
 507         addi    12,12,8
 508         stw     6,0(3)
 509         stw     7,4(3)
 510         addi    3,3,8
 511 4:      /* Copy 4 bytes.  */
 512         bf      29,2f
 513
 514         lwz     6,0(12)
 515         addi    12,12,4
 516         stw     6,0(3)
 517         addi    3,3,4
 518 2:      /* Copy 2 bytes.  */
 519         bf      30,1f
 520
 521         lhz     6,0(12)
 522         addi    12,12,2
 523         sth     6,0(3)
 524         addi    3,3,2
 525 1:      /* Copy 1 byte.  */
 526         bf      31,0f
 527
 528         lbz     6,0(12)
 529         stb     6,0(3)
 530 0:      /* Return original DST pointer.  */
 531         mr      3,30          /* r3 = DST as passed in.  */
 532         lwz     30,20(1)      /* Restore the saved r30.  */
 533         lwz     31,24(1)      /* Restore the saved r31.  */
 534         addi    1,1,32        /* Release the 32-byte stack frame.  */
 535         blr
 536
 537 END (memcpy)
 538 libc_hidden_builtin_def (memcpy)