sysdeps/powerpc/powerpc32/power7/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC32/POWER7.
   2    Copyright (C) 2010-2013 Free Software Foundation, Inc.
   3    Contributed by Luis Machado <luisgpm@br.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21 #include <bp-sym.h>
  22 #include <bp-asm.h>
  23
  24 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
  25    Returns 'dst'.  */
  26 /* r30 preserves DST for the return value; r12 is the running SRC pointer.  */
  27         .machine  power7
  28 EALIGN (BP_SYM (memcpy), 5, 0)
  29         CALL_MCOUNT
  30
  31         stwu    1,-32(1)      /* Allocate a 32-byte stack frame.  */
  32         cfi_adjust_cfa_offset(32)
  33         stw     30,20(1)      /* Save r30 before it is reused below.  */
  34         cfi_offset(30,(20-32))
  35         stw     31,24(1)      /* Save r31 (length counter for >= 32 bytes).  */
  36         mr      30,3          /* r30 = original DST, returned at the end.  */
  37         cmplwi  cr1,5,31      /* cr1 = len compared with 31.  */
  38         neg     0,3           /* r0 = -DST; low bits = bytes to align DST.  */
  39         cfi_offset(31,-8)
  40         ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
  41                                     code.  */
  42
  43         andi.   11,3,7        /* Check alignment of DST.  */
  44         clrlwi  10,4,29       /* Check alignment of SRC.  */
  45         cmplw   cr6,10,11     /* SRC and DST alignments match?  */
  46         mr      12,4          /* r12 = running SRC pointer.  */
  47         mr      31,5          /* r31 = bytes left to copy.  */
  48         bne     cr6,L(copy_GE_32_unaligned)  /* Low 3 bits differ.  */
  49
  50         srwi    9,5,3         /* Number of full doublewords remaining.  */
  51
  52         beq     L(copy_GE_32_aligned_cont)  /* DST & 7 == 0 (cr0).  */
  53
  54         clrlwi  0,0,29        /* r0 = (-DST) & 7: bytes to align DST.  */
  55         mtcrf   0x01,0        /* cr7 = low 4 bits of r0.  */
  56         subf    31,0,5        /* r31 = len minus the alignment bytes.  */
  57
  58         /* Get DST aligned to 8 bytes; SRC has the same low 3 address
  59            bits on this path, so it becomes aligned as well.  */
  60 1:      bf      31,2f         /* Bit value 1 clear: no single byte.  */
  61         lbz     6,0(12)
  62         addi    12,12,1
  63         stb     6,0(3)
  64         addi    3,3,1
  65 2:      bf      30,4f         /* Bit value 2 clear: no halfword.  */
  66         lhz     6,0(12)
  67         addi    12,12,2
  68         sth     6,0(3)
  69         addi    3,3,2
  70 4:      bf      29,0f         /* Bit value 4 clear: no word.  */
  71         lwz     6,0(12)
  72         addi    12,12,4
  73         stw     6,0(3)
  74         addi    3,3,4
  75 0:
  76         clrlwi  10,12,29      /* Check alignment of SRC again.  */
  77         srwi    9,31,3        /* Number of full doublewords remaining.  */
  78
  79 L(copy_GE_32_aligned_cont):
  80
  81         clrlwi  11,31,29      /* r11 = tail bytes (r31 & 7).  */
  82         mtcrf   0x01,9        /* cr7 = low 4 bits of the doubleword count.  */
  83
  84         srwi    8,31,5        /* r8 = number of 32-byte chunks.  */
  85         cmplwi  cr1,9,4       /* cr1: fewer than 4 doublewords?  */
  86         cmplwi  cr6,11,0      /* cr6.eq: no tail bytes.  */
  87         mr      11,12         /* r11 = working SRC pointer.  */
  88
  89         /* Copy 1~3 doublewords so the main loop starts
  90         at a multiple of 32 bytes.  */
  91
  92         bf      30,1f         /* Count bit value 2 clear: at most one.  */
  93         lfd     6,0(12)
  94         lfd     7,8(12)
  95         addi    11,12,16
  96         mtctr   8             /* Not consumed: CTR is reloaded before bdnz.  */
  97         stfd    6,0(3)
  98         stfd    7,8(3)
  99         addi    10,3,16
 100         bf      31,4f         /* Count bit value 1 clear: two were enough.  */
 101         lfd     0,16(12)
 102         stfd    0,16(3)
 103         blt     cr1,3f        /* Only 3 doublewords in total: go to tail.  */
 104         addi    11,12,24
 105         addi    10,3,24
 106         b       4f
 107
 108         .align  4
 109 1:      /* Copy 1 doubleword and set the counter.  */
 110         mr      10,3          /* r10 = working DST pointer.  */
 111         mtctr   8
 112         bf      31,4f         /* Count bit value 1 clear: nothing to copy.  */
 113         lfd     6,0(12)
 114         addi    11,12,8
 115         stfd    6,0(3)
 116         addi    10,3,8
 117
 118 L(aligned_copy):
 119         /* Main aligned copy loop. Copies up to 128-bytes at a time. */
 120         .align  4
 121 4:
 122         /* check for any 32-byte or 64-byte lumps that are outside of a
 123            nice 128-byte range.  R8 contains the number of 32-byte
 124            lumps, so drop this into the CR, and use the SO/EQ bits to help
 125            handle the 32- or 64- byte lumps.  Then handle the rest with an
 126            unrolled 128-bytes-at-a-time copy loop. */
 127         mtocrf  1,8           /* cr7 = low 4 bits of r8.  */
 128         li      6,16    # 16() index
 129         li      7,32    # 32() index
 130         li      8,48    # 48() index
 131
 132 L(aligned_32byte):
 133         /* if the SO bit (indicating a 32-byte lump) is not set, move along. */
 134         bns     cr7,L(aligned_64byte)
 135         lxvd2x  6,0,11        /* Load 16 bytes at r11.  */
 136         lxvd2x  7,11,6        /* Load 16 bytes at r11+16.  */
 137         addi    11,11,32
 138         stxvd2x 6,0,10        /* Store 16 bytes at r10.  */
 139         stxvd2x 7,10,6        /* Store 16 bytes at r10+16.  */
 140         addi    10,10,32
 141
 142 L(aligned_64byte):
 143         /* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
 144         bne     cr7,L(aligned_128setup)
 145         lxvd2x  6,0,11        /* Load 64 bytes starting at r11.  */
 146         lxvd2x  7,11,6
 147         lxvd2x  8,11,7
 148         lxvd2x  9,11,8
 149         addi    11,11,64
 150         stxvd2x 6,0,10        /* Store 64 bytes starting at r10.  */
 151         stxvd2x 7,10,6
 152         stxvd2x 8,10,7
 153         stxvd2x 9,10,8
 154         addi    10,10,64
 155
 156 L(aligned_128setup):
 157         /* Set up for the 128-byte at a time copy loop.  */
 158         srwi    8,31,7        /* r8 = number of 128-byte chunks.  */
 159         cmpwi   8,0     # Any 4x lumps left?
 160         beq     3f      # if not, move along.
 161         lxvd2x  6,0,11        /* Preload the first 32 bytes for the loop.  */
 162         lxvd2x  7,11,6
 163         mtctr   8       # otherwise, load the ctr and begin.
 164         li      8,48    # 48() index
 165         b       L(aligned_128loop)
 166         /* The first pass enters at L(aligned_128loop), loads already done.  */
 167 L(aligned_128head):
 168         /* for the 2nd + iteration of this loop. */
 169         lxvd2x  6,0,11
 170         lxvd2x  7,11,6
 171 L(aligned_128loop):
 172         lxvd2x  8,11,7        /* Finish loading the first 64 bytes.  */
 173         lxvd2x  9,11,8
 174         stxvd2x 6,0,10        /* Store the first 64 bytes.  */
 175         addi    11,11,64
 176         stxvd2x 7,10,6
 177         stxvd2x 8,10,7
 178         stxvd2x 9,10,8
 179         lxvd2x  6,0,11        /* Load the second 64 bytes.  */
 180         lxvd2x  7,11,6
 181         addi    10,10,64
 182         lxvd2x  8,11,7
 183         lxvd2x  9,11,8
 184         addi    11,11,64
 185         stxvd2x 6,0,10        /* Store the second 64 bytes.  */
 186         stxvd2x 7,10,6
 187         stxvd2x 8,10,7
 188         stxvd2x 9,10,8
 189         addi    10,10,64
 190         bdnz    L(aligned_128head)  /* One CTR count per 128 bytes.  */
 191
 192 3:
 193         /* Check for tail bytes.  */
 194         clrrwi  0,31,3        /* r0 = r31 rounded down to a multiple of 8.  */
 195         mtcrf   0x01,31       /* cr7 = low 4 bits of r31.  */
 196         beq     cr6,0f        /* No tail bytes: return.  */
 197
 198 .L9:
 199         add     3,3,0         /* Advance DST past the doublewords.  */
 200         add     12,12,0       /* Advance SRC by the same amount.  */
 201
 202         /*  At this point we have a tail of 1-7 bytes and we know that the
 203         destination is doubleword-aligned.  */
 204 4:      /* Copy 4 bytes.  */
 205         bf      29,2f
 206
 207         lwz     6,0(12)
 208         addi    12,12,4
 209         stw     6,0(3)
 210         addi    3,3,4
 211 2:      /* Copy 2 bytes.  */
 212         bf      30,1f
 213
 214         lhz     6,0(12)
 215         addi    12,12,2
 216         sth     6,0(3)
 217         addi    3,3,2
 218 1:      /* Copy 1 byte.  */
 219         bf      31,0f
 220
 221         lbz     6,0(12)
 222         stb     6,0(3)
 223 0:      /* Return original DST pointer.  */
 224         mr      3,30
 225         lwz     30,20(1)      /* Restore the saved r30.  */
 226         lwz     31,24(1)      /* Restore the saved r31.  */
 227         addi    1,1,32        /* Release the stack frame.  */
 228         blr
 229
 230         /* Handle copies of 0~31 bytes.  */
 231         .align  4
 232 L(copy_LT_32):
 233         cmplwi  cr6,5,8       /* cr6 = len compared with 8.  */
 234         mr      12,4          /* r12 = running SRC pointer.  */
 235         mtcrf   0x01,5        /* cr7 = low 4 bits of len.  */
 236         ble     cr6,L(copy_LE_8)
 237
 238         /* At least 9 bytes to go.  */
 239         neg     8,4           /* r8 = -SRC.  */
 240         clrrwi  11,4,2        /* NOTE(review): r11 looks unused on this path.  */
 241         andi.   0,8,3         /* r0 = (-SRC) & 3: bytes to word-align SRC.  */
 242         cmplwi  cr1,5,16      /* cr1 = len compared with 16.  */
 243         mr      10,5          /* r10 = bytes left to copy.  */
 244         beq     L(copy_LT_32_aligned)  /* r0 == 0: SRC is word-aligned.  */
 245
 246         /* Force 4-bytes alignment for SRC.  */
 247         mtocrf  0x01,0        /* cr7 = low 4 bits of r0.  */
 248         subf    10,0,5        /* r10 = len minus the alignment bytes.  */
 249 2:      bf      30,1f         /* Bit value 2 clear: no halfword.  */
 250
 251         lhz     6,0(12)
 252         addi    12,12,2
 253         sth     6,0(3)
 254         addi    3,3,2
 255 1:      bf      31,L(end_4bytes_alignment)
 256
 257         lbz     6,0(12)
 258         addi    12,12,1
 259         stb     6,0(3)
 260         addi    3,3,1
 261
 262         .align  4
 263 L(end_4bytes_alignment):
 264         cmplwi  cr1,10,16     /* Redo the 16-byte test on what is left.  */
 265         mtcrf   0x01,10       /* cr7 = low 4 bits of the remaining length.  */
 266
 267 L(copy_LT_32_aligned):
 268         /* At least 6 bytes to go, and SRC is word-aligned.  */
 269         blt     cr1,8f        /* Fewer than 16 bytes left.  */
 270
 271         /* Copy 16 bytes.  */
 272         lwz     6,0(12)
 273         lwz     7,4(12)
 274         stw     6,0(3)
 275         lwz     8,8(12)
 276         stw     7,4(3)
 277         lwz     6,12(12)
 278         addi    12,12,16
 279         stw     8,8(3)
 280         stw     6,12(3)
 281         addi    3,3,16
 282 8:      /* Copy 8 bytes.  */
 283         bf      28,4f
 284
 285         lwz     6,0(12)
 286         lwz     7,4(12)
 287         addi    12,12,8
 288         stw     6,0(3)
 289         stw     7,4(3)
 290         addi    3,3,8
 291 4:      /* Copy 4 bytes.  */
 292         bf      29,2f
 293
 294         lwz     6,0(12)
 295         addi    12,12,4
 296         stw     6,0(3)
 297         addi    3,3,4
 298 2:      /* Copy 2-3 bytes.  */
 299         bf      30,1f
 300
 301         lhz     6,0(12)
 302         sth     6,0(3)
 303         bf      31,0f         /* No odd byte after the halfword.  */
 304         lbz     7,2(12)
 305         stb     7,2(3)
 306
 307         /* Return original DST pointer.  */
 308         mr      3,30
 309         lwz     30,20(1)      /* r31 is never modified on this path.  */
 310         addi    1,1,32        /* Release the stack frame.  */
 311         blr
 312
 313         .align  4
 314 1:      /* Copy 1 byte.  */
 315         bf      31,0f
 316
 317         lbz     6,0(12)
 318         stb     6,0(3)
 319 0:      /* Return original DST pointer.  */
 320         mr      3,30
 321         lwz     30,20(1)
 322         addi    1,1,32
 323         blr
 324
 325         /* Handles copies of 0~8 bytes.  */
 326         .align  4
 327 L(copy_LE_8):
 328         bne     cr6,4f        /* len != 8 (cr6 from the compare with 8).  */
 329
 330         /* Though we could've used lfd/stfd here, they are still
 331         slow for unaligned cases.  */
 332
 333         lwz     6,0(4)
 334         lwz     7,4(4)
 335         stw     6,0(3)
 336         stw     7,4(3)
 337
 338         /* Return original DST pointer.  */
 339         mr      3,30
 340         lwz     30,20(1)
 341         addi    1,1,32
 342         blr
 343
 344         .align  4
 345 4:      /* Copies 4~7 bytes.  */
 346         bf      29,2b         /* Under 4 bytes: reuse the earlier 2: code.  */
 347
 348         lwz     6,0(4)
 349         stw     6,0(3)
 350         bf      30,5f         /* No halfword: maybe one byte at offset 4.  */
 351         lhz     7,4(4)
 352         sth     7,4(3)
 353         bf      31,0f         /* No odd byte after the halfword.  */
 354         lbz     8,6(4)
 355         stb     8,6(3)
 356
 357         /* Return original DST pointer.  */
 358         mr      3,30
 359         lwz     30,20(1)
 360         addi    1,1,32
 361         blr
 362
 363         .align  4
 364 5:      /* Copy 1 byte.  */
 365         bf      31,0f
 366
 367         lbz     6,4(4)
 368         stb     6,4(3)
 369
 370 0:      /* Return original DST pointer.  */
 371         mr      3,30
 372         lwz     30,20(1)
 373         addi    1,1,32
 374         blr
 375
 376         /* Handle copies of 32+ bytes where SRC and DST differ in their
 377         low 3 address bits.  DST is first aligned to a quadword; aligned
 378         quadword loads from SRC are then shifted to realign the data.  */
 379         .align  4
 380 L(copy_GE_32_unaligned):
 381         andi.   11,3,15       /* Check alignment of DST.  */
 382         clrlwi  0,0,28        /* Number of bytes until the 1st
 383                               quadword of DST.  */
 384         srwi    9,5,4         /* Number of full quadwords remaining.  */
 385
 386         beq    L(copy_GE_32_unaligned_cont)
 387
 388         /* DST is not quadword aligned, get it aligned.  */
 389
 390         mtcrf   0x01,0        /* cr7 = low 4 bits of r0.  */
 391         subf    31,0,5        /* r31 = len minus the alignment bytes.  */
 392
 393         /* Vector instructions work best when proper alignment (16-bytes)
 394         is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
 395 1:      /* Copy 1 byte.  */
 396         bf      31,2f
 397
 398         lbz     6,0(12)
 399         addi    12,12,1
 400         stb     6,0(3)
 401         addi    3,3,1
 402 2:      /* Copy 2 bytes.  */
 403         bf          30,4f
 404
 405         lhz     6,0(12)
 406         addi    12,12,2
 407         sth     6,0(3)
 408         addi    3,3,2
 409 4:      /* Copy 4 bytes.  */
 410         bf      29,8f
 411
 412         lwz     6,0(12)
 413         addi    12,12,4
 414         stw     6,0(3)
 415         addi    3,3,4
 416 8:      /* Copy 8 bytes.  */
 417         bf      28,0f
 418
 419         lfd     6,0(12)
 420         addi    12,12,8
 421         stfd    6,0(3)
 422         addi    3,3,8
 423 0:
 424         clrlwi  10,12,28      /* Check alignment of SRC.  */
 425         srwi    9,31,4        /* Number of full quadwords remaining.  */
 426
 427         /* The proper alignment is present, it is OK to copy the bytes now.  */
 428 L(copy_GE_32_unaligned_cont):
 429
 430         /* Setup two indexes to speed up the indexed vector operations.  */
 431         clrlwi  11,31,28      /* r11 = tail bytes (r31 & 15).  */
 432         li      6,16          /* Index for 16-bytes offsets.  */
 433         li      7,32          /* Index for 32-bytes offsets.  */
 434         cmplwi  cr1,11,0      /* cr1.eq: no tail bytes.  */
 435         srwi    8,31,5        /* Setup the loop counter.  */
 436         mr      10,3          /* r10 = working DST pointer.  */
 437         mr      11,12         /* r11 = working SRC pointer.  */
 438         mtcrf   0x01,9        /* cr7 = low 4 bits of the quadword count.  */
 439         cmplwi  cr6,9,1       /* cr6: at most one quadword?  */
 440         lvsl    5,0,12        /* vr5 = permute control from SRC's low bits.  */
 441         lvx     3,0,12        /* vr3 = aligned quadword containing SRC.  */
 442         bf      31,L(setup_unaligned_loop)  /* Even quadword count.  */
 443
 444         /* Odd quadword count: copy 16 bytes so the loop can move 32 at a time.  */
 445         lvx     4,12,6
 446         vperm   6,3,4,5
 447         addi    11,12,16
 448         addi    10,3,16
 449         stvx    6,0,3
 450         vor     3,4,4         /* vr3 = vr4, the newest quadword loaded.  */
 451
 452 L(setup_unaligned_loop):
 453         mtctr   8
 454         ble     cr6,L(end_unaligned_loop)  /* No 32-byte chunk to copy.  */
 455
 456         /* Copy 32 bytes at a time using vector instructions.  */
 457         .align  4
 458 L(unaligned_loop):
 459
 460         /* Note: vr6/vr10 may contain data that was already copied,
 461         but in order to get proper alignment, we may have to copy
 462         some portions again. This is faster than having unaligned
 463         vector instructions though.  */
 464
 465         lvx     4,11,6        /* vr4 = r11+16.  */
 466         vperm   6,3,4,5       /* Merge the correctly-aligned portions
 467                               of vr3/vr4 into vr6.  */
 468         lvx     3,11,7        /* vr3 = r11+32.  */
 469         vperm   10,4,3,5      /* Merge the correctly-aligned portions
 470                               of vr3/vr4 into vr10.  */
 471         addi    11,11,32
 472         stvx    6,0,10
 473         stvx    10,10,6
 474         addi    10,10,32
 475
 476         bdnz    L(unaligned_loop)
 477
 478         .align  4
 479 L(end_unaligned_loop):
 480
 481         /* Check for tail bytes.  */
 482         clrrwi  0,31,4        /* r0 = r31 rounded down to a multiple of 16.  */
 483         mtcrf   0x01,31       /* cr7 = low 4 bits of r31.  */
 484         beq     cr1,0f        /* No tail bytes: return.  */
 485
 486         add     3,3,0         /* Advance DST past the quadwords.  */
 487         add     12,12,0       /* Advance SRC by the same amount.  */
 488
 489         /*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
 490 8:      /* Copy 8 bytes.  */
 491         bf      28,4f
 492
 493         lwz     6,0(12)
 494         lwz     7,4(12)
 495         addi    12,12,8
 496         stw     6,0(3)
 497         stw     7,4(3)
 498         addi    3,3,8
 499 4:      /* Copy 4 bytes.  */
 500         bf      29,2f
 501
 502         lwz     6,0(12)
 503         addi    12,12,4
 504         stw     6,0(3)
 505         addi    3,3,4
 506 2:      /* Copy 2 bytes.  */
 507         bf      30,1f
 508
 509         lhz     6,0(12)
 510         addi    12,12,2
 511         sth     6,0(3)
 512         addi    3,3,2
 513 1:      /* Copy 1 byte.  */
 514         bf      31,0f
 515
 516         lbz     6,0(12)
 517         stb     6,0(3)
 518 0:      /* Return original DST pointer.  */
 519         mr      3,30
 520         lwz     30,20(1)      /* Restore the saved r30.  */
 521         lwz     31,24(1)      /* Restore the saved r31.  */
 522         addi    1,1,32        /* Release the stack frame.  */
 523         blr
 524
 525 END (BP_SYM (memcpy))
 526 libc_hidden_builtin_def (memcpy)