arch/powerpc/lib/memcmp_64.S

   1 /*
   2  * Author: Anton Blanchard <anton@au.ibm.com>
   3  * Copyright 2015 IBM Corporation.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version
   8  * 2 of the License, or (at your option) any later version.
   9  */
  10 #include <asm/ppc_asm.h>
  11 #include <asm/export.h>
  12 #include <asm/ppc-opcode.h>
  13
  14 #define off8    r6      /* index reg; holds 8 only after "li off8,8" (r6 is also used as plain scratch) */
  15 #define off16   r7      /* index reg; code loads 16 into it before indexed/vector loads */
  16 #define off24   r8      /* index reg; loaded with 24 for the 32-byte scalar loop */
  17
  18 #define rA      r9      /* rA..rC (r9-r11): data scratch, used without being saved */
  19 #define rB      r10
  20 #define rC      r11
  21 #define rD      r27     /* rD..rH (r27-r31): .Llong_novmx_cmp stores them below r1 before */
  22 #define rE      r28     /* using them; .Ltail and .Lout reload them before returning */
  23 #define rF      r29
  24 #define rG      r30
  25 #define rH      r31
  26
  27 #ifdef __LITTLE_ENDIAN__
     /*
      * Little endian: use the byte-reversed indexed loads so that the byte at
      * the lowest address ends up most significant in the register, exactly as
      * a plain load does on big endian. The unsigned cmpld comparisons in
      * memcmp rely on that to order data by memory byte order.
      * LH and LW are not referenced in the code visible in this file.
      */
  28 #define LH      lhbrx
  29 #define LW      lwbrx
  30 #define LD      ldbrx
  31 #define LVS     lvsr
     /* vperm with the two source vectors swapped relative to the big-endian
      * variant (presumably so LD_VSR_CROSS16B yields the same byte sequence
      * on both endiannesses -- confirm against the vperm/lvsr definitions).
      */
  32 #define VPERM(_VRT,_VRA,_VRB,_VRC) \
  33         vperm _VRT,_VRB,_VRA,_VRC
  34 #else
     /* Big endian: plain indexed loads, lvsl mask, vperm operands in order. */
  35 #define LH      lhzx
  36 #define LW      lwzx
  37 #define LD      ldx
  38 #define LVS     lvsl
  39 #define VPERM(_VRT,_VRA,_VRB,_VRC) \
  40         vperm _VRT,_VRA,_VRB,_VRC
  41 #endif
  42
  43 #define VMX_THRESH 4096 /* remaining byte count at or above which memcmp tries the VMX path */
     /*
      * ENTER_VMX_OPS: call enter_vmx_ops() while preserving r3, r4, r5 and LR.
      *
      * - LR is copied to r0 and stored at 16(r1).
      * - r3/r4/r5 are stored at the addresses that become the STK_REG(R31),
      *   STK_REG(R30) and STK_REG(R29) slots once stdu has pushed a frame of
      *   STACKFRAMESIZE bytes.
      * - After the call, cr1 holds the comparison of the return value (r3)
      *   with 0; the users in this file take the non-VMX path on cr1 EQ.
      * - The saved values are reloaded, the frame is popped and LR restored.
      *
      * r0 is clobbered. STACKFRAMESIZE and STK_REG are not defined in this
      * file (presumably they come from <asm/ppc_asm.h>). Other registers are
      * not preserved by this macro; whatever enter_vmx_ops clobbers is lost.
      */
  44 #define ENTER_VMX_OPS   \
  45         mflr    r0;     \
  46         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
  47         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
  48         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
  49         std     r0,16(r1); \
  50         stdu    r1,-STACKFRAMESIZE(r1); \
  51         bl      enter_vmx_ops; \
  52         cmpwi   cr1,r3,0; \
  53         ld      r0,STACKFRAMESIZE+16(r1); \
  54         ld      r3,STK_REG(R31)(r1); \
  55         ld      r4,STK_REG(R30)(r1); \
  56         ld      r5,STK_REG(R29)(r1); \
  57         addi    r1,r1,STACKFRAMESIZE; \
  58         mtlr    r0
  59
     /*
      * EXIT_VMX_OPS: call exit_vmx_ops() while preserving r3, r4, r5 and LR,
      * using the same save/push/call/pop sequence as ENTER_VMX_OPS. The return
      * value of exit_vmx_ops is discarded (r3 is reloaded from the frame) and
      * no CR field is set by the macro itself. r0 is clobbered; condition
      * register contents that must survive the call have to be saved by the
      * user of the macro (see the mfocrf/mtocrf pair in memcmp).
      */
  60 #define EXIT_VMX_OPS \
  61         mflr    r0; \
  62         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
  63         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
  64         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
  65         std     r0,16(r1); \
  66         stdu    r1,-STACKFRAMESIZE(r1); \
  67         bl      exit_vmx_ops; \
  68         ld      r0,STACKFRAMESIZE+16(r1); \
  69         ld      r3,STK_REG(R31)(r1); \
  70         ld      r4,STK_REG(R30)(r1); \
  71         ld      r5,STK_REG(R29)(r1); \
  72         addi    r1,r1,STACKFRAMESIZE; \
  73         mtlr    r0
  74
  75 /*
  76  * LD_VSR_CROSS16B loads the 2nd aligned 16 bytes for _vaddr, which is not
  77  * aligned to a 16-byte boundary, and permutes them with the 1st 16 bytes.
  78  *
  79  *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
  80  *    ^                                  ^                                 ^
  81  * 0xbbbb10                          0xbbbb20                          0xbbbb30
  82  *                                 ^
  83  *                                _vaddr
  84  *
  85  *
  86  * _vmask is the mask generated by LVS for _vaddr.
  87  * _v1st_qw is the 1st aligned QW of the current address; it must already
  88  *   be loaded. For example: 0xyyyyyyyyyyyyy012 for big endian.
  89  * _v2nd_qw is the 2nd aligned QW of the current _vaddr; it is loaded here.
  90  *   For example: 0x3456789abcdefzzz for big endian.
  91  * The permute result is saved in _v_res.
  92  *   For example: 0x0123456789abcdef for big endian.
      *
      * The lvx uses off16 as its index register, so off16 must hold 16 before
      * the macro is used. _v2nd_qw and _v_res are overwritten.
  93  */
  94 #define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
  95         lvx     _v2nd_qw,_vaddr,off16; \
  96         VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
  97
  98 /*
  99  * There are 2 categories for memcmp:
 100  * 1) src/dst have the same offset to the 8-byte boundary. The handlers
 101  * are named like .Lsameoffset_xxxx
 102  * 2) src/dst have different offsets to the 8-byte boundary. The handlers
 103  * are named like .Ldiffoffset_xxxx
      *
      * Registers as used by the code below: r3 = s1, r4 = s2, r5 = number of
      * bytes to compare. The result is returned in r3:
      *   - 0 when all compared bytes are equal (.Lzero);
      *   - on the byte loop, the difference "s1 byte - s2 byte" of the first
      *     mismatching pair (.Lnon_zero);
      *   - on the doubleword and vector paths, 1 or -1 (.LcmpAB and friends).
 104  */
 105 _GLOBAL_TOC(memcmp)
 106         cmpdi   cr1,r5,0                /* cr1: is the length zero? */
 107
 108         /* cr0 records whether s1 and s2 have different offsets within an
 109          * 8-byte doubleword (NE = different). It is consumed at .Lno_short
 110          * to choose between the sameoffset and diffoffset handlers.
 111          */
 111         xor     r6,r3,r4
 112         andi.   r6,r6,7
 113
 114         /* Use the short (byte) loop when at most 7 bytes are to be
 115          * compared, whatever the alignment.
 116          */
 117         cmpdi   cr6,r5,7
 118
 119         beq     cr1,.Lzero              /* nothing to compare: return 0 */
 120         bgt     cr6,.Lno_short          /* more than 7 bytes: doubleword paths */
 121
 122 .Lshort:
             /* Byte-at-a-time compare of r5 bytes. r5 is non-zero here: the entry
              * code, .Ltail and the callers of .Lcmp_rest_lt8bytes all branch to
              * .Lzero first when nothing is left. CTR counts the remaining bytes
              * and the loop body is unrolled four times.
              */
 123         mtctr   r5
 124 1:      lbz     rA,0(r3)
 125         lbz     rB,0(r4)
 126         subf.   rC,rB,rA                /* rC = s1 byte - s2 byte; cr0 set from it */
 127         bne     .Lnon_zero              /* mismatch: return rC */
 128         bdz     .Lzero                  /* count exhausted: equal */
 129
 130         lbz     rA,1(r3)
 131         lbz     rB,1(r4)
 132         subf.   rC,rB,rA
 133         bne     .Lnon_zero
 134         bdz     .Lzero
 135
 136         lbz     rA,2(r3)
 137         lbz     rB,2(r4)
 138         subf.   rC,rB,rA
 139         bne     .Lnon_zero
 140         bdz     .Lzero
 141
 142         lbz     rA,3(r3)
 143         lbz     rB,3(r4)
 144         subf.   rC,rB,rA
 145         bne     .Lnon_zero
 146
 147         addi    r3,r3,4                 /* advance both pointers past the 4 bytes */
 148         addi    r4,r4,4
 149
 150         bdnz    1b                      /* falls into .Lzero when the count hits 0 */
 151
 152 .Lzero:
 153         li      r3,0                    /* buffers compared equal */
 154         blr
 155
 156 .Lno_short:
             /* Reached with more than 7 bytes to compare. cr0 still holds the
              * (s1 ^ s2) & 7 test from the entry code; nothing in between has
              * written cr0.
              */
 157         dcbt    0,r3                    /* cache-touch hints for both buffers */
 158         dcbt    0,r4
 159         bne     .Ldiffoffset_8bytes_make_align_start
 160
 161
 162 .Lsameoffset_8bytes_make_align_start:
 163         /* attempt to compare bytes not aligned with 8 bytes so that
 164          * rest comparison can run based on 8 bytes alignment.
 165          */
 166         andi.   r6,r3,7                 /* cr0 EQ when s1 (and so s2) is already aligned */
 167
 168         /* Try to compare the first double word which is not 8 bytes aligned:
 169          * load the first double word at (src & ~7UL) and shift left appropriate
 170          * bits before comparison.
 171          */
 172         rlwinm  r6,r3,3,26,28           /* r6 = (s1 & 7) * 8: bits preceding s1 */
 173         beq     .Lsameoffset_8bytes_aligned
 174         clrrdi  r3,r3,3                 /* round both pointers down to 8 bytes */
 175         clrrdi  r4,r4,3
 176         LD      rA,0,r3
 177         LD      rB,0,r4
 178         sld     rA,rA,r6                /* shift out the bytes in front of the buffers */
 179         sld     rB,rB,r6
 180         cmpld   cr0,rA,rB
 181         srwi    r6,r6,3                 /* back to a byte count: s1 & 7 */
 182         bne     cr0,.LcmpAB_lightweight
 183         subfic  r6,r6,8                 /* r6 = bytes just compared = 8 - (s1 & 7) */
 184         subf.   r5,r6,r5                /* remaining length; cr0 EQ when none is left */
 185         addi    r3,r3,8
 186         addi    r4,r4,8
 187         beq     .Lzero
 188
 189 .Lsameoffset_8bytes_aligned:
 190         /* now we are aligned with 8 bytes.
 191          * Use .Llong loop if left cmp bytes are equal or greater than 32B.
 192          */
 193         cmpdi   cr6,r5,31
 194         bgt     cr6,.Llong
 195
 196 .Lcmp_lt32bytes:
 197         /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
 198         cmpdi   cr5,r5,7
 199         srdi    r0,r5,3                 /* r0 = number of whole doublewords */
 200         ble     cr5,.Lcmp_rest_lt8bytes
 201
 202         /* handle 8 ~ 31 bytes */
 203         clrldi  r5,r5,61                /* r5 = leftover bytes (length & 7) */
 204         mtctr   r0
 205 2:
 206         LD      rA,0,r3
 207         LD      rB,0,r4
 208         cmpld   cr0,rA,rB
 209         addi    r3,r3,8
 210         addi    r4,r4,8
 211         bne     cr0,.LcmpAB_lightweight
 212         bdnz    2b
 213
 214         cmpwi   r5,0                    /* no leftover: equal; else fall through */
 215         beq     .Lzero
 216
 217 .Lcmp_rest_lt8bytes:
 218         /*
 219          * Here we have less than 8 bytes to compare. At least s1 is aligned to
 220          * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
 221          * page boundary, otherwise we might read past the end of the buffer and
 222          * trigger a page fault. We use 4K as the conservative minimum page
 223          * size. If we detect that case we go to the byte-by-byte loop.
 224          *
 225          * Otherwise the next double word is loaded from s1 and s2, and shifted
 226          * right to compare the appropriate bits.
 227          */
 228         clrldi  r6,r4,(64-12)   // r6 = r4 & 0xfff
 229         cmpdi   r6,0xff8                /* offset > 0xff8: 8-byte load would cross 4K */
 230         bgt     .Lshort
 231
 232         subfic  r6,r5,8                 /* r6 = 8 - remaining bytes */
 233         slwi    r6,r6,3                 /* ... as a bit count */
 234         LD      rA,0,r3
 235         LD      rB,0,r4
 236         srd     rA,rA,r6                /* drop the bytes beyond the requested length */
 237         srd     rB,rB,r6
 238         cmpld   cr0,rA,rB
 239         bne     cr0,.LcmpAB_lightweight
 240         b       .Lzero
 241
 242 .Lnon_zero:
 243         mr      r3,rC                   /* byte-loop result: s1 byte - s2 byte */
 244         blr
 245
 246 .Llong:
             /* Same-offset path with more than 31 bytes left and both pointers
              * 8-byte aligned.
              */
 247 #ifdef CONFIG_ALTIVEC
 248 BEGIN_FTR_SECTION
 249         /* Try to use vmx loop if length is equal or greater than 4K */
 250         cmpldi  cr6,r5,VMX_THRESH
 251         bge     cr6,.Lsameoffset_vmx_cmp
 252 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 253
 254 .Llong_novmx_cmp:
 255 #endif
             /*
              * Scalar loop comparing 32 bytes (four doublewords) per iteration.
              * It is software pipelined: loads for the next 32 bytes are issued
              * while comparison results of the previous ones are still pending
              * in cr0 (A/B), cr1 (C/D), cr6 (E/F) and cr7 (G/H).
              * Also entered from the diffoffset path and from the VMX paths when
              * ENTER_VMX_OPS reports failure; in those cases s2 may be unaligned.
              */
 256         /* At least s1 addr is aligned with 8 bytes */
 257         li      off8,8
 258         li      off16,16
 259         li      off24,24
 260
             /* rD..rH live in r27-r31; save them below the stack pointer.
              * They are reloaded at .Ltail and .Lout.
              */
 261         std     r31,-8(r1)
 262         std     r30,-16(r1)
 263         std     r29,-24(r1)
 264         std     r28,-32(r1)
 265         std     r27,-40(r1)
 266
 267         srdi    r0,r5,5                 /* CTR = number of 32-byte chunks */
 268         mtctr   r0
 269         andi.   r5,r5,31                /* r5 = bytes left over for .Ltail */
 270
             /* Load chunk 0. */
 271         LD      rA,0,r3
 272         LD      rB,0,r4
 273
 274         LD      rC,off8,r3
 275         LD      rD,off8,r4
 276
 277         LD      rE,off16,r3
 278         LD      rF,off16,r4
 279
 280         LD      rG,off24,r3
 281         LD      rH,off24,r4
 282         cmpld   cr0,rA,rB               /* chunk 0 A/B result pending in cr0 */
 283
 284         addi    r3,r3,32
 285         addi    r4,r4,32
 286
 287         bdz     .Lfirst32               /* only one chunk: finish its compares there */
 288
             /* Load chunk 1 while comparing and checking chunk 0. */
 289         LD      rA,0,r3
 290         LD      rB,0,r4
 291         cmpld   cr1,rC,rD
 292
 293         LD      rC,off8,r3
 294         LD      rD,off8,r4
 295         cmpld   cr6,rE,rF
 296
 297         LD      rE,off16,r3
 298         LD      rF,off16,r4
 299         cmpld   cr7,rG,rH
 300         bne     cr0,.LcmpAB             /* chunk 0 A/B differ */
 301
 302         LD      rG,off24,r3
 303         LD      rH,off24,r4
 304         cmpld   cr0,rA,rB               /* chunk 1 A/B result pending in cr0 */
 305         bne     cr1,.LcmpCD             /* chunk 0 C/D differ */
 306
 307         addi    r3,r3,32
 308         addi    r4,r4,32
 309
             /* cr6/cr7 still hold unchecked E/F and G/H results of chunk 0. */
 310         bdz     .Lsecond32
 311
 312         .balign 16
 313
             /*
              * Steady state. On entry to each iteration:
              *   - rC..rH hold the C..H doublewords of the current chunk, not yet
              *     compared;
              *   - cr0 holds the current chunk's A/B result, not yet checked;
              *   - cr6/cr7 hold the previous chunk's E/F and G/H results, not yet
              *     checked.
              * Each step loads a pair of the next chunk, compares a pair of the
              * current chunk and branches on the oldest pending result.
              */
 314 1:      LD      rA,0,r3
 315         LD      rB,0,r4
 316         cmpld   cr1,rC,rD
 317         bne     cr6,.LcmpEF
 318
 319         LD      rC,off8,r3
 320         LD      rD,off8,r4
 321         cmpld   cr6,rE,rF
 322         bne     cr7,.LcmpGH
 323
 324         LD      rE,off16,r3
 325         LD      rF,off16,r4
 326         cmpld   cr7,rG,rH
 327         bne     cr0,.LcmpAB
 328
 329         LD      rG,off24,r3
 330         LD      rH,off24,r4
 331         cmpld   cr0,rA,rB
 332         bne     cr1,.LcmpCD
 333
 334         addi    r3,r3,32
 335         addi    r4,r4,32
 336
 337         bdnz    1b
 338
 339 .Lsecond32:
             /* Drain the pipeline: no more loads, finish the compares of the last
              * loaded chunk and check every result that is still pending.
              */
 340         cmpld   cr1,rC,rD
 341         bne     cr6,.LcmpEF             /* previous chunk E/F */
 342
 343         cmpld   cr6,rE,rF
 344         bne     cr7,.LcmpGH             /* previous chunk G/H */
 345
 346         cmpld   cr7,rG,rH
 347         bne     cr0,.LcmpAB             /* last chunk A/B */
 348
 349         bne     cr1,.LcmpCD             /* last chunk C/D */
 350         bne     cr6,.LcmpEF             /* last chunk E/F */
 351         bne     cr7,.LcmpGH             /* last chunk G/H */
 352
 353 .Ltail:
             /* All whole 32-byte chunks were equal: restore r27-r31, then compare
              * the remaining r5 (length & 31) bytes with the byte loop.
              */
 354         ld      r31,-8(r1)
 355         ld      r30,-16(r1)
 356         ld      r29,-24(r1)
 357         ld      r28,-32(r1)
 358         ld      r27,-40(r1)
 359
 360         cmpdi   r5,0
 361         beq     .Lzero
 362         b       .Lshort
 363
 364 .Lfirst32:
             /* Only one 32-byte chunk was loaded: cr0 already holds its A/B result,
              * compute the other three and check all four in address order.
              */
 365         cmpld   cr1,rC,rD
 366         cmpld   cr6,rE,rF
 367         cmpld   cr7,rG,rH
 368
 369         bne     cr0,.LcmpAB
 370         bne     cr1,.LcmpCD
 371         bne     cr6,.LcmpEF
 372         bne     cr7,.LcmpGH
 373
 374         b       .Ltail
 375
 376 .LcmpAB:
             /* Mismatch exits of the 32-byte loop. Each one turns the unsigned
              * compare result held in its CR field into the return value:
              * 1 when the s1 doubleword is greater, -1 otherwise. All of them
              * leave through .Lout, which restores r27-r31.
              */
 377         li      r3,1
 378         bgt     cr0,.Lout
 379         li      r3,-1
 380         b       .Lout
 381
 382 .LcmpCD:
 383         li      r3,1
 384         bgt     cr1,.Lout
 385         li      r3,-1
 386         b       .Lout
 387
 388 .LcmpEF:
 389         li      r3,1
 390         bgt     cr6,.Lout
 391         li      r3,-1
 392         b       .Lout
 393
 394 .LcmpGH:
 395         li      r3,1
 396         bgt     cr7,.Lout
 397         li      r3,-1                   /* falls through into .Lout */
 398
 399 .Lout:
 400         ld      r31,-8(r1)
 401         ld      r30,-16(r1)
 402         ld      r29,-24(r1)
 403         ld      r28,-32(r1)
 404         ld      r27,-40(r1)
 405         blr
 406
 407 .LcmpAB_lightweight:   /* skip NV GPRS restore */
             /* Same as .LcmpAB (result taken from cr0) for paths that never
              * saved r27-r31, so nothing is reloaded from the stack.
              */
 408         li      r3,1
 409         bgtlr
 410         li      r3,-1
 411         blr
 412
 413 #ifdef CONFIG_ALTIVEC
 414 .Lsameoffset_vmx_cmp:
 415         /* Entered with src/dst addrs having the same offset to the 8-byte
 416          * alignment boundary (both are 8-byte aligned at this point).
 417          *
 418          * There is an optimization based on the following fact: memcmp()
 419          * is prone to fail early, within the first 32 bytes.
 420          * Before applying VMX instructions which will lead to 32x128bits
 421          * VMX regs load/restore penalty, we compare the first 32 bytes
 422          * so that we can catch the ~80% fail cases.
 423          */
 424
 425         li      r0,4                    /* four doublewords = 32 bytes */
 426         mtctr   r0
 427 .Lsameoffset_prechk_32B_loop:
 428         LD      rA,0,r3
 429         LD      rB,0,r4
 430         cmpld   cr0,rA,rB
 431         addi    r3,r3,8
 432         addi    r4,r4,8
 433         bne     cr0,.LcmpAB_lightweight
 434         addi    r5,r5,-8
 435         bdnz    .Lsameoffset_prechk_32B_loop
 436
             /* cr1 EQ after ENTER_VMX_OPS means enter_vmx_ops() returned 0:
              * continue with the scalar 32-byte loop instead.
              */
 437         ENTER_VMX_OPS
 438         beq     cr1,.Llong_novmx_cmp
 439
 440 3:
 441         /* need to check whether r4 has the same offset with r3
 442          * for 16 bytes boundary.
 443          */
 444         xor     r0,r3,r4
 445         andi.   r0,r0,0xf
 446         bne     .Ldiffoffset_vmx_cmp_start
 447
 448         /* At least VMX_THRESH - 32 bytes remain. s1/s2 are 8-byte aligned;
 449          * compare one more doubleword if they are not yet 16-byte aligned.
              */
 450         andi.   rA,r3,8                 /* cr0 EQ when s1 is already 16-byte aligned */
 451         LD      rA,0,r3
 452         beq     4f
 453         LD      rB,0,r4
 454         cmpld   cr0,rA,rB
 455         addi    r3,r3,8
 456         addi    r4,r4,8
 457         addi    r5,r5,-8
 458
 459         beq     cr0,4f
 460         /* Mismatch: keep cr0 across the exit_vmx_ops() call by parking it in
              * r5, which EXIT_VMX_OPS preserves and which is no longer needed as
              * a length on this exit path.
              */
 461         mfocrf  r5,128
 462         EXIT_VMX_OPS
 463         mtocrf  128,r5
 464         b       .LcmpAB_lightweight
 465
 466 4:
 467         /* compare 32 bytes for each loop */
 468         srdi    r0,r5,5                 /* CTR = number of 32-byte chunks */
 469         mtctr   r0
 470         clrldi  r5,r5,59                /* r5 = leftover bytes (length & 31) */
 471         li      off16,16
 472
 473 .balign 16
 474 5:
             /* Two 16-byte vector compares per iteration. VCMPEQUD_RC is defined
              * outside this file; the code relies on it setting cr6 such that
              * "bnl cr6" is taken when the two vectors are not entirely equal.
              */
 475         lvx     v0,0,r3
 476         lvx     v1,0,r4
 477         VCMPEQUD_RC(v0,v0,v1)
 478         bnl     cr6,7f                  /* difference in the first 16 bytes */
 479         lvx     v0,off16,r3
 480         lvx     v1,off16,r4
 481         VCMPEQUD_RC(v0,v0,v1)
 482         bnl     cr6,6f                  /* difference in the second 16 bytes */
 483         addi    r3,r3,32
 484         addi    r4,r4,32
 485         bdnz    5b
 486
             /* All whole chunks equal: leave VMX and finish the last 0..31 bytes. */
 487         EXIT_VMX_OPS
 488         cmpdi   r5,0
 489         beq     .Lzero
 490         b       .Lcmp_lt32bytes
 491
 492 6:
 493         addi    r3,r3,16                /* point at the differing second half */
 494         addi    r4,r4,16
 495
 496 7:
 497         /* A difference lies in the 16 bytes at r3/r4: leave VMX and redo the
              * comparison with two doubleword compares to derive the result.
              */
 498         EXIT_VMX_OPS
 499         LD      rA,0,r3
 500         LD      rB,0,r4
 501         cmpld   cr0,rA,rB
 502         li      off8,8                  /* li does not alter cr0 */
 503         bne     cr0,.LcmpAB_lightweight
 504
 505         LD      rA,off8,r3
 506         LD      rB,off8,r4
 507         cmpld   cr0,rA,rB
 508         bne     cr0,.LcmpAB_lightweight
 509         b       .Lzero
 510 #endif
 511
 512 .Ldiffoffset_8bytes_make_align_start:
 513         /* now try to align s1 with 8 bytes */
             /*
              * Entered only from .Lno_short when s1 and s2 have different offsets
              * within a doubleword, with more than 7 bytes to compare.
              *
              * The record form is required: on entry cr0 still says "offsets
              * differ" (NE), so a plain rlwinm would leave the beq below
              * permanently not taken. With "rlwinm." cr0 is EQ exactly when
              * s1 is already 8-byte aligned.
              * r6 = (s1 & 7) * 8 = number of bits in front of s1 in its doubleword.
              */
 514         rlwinm. r6,r3,3,26,28
 515         beq     .Ldiffoffset_align_s1_8bytes
 516
 517         clrrdi  r3,r3,3                 /* round s1 down to 8 bytes */
 518         LD      rA,0,r3
 519         LD      rB,0,r4  /* unaligned load */
 520         sld     rA,rA,r6                /* clear the bytes in front of s1 ... */
 521         srd     rA,rA,r6                /* ... keeping its 8 - (s1 & 7) valid bytes */
 522         srd     rB,rB,r6                /* keep the same number of leading s2 bytes */
 523         cmpld   cr0,rA,rB
 524         srwi    r6,r6,3                 /* back to a byte count: s1 & 7 */
 525         bne     cr0,.LcmpAB_lightweight
 526
 527         subfic  r6,r6,8                 /* r6 = bytes just compared */
 528         subf.   r5,r6,r5                /* remaining length; cr0 EQ when none is left */
 529         addi    r3,r3,8                 /* s1 now at the next 8-byte boundary */
 530         add     r4,r4,r6                /* s2 advances by the bytes compared */
 531
 532         beq     .Lzero
 533
 534 .Ldiffoffset_align_s1_8bytes:
 535         /* now s1 is aligned with 8 bytes. */
 536 #ifdef CONFIG_ALTIVEC
 537 BEGIN_FTR_SECTION
 538         /* only do vmx ops when the size equal or greater than 4K bytes */
 539         cmpdi   cr5,r5,VMX_THRESH
 540         bge     cr5,.Ldiffoffset_vmx_cmp
 541 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 542
 543 .Ldiffoffset_novmx_cmp:
             /* Also the fallback target when ENTER_VMX_OPS reports failure in
              * .Ldiffoffset_vmx_cmp.
              */
 544 #endif
 545
 546
             /* Up to 31 bytes: short doubleword loop; otherwise the scalar
              * 32-byte loop, skipping its VMX threshold check.
              */
 547         cmpdi   cr5,r5,31
 548         ble     cr5,.Lcmp_lt32bytes
 549
 550 #ifdef CONFIG_ALTIVEC
 551         b       .Llong_novmx_cmp
 552 #else
 553         b       .Llong
 554 #endif
 555
 556 #ifdef CONFIG_ALTIVEC
 557 .Ldiffoffset_vmx_cmp:
 558         /* perform a 32 bytes pre-checking before
 559          * enable VMX operations.
 560          */
 561         li      r0,4                    /* four doublewords = 32 bytes */
 562         mtctr   r0
 563 .Ldiffoffset_prechk_32B_loop:
 564         LD      rA,0,r3
 565         LD      rB,0,r4
 566         cmpld   cr0,rA,rB
 567         addi    r3,r3,8
 568         addi    r4,r4,8
 569         bne     cr0,.LcmpAB_lightweight
 570         addi    r5,r5,-8
 571         bdnz    .Ldiffoffset_prechk_32B_loop
 572
             /* cr1 EQ after ENTER_VMX_OPS means enter_vmx_ops() returned 0:
              * fall back to the scalar code.
              */
 573         ENTER_VMX_OPS
 574         beq     cr1,.Ldiffoffset_novmx_cmp
 575
 576 .Ldiffoffset_vmx_cmp_start:
             /* Also entered from .Lsameoffset_vmx_cmp when s1 and s2 share their
              * 8-byte offset but not their 16-byte offset. VMX is already enabled
              * on both ways in.
              */
 577         /* Firstly try to align r3 with 16 bytes */
 578         andi.   r6,r3,0xf               /* r6 = s1 & 15; cr0 EQ when already aligned */
 579         li      off16,16                /* needed by LD_VSR_CROSS16B */
 580         beq     .Ldiffoffset_vmx_s1_16bytes_align
 581
 582         LVS     v3,0,r3                 /* permute masks for s1 and s2 */
 583         LVS     v4,0,r4
 584
 585         lvx     v5,0,r3                 /* first aligned quadword of each buffer */
 586         lvx     v6,0,r4
 587         LD_VSR_CROSS16B(r3,v3,v5,v7,v9)         /* v9  = 16 bytes starting at s1 */
 588         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)        /* v10 = 16 bytes starting at s2 */
 589
             /* VCMPEQUB_RC is defined outside this file; "bnl cr6" is used as
              * "the two vectors are not entirely equal".
              */
 590         VCMPEQUB_RC(v7,v9,v10)
 591         bnl     cr6,.Ldiffoffset_vmx_diff_found
 592
 593         subfic  r6,r6,16                /* r6 = bytes up to s1's next 16-byte boundary */
 594         subf    r5,r6,r5
 595         add     r3,r3,r6
 596         add     r4,r4,r6
 597
 598 .Ldiffoffset_vmx_s1_16bytes_align:
 599         /* now s1 is aligned with 16 bytes */
 600         lvx     v6,0,r4                 /* first aligned quadword covering s2 */
 601         LVS     v4,0,r4                 /* permute mask for s2's misalignment */
 602         srdi    r6,r5,5  /* loop for 32 bytes each */
 603         clrldi  r5,r5,59                /* r5 = leftover bytes (length & 31) */
 604         mtctr   r6
 605
 606 .balign 16
 607 .Ldiffoffset_vmx_32bytesloop:
 608         /* the first qw of r4 was saved in v6 */
 609         lvx     v9,0,r3                 /* 16 aligned bytes of s1 */
 610         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)        /* v10 = 16 bytes starting at s2 */
 611         VCMPEQUB_RC(v7,v9,v10)
 612         vor     v6,v8,v8                /* the 2nd quadword becomes the next 1st one */
 613         bnl     cr6,.Ldiffoffset_vmx_diff_found
 614
 615         addi    r3,r3,16
 616         addi    r4,r4,16
 617
 618         lvx     v9,0,r3
 619         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
 620         VCMPEQUB_RC(v7,v9,v10)
 621         vor     v6,v8,v8
 622         bnl     cr6,.Ldiffoffset_vmx_diff_found
 623
 624         addi    r3,r3,16
 625         addi    r4,r4,16
 626
 627         bdnz    .Ldiffoffset_vmx_32bytesloop
 628
             /* All whole chunks equal: leave VMX and finish the last 0..31 bytes. */
 629         EXIT_VMX_OPS
 630
 631         cmpdi   r5,0
 632         beq     .Lzero
 633         b       .Lcmp_lt32bytes
 634
 635 .Ldiffoffset_vmx_diff_found:
             /* r3/r4 still point at the 16 bytes whose vector compare failed:
              * leave VMX and let the scalar code compare exactly those 16 bytes
              * to produce the return value.
              */
 636         EXIT_VMX_OPS
 637         /* anyway, the diff will appear in next 16 bytes */
 638         li      r5,16
 639         b       .Lcmp_lt32bytes
 640
 641 #endif
 642 EXPORT_SYMBOL(memcmp)