arch/powerpc/lib/checksum_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * This file contains assembly-language implementations
   4  * of IP-style 1's complement checksum routines.
   5  *
   6  *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
   7  *
   8  * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
   9  */
  10
  11 #include <linux/sys.h>
  12 #include <asm/processor.h>
  13 #include <asm/errno.h>
  14 #include <asm/ppc_asm.h>
  15 #include <asm/export.h>
  16
   17 /*
   18  * Computes the checksum of a memory block at buff, length len,
   19  * and adds in "sum" (32-bit).
   20  *
   21  * __csum_partial(r3=buff, r4=len, r5=sum)
       *
       * Returns the 32-bit partial sum in r3.
       *
       * Register use: r0 is the 64-bit running sum, r3 walks through the
       * buffer, r4 counts the bytes still to be summed, and r6, r7 and
       * r9-r12 are scratch. r14-r16 are used only by the unrolled 64-byte
       * loop, which saves and restores them in a temporary stack frame.
       *
       * Every addition into r0 is an adde, so the carry out of one addition
       * is added in by the next one. The instructions placed between the
       * adde's (loads, addi/subi, srdi., andi., mtctr, bdnz) therefore must
       * not modify the carry bit.
   22  */
   23 _GLOBAL(__csum_partial)
   24         addic   r0,r5,0                 /* r0 = sum; adding 0 also clears carry */
   25
   26         srdi.   r6,r4,3                 /* less than 8 bytes? */
   27         beq     .Lcsum_tail_word        /* yes: only the word/halfword/byte tail */
   28
   29         /*
   30          * If only halfword aligned, align to a double word. Since odd
   31          * aligned addresses should be rare and they would require more
   32          * work to calculate the correct checksum, we ignore that case
   33          * and take the potential slowdown of unaligned loads.
   34          */
   35         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
   36         beq     .Lcsum_aligned          /* address bits 1-2 clear: nothing to align */
   37
   38         li      r7,4
   39         sub     r6,r7,r6                /* r6 = 4 - r6 */
   40         mtctr   r6                      /* CTR = halfwords to sum before the loops */
   41
   42 1:
   43         lhz     r6,0(r3)                /* align to doubleword */
   44         subi    r4,r4,2                 /* len -= 2 */
   45         addi    r3,r3,2                 /* buff += 2 */
   46         adde    r0,r0,r6
   47         bdnz    1b
   48
   49 .Lcsum_aligned:
   50         /*
   51          * We unroll the loop such that each iteration is 64 bytes with an
   52          * entry and exit limb of 64 bytes, meaning a minimum size of
   53          * 128 bytes.
   54          */
   55         srdi.   r6,r4,7
   56         beq     .Lcsum_tail_doublewords         /* len < 128 */
   57
   58         srdi    r6,r4,6                 /* r6 = number of whole 64-byte blocks */
   59         subi    r6,r6,1                 /* the last block is summed by the exit limb */
   60         mtctr   r6                      /* CTR = passes through loop 2 (at least 1) */
   61
   62         stdu    r1,-STACKFRAMESIZE(r1)  /* new stack frame to hold r14-r16 */
   63         std     r14,STK_REG(R14)(r1)
   64         std     r15,STK_REG(R15)(r1)
   65         std     r16,STK_REG(R16)(r1)
   66
   67         ld      r6,0(r3)                /* entry limb: preload bytes 0-31 of block 0 */
   68         ld      r9,8(r3)
   69
   70         ld      r10,16(r3)
   71         ld      r11,24(r3)
   72
   73         /*
   74          * On POWER6 and POWER7 back to back adde instructions take 2 cycles
   75          * because of the XER dependency. This means the fastest this loop can
   76          * go is 16 cycles per iteration. The scheduling of the loop below has
   77          * been shown to hit this on both POWER6 and POWER7.
   78          */
   79         .align 5
   80 2:
   81         adde    r0,r0,r6                /* r6, r9-r11 were preloaded for this block */
   82         ld      r12,32(r3)              /* load bytes 32-63 of this block */
   83         ld      r14,40(r3)
   84
   85         adde    r0,r0,r9
   86         ld      r15,48(r3)
   87         ld      r16,56(r3)
   88         addi    r3,r3,64                /* r3 now points at the next block */
   89
   90         adde    r0,r0,r10
   91
   92         adde    r0,r0,r11
   93
   94         adde    r0,r0,r12
   95
   96         adde    r0,r0,r14
   97
   98         adde    r0,r0,r15
   99         ld      r6,0(r3)                /* preload bytes 0-31 of the next block */
  100         ld      r9,8(r3)
  101
  102         adde    r0,r0,r16
  103         ld      r10,16(r3)
  104         ld      r11,24(r3)
  105         bdnz    2b
  106
  107
  108         adde    r0,r0,r6                /* exit limb: last block, no further preload */
  109         ld      r12,32(r3)
  110         ld      r14,40(r3)
  111
  112         adde    r0,r0,r9
  113         ld      r15,48(r3)
  114         ld      r16,56(r3)
  115         addi    r3,r3,64
  116
  117         adde    r0,r0,r10
  118         adde    r0,r0,r11
  119         adde    r0,r0,r12
  120         adde    r0,r0,r14
  121         adde    r0,r0,r15
  122         adde    r0,r0,r16
  123
  124         ld      r14,STK_REG(R14)(r1)    /* restore r14-r16 and drop the frame */
  125         ld      r15,STK_REG(R15)(r1)
  126         ld      r16,STK_REG(R16)(r1)
  127         addi    r1,r1,STACKFRAMESIZE
  128
  129         andi.   r4,r4,63                /* len = bytes left after the 64-byte blocks */
  130
  131 .Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
  132         srdi.   r6,r4,3                 /* r6 = whole doublewords left */
  133         beq     .Lcsum_tail_word        /* none */
  134
  135         mtctr   r6
  136 3:
  137         ld      r6,0(r3)
  138         addi    r3,r3,8
  139         adde    r0,r0,r6
  140         bdnz    3b
  141
  142         andi.   r4,r4,7                 /* len = bytes left after the doublewords */
  143
  144 .Lcsum_tail_word:                       /* Up to 7 bytes to go */
  145         srdi.   r6,r4,2                 /* at least 4 bytes left? */
  146         beq     .Lcsum_tail_halfword
  147
  148         lwz     r6,0(r3)
  149         addi    r3,r3,4
  150         adde    r0,r0,r6
  151         subi    r4,r4,4
  152
  153 .Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
  154         srdi.   r6,r4,1                 /* at least 2 bytes left? */
  155         beq     .Lcsum_tail_byte
  156
  157         lhz     r6,0(r3)
  158         addi    r3,r3,2
  159         adde    r0,r0,r6
  160         subi    r4,r4,2
  161
  162 .Lcsum_tail_byte:                       /* Up to 1 byte to go */
  163         andi.   r6,r4,1                 /* one odd byte left? */
  164         beq     .Lcsum_finish
  165
  166         lbz     r6,0(r3)
  167 #ifdef __BIG_ENDIAN__
  168         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
  169         adde    r0,r0,r9
  170 #else
  171         adde    r0,r0,r6                /* little endian: the byte is added unshifted */
  172 #endif
  173
  174 .Lcsum_finish:
  175         addze   r0,r0                   /* add in final carry */
  176         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
  177         add     r3,r4,r0                /* upper word = high + low + carry of low + high */
  178         srdi    r3,r3,32                /* return that upper word as the 32-bit sum */
  179         blr
  180 EXPORT_SYMBOL(__csum_partial)
 181
 182
              /*
               * srcnr: written in front of a load from src on the same line
               * ("srcnr; lhz ..."). Defines local label 100 at that instruction
               * and emits an EX_TABLE entry pairing it with .Lsrc_error_nr.
               * EX_TABLE is defined outside this file; presumably a fault at the
               * labelled instruction resumes at the paired label.
               */
  183         .macro srcnr
  184 100:
  185         EX_TABLE(100b,.Lsrc_error_nr)
  186         .endm
 187
              /*
               * source: written in front of a load from src on the same line
               * ("source; ld ..."). Defines local label 150 at that instruction
               * and emits an EX_TABLE entry pairing it with .Lsrc_error, the
               * handler that first restores r14-r16 and pops the stack frame.
               * EX_TABLE is defined outside this file; presumably a fault at the
               * labelled instruction resumes at the paired label.
               */
  188         .macro source
  189 150:
  190         EX_TABLE(150b,.Lsrc_error)
  191         .endm
 192
              /*
               * dstnr: written in front of a store to dst on the same line
               * ("dstnr; sth ..."). Defines local label 200 at that instruction
               * and emits an EX_TABLE entry pairing it with .Ldest_error_nr.
               * EX_TABLE is defined outside this file; presumably a fault at the
               * labelled instruction resumes at the paired label.
               */
  193         .macro dstnr
  194 200:
  195         EX_TABLE(200b,.Ldest_error_nr)
  196         .endm
 197
              /*
               * dest: written in front of a store to dst on the same line
               * ("dest; std ..."). Defines local label 250 at that instruction
               * and emits an EX_TABLE entry pairing it with .Ldest_error, the
               * handler that first restores r14-r16 and pops the stack frame.
               * EX_TABLE is defined outside this file; presumably a fault at the
               * labelled instruction resumes at the paired label.
               */
  198         .macro dest
  199 250:
  200         EX_TABLE(250b,.Ldest_error)
  201         .endm
 202
  203 /*
  204  * Computes the checksum of a memory block at src, length len,
  205  * and adds in "sum" (32-bit), while copying the block to dst.
  206  * If an access exception occurs on src or dst, it stores -EFAULT
  207  * to *src_err or *dst_err respectively. The caller must take any action
  208  * required in this case (zeroing memory, recalculating partial checksum etc).
  209  *
  210  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
       *
       * Returns the 32-bit partial sum in r3 when no fault is taken. The
       * fault handlers at the end of this function store -EFAULT as a 32-bit
       * value, skip the store when the corresponding pointer is NULL, and
       * return straight away without folding the sum into r3, so r3 does not
       * hold a checksum in that case.
       *
       * Register use: r0 is the 64-bit running sum, r3 the source cursor,
       * r4 the destination cursor, r5 the bytes still to do, and r6 and
       * r9-r12 are scratch. r14-r16 are used only by the unrolled 64-byte
       * loop, which saves and restores them in a temporary stack frame.
       *
       * Every load from src and store to dst is prefixed with one of the
       * srcnr/source/dstnr/dest macros. source/dest are used while the
       * r14-r16 stack frame is live, srcnr/dstnr everywhere else.
       *
       * Every addition into r0 is an adde, so the instructions between the
       * adde's must not modify the carry bit.
  211  */
  212 _GLOBAL(csum_partial_copy_generic)
  213         addic   r0,r6,0                 /* r0 = sum; adding 0 also clears carry */
  214
  215         srdi.   r6,r5,3                 /* less than 8 bytes? */
  216         beq     .Lcopy_tail_word        /* yes: only the word/halfword/byte tail */
  217
  218         /*
  219          * If only halfword aligned, align to a double word. Since odd
  220          * aligned addresses should be rare and they would require more
  221          * work to calculate the correct checksum, we ignore that case
  222          * and take the potential slowdown of unaligned loads.
  223          *
  224          * If the source and destination are relatively unaligned we only
  225          * align the source. This keeps things simple.
  226          */
  227         rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
  228         beq     .Lcopy_aligned          /* address bits 1-2 clear: nothing to align */
  229
  230         li      r9,4                    /* r7 holds src_err here, so r9 is the scratch */
  231         sub     r6,r9,r6                /* r6 = 4 - r6 */
  232         mtctr   r6                      /* CTR = halfwords to copy before the loops */
  233
  234 1:
  235 srcnr;  lhz     r6,0(r3)                /* align to doubleword */
  236         subi    r5,r5,2                 /* len -= 2 */
  237         addi    r3,r3,2
  238         adde    r0,r0,r6
  239 dstnr;  sth     r6,0(r4)
  240         addi    r4,r4,2
  241         bdnz    1b
  242
  243 .Lcopy_aligned:
  244         /*
  245          * We unroll the loop such that each iteration is 64 bytes with an
  246          * entry and exit limb of 64 bytes, meaning a minimum size of
  247          * 128 bytes.
  248          */
  249         srdi.   r6,r5,7
  250         beq     .Lcopy_tail_doublewords         /* len < 128 */
  251
  252         srdi    r6,r5,6                 /* r6 = number of whole 64-byte blocks */
  253         subi    r6,r6,1                 /* the last block is handled by the exit limb */
  254         mtctr   r6                      /* CTR = passes through loop 2 (at least 1) */
  255
  256         stdu    r1,-STACKFRAMESIZE(r1)  /* new stack frame to hold r14-r16 */
  257         std     r14,STK_REG(R14)(r1)
  258         std     r15,STK_REG(R15)(r1)
  259         std     r16,STK_REG(R16)(r1)
  260
  261 source; ld      r6,0(r3)                /* entry limb: preload bytes 0-31 of block 0 */
  262 source; ld      r9,8(r3)
  263
  264 source; ld      r10,16(r3)
  265 source; ld      r11,24(r3)
  266
  267         /*
  268          * On POWER6 and POWER7 back to back adde instructions take 2 cycles
  269          * because of the XER dependency. This means the fastest this loop can
  270          * go is 16 cycles per iteration. The scheduling of the loop below has
  271          * been shown to hit this on both POWER6 and POWER7.
  272          */
  273         .align 5
  274 2:
  275         adde    r0,r0,r6                /* r6, r9-r11 were preloaded for this block */
  276 source; ld      r12,32(r3)              /* load bytes 32-63 of this block */
  277 source; ld      r14,40(r3)
  278
  279         adde    r0,r0,r9
  280 source; ld      r15,48(r3)
  281 source; ld      r16,56(r3)
  282         addi    r3,r3,64                /* r3 now points at the next source block */
  283
  284         adde    r0,r0,r10
  285 dest;   std     r6,0(r4)                /* store this block; r6/r9 are free to reload after */
  286 dest;   std     r9,8(r4)
  287
  288         adde    r0,r0,r11
  289 dest;   std     r10,16(r4)
  290 dest;   std     r11,24(r4)
  291
  292         adde    r0,r0,r12
  293 dest;   std     r12,32(r4)
  294 dest;   std     r14,40(r4)
  295
  296         adde    r0,r0,r14
  297 dest;   std     r15,48(r4)
  298 dest;   std     r16,56(r4)
  299         addi    r4,r4,64                /* r4 now points at the next destination block */
  300
  301         adde    r0,r0,r15
  302 source; ld      r6,0(r3)                /* preload bytes 0-31 of the next block */
  303 source; ld      r9,8(r3)
  304
  305         adde    r0,r0,r16
  306 source; ld      r10,16(r3)
  307 source; ld      r11,24(r3)
  308         bdnz    2b
  309
  310
  311         adde    r0,r0,r6                /* exit limb: last block, no further preload */
  312 source; ld      r12,32(r3)
  313 source; ld      r14,40(r3)
  314
  315         adde    r0,r0,r9
  316 source; ld      r15,48(r3)
  317 source; ld      r16,56(r3)
  318         addi    r3,r3,64
  319
  320         adde    r0,r0,r10
  321 dest;   std     r6,0(r4)
  322 dest;   std     r9,8(r4)
  323
  324         adde    r0,r0,r11
  325 dest;   std     r10,16(r4)
  326 dest;   std     r11,24(r4)
  327
  328         adde    r0,r0,r12
  329 dest;   std     r12,32(r4)
  330 dest;   std     r14,40(r4)
  331
  332         adde    r0,r0,r14
  333 dest;   std     r15,48(r4)
  334 dest;   std     r16,56(r4)
  335         addi    r4,r4,64
  336
  337         adde    r0,r0,r15
  338         adde    r0,r0,r16
  339
  340         ld      r14,STK_REG(R14)(r1)    /* restore r14-r16 and drop the frame */
  341         ld      r15,STK_REG(R15)(r1)
  342         ld      r16,STK_REG(R16)(r1)
  343         addi    r1,r1,STACKFRAMESIZE
  344
  345         andi.   r5,r5,63                /* len = bytes left after the 64-byte blocks */
  346
  347 .Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
  348         srdi.   r6,r5,3                 /* r6 = whole doublewords left */
  349         beq     .Lcopy_tail_word        /* none */
  350
  351         mtctr   r6
  352 3:
  353 srcnr;  ld      r6,0(r3)
  354         addi    r3,r3,8
  355         adde    r0,r0,r6
  356 dstnr;  std     r6,0(r4)
  357         addi    r4,r4,8
  358         bdnz    3b
  359
  360         andi.   r5,r5,7                 /* len = bytes left after the doublewords */
  361
  362 .Lcopy_tail_word:                       /* Up to 7 bytes to go */
  363         srdi.   r6,r5,2                 /* at least 4 bytes left? */
  364         beq     .Lcopy_tail_halfword
  365
  366 srcnr;  lwz     r6,0(r3)
  367         addi    r3,r3,4
  368         adde    r0,r0,r6
  369 dstnr;  stw     r6,0(r4)
  370         addi    r4,r4,4
  371         subi    r5,r5,4
  372
  373 .Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
  374         srdi.   r6,r5,1                 /* at least 2 bytes left? */
  375         beq     .Lcopy_tail_byte
  376
  377 srcnr;  lhz     r6,0(r3)
  378         addi    r3,r3,2
  379         adde    r0,r0,r6
  380 dstnr;  sth     r6,0(r4)
  381         addi    r4,r4,2
  382         subi    r5,r5,2
  383
  384 .Lcopy_tail_byte:                       /* Up to 1 byte to go */
  385         andi.   r6,r5,1                 /* one odd byte left? */
  386         beq     .Lcopy_finish
  387
  388 srcnr;  lbz     r6,0(r3)
  389 #ifdef __BIG_ENDIAN__
  390         sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
  391         adde    r0,r0,r9
  392 #else
  393         adde    r0,r0,r6                /* little endian: the byte is added unshifted */
  394 #endif
  395 dstnr;  stb     r6,0(r4)                /* the unshifted byte is what gets copied */
  396
  397 .Lcopy_finish:
  398         addze   r0,r0                   /* add in final carry */
  399         rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
  400         add     r3,r4,r0                /* upper word = high + low + carry of low + high */
  401         srdi    r3,r3,32                /* return that upper word as the 32-bit sum */
  402         blr
  403
  404 .Lsrc_error:                            /* target of the "source" macro entries */
  405         ld      r14,STK_REG(R14)(r1)    /* frame is live here: restore r14-r16, pop it */
  406         ld      r15,STK_REG(R15)(r1)
  407         ld      r16,STK_REG(R16)(r1)
  408         addi    r1,r1,STACKFRAMESIZE
  409 .Lsrc_error_nr:                         /* target of the "srcnr" macro entries */
  410         cmpdi   0,r7,0                  /* src_err == NULL? */
  411         beqlr                           /* yes: return without reporting */
  412         li      r6,-EFAULT
  413         stw     r6,0(r7)                /* *src_err = -EFAULT (32-bit store) */
  414         blr
  415
  416 .Ldest_error:                           /* target of the "dest" macro entries */
  417         ld      r14,STK_REG(R14)(r1)    /* frame is live here: restore r14-r16, pop it */
  418         ld      r15,STK_REG(R15)(r1)
  419         ld      r16,STK_REG(R16)(r1)
  420         addi    r1,r1,STACKFRAMESIZE
  421 .Ldest_error_nr:                        /* target of the "dstnr" macro entries */
  422         cmpdi   0,r8,0                  /* dst_err == NULL? */
  423         beqlr                           /* yes: return without reporting */
  424         li      r6,-EFAULT
  425         stw     r6,0(r8)                /* *dst_err = -EFAULT (32-bit store) */
  426         blr
  427 EXPORT_SYMBOL(csum_partial_copy_generic)
 428
  429 /*
  430  * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
  431  *                         const struct in6_addr *daddr,
  432  *                         __u32 len, __u8 proto, __wsum sum)
       *
       * Registers as read by the code below: r3 = saddr, r4 = daddr,
       * r5 = len, r6 = proto, r7 = sum. The result is returned in r3.
       *
       * Adds the 16 bytes at saddr and the 16 bytes at daddr (as two
       * doublewords each), len + proto and sum into r0, chaining the carry
       * through addc/adde/addze. The 64-bit total is then folded to 32 and to
       * 16 bits, complemented, and returned in the low 16 bits of r3.
       *
       * NOTE(review): on little endian, len + proto is rotated left by one
       * byte before sum is added. Presumably this lines the two values up
       * with the network byte order of the address words -- confirm against
       * the IPv6 pseudo-header layout.
  433  */
  434
  435 _GLOBAL(csum_ipv6_magic)
  436         ld      r8, 0(r3)               /* r8, r9 = the 16 bytes at saddr */
  437         ld      r9, 8(r3)
  438         add     r5, r5, r6              /* r5 = len + proto */
  439         addc    r0, r8, r9              /* start of the carry chain */
  440         ld      r10, 0(r4)              /* r10, r11 = the 16 bytes at daddr */
  441         ld      r11, 8(r4)
  442 #ifdef CONFIG_CPU_LITTLE_ENDIAN
  443         rotldi  r5, r5, 8               /* rotate len + proto left by one byte */
  444 #endif
  445         adde    r0, r0, r10
  446         add     r5, r5, r7              /* r5 += sum (plain add, carry untouched) */
  447         adde    r0, r0, r11
  448         adde    r0, r0, r5
  449         addze   r0, r0                  /* add in final carry */
  450         rotldi  r3, r0, 32              /* fold two 32 bit halves together */
  451         add     r3, r0, r3              /* upper word = high + low + carry of low + high */
  452         srdi    r0, r3, 32              /* r0 = 32-bit folded sum */
  453         rotlwi  r3, r0, 16              /* fold two 16 bit halves together */
  454         add     r3, r0, r3              /* bits 16-31 of the low word = folded 16-bit sum */
  455         not     r3, r3                  /* one's complement */
  456         rlwinm  r3, r3, 16, 16, 31      /* move that halfword down; clear the rest */
  457         blr
  458 EXPORT_SYMBOL(csum_ipv6_magic)