arch/xtensa/lib/checksum.S

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              IP/TCP/UDP checksumming routines
   7  *
   8  * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
   9  *                  Optimized by Joe Taylor
  10  *
  11  *              This program is free software; you can redistribute it and/or
  12  *              modify it under the terms of the GNU General Public License
  13  *              as published by the Free Software Foundation; either version
  14  *              2 of the License, or (at your option) any later version.
  15  */
  16
  17 #include <asm/errno.h>
  18 #include <linux/linkage.h>
  19 #include <variant/core.h>
  20
  21 /*
  22  * computes a partial checksum, e.g. for TCP/UDP fragments
  23  */
  24
  25 /*
  26  * unsigned int csum_partial(const unsigned char *buf, int len,
  27  *                           unsigned int sum);
  28  *    a2 = buf
  29  *    a3 = len
  30  *    a4 = sum
  31  *
  32  * Optimized for 2- or 4-byte aligned buf; odd addresses work, but are slow.
  33  */
  34 /* ONES_ADD(sum, val): sum += val; if the 32-bit add wrapped, add 1 back in. */
  35 /* This end-around carry converts twos-complement math to ones-complement. */
  36 #define ONES_ADD(sum, val)        \
  37         add     sum, sum, val   ; \
  38         bgeu    sum, val, 99f   ; \
  39         addi    sum, sum, 1     ; \
  40 99:                             ;
  41
  42 .text
  43 ENTRY(csum_partial)
  44         /* In: a2 = buf, a3 = len, a4 = sum.  Out: a2 = sum.  Temps: a5-a8. */
  45         /*
  46          * Experiments with Ethernet and SLIP connections show that buf
  47          * is aligned on either a 2-byte or 4-byte boundary.
  48          */
  49         entry   sp, 32
  50         extui   a5, a2, 0, 2
  51         bnez    a5, 8f          /* branch if not 4-byte aligned */
  52         /* Fall-through on common case, 4-byte alignment */
  53 1:
  54         srli    a5, a3, 5       /* 32-byte chunks */
  55 #if XCHAL_HAVE_LOOPS
  56         loopgtz a5, 2f
  57 #else
  58         beqz    a5, 2f
  59         slli    a5, a5, 5
  60         add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
  61 .Loop1:
  62 #endif
  63         l32i    a6, a2, 0
  64         l32i    a7, a2, 4
  65         ONES_ADD(a4, a6)
  66         ONES_ADD(a4, a7)
  67         l32i    a6, a2, 8
  68         l32i    a7, a2, 12
  69         ONES_ADD(a4, a6)
  70         ONES_ADD(a4, a7)
  71         l32i    a6, a2, 16
  72         l32i    a7, a2, 20
  73         ONES_ADD(a4, a6)
  74         ONES_ADD(a4, a7)
  75         l32i    a6, a2, 24
  76         l32i    a7, a2, 28
  77         ONES_ADD(a4, a6)
  78         ONES_ADD(a4, a7)
  79         addi    a2, a2, 4*8
  80 #if !XCHAL_HAVE_LOOPS
  81         bltu    a2, a5, .Loop1  /* unsigned: a2 and a5 are addresses */
  82 #endif
  83 2:
  84         extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
  85 #if XCHAL_HAVE_LOOPS
  86         loopgtz a5, 3f
  87 #else
  88         beqz    a5, 3f
  89         slli    a5, a5, 2
  90         add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
  91 .Loop2:
  92 #endif
  93         l32i    a6, a2, 0
  94         ONES_ADD(a4, a6)
  95         addi    a2, a2, 4
  96 #if !XCHAL_HAVE_LOOPS
  97         bltu    a2, a5, .Loop2  /* unsigned: a2 and a5 are addresses */
  98 #endif
  99 3:
 100         _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
 101         l16ui   a6, a2, 0
 102         ONES_ADD(a4, a6)
 103         addi    a2, a2, 2
 104 5:
 105         _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
 106 6:      l8ui    a6, a2, 0
 107 #ifdef __XTENSA_EB__
 108         slli    a6, a6, 8       /* load byte into bits 8..15 */
 109 #endif
 110         ONES_ADD(a4, a6)
 111 7:
 112         mov     a2, a4          /* return the accumulated sum in a2 */
 113         retw
 114
 115         /* uncommon case, buf is not 4-byte aligned */
 116 8:
 117         beqz    a3, 7b          /* branch if len == 0 */
 118         beqi    a3, 1, 6b       /* branch if len == 1 */
 119
 120         extui   a5, a2, 0, 1
 121         bnez    a5, 8f          /* branch if buf is at an odd address */
 122
 123         l16ui   a6, a2, 0       /* common case, len >= 2 */
 124         ONES_ADD(a4, a6)
 125         addi    a2, a2, 2       /* adjust buf */
 126         addi    a3, a3, -2      /* adjust len */
 127         j       1b              /* now buf is 4-byte aligned */
 128
 129         /* case: odd-byte aligned, len > 1
 130          * This case is dog slow, so don't give us an odd address.
 131          * (I don't think this ever happens, but just in case.)
 132          */
 133 8:
 134         srli    a5, a3, 2       /* 4-byte chunks */
 135 #if XCHAL_HAVE_LOOPS
 136         loopgtz a5, 2f
 137 #else
 138         beqz    a5, 2f
 139         slli    a5, a5, 2
 140         add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
 141 .Loop3:
 142 #endif
 143         l8ui    a6, a2, 0       /* EB: bits 24..31, EL: bits 0..7 */
 144         l16ui   a7, a2, 1       /* bits  8..23 */
 145         l8ui    a8, a2, 3       /* EB: bits 0..7, EL: bits 24..31 */
 146 #ifdef  __XTENSA_EB__
 147         slli    a6, a6, 24
 148 #else
 149         slli    a8, a8, 24
 150 #endif
 151         slli    a7, a7, 8
 152         or      a7, a7, a6
 153         or      a7, a7, a8
 154         ONES_ADD(a4, a7)
 155         addi    a2, a2, 4
 156 #if !XCHAL_HAVE_LOOPS
 157         bltu    a2, a5, .Loop3  /* unsigned: a2 and a5 are addresses */
 158 #endif
 159 2:
 160         _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
 161         l8ui    a6, a2, 0
 162         l8ui    a7, a2, 1
 163 #ifdef  __XTENSA_EB__
 164         slli    a6, a6, 8
 165 #else
 166         slli    a7, a7, 8
 167 #endif
 168         or      a7, a7, a6
 169         ONES_ADD(a4, a7)
 170         addi    a2, a2, 2
 171 3:
 172         j       5b              /* branch to handle the remaining byte */
 173
 174 ENDPROC(csum_partial)
 175
 176 /*
 177  * Copy from src to dst while checksumming, otherwise like csum_partial.
 178  *
 179  * The macros SRC and DST specify the type of access for the instruction,
 180  * so that a separate exception handler can be used for each access type.
 181  */
 182 /* SRC: emit the instruction and add an __ex_table entry pairing it with 6001. */
 183 #define SRC(y...)                       \
 184         9999: y;                        \
 185         .section __ex_table, "a";       \
 186         .long 9999b, 6001f      ;       \
 187         .previous
 188 /* DST: emit the instruction and add an __ex_table entry pairing it with 6002. */
 189 #define DST(y...)                       \
 190         9999: y;                        \
 191         .section __ex_table, "a";       \
 192         .long 9999b, 6002f      ;       \
 193         .previous
 194
 195 /*
 196 unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
 197                                         int sum, int *src_err_ptr, int *dst_err_ptr)
 198         a2  = src
 199         a3  = dst
 200         a4  = len
 201         a5  = sum
 202         a6  = src_err_ptr
 203         a7  = dst_err_ptr
 204         a8  = temp
 205         a9  = temp
 206         a10 = temp
 207         a11 = original len for exception handling
 208         a12 = original dst for exception handling
 209
 210     This function is optimized for 4-byte aligned addresses.  Other
 211     alignments work, but not nearly as efficiently.
 212  */
 213
 214 ENTRY(csum_partial_copy_generic)
 215         /* Out: a2 = sum.  Faulting SRC()/DST() accesses are handled at 6001/6002. */
 216         entry   sp, 32
 217         mov     a12, a3         /* a12 = original dst, kept for the fixup code */
 218         mov     a11, a4         /* a11 = original len, kept for the fixup code */
 219         or      a10, a2, a3     /* low bits set if src or dst is misaligned */
 220
 221         /* We optimize the following alignment tests for the 4-byte
 222         aligned case.  Two bbsi.l instructions might seem more optimal
 223         (commented out below).  However, both labels 5: and 3: are out
 224         of the imm8 range, so the assembler relaxes them into
 225         equivalent bbci.l, j combinations, which is actually
 226         slower. */
 227
 228         extui   a9, a10, 0, 2
 229         beqz    a9, 1f          /* branch if both are 4-byte aligned */
 230         bbsi.l  a10, 0, 5f      /* branch if one address is odd */
 231         j       3f              /* one address is 2-byte aligned */
 232
 233 /*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
 234 /*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */
 235
 236 1:
 237         /* src and dst are both 4-byte aligned */
 238         srli    a10, a4, 5      /* 32-byte chunks */
 239 #if XCHAL_HAVE_LOOPS
 240         loopgtz a10, 2f
 241 #else
 242         beqz    a10, 2f
 243         slli    a10, a10, 5
 244         add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
 245 .Loop5:
 246 #endif
 247 SRC(    l32i    a9, a2, 0       )
 248 SRC(    l32i    a8, a2, 4       )
 249 DST(    s32i    a9, a3, 0       )
 250 DST(    s32i    a8, a3, 4       )
 251         ONES_ADD(a5, a9)
 252         ONES_ADD(a5, a8)
 253 SRC(    l32i    a9, a2, 8       )
 254 SRC(    l32i    a8, a2, 12      )
 255 DST(    s32i    a9, a3, 8       )
 256 DST(    s32i    a8, a3, 12      )
 257         ONES_ADD(a5, a9)
 258         ONES_ADD(a5, a8)
 259 SRC(    l32i    a9, a2, 16      )
 260 SRC(    l32i    a8, a2, 20      )
 261 DST(    s32i    a9, a3, 16      )
 262 DST(    s32i    a8, a3, 20      )
 263         ONES_ADD(a5, a9)
 264         ONES_ADD(a5, a8)
 265 SRC(    l32i    a9, a2, 24      )
 266 SRC(    l32i    a8, a2, 28      )
 267 DST(    s32i    a9, a3, 24      )
 268 DST(    s32i    a8, a3, 28      )
 269         ONES_ADD(a5, a9)
 270         ONES_ADD(a5, a8)
 271         addi    a2, a2, 32
 272         addi    a3, a3, 32
 273 #if !XCHAL_HAVE_LOOPS
 274         bltu    a2, a10, .Loop5 /* unsigned: a2 and a10 are addresses */
 275 #endif
 276 2:
 277         extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
 278         extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
 279 #if XCHAL_HAVE_LOOPS
 280         loopgtz a10, 3f
 281 #else
 282         beqz    a10, 3f
 283         slli    a10, a10, 2
 284         add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
 285 .Loop6:
 286 #endif
 287 SRC(    l32i    a9, a2, 0       )
 288 DST(    s32i    a9, a3, 0       )
 289         ONES_ADD(a5, a9)
 290         addi    a2, a2, 4
 291         addi    a3, a3, 4
 292 #if !XCHAL_HAVE_LOOPS
 293         bltu    a2, a10, .Loop6 /* unsigned: a2 and a10 are addresses */
 294 #endif
 295 3:
 296         /*
 297         Control comes to here in two cases: (1) It may fall through
 298         to here from the 4-byte alignment case to process, at most,
 299         one 2-byte chunk.  (2) It branches to here from above if
 300         either src or dst is 2-byte aligned, and we process all bytes
 301         here, except for perhaps a trailing odd byte.  It's
 302         inefficient, so align your addresses to 4-byte boundaries.
 303
 304         a2 = src
 305         a3 = dst
 306         a4 = len
 307         a5 = sum
 308         */
 309         srli    a10, a4, 1      /* 2-byte chunks */
 310 #if XCHAL_HAVE_LOOPS
 311         loopgtz a10, 4f
 312 #else
 313         beqz    a10, 4f
 314         slli    a10, a10, 1
 315         add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
 316 .Loop7:
 317 #endif
 318 SRC(    l16ui   a9, a2, 0       )
 319 DST(    s16i    a9, a3, 0       )
 320         ONES_ADD(a5, a9)
 321         addi    a2, a2, 2
 322         addi    a3, a3, 2
 323 #if !XCHAL_HAVE_LOOPS
 324         bltu    a2, a10, .Loop7 /* unsigned: a2 and a10 are addresses */
 325 #endif
 326 4:
 327         /* This section processes a possible trailing odd byte. */
 328         _bbci.l a4, 0, 8f       /* 1-byte chunk */
 329 SRC(    l8ui    a9, a2, 0       )
 330 DST(    s8i     a9, a3, 0       )
 331 #ifdef __XTENSA_EB__
 332         slli    a9, a9, 8       /* shift byte to bits 8..15 */
 333 #endif
 334         ONES_ADD(a5, a9)
 335 8:
 336         mov     a2, a5          /* return the accumulated sum in a2 */
 337         retw
 338
 339 5:
 340         /* Control branches to here when either src or dst is odd.  We
 341         process all bytes using 8-bit accesses.  Grossly inefficient,
 342         so don't feed us an odd address. */
 343
 344         srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
 345 #if XCHAL_HAVE_LOOPS
 346         loopgtz a10, 6f
 347 #else
 348         beqz    a10, 6f
 349         slli    a10, a10, 1
 350         add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
 351 .Loop8:
 352 #endif
 353 SRC(    l8ui    a9, a2, 0       )
 354 SRC(    l8ui    a8, a2, 1       )
 355 DST(    s8i     a9, a3, 0       )
 356 DST(    s8i     a8, a3, 1       )
 357 #ifdef __XTENSA_EB__
 358         slli    a9, a9, 8       /* combine into a single 16-bit value */
 359 #else                           /* for checksum computation */
 360         slli    a8, a8, 8
 361 #endif
 362         or      a9, a9, a8
 363         ONES_ADD(a5, a9)
 364         addi    a2, a2, 2
 365         addi    a3, a3, 2
 366 #if !XCHAL_HAVE_LOOPS
 367         bltu    a2, a10, .Loop8 /* unsigned: a2 and a10 are addresses */
 368 #endif
 369 6:
 370         j       4b              /* process the possible trailing odd byte */
 371
 372 ENDPROC(csum_partial_copy_generic)
 373
 374
 375 # Exception handler:
 376 .section .fixup, "ax"
 377 /*
 378         a6  = src_err_ptr
 379         a7  = dst_err_ptr
 380         a11 = original len for exception handling
 381         a12 = original dst for exception handling
 382 */
 383
 384 6001:                           /* fixup target of every SRC() access */
 385         _movi   a2, -EFAULT
 386         s32i    a2, a6, 0       /* src_err_ptr */
 387
 388         # clear the complete destination - computing the rest
 389         # is too much work
 390         movi    a2, 0           /* zero byte to store; also the value returned */
 391 #if XCHAL_HAVE_LOOPS
 392         loopgtz a11, 2f
 393 #else
 394         beqz    a11, 2f
 395         add     a11, a11, a12   /* a11 = ending address */
 396 .Leloop:
 397 #endif
 398         s8i     a2, a12, 0
 399         addi    a12, a12, 1
 400 #if !XCHAL_HAVE_LOOPS
 401         bltu    a12, a11, .Leloop       /* unsigned: both are addresses */
 402 #endif
 403 2:
 404         retw
 405
 406 6002:                           /* fixup target of every DST() access */
 407         movi    a2, -EFAULT
 408         s32i    a2, a7, 0       /* dst_err_ptr */
 409         movi    a2, 0           /* return 0 */
 410         retw
 411
 412 .previous