arch/xtensa/lib/checksum.S

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              IP/TCP/UDP checksumming routines
   7  *
   8  * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
   9  *                  Optimized by Joe Taylor
  10  *
  11  *              This program is free software; you can redistribute it and/or
  12  *              modify it under the terms of the GNU General Public License
  13  *              as published by the Free Software Foundation; either version
  14  *              2 of the License, or (at your option) any later version.
  15  */
  16
  17 #include <asm/errno.h>
  18 #include <linux/linkage.h>
  19 #define _ASMLANGUAGE
  20 #include <xtensa/config/core.h>
  21
  22 /*
  23  * computes a partial checksum, e.g. for TCP/UDP fragments
  24  */
  25
  26 /*
  27  * unsigned int csum_partial(const unsigned char *buf, int len,
  28  *                           unsigned int sum);
  29  *    a2 = buf
  30  *    a3 = len
  31  *    a4 = sum
  32  *
  33  * Tuned for 2- or 4-byte aligned buf; an odd buf takes a slow byte-wise path.
  34  */
  35
  36 /*
      * ONES_ADD converts twos-complement math to ones-complement.
      *
      * ONES_ADD(sum, val) performs sum += val with end-around carry: after
      * the 32-bit add, an unsigned result smaller than val means the add
      * wrapped, and 1 is added back.  Each expansion defines local label 99.
      * sum and val must be different registers, otherwise the bgeu test is
      * always true and a carry is never added back.
      */
  37 #define ONES_ADD(sum, val)        \
  38         add     sum, sum, val   ; \
  39         bgeu    sum, val, 99f   ; \
  40         addi    sum, sum, 1     ; \
  41 99:                             ;
  42
  43 .text
  44 ENTRY(csum_partial)
  45           /*
  46            * Experiments with Ethernet and SLIP connections show that buf
  47            * is aligned on either a 2-byte or 4-byte boundary.
                *
                * Register use below: a2 = buf cursor, a3 = len, a4 = running
                * 32-bit ones-complement sum (moved to a2 before returning),
                * a5 = alignment bits, loop count or end address,
                * a6-a8 = loaded data.
  48            */
  49         entry   sp, 32          /* function prologue, 32-byte frame */
  50         extui   a5, a2, 0, 2    /* a5 = buf & 3 */
  51         bnez    a5, 8f          /* branch if buf is not 4-byte aligned */
  52         /* Fall-through on common case, 4-byte alignment */
  53 1:
  54         srli    a5, a3, 5       /* 32-byte chunks */
  55 #if XCHAL_HAVE_LOOPS
  56         loopgtz a5, 2f          /* run the body up to 2: a5 times; skip it if a5 <= 0 */
  57 #else
  58         beqz    a5, 2f
  59         slli    a5, a5, 5
  60         add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
  61 .Loop1:
  62 #endif
  63         l32i    a6, a2, 0       /* eight words per pass, loaded two at a time */
  64         l32i    a7, a2, 4
  65         ONES_ADD(a4, a6)
  66         ONES_ADD(a4, a7)
  67         l32i    a6, a2, 8
  68         l32i    a7, a2, 12
  69         ONES_ADD(a4, a6)
  70         ONES_ADD(a4, a7)
  71         l32i    a6, a2, 16
  72         l32i    a7, a2, 20
  73         ONES_ADD(a4, a6)
  74         ONES_ADD(a4, a7)
  75         l32i    a6, a2, 24
  76         l32i    a7, a2, 28
  77         ONES_ADD(a4, a6)
  78         ONES_ADD(a4, a7)
  79         addi    a2, a2, 4*8     /* advance buf past this 32-byte chunk */
  80 #if !XCHAL_HAVE_LOOPS
  81         blt     a2, a5, .Loop1  /* NOTE(review): blt is a signed compare; assumes the buffer does not cross 0x80000000 */
  82 #endif
  83 2:
  84         extui   a5, a3, 2, 3    /* remaining 4-byte chunks: (len >> 2) & 7 */
  85 #if XCHAL_HAVE_LOOPS
  86         loopgtz a5, 3f
  87 #else
  88         beqz    a5, 3f
  89         slli    a5, a5, 2
  90         add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
  91 .Loop2:
  92 #endif
  93         l32i    a6, a2, 0
  94         ONES_ADD(a4, a6)
  95         addi    a2, a2, 4
  96 #if !XCHAL_HAVE_LOOPS
  97         blt     a2, a5, .Loop2
  98 #endif
  99 3:
 100         _bbci.l a3, 1, 5f       /* remaining 2-byte chunk; skip if len bit 1 is clear */
 101         l16ui   a6, a2, 0
 102         ONES_ADD(a4, a6)
 103         addi    a2, a2, 2
 104 5:
 105         _bbci.l a3, 0, 7f       /* remaining 1-byte chunk; skip if len bit 0 is clear */
 106 6:      l8ui    a6, a2, 0       /* also entered from 8: below when len == 1 */
 107 #ifdef __XTENSA_EB__
 108         slli    a6, a6, 8       /* load byte into bits 8..15 */
 109 #endif
 110         ONES_ADD(a4, a6)
 111 7:
 112         mov     a2, a4          /* return value = accumulated sum */
 113         retw
 114
 115         /* uncommon case, buf is not 4-byte aligned */
 116 8:
 117         beqz    a3, 7b          /* branch if len == 0 */
 118         beqi    a3, 1, 6b       /* branch if len == 1 */
 119
 120         extui   a5, a2, 0, 1    /* a5 = buf & 1 */
 121         bnez    a5, 8f          /* branch if 1-byte aligned */
 122
 123         l16ui   a6, a2, 0       /* common case, len >= 2 */
 124         ONES_ADD(a4, a6)
 125         addi    a2, a2, 2       /* adjust buf */
 126         addi    a3, a3, -2      /* adjust len */
 127         j       1b              /* now buf is 4-byte aligned */
 128
 129         /* case: odd-byte aligned, len > 1
 130          * This case is dog slow, so don't give us an odd address.
 131          * (I don't think this ever happens, but just in case.)
              * Each pass assembles one 32-bit word from a byte, an aligned
              * halfword (a2 + 1 is even here) and another byte.
 132          */
 133 8:
 134         srli    a5, a3, 2       /* 4-byte chunks */
 135 #if XCHAL_HAVE_LOOPS
 136         loopgtz a5, 2f
 137 #else
 138         beqz    a5, 2f
 139         slli    a5, a5, 2
 140         add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
 141 .Loop3:
 142 #endif
 143         l8ui    a6, a2, 0       /* bits 24..31 (big-endian), bits 0..7 (little-endian) */
 144         l16ui   a7, a2, 1       /* bits  8..23 */
 145         l8ui    a8, a2, 3       /* bits  0.. 7 (big-endian), bits 24..31 (little-endian) */
 146 #ifdef  __XTENSA_EB__
 147         slli    a6, a6, 24
 148 #else
 149         slli    a8, a8, 24
 150 #endif
 151         slli    a7, a7, 8
 152         or      a7, a7, a6
 153         or      a7, a7, a8      /* a7 = the four bytes combined into one word */
 154         ONES_ADD(a4, a7)
 155         addi    a2, a2, 4
 156 #if !XCHAL_HAVE_LOOPS
 157         blt     a2, a5, .Loop3
 158 #endif
 159 2:
 160         _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
 161         l8ui    a6, a2, 0
 162         l8ui    a7, a2, 1
 163 #ifdef  __XTENSA_EB__
 164         slli    a6, a6, 8
 165 #else
 166         slli    a7, a7, 8
 167 #endif
 168         or      a7, a7, a6      /* a7 = the two bytes combined into a 16-bit value */
 169         ONES_ADD(a4, a7)
 170         addi    a2, a2, 2
 171 3:
 172         j       5b              /* branch to handle the remaining byte */
 173
 174
 175
 176 /*
 177  * Copy from src to dst while checksumming, otherwise like csum_partial
 178  *
 179  * The macros SRC and DST specify the type of access for the instruction,
 180  * thus we can call a custom exception handler for each access type.
      *
      * Each macro places local label 9999 on the wrapped instruction and
      * emits an __ex_table entry pairing that address with a fixup label:
      * 6001 for SRC (used on the source loads below) and 6002 for DST (used
      * on the destination stores below).
 181  */
 182
 183 #define SRC(y...)                       \
 184         9999: y;                        \
 185         .section __ex_table, "a";       \
 186         .long 9999b, 6001f      ;       \
 187         .previous
 188
 189 #define DST(y...)                       \
 190         9999: y;                        \
 191         .section __ex_table, "a";       \
 192         .long 9999b, 6002f      ;       \
 193         .previous
 194
 195 /*
 196 unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
 197                                         int sum, int *src_err_ptr, int *dst_err_ptr)
 198         a2  = src
 199         a3  = dst
 200         a4  = len
 201         a5  = sum
 202         a6  = src_err_ptr
 203         a7  = dst_err_ptr
 204         a8  = temp
 205         a9  = temp
 206         a10 = temp
 207         a11 = original len for exception handling
 208         a12 = original dst for exception handling
 209
 210     This function is optimized for 4-byte aligned addresses.  Other
 211     alignments work, but not nearly as efficiently.

         Copies len bytes from src to dst while adding them, with end-around
         carry, into the 32-bit sum in a5; the result is returned in a2.
         Every load is wrapped in SRC() and every store in DST(), which pair
         the access with fixup label 6001 or 6002 respectively.  Both fixup
         paths store -EFAULT through the matching error pointer and return 0.

         Dispatch on (src | dst): both 4-byte aligned -> label 1 (word
         copies), any odd address -> label 5 (byte copies), otherwise ->
         label 3 (halfword copies).
 212  */
 213
 214 ENTRY(csum_partial_copy_generic)
 215         entry   sp, 32
 216         mov     a12, a3         /* a12 = original dst, used by the 6001 fixup */
 217         mov     a11, a4         /* a11 = original len, used by the 6001 fixup */
 218         or      a10, a2, a3     /* low bits of a10 = combined src/dst misalignment */
 219
 220         /* We optimize the following alignment tests for the 4-byte
 221         aligned case.  Two bbsi.l instructions might seem more optimal
 222         (commented out below).  However, both labels 5: and 3: are out
 223         of the imm8 range, so the assembler relaxes them into
 224         equivalent bbci.l, j combinations, which is actually
 225         slower. */
 226
 227         extui   a9, a10, 0, 2   /* a9 = (src | dst) & 3 */
 228         beqz    a9, 1f          /* branch if both are 4-byte aligned */
 229         bbsi.l  a10, 0, 5f      /* branch if one address is odd */
 230         j       3f              /* one address is 2-byte aligned */
 231
 232 /*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
 233 /*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */
 234
 235 1:
 236         /* src and dst are both 4-byte aligned */
 237         srli    a10, a4, 5      /* 32-byte chunks */
 238 #if XCHAL_HAVE_LOOPS
 239         loopgtz a10, 2f         /* run the body up to 2: a10 times; skip it if a10 <= 0 */
 240 #else
 241         beqz    a10, 2f
 242         slli    a10, a10, 5
 243         add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
 244 .Loop5:
 245 #endif
 246 SRC(    l32i    a9, a2, 0       )
 247 SRC(    l32i    a8, a2, 4       )
 248 DST(    s32i    a9, a3, 0       )
 249 DST(    s32i    a8, a3, 4       )
 250         ONES_ADD(a5, a9)
 251         ONES_ADD(a5, a8)
 252 SRC(    l32i    a9, a2, 8       )
 253 SRC(    l32i    a8, a2, 12      )
 254 DST(    s32i    a9, a3, 8       )
 255 DST(    s32i    a8, a3, 12      )
 256         ONES_ADD(a5, a9)
 257         ONES_ADD(a5, a8)
 258 SRC(    l32i    a9, a2, 16      )
 259 SRC(    l32i    a8, a2, 20      )
 260 DST(    s32i    a9, a3, 16      )
 261 DST(    s32i    a8, a3, 20      )
 262         ONES_ADD(a5, a9)
 263         ONES_ADD(a5, a8)
 264 SRC(    l32i    a9, a2, 24      )
 265 SRC(    l32i    a8, a2, 28      )
 266 DST(    s32i    a9, a3, 24      )
 267 DST(    s32i    a8, a3, 28      )
 268         ONES_ADD(a5, a9)
 269         ONES_ADD(a5, a8)
 270         addi    a2, a2, 32      /* advance src and dst past this chunk */
 271         addi    a3, a3, 32
 272 #if !XCHAL_HAVE_LOOPS
 273         blt     a2, a10, .Loop5 /* NOTE(review): blt is a signed compare; assumes src does not cross 0x80000000 */
 274 #endif
 275 2:
 276         extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
 277         extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
 278 #if XCHAL_HAVE_LOOPS
 279         loopgtz a10, 3f
 280 #else
 281         beqz    a10, 3f
 282         slli    a10, a10, 2
 283         add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
 284 .Loop6:
 285 #endif
 286 SRC(    l32i    a9, a2, 0       )
 287 DST(    s32i    a9, a3, 0       )
 288         ONES_ADD(a5, a9)
 289         addi    a2, a2, 4
 290         addi    a3, a3, 4
 291 #if !XCHAL_HAVE_LOOPS
 292         blt     a2, a10, .Loop6
 293 #endif
 294 3:
 295         /*
 296         Control comes to here in two cases: (1) It may fall through
 297         to here from the 4-byte alignment case to process, at most,
 298         one 2-byte chunk.  (2) It branches to here from above if
 299         either src or dst is 2-byte aligned, and we process all bytes
 300         here, except for perhaps a trailing odd byte.  It's
 301         inefficient, so align your addresses to 4-byte boundaries.
 302
 303         a2 = src
 304         a3 = dst
 305         a4 = len
 306         a5 = sum
 307         */
 308         srli    a10, a4, 1      /* 2-byte chunks */
 309 #if XCHAL_HAVE_LOOPS
 310         loopgtz a10, 4f
 311 #else
 312         beqz    a10, 4f
 313         slli    a10, a10, 1
 314         add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
 315 .Loop7:
 316 #endif
 317 SRC(    l16ui   a9, a2, 0       )
 318 DST(    s16i    a9, a3, 0       )
 319         ONES_ADD(a5, a9)
 320         addi    a2, a2, 2
 321         addi    a3, a3, 2
 322 #if !XCHAL_HAVE_LOOPS
 323         blt     a2, a10, .Loop7
 324 #endif
 325 4:
 326         /* This section processes a possible trailing odd byte. */
 327         _bbci.l a4, 0, 8f       /* 1-byte chunk; skip if len bit 0 is clear */
 328 SRC(    l8ui    a9, a2, 0       )
 329 DST(    s8i     a9, a3, 0       )
 330 #ifdef __XTENSA_EB__
 331         slli    a9, a9, 8       /* shift byte to bits 8..15 */
 332 #endif
 333         ONES_ADD(a5, a9)
 334 8:
 335         mov     a2, a5          /* return value = accumulated sum */
 336         retw
 337
 338 5:
 339         /* Control branches to here when either src or dst is odd.  We
 340         process all bytes using 8-bit accesses.  Grossly inefficient,
 341         so don't feed us an odd address. */
 342
 343         srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
 344 #if XCHAL_HAVE_LOOPS
 345         loopgtz a10, 6f
 346 #else
 347         beqz    a10, 6f
 348         slli    a10, a10, 1
 349         add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
 350 .Loop8:
 351 #endif
 352 SRC(    l8ui    a9, a2, 0       )
 353 SRC(    l8ui    a8, a2, 1       )
 354 DST(    s8i     a9, a3, 0       )
 355 DST(    s8i     a8, a3, 1       )
 356 #ifdef __XTENSA_EB__
 357         slli    a9, a9, 8       /* combine into a single 16-bit value */
 358 #else                           /* for checksum computation */
 359         slli    a8, a8, 8
 360 #endif
 361         or      a9, a9, a8
 362         ONES_ADD(a5, a9)
 363         addi    a2, a2, 2
 364         addi    a3, a3, 2
 365 #if !XCHAL_HAVE_LOOPS
 366         blt     a2, a10, .Loop8
 367 #endif
 368 6:
 369         j       4b              /* process the possible trailing odd byte */
 370
 371
 372 # Exception handler:
 373 .section .fixup, "ax"
 374 /*
 375         a6  = src_err_ptr
 376         a7  = dst_err_ptr
 377         a11 = original len for exception handling
 378         a12 = original dst for exception handling

         6001 is the fixup label named by SRC() entries, 6002 the one named
         by DST() entries.  Both store -EFAULT through the matching error
         pointer and return with a2 = 0; 6001 additionally zero-fills the
         original len bytes starting at the original dst.
 379 */
 380
 381 6001:
 382         _movi   a2, -EFAULT
 383         s32i    a2, a6, 0       /* *src_err_ptr = -EFAULT */
 384
 385         # clear the complete destination - computing the rest
 386         # is too much work
 387         movi    a2, 0           /* zero to store; a2 is still 0 at the retw below */
 388 #if XCHAL_HAVE_LOOPS
 389         loopgtz a11, 2f         /* one pass per byte of the original len */
 390 #else
 391         beqz    a11, 2f
 392         add     a11, a11, a12   /* a11 = ending address */
 393 .Leloop:
 394 #endif
 395         s8i     a2, a12, 0      /* store one zero byte at the dst cursor */
 396         addi    a12, a12, 1
 397 #if !XCHAL_HAVE_LOOPS
 398         blt     a12, a11, .Leloop
 399 #endif
 400 2:
 401         retw
 402
 403 6002:
 404         movi    a2, -EFAULT
 405         s32i    a2, a7, 0       /* *dst_err_ptr = -EFAULT */
 406         movi    a2, 0           /* return value = 0 */
 407         retw
 408
 409 .previous
 410