arch/sparc/lib/checksum_64.S

   1 /* checksum.S: Sparc V9 optimized checksum code.
   2  *
   3  *  Copyright(C) 1995 Linus Torvalds
   4  *  Copyright(C) 1995 Miguel de Icaza
   5  *  Copyright(C) 1996, 2000 David S. Miller
   6  *  Copyright(C) 1997 Jakub Jelinek
   7  *
   8  * derived from:
   9  *      Linux/Alpha checksum c-code
  10  *      Linux/ix86 inline checksum assembly
  11  *      RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
  12  *      David Mosberger-Tang for optimized reference c-code
  13  *      BSD4.4 portable checksum routine
  14  */
  15
  16 #include <asm/export.h>
  17         .text
  18
  19 csum_partial_fix_alignment:
             /* Out-of-line path taken by csum_partial when buff (%o0) is
              * not 4-byte aligned.  It consumes at most one leading byte
              * and one leading halfword into the accumulator %o4, keeping
              * the pointer %o0 and the remaining length %o1 in step, and
              * then rejoins the main flow at csum_partial_post_align (or
              * at csum_partial_end_cruft when fewer than 2 bytes remain).
              *
              * On entry the integer condition codes still hold the result
              * of the "andcc %o0, 0x1, %g7" that sits in the delay slot of
              * the branch leading here: Z set means the address is even.
              */
  20         /* We checked for zero length already, so there must be
  21          * at least one byte.
  22          */
  23         be,pt           %icc, 1f
  24          nop
             /* Odd address: take the single leading byte.  The load
              * overwrites %o4, which csum_partial had cleared.
              */
  25         ldub            [%o0 + 0x00], %o4
  26         add             %o0, 1, %o0
  27         sub             %o1, 1, %o1
             /* The address is even here.  If bit 1 is clear as well it is
              * 4-byte aligned and we can rejoin immediately.  The cmp is in
              * the delay slot, so it runs whichever way the branch goes and
              * supplies the condition codes for the blu that follows.
              */
  28 1:      andcc           %o0, 0x2, %g0
  29         be,pn           %icc, csum_partial_post_align
  30          cmp            %o1, 2
             /* Fewer than 2 bytes left: there is no halfword to read, so
              * hand whatever remains to the tail code.
              */
  31         blu,pn          %icc, csum_partial_end_cruft
  32          nop
             /* Consume one halfword to reach 4-byte alignment.  Its value
              * is added into %o4 in the delay slot of the branch back.
              */
  33         lduh            [%o0 + 0x00], %o5
  34         add             %o0, 2, %o0
  35         sub             %o1, 2, %o1
  36         ba,pt           %xcc, csum_partial_post_align
  37          add            %o5, %o4, %o4
  38
  39         .align          32
  40         .globl          csum_partial
  41         .type           csum_partial,#function
  42         EXPORT_SYMBOL(csum_partial)
             /* csum_partial: add the 16-bit one's-complement sum of the
              * len bytes starting at buff into sum.
              *
              * In:   %o0 = buff, %o1 = len, %o2 = sum
              * Out:  %o0 = resulting sum, zero-extended from 32 bits
              * Uses: %o3, %o4, %o5, %g1, %g2, %g3, %g7 and the condition
              *       codes; %o0-%o2 are modified as well.
              *
              * Outline:
              *   1. Bring buff to 4-byte alignment (out of line, in
              *      csum_partial_fix_alignment).
              *   2. Sum 64-byte blocks, then single 32-bit words, into the
              *      64-bit accumulator %o4.
              *   3. Fold %o4 to 16 bits, add the 0-3 trailing bytes and
              *      fold again.
              *   4. Byte-swap the 16-bit result if buff started on an odd
              *      address, then add it to sum with end-around carry.
              *
              * The routine returns with retl and contains no save/restore
              * or stack accesses.
              */
  43 csum_partial:           /* %o0=buff, %o1=len, %o2=sum */
  44         prefetch        [%o0 + 0x000], #n_reads
             /* %o4 is the running accumulator.  */
  45         clr             %o4
  46         prefetch        [%o0 + 0x040], #n_reads
             /* len == 0: nothing to add, return the incoming sum.  The
              * andcc in the delay slot tests the low two address bits for
              * the alignment branch further down.
              */
  47         brz,pn          %o1, csum_partial_finish
  48          andcc          %o0, 0x3, %g0
  49
  50         /* We "remember" whether the lowest bit in the address
  51          * was set in %g7.  Because if it is, we have to swap
  52          * upper and lower 8 bit fields of the sum we calculate.
  53         */
             /* The delay-slot andcc both records (buff & 1) in %g7 and
              * sets the condition codes that csum_partial_fix_alignment
              * tests first.  It executes on the aligned path too, so %g7
              * is always valid by the time it is read near the end.
              */
  54         bne,pn          %icc, csum_partial_fix_alignment
  55          andcc          %o0, 0x1, %g7
  56
  57 csum_partial_post_align:
             /* %o3 = len rounded down to a multiple of 64 (bytes handled
              * by the unrolled loop); %o1 = the remaining 0..63 bytes.
              */
  58         prefetch        [%o0 + 0x080], #n_reads
  59         andncc          %o1, 0x3f, %o3
  60
  61         prefetch        [%o0 + 0x0c0], #n_reads
  62         sub             %o1, %o3, %o1
             /* No full 64-byte block: skip the unrolled loop.  */
  63         brz,pn          %o3, 2f
  64          prefetch       [%o0 + 0x100], #n_reads
  65
  66         /* So that we don't need to use the non-pairing
  67          * add-with-carry instructions we accumulate 32-bit
  68          * values into a 64-bit register.  At the end of the
  69          * loop we fold it down to 32-bits and so on.
  70          */
  71         prefetch        [%o0 + 0x140], #n_reads
             /* Unrolled loop: 16 words (64 bytes) per iteration.  Each
              * lduw result is added into %o4 a few instructions after it
              * is loaded, rotating through %o5, %g1, %g2 and %g3 as
              * temporaries.  Each iteration also prefetches 0x180 bytes
              * beyond the current block.
              */
  72 1:      lduw            [%o0 + 0x00], %o5
  73         lduw            [%o0 + 0x04], %g1
  74         lduw            [%o0 + 0x08], %g2
  75         add             %o4, %o5, %o4
  76         lduw            [%o0 + 0x0c], %g3
  77         add             %o4, %g1, %o4
  78         lduw            [%o0 + 0x10], %o5
  79         add             %o4, %g2, %o4
  80         lduw            [%o0 + 0x14], %g1
  81         add             %o4, %g3, %o4
  82         lduw            [%o0 + 0x18], %g2
  83         add             %o4, %o5, %o4
  84         lduw            [%o0 + 0x1c], %g3
  85         add             %o4, %g1, %o4
  86         lduw            [%o0 + 0x20], %o5
  87         add             %o4, %g2, %o4
  88         lduw            [%o0 + 0x24], %g1
  89         add             %o4, %g3, %o4
  90         lduw            [%o0 + 0x28], %g2
  91         add             %o4, %o5, %o4
  92         lduw            [%o0 + 0x2c], %g3
  93         add             %o4, %g1, %o4
  94         lduw            [%o0 + 0x30], %o5
  95         add             %o4, %g2, %o4
  96         lduw            [%o0 + 0x34], %g1
  97         add             %o4, %g3, %o4
  98         lduw            [%o0 + 0x38], %g2
  99         add             %o4, %o5, %o4
 100         lduw            [%o0 + 0x3c], %g3
 101         add             %o4, %g1, %o4
 102         prefetch        [%o0 + 0x180], #n_reads
 103         add             %o4, %g2, %o4
             /* Count the block off %o3 and advance the pointer; the last
              * add of the iteration rides in the branch delay slot.
              */
 104         subcc           %o3, 0x40, %o3
 105         add             %o0, 0x40, %o0
 106         bne,pt          %icc, 1b
 107          add            %o4, %g3, %o4
 108
             /* Word loop: %o3 = bytes belonging to whole 32-bit words in
              * the remainder (a multiple of 4, at most 60); %o1 is left
              * with the final 0..3 bytes.
              */
 109 2:      and             %o1, 0x3c, %o3
 110         brz,pn          %o3, 2f
 111          sub            %o1, %o3, %o1
 112 1:      lduw            [%o0 + 0x00], %o5
 113         subcc           %o3, 0x4, %o3
 114         add             %o0, 0x4, %o0
 115         bne,pt          %icc, 1b
 116          add            %o4, %o5, %o4
 117
 118 2:
             /* Add the upper 32 bits of %o4 to the lower 32 bits.  The
              * step is done twice because the first add can itself carry
              * into bit 32.
              */
 119         /* fold 64-->32 */
 120         srlx            %o4, 32, %o5
 121         srl             %o4, 0, %o4
 122         add             %o4, %o5, %o4
 123         srlx            %o4, 32, %o5
 124         srl             %o4, 0, %o4
 125         add             %o4, %o5, %o4
 126
             /* %g1 = 0xffff0000, so "andn %o4, %g1" keeps the low 16 bits
              * and "srl %o4, 16" yields the high 16 bits.  Done twice for
              * the same carry reason as above.
              */
 127         /* fold 32-->16 */
 128         sethi           %hi(0xffff0000), %g1
 129         srl             %o4, 16, %o5
 130         andn            %o4, %g1, %g2
 131         add             %o5, %g2, %o4
 132         srl             %o4, 16, %o5
 133         andn            %o4, %g1, %g2
 134         add             %o5, %g2, %o4
 135
 136 csum_partial_end_cruft:
 137         /* %o4 has the 16-bit sum we have calculated so-far.  */
             /* Also entered directly from csum_partial_fix_alignment when
              * fewer than 2 bytes remain after the leading odd byte.
              * At least 2 bytes left: add one more halfword.
              */
 138         cmp             %o1, 2
 139         blu,pt          %icc, 1f
 140          nop
 141         lduh            [%o0 + 0x00], %o5
 142         sub             %o1, 2, %o1
 143         add             %o0, 2, %o0
 144         add             %o4, %o5, %o4
             /* One byte left: it is shifted up by 8 so that it occupies
              * the upper byte of a halfword before being added.
              */
 145 1:      brz,pt          %o1, 1f
 146          nop
 147         ldub            [%o0 + 0x00], %o5
 148         sub             %o1, 1, %o1
 149         add             %o0, 1, %o0
 150         sllx            %o5, 8, %o5
 151         add             %o4, %o5, %o4
 152 1:
             /* The tail additions may have pushed %o4 past 16 bits, so
              * fold once more.
              */
 153         /* fold 32-->16 */
 154         sethi           %hi(0xffff0000), %g1
 155         srl             %o4, 16, %o5
 156         andn            %o4, %g1, %g2
 157         add             %o5, %g2, %o4
 158         srl             %o4, 16, %o5
 159         andn            %o4, %g1, %g2
 160         add             %o5, %g2, %o4
 161
             /* %g7 holds (buff & 1) as recorded at entry; zero means the
              * buffer started on an even address and no swap is needed.
              */
 162 1:      brz,pt          %g7, 1f
 163          nop
 164
 165         /* We started with an odd byte, byte-swap the result.  */
 166         srl             %o4, 8, %o5
 167         and             %o4, 0xff, %g1
 168         sll             %g1, 8, %g1
 169         or              %o5, %g1, %o4
 170
             /* Add the 16-bit result to the caller's sum as a 32-bit
              * quantity; addc feeds the carry-out of the addcc back in.
              */
 171 1:      addcc           %o2, %o4, %o2
 172         addc            %g0, %o2, %o2
 173
 174 csum_partial_finish:
             /* Common exit, also reached directly when len == 0.  The
              * delay-slot srl moves the low 32 bits of %o2 into %o0 with
              * the upper 32 bits cleared.
              */
 175         retl
 176          srl            %o2, 0, %o0