arch/sparc/lib/checksum_64.S

   1 /* checksum.S: Sparc V9 optimized checksum code.
   2  *
   3  *  Copyright(C) 1995 Linus Torvalds
   4  *  Copyright(C) 1995 Miguel de Icaza
   5  *  Copyright(C) 1996, 2000 David S. Miller
   6  *  Copyright(C) 1997 Jakub Jelinek
   7  *
   8  * derived from:
   9  *      Linux/Alpha checksum c-code
  10  *      Linux/ix86 inline checksum assembly
  11  *      RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
  12  *      David Mosberger-Tang for optimized reference c-code
  13  *      BSD4.4 portable checksum routine
  14  */
  15
  16         .text
  17
  18 csum_partial_fix_alignment:
             /* Alignment fix-up path, branched to from csum_partial when
              * (%o0 & 3) != 0.  On entry both %icc and %g7 reflect
              * (%o0 & 1); they are set by the andcc sitting in the delay
              * slot of that branch.
              *
              * Consumes at most one leading byte and one leading halfword
              * so that %o0 becomes 4-byte aligned, then rejoins the main
              * path at csum_partial_post_align.  If a halfword is needed
              * but fewer than 2 bytes remain, control goes straight to
              * csum_partial_end_cruft instead.
              */
  19         /* We checked for zero length already, so there must be
  20          * at least one byte.
  21          */
  22         be,pt           %icc, 1f                /* even address: no leading byte to take */
  23          nop
  24         ldub            [%o0 + 0x00], %o4       /* odd address: first byte becomes the initial sum */
  25         add             %o0, 1, %o0
  26         sub             %o1, 1, %o1
  27 1:      andcc           %o0, 0x2, %g0           /* bit 0 is clear by now; is bit 1 set? */
  28         be,pn           %icc, csum_partial_post_align   /* bit 1 clear: already 4-byte aligned */
  29          cmp            %o1, 2                  /* delay slot: runs either way, feeds the blu below */
  30         blu,pn          %icc, csum_partial_end_cruft    /* fewer than 2 bytes left */
  31          nop
  32         lduh            [%o0 + 0x00], %o5
  33         add             %o0, 2, %o0
  34         sub             %o1, 2, %o1
  35         ba,pt           %xcc, csum_partial_post_align
  36          add            %o5, %o4, %o4           /* delay slot: add the halfword to the running sum */
  37
  38         .align          32
  39         .globl          csum_partial
  40 csum_partial:           /* %o0=buff, %o1=len, %o2=sum */
             /* Entry point.  Register roles set up here:
              *   %o4 = running sum, cleared to zero
              *   %g7 = (buff & 1), tested near the end of the routine to
              *         decide whether the 16-bit result is byte-swapped
              *
              * len == 0 branches to csum_partial_finish.  A buffer whose
              * address has either of its low two bits set branches to
              * csum_partial_fix_alignment.  Otherwise execution falls
              * through to csum_partial_post_align.
              */
  41         prefetch        [%o0 + 0x000], #n_reads
  42         clr             %o4
  43         prefetch        [%o0 + 0x040], #n_reads
  44         brz,pn          %o1, csum_partial_finish
  45          andcc          %o0, 0x3, %g0           /* delay slot: alignment test consumed by the bne below */
  46
  47         /* We "remember" whether the lowest bit in the address
  48          * was set in %g7.  Because if it is, we have to swap
  49          * upper and lower 8 bit fields of the sum we calculate.
  50         */
  51         bne,pn          %icc, csum_partial_fix_alignment
  52          andcc          %o0, 0x1, %g7           /* delay slot: runs on both paths; %g7 is 0 on fall-through */
  53
  54 csum_partial_post_align:
             /* Bulk summation.  Reached by fall-through from csum_partial
              * and by branches from csum_partial_fix_alignment; on each of
              * those paths the low two bits of %o0 are zero.
              *   %o0 = current buffer pointer
              *   %o1 = bytes remaining
              *   %o4 = running sum
              *
              * Three stages: an unrolled loop over whole 64-byte chunks,
              * a loop over the remaining whole 32-bit words, then folding
              * of the 64-bit accumulator down to 16 bits.  Afterwards
              * %o1 holds the 0-3 trailing bytes and execution falls into
              * csum_partial_end_cruft.
              */
  55         prefetch        [%o0 + 0x080], #n_reads
  56         andncc          %o1, 0x3f, %o3          /* %o3 = byte count covered by whole 64-byte chunks */
  57
  58         prefetch        [%o0 + 0x0c0], #n_reads
  59         sub             %o1, %o3, %o1           /* %o1 = leftover bytes, fewer than 64 */
  60         brz,pn          %o3, 2f                 /* no whole chunk: skip the unrolled loop */
  61          prefetch       [%o0 + 0x100], #n_reads
  62
  63         /* So that we don't need to use the non-pairing
  64          * add-with-carry instructions we accumulate 32-bit
  65          * values into a 64-bit register.  At the end of the
  66          * loop we fold it down to 32-bits and so on.
  67          */
  68         prefetch        [%o0 + 0x140], #n_reads
             /* Sixteen word loads per iteration, rotating through
              * %o5, %g1, %g2, %g3; each add trails its load by a few
              * instructions.
              */
  69 1:      lduw            [%o0 + 0x00], %o5
  70         lduw            [%o0 + 0x04], %g1
  71         lduw            [%o0 + 0x08], %g2
  72         add             %o4, %o5, %o4
  73         lduw            [%o0 + 0x0c], %g3
  74         add             %o4, %g1, %o4
  75         lduw            [%o0 + 0x10], %o5
  76         add             %o4, %g2, %o4
  77         lduw            [%o0 + 0x14], %g1
  78         add             %o4, %g3, %o4
  79         lduw            [%o0 + 0x18], %g2
  80         add             %o4, %o5, %o4
  81         lduw            [%o0 + 0x1c], %g3
  82         add             %o4, %g1, %o4
  83         lduw            [%o0 + 0x20], %o5
  84         add             %o4, %g2, %o4
  85         lduw            [%o0 + 0x24], %g1
  86         add             %o4, %g3, %o4
  87         lduw            [%o0 + 0x28], %g2
  88         add             %o4, %o5, %o4
  89         lduw            [%o0 + 0x2c], %g3
  90         add             %o4, %g1, %o4
  91         lduw            [%o0 + 0x30], %o5
  92         add             %o4, %g2, %o4
  93         lduw            [%o0 + 0x34], %g1
  94         add             %o4, %g3, %o4
  95         lduw            [%o0 + 0x38], %g2
  96         add             %o4, %o5, %o4
  97         lduw            [%o0 + 0x3c], %g3
  98         add             %o4, %g1, %o4
  99         prefetch        [%o0 + 0x180], #n_reads
 100         add             %o4, %g2, %o4
 101         subcc           %o3, 0x40, %o3          /* one 64-byte chunk consumed */
 102         add             %o0, 0x40, %o0
 103         bne,pt          %icc, 1b
 104          add            %o4, %g3, %o4           /* delay slot: last word of this chunk */
 105
 106 2:      and             %o1, 0x3c, %o3          /* %o3 = leftover bytes that form whole words */
 107         brz,pn          %o3, 2f
 108          sub            %o1, %o3, %o1           /* delay slot: runs either way; %o1 = trailing bytes (0-3) */
 109 1:      lduw            [%o0 + 0x00], %o5
 110         subcc           %o3, 0x4, %o3
 111         add             %o0, 0x4, %o0
 112         bne,pt          %icc, 1b
 113          add            %o4, %o5, %o4           /* delay slot: accumulate the word just loaded */
 114
 115 2:
 116         /* fold 64-->32 */
 117         srlx            %o4, 32, %o5            /* %o5 = upper 32 bits */
 118         srl             %o4, 0, %o4             /* keep only the lower 32 bits */
 119         add             %o4, %o5, %o4
 120         srlx            %o4, 32, %o5            /* second pass absorbs the carry out of the first */
 121         srl             %o4, 0, %o4
 122         add             %o4, %o5, %o4
 123
 124         /* fold 32-->16 */
 125         sethi           %hi(0xffff0000), %g1    /* %g1 = mask of the upper 16 bits */
 126         srl             %o4, 16, %o5            /* %o5 = upper halfword */
 127         andn            %o4, %g1, %g2           /* %g2 = lower halfword */
 128         add             %o5, %g2, %o4
 129         srl             %o4, 16, %o5            /* second pass absorbs the carry out of the first */
 130         andn            %o4, %g1, %g2
 131         add             %o5, %g2, %o4
 132
 133 csum_partial_end_cruft:
             /* Tail handling and final combination.  Reached by
              * fall-through after the folds above, and by a branch from
              * csum_partial_fix_alignment when fewer than 2 bytes remain.
              *   %o0 = pointer to the remaining bytes
              *   %o1 = number of remaining bytes
              *   %o2 = caller-supplied sum
              *   %g7 = non-zero if the buffer started on an odd address
              *
              * Adds an optional trailing halfword and an optional trailing
              * byte, refolds to 16 bits, byte-swaps the result when %g7 is
              * non-zero, adds it to %o2 with the carry wrapped back in,
              * and falls through to csum_partial_finish.
              */
 134         /* %o4 has the 16-bit sum we have calculated so-far.  */
 135         cmp             %o1, 2
 136         blu,pt          %icc, 1f                /* fewer than 2 bytes: no trailing halfword */
 137          nop
 138         lduh            [%o0 + 0x00], %o5
 139         sub             %o1, 2, %o1
 140         add             %o0, 2, %o0
 141         add             %o4, %o5, %o4
 142 1:      brz,pt          %o1, 1f                 /* nothing left: skip the trailing byte */
 143          nop
 144         ldub            [%o0 + 0x00], %o5
 145         sub             %o1, 1, %o1
 146         add             %o0, 1, %o0
 147         sllx            %o5, 8, %o5             /* lone byte is summed as the upper byte of a halfword */
 148         add             %o4, %o5, %o4
 149 1:
 150         /* fold 32-->16 */
 151         sethi           %hi(0xffff0000), %g1    /* %g1 = mask of the upper 16 bits */
 152         srl             %o4, 16, %o5
 153         andn            %o4, %g1, %g2
 154         add             %o5, %g2, %o4
 155         srl             %o4, 16, %o5            /* second pass absorbs the carry out of the first */
 156         andn            %o4, %g1, %g2
 157         add             %o5, %g2, %o4
 158
             /* NOTE(review): no branch in this file appears to target the
              * "1:" label on the next line; it is reached by fall-through.
              */
 159 1:      brz,pt          %g7, 1f                 /* even start address: no swap needed */
 160          nop
 161
 162         /* We started with an odd byte, byte-swap the result.  */
 163         srl             %o4, 8, %o5             /* upper byte moves down */
 164         and             %o4, 0xff, %g1
 165         sll             %g1, 8, %g1             /* lower byte moves up */
 166         or              %o5, %g1, %o4
 167
 168 1:      addcc           %o2, %o4, %o2           /* add to the caller's sum, setting the carry flag */
 169         addc            %g0, %o2, %o2           /* wrap the carry back into the sum */
 170
 171 csum_partial_finish:
             /* Common exit.  Reached by fall-through once the sum has been
              * combined into %o2, and directly from csum_partial when
              * len == 0, in which case %o2 still holds the caller's sum.
              */
 172         retl
 173          srl            %o2, 0, %o0             /* delay slot: return value = lower 32 bits of %o2 */