arch/sparc/lib/checksum_32.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /* checksum.S: Sparc optimized checksum code.
   3  *
   4  *  Copyright(C) 1995 Linus Torvalds
   5  *  Copyright(C) 1995 Miguel de Icaza
   6  *  Copyright(C) 1996 David S. Miller
   7  *  Copyright(C) 1997 Jakub Jelinek
   8  *
   9  * derived from:
  10  *      Linux/Alpha checksum c-code
  11  *      Linux/ix86 inline checksum assembly
  12  *      RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
  13  *      David Mosberger-Tang for optimized reference c-code
  14  *      BSD4.4 portable checksum routine
  15  */
  16
  17 #include <asm/errno.h>
  18 #include <asm/export.h>
  19
   /* CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5):
    * add the 32 bytes at [buf + offset] into sum.
    *
    * The data is fetched with four ldd loads, each of which fills a pair
    * of registers (t0/t1, t2/t3, t4/t5 and then t0/t1 again).  Every add
    * is an addxcc: the carry flag present on entry is folded into the
    * first add, and the carry out of the last add is left in the flags.
    * The macro never sinks that final carry; the code after the last
    * invocation has to do it.
    */
   20 #define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \
   21         ldd     [buf + offset + 0x00], t0;                      \
   22         ldd     [buf + offset + 0x08], t2;                      \
   23         addxcc  t0, sum, sum;                                   \
   24         addxcc  t1, sum, sum;                                   \
   25         ldd     [buf + offset + 0x10], t4;                      \
   26         addxcc  t2, sum, sum;                                   \
   27         addxcc  t3, sum, sum;                                   \
   28         ldd     [buf + offset + 0x18], t0;                      \
   29         addxcc  t4, sum, sum;                                   \
   30         addxcc  t5, sum, sum;                                   \
   31         addxcc  t0, sum, sum;                                   \
   32         addxcc  t1, sum, sum;
  33
   /* CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3):
    * add the 16 bytes that start at [buf - offset - 8] into sum.
    *
    * Addresses are negative offsets from buf.  Two ldd loads fill the
    * pairs t0/t1 and t2/t3.  As in CSUM_BIGCHUNK every add is an addxcc,
    * so the carry flag flows in from before the macro and out to whatever
    * follows it.  The expansion is exactly 6 instructions.
    */
   34 #define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3)        \
   35         ldd     [buf - offset - 0x08], t0;                      \
   36         ldd     [buf - offset - 0x00], t2;                      \
   37         addxcc  t0, sum, sum;                                   \
   38         addxcc  t1, sum, sum;                                   \
   39         addxcc  t2, sum, sum;                                   \
   40         addxcc  t3, sum, sum;
  41
   42         /* Do end cruft out of band to get better cache patterns. */
              /* csum_partial_end_cruft: add the last (%o1 & 0xf) bytes at %o0 to
               * the sum in %o2 and return the result in %o0.
               *
               * Reached from cpte, whose delay slot has already executed
               * "andcc %o1, 8, %g0"; the first branch below consumes those flags.
               * An 8-byte group is added if bit 3 of %o1 is set and a 4-byte word
               * if bit 2 is set.  The last 1-3 bytes are merged into %o4 first:
               * a lone halfword is used as loaded, a halfword followed by a byte
               * becomes (hword << 16) | (byte << 8), and a lone byte becomes
               * byte << 8.  %o1 is reduced to its low two bits along the way.
               */
   43 csum_partial_end_cruft:
   44         be      1f                              ! caller asks %o1 & 0x8
   45          andcc  %o1, 4, %g0                     ! nope, check for word remaining
   46         ldd     [%o0], %g2                      ! load two
   47         addcc   %g2, %o2, %o2                   ! add first word to sum
   48         addxcc  %g3, %o2, %o2                   ! add second word as well
   49         add     %o0, 8, %o0                     ! advance buf ptr
   50         addx    %g0, %o2, %o2                   ! add in final carry
   51         andcc   %o1, 4, %g0                     ! check again for word remaining
   52 1:      be      1f                              ! nope, skip this code
   53          andcc  %o1, 3, %o1                     ! check for trailing bytes
   54         ld      [%o0], %g2                      ! load it
   55         addcc   %g2, %o2, %o2                   ! add to sum
   56         add     %o0, 4, %o0                     ! advance buf ptr
   57         addx    %g0, %o2, %o2                   ! add in final carry
   58         andcc   %o1, 3, %g0                     ! check again for trailing bytes
   59 1:      be      1f                              ! no trailing bytes, return
   60          addcc  %o1, -1, %g0                    ! only one byte remains?
   61         bne     2f                              ! at least two bytes more
   62          subcc  %o1, 2, %o1                     ! only two bytes more?
   63         b       4f                              ! only one byte remains
   64          or     %g0, %g0, %o4                   ! clear fake hword value
   65 2:      lduh    [%o0], %o4                      ! get hword
   66         be      6f                              ! jmp if only hword remains
   67          add    %o0, 2, %o0                     ! advance buf ptr either way
   68         sll     %o4, 16, %o4                    ! create upper hword
   69 4:      ldub    [%o0], %o5                      ! get final byte
   70         sll     %o5, 8, %o5                     ! put into place
   71         or      %o5, %o4, %o4                   ! coalesce with hword (if any)
   72 6:      addcc   %o4, %o2, %o2                   ! add to sum
   73 1:      retl                                    ! get outta here
   74          addx   %g0, %o2, %o0                   ! add final carry into retval
  75
   76         /* Also do alignment out of band to get better cache patterns. */
              /* csum_partial_fix_alignment: entered from csum_partial when the low
               * three bits of %o0 are not all zero.
               *
               * With fewer than 6 bytes (signed compare) nothing is aligned here;
               * control goes to the instruction just before cpte, which re-tests
               * len & 0xf.  Otherwise a leading halfword is consumed if bit 1 of
               * %o0 is set and a leading word if bit 2 is set, and the main path
               * is rejoined at cpa with %o3 and the flags freshly computed from
               * "andcc %o1, 0xffffff80, %o3".
               *
               * NOTE(review): bit 0 of %o0 is never examined on this path, so the
               * code appears to assume buf is at least halfword aligned.
               */
   77 csum_partial_fix_alignment:
   78         cmp     %o1, 6                          ! fewer than 6 bytes?
   79         bl      cpte - 0x4                      ! yes, go straight to the tail test
   80          andcc  %o0, 0x2, %g0                   ! delay slot: bit 1 of buf set?
   81         be      1f                              ! no, no halfword to peel off
   82          andcc  %o0, 0x4, %g0                   ! delay slot: bit 2 of buf set?
   83         lduh    [%o0 + 0x00], %g2               ! get leading hword
   84         sub     %o1, 2, %o1                     ! len -= 2
   85         add     %o0, 2, %o0                     ! advance buf ptr
   86         sll     %g2, 16, %g2                    ! move hword to the upper half
   87         addcc   %g2, %o2, %o2                   ! add to sum, carry left in flags
   88         srl     %o2, 16, %g3                    ! upper half of the sum
   89         addx    %g0, %g3, %g2                   ! upper half plus carry
   90         sll     %o2, 16, %o2                    ! first step of isolating lower half
   91         sll     %g2, 16, %g3                    ! (upper half + carry) << 16
   92         srl     %o2, 16, %o2                    ! lower half of the sum
   93         andcc   %o0, 0x4, %g0                   ! test bit 2 again on the advanced ptr
   94         or      %g3, %o2, %o2                   ! sum = new upper half | lower half
   95 1:      be      cpa                             ! bit 2 clear, rejoin main path
   96          andcc  %o1, 0xffffff80, %o3            ! delay slot: loop byte count for cpa
   97         ld      [%o0 + 0x00], %g2               ! get leading word
   98         sub     %o1, 4, %o1                     ! len -= 4
   99         addcc   %g2, %o2, %o2                   ! add to sum
  100         add     %o0, 4, %o0                     ! advance buf ptr
  101         addx    %g0, %o2, %o2                   ! add in final carry
  102         b       cpa                             ! rejoin main path
  103          andcc  %o1, 0xffffff80, %o3            ! delay slot: loop byte count for cpa
 104
  105         /* The common case is to get called with a nicely aligned
  106          * buffer of size 0x20.  Follow the code path for that case.
  107          */
              /* csum_partial: add len bytes at buf into sum and return the
               * result in %o0.
               *
               * Register use, as read from the code below:
               *   %o0  buf on entry, result on return
               *   %o1  len
               *   %o2  running sum
               *   %o3  len & 0xffffff80, bytes left for the 128-byte loop
               *   %g1  len & 0x70, bytes handled through the cptbl table
               *   %g7  jump-table pointer
               *   %o4, %o5, %g2-%g5  scratch for the chunk macros
               *
               * A buffer whose low three address bits are not zero is first
               * passed through csum_partial_fix_alignment, which comes back
               * in at cpa.
               *
               * Each CSUM_LASTCHUNK expands to 6 instructions (24 bytes of code)
               * and sums 16 bytes of data, so the table is entered at
               * (cpte - 8) - %g1 - %g1/2.  That executes exactly %g1/16 chunks
               * before falling into the addx at cpte - 8.  Whatever remains in
               * len & 0xf is handled by csum_partial_end_cruft.
               */
  108         .globl  csum_partial
  109         EXPORT_SYMBOL(csum_partial)
  110 csum_partial:                   /* %o0=buf, %o1=len, %o2=sum */
  111         andcc   %o0, 0x7, %g0                           ! alignment problems?
  112         bne     csum_partial_fix_alignment              ! yep, handle it
  113          sethi  %hi(cpte - 8), %g7                      ! prepare table jmp ptr
  114         andcc   %o1, 0xffffff80, %o3                    ! num loop iterations
  115 cpa:    be      3f                                      ! none to do
  116          andcc  %o1, 0x70, %g1                          ! clears carry flag too
  117 5:      CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
  118         CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
  119         CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
  120         CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
  121         addx    %g0, %o2, %o2                           ! sink in final carry
  122         subcc   %o3, 128, %o3                           ! detract from loop iters
  123         bne     5b                                      ! more to do
  124          add    %o0, 128, %o0                           ! advance buf ptr
  125         andcc   %o1, 0x70, %g1                          ! clears carry flag too
  126 3:      be      cpte                                    ! nope
  127          andcc  %o1, 0xf, %g0                           ! anything left at all?
  128         srl     %g1, 1, %o4                             ! compute offset
  129         sub     %g7, %g1, %g7                           ! adjust jmp ptr
  130         sub     %g7, %o4, %g7                           ! final jmp ptr adjust
  131         jmp     %g7 + %lo(cpte - 8)                     ! enter the table
  132          add    %o0, %g1, %o0                           ! advance buf ptr
  133 cptbl:  CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
  134         CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
  135         CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
  136         CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
  137         CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
  138         CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
  139         CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
  140         addx    %g0, %o2, %o2                           ! fetch final carry
  141         andcc   %o1, 0xf, %g0                           ! anything left at all?
  142 cpte:   bne     csum_partial_end_cruft                  ! yep, handle it
  143          andcc  %o1, 8, %g0                             ! check how much
  144 cpout:  retl                                            ! get outta here
  145          mov    %o2, %o0                                ! return computed csum
 146
  147 /* Work around cpp -rob */
      /* ALLOC and EXECINSTR stand in for the section flags #alloc and
       * #execinstr, so the '#' never has to appear inside the body of the
       * function-like macro below.  EXECINSTR is not referenced in the code
       * visible here.
       */
  148 #define ALLOC #alloc
  149 #define EXECINSTR #execinstr
      /* EX(x,y): emit the instruction "x,y" at local label 98 and record a
       * two-word __ex_table entry { address of that instruction, cc_fault },
       * then switch back to .text.  The instruction is passed as two macro
       * arguments because its operand list contains a comma.
       */
  150 #define EX(x,y)                                 \
  151 98:     x,y;                                    \
  152         .section __ex_table,ALLOC;              \
  153         .align  4;                              \
  154         .word   98b, cc_fault;                   \
  155         .text;                                  \
  156         .align  4
 157
  158         /* This aligned version executes typically in 8.5 superscalar cycles, this
  159          * is the best I can do.  I say 8.5 because the final add will pair with
  160          * the next ldd in the main unrolled loop.  Thus the pipe is always full.
  161          * If you change these macros (including order of instructions),
  162          * please check the fixup code below as well.
  163          */
              /* CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0..t7):
               * copy the 32 bytes at [src + off] to [dst + off] and add them
               * into sum.
               *
               * Loads and stores are both doubleword (ldd/std) and work on the
               * register pairs t0/t1, t2/t3, t4/t5 and t6/t7.  Every memory access
               * is wrapped in EX() so it gets an exception-table entry.  All adds
               * are addxcc, so the carry flag flows in from before the macro and
               * out to the code after it.
               */
  164 #define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)   \
  165         EX(ldd  [src + off + 0x00], t0);                                                \
  166         EX(ldd  [src + off + 0x08], t2);                                                \
  167         addxcc  t0, sum, sum;                                                           \
  168         EX(ldd  [src + off + 0x10], t4);                                                \
  169         addxcc  t1, sum, sum;                                                           \
  170         EX(ldd  [src + off + 0x18], t6);                                                \
  171         addxcc  t2, sum, sum;                                                           \
  172         EX(std  t0, [dst + off + 0x00]);                                                \
  173         addxcc  t3, sum, sum;                                                           \
  174         EX(std  t2, [dst + off + 0x08]);                                                \
  175         addxcc  t4, sum, sum;                                                           \
  176         EX(std  t4, [dst + off + 0x10]);                                                \
  177         addxcc  t5, sum, sum;                                                           \
  178         EX(std  t6, [dst + off + 0x18]);                                                \
  179         addxcc  t6, sum, sum;                                                           \
  180         addxcc  t7, sum, sum;
 181
  182         /* 12 superscalar cycles seems to be the limit for this case,
  183          * because of this we thus do all the ldd's together to get
  184          * Viking MXCC into streaming mode.  Ho hum...
  185          */
              /* CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0..t7):
               * copy the 32 bytes at [src + off] to [dst + off] and add them
               * into sum.
               *
               * The source is read with four ldd loads into the pairs t0/t1,
               * t2/t3, t4/t5 and t6/t7; the destination is written with eight
               * word-sized st stores.  Every memory access is wrapped in EX().
               * All adds are addxcc, so the carry flag flows through the macro.
               * The first instruction is the same ldd as the first instruction of
               * CSUMCOPY_BIGCHUNK_ALIGNED.
               */
  186 #define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)   \
  187         EX(ldd  [src + off + 0x00], t0);                                        \
  188         EX(ldd  [src + off + 0x08], t2);                                        \
  189         EX(ldd  [src + off + 0x10], t4);                                        \
  190         EX(ldd  [src + off + 0x18], t6);                                        \
  191         EX(st   t0, [dst + off + 0x00]);                                        \
  192         addxcc  t0, sum, sum;                                                   \
  193         EX(st   t1, [dst + off + 0x04]);                                        \
  194         addxcc  t1, sum, sum;                                                   \
  195         EX(st   t2, [dst + off + 0x08]);                                        \
  196         addxcc  t2, sum, sum;                                                   \
  197         EX(st   t3, [dst + off + 0x0c]);                                        \
  198         addxcc  t3, sum, sum;                                                   \
  199         EX(st   t4, [dst + off + 0x10]);                                        \
  200         addxcc  t4, sum, sum;                                                   \
  201         EX(st   t5, [dst + off + 0x14]);                                        \
  202         addxcc  t5, sum, sum;                                                   \
  203         EX(st   t6, [dst + off + 0x18]);                                        \
  204         addxcc  t6, sum, sum;                                                   \
  205         EX(st   t7, [dst + off + 0x1c]);                                        \
  206         addxcc  t7, sum, sum;
 207
  208         /* Yuck, 6 superscalar cycles... */
              /* CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3):
               * copy the 16 bytes that start at [src - off - 8] to the same
               * negative offset from dst and add them into sum.
               *
               * Two ldd loads fill t0/t1 and t2/t3; four word stores write them
               * out.  Every memory access is wrapped in EX().  The expansion is
               * exactly 10 instructions in .text, which the computed jump into
               * cctbl relies on.
               */
  209 #define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3)  \
  210         EX(ldd  [src - off - 0x08], t0);                        \
  211         EX(ldd  [src - off - 0x00], t2);                        \
  212         addxcc  t0, sum, sum;                                   \
  213         EX(st   t0, [dst - off - 0x08]);                        \
  214         addxcc  t1, sum, sum;                                   \
  215         EX(st   t1, [dst - off - 0x04]);                        \
  216         addxcc  t2, sum, sum;                                   \
  217         EX(st   t2, [dst - off - 0x00]);                        \
  218         addxcc  t3, sum, sum;                                   \
  219         EX(st   t3, [dst - off + 0x04]);
 220
  221         /* Handle the end cruft code out of band for better cache patterns. */
              /* cc_end_cruft: copy the last (%o3 & 0xf) bytes from %o0 to %o1,
               * add them to the sum in %g7 and return the result in %o0.
               *
               * Reached from ccte, whose delay slot has already executed
               * "andcc %o3, 8, %g0"; the first branch below consumes those flags.
               * An 8-byte group is handled if bit 3 of %o3 is set and a 4-byte
               * word if bit 2 is set.  The last 1-3 bytes are merged into %o4
               * before being added: a lone halfword is used as loaded, a halfword
               * followed by a byte becomes (hword << 16) | (byte << 8), and a lone
               * byte becomes byte << 8.
               */
  222 cc_end_cruft:
  223         be      1f                      ! bit 3 clear, no 8-byte group
  224          andcc  %o3, 4, %g0             ! delay slot: is a word left?
  225         EX(ldd  [%o0 + 0x00], %g2)
  226         add     %o1, 8, %o1             ! advance dst, the stores use negative offsets
  227         addcc   %g2, %g7, %g7           ! add first word to sum
  228         add     %o0, 8, %o0             ! advance src
  229         addxcc  %g3, %g7, %g7           ! add second word plus carry
  230         EX(st   %g2, [%o1 - 0x08])
  231         addx    %g0, %g7, %g7           ! add in final carry
  232         andcc   %o3, 4, %g0             ! test again for a word, the adds changed the flags
  233         EX(st   %g3, [%o1 - 0x04])
  234 1:      be      1f                      ! no word left
  235          andcc  %o3, 3, %o3             ! delay slot: keep only the trailing byte count
  236         EX(ld   [%o0 + 0x00], %g2)
  237         add     %o1, 4, %o1             ! advance dst
  238         addcc   %g2, %g7, %g7           ! add word to sum
  239         EX(st   %g2, [%o1 - 0x04])
  240         addx    %g0, %g7, %g7           ! add in final carry
  241         andcc   %o3, 3, %g0             ! test again for trailing bytes
  242         add     %o0, 4, %o0             ! advance src
  243 1:      be      1f                      ! no trailing bytes, return
  244          addcc  %o3, -1, %g0            ! delay slot: exactly one byte left?
  245         bne     2f                      ! two or three bytes left
  246          subcc  %o3, 2, %o3             ! delay slot: exactly two bytes left?
  247         b       4f                      ! one byte left
  248          or     %g0, %g0, %o4           ! delay slot: no hword part
  249 2:      EX(lduh [%o0 + 0x00], %o4)
  250         add     %o0, 2, %o0             ! advance src
  251         EX(sth  %o4, [%o1 + 0x00])
  252         be      6f                      ! flags still from the subcc: only the hword remained
  253          add    %o1, 2, %o1             ! delay slot: advance dst either way
  254         sll     %o4, 16, %o4            ! a byte follows, move hword to the upper half
  255 4:      EX(ldub [%o0 + 0x00], %o5)
  256         EX(stb  %o5, [%o1 + 0x00])
  257         sll     %o5, 8, %o5             ! put the byte into bits 8-15
  258         or      %o5, %o4, %o4           ! combine with the hword, if any
  259 6:      addcc   %o4, %g7, %g7           ! add to sum
  260 1:      retl
  261          addx   %g0, %g7, %o0           ! delay slot: final carry into the return value
 262
  263         /* Also, handle the alignment code out of band. */
              /* cc_dword_align: entered from __csum_partial_copy_sparc_generic
               * when the low three bits of src (%o0) are not all zero; by then src
               * and dst are known to agree in their low two bits.
               *
               * For len (%g1) below 16, %o3 starts at len / 2 and is shifted right
               * one bit at a time.  If it reaches zero the copy is finished by the
               * tail code at ccte with %o3 = len & 0xf; if it has a bit in common
               * with %o0 first, the alignment code at the second "1:" runs instead.
               * For len of 16 or more the alignment code always runs.
               *
               * The alignment code sends an odd src to ccslow, copies and sums one
               * halfword if bit 1 of %o0 is set and one word if bit 2 is set, and
               * then rejoins the main routine at its first "3:" label with the
               * flags from "andcc %g1, 0xffffff80, %g0".
               */
  264 cc_dword_align:
  265         cmp     %g1, 16                 ! at least 16 bytes?
  266         bge     1f                      ! yes, go align
  267          srl    %g1, 1, %o3             ! delay slot: %o3 = len / 2
  268 2:      cmp     %o3, 0                  ! ran out of bits?
  269         be,a    ccte                    ! yes, let the tail code do all of it
  270          andcc  %g1, 0xf, %o3           ! annulled unless taken: %o3 = len & 0xf
  271         andcc   %o3, %o0, %g0   ! Check %o0 only (%o1 has the same last 2 bits)
  272         be,a    2b                      ! no common bit, try the next shift
  273          srl    %o3, 1, %o3             ! annulled unless taken
  274 1:      andcc   %o0, 0x1, %g0           ! src odd?
  275         bne     ccslow                  ! yes, use the byte-wise path
  276          andcc  %o0, 0x2, %g0           ! delay slot: bit 1 of src set?
  277         be      1f                      ! no, no halfword to peel off
  278          andcc  %o0, 0x4, %g0           ! delay slot: bit 2 of src set?
  279         EX(lduh [%o0 + 0x00], %g4)
  280         sub     %g1, 2, %g1             ! len -= 2
  281         EX(sth  %g4, [%o1 + 0x00])
  282         add     %o0, 2, %o0             ! advance src
  283         sll     %g4, 16, %g4            ! move hword to the upper half
  284         addcc   %g4, %g7, %g7           ! add to sum, carry left in flags
  285         add     %o1, 2, %o1             ! advance dst
  286         srl     %g7, 16, %g3            ! upper half of the sum
  287         addx    %g0, %g3, %g4           ! upper half plus carry
  288         sll     %g7, 16, %g7            ! first step of isolating lower half
  289         sll     %g4, 16, %g3            ! (upper half + carry) << 16
  290         srl     %g7, 16, %g7            ! lower half of the sum
  291         andcc   %o0, 0x4, %g0           ! test bit 2 again on the advanced src
  292         or      %g3, %g7, %g7           ! sum = new upper half | lower half
  293 1:      be      3f                      ! bit 2 clear, rejoin the main routine
  294          andcc  %g1, 0xffffff80, %g0    ! delay slot: flags for the branch at 3:
  295         EX(ld   [%o0 + 0x00], %g4)
  296         sub     %g1, 4, %g1             ! len -= 4
  297         EX(st   %g4, [%o1 + 0x00])
  298         add     %o0, 4, %o0             ! advance src
  299         addcc   %g4, %g7, %g7           ! add word to sum
  300         add     %o1, 4, %o1             ! advance dst
  301         addx    %g0, %g7, %g7           ! add in final carry
  302         b       3f                      ! rejoin the main routine
  303          andcc  %g1, 0xffffff80, %g0    ! delay slot: flags for the branch at 3:
 304
  305         /* Sun, you just can't beat me, you just can't.  Stop trying,
  306          * give up.  I'm serious, I am going to kick the living shit
  307          * out of you, game over, lights out.
  308          */
              /* __csum_partial_copy_sparc_generic: copy len bytes from src to
               * dest while adding them into sum; the result is returned in %o0.
               * Arguments arrive in %o0, %o1, %g1 and %g7 as listed at the label.
               *
               * Dispatch:
               *   - src and dest differ in their low two bits: ccslow.
               *   - low three bits of src not zero: cc_dword_align, which
               *     comes back in at the first "3:" label.
               *   - at least 128 bytes: one of two unrolled loops.  The loop at
               *     "5:" stores words; the loop at ccdbl stores doublewords and is
               *     chosen when bit 2 of dest is clear.
               *   - len & 0x70: computed jump into the cctbl table.
               *   - len & 0xf: cc_end_cruft, reached through ccte.
               *
               * "be ccdbl + 4" is not followed by an explicit delay-slot
               * instruction: the first ldd of the chunk at "5:" sits in the delay
               * slot.  It is the same ldd that ccdbl starts with, so the branch
               * skips that one instruction at the target.
               *
               * Each CSUMCOPY_LASTCHUNK expands to 10 instructions (40 bytes of
               * code) and handles 16 bytes of data, so the table is entered at
               * 12f - %o2/2 - %o2*2 with %o2 = len & 0x70.  That executes exactly
               * %o2/16 chunks before reaching the addx at "12:".
               */
  309         .align  8
  310         .globl  __csum_partial_copy_sparc_generic
  311         EXPORT_SYMBOL(__csum_partial_copy_sparc_generic)
  312 __csum_partial_copy_sparc_generic:
  313                                         /* %o0=src, %o1=dest, %g1=len, %g7=sum */
  314         xor     %o0, %o1, %o4           ! get changing bits
  315         andcc   %o4, 3, %g0             ! check for mismatched alignment
  316         bne     ccslow                  ! better this than unaligned/fixups
  317          andcc  %o0, 7, %g0             ! need to align things?
  318         bne     cc_dword_align          ! yes, we check for short lengths there
  319          andcc  %g1, 0xffffff80, %g0    ! can we use unrolled loop?
  320 3:      be      3f                      ! nope, less than one loop remains
  321          andcc  %o1, 4, %g0             ! dest aligned on 4 or 8 byte boundary?
  322         be      ccdbl + 4               ! 8 byte aligned, kick ass
  323 5:      CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  324         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  325         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  326         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  327         sub     %g1, 128, %g1           ! detract from length
  328         addx    %g0, %g7, %g7           ! add in last carry bit
  329         andcc   %g1, 0xffffff80, %g0    ! more to csum?
  330         add     %o0, 128, %o0           ! advance src ptr
  331         bne     5b                      ! we did not go negative, continue looping
  332          add    %o1, 128, %o1           ! advance dest ptr
  333 3:      andcc   %g1, 0x70, %o2          ! can use table?
  334 ccmerge:be      ccte                    ! nope, go and check for end cruft
  335          andcc  %g1, 0xf, %o3           ! get low bits of length (clears carry btw)
  336         srl     %o2, 1, %o4             ! begin negative offset computation
  337         sethi   %hi(12f), %o5           ! set up table ptr end
  338         add     %o0, %o2, %o0           ! advance src ptr
  339         sub     %o5, %o4, %o5           ! continue table calculation
  340         sll     %o2, 1, %g2             ! constant multiplies are fun...
  341         sub     %o5, %g2, %o5           ! some more adjustments
  342         jmp     %o5 + %lo(12f)          ! jump into it, duff style, wheee...
  343          add    %o1, %o2, %o1           ! advance dest ptr (carry is clear btw)
  344 cctbl:  CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
  345         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
  346         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
  347         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
  348         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
  349         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
  350         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
  351 12:     addx    %g0, %g7, %g7
  352         andcc   %o3, 0xf, %g0           ! check for low bits set
  353 ccte:   bne     cc_end_cruft            ! something left, handle it out of band
  354          andcc  %o3, 8, %g0             ! begin checks for that code
  355         retl                            ! return
  356          mov    %g7, %o0                ! give em the computed checksum
  357 ccdbl:  CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  358         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  359         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  360         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  361         sub     %g1, 128, %g1           ! detract from length
  362         addx    %g0, %g7, %g7           ! add in last carry bit
  363         andcc   %g1, 0xffffff80, %g0    ! more to csum?
  364         add     %o0, 128, %o0           ! advance src ptr
  365         bne     ccdbl                   ! we did not go negative, continue looping
  366          add    %o1, 128, %o1           ! advance dest ptr
  367         b       ccmerge                 ! finish it off, above
  368          andcc  %g1, 0x70, %o2          ! can use table? (clears carry btw)
 369
  /* ccslow: byte-store fallback, entered when src and dst differ in their
   * low two bits or when src is odd.
   *
   *   %o0  src, %o1 dst, %g1 bytes left, %g7 incoming sum
   *   %g5  sum of the data copied here
   *   %o5  src & 1 as it was on entry
   *   %g4  count of halfwords, later of words, still to copy
   *
   * src is brought up to word alignment with a byte and/or a halfword
   * load.  Whole words are then loaded and written to dst one byte at a
   * time, followed by a trailing halfword and a trailing byte if present.
   * %g5 is folded to 16 bits, its two bytes are swapped if src started on
   * an odd address, and it is added to %g7 with the carry folded back in.
   * The result is returned in %o0.  A zero len skips straight to that
   * final add.
   */
  370 ccslow: cmp     %g1, 0
  371         mov     0, %g5                  ! local sum starts at 0
  372         bleu    4f                      ! compare was against 0, so taken only for len == 0
  373          andcc  %o0, 1, %o5             ! delay slot: remember whether src is odd
  374         be,a    1f                      ! src even, skip the leading byte
  375          srl    %g1, 1, %g4             ! annulled unless taken: halfword count
  376         sub     %g1, 1, %g1             ! len -= 1
  377         EX(ldub [%o0], %g5)
  378         add     %o0, 1, %o0             ! advance src
  379         EX(stb  %g5, [%o1])
  380         srl     %g1, 1, %g4             ! halfword count of what is left
  381         add     %o1, 1, %o1             ! advance dst
  382 1:      cmp     %g4, 0                  ! any halfwords at all?
  383         be,a    3f                      ! no, only a last byte may remain
  384          andcc  %g1, 1, %g0             ! annulled unless taken: flags for 3:
  385         andcc   %o0, 2, %g0             ! src on an odd halfword?
  386         be,a    1f                      ! no, already word aligned
  387          srl    %g4, 1, %g4             ! annulled unless taken: word count
  388         EX(lduh [%o0], %o4)
  389         sub     %g1, 2, %g1             ! len -= 2
  390         srl     %o4, 8, %g2             ! high byte of the hword
  391         sub     %g4, 1, %g4             ! one halfword consumed
  392         EX(stb  %g2, [%o1])
  393         add     %o4, %g5, %g5           ! add hword to local sum
  394         EX(stb  %o4, [%o1 + 1])
  395         add     %o0, 2, %o0             ! advance src
  396         srl     %g4, 1, %g4             ! word count of what is left
  397         add     %o1, 2, %o1             ! advance dst
  398 1:      cmp     %g4, 0                  ! any whole words?
  399         be,a    2f                      ! no, go look for a trailing hword
  400          andcc  %g1, 2, %g0             ! annulled unless taken: flags for 2:
  401         EX(ld   [%o0], %o4)
  402 5:      srl     %o4, 24, %g2            ! byte 0 of the word
  403         srl     %o4, 16, %g3            ! byte 1
  404         EX(stb  %g2, [%o1])
  405         srl     %o4, 8, %g2             ! byte 2
  406         EX(stb  %g3, [%o1 + 1])
  407         add     %o0, 4, %o0             ! advance src
  408         EX(stb  %g2, [%o1 + 2])
  409         addcc   %o4, %g5, %g5           ! add word to local sum
  410         EX(stb  %o4, [%o1 + 3])
  411         addx    %g5, %g0, %g5   ! I am now too lazy to optimize this (question it
  412         add     %o1, 4, %o1     ! is worthy). Maybe some day - with the sll/srl
  413         subcc   %g4, 1, %g4     ! tricks
  414         bne,a   5b
  415          EX(ld  [%o0], %o4)
  416         sll     %g5, 16, %g2            ! fold the local sum: low half up ...
  417         srl     %g5, 16, %g5            ! ... high half down
  418         srl     %g2, 16, %g2            ! ... low half back down, zero extended
  419         andcc   %g1, 2, %g0             ! trailing hword present?
  420         add     %g2, %g5, %g5           ! local sum = low half + high half
  421 2:      be,a    3f                      ! no trailing hword
  422          andcc  %g1, 1, %g0             ! annulled unless taken: flags for 3:
  423         EX(lduh [%o0], %o4)
  424         andcc   %g1, 1, %g0             ! trailing byte present? (flags for 3:)
  425         srl     %o4, 8, %g2             ! high byte of the hword
  426         add     %o0, 2, %o0             ! advance src
  427         EX(stb  %g2, [%o1])
  428         add     %g5, %o4, %g5           ! add hword to local sum
  429         EX(stb  %o4, [%o1 + 1])
  430         add     %o1, 2, %o1             ! advance dst
  431 3:      be,a    1f                      ! no trailing byte
  432          sll    %g5, 16, %o4            ! annulled unless taken: %o4 = local sum << 16
  433         EX(ldub [%o0], %g2)
  434         sll     %g2, 8, %o4             ! last byte goes into bits 8-15
  435         EX(stb  %g2, [%o1])
  436         add     %g5, %o4, %g5           ! add it to the local sum
  437         sll     %g5, 16, %o4            ! %o4 = local sum << 16
  438 1:      addcc   %o4, %g5, %g5           ! upper half now holds low + high, carry in flags
  439         srl     %g5, 16, %o4            ! bring that 16-bit total down
  440         addx    %g0, %o4, %g5           ! and add the carry
  441         orcc    %o5, %g0, %g0           ! did src start on an odd address?
  442         be      4f                      ! no, use the local sum as is
  443          srl    %g5, 8, %o4             ! delay slot: first step of the byte swap
  444         and     %g5, 0xff, %g2          ! low byte
  445         and     %o4, 0xff, %o4          ! high byte
  446         sll     %g2, 8, %g2             ! low byte moves up
  447         or      %g2, %o4, %g5           ! swapped 16-bit local sum
  448 4:      addcc   %g7, %g5, %g7           ! add local sum to the incoming sum
  449         retl
  450          addx   %g0, %g7, %o0           ! delay slot: final carry into the return value
 451
 452 /* We do these strange calculations for the csum_*_from_user case only, ie.
 453  * we only bother with faults on loads... */
 454
 455 cc_fault:
 456         ret
 457          clr    %o0