arch/mips/lib/memcpy.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Unified implementation of memcpy, memmove and the __copy_user backend.
   7  *
   8  * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
   9  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  10  * Copyright (C) 2002 Broadcom, Inc.
  11  *   memcpy/copy_user author: Mark Vandevoorde
  12  * Copyright (C) 2007  Maciej W. Rozycki
  13  *
  14  * Mnemonic names for arguments to memcpy/__copy_user
  15  */
  16
  17 /*
  18  * Hack to resolve longstanding prefetch issue
  19  *
  20  * Prefetching may be fatal on some systems if we're prefetching beyond the
  21  * end of memory.  It's also a seriously bad idea on non dma-coherent
  22  * systems.
      *
      * Both cases are handled by undefining CONFIG_CPU_HAS_PREFETCH before the
      * headers below are included; presumably that turns the PREF() macro used
      * throughout this file into a no-op (PREF is not defined in this file --
      * confirm against the included headers).
  23  */
  24 #ifdef CONFIG_DMA_NONCOHERENT
  25 #undef CONFIG_CPU_HAS_PREFETCH
  26 #endif
  27 #ifdef CONFIG_MIPS_MALTA
  28 #undef CONFIG_CPU_HAS_PREFETCH
  29 #endif
  30
  31 #include <asm/asm.h>
  32 #include <asm/asm-offsets.h>
  33 #include <asm/regdef.h>
  34
  35 #define dst a0          /* destination pointer (first argument) */
  36 #define src a1          /* source pointer (second argument) */
  37 #define len a2          /* byte count; __copy_user leaves the uncopied count here */
  38
  39 /*
  40  * Spec
  41  *
  42  * memcpy copies len bytes from src to dst and sets v0 to dst.
  43  * It assumes that
  44  *   - src and dst don't overlap
  45  *   - src is readable
  46  *   - dst is writable
  47  * memcpy uses the standard calling convention
  48  *
  49  * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
  50  * the number of uncopied bytes due to an exception caused by a read or write.
  51  * __copy_user assumes that src and dst don't overlap, and that the call is
  52  * implementing one of the following:
  53  *   copy_to_user
  54  *     - src is readable  (no exceptions when reading src)
  55  *   copy_from_user
  56  *     - dst is writable  (no exceptions when writing dst)
  57  * __copy_user uses a non-standard calling convention; see
  58  * include/asm-mips/uaccess.h
  59  *
  60  * When an exception happens on a load, the handler must
  61  * ensure that all of the destination buffer is overwritten to prevent
  62  * leaking information to user mode programs.
  63  */
  64
  65 /*
  66  * Implementation
  67  */
  68
  69 /*
  70  * The exception handler for loads requires that:
  71  *  1- AT contain the address of the byte just past the end of the source
  72  *     of the copy,
  73  *  2- src_entry <= src < AT, and
  74  *  3- (dst - src) == (dst_entry - src_entry),
  75  * The _entry suffix denotes values when __copy_user was called.
  76  *
  77  * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
  78  * (2) is met by incrementing src by the number of bytes copied
  79  * (3) is met by not doing loads between a pair of increments of dst and src
  80  *
  81  * The exception handlers for stores adjust len (if necessary) and return.
  82  * These handlers do not need to overwrite any data.
  83  *
  84  * For __rmemcpy and memmove an exception is always a kernel bug, therefore
  85  * they're not protected.
  86  */
  87
     /*
      * EXC(inst_reg, addr, handler) emits the memory access "inst_reg, addr"
      * under local label 9 and adds the pair (address of that instruction,
      * handler) to the __ex_table section, then switches back to the previous
      * section.  Presumably the kernel's fault fixup code consults that table
      * to resume at handler when the access faults (the lookup itself is not
      * in this file).
      */
  88 #define EXC(inst_reg,addr,handler)              \
  89 9:      inst_reg, addr;                         \
  90         .section __ex_table,"a";                \
  91         PTR     9b, handler;                    \
  92         .previous
  93
  94 /*
  95  * Only on the 64-bit kernel can we make use of 64-bit registers.
      *
      * The macros below select the register-width flavour of every load, store,
      * add/subtract and shift used by the copy code, plus the size of one copy
      * unit (NBYTES) and its log2 (LOG_NBYTES).
  96  */
  97 #ifdef CONFIG_64BIT
  98 #define USE_DOUBLE
  99 #endif
 100
 101 #ifdef USE_DOUBLE
 102
 103 #define LOAD   ld              /* aligned load of one unit */
 104 #define LOADL  ldl             /* partial (left) load for unaligned units */
 105 #define LOADR  ldr             /* partial (right) load for unaligned units */
 106 #define STOREL sdl             /* partial (left) store */
 107 #define STORER sdr             /* partial (right) store */
 108 #define STORE  sd              /* aligned store of one unit */
 109 #define ADD    daddu
 110 #define SUB    dsubu
 111 #define SRL    dsrl
 112 #define SRA    dsra
 113 #define SLL    dsll
 114 #define SLLV   dsllv
 115 #define SRLV   dsrlv
 116 #define NBYTES 8               /* bytes per copy unit */
 117 #define LOG_NBYTES 3           /* log2(NBYTES) */
 118
 119 /*
 120  * As we are sharing the code base with the mips32 tree (which uses the o32
 121  * ABI register definitions), we need to redefine the register definitions
 122  * from the n64 ABI register naming to the o32 ABI register naming.
 123  */
 124 #undef t0
 125 #undef t1
 126 #undef t2
 127 #undef t3
 128 #define t0      $8
 129 #define t1      $9
 130 #define t2      $10
 131 #define t3      $11
 132 #define t4      $12
 133 #define t5      $13
 134 #define t6      $14
 135 #define t7      $15
 136
 137 #else
 138
 139 #define LOAD   lw              /* aligned load of one unit */
 140 #define LOADL  lwl             /* partial (left) load for unaligned units */
 141 #define LOADR  lwr             /* partial (right) load for unaligned units */
 142 #define STOREL swl             /* partial (left) store */
 143 #define STORER swr             /* partial (right) store */
 144 #define STORE  sw              /* aligned store of one unit */
 145 #define ADD    addu
 146 #define SUB    subu
 147 #define SRL    srl
 148 #define SLL    sll
 149 #define SRA    sra
 150 #define SLLV   sllv
 151 #define SRLV   srlv
 152 #define NBYTES 4               /* bytes per copy unit */
 153 #define LOG_NBYTES 2           /* log2(NBYTES) */
 154
 155 #endif /* USE_DOUBLE */
 156
     /*
      * Unaligned-access helpers.  A unit that is not NBYTES-aligned is moved
      * with a pair of partial accesses: the FIRST one is issued at the lowest
      * byte address of the unit (offset FIRST(n)) and the REST one at the
      * highest (offset REST(n)).  Which of the left/right instructions plays
      * which role depends on endianness, as does the shift direction that
      * SHIFT_DISCARD uses to drop unwanted bytes before a partial store.
      */
 157 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 158 #define LDFIRST LOADR
 159 #define LDREST  LOADL
 160 #define STFIRST STORER
 161 #define STREST  STOREL
 162 #define SHIFT_DISCARD SLLV
 163 #else
 164 #define LDFIRST LOADL
 165 #define LDREST  LOADR
 166 #define STFIRST STOREL
 167 #define STREST  STORER
 168 #define SHIFT_DISCARD SRLV
 169 #endif
 170
 171 #define FIRST(unit) ((unit)*NBYTES)             /* offset of the first byte of unit */
 172 #define REST(unit)  (FIRST(unit)+NBYTES-1)      /* offset of the last byte of unit */
 173 #define UNIT(unit)  FIRST(unit)                 /* offset of an aligned unit */
 174
 175 #define ADDRMASK (NBYTES-1)                     /* low address bits; nonzero means unaligned */
 176
 177         .text
 178         .set    noreorder       /* branch delay slots below are filled by hand */
 179 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 180         .set    noat            /* AT carries the end-of-source address for the load fault handler */
 181 #else
 182         .set    at=v1           /* presumably the DADDI workaround needs an assembler temporary: use v1, not AT */
 183 #endif
 184
 185 /*
 186  * A combined memcpy/__copy_user
 187  * __copy_user sets len to 0 for success; else to an upper bound of
 188  * the number of uncopied bytes.
 189  * memcpy sets v0 to dst.
      *
      * The entry code below only classifies the request and dispatches:
      *   len < NBYTES              -> .Lcopy_bytes_checklen
      *   dst not NBYTES-aligned    -> .Ldst_unaligned
      *   only src not aligned      -> .Lsrc_unaligned_dst_aligned
      *   both aligned              -> falls through to .Lboth_aligned
 190  */
 191         .align  5
 192 LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
 193         move    v0, dst                         /* return value */
 194 .L__memcpy:                                     /* memmove branches here with v0 already set */
 195 FEXPORT(__copy_user)
 196         /*
 197          * Note: dst & src may be unaligned, len may be 0
 198          * Temps
 199          */
 200 #define rem t8                                  /* value of len at which the current loop stops */
 201
 202         R10KCBARRIER(0(ra))
 203         /*
 204          * The "issue break"s below are very approximate.
 205          * Issue delays for dcache fills will perturb the schedule, as will
 206          * load queue full replay traps, etc.
 207          *
 208          * If len < NBYTES use byte operations.
 209          */
 210         PREF(   0, 0(src) )
 211         PREF(   1, 0(dst) )
 212         sltu    t2, len, NBYTES                 /* t2 = (len < NBYTES) */
 213         and     t1, dst, ADDRMASK               /* t1 = dst misalignment in bytes */
 214         PREF(   0, 1*32(src) )
 215         PREF(   1, 1*32(dst) )
 216         bnez    t2, .Lcopy_bytes_checklen
 217          and    t0, src, ADDRMASK               /* delay slot: t0 = src misalignment in bytes */
 218         PREF(   0, 2*32(src) )
 219         PREF(   1, 2*32(dst) )
 220         bnez    t1, .Ldst_unaligned
 221          nop
 222         bnez    t0, .Lsrc_unaligned_dst_aligned
 223         /*
 224          * use delay slot for fall-through
 225          * src and dst are aligned; need to compute rem
              * (the delay slot of the bnez above is the SRL that follows the
              * .Lboth_aligned label)
 226          */
 227 .Lboth_aligned:
             /*
              * Main loop for aligned src and dst: 8 units (8*NBYTES bytes) per
              * iteration, running until len has dropped to rem.  The SRL below
              * is also the delay slot of the bnez just above the label.
              *
              * len is decremented once, early in the iteration, so a faulting
              * store reports through .Ls_exc_p<N>u, which adds back N units
              * (the faulting unit and every unit after it in this iteration).
              * A fault on the first load goes straight to .Ll_exc; faults on
              * the later loads go through .Ll_exc_copy first.
              */
 228          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 229         beqz    t0, .Lcleanup_both_aligned # len < 8*NBYTES
 230          and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
 231         PREF(   0, 3*32(src) )
 232         PREF(   1, 3*32(dst) )
 233         .align  4
 234 1:
 235         R10KCBARRIER(0(ra))
 236 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 237 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 238 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 239 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 240         SUB     len, len, 8*NBYTES      /* account for the whole iteration up front */
 241 EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
 242 EXC(    LOAD    t7, UNIT(5)(src),       .Ll_exc_copy)
 243 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p8u)
 244 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p7u)
 245 EXC(    LOAD    t0, UNIT(6)(src),       .Ll_exc_copy)   /* t0/t1 are reused once units 0 and 1 are stored */
 246 EXC(    LOAD    t1, UNIT(7)(src),       .Ll_exc_copy)
 247         ADD     src, src, 8*NBYTES
 248         ADD     dst, dst, 8*NBYTES      /* the remaining stores address units relative to the advanced dst */
 249 EXC(    STORE   t2, UNIT(-6)(dst),      .Ls_exc_p6u)
 250 EXC(    STORE   t3, UNIT(-5)(dst),      .Ls_exc_p5u)
 251 EXC(    STORE   t4, UNIT(-4)(dst),      .Ls_exc_p4u)
 252 EXC(    STORE   t7, UNIT(-3)(dst),      .Ls_exc_p3u)
 253 EXC(    STORE   t0, UNIT(-2)(dst),      .Ls_exc_p2u)
 254 EXC(    STORE   t1, UNIT(-1)(dst),      .Ls_exc_p1u)
 255         PREF(   0, 8*32(src) )
 256         PREF(   1, 8*32(dst) )
 257         bne     len, rem, 1b
 258          nop
 259
 260         /*
 261          * len == rem == the number of bytes left to copy < 8*NBYTES
              *
              * Copy one block of 4 units if at least that much is left, then
              * single units until only len % NBYTES bytes remain.
 262          */
 263 .Lcleanup_both_aligned:
 264         beqz    len, .Ldone
 265          sltu   t0, len, 4*NBYTES       /* delay slot: t0 = (len < 4*NBYTES) */
 266         bnez    t0, .Lless_than_4units
 267          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 268         /*
 269          * len >= 4*NBYTES
 270          */
 271 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 272 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 273 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 274 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 275         SUB     len, len, 4*NBYTES
 276         ADD     src, src, 4*NBYTES
 277         R10KCBARRIER(0(ra))
 278 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
 279 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
 280 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
 281 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
 282         .set    reorder                         /* DADDI_WAR */
 283         ADD     dst, dst, 4*NBYTES              /* assembler fills the branch delay slot in reorder mode */
 284         beqz    len, .Ldone
 285         .set    noreorder
 286 .Lless_than_4units:
 287         /*
 288          * rem = len % NBYTES
              * If no whole unit is left (rem == len) go straight to the
              * byte copier; otherwise copy one unit per iteration.
 289          */
 290         beq     rem, len, .Lcopy_bytes
 291          nop
 292 1:
 293         R10KCBARRIER(0(ra))
 294 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 295         ADD     src, src, NBYTES
 296         SUB     len, len, NBYTES
 297 EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)    /* len already dropped by one unit: add it back on fault */
 298         .set    reorder                         /* DADDI_WAR */
 299         ADD     dst, dst, NBYTES
 300         bne     rem, len, 1b
 301         .set    noreorder
 302
 303         /*
 304          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 305          * A loop would do only a byte at a time with possible branch
 306          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 307          * because can't assume read-access to dst.  Instead, use
 308          * STREST dst, which doesn't require read access to dst.
 309          *
 310          * This code should perform better than a simple loop on modern,
 311          * wide-issue mips processors because the code has fewer branches and
 312          * more instruction-level parallelism.
              *
              * The LOAD below fetches a whole unit although only len bytes are
              * wanted; src is unit-aligned here, so the access stays inside one
              * aligned unit.  SHIFT_DISCARD then drops the unwanted bytes and
              * STREST writes the kept ones so that they end at dst+len-1.
 313          */
 314 #define bits t2
 315         beqz    len, .Ldone
 316          ADD    t1, dst, len    # t1 is just past last byte of dst
 317         li      bits, 8*NBYTES
 318         SLL     rem, len, 3     # rem = number of bits to keep
 319 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 320         SUB     bits, bits, rem # bits = number of bits to discard
 321         SHIFT_DISCARD t0, t0, bits
 322 EXC(    STREST  t0, -1(t1),             .Ls_exc)        /* len still holds the uncopied count on fault */
 323         jr      ra
 324          move   len, zero       /* delay slot: report success (len = 0) */
 325 .Ldst_unaligned:
 326         /*
 327          * dst is unaligned
 328          * t0 = src & ADDRMASK
 329          * t1 = dst & ADDRMASK; t1 > 0
 330          * len >= NBYTES
 331          *
 332          * Copy enough bytes to align dst
 333          * Set match = t0 ^ t1, i.e. match == 0 when src and dst have the
              * same alignment (src is then aligned too once dst has been)
 334          */
 335 #define match rem
 336 EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
 337         ADD     t2, zero, NBYTES
 338 EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
 339         SUB     t2, t2, t1      # t2 = number of bytes copied
 340         xor     match, t0, t1
 341         R10KCBARRIER(0(ra))
 342 EXC(    STFIRST t3, FIRST(0)(dst),      .Ls_exc)        /* partial store up to the next unit boundary of dst */
 343         beq     len, t2, .Ldone
 344          SUB    len, len, t2    /* delay slot: executed on both paths, so len is 0 at .Ldone */
 345         ADD     dst, dst, t2
 346         beqz    match, .Lboth_aligned
 347          ADD    src, src, t2    /* delay slot: advance src on both paths */
 348
 349 .Lsrc_unaligned_dst_aligned:
             /*
              * dst is aligned, src is not: 4 units per iteration.  Each unit is
              * assembled with an LDFIRST/LDREST pair and written with one
              * aligned STORE.
              */
 350         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 351         PREF(   0, 3*32(src) )
 352         beqz    t0, .Lcleanup_src_unaligned
 353          and    rem, len, (4*NBYTES-1)   # rem = len % (4*NBYTES)
 354         PREF(   1, 3*32(dst) )
 355 1:
 356 /*
 357  * Avoid consecutive LD*'s to the same register since some mips
 358  * implementations can't issue them in the same cycle.
 359  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 360  * are to the same unit (unless src is aligned, but it's not).
 361  */
 362         R10KCBARRIER(0(ra))
 363 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 364 EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
 365         SUB     len, len, 4*NBYTES
 366 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 367 EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
 368 EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
 369 EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
 370 EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
 371 EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
 372         PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
 373         ADD     src, src, 4*NBYTES
 374 #ifdef CONFIG_CPU_SB1
 375         nop                             # improves slotting
 376 #endif
 377 EXC(    STORE   t0, UNIT(0)(dst),       .Ls_exc_p4u)
 378 EXC(    STORE   t1, UNIT(1)(dst),       .Ls_exc_p3u)
 379 EXC(    STORE   t2, UNIT(2)(dst),       .Ls_exc_p2u)
 380 EXC(    STORE   t3, UNIT(3)(dst),       .Ls_exc_p1u)
 381         PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
 382         .set    reorder                         /* DADDI_WAR */
 383         ADD     dst, dst, 4*NBYTES
 384         bne     len, rem, 1b
 385         .set    noreorder
 386
 387 .Lcleanup_src_unaligned:
             /*
              * Fewer than 4 units left, src still unaligned: copy one unit per
              * iteration until only len % NBYTES bytes remain, then fall
              * through to .Lcopy_bytes_checklen for the trailing bytes.
              */
 388         beqz    len, .Ldone
 389          and    rem, len, NBYTES-1  # rem = len % NBYTES
 390         beq     rem, len, .Lcopy_bytes
 391          nop
 392 1:
 393         R10KCBARRIER(0(ra))
 394 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 395 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 396         ADD     src, src, NBYTES
 397         SUB     len, len, NBYTES
 398 EXC(    STORE   t0, 0(dst),             .Ls_exc_p1u)    /* len already dropped by one unit: add it back on fault */
 399         .set    reorder                         /* DADDI_WAR */
 400         ADD     dst, dst, NBYTES
 401         bne     len, rem, 1b
 402         .set    noreorder
 403
 404 .Lcopy_bytes_checklen:
 405         beqz    len, .Ldone
 406          nop
 407 .Lcopy_bytes:
 408         /* 0 < len < NBYTES  */
             /*
              * Unrolled byte copy of at most NBYTES-1 bytes.  In COPY_BYTE the
              * sb sits in the delay slot of the beqz, so the byte is stored
              * whether or not the branch is taken; len has already been
              * decremented at that point, which is why a faulting sb goes to
              * .Ls_exc_p1 (adds 1 back to len).  The last possible byte
              * (offset NBYTES-2) is handled after the COPY_BYTE sequence, with
              * its sb in the delay slot of the returning jr.
              */
 409         R10KCBARRIER(0(ra))
 410 #define COPY_BYTE(N)                    \
 411 EXC(    lb      t0, N(src), .Ll_exc);   \
 412         SUB     len, len, 1;            \
 413         beqz    len, .Ldone;            \
 414 EXC(     sb     t0, N(dst), .Ls_exc_p1)
 415
 416         COPY_BYTE(0)
 417         COPY_BYTE(1)
 418 #ifdef USE_DOUBLE
 419         COPY_BYTE(2)
 420         COPY_BYTE(3)
 421         COPY_BYTE(4)
 422         COPY_BYTE(5)
 423 #endif
 424 EXC(    lb      t0, NBYTES-2(src), .Ll_exc)
 425         SUB     len, len, 1
 426         jr      ra
 427 EXC(     sb     t0, NBYTES-2(dst), .Ls_exc_p1)
 428 .Ldone:                                 /* every branch to here is taken with len == 0 */
 429         jr      ra
 430          nop
 431         END(memcpy)
 432
 433 .Ll_exc_copy:
 434         /*
 435          * Copy bytes from src until faulting load address (or until a
 436          * lb faults)
 437          *
 438          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 439          * may be more than a byte beyond the last address.
 440          * Hence, the lb below may get an exception.
 441          *
 442          * Assumes src < THREAD_BUADDR($28)
              *
              * The loop tests for completion only after copying a byte, so it
              * relies on the assumption above.  When it finishes, control falls
              * through into .Ll_exc.
 443          */
 444         LOAD    t0, TI_TASK($28)        /* t0 = task pointer, read via $28 */
 445          nop
 446         LOAD    t0, THREAD_BUADDR(t0)   /* t0 = recorded faulting address; not used until the bne below */
 447 1:
 448 EXC(    lb      t1, 0(src),     .Ll_exc)
 449         ADD     src, src, 1
 450         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 451         .set    reorder                         /* DADDI_WAR */
 452         ADD     dst, dst, 1
 453         bne     src, t0, 1b
 454         .set    noreorder
 455 .Ll_exc:
             /*
              * Load-fault handler: set len to the number of source bytes that
              * were not copied (AT - fault address) and zero the matching tail
              * of the destination before returning.
              */
 456         LOAD    t0, TI_TASK($28)
 457          nop
 458         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 459          nop
 460         SUB     len, AT, t0             # len number of uncopied bytes
 461         /*
 462          * Here's where we rely on src and dst being incremented in tandem,
 463          *   See (3) above.
 464          * dst += (fault addr - src) to put dst at first byte to clear
 465          */
 466         ADD     dst, t0                 # compute start address in a0 (dst)
 467         SUB     dst, src
 468         /*
 469          * Clear len bytes starting at dst.  Can't call __bzero because it
 470          * might modify len.  An inefficient loop for these rare times...
              * src is no longer needed as a pointer and is reused as the loop
              * counter (len - 1 down to 0); len itself is left intact for the
              * caller.
 471          */
 472         .set    reorder                         /* DADDI_WAR */
 473         SUB     src, len, 1
 474         beqz    len, .Ldone
 475         .set    noreorder
 476 1:      sb      zero, 0(dst)
 477         ADD     dst, dst, 1
 478 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 479         bnez    src, 1b
 480          SUB    src, src, 1
 481 #else
             /* presumably keeps the delay slot to one instruction: subtract a register rather than an immediate -- confirm */
 482         .set    push
 483         .set    noat
 484         li      v1, 1
 485         bnez    src, 1b
 486          SUB    src, src, v1
 487         .set    pop
 488 #endif
 489         jr      ra
 490          nop
 491
 492
     /*
      * Store-fault handlers.  SEXC(n) defines .Ls_exc_p<n>u, which adds n units
      * (n*NBYTES bytes) back onto len and returns; the copy loops pick n to
      * match the units of the current iteration that were not stored.
      * .Ls_exc_p1 does the same for a single byte, and .Ls_exc returns with
      * len untouched.
      */
 493 #define SEXC(n)                                                 \
 494         .set    reorder;                        /* DADDI_WAR */ \
 495 .Ls_exc_p ## n ## u:                                            \
 496         ADD     len, len, n*NBYTES;                             \
 497         jr      ra;                                             \
 498         .set    noreorder
 499
 500 SEXC(8)
 501 SEXC(7)
 502 SEXC(6)
 503 SEXC(5)
 504 SEXC(4)
 505 SEXC(3)
 506 SEXC(2)
 507 SEXC(1)
 508
 509 .Ls_exc_p1:
 510         .set    reorder                         /* DADDI_WAR */
 511         ADD     len, len, 1
 512         jr      ra
 513         .set    noreorder
 514 .Ls_exc:
 515         jr      ra
 516          nop
 517
     /*
      * memmove: a0=dst a1=src a2=len, v0 = dst on return.
      *
      * t0 = (src < dst + len), t1 = (dst < src + len); the regions overlap only
      * when both hold.  Without overlap the copy is handed to .L__memcpy (v0 is
      * set in the branch delay slot); otherwise control falls through into
      * __rmemcpy.  The final beqz has its delay slot in the first instruction
      * of __rmemcpy, below END(memmove).
      */
 518         .align  5
 519 LEAF(memmove)
 520         ADD     t0, a0, a2
 521         ADD     t1, a1, a2
 522         sltu    t0, a1, t0                      # dst + len <= src -> memcpy
 523         sltu    t1, a0, t1                      # dst >= src + len -> memcpy
 524         and     t0, t1
 525         beqz    t0, .L__memcpy
 526          move   v0, a0                          /* return value */
 527         beqz    a2, .Lr_out
 528         END(memmove)
 529
 530         /* fall through to __rmemcpy */
             /*
              * Byte-at-a-time copy that picks its direction from the pointer
              * order: backwards from the end when src < dst, forwards otherwise.
              * Both paths return with a2 = 0.
              *
              * NOTE(review): both loops copy one byte before testing a2, so a
              * direct call with len == 0 would not stop after zero bytes.  The
              * memmove path above never gets here with len == 0.
              */
 531 LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
 532          sltu   t0, a1, a0                      /* t0 = (src < dst); also the delay slot of the beqz in memmove */
 533         beqz    t0, .Lr_end_bytes_up            # src >= dst
 534          nop
 535         ADD     a0, a2                          # dst = dst + len
 536         ADD     a1, a2                          # src = src + len
 537
 538 .Lr_end_bytes:                                  /* backward copy: a0/a1 point just past the next byte */
 539         R10KCBARRIER(0(ra))
 540         lb      t0, -1(a1)
 541         SUB     a2, a2, 0x1
 542         sb      t0, -1(a0)
 543         SUB     a1, a1, 0x1
 544         .set    reorder                         /* DADDI_WAR */
 545         SUB     a0, a0, 0x1
 546         bnez    a2, .Lr_end_bytes
 547         .set    noreorder
 548
 549 .Lr_out:
 550         jr      ra
 551          move   a2, zero                        /* delay slot: a2 = 0 on return */
 552
 553 .Lr_end_bytes_up:                               /* forward copy, one byte per iteration */
 554         R10KCBARRIER(0(ra))
 555         lb      t0, (a1)
 556         SUB     a2, a2, 0x1
 557         sb      t0, (a0)
 558         ADD     a1, a1, 0x1
 559         .set    reorder                         /* DADDI_WAR */
 560         ADD     a0, a0, 0x1
 561         bnez    a2, .Lr_end_bytes_up
 562         .set    noreorder
 563
 564         jr      ra
 565          move   a2, zero                        /* delay slot: a2 = 0 on return */
 566         END(__rmemcpy)