arch/alpha/lib/memcpy.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  linux/arch/alpha/lib/memcpy.c
   4  *
   5  *  Copyright (C) 1995  Linus Torvalds
   6  */
   7
   8 /*
   9  * This is a reasonably optimized memcpy() routine.
  10  */
  11
  12 /*
  13  * Note that the C code is written to be optimized into good assembly. However,
 * at this point gcc is unable to sanely compile "if (n >= 0)", resulting in an
 * explicit compare against 0 (instead of just using the proper "blt reg, xx" or
  16  * "bge reg, xx"). I hope alpha-gcc will be fixed to notice this eventually..
  17  */
  18
  19 #include <linux/types.h>
  20 #include <linux/export.h>
  21
  22 /*
  23  * This should be done in one go with ldq_u*2/mask/stq_u. Do it
  24  * with a macro so that we can fix it up later..
  25  */
  26 #define ALIGN_DEST_TO8_UP(d,s,n) \
  27         while (d & 7) { \
  28                 if (n <= 0) return; \
  29                 n--; \
  30                 *(char *) d = *(char *) s; \
  31                 d++; s++; \
  32         }
  33 #define ALIGN_DEST_TO8_DN(d,s,n) \
  34         while (d & 7) { \
  35                 if (n <= 0) return; \
  36                 n--; \
  37                 d--; s--; \
  38                 *(char *) d = *(char *) s; \
  39         }
  40
  41 /*
  42  * This should similarly be done with ldq_u*2/mask/stq. The destination
  43  * is aligned, but we don't fill in a full quad-word
  44  */
  45 #define DO_REST_UP(d,s,n) \
  46         while (n > 0) { \
  47                 n--; \
  48                 *(char *) d = *(char *) s; \
  49                 d++; s++; \
  50         }
  51 #define DO_REST_DN(d,s,n) \
  52         while (n > 0) { \
  53                 n--; \
  54                 d--; s--; \
  55                 *(char *) d = *(char *) s; \
  56         }
  57
  58 /*
  59  * This should be done with ldq/mask/stq. The source and destination are
  60  * aligned, but we don't fill in a full quad-word
  61  */
  62 #define DO_REST_ALIGNED_UP(d,s,n) DO_REST_UP(d,s,n)
  63 #define DO_REST_ALIGNED_DN(d,s,n) DO_REST_DN(d,s,n)
  64
  65 /*
  66  * This does unaligned memory copies. We want to avoid storing to
  67  * an unaligned address, as that would do a read-modify-write cycle.
  68  * We also want to avoid double-reading the unaligned reads.
  69  *
  70  * Note the ordering to try to avoid load (and address generation) latencies.
  71  */
  72 static inline void __memcpy_unaligned_up (unsigned long d, unsigned long s,
  73                                           long n)
  74 {
  75         ALIGN_DEST_TO8_UP(d,s,n);
  76         n -= 8;                 /* to avoid compare against 8 in the loop */
  77         if (n >= 0) {
  78                 unsigned long low_word, high_word;
  79                 __asm__("ldq_u %0,%1":"=r" (low_word):"m" (*(unsigned long *) s));
  80                 do {
  81                         unsigned long tmp;
  82                         __asm__("ldq_u %0,%1":"=r" (high_word):"m" (*(unsigned long *)(s+8)));
  83                         n -= 8;
  84                         __asm__("extql %1,%2,%0"
  85                                 :"=r" (low_word)
  86                                 :"r" (low_word), "r" (s));
  87                         __asm__("extqh %1,%2,%0"
  88                                 :"=r" (tmp)
  89                                 :"r" (high_word), "r" (s));
  90                         s += 8;
  91                         *(unsigned long *) d = low_word | tmp;
  92                         d += 8;
  93                         low_word = high_word;
  94                 } while (n >= 0);
  95         }
  96         n += 8;
  97         DO_REST_UP(d,s,n);
  98 }
  99
 100 static inline void __memcpy_unaligned_dn (unsigned long d, unsigned long s,
 101                                           long n)
 102 {
 103         /* I don't understand AXP assembler well enough for this. -Tim */
 104         s += n;
 105         d += n;
 106         while (n--)
 107                 * (char *) --d = * (char *) --s;
 108 }
 109
 110 /*
 * Hmm.. Strange. The __asm__ here is there to make gcc use an integer register
 * for the load-store. I don't know why, but using a floating-point register
 * for the move seems to slow things down (a very small difference, though).
 115  *
 116  * Note the ordering to try to avoid load (and address generation) latencies.
 117  */
 118 static inline void __memcpy_aligned_up (unsigned long d, unsigned long s,
 119                                         long n)
 120 {
 121         ALIGN_DEST_TO8_UP(d,s,n);
 122         n -= 8;
 123         while (n >= 0) {
 124                 unsigned long tmp;
 125                 __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
 126                 n -= 8;
 127                 s += 8;
 128                 *(unsigned long *) d = tmp;
 129                 d += 8;
 130         }
 131         n += 8;
 132         DO_REST_ALIGNED_UP(d,s,n);
 133 }
 134 static inline void __memcpy_aligned_dn (unsigned long d, unsigned long s,
 135                                         long n)
 136 {
 137         s += n;
 138         d += n;
 139         ALIGN_DEST_TO8_DN(d,s,n);
 140         n -= 8;
 141         while (n >= 0) {
 142                 unsigned long tmp;
 143                 s -= 8;
 144                 __asm__("ldq %0,%1":"=r" (tmp):"m" (*(unsigned long *) s));
 145                 n -= 8;
 146                 d -= 8;
 147                 *(unsigned long *) d = tmp;
 148         }
 149         n += 8;
 150         DO_REST_ALIGNED_DN(d,s,n);
 151 }
 152
 153 void * memcpy(void * dest, const void *src, size_t n)
 154 {
 155         if (!(((unsigned long) dest ^ (unsigned long) src) & 7)) {
 156                 __memcpy_aligned_up ((unsigned long) dest, (unsigned long) src,
 157                                      n);
 158                 return dest;
 159         }
 160         __memcpy_unaligned_up ((unsigned long) dest, (unsigned long) src, n);
 161         return dest;
 162 }
 163 EXPORT_SYMBOL(memcpy);