arch/x86/lib/memmove_64.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * Normally compiler builtins are used, but sometimes the compiler calls out
   4  * of line code. Based on asm-i386/string.h.
   5  *
   6  * This assembly file is re-written from memmove_64.c file.
   7  *      - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
   8  */
   9 #include <linux/linkage.h>
  10 #include <asm/cpufeatures.h>
  11 #include <asm/alternative-asm.h>
  12 #include <asm/export.h>
  13
  14 #undef memmove
  15
  16 /*
  17  * Implement memmove(): copy count bytes from src to dest. This can handle overlap between src and dst.
  18  *
  19  * Input:
  20  * rdi: dest (destination address)
  21  * rsi: src (source address)
  22  * rdx: count (number of bytes to copy)
  23  *
  24  * Output:
  25  * rax: dest (the incoming rdi); rcx, rdx, rsi, rdi, r8-r11 and flags may be modified
  26  */
  27 .weak memmove
  28
  29 SYM_FUNC_START_ALIAS(memmove)
  30 SYM_FUNC_START(__memmove)
  31
  32         /* Save dest as the return value; only counts of 32 bytes or more use the loops below */
  33         mov %rdi, %rax          /* rax = dest, returned unchanged */
  34         cmp $0x20, %rdx
  35         jb      1f              /* count < 32: go straight to the small-size code at 1: */
  36
  37         /* Decide forward/backward copy mode: backward only when src < dst < src + count */
  38         cmp %rdi, %rsi
  39         jge .Lmemmove_begin_forward     /* src >= dst: forward. NOTE(review): jge/jg compare the addresses as signed values - confirm this is intended */
  40         mov %rsi, %r8
  41         add %rdx, %r8           /* r8 = src + count (one past the end of the source) */
  42         cmp %rdi, %r8
  43         jg 2f                   /* src + count > dst: the regions overlap, copy backward */
  44         /* Forward copy. ALTERNATIVE is defined elsewhere; presumably it patches in a single "rep movsb" copy when the CPU has ERMS - confirm */
  45 .Lmemmove_begin_forward:
  46         ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
  47
  48         /*
  49          * The movsq instruction has a high startup latency,
  50          * so we handle small sizes (below 680 bytes) with general registers.
  51          */
  52         cmp  $680, %rdx
  53         jb      3f
  54         /*
  55          * The movsq instruction is only good for the aligned case: it is used only when the low address bytes of src and dst are equal.
  56          */
  57
  58         cmpb %dil, %sil
  59         je 4f
  60 3:
  61         sub $0x20, %rdx         /* bias count by -32 (count >= 32 here) so the loop can test the borrow */
  62         /*
  63          * We gobble 32 bytes forward in each loop. mov and lea leave the flags alone, so the jae at the bottom tests the sub at the top.
  64          */
  65 5:
  66         sub $0x20, %rdx         /* CF is set once fewer than 32 bytes remain after this chunk */
  67         movq 0*8(%rsi), %r11
  68         movq 1*8(%rsi), %r10
  69         movq 2*8(%rsi), %r9
  70         movq 3*8(%rsi), %r8
  71         leaq 4*8(%rsi), %rsi
  72
  73         movq %r11, 0*8(%rdi)
  74         movq %r10, 1*8(%rdi)
  75         movq %r9, 2*8(%rdi)
  76         movq %r8, 3*8(%rdi)
  77         leaq 4*8(%rdi), %rdi
  78         jae 5b                  /* no borrow: at least one more 32-byte chunk to copy */
  79         addq $0x20, %rdx        /* undo the bias: rdx = remaining bytes (0..31) */
  80         jmp 1f                  /* copy the remainder with the small-size code */
  81         /*
  82          * Handle data forward by movsq. The last 8 bytes are loaded first and stored last, which covers a count that is not a multiple of 8.
  83          */
  84         .p2align 4
  85 4:
  86         movq %rdx, %rcx
  87         movq -8(%rsi, %rdx), %r11       /* r11 = last 8 source bytes, read before anything is written */
  88         lea -8(%rdi, %rdx), %r10        /* r10 = address of the last 8 destination bytes */
  89         shrq $3, %rcx                   /* rcx = count / 8 = number of quadwords for rep movsq */
  90         rep movsq
  91         movq %r11, (%r10)               /* store the saved tail */
  92         jmp 13f
  93 .Lmemmove_end_forward:
  94
  95         /*
  96          * Handle data backward by movsq. The first 8 bytes are loaded first and stored last, which covers a count that is not a multiple of 8.
  97          */
  98         .p2align 4
  99 7:
 100         movq %rdx, %rcx
 101         movq (%rsi), %r11               /* r11 = first 8 source bytes, read before anything is written */
 102         movq %rdi, %r10                 /* r10 = start of the destination */
 103         leaq -8(%rsi, %rdx), %rsi       /* rsi = last quadword of the source */
 104         leaq -8(%rdi, %rdx), %rdi       /* rdi = last quadword of the destination */
 105         shrq $3, %rcx                   /* rcx = count / 8 = number of quadwords for rep movsq */
 106         std                             /* set DF so movsq walks downward */
 107         rep movsq
 108         cld                             /* clear DF again before leaving */
 109         movq %r11, (%r10)               /* store the saved head */
 110         jmp 13f
 111
 112         /*
 113          * Start to prepare for backward copy. The same size (680) and low-address-byte tests as in the forward path select movsq (7:) or the register loop (6:).
 114          */
 115         .p2align 4
 116 2:
 117         cmp $680, %rdx
 118         jb 6f
 119         cmp %dil, %sil
 120         je 7b
 121 6:
 122         /*
 123          * Calculate copy position to tail: point rsi/rdi one past the end of each buffer and bias count by -32.
 124          */
 125         addq %rdx, %rsi
 126         addq %rdx, %rdi
 127         subq $0x20, %rdx
 128         /*
 129          * We gobble 32 bytes backward in each loop. mov and lea leave the flags alone, so the jae at the bottom tests the subq at the top.
 130          */
 131 8:
 132         subq $0x20, %rdx        /* CF is set once fewer than 32 bytes remain after this chunk */
 133         movq -1*8(%rsi), %r11
 134         movq -2*8(%rsi), %r10
 135         movq -3*8(%rsi), %r9
 136         movq -4*8(%rsi), %r8
 137         leaq -4*8(%rsi), %rsi
 138
 139         movq %r11, -1*8(%rdi)
 140         movq %r10, -2*8(%rdi)
 141         movq %r9, -3*8(%rdi)
 142         movq %r8, -4*8(%rdi)
 143         leaq -4*8(%rdi), %rdi
 144         jae 8b                  /* no borrow: at least one more 32-byte chunk to copy */
 145         /*
 146          * Calculate copy position to head: rdx becomes the remaining byte count (0..31) and rsi/rdi move back to the start of those bytes.
 147          */
 148         addq $0x20, %rdx
 149         subq %rdx, %rsi
 150         subq %rdx, %rdi
 151 1:
 152         cmpq $16, %rdx          /* small-size code: rdx is below 32 on every path that reaches here */
 153         jb 9f
 154         /*
 155          * Move data from 16 bytes to 31 bytes: the first 16 and the last 16 bytes (they may overlap), with all loads done before any store.
 156          */
 157         movq 0*8(%rsi), %r11
 158         movq 1*8(%rsi), %r10
 159         movq -2*8(%rsi, %rdx), %r9
 160         movq -1*8(%rsi, %rdx), %r8
 161         movq %r11, 0*8(%rdi)
 162         movq %r10, 1*8(%rdi)
 163         movq %r9, -2*8(%rdi, %rdx)
 164         movq %r8, -1*8(%rdi, %rdx)
 165         jmp 13f
 166         .p2align 4
 167 9:
 168         cmpq $8, %rdx
 169         jb 10f
 170         /*
 171          * Move data from 8 bytes to 15 bytes: the first 8 and the last 8 bytes, both loaded before either store.
 172          */
 173         movq 0*8(%rsi), %r11
 174         movq -1*8(%rsi, %rdx), %r10
 175         movq %r11, 0*8(%rdi)
 176         movq %r10, -1*8(%rdi, %rdx)
 177         jmp 13f
 178 10:
 179         cmpq $4, %rdx
 180         jb 11f
 181         /*
 182          * Move data from 4 bytes to 7 bytes: the first 4 and the last 4 bytes, both loaded before either store.
 183          */
 184         movl (%rsi), %r11d
 185         movl -4(%rsi, %rdx), %r10d
 186         movl %r11d, (%rdi)
 187         movl %r10d, -4(%rdi, %rdx)
 188         jmp 13f
 189 11:
 190         cmp $2, %rdx
 191         jb 12f
 192         /*
 193          * Move data from 2 bytes to 3 bytes: the first 2 and the last 2 bytes, both loaded before either store.
 194          */
 195         movw (%rsi), %r11w
 196         movw -2(%rsi, %rdx), %r10w
 197         movw %r11w, (%rdi)
 198         movw %r10w, -2(%rdi, %rdx)
 199         jmp 13f
 200 12:
 201         cmp $1, %rdx
 202         jb 13f                  /* count == 0: nothing to copy */
 203         /*
 204          * Move data for 1 byte.
 205          */
 206         movb (%rsi), %r11b
 207         movb %r11b, (%rdi)
 208 13:
 209         retq                    /* rax still holds the original dest */
 210 SYM_FUNC_END(__memmove)
 211 SYM_FUNC_END_ALIAS(memmove)
 212 EXPORT_SYMBOL(__memmove)
 213 EXPORT_SYMBOL(memmove)