tools/arch/x86/lib/memcpy_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /* Copyright 2002 Andi Kleen */
   3
   4 #include <linux/export.h>
   5 #include <linux/linkage.h>
   6 #include <asm/errno.h>
   7 #include <asm/cpufeatures.h>
   8 #include <asm/alternative.h>
   9
   10 .section .noinstr.text, "ax"        /* section flags "ax": allocatable, executable */
  11
  12 /*
  13  * memcpy - Copy a memory block.
  14  *
  15  * Input:
  16  *  rdi destination
  17  *  rsi source
  18  *  rdx count
  19  *
  20  * Output:
  21  * rax original destination
  22  *
  23  * The FSRM alternative should be done inline (avoiding the call and
  24  * the disgusting return handling), but that would require some help
  25  * from the compiler for better calling conventions.
  26  *
  27  * The 'rep movsb' itself is small enough to replace the call, but the
  28  * two register moves blow up the code. And one of them is "needed"
   29  * only for the return value that is the same as the destination input,
  30  * which the compiler could/should do much better anyway.
  31  */
   32 SYM_TYPED_FUNC_START(__memcpy)
             /*
              * Entry-point dispatch: ALTERNATIVE is given "jmp memcpy_orig" as
              * the default instruction and an empty replacement keyed on
              * X86_FEATURE_FSRM. With the jump in place the copy is done by
              * memcpy_orig; with it patched out, execution falls through to
              * the 'rep movsb' sequence below.
              */
   33         ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
   34
   35         movq %rdi, %rax         /* return value: original destination */
   36         movq %rdx, %rcx         /* rep count = byte count */
   37         rep movsb               /* copy %rcx bytes from (%rsi) to (%rdi) */
   38         RET
   39 SYM_FUNC_END(__memcpy)
   40 EXPORT_SYMBOL(__memcpy)
   41
      /* memcpy is defined as an alias of __memcpy (linkage chosen by the macro). */
   42 SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
   43 EXPORT_SYMBOL(memcpy)
  44
   45 SYM_FUNC_START_LOCAL(memcpy_orig)
             /*
              * memcpy_orig - open-coded copy built from 8-byte moves.
              *
              * Reached from __memcpy through "jmp memcpy_orig", so the register
              * contract is the same: rdi = destination, rsi = source,
              * rdx = count; rax returns the original destination.
              *
              * Counts below 0x20 go straight to .Lhandle_tail. Larger counts
              * are copied in 32-byte blocks (forward or backward) and the
              * remaining 0..31 bytes are finished by .Lhandle_tail.
              *
              * rcx and r8-r11 are used as scratch; rsi, rdi and rdx are
              * modified.
              */
   46         movq %rdi, %rax         /* return value: original destination */
   47
   48         cmpq $0x20, %rdx
   49         jb .Lhandle_tail        /* fewer than 32 bytes: tail code only */
   50
   51         /*
   52          * We check whether memory false dependence could occur,
   53          * then jump to corresponding copy mode.
   54          */
   55         cmp  %dil, %sil         /* compares only the low bytes of src and dst */
   56         jl .Lcopy_backward      /* signed compare: taken when %sil < %dil */
   57         subq $0x20, %rdx        /* bias count by -32 for the loop test below */
   58 .Lcopy_forward_loop:
   59         subq $0x20,     %rdx    /* borrow (CF) => this is the last full block */
   60
   61         /*
   62          * Move in blocks of 4x8 bytes:
   63          */
   64         movq 0*8(%rsi), %r8
   65         movq 1*8(%rsi), %r9
   66         movq 2*8(%rsi), %r10
   67         movq 3*8(%rsi), %r11
   68         leaq 4*8(%rsi), %rsi
   69
   70         movq %r8,       0*8(%rdi)
   71         movq %r9,       1*8(%rdi)
   72         movq %r10,      2*8(%rdi)
   73         movq %r11,      3*8(%rdi)
   74         leaq 4*8(%rdi), %rdi
   75         jae  .Lcopy_forward_loop        /* mov/lea keep flags: tests CF of the subq above */
   76         addl $0x20,     %edx    /* undo bias: edx = bytes left (0..31); 32-bit write clears upper rdx */
   77         jmp  .Lhandle_tail
   78
   79 .Lcopy_backward:
   80         /*
   81          * Calculate copy position to tail.
   82          */
   83         addq %rdx,      %rsi    /* rsi = one past the end of the source */
   84         addq %rdx,      %rdi    /* rdi = one past the end of the destination */
   85         subq $0x20,     %rdx    /* same -32 bias as in the forward path */
   86         /*
   87          * At most 3 ALU operations in one cycle,
   88          * so append NOPs within the same 16-byte chunk.
   89          */
   90         .p2align 4
   91 .Lcopy_backward_loop:
   92         subq $0x20,     %rdx    /* borrow (CF) => this is the last full block */
   93         movq -1*8(%rsi),        %r8
   94         movq -2*8(%rsi),        %r9
   95         movq -3*8(%rsi),        %r10
   96         movq -4*8(%rsi),        %r11
   97         leaq -4*8(%rsi),        %rsi
   98         movq %r8,               -1*8(%rdi)
   99         movq %r9,               -2*8(%rdi)
  100         movq %r10,              -3*8(%rdi)
  101         movq %r11,              -4*8(%rdi)
  102         leaq -4*8(%rdi),        %rdi
  103         jae  .Lcopy_backward_loop       /* mov/lea keep flags: tests CF of the subq above */
  104
  105         /*
  106          * Calculate copy position to head.
  107          */
  108         addl $0x20,     %edx    /* undo bias: edx = bytes left (0..31); 32-bit write clears upper rdx */
  109         subq %rdx,      %rsi    /* step back over the uncopied head of the source */
  110         subq %rdx,      %rdi    /* step back over the uncopied head of the destination */
  111 .Lhandle_tail:
             /*
              * Here edx holds the bytes still to copy and is below 32,
              * either from the entry check or from the addl after a loop.
              */
  112         cmpl $16,       %edx
  113         jb   .Lless_16bytes
  114
  115         /*
  116          * Move data from 16 bytes to 31 bytes.
              * The first 16 and the last 16 bytes are all loaded before
              * any store; the two halves overlap (or coincide) because
              * the count is below 32.
  117          */
  118         movq 0*8(%rsi), %r8
  119         movq 1*8(%rsi), %r9
  120         movq -2*8(%rsi, %rdx),  %r10
  121         movq -1*8(%rsi, %rdx),  %r11
  122         movq %r8,       0*8(%rdi)
  123         movq %r9,       1*8(%rdi)
  124         movq %r10,      -2*8(%rdi, %rdx)
  125         movq %r11,      -1*8(%rdi, %rdx)
  126         RET
  127         .p2align 4
  128 .Lless_16bytes:
  129         cmpl $8,        %edx
  130         jb   .Lless_8bytes
  131         /*
  132          * Move data from 8 bytes to 15 bytes.
              * First 8 and last 8 bytes, loaded before either store.
  133          */
  134         movq 0*8(%rsi), %r8
  135         movq -1*8(%rsi, %rdx),  %r9
  136         movq %r8,       0*8(%rdi)
  137         movq %r9,       -1*8(%rdi, %rdx)
  138         RET
  139         .p2align 4
  140 .Lless_8bytes:
  141         cmpl $4,        %edx
  142         jb   .Lless_3bytes
  143
  144         /*
  145          * Move data from 4 bytes to 7 bytes.
              * First 4 and last 4 bytes, loaded before either store.
  146          */
  147         movl (%rsi), %ecx
  148         movl -4(%rsi, %rdx), %r8d
  149         movl %ecx, (%rdi)
  150         movl %r8d, -4(%rdi, %rdx)
  151         RET
  152         .p2align 4
  153 .Lless_3bytes:
             /* Despite the label name, this is reached for counts 0..3 (below 4). */
  154         subl $1, %edx           /* edx = count - 1; CF set if count was 0 */
  155         jb .Lend                /* count was 0: nothing to copy */
  156         /*
  157          * Move data from 1 byte to 3 bytes.
  158          */
  159         movzbl (%rsi), %ecx     /* first byte; the load leaves flags untouched */
  160         jz .Lstore_1byte        /* ZF from the subl: count was exactly 1 */
  161         movzbq 1(%rsi), %r8     /* second byte */
  162         movzbq (%rsi, %rdx), %r9        /* last byte, at index count - 1 */
  163         movb %r8b, 1(%rdi)
  164         movb %r9b, (%rdi, %rdx)
  165 .Lstore_1byte:
  166         movb %cl, (%rdi)
  167
  168 .Lend:
  169         RET
  170 SYM_FUNC_END(memcpy_orig)
 171