/* arch/x86_64/lib/memset.S */

/* Copyright 2002 Andi Kleen, SuSE Labs */
   2 /*
   3  * ISO C memset - set a memory block to a byte value.
   4  *
   5  * rdi   destination
   6  * rsi   value (char)
   7  * rdx   count (bytes)
   8  *
   9  * rax   original destination
  10  */
  11         .globl __memset
  12         .globl memset
  13         .p2align 4
  14 memset:
  15 __memset:
  16         movq %rdi,%r10
  17         movq %rdx,%r11
  18
  19         /* expand byte value  */
  20         movzbl %sil,%ecx
  21         movabs $0x0101010101010101,%rax
  22         mul    %rcx             /* with rax, clobbers rdx */
  23
  24         /* align dst */
  25         movl  %edi,%r9d
  26         andl  $7,%r9d
  27         jnz  .Lbad_alignment
  28 .Lafter_bad_alignment:
  29
  30         movl %r11d,%ecx
  31         shrl $6,%ecx
  32         jz       .Lhandle_tail
  33
  34         .p2align 4
  35 .Lloop_64:
  36         decl   %ecx
  37         movq  %rax,(%rdi)
  38         movq  %rax,8(%rdi)
  39         movq  %rax,16(%rdi)
  40         movq  %rax,24(%rdi)
  41         movq  %rax,32(%rdi)
  42         movq  %rax,40(%rdi)
  43         movq  %rax,48(%rdi)
  44         movq  %rax,56(%rdi)
  45         leaq  64(%rdi),%rdi
  46         jnz    .Lloop_64
  47
  48         /* Handle tail in loops. The loops should be faster than hard
  49            to predict jump tables. */
  50         .p2align 4
  51 .Lhandle_tail:
  52         movl    %r11d,%ecx
  53         andl    $63&(~7),%ecx
  54         jz              .Lhandle_7
  55         shrl    $3,%ecx
  56         .p2align 4
  57 .Lloop_8:
  58         decl   %ecx
  59         movq  %rax,(%rdi)
  60         leaq  8(%rdi),%rdi
  61         jnz    .Lloop_8
  62
  63 .Lhandle_7:
  64         movl    %r11d,%ecx
  65         andl    $7,%ecx
  66         jz      .Lende
  67         .p2align 4
  68 .Lloop_1:
  69         decl    %ecx
  70         movb    %al,(%rdi)
  71         leaq    1(%rdi),%rdi
  72         jnz     .Lloop_1
  73
  74 .Lende:
  75         movq    %r10,%rax
  76         ret
  77
  78 .Lbad_alignment:
  79         cmpq $7,%r11
  80         jbe     .Lhandle_7
  81         movq %rax,(%rdi)        /* unaligned store */
  82         movq $8,%r8
  83         subq %r9,%r8
  84         addq %r8,%rdi
  85         subq %r8,%r11
  86         jmp .Lafter_bad_alignment
  87
  88         /* Some CPUs run faster using the string instructions.
  89            It is also a lot simpler. Use this when possible */
  90
  91 #include <asm/cpufeature.h>
  92
  93         .section .altinstructions,"a"
  94         .align 8
  95         .quad  memset
  96         .quad  memset_c
  97         .byte  X86_FEATURE_REP_GOOD
  98         .byte  memset_c_end-memset_c
  99         .byte  memset_c_end-memset_c
 100         .previous
 101
 102         .section .altinstr_replacement,"ax"
 103  /* rdi destination
 104   * rsi value
 105   * rdx count
 106   */
 107 memset_c:
 108         movq %rdi,%r9
 109         movl %edx,%r8d
 110         andl $7,%r8d
 111         movl %edx,%ecx
 112         shrl $3,%ecx
 113         /* expand byte value  */
 114         movzbl %sil,%esi
 115         movabs $0x0101010101010101,%rax
 116         mulq   %rsi             /* with rax, clobbers rdx */
 117         rep
 118         stosq
 119         movl %r8d,%ecx
 120         rep
 121         stosb
 122         movq %r9,%rax
 123         ret
 124 memset_c_end:
 125         .previous