arch/x86/lib/memset_64.S

   1 /* Copyright 2002 Andi Kleen, SuSE Labs */
   2
   3 #include <linux/linkage.h>
   4 #include <asm/cpufeatures.h>
   5 #include <asm/alternative-asm.h>
   6
   7 .weak memset
   8
   9 /*
  10  * memset() / __memset(): fill a memory block with one byte value, following
  11  * the ISO C contract. This variant relies on the fast string instructions:
  12  * whole quadwords go out via "rep stosq", the leftover bytes via "rep stosb".
  13  *
  14  * rdi   destination
  15  * rsi   value (char)
  16  * rdx   count (bytes)
  17  *
  18  * rax   original destination
  19  */
  20 ENTRY(memset)
  21 ENTRY(__memset)
  22         /*
  23          * CPU-feature dispatch. As assembled, this is a jump to memset_orig.
  24          * With X86_FEATURE_REP_GOOD the jump is replaced by nothing, so execution
  25          * falls through to the fast string code below. With X86_FEATURE_ERMS
  26          * (enhanced REP MOVSB/STOSB) it becomes a jump to memset_erms instead.
  27          */
  28         ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
  29                       "jmp memset_erms", X86_FEATURE_ERMS
  30
  31         movzbl %sil,%esi                /* keep only the fill byte */
  32         movq %rdi,%r9                   /* rep stos advances rdi; remember dst */
  33         movq %rdx,%rcx
  34         shrq $3,%rcx                    /* rcx = number of whole quadwords */
  35         andl $7,%edx                    /* edx = trailing bytes (0..7) */
  36         /* replicate the fill byte into all eight bytes of rax */
  37         movabs $0x0101010101010101,%rax
  38         imulq %rsi,%rax
  39         rep stosq
  40         movl %edx,%ecx                  /* now store the tail bytes */
  41         rep stosb
  42         movq %r9,%rax                   /* return the original destination */
  43         ret
  44 ENDPROC(memset)
  45 ENDPROC(__memset)
  46
  47 /*
  48  * memset_erms(): ISO C memset for CPUs with enhanced "rep stosb"; __memset
  49  * is patched to jump here in place of its fast string path. A single
  50  * byte-granular "rep stosb" stores the entire block.
  51  *
  52  * rdi   destination
  53  * rsi   value (char)
  54  * rdx   count (bytes)
  55  *
  56  * rax   original destination
  57  */
  58 ENTRY(memset_erms)
  59         movq %rdx,%rcx                  /* rep count = length in bytes */
  60         movq %rdi,%r9                   /* rdi gets advanced; keep dst for return */
  61         movb %sil,%al                   /* stosb writes al */
  62         rep stosb
  63         movq %r9,%rax
  64         ret
  65 ENDPROC(memset_erms)
  66
  67 ENTRY(memset_orig)
             /*
              * Fallback memset built from plain stores; the unpatched
              * ALTERNATIVE_2 in __memset jumps here.
              *
              * In:   rdi = destination, sil = fill byte, rdx = count (bytes)
              * Out:  rax = original destination
              *
              * Modifies rax, rcx, rdx, rdi, r9, r10 and the flags; r8 as well
              * on the misaligned-destination path.
              */
  68         movq %rdi,%r10                  /* keep dst for the return value */
  69
  70         /* expand byte value: rax = fill byte replicated into all 8 bytes */
  71         movzbl %sil,%ecx
  72         movabs $0x0101010101010101,%rax
  73         imulq  %rcx,%rax
  74
  75         /* align dst: r9 = dst & 7, non-zero means not 8-byte aligned */
  76         movl  %edi,%r9d
  77         andl  $7,%r9d
  78         jnz  .Lbad_alignment
  79 .Lafter_bad_alignment:
  80
  81         movq  %rdx,%rcx
  82         shrq  $6,%rcx                   /* rcx = number of 64-byte chunks */
  83         jz       .Lhandle_tail
  84         /* 64 bytes per pass; movq/leaq keep the ZF set by decq for the jnz */
  85         .p2align 4
  86 .Lloop_64:
  87         decq  %rcx
  88         movq  %rax,(%rdi)
  89         movq  %rax,8(%rdi)
  90         movq  %rax,16(%rdi)
  91         movq  %rax,24(%rdi)
  92         movq  %rax,32(%rdi)
  93         movq  %rax,40(%rdi)
  94         movq  %rax,48(%rdi)
  95         movq  %rax,56(%rdi)
  96         leaq  64(%rdi),%rdi
  97         jnz    .Lloop_64
  98
  99         /* Handle tail in loops. The loops should be faster than hard
 100            to predict jump tables. */
 101         .p2align 4
 102 .Lhandle_tail:
 103         movl    %edx,%ecx
 104         andl    $63&(~7),%ecx           /* bytes left in whole quadwords (0..56) */
 105         jz              .Lhandle_7
 106         shrl    $3,%ecx                 /* convert to a quadword count */
 107         .p2align 4
 108 .Lloop_8:
 109         decl   %ecx                     /* ZF survives the movq/leaq until jnz */
 110         movq  %rax,(%rdi)
 111         leaq  8(%rdi),%rdi
 112         jnz    .Lloop_8
 113
 114 .Lhandle_7:
 115         andl    $7,%edx                 /* edx = remaining single bytes (0..7) */
 116         jz      .Lende
 117         .p2align 4
 118 .Lloop_1:
 119         decl    %edx
 120         movb    %al,(%rdi)
 121         leaq    1(%rdi),%rdi
 122         jnz     .Lloop_1
 123
 124 .Lende:
 125         movq    %r10,%rax               /* return the original destination */
 126         ret
 127         /* dst is not 8-byte aligned here; r9 = dst & 7 */
 128 .Lbad_alignment:
 129         cmpq $7,%rdx                    /* 7 bytes or fewer: byte loop does it all */
 130         jbe     .Lhandle_7
 131         movq %rax,(%rdi)        /* unaligned store */
 132         movq $8,%r8
 133         subq %r9,%r8                    /* r8 = 8 - (dst & 7): bytes up to next 8-byte boundary */
 134         addq %r8,%rdi                   /* rdi is now 8-byte aligned */
 135         subq %r8,%rdx                   /* those bytes were covered by the store above */
 136         jmp .Lafter_bad_alignment
 137 .Lfinal:                                /* no reference to this label in the code shown here */
 138 ENDPROC(memset_orig)