"and $0xff,%esi" is a six-byte insn (81 e6 ff 00 00 00), can use
4-byte "movzbl %sil,%esi" (40 0f b6 f6) instead.
64-bit imul has a multi-cycle latency (3-7 cycles on common
microarchitectures); moving it as far up as possible gives its result
(rax) more time to become ready before the memory stores that consume it.
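The imul in question is the byte-broadcast multiply; a minimal illustration
(not part of the patch, "broadcast" is just an illustrative name):

  /* Illustration only: multiplying the fill byte by 0x0101010101010101
     replicates it into every byte, e.g. 0xab -> 0xabababababababab. */
  #include <stdint.h>
  uint64_t broadcast(uint8_t c) { return c * 0x0101010101010101ull; }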
There is no need to shuffle registers in preparation for "rep stos"
if we are not going to take that code path. Thus the patch moves the
"jump if len < 16" check up, and changes the alternate (short-length)
code path to use rdx and rdi instead of rcx and r8.
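For clarity, here is a rough C model of the control flow after the patch
(illustration only; memset_sketch is a stand-in name, and the real code
relies on the overlapping stores and rep stosq shown in the diff below):

  /* Rough, illustration-only C model of the patched control flow. */
  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  void *memset_sketch(void *dest, int c, size_t n)
  {
      unsigned char *d = dest, b = (unsigned char)c;
      uint64_t fill = b * 0x0101010101010101ull;   /* the imul broadcast */

      if (n >= 16) {
          /* only the rep-stos path needs the extra rcx/r8 setup */
          memcpy(d + n - 8, &fill, 8);             /* covers the unaligned tail */
          for (size_t i = 0; i < n >> 3; i++)      /* rep stosq in the asm */
              memcpy(d + 8*i, &fill, 8);
          return dest;
      }
      /* short path: driven directly by n (rdx) and dest (rdi),
         using overlapping head/tail stores */
      if (!n) return dest;
      d[0] = b; d[n-1] = b;
      if (n <= 2) return dest;
      d[1] = b; d[n-2] = b;
      if (n <= 4) return dest;
      memcpy(d,         &fill, 4);
      memcpy(d + n - 4, &fill, 4);
      if (n <= 8) return dest;
      memcpy(d + 4,     &fill, 4);
      memcpy(d + n - 8, &fill, 4);
      return dest;
  }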
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
.global memset
.type memset,@function
memset:
- and $0xff,%esi
+ movzbl %sil,%esi
mov $0x101010101010101,%rax
- mov %rdx,%rcx
- mov %rdi,%r8
+ # 64-bit imul has 3-7 cycles latency, launch early
imul %rsi,%rax
- cmp $16,%rcx
+
+ cmp $16,%rdx
jb 1f
- mov %rax,-8(%rdi,%rcx)
+ mov %rdx,%rcx
+ mov %rdi,%r8
shr $3,%rcx
+ mov %rax,-8(%rdi,%rdx)
rep
stosq
mov %r8,%rax
ret
-1: test %ecx,%ecx
+1: test %edx,%edx
jz 1f
mov %al,(%rdi)
- mov %al,-1(%rdi,%rcx)
- cmp $2,%ecx
+ mov %al,-1(%rdi,%rdx)
+ cmp $2,%edx
jbe 1f
mov %al,1(%rdi)
- mov %al,-2(%rdi,%rcx)
- cmp $4,%ecx
+ mov %al,-2(%rdi,%rdx)
+ cmp $4,%edx
jbe 1f
mov %eax,(%rdi)
- mov %eax,-4(%rdi,%rcx)
- cmp $8,%ecx
+ mov %eax,-4(%rdi,%rdx)
+ cmp $8,%edx
jbe 1f
mov %eax,4(%rdi)
- mov %eax,-8(%rdi,%rcx)
+ mov %eax,-8(%rdi,%rdx)
-1: mov %r8,%rax
+1: mov %rdi,%rax
ret
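
Not part of the patch, but one quick way to exercise both the short path
and the rep-stos path (including the n == 16 boundary) is a loop over
small lengths:

  /* Illustration only: check fills of length 0..64 and make sure no byte
     outside the requested range is touched. */
  #include <assert.h>
  #include <string.h>

  int main(void)
  {
      unsigned char buf[80];
      for (size_t n = 0; n <= 64; n++) {
          memset(buf, 0x5a, sizeof buf);   /* poison the whole buffer */
          memset(buf, 0xab, n);            /* fill under test         */
          for (size_t i = 0; i < sizeof buf; i++)
              assert(buf[i] == (i < n ? 0xab : 0x5a));
      }
      return 0;
  }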