overhaul optimized i386 memset asm

author Rich Felker <dalias@aerifal.cx>
Thu, 26 Feb 2015 06:51:39 +0000 (01:51 -0500)
committer Rich Felker <dalias@aerifal.cx>
Thu, 26 Feb 2015 06:51:39 +0000 (01:51 -0500)
diff --git a/src/string/i386/memset.s b/src/string/i386/memset.s
index 06ac923e9c19eb0af9f557f12514d71387790a77..d00422c4ac15d90f133fabdc79d231702c228f6f 100644
--- a/src/string/i386/memset.s
+++ b/src/string/i386/memset.s
@@ -1,47 +1,76 @@
  .global memset
  .type memset,@function
  memset:
-       mov 8(%esp),%al
-       push %edi
-       mov %al,%ah
-       mov %al,%dl
-       mov 16(%esp),%ecx
-       shl $16,%eax
-       mov 8(%esp),%edi
-       mov %dl,%al
-       mov %dl,%ah
-       cmp $16,%ecx
-       jb 1f
+       mov 12(%esp),%ecx
+       cmp $62,%ecx
+       ja 2f
  
-       mov %eax,-4(%edi,%ecx)
-       shr $2,%ecx
-       rep
-       stosl
-       mov 8(%esp),%eax
-       pop %edi
-       ret
-
-1:     test %ecx,%ecx
+       mov 8(%esp),%dl
+       mov 4(%esp),%eax
+       test %ecx,%ecx
         jz 1f
  
-       mov %al,(%edi)
-       mov %al,-1(%edi,%ecx)
+       mov %dl,%dh
+
+       mov %dl,(%eax)
+       mov %dl,-1(%eax,%ecx)
         cmp $2,%ecx
         jbe 1f
  
-       mov %al,1(%edi)
-       mov %al,-2(%edi,%ecx)
-       cmp $4,%ecx
+       mov %dx,1(%eax)
+       mov %dx,(-1-2)(%eax,%ecx)
+       cmp $6,%ecx
         jbe 1f
  
-       mov %eax,(%edi)
-       mov %eax,-4(%edi,%ecx)
-       cmp $8,%ecx
+       shl $16,%edx
+       mov 8(%esp),%dl
+       mov 8(%esp),%dh
+
+       mov %edx,(1+2)(%eax)
+       mov %edx,(-1-2-4)(%eax,%ecx)
+       cmp $14,%ecx
         jbe 1f
  
-       mov %eax,4(%edi)
-       mov %eax,-8(%edi,%ecx)
+       mov %edx,(1+2+4)(%eax)
+       mov %edx,(1+2+4+4)(%eax)
+       mov %edx,(-1-2-4-8)(%eax,%ecx)
+       mov %edx,(-1-2-4-4)(%eax,%ecx)
+       cmp $30,%ecx
+       jbe 1f
+
+       mov %edx,(1+2+4+8)(%eax)
+       mov %edx,(1+2+4+8+4)(%eax)
+       mov %edx,(1+2+4+8+8)(%eax)
+       mov %edx,(1+2+4+8+12)(%eax)
+       mov %edx,(-1-2-4-8-16)(%eax,%ecx)
+       mov %edx,(-1-2-4-8-12)(%eax,%ecx)
+       mov %edx,(-1-2-4-8-8)(%eax,%ecx)
+       mov %edx,(-1-2-4-8-4)(%eax,%ecx)
+
+1:     ret     
+
+2:     movzbl 8(%esp),%eax
+       mov %edi,12(%esp)
+       imul $0x1010101,%eax
+       mov 4(%esp),%edi
+       test $15,%edi
+       mov %eax,-4(%edi,%ecx)
+       jnz 2f
  
-1:     mov 8(%esp),%eax
-       pop %edi
+1:     shr $2, %ecx
+       rep
+       stosl
+       mov 4(%esp),%eax
+       mov 12(%esp),%edi
         ret
+       
+2:     xor %edx,%edx
+       sub %edi,%edx
+       and $15,%edx
+       mov %eax,(%edi)
+       mov %eax,4(%edi)
+       mov %eax,8(%edi)
+       mov %eax,12(%edi)
+       sub %edx,%ecx
+       add %edx,%edi
+       jmp 1b
author	Rich Felker <dalias@aerifal.cx>
	Thu, 26 Feb 2015 06:51:39 +0000 (01:51 -0500)
committer	Rich Felker <dalias@aerifal.cx>
	Thu, 26 Feb 2015 06:51:39 +0000 (01:51 -0500)