.global memset
.type memset,@function
memset:
- movzbl %sil,%esi
- mov $0x101010101010101,%rax
- # 64-bit imul has 3-7 cycles latency, launch early
- imul %rsi,%rax
+ movzbq %sil,%rax
+ mov $0x101010101010101,%r8
+ imul %r8,%rax
- cmp $16,%rdx
- jb 1f
+ cmp $126,%rdx
+ ja 2f
- lea -1(%rdx),%rcx
- mov %rdi,%r8
- shr $3,%rcx
- mov %rax,-8(%rdi,%rdx)
- rep
- stosq
- mov %r8,%rax
- ret
-
-1: test %edx,%edx
+ test %edx,%edx
jz 1f
- mov %al,(%rdi)
- mov %al,-1(%rdi,%rdx)
+ mov %sil,(%rdi)
+ mov %sil,-1(%rdi,%rdx)
cmp $2,%edx
jbe 1f
- mov %al,1(%rdi)
- mov %al,-2(%rdi,%rdx)
- cmp $4,%edx
+ mov %ax,1(%rdi)
+ mov %ax,(-1-2)(%rdi,%rdx)
+ cmp $6,%edx
+ jbe 1f
+
+ mov %eax,(1+2)(%rdi)
+ mov %eax,(-1-2-4)(%rdi,%rdx)
+ cmp $14,%edx
+ jbe 1f
+
+ mov %rax,(1+2+4)(%rdi)
+ mov %rax,(-1-2-4-8)(%rdi,%rdx)
+ cmp $30,%edx
jbe 1f
- mov %eax,(%rdi)
- mov %eax,-4(%rdi,%rdx)
- cmp $8,%edx
+ mov %rax,(1+2+4+8)(%rdi)
+ mov %rax,(1+2+4+8+8)(%rdi)
+ mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
+ mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
+ cmp $62,%edx
jbe 1f
- mov %eax,4(%rdi)
- mov %eax,-8(%rdi,%rdx)
+ mov %rax,(1+2+4+8+16)(%rdi)
+ mov %rax,(1+2+4+8+16+8)(%rdi)
+ mov %rax,(1+2+4+8+16+16)(%rdi)
+ mov %rax,(1+2+4+8+16+24)(%rdi)
+ mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
+ mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
+ mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
+ mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
1: mov %rdi,%rax
ret
+
+2: test $15,%edi
+ mov %rdi,%r8
+ mov %rax,-8(%rdi,%rdx)
+ mov %rdx,%rcx
+ jnz 2f
+
+1: shr $3,%rcx
+ rep
+ stosq
+ mov %r8,%rax
+ ret
+
+2: xor %edx,%edx
+ sub %edi,%edx
+ and $15,%edx
+ mov %rax,(%rdi)
+ mov %rax,8(%rdi)
+ sub %rdx,%rcx
+ add %rdx,%rdi
+ jmp 1b