for the sake of simplicity, I've only used rep movsb rather than
breaking the copy up to use rep movsd/q. on all modern cpus this
seems to be fine, but if performance problems show up, it may be
necessary to go back and add rep movsd/q support.
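
for reference, a rough sketch of what such a split could look like on
the forward path (hypothetical, not part of this commit): it assumes
%esi, %edi, and %ecx are already set up as in the i386 code below,
DF is clear, and %edx is free to clobber; the backward path is
messier, since the 0-3 tail bytes would have to be copied before
switching to dword chunks.

	mov %ecx,%edx      # save byte count
	shr $2,%ecx        # number of whole dwords
	rep movsd          # copy 4 bytes at a time
	mov %edx,%ecx
	and $3,%ecx        # 0-3 leftover bytes
	rep movsb          # finish the tail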
--- /dev/null
+.global memmove
+.type memmove,@function
+memmove:
+ mov 4(%esp),%eax       # eax = dest
+ sub 8(%esp),%eax       # eax = dest - src (mod 2^32)
+ cmp 12(%esp),%eax      # unsigned: dest-src >= n means dest < src
+ jae memcpy             # or dest >= src+n, so a forward copy is safe
+ push %esi              # save callee-saved registers
+ push %edi
+ mov 12(%esp),%edi      # dest (arg offsets shifted 8 by the pushes)
+ mov 16(%esp),%esi      # src
+ mov 20(%esp),%ecx      # n
+ lea -1(%edi,%ecx),%edi # point at last byte of dest
+ lea -1(%esi,%ecx),%esi # point at last byte of src
+ std                    # direction flag set: movs decrements
+ rep movsb              # copy n bytes backwards
+ cld                    # restore DF=0 as the ABI requires
+ lea 1(%edi),%eax       # edi stopped at dest-1; return dest
+ pop %edi
+ pop %esi
+ ret
--- /dev/null
+.global memmove
+.type memmove,@function
+memmove:
+ mov %rdi,%rax          # rax = dest
+ sub %rsi,%rax          # rax = dest - src (mod 2^64)
+ cmp %rdx,%rax          # unsigned: dest-src >= n means dest < src
+ jae memcpy             # or dest >= src+n, so a forward copy is safe
+ mov %rdx,%rcx          # count for rep
+ lea -1(%rdi,%rdx),%rdi # point at last byte of dest
+ lea -1(%rsi,%rdx),%rsi # point at last byte of src
+ std                    # direction flag set: movs decrements
+ rep movsb              # copy n bytes backwards
+ cld                    # restore DF=0 as the ABI requires
+ lea 1(%rdi),%rax       # rdi stopped at dest-1; return dest
+ ret