the fscale instruction is slow everywhere, probably because it
involves a costly and unnecessary integer truncation operation that
ends up being a no-op in common usages. instead, construct a floating
point scale value with integer arithmetic and simply multiply by it,
when possible.
for float and double, this is always possible by going to the
next-larger type. we use some cheap but effective saturating
arithmetic tricks to make sure even very large-magnitude exponents
fit. for long double, if the scaling exponent is too large to fit in
the exponent of a long double value, we simply fall back to the
expensive fscale method.
on an atom cpu, these changes speed up scalbn by over 30%. (min rdtsc
timing dropped from 110 cycles to 70 cycles.)
.global scalbn
.type scalbn,@function
scalbn:
- fildl 12(%esp)
+ mov 12(%esp),%eax  # scalbn: multiply x by a hand-built 2^n instead of using fscale; eax = n
+ add $0x3ffe,%eax  # bias so that n in [-0x3ffe,0x3ffe] maps to [0,0x7ffc]
+ cmp $0x7ffd,%eax  # unsigned range check; too-large and too-small n both fail it
+ jb 1f  # in range: eax+1 is the biased extended-precision exponent of 2^n
+ sub $0x3ffe,%eax  # out of range: undo the bias to recover n
+ sar $31,%eax  # eax = 0 if n >= 0, -1 if n < 0
+ xor $0xfff,%eax  # eax = 0xfff if n >= 0, -0x1000 if n < 0
+ add $0x3ffe,%eax  # re-bias the saturated exponent: 0x4ffd or 0x2ffe
+1: inc %eax  # eax = n+0x3fff in range, else 0x4ffe (2^4095) or 0x2fff (2^-4096)
fldl 4(%esp)
- fscale
- fstp %st(1)
+ mov %eax,12(%esp)  # x is already on the FPU stack; reuse the argument slots to build an 80-bit scale factor (this is its sign/exponent word)
+ mov $0x80000000,%eax  # high mantissa dword: only the explicit integer bit set
+ mov %eax,8(%esp)
+ xor %eax,%eax  # low mantissa dword = 0, so the factor is exactly a power of two
+ mov %eax,4(%esp)
+ fldt 4(%esp)  # load the 10-byte scale factor from 4(%esp)
+ fmulp  # multiply x by the scale factor and pop, leaving the product in st(0)
fstpl 4(%esp)
fldl 4(%esp)
ret
.global scalbnf
.type scalbnf,@function
scalbnf:
- fildl 8(%esp)
+ mov 8(%esp),%eax  # scalbnf: multiply x by a hand-built double 2^n instead of using fscale; eax = n
+ add $0x3fe,%eax  # bias so that n in [-0x3fe,0x3fe] maps to [0,0x7fc]
+ cmp $0x7fd,%eax  # unsigned range check; too-large and too-small n both fail it
+ jb 1f  # in range: eax+1 is the biased double-precision exponent of 2^n
+ sub $0x3fe,%eax  # out of range: undo the bias to recover n
+ sar $31,%eax  # eax = 0 if n >= 0, -1 if n < 0
+ xor $0x1ff,%eax  # eax = 0x1ff if n >= 0, -0x200 if n < 0
+ add $0x3fe,%eax  # re-bias the saturated exponent: 0x5fd or 0x1fe
+1: inc %eax  # eax = n+0x3ff in range, else 0x5fe (2^511) or 0x1ff (2^-512)
+ shl $20,%eax  # move the exponent into bits 20-30 of a double's high dword
flds 4(%esp)
- fscale
- fstp %st(1)
+ mov %eax,8(%esp)  # x is already on the FPU stack; reuse the argument slots to build a 64-bit scale factor (this is its high dword)
+ xor %eax,%eax  # low dword = 0, so the factor is exactly a power of two
+ mov %eax,4(%esp)
+ fldl 4(%esp)  # load the 8-byte scale factor from 4(%esp)
+ fmulp  # multiply x by the scale factor and pop, leaving the product in st(0)
fstps 4(%esp)
flds 4(%esp)
ret
.global scalbnl
.type scalbnl,@function
scalbnl:
- fildl 16(%esp)
+ mov 16(%esp),%eax
+ add $0x3ffe,%eax
+ cmp $0x7ffd,%eax
+ jae 1f
+ inc %eax
+ fldt 4(%esp)
+ mov %eax,12(%esp)
+ mov $0x80000000,%eax
+ mov %eax,8(%esp)
+ xor %eax,%eax
+ mov %eax,4(%esp)
+ fldt 4(%esp)
+ fmulp
+ ret
+1: fildl 16(%esp)
fldt 4(%esp)
fscale
fstp %st(1)