old: 2*atan2(sqrt(1-x),sqrt(1+x))
new: atan2(fabs(sqrt((1-x)*(1+x))),x)
improvements:
* all edge cases are fixed (sign of zero in downward rounding)
* a bit faster (here a single call takes about 131ns with the new
formula vs 162ns with the old one)
* a bit more precise (at most 1ulp error on 1M uniform random
samples in [0,1); the old formula also gave some 2ulp errors)
+# use acos(x) = atan2(fabs(sqrt((1-x)*(1+x))), x)
+
.global acosf
.type acosf,@function
acosf:
.type acos,@function
acos:
fldl 4(%esp)
-1: fld1
- fld %st(1)
+1: fld %st(0) # duplicate x (loaded by the fldl above): st0 = x, st1 = x
fld1
- fsubp
- fsqrt
- fxch %st(2)
- faddp
+ fsub %st(0),%st(1) # intended: st1 = 1-x, st0 stays 1. NOTE(review): relies on GNU as's AT&T fsub/fsubr operand-order quirk for %st(i) destinations - confirm
+ fadd %st(2) # st0 = 1+x (st2 still holds the original x)
+ fmulp # multiply st1 by st0 and pop: st0 = (1-x)*(1+x), st1 = x
fsqrt
+ fabs # clear the sign bit so a -0 from fsqrt becomes +0: fix sign of zero (matters in downward rounding mode)
+ fxch %st(1) # st0 = x, st1 = fabs(sqrt((1-x)*(1+x))): fpatan takes y in st1 and x in st0
fpatan
- fld1
- fld1
- faddp
- fmulp
ret
+# see ../i386/acos.s
+
.global acosl
.type acosl,@function
acosl:
fldt 8(%rsp)
+1: fld %st(0) # duplicate x (loaded by the fldt above): st0 = x, st1 = x
fld1
- fld %st(1)
- fld1
- fsubp
- fsqrt
- fxch %st(2)
- faddp
+ fsub %st(0),%st(1) # intended: st1 = 1-x, st0 stays 1. NOTE(review): relies on GNU as's AT&T fsub/fsubr operand-order quirk for %st(i) destinations - confirm
+ fadd %st(2) # st0 = 1+x (st2 still holds the original x)
+ fmulp # multiply st1 by st0 and pop: st0 = (1-x)*(1+x), st1 = x
fsqrt
+ fabs # clear the sign bit so a -0 from fsqrt becomes +0 (sign of zero matters in downward rounding mode)
+ fxch %st(1) # st0 = x, st1 = fabs(sqrt((1-x)*(1+x))): fpatan takes y in st1 and x in st0
fpatan
- fld1
- fld1
- faddp
- fmulp
ret