When FLT_EVAL_METHOD!=0 (only i386 with x87 fp) the excess
precision of an expression must be removed in an assignment.
(gcc needs -fexcess-precision=standard or -std=c99 for this)
This is done by extra load/store instructions which adds code
bloat when lot of temporaries are used and it makes the result
less precise in many cases.
Using double_t and float_t avoids these issues on i386 and
it makes no difference on other archs.
For now only a few functions are modified where the excess
precision is clearly beneficial (mostly polynomial evaluations
with temporaries).
object size differences on i386, gcc-4.8:
old new
__cosdf.o 123 95
__cos.o 199 169
__sindf.o 131 95
__sin.o 225 203
__tandf.o 207 151
__tan.o 605 499
erff.o 1470 1416
erf.o 1703 1649
j0f.o 1779 1745
j0.o 2308 2274
j1f.o 1602 1568
j1.o 2286 2252
tgamma.o 1431 1424
math/*.o 64164 63635
21 files changed:
double __cos(double x, double y)
{
double __cos(double x, double y)
{
float __cosdf(double x)
{
float __cosdf(double x)
{
/* Try to optimize for parallel evaluation as in __tandf.c. */
z = x*x;
/* Try to optimize for parallel evaluation as in __tandf.c. */
z = x*x;
*/
static inline double __log1p(double f)
{
*/
static inline double __log1p(double f)
{
- double hfsq,s,z,R,w,t1,t2;
+ double_t hfsq,s,z,R,w,t1,t2;
static inline float __log1pf(float f)
{
static inline float __log1pf(float f)
{
- float hfsq,s,z,R,w,t1,t2;
+ float_t hfsq,s,z,R,w,t1,t2;
s = f/(2.0f + f);
z = s*s;
s = f/(2.0f + f);
z = s*s;
double __sin(double x, double y, int iy)
{
double __sin(double x, double y, int iy)
{
float __sindf(double x)
{
float __sindf(double x)
{
/* Try to optimize for parallel evaluation as in __tandf.c. */
z = x*x;
/* Try to optimize for parallel evaluation as in __tandf.c. */
z = x*x;
double __tan(double x, double y, int iy)
{
double __tan(double x, double y, int iy)
{
- double z, r, v, w, s, sign;
+ double_t z, r, v, w, s, sign;
int32_t ix, hx;
GET_HIGH_WORD(hx,x);
int32_t ix, hx;
GET_HIGH_WORD(hx,x);
* -1.0 / (x+r) here
*/
/* compute -1.0 / (x+r) accurately */
* -1.0 / (x+r) here
*/
/* compute -1.0 / (x+r) accurately */
+ double_t a;
+ double z, t;
z = w;
SET_LOW_WORD(z,0);
v = r - (z - x); /* z+v = r+x */
z = w;
SET_LOW_WORD(z,0);
v = r - (z - x); /* z+v = r+x */
float __tandf(double x, int iy)
{
float __tandf(double x, int iy)
{
static double R(double z)
{
static double R(double z)
{
p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
q = 1.0+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
return p/q;
p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
q = 1.0+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
return p/q;
static float R(float z)
{
static float R(float z)
{
p = z*(pS0+z*(pS1+z*pS2));
q = 1.0f+z*qS1;
return p/q;
p = z*(pS0+z*(pS1+z*pS2));
q = 1.0f+z*qS1;
return p/q;
static double R(double z)
{
static double R(double z)
{
p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
q = 1.0+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
return p/q;
p = z*(pS0+z*(pS1+z*(pS2+z*(pS3+z*(pS4+z*pS5)))));
q = 1.0+z*(qS1+z*(qS2+z*(qS3+z*qS4)));
return p/q;
static float R(float z)
{
static float R(float z)
{
p = z*(pS0+z*(pS1+z*pS2));
q = 1.0f+z*qS1;
return p/q;
p = z*(pS0+z*(pS1+z*pS2));
q = 1.0f+z*qS1;
return p/q;
uint32_t ix,sign;
int id;
uint32_t ix,sign;
int id;
uint32_t ix,sign;
int id;
uint32_t ix,sign;
int id;
static double erfc1(double x)
{
static double erfc1(double x)
{
s = fabs(x) - 1;
P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
s = fabs(x) - 1;
P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
static double erfc2(uint32_t ix, double x)
{
static double erfc2(uint32_t ix, double x)
{
+ double_t s,R,S;
+ double z;
if (ix < 0x3ff40000) /* |x| < 1.25 */
return erfc1(x);
if (ix < 0x3ff40000) /* |x| < 1.25 */
return erfc1(x);
static float erfc1(float x)
{
static float erfc1(float x)
{
s = fabsf(x) - 1;
P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
s = fabsf(x) - 1;
P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
static float erfc2(uint32_t ix, float x)
{
static float erfc2(uint32_t ix, float x)
{
+ float_t s,R,S;
+ float z;
if (ix < 0x3fa00000) /* |x| < 1.25 */
return erfc1(x);
if (ix < 0x3fa00000) /* |x| < 1.25 */
return erfc1(x);
static double pzero(double x)
{
const double *p,*q;
static double pzero(double x)
{
const double *p,*q;
uint32_t ix;
GET_HIGH_WORD(ix, x);
uint32_t ix;
GET_HIGH_WORD(ix, x);
static double qzero(double x)
{
const double *p,*q;
static double qzero(double x)
{
const double *p,*q;
uint32_t ix;
GET_HIGH_WORD(ix, x);
uint32_t ix;
GET_HIGH_WORD(ix, x);
static float pzerof(float x)
{
const float *p,*q;
static float pzerof(float x)
{
const float *p,*q;
uint32_t ix;
GET_FLOAT_WORD(ix, x);
uint32_t ix;
GET_FLOAT_WORD(ix, x);
static float qzerof(float x)
{
const float *p,*q;
static float qzerof(float x)
{
const float *p,*q;
uint32_t ix;
GET_FLOAT_WORD(ix, x);
uint32_t ix;
GET_FLOAT_WORD(ix, x);
static double pone(double x)
{
const double *p,*q;
static double pone(double x)
{
const double *p,*q;
uint32_t ix;
GET_HIGH_WORD(ix, x);
uint32_t ix;
GET_HIGH_WORD(ix, x);
static double qone(double x)
{
const double *p,*q;
static double qone(double x)
{
const double *p,*q;
uint32_t ix;
GET_HIGH_WORD(ix, x);
uint32_t ix;
GET_HIGH_WORD(ix, x);
static float ponef(float x)
{
const float *p,*q;
static float ponef(float x)
{
const float *p,*q;
uint32_t ix;
GET_FLOAT_WORD(ix, x);
uint32_t ix;
GET_FLOAT_WORD(ix, x);
static float qonef(float x)
{
const float *p,*q;
static float qonef(float x)
{
const float *p,*q;
uint32_t ix;
GET_FLOAT_WORD(ix, x);
uint32_t ix;
GET_FLOAT_WORD(ix, x);
/* S(x) rational function for positive x */
static double S(double x)
{
/* S(x) rational function for positive x */
static double S(double x)
{
- double num = 0, den = 0;
+ double_t num = 0, den = 0;
int i;
/* to avoid overflow handle large x differently */
int i;
/* to avoid overflow handle large x differently */