From c2a0dfea629617a39af2f59bd400e1a3595d0783 Mon Sep 17 00:00:00 2001
From: Szabolcs Nagy <nsz@port70.net>
Date: Tue, 3 Sep 2013 14:37:48 +0000
Subject: [PATCH] math: rewrite hypot

method: if there is a large difference between the scale of x and y
then the larger magnitude dominates, otherwise reduce x,y so the
argument of sqrt (x*x+y*y) does not overflow or underflow and calculate
the argument precisely using exact multiplication. If the argument
has less error than 1/sqrt(2) ~ 0.7 ulp, then the result has less error
than 1 ulp in nearest rounding mode.

the original fdlibm method was the same, except it used bit hacks
instead of dekker-veltkamp algorithm, which is problematic for long
double where different representations are supported. (the new hypot
and hypotl code should be smaller and faster on 32bit cpu archs with
fast fpu), the new code behaves differently in non-nearest rounding,
but the error should be still less than 2ulps.

ld80 and ld128 are supported
---
 src/math/hypot.c  | 174 +++++++++++++++-----------------------------
 src/math/hypotf.c | 107 ++++++++--------------------
 src/math/hypotl.c | 178 +++++++++++++---------------------------------
 3 files changed, 135 insertions(+), 324 deletions(-)

diff --git a/src/math/hypot.c b/src/math/hypot.c
index 9a4cbdb3..29ec6a47 100644
--- a/src/math/hypot.c
+++ b/src/math/hypot.c
@@ -1,123 +1,67 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_hypot.c */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-/* hypot(x,y)
- *
- * Method :
- *      If (assume round-to-nearest) z=x*x+y*y
- *      has error less than sqrt(2)/2 ulp, then
- *      sqrt(z) has error less than 1 ulp (exercise).
- *
- *      So, compute sqrt(x*x+y*y) with some care as
- *      follows to get the error below 1 ulp:
- *
- *      Assume x>y>0;
- *      (if possible, set rounding to round-to-nearest)
- *      1. if x > 2y  use
- *              x1*x1+(y*y+(x2*(x+x1))) for x*x+y*y
- *      where x1 = x with lower 32 bits cleared, x2 = x-x1; else
- *      2. if x <= 2y use
- *              t1*y1+((x-y)*(x-y)+(t1*y2+t2*y))
- *      where t1 = 2x with lower 32 bits cleared, t2 = 2x-t1,
- *      y1= y with lower 32 bits chopped, y2 = y-y1.
- *
- *      NOTE: scaling may be necessary if some argument is too
- *            large or too tiny
- *
- * Special cases:
- *      hypot(x,y) is INF if x or y is +INF or -INF; else
- *      hypot(x,y) is NAN if x or y is NAN.
- *
- * Accuracy:
- *      hypot(x,y) returns sqrt(x^2+y^2) with error less
- *      than 1 ulps (units in the last place)
- */
+#include <math.h>
+#include <stdint.h>
+#include <float.h>
 
-#include "libm.h"
+#if FLT_EVAL_METHOD > 1U && LDBL_MANT_DIG == 64
+#define SPLIT (0x1p32 + 1)
+#else
+#define SPLIT (0x1p27 + 1)
+#endif
+
+static void sq(double_t *hi, double_t *lo, double x)
+{
+	double_t xh, xl, xc;
+
+	xc = x*SPLIT;
+	xh = x - xc + xc;
+	xl = x - xh;
+	*hi = x*x;
+	*lo = xh*xh - *hi + 2*xh*xl + xl*xl;
+}
 
 double hypot(double x, double y)
 {
-	double a,b,t1,t2,y1,y2,w;
-	int32_t j,k,ha,hb;
+	union {double f; uint64_t i;} ux = {x}, uy = {y}, ut;
+	int ex, ey;
+	double_t hx, lx, hy, ly, z;
 
-	GET_HIGH_WORD(ha, x);
-	ha &= 0x7fffffff;
-	GET_HIGH_WORD(hb, y);
-	hb &= 0x7fffffff;
-	if (hb > ha) {
-		a = y;
-		b = x;
-		j=ha; ha=hb; hb=j;
-	} else {
-		a = x;
-		b = y;
+	/* arrange |x| >= |y| */
+	ux.i &= -1ULL>>1;
+	uy.i &= -1ULL>>1;
+	if (ux.i < uy.i) {
+		ut = ux;
+		ux = uy;
+		uy = ut;
 	}
-	a = fabs(a);
-	b = fabs(b);
-	if (ha - hb > 0x3c00000)  /* x/y > 2**60 */
-		return a+b;
-	k = 0;
-	if (ha > 0x5f300000) {    /* a > 2**500 */
-		if(ha >= 0x7ff00000) {  /* Inf or NaN */
-			uint32_t low;
-			/* Use original arg order iff result is NaN; quieten sNaNs. */
-			w = fabs(x+0.0) - fabs(y+0.0);
-			GET_LOW_WORD(low, a);
-			if (((ha&0xfffff)|low) == 0) w = a;
-			GET_LOW_WORD(low, b);
-			if (((hb^0x7ff00000)|low) == 0) w = b;
-			return w;
-		}
-		/* scale a and b by 2**-600 */
-		ha -= 0x25800000; hb -= 0x25800000;  k += 600;
-		SET_HIGH_WORD(a, ha);
-		SET_HIGH_WORD(b, hb);
-	}
-	if (hb < 0x20b00000) {    /* b < 2**-500 */
-		if (hb <= 0x000fffff) {  /* subnormal b or 0 */
-			uint32_t low;
-			GET_LOW_WORD(low, b);
-			if ((hb|low) == 0)
-				return a;
-			t1 = 0;
-			SET_HIGH_WORD(t1, 0x7fd00000);  /* t1 = 2^1022 */
-			b *= t1;
-			a *= t1;
-			k -= 1022;
-		} else {            /* scale a and b by 2^600 */
-			ha += 0x25800000;  /* a *= 2^600 */
-			hb += 0x25800000;  /* b *= 2^600 */
-			k -= 600;
-			SET_HIGH_WORD(a, ha);
-			SET_HIGH_WORD(b, hb);
-		}
-	}
-	/* medium size a and b */
-	w = a - b;
-	if (w > b) {
-		t1 = 0;
-		SET_HIGH_WORD(t1, ha);
-		t2 = a-t1;
-		w  = sqrt(t1*t1-(b*(-b)-t2*(a+t1)));
-	} else {
-		a  = a + a;
-		y1 = 0;
-		SET_HIGH_WORD(y1, hb);
-		y2 = b - y1;
-		t1 = 0;
-		SET_HIGH_WORD(t1, ha+0x00100000);
-		t2 = a - t1;
-		w  = sqrt(t1*y1-(w*(-w)-(t1*y2+t2*b)));
+
+	/* special cases */
+	ex = ux.i>>52;
+	ey = uy.i>>52;
+	x = ux.f;
+	y = uy.f;
+	/* note: hypot(inf,nan) == inf */
+	if (ey == 0x7ff)
+		return y;
+	if (ex == 0x7ff || uy.i == 0)
+		return x;
+	/* note: hypot(x,y) ~= x + y*y/x/2 with inexact for small y/x */
+	/* 64 difference is enough for ld80 double_t */
+	if (ex - ey > 64)
+		return x + y;
+
+	/* precise sqrt argument in nearest rounding mode without overflow */
+	/* xh*xh must not overflow and xl*xl must not underflow in sq */
+	z = 1;
+	if (ex > 0x3ff+510) {
+		z = 0x1p700;
+		x *= 0x1p-700;
+		y *= 0x1p-700;
+	} else if (ey < 0x3ff-450) {
+		z = 0x1p-700;
+		x *= 0x1p700;
+		y *= 0x1p700;
 	}
-	if (k)
-		w = scalbn(w, k);
-	return w;
+	sq(&hx, &lx, x);
+	sq(&hy, &ly, y);
+	return z*sqrt(ly+lx+hy+hx);
 }
diff --git a/src/math/hypotf.c b/src/math/hypotf.c
index 4d80178d..2fc214b7 100644
--- a/src/math/hypotf.c
+++ b/src/math/hypotf.c
@@ -1,86 +1,35 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_hypotf.c */
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-#include "libm.h"
+#include <math.h>
+#include <stdint.h>
 
 float hypotf(float x, float y)
 {
-	float a,b,t1,t2,y1,y2,w;
-	int32_t j,k,ha,hb;
+	union {float f; uint32_t i;} ux = {x}, uy = {y}, ut;
+	float_t z;
 
-	GET_FLOAT_WORD(ha,x);
-	ha &= 0x7fffffff;
-	GET_FLOAT_WORD(hb,y);
-	hb &= 0x7fffffff;
-	if (hb > ha) {
-		a = y;
-		b = x;
-		j=ha; ha=hb; hb=j;
-	} else {
-		a = x;
-		b = y;
-	}
-	a = fabsf(a);
-	b = fabsf(b);
-	if (ha - hb > 0xf000000)  /* x/y > 2**30 */
-		return a+b;
-	k = 0;
-	if (ha > 0x58800000) {    /* a > 2**50 */
-		if(ha >= 0x7f800000) {  /* Inf or NaN */
-			/* Use original arg order iff result is NaN; quieten sNaNs. */
-			w = fabsf(x+0.0f) - fabsf(y+0.0f);
-			if (ha == 0x7f800000) w = a;
-			if (hb == 0x7f800000) w = b;
-			return w;
-		}
-		/* scale a and b by 2**-68 */
-		ha -= 0x22000000; hb -= 0x22000000; k += 68;
-		SET_FLOAT_WORD(a, ha);
-		SET_FLOAT_WORD(b, hb);
-	}
-	if (hb < 0x26800000) {    /* b < 2**-50 */
-		if (hb <= 0x007fffff) {  /* subnormal b or 0 */
-			if (hb == 0)
-				return a;
-			SET_FLOAT_WORD(t1, 0x7e800000);  /* t1 = 2^126 */
-			b *= t1;
-			a *= t1;
-			k -= 126;
-		} else {   /* scale a and b by 2^68 */
-			ha += 0x22000000;  /* a *= 2^68 */
-			hb += 0x22000000;  /* b *= 2^68 */
-			k -= 68;
-			SET_FLOAT_WORD(a, ha);
-			SET_FLOAT_WORD(b, hb);
-		}
+	ux.i &= -1U>>1;
+	uy.i &= -1U>>1;
+	if (ux.i < uy.i) {
+		ut = ux;
+		ux = uy;
+		uy = ut;
 	}
-	/* medium size a and b */
-	w = a - b;
-	if (w > b) {
-		SET_FLOAT_WORD(t1, ha&0xfffff000);
-		t2 = a - t1;
-		w  = sqrtf(t1*t1-(b*(-b)-t2*(a+t1)));
-	} else {
-		a  = a + a;
-		SET_FLOAT_WORD(y1, hb&0xfffff000);
-		y2 = b - y1;
-		SET_FLOAT_WORD(t1,(ha+0x00800000)&0xfffff000);
-		t2 = a - t1;
-		w  = sqrtf(t1*y1-(w*(-w)-(t1*y2+t2*b)));
+
+	x = ux.f;
+	y = uy.f;
+	if (uy.i == 0xff<<23)
+		return y;
+	if (ux.i >= 0xff<<23 || uy.i == 0 || ux.i - uy.i >= 25<<23)
+		return x + y;
+
+	z = 1;
+	if (ux.i >= (0x7f+60)<<23) {
+		z = 0x1p90f;
+		x *= 0x1p-90f;
+		y *= 0x1p-90f;
+	} else if (uy.i < (0x7f-60)<<23) {
+		z = 0x1p-90f;
+		x *= 0x1p90f;
+		y *= 0x1p90f;
 	}
-	if (k)
-		w = scalbnf(w, k);
-	return w;
+	return z*sqrtf((double)x*x + (double)y*y);
 }
diff --git a/src/math/hypotl.c b/src/math/hypotl.c
index f4a64f74..479aa92c 100644
--- a/src/math/hypotl.c
+++ b/src/math/hypotl.c
@@ -1,16 +1,3 @@
-/* origin: FreeBSD /usr/src/lib/msun/src/e_hypotl.c */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-/* long double version of hypot().  See comments in hypot.c. */
-
 #include "libm.h"
 
 #if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
@@ -19,130 +6,61 @@ long double hypotl(long double x, long double y)
 	return hypot(x, y);
 }
 #elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
-
-#define GET_LDBL_EXPSIGN(i, v) do {     \
-	union IEEEl2bits uv;            \
-					\
-	uv.e = v;                       \
-	i = uv.xbits.expsign;           \
-} while (0)
-
-#define GET_LDBL_MAN(h, l, v) do {      \
-	union IEEEl2bits uv;            \
-					\
-	uv.e = v;                       \
-	h = uv.bits.manh;               \
-	l = uv.bits.manl;               \
-} while (0)
-
-#define SET_LDBL_EXPSIGN(v, i) do {     \
-	union IEEEl2bits uv;            \
-					\
-	uv.e = v;                       \
-	uv.xbits.expsign = i;           \
-	v = uv.e;                       \
-} while (0)
-
-#undef GET_HIGH_WORD
-#define GET_HIGH_WORD(i, v)     GET_LDBL_EXPSIGN(i, v)
-#undef SET_HIGH_WORD
-#define SET_HIGH_WORD(v, i)     SET_LDBL_EXPSIGN(v, i)
-
-#define DESW(exp)       (exp)           /* delta expsign word */
-#define ESW(exp)        (MAX_EXP - 1 + (exp))   /* expsign word */
-#define MANT_DIG        LDBL_MANT_DIG
-#define MAX_EXP         LDBL_MAX_EXP
-
-#if LDBL_MANL_SIZE > 32
-typedef uint64_t man_t;
-#else
-typedef uint32_t man_t;
+#if LDBL_MANT_DIG == 64
+#define SPLIT (0x1p32L+1)
+#elif LDBL_MANT_DIG == 113
+#define SPLIT (0x1p57L+1)
 #endif
 
+static void sq(long double *hi, long double *lo, long double x)
+{
+	long double xh, xl, xc;
+	xc = x*SPLIT;
+	xh = x - xc + xc;
+	xl = x - xh;
+	*hi = x*x;
+	*lo = xh*xh - *hi + 2*xh*xl + xl*xl;
+}
+
 long double hypotl(long double x, long double y)
 {
-	long double a=x,b=y,t1,t2,y1,y2,w;
-	int32_t j,k,ha,hb;
+	union ldshape ux = {x}, uy = {y};
+	int ex, ey;
+	long double hx, lx, hy, ly, z;
 
-	GET_HIGH_WORD(ha, x);
-	ha &= 0x7fff;
-	GET_HIGH_WORD(hb, y);
-	hb &= 0x7fff;
-	if (hb > ha) {
-		a = y;
-		b = x;
-		j=ha; ha=hb; hb=j;
-	} else {
-		a = x;
-		b = y;
-	}
-	a = fabsl(a);
-	b = fabsl(b);
-	if (ha - hb > DESW(MANT_DIG+7))  /* x/y > 2**(MANT_DIG+7) */
-		return a+b;
-	k = 0;
-	if (ha > ESW(MAX_EXP/2-12)) {    /* a>2**(MAX_EXP/2-12) */
-		if (ha >= ESW(MAX_EXP)) {  /* Inf or NaN */
-			man_t manh, manl;
-			/* Use original arg order iff result is NaN; quieten sNaNs. */
-			w = fabsl(x+0.0)-fabsl(y+0.0);
-			GET_LDBL_MAN(manh,manl,a);
-			if (manh == LDBL_NBIT && manl == 0) w = a;
-			GET_LDBL_MAN(manh,manl,b);
-			if (hb >= ESW(MAX_EXP) && manh == LDBL_NBIT && manl == 0) w = b;
-			return w;
-		}
-		/* scale a and b by 2**-(MAX_EXP/2+88) */
-		ha -= DESW(MAX_EXP/2+88); hb -= DESW(MAX_EXP/2+88);
-		k += MAX_EXP/2+88;
-		SET_HIGH_WORD(a, ha);
-		SET_HIGH_WORD(b, hb);
-	}
-	if (hb < ESW(-(MAX_EXP/2-12))) {  /* b < 2**-(MAX_EXP/2-12) */
-		if (hb <= 0) {  /* subnormal b or 0 */
-			man_t manh, manl;
-			GET_LDBL_MAN(manh,manl,b);
-			if ((manh|manl) == 0)
-				return a;
-			t1 = 0;
-			SET_HIGH_WORD(t1, ESW(MAX_EXP-2));  /* t1 = 2^(MAX_EXP-2) */
-			b *= t1;
-			a *= t1;
-			k -= MAX_EXP-2;
-		} else {            /* scale a and b by 2^(MAX_EXP/2+88) */
-			ha += DESW(MAX_EXP/2+88);
-			hb += DESW(MAX_EXP/2+88);
-			k -= MAX_EXP/2+88;
-			SET_HIGH_WORD(a, ha);
-			SET_HIGH_WORD(b, hb);
-		}
-	}
-	/* medium size a and b */
-	w = a - b;
-	if (w > b) {
-		t1 = a;
-		union IEEEl2bits uv;
-		uv.e = t1; uv.bits.manl = 0; t1 = uv.e;
-		t2 = a-t1;
-		w  = sqrtl(t1*t1-(b*(-b)-t2*(a+t1)));
+	ux.i.se &= 0x7fff;
+	uy.i.se &= 0x7fff;
+	if (ux.i.se < uy.i.se) {
+		ex = uy.i.se;
+		ey = ux.i.se;
+		x = uy.f;
+		y = ux.f;
 	} else {
-		a  = a+a;
-		y1 = b;
-		union IEEEl2bits uv;
-		uv.e = y1; uv.bits.manl = 0; y1 = uv.e;
-		y2 = b - y1;
-		t1 = a;
-		uv.e = t1; uv.bits.manl = 0; t1 = uv.e;
-		t2 = a - t1;
-		w  = sqrtl(t1*y1-(w*(-w)-(t1*y2+t2*b)));
+		ex = ux.i.se;
+		ey = uy.i.se;
+		x = ux.f;
+		y = uy.f;
 	}
-	if(k!=0) {
-		uint32_t high;
-		t1 = 1.0;
-		GET_HIGH_WORD(high, t1);
-		SET_HIGH_WORD(t1, high+DESW(k));
-		return t1*w;
+
+	if (ex == 0x7fff && isinf(y))
+		return y;
+	if (ex == 0x7fff || y == 0)
+		return x;
+	if (ex - ey > LDBL_MANT_DIG)
+		return x + y;
+
+	z = 1;
+	if (ex > 0x3fff+8000) {
+		z = 0x1p10000L;
+		x *= 0x1p-10000L;
+		y *= 0x1p-10000L;
+	} else if (ey < 0x3fff-8000) {
+		z = 0x1p-10000L;
+		x *= 0x1p10000L;
+		y *= 0x1p10000L;
 	}
-	return w;
+	sq(&hx, &lx, x);
+	sq(&hy, &ly, y);
+	return z*sqrtl(ly+lx+hy+hx);
 }
 #endif
-- 
2.25.1