Created
November 28, 2023 23:55
-
-
Save ptomato/d1ac949bbd3e5719766e770b7c0415bb to your computer and use it in GitHub Desktop.
fma() and extra-precise remquo() implemented using JS Numbers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// The following are line-by-line ports of the fma() and remquo() | |
// implementations from Openlibm, the math implementation of the Julia language. | |
// fma() | |
// Copyright (c) 2005-2011 David Schultz <[email protected]> | |
// 2-clause BSD license. | |
// remquo() | |
// Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. | |
// Developed at SunSoft, a Sun Microsystems, Inc. business. Permission to use, | |
// copy, modify, and distribute this software is freely granted, provided that | |
// this notice is preserved. | |
import copysign from 'math-float64-copysign'; | |
import frexp from 'math-float64-frexp'; | |
import u32f64 from 'math-float64-from-words'; | |
import ldexp from 'math-float64-ldexp'; | |
import f64u32 from 'math-float64-to-words'; | |
const MathAbs = Math.abs; | |
const MathLog2 = Math.log2; | |
const MathPow = Math.pow; | |
const NumberIsFinite = Number.isFinite; | |
const NumberIsNaN = Number.isNaN; | |
// Various floating-point info constants | |
const MANT_DIG = 53; | |
const DBL_MIN = MathPow(2, -1022); | |
// The dd functions represent a floating-point number with twice the precision | |
// of a JS Number. We maintain the invariant that "hi" stores the 53 high-order | |
// bits of the result. | |
/* | |
* Compute a + b exactly, returning the exact result in a dd record. We assume | |
* that both a and b are finite, but make no assumptions about their relative | |
* magnitudes. | |
*/ | |
function ddAdd(a, b) { | |
const hi = a + b; | |
const s = hi - a; | |
const lo = a - (hi - s) + (b - s); | |
return { hi, lo }; | |
} | |
/* | |
* Compute a * b exactly, returning the exact result in a dd record. We assume | |
* that both a and b are normalized, so no underflow or overflow will occur. | |
* In C, the system's floating-point rounding mode must be FE_TONEAREST, but | |
* that rounding mode is already mandated by the ECMAScript spec. | |
*/ | |
const SPLIT = MathPow(2, 27) + 1; | |
function ddMul(a, b) { | |
let p = a * SPLIT; | |
let ha = a - p; | |
ha += p; | |
const la = a - ha; | |
p = b * SPLIT; | |
let hb = b - p; | |
hb += p; | |
const lb = b - hb; | |
p = ha * hb; | |
const q = ha * lb + la * hb; | |
const hi = p + q; | |
const lo = p - hi + q + la * lb; | |
return { hi, lo }; | |
} | |
/* | |
* Compute a+b, with a small tweak: The least significant bit of the result is | |
* adjusted into a sticky bit summarizing all the bits that were lost to | |
* rounding. This adjustment negates the effects of double rounding when the | |
* result is added to another number with a higher exponent. For an explanation | |
* of round and sticky bits, see any reference on FPU design, e.g., | |
* | |
* J. Coonen. An Implementation Guide to a Proposed Standard for | |
* Floating-Point Arithmetic. Computer, vol. 13, no. 1, Jan 1980. | |
*/ | |
function addAdjusted(a, b) { | |
const sum = ddAdd(a, b); | |
if (sum.lo !== 0) { | |
let [hh, lh] = f64u32(sum.hi); | |
if (lh & 1 === 0) { | |
lh += copysign(1, sum.hi * sum.lo); | |
hh += lh >>> 32; | |
lh &= 0xffff_ffff; | |
sum.hi = u32f64(hh, lh); | |
} | |
} | |
return sum.hi; | |
} | |
/* | |
* Compute ldexp(a + b, scale) with a single rounding error. It is assumed that | |
* the result will be subnormal, and care is taken to ensure that double | |
* rounding does not occur. | |
*/ | |
function addAndDenormalize(a, b, scale) { | |
const sum = ddAdd(a, b); | |
// If we are losing at least two bits of accuracy to denormalization, then the | |
// first lost bit becomes a round bit, and we adjust the lowest bit of sum.hi | |
// to make it a sticky bit summarizing all the bits in sum.lo. With the sticky | |
// bit adjusted, the hardware will break any ties in the correct direction. | |
// | |
// If we are losing only one bit to denormalization, however, we must break | |
// the ties manually. | |
if (sum.lo !== 0) { | |
let [hh, lh] = f64u32(sum.hi); | |
const bitsLost = -((hh >>> 20) & 0x7ff) - scale + 1; | |
if ((bitsLost !== 1) ^ (lh & 1)) { | |
lh += copysign(1, sum.hi * sum.lo); | |
hh += lh >>> 32; | |
lh &= 0xffff_ffff; | |
sum.hi = u32f64(hh, lh); | |
} | |
} | |
return ldexp(sum.hi, scale); | |
} | |
/* | |
* Fused multiply-add: Compute x * y + z with a single rounding error. | |
* | |
* We use scaling to avoid overflow/underflow, along with the canonical | |
* precision-doubling technique adapted from: | |
* | |
* Dekker, T. A Floating-Point Technique for Extending the Available | |
* Precision. Numer. Math. 18, 224-242 (1971). | |
* | |
* https://github.com/JuliaMath/openlibm/blob/12f5ffcc990e16f4120d4bf607185243f5affcb8/src/s_fma.c | |
*/ | |
export function fma(x, y, z) { | |
// Handle special cases. The order of operations and the particular return | |
// values here are crucial in handling special cases involving infinities, | |
// NaNs, overflows, and signed zeroes correctly. | |
if (x === 0 || y === 0) return x * y + z; | |
if (z === 0) return x * y; | |
if (!NumberIsFinite(x) || !NumberIsFinite(y)) return x * y + z; | |
if (!NumberIsFinite(z)) return z; | |
const [xs, ex] = frexp(x); | |
const [ys, ey] = frexp(y); | |
let [zs, ez] = frexp(z); | |
let spread = ex + ey - ez; | |
// If x * y and z are many orders of magnitude apart, the scaling will | |
// overflow, so we handle these cases specially. | |
if (spread < -MANT_DIG) return z; | |
if (spread <= MANT_DIG * 2) { | |
zs = ldexp(zs, -spread); | |
} else { | |
zs = copysign(DBL_MIN, zs); | |
} | |
// Basic approach: | |
// (xy.hi, xy.lo) = x * y (exact) | |
// (r.hi, r.lo) = xy.hi + z (exact) | |
// adj = xy.lo + r.lo (inexact; low bit is sticky) | |
// result = r.hi + adj (correctly rounded) | |
const xy = ddMul(xs, ys); | |
const r = ddAdd(xy.hi, zs); | |
spread = ex + ey; | |
// When the addends cancel to 0, ensure that the result has the correct sign. | |
if (r.hi === 0) return xy.hi + zs + ldexp(xy.lo, spread); | |
const adj = addAdjusted(r.lo, xy.lo); | |
if (spread + ilogb(r.hi) > -1023) return ldexp(r.hi + adj, spread); | |
return addAndDenormalize(r.hi, adj, spread); | |
} | |
// ilogb(x) is the same as Math.log2(Math.abs(x)), but with integer sentinel | |
// return values for incalculable inputs, instead of NaN | |
const MAXINT32 = 2 ** 31 - 1; | |
const FP_ILOGB0 = -MAXINT32; | |
const FP_ILOGBNAN = MAXINT32; | |
export function ilogb(x) { | |
if (x === 0) return FP_ILOGB0; | |
if (NumberIsNaN(x)) return FP_ILOGBNAN; | |
if (!NumberIsFinite(x)) return MAXINT32; | |
return MathLog2(MathAbs(x)) | 0; | |
} | |
/* | |
* Return [rem, quo] with rem the IEEE remainder and quo the last n bits of the | |
* quotient, rounded to the nearest integer. We choose n = 31 because we wind up | |
* computing all the integer bits of the quotient anyway as a side-effect of | |
* computing the remainder by the shift and subtract method. In practice, this | |
* is far more bits than are needed to use remquo in reduction algorithms. | |
* | |
* https://github.com/JuliaMath/openlibm/blob/12f5ffcc990e16f4120d4bf607185243f5affcb8/src/s_remquo.c | |
*/ | |
export function remquo(x, y) { | |
let [hx, lx] = f64u32(x); | |
let [hy, ly] = f64u32(y); | |
const sxy = (hx ^ hy) & 0x8000_0000; | |
const sx = hx & 0x8000_0000; // sign of x | |
hx ^= sx; // |x| | |
hy &= 0x7fff_ffff; // |y| | |
// Purge off exception values | |
if (y === 0 || !NumberIsFinite(x) || NumberIsNaN(y)) return [(x * y) / (x * y), NaN]; | |
if (hx <= hy) { | |
// |x|<|y| return x or x - y | |
if (hx < hy || lx < ly) return remquoFixup(0, hx, lx, y, sx, sxy); | |
// |x|=|y| return x * 0 | |
if (lx === ly) return [x * 0, 1]; | |
} | |
const ix = ilogb(x); | |
let iy = ilogb(y); | |
// set up {hx,lx}, {hy,ly} and align y to x | |
if (ix >= -1022) { | |
hx = 0x0010_0000 | (0x000f_ffff & hx); | |
} else { | |
// subnormal x, shift x to normal | |
const n = -1022 - ix; | |
if (n <= 31) { | |
hx = (hx << n) | (lx >> (32 - n)); | |
lx <<= n; | |
} else { | |
hx = lx << (n - 32); | |
lx = 0; | |
} | |
} | |
if (iy >= -1022) { | |
hy = 0x0010_0000 | (0x000f_ffff & hy); | |
} else { | |
// subnormal y, shift y to normal | |
const n = -1022 - iy; | |
if (n <= 31) { | |
hy = (hy << n) | (ly >> (32 - n)); | |
ly <<= n; | |
} else { | |
hy = ly << (n - 32); | |
ly = 0; | |
} | |
} | |
// fix point fmod | |
let n = ix - iy; | |
let q = 0; | |
while (n--) { | |
let hz = hx - hy; | |
const lz = lx - ly; | |
if (lx < ly) hz -= 1; | |
if (hz < 0) { | |
hx = hx + hx + (lx >>> 31); | |
lx = lx + lx; | |
} else { | |
hx = hz + hz + (lz >>> 31); | |
lx = lz + lz; | |
q++; | |
} | |
q <<= 1; | |
} | |
let hz = hx - hy; | |
const lz = lx - ly; | |
if (lx < ly) hz -= 1; | |
if (hz >= 0) { | |
hx = hz; | |
lx = lz; | |
q++; | |
} | |
// convert back to floating value and restore the sign | |
if ((hx | lx) === 0) { | |
// return sign(x) * 0 | |
const quo = sxy ? -q : q; | |
return [sx * 0, quo]; | |
} | |
while (hx < 0x00100000) { | |
// normalize x | |
hx = hx + hx + (lx >>> 31); | |
lx = lx + lx; | |
iy -= 1; | |
} | |
if (iy >= -1022) { | |
// normalize output | |
hx = (hx - 0x00100000) | ((iy + 1023) << 20); | |
} else { | |
// subnormal output | |
n = -1022 - iy; | |
if (n <= 20) { | |
lx = (lx >> n) | (hx << (32 - n)); | |
hx >>= n; | |
} else if (n <= 31) { | |
lx = (hx << (32 - n)) | (lx >>> n); | |
hx = sx; | |
} else { | |
lx = hx >> (n - 32); | |
hx = sx; | |
} | |
} | |
return remquoFixup(q, hx, lx, y, sx, sxy); | |
} | |
const TWOM1021 = MathPow(2, -1021); | |
function remquoFixup(q, hx, lx, y, sx, sxy) { | |
let x = u32f64(hx, lx); | |
y = MathAbs(y); | |
if (y < TWOM1021) { | |
if (x + x > y || (x + x === y && q & 1)) { | |
q++; | |
x -= y; | |
} | |
} else if (x > 0.5 * y || (x === 0.5 * y && q & 1)) { | |
q++; | |
x -= y; | |
} | |
[hx, lx] = f64u32(x); | |
x = u32f64(hx ^ sx, lx); | |
q &= 0x7fff_ffff; | |
const quo = sxy ? -q : q; | |
return [x, quo]; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment