Marc-B-Reynolds · July 8, 2023 08:16
diff --git a/f32_quadratic_hq.c b/f32_quadratic_hq.c

 // whatever return type
 typedef struct { float h,l; } f32_pair_t;

 f32_pair_t f32_quadratic_hq(float A, float B, float C)
 {
  double a = (double)A;
  double b = (double)B;
  double c = (double)C;

  // compute the sqrt of the discriminant. for finite
  // input b^2 & 4ac cannot overflow nor underflow due
  // to the wider dynamic range of binary64. Both terms
  // are exact, can contain no more than 48 binary digits
  // and therefore both have at least 5 trailing zeroes.
  // catastrophic cancellation cannot occur since both
  // are exact. since b^2 cannot over/underflow:
  // d = |b| exactly if either a or c is zero or -4ac
  // cannot contribute.
  // the FMA usage is solely for performance reasons.
  // aside the effective sum/diff will be exactly
  // when b^2 and -4ac are close by exponents due
  // to the 5 trialing zeros..can't be bother to
  // come up with a bound since it does seem useful.
  
  double d = sqrt( fma(b,b,-4.0*a*c) );
  
  // the pair of roots could be computed using the
  // textbook equation since -b +/- sqrt(bb-4ac)
  // cannot hit catastrophic cancellation. b has
  // at most 24 binary digits and root term up to
  // 53 so any rounding errors cannot become
  // significant WRT the final 24 bit result. This
  // would only be useful, however, if we additionally
  // eliminate the pair of divisions in favor of
  // multipling by the reciprocal which would add
  // one more rounding step (almost no impact on
  // result since still in binary64) BUT we'd also
  // need to handle the a=0 case explictly (via
  // two cases) instead of the chosen method below
  // which can (should) lower into a conditional
  // move. judgement call which is better.
  double t  = b + copysign(d,b+0.f);

  f32_pair_t r;

  r.l = (float)( (-2.0*c) / t );
  r.h = (float)( t / (-2.0*a) );

  // a = 0 case: second root is duplicate of first
  // but evaluates to NaN. copy to meet contract
  if (a == 0.0) r.h = r.l;
  
  return r;
 }

	// whatever return type
	typedef struct { float h,l; } f32_pair_t;

	f32_pair_t f32_quadratic_hq(float A, float B, float C)
	{
	double a = (double)A;
	double b = (double)B;
	double c = (double)C;

	// compute the sqrt of the discriminant. for finite
	// input b^2 & 4ac cannot overflow nor underflow due
	// to the wider dynamic range of binary64. Both terms
	// are exact, can contain no more than 48 binary digits
	// and therefore both have at least 5 trailing zeroes.
	// catastrophic cancellation cannot occur since both
	// are exact. since b^2 cannot over/underflow:
	// d = \|b\| exactly if either a or c is zero or -4ac
	// cannot contribute.
	// the FMA usage is solely for performance reasons.
	// aside the effective sum/diff will be exactly
	// when b^2 and -4ac are close by exponents due
	// to the 5 trialing zeros..can't be bother to
	// come up with a bound since it does seem useful.

	double d = sqrt( fma(b,b,-4.0ac) );

	// the pair of roots could be computed using the
	// textbook equation since -b +/- sqrt(bb-4ac)
	// cannot hit catastrophic cancellation. b has
	// at most 24 binary digits and root term up to
	// 53 so any rounding errors cannot become
	// significant WRT the final 24 bit result. This
	// would only be useful, however, if we additionally
	// eliminate the pair of divisions in favor of
	// multipling by the reciprocal which would add
	// one more rounding step (almost no impact on
	// result since still in binary64) BUT we'd also
	// need to handle the a=0 case explictly (via
	// two cases) instead of the chosen method below
	// which can (should) lower into a conditional
	// move. judgement call which is better.
	double t = b + copysign(d,b+0.f);

	f32_pair_t r;

	r.l = (float)( (-2.0*c) / t );
	r.h = (float)( t / (-2.0*a) );

	// a = 0 case: second root is duplicate of first
	// but evaluates to NaN. copy to meet contract
	if (a == 0.0) r.h = r.l;

	return r;
	}