raphlinus · July 7, 2025 18:19
diff --git a/neon_flatten.rs b/neon_flatten.rs
 unsafe fn approx_parabola_integral(x: float32x4_t) -> float32x4_t {
    const D: f32 = 0.67;
    let x2 = vmulq_f32(x, x);
    let t1 = vfmaq_f32(vdupq_n_f32(D.powi(4)), vdupq_n_f32(0.25), x2);
    let t1_sqrt = vsqrtq_f32(t1);
    let t1_fourthroot = vsqrtq_f32(t1_sqrt);
    let denom = vaddq_f32(vdupq_n_f32(1.0 - D), t1_fourthroot);
    vdivq_f32(x, denom)
 }

 unsafe fn approx_parabola_integral_x2(x: float32x2_t) -> float32x2_t {
    const D: f32 = 0.67;
    let x2 = vmul_f32(x, x);
    let t1 = vfma_f32(vdup_n_f32(D.powi(4)), vdup_n_f32(0.25), x2);
    let t1_sqrt = vsqrt_f32(t1);
    let t1_fourthroot = vsqrt_f32(t1_sqrt);
    let denom = vadd_f32(vdup_n_f32(1.0 - D), t1_fourthroot);
    vdiv_f32(x, denom)
 }

 unsafe fn approx_parabola_inv_integral(x: float32x4_t) -> float32x4_t {
    const B: f32 = 0.39;
    let x2 = vmulq_f32(x, x);
    let t1 = vfmaq_f32(vdupq_n_f32(B * B), vdupq_n_f32(0.25), x2);
    let t1_sqrt = vsqrtq_f32(t1);
    let factor = vaddq_f32(vdupq_n_f32(1.0 - B), t1_sqrt);
    vmulq_f32(x, factor)
 }

 // Note: for consuming these, a structure of arrays is probably better
 #[repr(C)]
 #[derive(Clone, Copy, Default, Debug)]
 pub struct FlattenParams {
    a0: [f32; 2],
    a2: [f32; 2],
    u0: [f32; 2],
    uscale: [f32; 2],
    val: [f32; 2],
 }

 unsafe fn is_finite(x: float32x2_t) -> uint32x2_t {
    vclt_u32(vreinterpret_u32_f32(vabs_f32(x)), vdup_n_u32(0x7f80_0000))
 }

 const MAX_QUADS: usize = 16;

 // This code is adapted from kurbo CubicBez::to_quads.
 //
 // We can probably give a better type to `quads`.
 pub fn prepare_quads(c: CubicBez, quads: &mut [f32; MAX_QUADS * 6], accuracy: f64) -> usize {
    let max_hypot2 = 432.0 * accuracy * accuracy;
    let d01 = c.p1 - c.p0;
    let d12 = c.p2 - c.p1;
    let d23 = c.p3 - c.p2;
    let err = (2.0 * d12 - d01 - d23).hypot2();
    let mut n_float = (err / max_hypot2).powf(1. / 6.).floor() + 1.0;
    n_float = n_float.min(MAX_QUADS as f64);
    let n = n_float as usize;
    let dt = 1. / n_float;
    let dt_0_75 = 0.75 * dt;
    let d01s = dt_0_75 * d01;
    let d12s = dt_0_75 * d12;
    let d23s = dt_0_75 * d23;
    let mut p0 = c.p0;
    let mut q0 = d01s;
    for i in 0..n {
        let t1 = (i + 1) as f64 * dt;
        let p2 = if i == n - 1 { c.p3 } else { c.eval(t1) };
        let mt = 1.0 - t1;
        let q1 = (mt * mt) * d01s + (2.0 * t1 * mt) * d12s + (t1 * t1) * d23s;
        let p1 = p0.midpoint(p2) + q0 - q1;
        // these can be unchecked; let's look at the asm to see how much it hurts
        let ix = i * 6;
        quads[ix] = p0.x as f32;
        quads[ix + 1] = p0.y as f32;
        quads[ix + 2] = p1.x as f32;
        quads[ix + 3] = p1.y as f32;
        quads[ix + 4] = p2.x as f32;
        quads[ix + 5] = p2.y as f32;
        p0 = p2;
        q0 = q1;
    }
    n
 }

 /// Compute subdivision math for two quads.
 pub unsafe fn estimate_subdiv(
    quads: *const f32,
    sqrt_tol: f32,
    out: &mut FlattenParams,
 ) -> float32x2_t {
    let two_quads = vld3q_u64(quads as *const u64);
    // points are laid out q0.p0.x q0.p0.y q1.p0.x q1.p0.y
    let p0 = vreinterpretq_f32_u64(two_quads.0);
    let p1 = vreinterpretq_f32_u64(two_quads.1);
    let p2 = vreinterpretq_f32_u64(two_quads.2);
    let d01 = vsubq_f32(p1, p0);
    let d12 = vsubq_f32(p2, p1);
    let d01x = vuzp1q_f32(d01, d01);
    let d01y = vuzp2q_f32(d01, d01);
    let d12x = vuzp1q_f32(d12, d12);
    let d12y = vuzp2q_f32(d12, d12);
    let ddx = vsubq_f32(d01x, d12x);
    let ddy = vsubq_f32(d01y, d12y);
    let d02x = vaddq_f32(d01x, d12x);
    let d02y = vaddq_f32(d01y, d12y);
    let cross = vfmsq_f32(vmulq_f32(d02x, ddy), d02y, ddx);
    let x0_x2_a = vmulq_f32(vcombine_f32(vget_low_f32(d01x), vget_low_f32(d12x)), ddx);
    let x0_x2_num = vfmaq_f32(
        x0_x2_a,
        vcombine_f32(vget_low_f32(d01y), vget_low_f32(d12y)),
        ddy,
    );
    let x0_x2 = vdivq_f32(x0_x2_num, cross);
    let ddx_low = vget_low_f32(ddx);
    let ddy_low = vget_low_f32(ddy);
    let dd_hypot = vsqrt_f32(vmla_f32(vmul_f32(ddx_low, ddx_low), ddy_low, ddy_low));
    let x0 = vget_low_f32(x0_x2);
    let x2 = vget_high_f32(x0_x2);
    let scale_denom = vmul_f32(dd_hypot, vsub_f32(x2, x0));
    let scale = vabs_f32(vdiv_f32(vget_low_f32(cross), scale_denom));
    let a0_a2 = approx_parabola_integral(x0_x2);
    let da = vabs_f32(vsub_f32(vget_high_f32(a0_a2), vget_low_f32(a0_a2)));
    let sqrt_scale = vsqrt_f32(scale);
    let mask = vcgez_s32(veor_s32(vreinterpret_s32_f32(x0), vreinterpret_s32_f32(x2)));
    let noncusp = vmul_f32(da, sqrt_scale);
    // TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost
    let xmin = vdiv_f32(vdup_n_f32(sqrt_tol), sqrt_scale);
    let approxint = approx_parabola_integral_x2(xmin);
    let cusp = vdiv_f32(vmul_f32(vdup_n_f32(sqrt_tol), da), approxint);
    let val_raw = vbsl_f32(mask, noncusp, cusp);
    let val = vreinterpret_f32_u32(vand_u32(is_finite(val_raw), vreinterpret_u32_f32(val_raw)));
    let u0_u2 = approx_parabola_inv_integral(a0_a2);
    let u0 = vget_low_f32(u0_u2);
    let uscale_a = vsub_f32(vget_high_f32(u0_u2), u0);
    let uscale = vdiv_f32(vdup_n_f32(1.0), uscale_a);
    vst1q_f32(out.a0.as_mut_ptr(), a0_a2);
    vst1_f32(out.u0.as_mut_ptr(), u0);
    vst1_f32(out.uscale.as_mut_ptr(), uscale);
    vst1_f32(out.val.as_mut_ptr(), val);
    val
 }

 const IOTA_F32: [f32; 4] = [0.0, 1.0, 2.0, 3.0];

 // maybe return number of lines written?
 unsafe fn output_lines(
    quad: *const f32,
    a0: f32,
    da: f32,
    u0: f32,
    uscale: f32,
    x0: f32,
    dx: f32,
    lines: *mut f32,
 ) {
    let xs = vfmaq_f32(
        vdupq_n_f32(x0),
        vld1q_f32(IOTA_F32.as_ptr()),
        vdupq_n_f32(dx),
    );
    let a = vfmaq_f32(vdupq_n_f32(a0), xs, vdupq_n_f32(da));
    let u = approx_parabola_integral(a);
    // todo: precalculate -u0 * uscale
    let t = vfmaq_f32(vdupq_n_f32(-u0 * uscale), u, vdupq_n_f32(uscale));
    let mt = vsubq_f32(vdupq_n_f32(1.0), t);
    let mt_mt = vmulq_f32(mt, mt);
    let mt_t_2 = vmulq_f32(vdupq_n_f32(2.0), vmulq_f32(mt, t));
    let t_t = vmulq_f32(t, t);
    let q = vld3_f32(quad);
    let x12 = vfmaq_lane_f32::<0>(vmulq_lane_f32::<0>(mt_mt, q.0), mt_t_2, q.1);
    let x = vfmaq_lane_f32::<0>(x12, t_t, q.2);
    let y12 = vfmaq_lane_f32::<1>(vmulq_lane_f32::<1>(mt_mt, q.0), mt_t_2, q.1);
    let y = vfmaq_lane_f32::<1>(y12, t_t, q.2);
    // write x0 y0 x0 y0 x1 y1 x1 y1 x2 y2 x2 y3 x3 y3 x3 y3
    let xx1 = vzip1q_f32(x, x); // x0 x0 x1 x1
    let yy1 = vzip1q_f32(y, y); // y0 y0 y1 y1
    vst2q_f32(lines, float32x4x2_t(xx1, yy1));
    let xx2 = vzip2q_f32(x, x); // x2 x2 x3 x3
    let yy2 = vzip2q_f32(y, y); // y2 y2 y3 y3
    vst2q_f32(lines.add(8), float32x4x2_t(xx2, yy2));
 }

 // Quads and params are scratch buffers; stack allocation is another possibility (but would involve MaybeUninit)
 pub fn flatten_cubic(
    c: CubicBez,
    quads: &mut [f32; MAX_QUADS * 6],
    params: &mut [FlattenParams; MAX_QUADS / 2],
    tolerance: f32,
    lines: &mut Vec<FlatLine>,
 ) {
    let sqrt_tol = tolerance.sqrt();
    const TO_QUAD_TOL: f32 = 0.1;
    unsafe {
        let n = prepare_quads(c, quads, (tolerance * TO_QUAD_TOL) as f64);
        if n % 2 != 0 {
            quads[n * 6..][..6].fill(0.0);
        }
        let sqrt_remain_tol = sqrt_tol * (1.0 - TO_QUAD_TOL).sqrt();
        let mut val_sum = vdup_n_f32(0.0);
        for i in 0..(n + 1) / 2 {
            let val = estimate_subdiv(quads.as_ptr().add(i * 12), sqrt_remain_tol, &mut params[i]);
            val_sum = vadd_f32(val_sum, val);
        }
        let sum = vget_lane_f32::<0>(vpadd_f32(val_sum, val_sum));
        let n_f32 = (0.5 * sum / sqrt_remain_tol).floor() + 1.0;
        let n = n_f32 as usize;
        lines.reserve(n + 4);
        println!("n = {n_f32}");
    }
 }

 #[test]
 fn test_quads() {
    let c = CubicBez::new((0., 0.), (55., 0.), (100., 45.), (100., 100.));
    let mut quads_buf = [0.0f32; MAX_QUADS * 6];
    unsafe {
        let n = prepare_quads(c, &mut quads_buf, 0.1);
        println!("{n}");
        println!("M{} {}", quads_buf[0], quads_buf[1]);
        for i in 0..n {
            let q = &quads_buf[i * 6..][..6];
            println!("Q{} {} {} {}", q[2], q[3], q[4], q[5]);
        }
    }
 }
	unsafe fn approx_parabola_integral(x: float32x4_t) -> float32x4_t {
	const D: f32 = 0.67;
	let x2 = vmulq_f32(x, x);
	let t1 = vfmaq_f32(vdupq_n_f32(D.powi(4)), vdupq_n_f32(0.25), x2);
	let t1_sqrt = vsqrtq_f32(t1);
	let t1_fourthroot = vsqrtq_f32(t1_sqrt);
	let denom = vaddq_f32(vdupq_n_f32(1.0 - D), t1_fourthroot);
	vdivq_f32(x, denom)
	}

	unsafe fn approx_parabola_integral_x2(x: float32x2_t) -> float32x2_t {
	const D: f32 = 0.67;
	let x2 = vmul_f32(x, x);
	let t1 = vfma_f32(vdup_n_f32(D.powi(4)), vdup_n_f32(0.25), x2);
	let t1_sqrt = vsqrt_f32(t1);
	let t1_fourthroot = vsqrt_f32(t1_sqrt);
	let denom = vadd_f32(vdup_n_f32(1.0 - D), t1_fourthroot);
	vdiv_f32(x, denom)
	}

	unsafe fn approx_parabola_inv_integral(x: float32x4_t) -> float32x4_t {
	const B: f32 = 0.39;
	let x2 = vmulq_f32(x, x);
	let t1 = vfmaq_f32(vdupq_n_f32(B * B), vdupq_n_f32(0.25), x2);
	let t1_sqrt = vsqrtq_f32(t1);
	let factor = vaddq_f32(vdupq_n_f32(1.0 - B), t1_sqrt);
	vmulq_f32(x, factor)
	}

	// Note: for consuming these, a structure of arrays is probably better
	#[repr(C)]
	#[derive(Clone, Copy, Default, Debug)]
	pub struct FlattenParams {
	a0: [f32; 2],
	a2: [f32; 2],
	u0: [f32; 2],
	uscale: [f32; 2],
	val: [f32; 2],
	}

	unsafe fn is_finite(x: float32x2_t) -> uint32x2_t {
	vclt_u32(vreinterpret_u32_f32(vabs_f32(x)), vdup_n_u32(0x7f80_0000))
	}

	const MAX_QUADS: usize = 16;

	// This code is adapted from kurbo CubicBez::to_quads.
	//
	// We can probably give a better type to `quads`.
	pub fn prepare_quads(c: CubicBez, quads: &mut [f32; MAX_QUADS * 6], accuracy: f64) -> usize {
	let max_hypot2 = 432.0 * accuracy * accuracy;
	let d01 = c.p1 - c.p0;
	let d12 = c.p2 - c.p1;
	let d23 = c.p3 - c.p2;
	let err = (2.0 * d12 - d01 - d23).hypot2();
	let mut n_float = (err / max_hypot2).powf(1. / 6.).floor() + 1.0;
	n_float = n_float.min(MAX_QUADS as f64);
	let n = n_float as usize;
	let dt = 1. / n_float;
	let dt_0_75 = 0.75 * dt;
	let d01s = dt_0_75 * d01;
	let d12s = dt_0_75 * d12;
	let d23s = dt_0_75 * d23;
	let mut p0 = c.p0;
	let mut q0 = d01s;
	for i in 0..n {
	let t1 = (i + 1) as f64 * dt;
	let p2 = if i == n - 1 { c.p3 } else { c.eval(t1) };
	let mt = 1.0 - t1;
	let q1 = (mt * mt) * d01s + (2.0 * t1 * mt) * d12s + (t1 * t1) * d23s;
	let p1 = p0.midpoint(p2) + q0 - q1;
	// these can be unchecked; let's look at the asm to see how much it hurts
	let ix = i * 6;
	quads[ix] = p0.x as f32;
	quads[ix + 1] = p0.y as f32;
	quads[ix + 2] = p1.x as f32;
	quads[ix + 3] = p1.y as f32;
	quads[ix + 4] = p2.x as f32;
	quads[ix + 5] = p2.y as f32;
	p0 = p2;
	q0 = q1;
	}
	n
	}

	/// Compute subdivision math for two quads.
	pub unsafe fn estimate_subdiv(
	quads: *const f32,
	sqrt_tol: f32,
	out: &mut FlattenParams,
	) -> float32x2_t {
	let two_quads = vld3q_u64(quads as *const u64);
	// points are laid out q0.p0.x q0.p0.y q1.p0.x q1.p0.y
	let p0 = vreinterpretq_f32_u64(two_quads.0);
	let p1 = vreinterpretq_f32_u64(two_quads.1);
	let p2 = vreinterpretq_f32_u64(two_quads.2);
	let d01 = vsubq_f32(p1, p0);
	let d12 = vsubq_f32(p2, p1);
	let d01x = vuzp1q_f32(d01, d01);
	let d01y = vuzp2q_f32(d01, d01);
	let d12x = vuzp1q_f32(d12, d12);
	let d12y = vuzp2q_f32(d12, d12);
	let ddx = vsubq_f32(d01x, d12x);
	let ddy = vsubq_f32(d01y, d12y);
	let d02x = vaddq_f32(d01x, d12x);
	let d02y = vaddq_f32(d01y, d12y);
	let cross = vfmsq_f32(vmulq_f32(d02x, ddy), d02y, ddx);
	let x0_x2_a = vmulq_f32(vcombine_f32(vget_low_f32(d01x), vget_low_f32(d12x)), ddx);
	let x0_x2_num = vfmaq_f32(
	x0_x2_a,
	vcombine_f32(vget_low_f32(d01y), vget_low_f32(d12y)),
	ddy,
	);
	let x0_x2 = vdivq_f32(x0_x2_num, cross);
	let ddx_low = vget_low_f32(ddx);
	let ddy_low = vget_low_f32(ddy);
	let dd_hypot = vsqrt_f32(vmla_f32(vmul_f32(ddx_low, ddx_low), ddy_low, ddy_low));
	let x0 = vget_low_f32(x0_x2);
	let x2 = vget_high_f32(x0_x2);
	let scale_denom = vmul_f32(dd_hypot, vsub_f32(x2, x0));
	let scale = vabs_f32(vdiv_f32(vget_low_f32(cross), scale_denom));
	let a0_a2 = approx_parabola_integral(x0_x2);
	let da = vabs_f32(vsub_f32(vget_high_f32(a0_a2), vget_low_f32(a0_a2)));
	let sqrt_scale = vsqrt_f32(scale);
	let mask = vcgez_s32(veor_s32(vreinterpret_s32_f32(x0), vreinterpret_s32_f32(x2)));
	let noncusp = vmul_f32(da, sqrt_scale);
	// TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost
	let xmin = vdiv_f32(vdup_n_f32(sqrt_tol), sqrt_scale);
	let approxint = approx_parabola_integral_x2(xmin);
	let cusp = vdiv_f32(vmul_f32(vdup_n_f32(sqrt_tol), da), approxint);
	let val_raw = vbsl_f32(mask, noncusp, cusp);
	let val = vreinterpret_f32_u32(vand_u32(is_finite(val_raw), vreinterpret_u32_f32(val_raw)));
	let u0_u2 = approx_parabola_inv_integral(a0_a2);
	let u0 = vget_low_f32(u0_u2);
	let uscale_a = vsub_f32(vget_high_f32(u0_u2), u0);
	let uscale = vdiv_f32(vdup_n_f32(1.0), uscale_a);
	vst1q_f32(out.a0.as_mut_ptr(), a0_a2);
	vst1_f32(out.u0.as_mut_ptr(), u0);
	vst1_f32(out.uscale.as_mut_ptr(), uscale);
	vst1_f32(out.val.as_mut_ptr(), val);
	val
	}

	const IOTA_F32: [f32; 4] = [0.0, 1.0, 2.0, 3.0];

	// maybe return number of lines written?
	unsafe fn output_lines(
	quad: *const f32,
	a0: f32,
	da: f32,
	u0: f32,
	uscale: f32,
	x0: f32,
	dx: f32,
	lines: *mut f32,
	) {
	let xs = vfmaq_f32(
	vdupq_n_f32(x0),
	vld1q_f32(IOTA_F32.as_ptr()),
	vdupq_n_f32(dx),
	);
	let a = vfmaq_f32(vdupq_n_f32(a0), xs, vdupq_n_f32(da));
	let u = approx_parabola_integral(a);
	// todo: precalculate -u0 * uscale
	let t = vfmaq_f32(vdupq_n_f32(-u0 * uscale), u, vdupq_n_f32(uscale));
	let mt = vsubq_f32(vdupq_n_f32(1.0), t);
	let mt_mt = vmulq_f32(mt, mt);
	let mt_t_2 = vmulq_f32(vdupq_n_f32(2.0), vmulq_f32(mt, t));
	let t_t = vmulq_f32(t, t);
	let q = vld3_f32(quad);
	let x12 = vfmaq_lane_f32::<0>(vmulq_lane_f32::<0>(mt_mt, q.0), mt_t_2, q.1);
	let x = vfmaq_lane_f32::<0>(x12, t_t, q.2);
	let y12 = vfmaq_lane_f32::<1>(vmulq_lane_f32::<1>(mt_mt, q.0), mt_t_2, q.1);
	let y = vfmaq_lane_f32::<1>(y12, t_t, q.2);
	// write x0 y0 x0 y0 x1 y1 x1 y1 x2 y2 x2 y3 x3 y3 x3 y3
	let xx1 = vzip1q_f32(x, x); // x0 x0 x1 x1
	let yy1 = vzip1q_f32(y, y); // y0 y0 y1 y1
	vst2q_f32(lines, float32x4x2_t(xx1, yy1));
	let xx2 = vzip2q_f32(x, x); // x2 x2 x3 x3
	let yy2 = vzip2q_f32(y, y); // y2 y2 y3 y3
	vst2q_f32(lines.add(8), float32x4x2_t(xx2, yy2));
	}

	// Quads and params are scratch buffers; stack allocation is another possibility (but would involve MaybeUninit)
	pub fn flatten_cubic(
	c: CubicBez,
	quads: &mut [f32; MAX_QUADS * 6],
	params: &mut [FlattenParams; MAX_QUADS / 2],
	tolerance: f32,
	lines: &mut Vec<FlatLine>,
	) {
	let sqrt_tol = tolerance.sqrt();
	const TO_QUAD_TOL: f32 = 0.1;
	unsafe {
	let n = prepare_quads(c, quads, (tolerance * TO_QUAD_TOL) as f64);
	if n % 2 != 0 {
	quads[n * 6..][..6].fill(0.0);
	}
	let sqrt_remain_tol = sqrt_tol * (1.0 - TO_QUAD_TOL).sqrt();
	let mut val_sum = vdup_n_f32(0.0);
	for i in 0..(n + 1) / 2 {
	let val = estimate_subdiv(quads.as_ptr().add(i * 12), sqrt_remain_tol, &mut params[i]);
	val_sum = vadd_f32(val_sum, val);
	}
	let sum = vget_lane_f32::<0>(vpadd_f32(val_sum, val_sum));
	let n_f32 = (0.5 * sum / sqrt_remain_tol).floor() + 1.0;
	let n = n_f32 as usize;
	lines.reserve(n + 4);
	println!("n = {n_f32}");
	}
	}

	#[test]
	fn test_quads() {
	let c = CubicBez::new((0., 0.), (55., 0.), (100., 45.), (100., 100.));
	let mut quads_buf = [0.0f32; MAX_QUADS * 6];
	unsafe {
	let n = prepare_quads(c, &mut quads_buf, 0.1);
	println!("{n}");
	println!("M{} {}", quads_buf[0], quads_buf[1]);
	for i in 0..n {
	let q = &quads_buf[i * 6..][..6];
	println!("Q{} {} {} {}", q[2], q[3], q[4], q[5]);
	}
	}
	}