Skip to content

Instantly share code, notes, and snippets.

@raphlinus
Created July 7, 2025 18:19
Show Gist options
  • Save raphlinus/44e114fef2fd33b889383a60ced0129b to your computer and use it in GitHub Desktop.
Save raphlinus/44e114fef2fd33b889383a60ced0129b to your computer and use it in GitHub Desktop.
Sketch of Neon code for flattening
unsafe fn approx_parabola_integral(x: float32x4_t) -> float32x4_t {
const D: f32 = 0.67;
let x2 = vmulq_f32(x, x);
let t1 = vfmaq_f32(vdupq_n_f32(D.powi(4)), vdupq_n_f32(0.25), x2);
let t1_sqrt = vsqrtq_f32(t1);
let t1_fourthroot = vsqrtq_f32(t1_sqrt);
let denom = vaddq_f32(vdupq_n_f32(1.0 - D), t1_fourthroot);
vdivq_f32(x, denom)
}
unsafe fn approx_parabola_integral_x2(x: float32x2_t) -> float32x2_t {
const D: f32 = 0.67;
let x2 = vmul_f32(x, x);
let t1 = vfma_f32(vdup_n_f32(D.powi(4)), vdup_n_f32(0.25), x2);
let t1_sqrt = vsqrt_f32(t1);
let t1_fourthroot = vsqrt_f32(t1_sqrt);
let denom = vadd_f32(vdup_n_f32(1.0 - D), t1_fourthroot);
vdiv_f32(x, denom)
}
unsafe fn approx_parabola_inv_integral(x: float32x4_t) -> float32x4_t {
const B: f32 = 0.39;
let x2 = vmulq_f32(x, x);
let t1 = vfmaq_f32(vdupq_n_f32(B * B), vdupq_n_f32(0.25), x2);
let t1_sqrt = vsqrtq_f32(t1);
let factor = vaddq_f32(vdupq_n_f32(1.0 - B), t1_sqrt);
vmulq_f32(x, factor)
}
// Note: for consuming these, a structure of arrays is probably better
#[repr(C)]
#[derive(Clone, Copy, Default, Debug)]
pub struct FlattenParams {
a0: [f32; 2],
a2: [f32; 2],
u0: [f32; 2],
uscale: [f32; 2],
val: [f32; 2],
}
unsafe fn is_finite(x: float32x2_t) -> uint32x2_t {
vclt_u32(vreinterpret_u32_f32(vabs_f32(x)), vdup_n_u32(0x7f80_0000))
}
const MAX_QUADS: usize = 16;
// This code is adapted from kurbo CubicBez::to_quads.
//
// We can probably give a better type to `quads`.
pub fn prepare_quads(c: CubicBez, quads: &mut [f32; MAX_QUADS * 6], accuracy: f64) -> usize {
let max_hypot2 = 432.0 * accuracy * accuracy;
let d01 = c.p1 - c.p0;
let d12 = c.p2 - c.p1;
let d23 = c.p3 - c.p2;
let err = (2.0 * d12 - d01 - d23).hypot2();
let mut n_float = (err / max_hypot2).powf(1. / 6.).floor() + 1.0;
n_float = n_float.min(MAX_QUADS as f64);
let n = n_float as usize;
let dt = 1. / n_float;
let dt_0_75 = 0.75 * dt;
let d01s = dt_0_75 * d01;
let d12s = dt_0_75 * d12;
let d23s = dt_0_75 * d23;
let mut p0 = c.p0;
let mut q0 = d01s;
for i in 0..n {
let t1 = (i + 1) as f64 * dt;
let p2 = if i == n - 1 { c.p3 } else { c.eval(t1) };
let mt = 1.0 - t1;
let q1 = (mt * mt) * d01s + (2.0 * t1 * mt) * d12s + (t1 * t1) * d23s;
let p1 = p0.midpoint(p2) + q0 - q1;
// these can be unchecked; let's look at the asm to see how much it hurts
let ix = i * 6;
quads[ix] = p0.x as f32;
quads[ix + 1] = p0.y as f32;
quads[ix + 2] = p1.x as f32;
quads[ix + 3] = p1.y as f32;
quads[ix + 4] = p2.x as f32;
quads[ix + 5] = p2.y as f32;
p0 = p2;
q0 = q1;
}
n
}
/// Compute subdivision math for two quads.
pub unsafe fn estimate_subdiv(
quads: *const f32,
sqrt_tol: f32,
out: &mut FlattenParams,
) -> float32x2_t {
let two_quads = vld3q_u64(quads as *const u64);
// points are laid out q0.p0.x q0.p0.y q1.p0.x q1.p0.y
let p0 = vreinterpretq_f32_u64(two_quads.0);
let p1 = vreinterpretq_f32_u64(two_quads.1);
let p2 = vreinterpretq_f32_u64(two_quads.2);
let d01 = vsubq_f32(p1, p0);
let d12 = vsubq_f32(p2, p1);
let d01x = vuzp1q_f32(d01, d01);
let d01y = vuzp2q_f32(d01, d01);
let d12x = vuzp1q_f32(d12, d12);
let d12y = vuzp2q_f32(d12, d12);
let ddx = vsubq_f32(d01x, d12x);
let ddy = vsubq_f32(d01y, d12y);
let d02x = vaddq_f32(d01x, d12x);
let d02y = vaddq_f32(d01y, d12y);
let cross = vfmsq_f32(vmulq_f32(d02x, ddy), d02y, ddx);
let x0_x2_a = vmulq_f32(vcombine_f32(vget_low_f32(d01x), vget_low_f32(d12x)), ddx);
let x0_x2_num = vfmaq_f32(
x0_x2_a,
vcombine_f32(vget_low_f32(d01y), vget_low_f32(d12y)),
ddy,
);
let x0_x2 = vdivq_f32(x0_x2_num, cross);
let ddx_low = vget_low_f32(ddx);
let ddy_low = vget_low_f32(ddy);
let dd_hypot = vsqrt_f32(vmla_f32(vmul_f32(ddx_low, ddx_low), ddy_low, ddy_low));
let x0 = vget_low_f32(x0_x2);
let x2 = vget_high_f32(x0_x2);
let scale_denom = vmul_f32(dd_hypot, vsub_f32(x2, x0));
let scale = vabs_f32(vdiv_f32(vget_low_f32(cross), scale_denom));
let a0_a2 = approx_parabola_integral(x0_x2);
let da = vabs_f32(vsub_f32(vget_high_f32(a0_a2), vget_low_f32(a0_a2)));
let sqrt_scale = vsqrt_f32(scale);
let mask = vcgez_s32(veor_s32(vreinterpret_s32_f32(x0), vreinterpret_s32_f32(x2)));
let noncusp = vmul_f32(da, sqrt_scale);
// TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost
let xmin = vdiv_f32(vdup_n_f32(sqrt_tol), sqrt_scale);
let approxint = approx_parabola_integral_x2(xmin);
let cusp = vdiv_f32(vmul_f32(vdup_n_f32(sqrt_tol), da), approxint);
let val_raw = vbsl_f32(mask, noncusp, cusp);
let val = vreinterpret_f32_u32(vand_u32(is_finite(val_raw), vreinterpret_u32_f32(val_raw)));
let u0_u2 = approx_parabola_inv_integral(a0_a2);
let u0 = vget_low_f32(u0_u2);
let uscale_a = vsub_f32(vget_high_f32(u0_u2), u0);
let uscale = vdiv_f32(vdup_n_f32(1.0), uscale_a);
vst1q_f32(out.a0.as_mut_ptr(), a0_a2);
vst1_f32(out.u0.as_mut_ptr(), u0);
vst1_f32(out.uscale.as_mut_ptr(), uscale);
vst1_f32(out.val.as_mut_ptr(), val);
val
}
const IOTA_F32: [f32; 4] = [0.0, 1.0, 2.0, 3.0];
// maybe return number of lines written?
unsafe fn output_lines(
quad: *const f32,
a0: f32,
da: f32,
u0: f32,
uscale: f32,
x0: f32,
dx: f32,
lines: *mut f32,
) {
let xs = vfmaq_f32(
vdupq_n_f32(x0),
vld1q_f32(IOTA_F32.as_ptr()),
vdupq_n_f32(dx),
);
let a = vfmaq_f32(vdupq_n_f32(a0), xs, vdupq_n_f32(da));
let u = approx_parabola_integral(a);
// todo: precalculate -u0 * uscale
let t = vfmaq_f32(vdupq_n_f32(-u0 * uscale), u, vdupq_n_f32(uscale));
let mt = vsubq_f32(vdupq_n_f32(1.0), t);
let mt_mt = vmulq_f32(mt, mt);
let mt_t_2 = vmulq_f32(vdupq_n_f32(2.0), vmulq_f32(mt, t));
let t_t = vmulq_f32(t, t);
let q = vld3_f32(quad);
let x12 = vfmaq_lane_f32::<0>(vmulq_lane_f32::<0>(mt_mt, q.0), mt_t_2, q.1);
let x = vfmaq_lane_f32::<0>(x12, t_t, q.2);
let y12 = vfmaq_lane_f32::<1>(vmulq_lane_f32::<1>(mt_mt, q.0), mt_t_2, q.1);
let y = vfmaq_lane_f32::<1>(y12, t_t, q.2);
// write x0 y0 x0 y0 x1 y1 x1 y1 x2 y2 x2 y3 x3 y3 x3 y3
let xx1 = vzip1q_f32(x, x); // x0 x0 x1 x1
let yy1 = vzip1q_f32(y, y); // y0 y0 y1 y1
vst2q_f32(lines, float32x4x2_t(xx1, yy1));
let xx2 = vzip2q_f32(x, x); // x2 x2 x3 x3
let yy2 = vzip2q_f32(y, y); // y2 y2 y3 y3
vst2q_f32(lines.add(8), float32x4x2_t(xx2, yy2));
}
// Quads and params are scratch buffers; stack allocation is another possibility (but would involve MaybeUninit)
pub fn flatten_cubic(
c: CubicBez,
quads: &mut [f32; MAX_QUADS * 6],
params: &mut [FlattenParams; MAX_QUADS / 2],
tolerance: f32,
lines: &mut Vec<FlatLine>,
) {
let sqrt_tol = tolerance.sqrt();
const TO_QUAD_TOL: f32 = 0.1;
unsafe {
let n = prepare_quads(c, quads, (tolerance * TO_QUAD_TOL) as f64);
if n % 2 != 0 {
quads[n * 6..][..6].fill(0.0);
}
let sqrt_remain_tol = sqrt_tol * (1.0 - TO_QUAD_TOL).sqrt();
let mut val_sum = vdup_n_f32(0.0);
for i in 0..(n + 1) / 2 {
let val = estimate_subdiv(quads.as_ptr().add(i * 12), sqrt_remain_tol, &mut params[i]);
val_sum = vadd_f32(val_sum, val);
}
let sum = vget_lane_f32::<0>(vpadd_f32(val_sum, val_sum));
let n_f32 = (0.5 * sum / sqrt_remain_tol).floor() + 1.0;
let n = n_f32 as usize;
lines.reserve(n + 4);
println!("n = {n_f32}");
}
}
#[test]
fn test_quads() {
let c = CubicBez::new((0., 0.), (55., 0.), (100., 45.), (100., 100.));
let mut quads_buf = [0.0f32; MAX_QUADS * 6];
unsafe {
let n = prepare_quads(c, &mut quads_buf, 0.1);
println!("{n}");
println!("M{} {}", quads_buf[0], quads_buf[1]);
for i in 0..n {
let q = &quads_buf[i * 6..][..6];
println!("Q{} {} {} {}", q[2], q[3], q[4], q[5]);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment