Created
July 7, 2025 18:19
-
-
Save raphlinus/44e114fef2fd33b889383a60ced0129b to your computer and use it in GitHub Desktop.
Sketch of Neon code for flattening
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
unsafe fn approx_parabola_integral(x: float32x4_t) -> float32x4_t { | |
const D: f32 = 0.67; | |
let x2 = vmulq_f32(x, x); | |
let t1 = vfmaq_f32(vdupq_n_f32(D.powi(4)), vdupq_n_f32(0.25), x2); | |
let t1_sqrt = vsqrtq_f32(t1); | |
let t1_fourthroot = vsqrtq_f32(t1_sqrt); | |
let denom = vaddq_f32(vdupq_n_f32(1.0 - D), t1_fourthroot); | |
vdivq_f32(x, denom) | |
} | |
unsafe fn approx_parabola_integral_x2(x: float32x2_t) -> float32x2_t { | |
const D: f32 = 0.67; | |
let x2 = vmul_f32(x, x); | |
let t1 = vfma_f32(vdup_n_f32(D.powi(4)), vdup_n_f32(0.25), x2); | |
let t1_sqrt = vsqrt_f32(t1); | |
let t1_fourthroot = vsqrt_f32(t1_sqrt); | |
let denom = vadd_f32(vdup_n_f32(1.0 - D), t1_fourthroot); | |
vdiv_f32(x, denom) | |
} | |
unsafe fn approx_parabola_inv_integral(x: float32x4_t) -> float32x4_t { | |
const B: f32 = 0.39; | |
let x2 = vmulq_f32(x, x); | |
let t1 = vfmaq_f32(vdupq_n_f32(B * B), vdupq_n_f32(0.25), x2); | |
let t1_sqrt = vsqrtq_f32(t1); | |
let factor = vaddq_f32(vdupq_n_f32(1.0 - B), t1_sqrt); | |
vmulq_f32(x, factor) | |
} | |
// Note: for consuming these, a structure of arrays is probably better | |
#[repr(C)] | |
#[derive(Clone, Copy, Default, Debug)] | |
pub struct FlattenParams { | |
a0: [f32; 2], | |
a2: [f32; 2], | |
u0: [f32; 2], | |
uscale: [f32; 2], | |
val: [f32; 2], | |
} | |
unsafe fn is_finite(x: float32x2_t) -> uint32x2_t { | |
vclt_u32(vreinterpret_u32_f32(vabs_f32(x)), vdup_n_u32(0x7f80_0000)) | |
} | |
const MAX_QUADS: usize = 16; | |
// This code is adapted from kurbo CubicBez::to_quads. | |
// | |
// We can probably give a better type to `quads`. | |
pub fn prepare_quads(c: CubicBez, quads: &mut [f32; MAX_QUADS * 6], accuracy: f64) -> usize { | |
let max_hypot2 = 432.0 * accuracy * accuracy; | |
let d01 = c.p1 - c.p0; | |
let d12 = c.p2 - c.p1; | |
let d23 = c.p3 - c.p2; | |
let err = (2.0 * d12 - d01 - d23).hypot2(); | |
let mut n_float = (err / max_hypot2).powf(1. / 6.).floor() + 1.0; | |
n_float = n_float.min(MAX_QUADS as f64); | |
let n = n_float as usize; | |
let dt = 1. / n_float; | |
let dt_0_75 = 0.75 * dt; | |
let d01s = dt_0_75 * d01; | |
let d12s = dt_0_75 * d12; | |
let d23s = dt_0_75 * d23; | |
let mut p0 = c.p0; | |
let mut q0 = d01s; | |
for i in 0..n { | |
let t1 = (i + 1) as f64 * dt; | |
let p2 = if i == n - 1 { c.p3 } else { c.eval(t1) }; | |
let mt = 1.0 - t1; | |
let q1 = (mt * mt) * d01s + (2.0 * t1 * mt) * d12s + (t1 * t1) * d23s; | |
let p1 = p0.midpoint(p2) + q0 - q1; | |
// these can be unchecked; let's look at the asm to see how much it hurts | |
let ix = i * 6; | |
quads[ix] = p0.x as f32; | |
quads[ix + 1] = p0.y as f32; | |
quads[ix + 2] = p1.x as f32; | |
quads[ix + 3] = p1.y as f32; | |
quads[ix + 4] = p2.x as f32; | |
quads[ix + 5] = p2.y as f32; | |
p0 = p2; | |
q0 = q1; | |
} | |
n | |
} | |
/// Compute subdivision math for two quads. | |
pub unsafe fn estimate_subdiv( | |
quads: *const f32, | |
sqrt_tol: f32, | |
out: &mut FlattenParams, | |
) -> float32x2_t { | |
let two_quads = vld3q_u64(quads as *const u64); | |
// points are laid out q0.p0.x q0.p0.y q1.p0.x q1.p0.y | |
let p0 = vreinterpretq_f32_u64(two_quads.0); | |
let p1 = vreinterpretq_f32_u64(two_quads.1); | |
let p2 = vreinterpretq_f32_u64(two_quads.2); | |
let d01 = vsubq_f32(p1, p0); | |
let d12 = vsubq_f32(p2, p1); | |
let d01x = vuzp1q_f32(d01, d01); | |
let d01y = vuzp2q_f32(d01, d01); | |
let d12x = vuzp1q_f32(d12, d12); | |
let d12y = vuzp2q_f32(d12, d12); | |
let ddx = vsubq_f32(d01x, d12x); | |
let ddy = vsubq_f32(d01y, d12y); | |
let d02x = vaddq_f32(d01x, d12x); | |
let d02y = vaddq_f32(d01y, d12y); | |
let cross = vfmsq_f32(vmulq_f32(d02x, ddy), d02y, ddx); | |
let x0_x2_a = vmulq_f32(vcombine_f32(vget_low_f32(d01x), vget_low_f32(d12x)), ddx); | |
let x0_x2_num = vfmaq_f32( | |
x0_x2_a, | |
vcombine_f32(vget_low_f32(d01y), vget_low_f32(d12y)), | |
ddy, | |
); | |
let x0_x2 = vdivq_f32(x0_x2_num, cross); | |
let ddx_low = vget_low_f32(ddx); | |
let ddy_low = vget_low_f32(ddy); | |
let dd_hypot = vsqrt_f32(vmla_f32(vmul_f32(ddx_low, ddx_low), ddy_low, ddy_low)); | |
let x0 = vget_low_f32(x0_x2); | |
let x2 = vget_high_f32(x0_x2); | |
let scale_denom = vmul_f32(dd_hypot, vsub_f32(x2, x0)); | |
let scale = vabs_f32(vdiv_f32(vget_low_f32(cross), scale_denom)); | |
let a0_a2 = approx_parabola_integral(x0_x2); | |
let da = vabs_f32(vsub_f32(vget_high_f32(a0_a2), vget_low_f32(a0_a2))); | |
let sqrt_scale = vsqrt_f32(scale); | |
let mask = vcgez_s32(veor_s32(vreinterpret_s32_f32(x0), vreinterpret_s32_f32(x2))); | |
let noncusp = vmul_f32(da, sqrt_scale); | |
// TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost | |
let xmin = vdiv_f32(vdup_n_f32(sqrt_tol), sqrt_scale); | |
let approxint = approx_parabola_integral_x2(xmin); | |
let cusp = vdiv_f32(vmul_f32(vdup_n_f32(sqrt_tol), da), approxint); | |
let val_raw = vbsl_f32(mask, noncusp, cusp); | |
let val = vreinterpret_f32_u32(vand_u32(is_finite(val_raw), vreinterpret_u32_f32(val_raw))); | |
let u0_u2 = approx_parabola_inv_integral(a0_a2); | |
let u0 = vget_low_f32(u0_u2); | |
let uscale_a = vsub_f32(vget_high_f32(u0_u2), u0); | |
let uscale = vdiv_f32(vdup_n_f32(1.0), uscale_a); | |
vst1q_f32(out.a0.as_mut_ptr(), a0_a2); | |
vst1_f32(out.u0.as_mut_ptr(), u0); | |
vst1_f32(out.uscale.as_mut_ptr(), uscale); | |
vst1_f32(out.val.as_mut_ptr(), val); | |
val | |
} | |
const IOTA_F32: [f32; 4] = [0.0, 1.0, 2.0, 3.0]; | |
// maybe return number of lines written? | |
unsafe fn output_lines( | |
quad: *const f32, | |
a0: f32, | |
da: f32, | |
u0: f32, | |
uscale: f32, | |
x0: f32, | |
dx: f32, | |
lines: *mut f32, | |
) { | |
let xs = vfmaq_f32( | |
vdupq_n_f32(x0), | |
vld1q_f32(IOTA_F32.as_ptr()), | |
vdupq_n_f32(dx), | |
); | |
let a = vfmaq_f32(vdupq_n_f32(a0), xs, vdupq_n_f32(da)); | |
let u = approx_parabola_integral(a); | |
// todo: precalculate -u0 * uscale | |
let t = vfmaq_f32(vdupq_n_f32(-u0 * uscale), u, vdupq_n_f32(uscale)); | |
let mt = vsubq_f32(vdupq_n_f32(1.0), t); | |
let mt_mt = vmulq_f32(mt, mt); | |
let mt_t_2 = vmulq_f32(vdupq_n_f32(2.0), vmulq_f32(mt, t)); | |
let t_t = vmulq_f32(t, t); | |
let q = vld3_f32(quad); | |
let x12 = vfmaq_lane_f32::<0>(vmulq_lane_f32::<0>(mt_mt, q.0), mt_t_2, q.1); | |
let x = vfmaq_lane_f32::<0>(x12, t_t, q.2); | |
let y12 = vfmaq_lane_f32::<1>(vmulq_lane_f32::<1>(mt_mt, q.0), mt_t_2, q.1); | |
let y = vfmaq_lane_f32::<1>(y12, t_t, q.2); | |
// write x0 y0 x0 y0 x1 y1 x1 y1 x2 y2 x2 y3 x3 y3 x3 y3 | |
let xx1 = vzip1q_f32(x, x); // x0 x0 x1 x1 | |
let yy1 = vzip1q_f32(y, y); // y0 y0 y1 y1 | |
vst2q_f32(lines, float32x4x2_t(xx1, yy1)); | |
let xx2 = vzip2q_f32(x, x); // x2 x2 x3 x3 | |
let yy2 = vzip2q_f32(y, y); // y2 y2 y3 y3 | |
vst2q_f32(lines.add(8), float32x4x2_t(xx2, yy2)); | |
} | |
// Quads and params are scratch buffers; stack allocation is another possibility (but would involve MaybeUninit) | |
pub fn flatten_cubic( | |
c: CubicBez, | |
quads: &mut [f32; MAX_QUADS * 6], | |
params: &mut [FlattenParams; MAX_QUADS / 2], | |
tolerance: f32, | |
lines: &mut Vec<FlatLine>, | |
) { | |
let sqrt_tol = tolerance.sqrt(); | |
const TO_QUAD_TOL: f32 = 0.1; | |
unsafe { | |
let n = prepare_quads(c, quads, (tolerance * TO_QUAD_TOL) as f64); | |
if n % 2 != 0 { | |
quads[n * 6..][..6].fill(0.0); | |
} | |
let sqrt_remain_tol = sqrt_tol * (1.0 - TO_QUAD_TOL).sqrt(); | |
let mut val_sum = vdup_n_f32(0.0); | |
for i in 0..(n + 1) / 2 { | |
let val = estimate_subdiv(quads.as_ptr().add(i * 12), sqrt_remain_tol, &mut params[i]); | |
val_sum = vadd_f32(val_sum, val); | |
} | |
let sum = vget_lane_f32::<0>(vpadd_f32(val_sum, val_sum)); | |
let n_f32 = (0.5 * sum / sqrt_remain_tol).floor() + 1.0; | |
let n = n_f32 as usize; | |
lines.reserve(n + 4); | |
println!("n = {n_f32}"); | |
} | |
} | |
#[test] | |
fn test_quads() { | |
let c = CubicBez::new((0., 0.), (55., 0.), (100., 45.), (100., 100.)); | |
let mut quads_buf = [0.0f32; MAX_QUADS * 6]; | |
unsafe { | |
let n = prepare_quads(c, &mut quads_buf, 0.1); | |
println!("{n}"); | |
println!("M{} {}", quads_buf[0], quads_buf[1]); | |
for i in 0..n { | |
let q = &quads_buf[i * 6..][..6]; | |
println!("Q{} {} {} {}", q[2], q[3], q[4], q[5]); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment