vello_common 0.0.8

// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! This is a temporary module that contains a SIMD version of flattening of cubic curves, as
//! well as some code that was copied from kurbo, which is needed to reimplement the
//! full `flatten` method.

#[cfg(not(feature = "std"))]
use crate::kurbo::common::FloatFuncs as _;
use crate::kurbo::{CubicBez, Line, ParamCurve, ParamCurveNearest, PathEl, Point, QuadBez};
use crate::{
    flatten::{SQRT_TOL, TOL, TOL_2},
    geometry::RectU16,
    kurbo::Affine,
    tile::Tile,
};
use alloc::vec::Vec;
use bytemuck::{Pod, Zeroable};
use fearless_simd::*;

/// The element of a path made of lines.
///
/// Each subpath must start with a `MoveTo`. Closing of subpaths is not supported, and subpaths are
/// not closed implicitly when a new subpath (with `MoveTo`) is started. It is expected that closed
/// subpaths are watertight in the sense that the last `LineTo` matches exactly with the first
/// `MoveTo`.
///
/// This intentionally allows for non-watertight subpaths, as, e.g., lines that are fully outside
/// of the viewport do not need to be drawn.
///
/// See [`PathEl`] for a more general-purpose path element type.
pub(crate) enum LinePathEl {
    MoveTo(Point),
    LineTo(Point),
}

// Unlike kurbo, which takes a closure with a callback for outputting the lines, we use a trait
// instead. The reason is that this way the callback can be inlined, which is not possible with
// a closure and turned out to have a noticeable overhead.
pub(crate) trait Callback {
    fn callback(&mut self, el: LinePathEl);
}

/// See the docs for the kurbo implementation of flattening:
/// <https://docs.rs/kurbo/latest/kurbo/fn.flatten.html>
///
/// This version works using a similar approach but using f32x4/f32x8 SIMD instead.
#[inline(always)]
pub(crate) fn flatten<S: Simd>(
    simd: S,
    path: impl IntoIterator<Item = PathEl>,
    // Note: we explicitly pass the `Affine` here instead of using `map` on
    // the iterator because it seems like the `map` function isn't always inlined
    // properly, even when annotating the closure with `#[inline(always)]`.
    // See https://github.com/linebender/vello/pull/1600.
    affine: Affine,
    callback: &mut impl Callback,
    flatten_ctx: &mut FlattenCtx,
    cull_bbox: RectU16,
) {
    flatten_ctx.flattened_cubics.clear();

    // For the culling performed here to be correct, the top y coordinate of the cull bbox must be
    // aligned to strip row boundaries. Consider the alternative: for example, a strip row starting
    // at y=8, a cull bbox starting at y=10, and (nearly) horizontal geometry at y=9 that does not
    // cross into the cull bbox and does not extend above y=8 (and note this is all in a y-down
    // coordinate space).
    //
    // The culling performed here would remove that geometry. However, as it does not extend above
    // the strip row, it does not add coarse winding to the strip, and does not produce a sparse
    // fill. Yet, if this is the top part of, say, a rectangle that is partially visible, the
    // geometry is necessary for tiling to emit intermediate tiles that are necessary for the
    // bottom part of the strip to get filled.
    //
    // Therefore, we align `top` to strip row boundaries, such that all intermediate tiles are
    // produced.
    //
    // This is not necessary for the bottom. Consider a path where a segment extends just below the
    // cull bbox, but remains in the same strip row. The path is closed, so if anything is to be
    // rendered at all, there will be other geometry above that edge of the cull bbox. If that
    // geometry extends above the row, there will be coarse winding for a sparse fill. If not, it
    // there will be geometry to generate the intermediate tiles.
    let left = cull_bbox.x0 as f64;
    let top = ((cull_bbox.y0 / Tile::HEIGHT) * Tile::HEIGHT) as f64;
    let right = cull_bbox.x1 as f64;
    let bottom = cull_bbox.y1 as f64;

    let mut path = path.into_iter();
    let Some(first_el) = path.next() else {
        return;
    };
    let first_el = affine * first_el;
    let PathEl::MoveTo(start_pt) = first_el else {
        debug_assert!(
            matches!(first_el, PathEl::MoveTo(_)),
            "Non-empty paths must begin with `PathEl::MoveTo`, got {first_el:?}"
        );
        return;
    };

    let mut start_pt = start_pt;
    let mut last_pt = start_pt;
    callback.callback(LinePathEl::MoveTo(start_pt));

    for el in path {
        match affine * el {
            PathEl::MoveTo(p) => {
                if last_pt != start_pt {
                    callback.callback(LinePathEl::LineTo(start_pt));
                }
                last_pt = p;
                start_pt = p;
                callback.callback(LinePathEl::MoveTo(p));
            }
            PathEl::LineTo(p) => {
                last_pt = p;
                callback.callback(LinePathEl::LineTo(p));
            }
            PathEl::QuadTo(p1, p2) => {
                let p0 = last_pt;
                let line = Line::new(p0, p2);
                // If the quadratic Bézier is fully to the right, top, or bottom of the culling
                // bbox, it does not impact pixel coverage or winding. We can ignore it. The
                // following checks that conservatively by checking whether the bounding box of the
                // Bézier's control points is fully outside the culling bbox.
                if [p0, p1, p2].into_iter().all(|p| p.x > right)
                    || [p0, p1, p2].into_iter().all(|p| p.y < top)
                    || [p0, p1, p2].into_iter().all(|p| p.y > bottom)
                {
                    callback.callback(LinePathEl::MoveTo(p2));
                }
                // The following checks two things. First, if the quadratic Bézier is fully to the
                // left of the culling bbox, it may affect pixel coverage and winding, but its
                // exact shape does not matter. It can be emitted as a line segment [p0, p2].
                //
                // Second, an upper bound on the shortest distance of any point on the quadratic
                // Bézier curve to the line segment [p0, p2] is 1/2 of the control-point-to-line-segment
                // distance.
                //
                // The derivation is similar to that for the cubic Bézier (see below). In
                // short:
                //
                // q(t) = B0(t) p0 + B1(t) p1 + B2(t) p2
                // dist(q(t), [p0, p1]) <= B1(t) dist(p1, [p0, p1])
                //                       = 2 (1-t)t dist(p1, [p0, p1]).
                //
                // The maximum occurs at t=1/2, hence
                // max(dist(q(t), [p0, p1] <= 1/2 dist(p1, [p0, p1])).
                //
                // The following takes the square to elide the square root of the Euclidean
                // distance.
                else if [p0, p1, p2].into_iter().all(|p| p.x < left)
                    || line.nearest(p1, 0.).distance_sq <= 4. * TOL_2
                {
                    callback.callback(LinePathEl::LineTo(p2));
                } else {
                    let q = QuadBez::new(p0, p1, p2);
                    let params = q.estimate_subdiv(SQRT_TOL);
                    let n = ((0.5 / SQRT_TOL * params.val).ceil() as usize).max(1);
                    let step = 1.0 / (n as f64);
                    for i in 1..n {
                        let u = (i as f64) * step;
                        let t = q.determine_subdiv_t(&params, u);
                        let p = q.eval(t);
                        callback.callback(LinePathEl::LineTo(p));
                    }
                    callback.callback(LinePathEl::LineTo(p2));
                }
                last_pt = p2;
            }
            PathEl::CurveTo(p1, p2, p3) => {
                let p0 = last_pt;
                let line = Line::new(p0, p3);
                // If the cubic Bézier is fully to the right, top, or bottom of the culling bbox,
                // it does not impact pixel coverage or winding. We can ignore it. The following
                // checks that conservatively by checking whether the bounding box of the Bézier's
                // control points is fully outside the culling bbox.
                if [p0, p1, p2, p3].into_iter().all(|p| p.x > right)
                    || [p0, p1, p2, p3].into_iter().all(|p| p.y < top)
                    || [p0, p1, p2, p3].into_iter().all(|p| p.y > bottom)
                {
                    callback.callback(LinePathEl::MoveTo(p3));
                }
                // The following checks two things. First, if the cubic Bézier is fully to the left
                // of the culling bbox, it may affect pixel coverage and winding, but its exact
                // shape does not matter. It can be emitted as a line segment [p0, p3].
                //
                // Second, an upper bound on the shortest distance of any point on the cubic Bézier
                // curve to the line segment [p0, p3] is 3/4 of the maximum of the
                // control-point-to-line-segment distances.
                //
                // With Bernstein weights Bi(t), we have
                // c(t) = B0(t) p0 + B1(t) p1 + B2(t) p2 + B3(t) p3
                // with t from 0 to 1 (inclusive).
                //
                // Through convexivity of the Euclidean distance function and the line segment,
                // we have
                // dist(c(t), [p0, p3]) <= B1(t) dist(p1, [p0, p3]) + B2(t) dist(p2, [p0, p3])
                //                      <= (B1(t) + B2(t)) max(dist(p1, [p0, p3]), dist(p2, [p0, p3]))
                //                       = 3 ((1-t)t^2 + (1-t)^2t) max(dist(p1, [p0, p3]), dist(p2, [p0, p3])).
                //
                // The inner polynomial has its maximum of 1/4 at t=1/2, hence
                // max(dist(c(t), [p0, p3])) <= 3/4 max(dist(p1, [p0, p3]), dist(p2, [p0, p3])).
                //
                // The following takes the square to elide the square root of the Euclidean
                // distance.
                else if [p0, p1, p2, p3].into_iter().all(|p| p.x < left)
                    || f64::max(
                        line.nearest(p1, 0.).distance_sq,
                        line.nearest(p2, 0.).distance_sq,
                    ) <= 16. / 9. * TOL_2
                {
                    callback.callback(LinePathEl::LineTo(p3));
                } else {
                    let c = CubicBez::new(p0, p1, p2, p3);
                    let max = flatten_cubic_simd(simd, c, flatten_ctx);

                    for p in &flatten_ctx.flattened_cubics[1..max] {
                        callback.callback(LinePathEl::LineTo(Point::new(p.x as f64, p.y as f64)));
                    }
                }
                last_pt = p3;
            }
            PathEl::ClosePath => {
                if last_pt != start_pt {
                    callback.callback(LinePathEl::LineTo(start_pt));

                    // Kurbo says: "If `quad_to` [or another drawing op] is called immediately
                    // after `close_path` then the current subpath starts at the initial point of
                    // the previous subpath."
                    //
                    // Hence, we set `last_pt` back to the just-closed subpath's `start_pt`.
                    last_pt = start_pt;
                }
            }
        }
    }

    if last_pt != start_pt {
        callback.callback(LinePathEl::LineTo(start_pt));
    }
}

// The below methods are copied from kurbo and needed to implement flattening of normal quad curves.

/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
///
/// This is used for flattening curves.
fn approx_parabola_integral(x: f64) -> f64 {
    const D: f64 = 0.67;
    x / (1.0 - D + (D.powi(4) + 0.25 * x * x).sqrt().sqrt())
}

/// An approximation to the inverse parabola integral.
fn approx_parabola_inv_integral(x: f64) -> f64 {
    const B: f64 = 0.39;
    x * (1.0 - B + (B * B + 0.25 * x * x).sqrt())
}

impl FlattenParamsExt for QuadBez {
    #[inline(always)]
    fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams {
        // Determine transformation to $y = x^2$ parabola.
        let d01 = self.p1 - self.p0;
        let d12 = self.p2 - self.p1;
        let dd = d01 - d12;
        let cross = (self.p2 - self.p0).cross(dd);
        let x0 = d01.dot(dd) * cross.recip();
        let x2 = d12.dot(dd) * cross.recip();
        let scale = (cross / (dd.hypot() * (x2 - x0))).abs();

        // Compute number of subdivisions needed.
        let a0 = approx_parabola_integral(x0);
        let a2 = approx_parabola_integral(x2);
        let val = if scale.is_finite() {
            let da = (a2 - a0).abs();
            let sqrt_scale = scale.sqrt();
            if x0.signum() == x2.signum() {
                da * sqrt_scale
            } else {
                // Handle cusp case (segment contains curvature maximum)
                let xmin = sqrt_tol / sqrt_scale;
                sqrt_tol * da / approx_parabola_integral(xmin)
            }
        } else {
            0.0
        };
        let u0 = approx_parabola_inv_integral(a0);
        let u2 = approx_parabola_inv_integral(a2);
        let uscale = (u2 - u0).recip();
        FlattenParams {
            a0,
            a2,
            u0,
            uscale,
            val,
        }
    }

    #[inline(always)]
    fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64 {
        let a = params.a0 + (params.a2 - params.a0) * x;
        let u = approx_parabola_inv_integral(a);
        (u - params.u0) * params.uscale
    }
}

trait FlattenParamsExt {
    fn estimate_subdiv(&self, sqrt_tol: f64) -> FlattenParams;
    fn determine_subdiv_t(&self, params: &FlattenParams, x: f64) -> f64;
}

// Everything below is a SIMD implementation of flattening of cubic curves.
// It's a combination of https://gist.github.com/raphlinus/5f4e9feb85fd79bafc72da744571ec0e
// and https://gist.github.com/raphlinus/44e114fef2fd33b889383a60ced0129b.

// TODO(laurenz): Perhaps we should get rid of this in the future and work directly with f32,
// as it's the only reason we have to pull in proc_macros via the `derive` feature of bytemuck.
#[derive(Clone, Copy, Debug, Default, Zeroable, Pod)]
#[repr(C)]
struct Point32 {
    x: f32,
    y: f32,
}

struct FlattenParams {
    a0: f64,
    a2: f64,
    u0: f64,
    uscale: f64,
    /// The number of `subdivisions * 2 * sqrt_tol`.
    val: f64,
}

/// This limit was chosen based on the pre-existing GitHub gist.
/// This limit should not be hit in normal operation, but _might_ be hit for very large
/// transforms.
const MAX_QUADS: usize = 16;

/// The context needed for flattening curves.
#[derive(Default, Debug)]
pub struct FlattenCtx {
    // The +4 is to encourage alignment; might be better to be explicit
    even_pts: [Point32; MAX_QUADS + 4],
    odd_pts: [Point32; MAX_QUADS],
    a0: [f32; MAX_QUADS],
    da: [f32; MAX_QUADS],
    u0: [f32; MAX_QUADS],
    uscale: [f32; MAX_QUADS],
    val: [f32; MAX_QUADS],
    n_quads: usize,
    /// Reusable buffer for flattened cubic points.
    flattened_cubics: Vec<Point32>,
}

#[inline(always)]
fn is_finite_simd<S: Simd>(x: f32x4<S>) -> mask32x4<S> {
    let simd = x.simd;

    let x_abs = x.abs();
    let reinterpreted = u32x4::from_bytes(x_abs.to_bytes());
    simd.simd_lt_u32x4(reinterpreted, u32x4::splat(simd, 0x7f80_0000))
}

/// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
///
/// This is used for flattening curves.
///
/// SIMD version of [`approx_parabola_integral`].
#[inline(always)]
fn approx_parabola_integral_simd<S: Simd, F: SimdFloat<S, Element = f32>>(x: F) -> F {
    let simd = x.witness();

    const D: f32 = 0.67;
    const D_POWI_4: f32 = 0.201_511_2;

    let temp = F::splat(x.witness(), 0.25)
        .mul_add(x * x, F::splat(simd, D_POWI_4))
        .sqrt()
        .sqrt();
    let denom = temp + (1. - D);
    x / denom
}

/// An approximation to the inverse parabola integral.
///
/// SIMD version of [`approx_parabola_inv_integral`].
#[inline(always)]
fn approx_parabola_inv_integral_simd<S: Simd>(x: f32x8<S>) -> f32x8<S> {
    let simd = x.simd;

    const B: f32 = 0.39;
    const ONE_MINUS_B: f32 = 1.0 - B;

    let temp = f32x8::splat(simd, 0.25).mul_add(x * x, B * B).sqrt();
    let factor = f32x8::splat(simd, ONE_MINUS_B) + temp;
    x * factor
}

#[inline(always)]
fn pt_splat_simd<S: Simd>(simd: S, pt: Point32) -> f32x8<S> {
    let p_f64: f64 = bytemuck::cast(pt);
    simd.reinterpret_f32_f64x4(f64x4::splat(simd, p_f64))
}

#[inline(always)]
fn eval_cubics_simd<S: Simd>(simd: S, c: &CubicBez, n: usize, result: &mut FlattenCtx) {
    result.n_quads = n;
    let dt = 0.5 / n as f32;

    // TODO(laurenz): Perhaps we can SIMDify this better
    let p0p1 = f32x4::from_slice(
        simd,
        &[c.p0.x as f32, c.p0.y as f32, c.p1.x as f32, c.p1.y as f32],
    );
    let p2p3 = f32x4::from_slice(
        simd,
        &[c.p2.x as f32, c.p2.y as f32, c.p3.x as f32, c.p3.y as f32],
    );

    let split_single = |input: f32x4<S>| {
        let t1 = simd.reinterpret_f64_f32x4(input);
        let p0 = simd.zip_low_f64x2(t1, t1);
        let p1 = simd.zip_high_f64x2(t1, t1);

        let p0 = simd.reinterpret_f32_f64x2(p0);
        let p1 = simd.reinterpret_f32_f64x2(p1);

        (f32x8::block_splat(p0), f32x8::block_splat(p1))
    };

    let (p0_128, p1_128) = split_single(p0p1);
    let (p2_128, p3_128) = split_single(p2p3);

    // Use Horner's method to evaluate p(t) as ((a*t + b)*t + c)*t + d. This allows hoisting
    // coefficients out of the loop, and then evaluating as three sequential FMAs. Estrin's method
    // would expose more ILP, but the increase in work on such small polynomials (cubic here and
    // quad below) makes it a net slowdown.
    let coeff_a = (p1_128 - p2_128).mul_add(3.0, p3_128 - p0_128);
    let coeff_b = p1_128.mul_add(-2.0, p0_128 + p2_128) * 3.0;
    let coeff_c = (p1_128 - p0_128) * 3.0;
    let coeff_d = p0_128;

    let iota = f32x8::from_slice(simd, &[0.0, 0.0, 2.0, 2.0, 1.0, 1.0, 3.0, 3.0]);
    let step = iota * dt;
    let mut t = step;
    let t_inc = f32x8::splat(simd, 4.0 * dt);

    let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.even_pts);
    let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut result.odd_pts);

    for i in 0..n.div_ceil(2) {
        let evaluated = (coeff_a.mul_add(t, coeff_b))
            .mul_add(t, coeff_c)
            .mul_add(t, coeff_d);

        let (low, high) = simd.split_f32x8(evaluated);

        low.store_slice(&mut even_pts[i * 4..][..4]);
        high.store_slice(&mut odd_pts[i * 4..][..4]);

        t += t_inc;
    }

    p3_128.store_slice(&mut even_pts[n * 2..][..8]);
}

#[inline(always)]
fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {
    let n = ctx.n_quads;

    let even_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.even_pts);
    let odd_pts: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.odd_pts);

    for i in 0..n.div_ceil(4) {
        let p0 = f32x8::from_slice(simd, &even_pts[i * 8..][..8]);
        let p_onehalf = f32x8::from_slice(simd, &odd_pts[i * 8..][..8]);
        let p2 = f32x8::from_slice(simd, &even_pts[(i * 8 + 2)..][..8]);
        let x = p0 * -0.5;
        let x1 = p_onehalf.mul_add(2.0, x);
        let p1 = p2.mul_add(-0.5, x1);

        p1.store_slice(&mut odd_pts[(i * 8)..][..8]);

        let d01 = p1 - p0;
        let d12 = p2 - p1;
        let d01x = simd.unzip_low_f32x8(d01, d01);
        let d01y = simd.unzip_high_f32x8(d01, d01);
        let d12x = simd.unzip_low_f32x8(d12, d12);
        let d12y = simd.unzip_high_f32x8(d12, d12);
        let ddx = d01x - d12x;
        let ddy = d01y - d12y;
        let d02x = d01x + d12x;
        let d02y = d01y + d12y;
        // (d02x * ddy) - (d02y * ddx)
        let cross = ddx.mul_add(-d02y, d02x * ddy);

        let x0_x2_a = {
            let (d01x_low, _) = simd.split_f32x8(d01x);
            let (d12x_low, _) = simd.split_f32x8(d12x);

            simd.combine_f32x4(d12x_low, d01x_low) * ddx
        };
        let temp1 = {
            let (d12y_low, _) = simd.split_f32x8(d12y);
            let (d01y_low, _) = simd.split_f32x8(d01y);

            simd.combine_f32x4(d12y_low, d01y_low)
        };
        let x0_x2_num = temp1.mul_add(ddy, x0_x2_a);
        let x0_x2 = x0_x2_num / cross;
        let (ddx_low, _) = simd.split_f32x8(ddx);
        let (ddy_low, _) = simd.split_f32x8(ddy);
        let dd_hypot = ddy_low.mul_add(ddy_low, ddx_low * ddx_low).sqrt();
        let (x0, x2) = simd.split_f32x8(x0_x2);
        let scale_denom = dd_hypot * (x2 - x0);
        let (temp2, _) = simd.split_f32x8(cross);
        let scale = (temp2 / scale_denom).abs();
        let a0_a2 = approx_parabola_integral_simd(x0_x2);
        let (a0, a2) = simd.split_f32x8(a0_a2);
        let da = a2 - a0;
        let da_abs = da.abs();
        let sqrt_scale = scale.sqrt();
        let temp3 = simd.or_i32x4(x0.bitcast(), x2.bitcast());
        let mask = simd.simd_ge_i32x4(temp3, i32x4::splat(simd, 0));
        let noncusp = da_abs * sqrt_scale;
        // TODO: should we skip this if neither is a cusp? Maybe not worth branch prediction cost
        let xmin = sqrt_tol / sqrt_scale;
        let approxint = approx_parabola_integral_simd(xmin);
        let cusp = (sqrt_tol * da_abs) / approxint;
        let val_raw = simd.select_f32x4(mask, noncusp, cusp);
        let finite_mask = is_finite_simd(val_raw);
        let val = simd.select_f32x4(finite_mask, val_raw, f32x4::splat(simd, 0.0));
        let u0_u2 = approx_parabola_inv_integral_simd(a0_a2);
        let (u0, u2) = simd.split_f32x8(u0_u2);
        let uscale_a = u2 - u0;
        let uscale = 1.0 / uscale_a;

        a0.store_slice(&mut ctx.a0[i * 4..][..4]);
        da.store_slice(&mut ctx.da[i * 4..][..4]);
        u0.store_slice(&mut ctx.u0[i * 4..][..4]);
        uscale.store_slice(&mut ctx.uscale[i * 4..][..4]);
        val.store_slice(&mut ctx.val[i * 4..][..4]);
    }
}

#[inline(always)]
fn output_lines_simd<S: Simd>(
    simd: S,
    ctx: &mut FlattenCtx,
    i: usize,
    x0: f32,
    dx: f32,
    n: usize,
    start_idx: usize,
) {
    let p0 = pt_splat_simd(simd, ctx.even_pts[i]);
    let p1 = pt_splat_simd(simd, ctx.odd_pts[i]);
    let p2 = pt_splat_simd(simd, ctx.even_pts[i + 1]);

    const IOTA2: [f32; 8] = [0., 0., 1., 1., 2., 2., 3., 3.];
    let iota2 = f32x8::from_slice(simd, IOTA2.as_ref());
    let x = iota2.mul_add(dx, f32x8::splat(simd, x0));
    let da = f32x8::splat(simd, ctx.da[i]);
    let mut a = da.mul_add(x, f32x8::splat(simd, ctx.a0[i]));
    let a_inc = 4.0 * dx * da;
    let uscale = f32x8::splat(simd, ctx.uscale[i]);
    let u0 = f32x8::splat(simd, ctx.u0[i]);

    // See Horner comment above
    let coeff_a = p1.mul_add(-2.0, p0) + p2;
    let coeff_b = (p1 - p0) * 2.0;
    let coeff_c = p0;

    let out: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.flattened_cubics[start_idx..]);

    for j in 0..n.div_ceil(4) {
        let u = approx_parabola_inv_integral_simd(a);
        let t = (u - u0) * uscale;
        let p = coeff_a.mul_add(t, coeff_b).mul_add(t, coeff_c);
        p.store_slice(&mut out[j * 8..][..8]);
        a += a_inc;
    }
}

#[inline(always)]
fn flatten_cubic_simd<S: Simd>(simd: S, c: CubicBez, ctx: &mut FlattenCtx) -> usize {
    let n_quads = estimate_num_quads(c, TOL as f32);
    eval_cubics_simd(simd, &c, n_quads, ctx);
    let tol = (TOL as f32) * (1.0 - TO_QUAD_TOL);
    let sqrt_tol = tol.sqrt();
    estimate_subdiv_simd(simd, sqrt_tol, ctx);
    let sum: f32 = ctx.val[..n_quads].iter().sum();
    let n = ((0.5 * sum / sqrt_tol).ceil() as usize).max(1);
    let target_len = n + 4;
    if target_len > ctx.flattened_cubics.len() {
        ctx.flattened_cubics.resize(target_len, Point32::default());
    }

    let step = sum / (n as f32);
    let step_recip = 1.0 / step;
    let mut val_sum = 0.0;
    let mut last_n = 0;
    let mut x0base = 0.0;

    for i in 0..n_quads {
        let val = ctx.val[i];
        val_sum += val;
        let this_n = val_sum * step_recip;
        let this_n_next = 1.0 + this_n.floor();
        let dn = this_n_next as usize - last_n;
        if dn > 0 {
            let dx = step / val;
            let x0 = x0base * dx;
            output_lines_simd(simd, ctx, i, x0, dx, dn, last_n);
        }
        x0base = this_n_next - this_n;
        last_n = this_n_next as usize;
    }

    ctx.flattened_cubics[n] = ctx.even_pts[n_quads];

    n + 1
}

#[inline(always)]
fn estimate_num_quads(c: CubicBez, accuracy: f32) -> usize {
    let q_accuracy = (accuracy * TO_QUAD_TOL) as f64;
    let max_hypot2 = 432.0 * q_accuracy * q_accuracy;
    let p1x2 = c.p1.to_vec2() * 3.0 - c.p0.to_vec2();
    let p2x2 = c.p2.to_vec2() * 3.0 - c.p3.to_vec2();
    let err = (p2x2 - p1x2).hypot2();
    let err_div = err / max_hypot2;

    estimate(err_div)
}

const TO_QUAD_TOL: f32 = 0.1;

#[inline(always)]
fn estimate(err_div: f64) -> usize {
    // The original version of this method was:
    // let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
    // n_quads.min(MAX_QUADS)
    //
    // Note how we always round up and clamp to the range [1, max_quads]. Since we don't
    // care about the actual fractional value resulting from the powf call we can simply
    // compute this using a precomputed lookup table evaluating 1^6, 2^6, 3^6, etc. and simply
    // comparing if the value is less than or equal to each threshold.

    const LUT: [f64; MAX_QUADS] = [
        1.0, 64.0, 729.0, 4096.0, 15625.0, 46656.0, 117649.0, 262144.0, 531441.0, 1000000.0,
        1771561.0, 2985984.0, 4826809.0, 7529536.0, 11390625.0, 16777216.0,
    ];

    #[expect(clippy::needless_range_loop, reason = "better clarity")]
    for i in 0..MAX_QUADS {
        if err_div <= LUT[i] {
            return i + 1;
        }
    }

    MAX_QUADS
}

#[cfg(test)]
mod tests {
    use crate::flatten_simd::{MAX_QUADS, estimate};

    fn old_estimate(err_div: f64) -> usize {
        let n_quads = (err_div.powf(1. / 6.0).ceil() as usize).max(1);
        n_quads.min(MAX_QUADS)
    }

    // Test is disabled by default since it takes 10-20 seconds to run, even in release mode.
    #[test]
    #[ignore]
    fn accuracy() {
        for i in 0..u32::MAX {
            let num = f32::from_bits(i);

            if num.is_finite() {
                assert_eq!(old_estimate(num as f64), estimate(num as f64), "{num}");
            }
        }
    }
}