facett-core 0.1.7

//! **The CPU render lane** (L0 fallback).
//!
//! - [`scissor`] — the rect scissor geometry (moved from map3d in Phase A).
//! - [`sdf`] — the CPU SDF coverage + thick-AA-line raster math (the source of
//!   truth the GPU `sdf.wgsl`/`line.wgsl` mirror).
//! - [`CpuCanvas`] — collects [`QuadInstance`]/[`LineInstance`] batches and
//!   rasterizes them onto a **`vello_cpu` [`Pixmap`]** via the [`sdf`] coverage
//!   math, producing a straight-RGBA8 frame. It implements the L0
//!   [`Canvas`](super::Canvas) seam; [`CpuRenderer`] implements
//!   [`Renderer`](super::Renderer).

pub mod scissor;
pub mod sdf;

pub use scissor::{clip_poly_to_rect, ink_outside_rect};

use vello_cpu::color::PremulRgba8;
use vello_cpu::Pixmap;

use super::camera::Camera;
use super::prim::{LineInstance, QuadInstance};
use super::{Backend, Canvas, Frame, Renderer};

/// A CPU canvas: an off-screen [`Pixmap`] (the vello_cpu raster target, per the
/// CONS-CORE spec) plus the batched SDF instances drawn onto it. The host pushes
/// quads + lines, then [`CpuCanvas::rasterize`] evaluates the [`sdf`] coverage for
/// every instance over its bounding box and alpha-composites it into the pixmap.
///
/// Coverage math is **byte-for-byte** the same as the GPU lane runs, so a CPU frame
/// matches a GPU frame (the `sdf_primitives` parity test pins this).
pub struct CpuCanvas {
    /// The premultiplied raster target every batch is composited into.
    pixmap: Pixmap,
    /// Canvas width in pixels; also the length of one scanline handed to the raster.
    width: u32,
    /// Canvas height in pixels; the number of scanlines rastered.
    height: u32,
    /// The camera this canvas was opened under (returned by [`Canvas::camera`]).
    camera: Camera,
    /// Premultiplied background; painted per-row inside the parallel raster.
    background: PremulRgba8,
    /// Pending quad batch, in push order; drained by the raster methods.
    quads: Vec<QuadInstance>,
    /// Pending line batch, in push order; drained by the raster methods.
    lines: Vec<LineInstance>,
}

impl CpuCanvas {
    /// A fresh `width × height` canvas cleared to `background` (straight RGBA8),
    /// under `camera`.
    ///
    /// **Size limit:** the backing [`Pixmap`] is addressed with `u16` dimensions, so
    /// `width` and `height` are each clamped to `u16::MAX` (65535). The clamped
    /// values are what the canvas stores and what the resulting [`Frame`] reports.
    /// Without the clamp, an oversized dimension would silently wrap in the `as u16`
    /// cast and leave the pixmap smaller than the `width × height` the raster and
    /// un-premultiply passes index into.
    ///
    /// **No serial clear here:** the background is not painted up-front in a
    /// single-threaded `for` over the whole frame (that was a pure Amdahl serial
    /// tail). Instead `raster_batches` fills each row's background **inside** the
    /// gatling scanline kernel — the worker that owns a row clears it before
    /// compositing. Only the premultiplied background colour is recorded here.
    pub fn new(width: u32, height: u32, camera: Camera, background: [u8; 4]) -> Self {
        // Clamp first so the stored dimensions always agree with the pixmap's real
        // size; the `as u16` casts below are then lossless.
        let max_dim = u32::from(u16::MAX);
        let width = width.min(max_dim);
        let height = height.min(max_dim);
        let pixmap = Pixmap::new(width as u16, height as u16);
        Self {
            pixmap,
            width,
            height,
            camera,
            background: premul(background),
            quads: Vec::new(),
            lines: Vec::new(),
        }
    }

    /// The raster target — the `vello_cpu` pixmap (spec §2: "CpuCanvas → vello_cpu
    /// pixmap"). Exposed so a host can hand its glyph/curve overlay (L1 vello) the
    /// same target later.
    ///
    /// Returns a shared borrow; the pixels are premultiplied and only reflect the
    /// pushed batches after one of the raster methods has run.
    pub fn pixmap(&self) -> &Pixmap {
        &self.pixmap
    }

    /// Rasterize every batched instance onto the pixmap (lines under quads, the
    /// graph convention: chips draw on top of edges). Returns the straight
    /// (un-premultiplied) RGBA8 [`Frame`].
    ///
    /// The pending batches are drained by this call, so calling it again without
    /// pushing anything new re-clears the pixmap and yields a background-only frame.
    ///
    /// **GATLING multicore (LAW 2):** the raster fans across all cores by **scanline
    /// row** — each row is owned by exactly one thread, so the blends never contend,
    /// and the per-pixel compositing order (all lines in order, then all quads in
    /// order) is preserved bit-for-bit vs the sequential path (the
    /// `parallel_raster_matches_sequential` test pins this). Below a pixel-work
    /// threshold the frame stays single-threaded (zero pool overhead on small draws).
    pub fn rasterize(&mut self) -> Frame {
        // `0` workers selects the production "one worker per core" path.
        self.rasterize_with_workers(0)
    }

    /// [`rasterize`](Self::rasterize) with a caller-chosen gatling worker count.
    ///
    /// `workers == 0` means one worker per core (the production path); `1` forces
    /// the single-threaded path. A bench can therefore time the **same** parallel
    /// region at 1 vs N workers and report a measured speedup. The produced frame
    /// does not depend on the worker count (the `parallel_raster_matches_sequential`
    /// test pins bit-identity at N vs 1).
    ///
    /// Two phases, both run at `workers`: the compositing raster (which drains the
    /// pending batches), then the pixmap → straight-RGBA8 conversion.
    pub fn rasterize_with_workers(&mut self, workers: usize) -> Frame {
        // Phase 1: drain the batches and composite them onto the pixmap.
        self.raster_only(workers);
        // Phase 2: un-premultiply the pixmap into the returned frame.
        self.frame_with_workers(workers)
    }

    /// Run **only** the compositing raster (per-row background clear + y-bucketed
    /// SDF blend) at `workers` gatling workers, skipping the un-premultiply that
    /// [`frame`](Self::frame) performs.
    ///
    /// Exposed for the scaling bench, so the CPU-bound compositing region can be
    /// timed separately from the pixmap→RGBA8 conversion. The pending line and quad
    /// batches are drained (left empty) by this call.
    pub fn raster_only(&mut self, workers: usize) {
        // Move the batches out so `raster_batches` can borrow `self` mutably while
        // reading them; lines are taken first, then quads.
        let (pending_lines, pending_quads) =
            (std::mem::take(&mut self.lines), std::mem::take(&mut self.quads));
        self.raster_batches(&pending_lines, &pending_quads, workers);
    }

    /// Pixel count at or above which the raster fans out across cores. The full
    /// frame area (`width * height`) serves as a cheap upper bound on the work, so
    /// tiny frames (tooltips, sparklines) never pay for the gatling fan-out.
    const PARALLEL_PX_THRESHOLD: usize = 64 * 1024; // 256×256

    /// `true` when a `w × h` frame is large enough to be worth going multicore,
    /// i.e. its pixel count reaches [`Self::PARALLEL_PX_THRESHOLD`].
    #[inline]
    fn parallel_frame(w: u32, h: u32) -> bool {
        let pixels = w as usize * h as usize;
        pixels >= Self::PARALLEL_PX_THRESHOLD
    }

    /// Composite `lines` (under) and then `quads` (over) onto the pixmap, one
    /// scanline at a time, through znippy's fork-join GATLING scanline kernel (rayon
    /// is forbidden). The kernel hands each invocation a row index plus that row's
    /// pixel slice; everything done per row lives in the closure below.
    ///
    /// Per row, inside the kernel:
    /// - **Background clear.** The row is filled with the premultiplied background
    ///   before anything is composited, so there is no separate up-front clear pass.
    /// - **Y-bucket reject.** A [`YBuckets`] index, built once per call, lists only
    ///   the primitives whose vertical span covers the row, so `raster_row` visits
    ///   the primitives touching that row rather than every primitive in the batch.
    ///
    /// `workers == 0` asks the kernel for one worker per core. Frames below
    /// [`Self::PARALLEL_PX_THRESHOLD`] pass `usize::MAX` as the minimum-rows
    /// argument so the kernel keeps them single-threaded. A zero-area canvas is a
    /// no-op.
    fn raster_batches(&mut self, lines: &[LineInstance], quads: &[QuadInstance], workers: usize) {
        let (width, height) = (self.width, self.height);
        if width == 0 || height == 0 {
            return;
        }
        let clear = self.background;
        // Built even when both batches are empty: the rows must still be cleared to
        // the background (begin→present with no draws).
        let index = YBuckets::build(lines, quads, height);
        // Only fan out once the frame is big enough that thread hand-off pays off.
        let stay_serial = !Self::parallel_frame(width, height);
        let min_rows = if stay_serial { usize::MAX } else { 1 };
        znippy_zoomies::gatling_forkjoin::gatling_scanlines(
            self.pixmap.data_mut(),
            height as usize,
            width as usize,
            workers, // 0 ⇒ one worker per core
            min_rows,
            |y, row| {
                // Clear first, then composite only the primitives bucketed for this
                // row, in their original push order.
                row.fill(clear);
                let (row_lines, row_quads) = index.row(y);
                raster_row(row, y as u32, width, lines, quads, row_lines, row_quads);
            },
        );
    }

    /// Snapshot the current pixmap as a straight-RGBA8 [`Frame`] (un-premultiplied)
    /// without consuming the canvas.
    ///
    /// This only converts whatever the pixmap currently holds; it does not raster
    /// pending batches (use [`rasterize`](Self::rasterize) for that).
    ///
    /// **Parallel un-premultiply:** the per-pixel un-premultiply is embarrassingly
    /// parallel, so it runs through the same gatling scanline kernel into a
    /// pre-allocated output buffer (zero per-pixel alloc) instead of the old
    /// single-threaded `flat_map().collect()` over the whole framebuffer — that
    /// `collect` was the second half of the Amdahl serial tail.
    pub fn frame(&self) -> Frame {
        // `0` workers selects the production "one worker per core" path.
        self.frame_with_workers(0)
    }

    /// [`frame`](Self::frame) with a caller-chosen worker count (same meaning as in
    /// [`rasterize_with_workers`](Self::rasterize_with_workers): `0` ⇒ one per core).
    ///
    /// Converts the premultiplied pixmap into a freshly allocated straight-RGBA8
    /// buffer of `width * height * 4` bytes, one scanline per kernel invocation. A
    /// zero-area canvas returns an empty buffer without touching the kernel.
    pub fn frame_with_workers(&self, workers: usize) -> Frame {
        let (width, height) = (self.width, self.height);
        if width == 0 || height == 0 {
            return Frame { width, height, rgba: Vec::new() };
        }
        let cols = width as usize;
        let rows = height as usize;
        let byte_len = cols * rows * 4;
        // Reserve the output **without zeroing** it: every byte is assigned by the
        // scanline pass below, so a zero-fill would only be written and then
        // overwritten.
        let mut rgba: Vec<u8> = Vec::with_capacity(byte_len);
        let premul_px = self.pixmap.data();
        let stay_serial = !Self::parallel_frame(width, height);
        let min_rows = if stay_serial { usize::MAX } else { 1 };
        {
            let spare = rgba.spare_capacity_mut();
            // SAFETY: `spare` covers at least `byte_len` bytes of the vector's own
            // allocation (`with_capacity(byte_len)` on an empty vector), and
            // `MaybeUninit<u8>` has the layout of `u8`. The closure below only
            // assigns bytes, never reads them. This relies on the kernel invoking
            // the closure for every one of the `rows` rows — a contract of
            // `gatling_scanlines` that is not visible from this file.
            let out = unsafe {
                std::slice::from_raw_parts_mut(spare.as_mut_ptr().cast::<u8>(), byte_len)
            };
            // Output stride is `cols * 4` bytes; row `y` reads source pixels
            // `y*cols .. (y+1)*cols` and writes four bytes per pixel.
            znippy_zoomies::gatling_forkjoin::gatling_scanlines(
                out,
                rows,
                cols * 4,
                workers,
                min_rows,
                |y, out_row| {
                    let start = y * cols;
                    let src_row = &premul_px[start..start + cols];
                    out_row
                        .chunks_exact_mut(4)
                        .zip(src_row)
                        .for_each(|(dst, px)| unpremul_into(px, dst));
                },
            );
        }
        // SAFETY: all `byte_len` bytes were written by the scanline pass above, and
        // `byte_len` does not exceed the reserved capacity.
        unsafe { rgba.set_len(byte_len) };
        Frame { width, height, rgba }
    }
}

/// Convert one premultiplied pixel `p` into straight `[r, g, b, a]` bytes written
/// to the first four elements of `o`.
///
/// A fully transparent pixel becomes `[0, 0, 0, 0]`. Otherwise each colour channel
/// is `c * 255 / a`, rounded to nearest and capped at 255; alpha is copied through.
///
/// Panics if `o` holds fewer than four bytes.
#[inline]
fn unpremul_into(p: &PremulRgba8, o: &mut [u8]) {
    let alpha = p.a as u32;
    if alpha == 0 {
        o[..4].fill(0);
        return;
    }
    // Adding `alpha / 2` before the divide rounds to nearest; the cap guards
    // against channels that exceed alpha in a malformed premultiplied pixel.
    let straighten = |channel: u8| ((channel as u32 * 255 + alpha / 2) / alpha).min(255) as u8;
    o[..4].copy_from_slice(&[straighten(p.r), straighten(p.g), straighten(p.b), p.a]);
}

/// **Per-row primitive index** — built once per frame. For each scanline `y` it
/// holds the slice of line / quad indices whose vertical span covers that row, so
/// `raster_row` iterates only the primitives that can actually touch the row
/// instead of rejecting all `n` of them per row (the O(rows×n) tail).
///
/// Layout is a CSR-style flat index: `line_idx`/`quad_idx` are the concatenated
/// per-row index lists, and `line_off`/`quad_off` are the `h+1` row offsets into
/// them. Built in two passes (count rows, then fill). All allocation happens in
/// `build` — the four vectors below plus two temporary cursor copies of the offset
/// tables — so looking up a row never allocates.
struct YBuckets {
    /// Concatenated per-row lists of indices into the `lines` batch.
    line_idx: Vec<u32>,
    /// `h + 1` offsets: row `y` owns `line_idx[line_off[y]..line_off[y + 1]]`.
    line_off: Vec<u32>,
    /// Concatenated per-row lists of indices into the `quads` batch.
    quad_idx: Vec<u32>,
    /// `h + 1` offsets: row `y` owns `quad_idx[quad_off[y]..quad_off[y + 1]]`.
    quad_off: Vec<u32>,
}

impl YBuckets {
    /// The half-open row range `[y0, y1)` whose pixel centres (`y + 0.5`) fall
    /// inside a primitive's vertical pixel range `[mny, mxy]`, clamped to `[0, h)`.
    ///
    /// `y + 0.5 ∈ [mny, mxy]` is equivalent to `y ∈ [mny - 0.5, mxy - 0.5]`, so the
    /// first row is the ceiling of the lower bound and the exclusive end is one past
    /// the floor of the upper bound. A primitive entirely off-screen yields an empty
    /// range (`y0 == y1`), which callers' `y0..y1` loops skip with no special case.
    /// NaN bounds collapse to row 0 because `f32::max` returns the non-NaN operand.
    #[inline]
    fn span(mny: f32, mxy: f32, h: u32) -> (u32, u32) {
        // Negative / NaN → 0 via `max(0.0)`; oversized values saturate in the cast
        // and are then capped at `h`.
        let clamp_row = |row: f32| (row.max(0.0) as u32).min(h);
        let first = clamp_row((mny - 0.5).ceil());
        let end = clamp_row((mxy - 0.5).floor() + 1.0); // exclusive
        // Never report an inverted range.
        (first, end.max(first))
    }

    /// Build the per-row index for a frame of `h` rows from the two batches.
    ///
    /// Counting-sort shape: pass 1 counts how many primitives cover each row, a
    /// prefix sum turns the counts into row offsets, and pass 2 scatters each
    /// primitive's index into every row it covers. Primitives are visited in batch
    /// order in pass 2, so each row's list stays in push order.
    fn build(lines: &[LineInstance], quads: &[QuadInstance], h: u32) -> Self {
        let rows = h as usize;
        // Rows covered by one primitive, as a `usize` range. `span` guarantees
        // `y0 <= y1 <= h`, so `y + 1` below always stays inside the `rows + 1` table.
        let line_rows = |line: &LineInstance| {
            let (_, top, _, bottom) = line.bounds();
            let (y0, y1) = Self::span(top, bottom, h);
            y0 as usize..y1 as usize
        };
        let quad_rows = |quad: &QuadInstance| {
            let mid = quad.center[1];
            let half = quad.half_extent();
            let (y0, y1) = Self::span(mid - half, mid + half, h);
            y0 as usize..y1 as usize
        };

        // Pass 1: per-row counts, stored one slot to the right so the prefix sum
        // below lands directly on "start of row y".
        let mut line_off = vec![0u32; rows + 1];
        let mut quad_off = vec![0u32; rows + 1];
        for line in lines {
            for y in line_rows(line) {
                line_off[y + 1] += 1;
            }
        }
        for quad in quads {
            for y in quad_rows(quad) {
                quad_off[y + 1] += 1;
            }
        }
        // Running totals: counts become offsets, and the last entry is the total.
        for y in 1..=rows {
            line_off[y] += line_off[y - 1];
            quad_off[y] += quad_off[y - 1];
        }

        let mut line_idx = vec![0u32; line_off[rows] as usize];
        let mut quad_idx = vec![0u32; quad_off[rows] as usize];
        // Pass 2: scatter. `next[y]` is the next free slot of row `y`.
        let mut next = line_off.clone();
        for (i, line) in lines.iter().enumerate() {
            for y in line_rows(line) {
                let at = next[y] as usize;
                line_idx[at] = i as u32;
                next[y] += 1;
            }
        }
        let mut next = quad_off.clone();
        for (i, quad) in quads.iter().enumerate() {
            for y in quad_rows(quad) {
                let at = next[y] as usize;
                quad_idx[at] = i as u32;
                next[y] += 1;
            }
        }
        Self { line_idx, line_off, quad_idx, quad_off }
    }

    /// The `(line_indices, quad_indices)` bucketed for row `y`, each **in original
    /// push order** (the scatter in `build` preserves it), so compositing order is
    /// unchanged by the index.
    ///
    /// Panics if `y` is not below the row count the index was built for.
    #[inline]
    fn row(&self, y: usize) -> (&[u32], &[u32]) {
        // Row `y` owns the half-open slice between consecutive offsets.
        let line_range = self.line_off[y] as usize..self.line_off[y + 1] as usize;
        let quad_range = self.quad_off[y] as usize..self.quad_off[y + 1] as usize;
        (&self.line_idx[line_range], &self.quad_idx[quad_range])
    }
}

/// Composite the selected lines (under) and then the selected quads (over) onto a
/// **single scanline**: `row` is the `w` pixels of image row `y`.
///
/// `line_idx` / `quad_idx` pick which entries of `lines` / `quads` are visited and
/// in what order; no vertical reject is done here. Each primitive is sampled at
/// pixel centres (`x + 0.5`, `y + 0.5`) across its horizontal extent clamped to
/// `[0, w)`, and only samples with positive coverage are blended. This is the
/// parallel unit: one caller owns one row, so blends never contend.
fn raster_row(
    row: &mut [PremulRgba8],
    y: u32,
    w: u32,
    lines: &[LineInstance],
    quads: &[QuadInstance],
    line_idx: &[u32],
    quad_idx: &[u32],
) {
    let py = y as f32 + 0.5;
    let right_edge = w as f32;

    // Lines first (edges under chips).
    for line in line_idx.iter().map(|&i| &lines[i as usize]) {
        let (min_x, _, max_x, _) = line.bounds();
        // Float→int casts saturate, so off-screen bounds give an empty range.
        let first = min_x.floor().max(0.0) as u32;
        let end = max_x.ceil().min(right_edge) as u32;
        for x in first..end {
            let cov = sdf::line_coverage(line, [x as f32 + 0.5, py]);
            if cov > 0.0 {
                blend_px(&mut row[x as usize], line.color, cov);
            }
        }
    }

    // Quads over the lines.
    for quad in quad_idx.iter().map(|&i| &quads[i as usize]) {
        let half = quad.half_extent();
        let (cx, cy) = (quad.center[0], quad.center[1]);
        let first = (cx - half).floor().max(0.0) as u32;
        let end = (cx + half).ceil().min(right_edge) as u32;
        // The vertical offset from the quad centre is constant along the row.
        let dy = py - cy;
        for x in first..end {
            let dx = x as f32 + 0.5 - cx;
            let cov = sdf::quad_coverage(quad, dx, dy);
            if cov > 0.0 {
                blend_px(&mut row[x as usize], quad.color, cov);
            }
        }
    }
}

/// Source-over composite onto one premultiplied destination pixel.
///
/// `color` is straight `[r, g, b, a]` with components in `[0, 1]`; its alpha is
/// scaled by `coverage` and clamped to `[0, 1]` to give the effective source alpha.
/// Each output channel is `src_premul + dst * (1 - src_alpha)`, rounded and
/// clamped back to a byte.
#[inline]
fn blend_px(dst: &mut PremulRgba8, color: [f32; 4], coverage: f32) {
    let [red, green, blue, alpha] = color;
    let src_a = (alpha * coverage).clamp(0.0, 1.0);
    let keep = 1.0 - src_a;
    // One channel of the over operator: `src` is already premultiplied, `under`
    // is the destination byte.
    let over = |src: f32, under: u8| -> u8 {
        let under = under as f32 / 255.0;
        ((src + under * keep) * 255.0).round().clamp(0.0, 255.0) as u8
    };
    *dst = PremulRgba8 {
        r: over(red * src_a, dst.r),
        g: over(green * src_a, dst.g),
        b: over(blue * src_a, dst.b),
        a: over(src_a, dst.a),
    };
}

/// Premultiply a straight `[r, g, b, a]` byte colour into a [`PremulRgba8`]:
/// each colour channel becomes `channel * a / 255` rounded to nearest, and alpha
/// is carried over unchanged.
fn premul(c: [u8; 4]) -> PremulRgba8 {
    let [r, g, b, a] = c;
    let alpha = a as u32;
    // `+ 127` rounds the integer divide by 255 to nearest.
    let scale = |channel: u8| ((channel as u32 * alpha + 127) / 255) as u8;
    PremulRgba8 { r: scale(r), g: scale(g), b: scale(b), a }
}

// The L0 `Canvas` seam for the CPU lane. Pushing only appends to the pending
// batches; no pixels change until one of the raster methods drains them.
impl Canvas for CpuCanvas {
    /// Append `quads` to the pending quad batch, keeping push order.
    fn push_quads(&mut self, quads: &[QuadInstance]) {
        self.quads.extend_from_slice(quads);
    }
    /// Append `lines` to the pending line batch, keeping push order.
    fn push_lines(&mut self, lines: &[LineInstance]) {
        self.lines.extend_from_slice(lines);
    }
    /// The camera this canvas was created with.
    fn camera(&self) -> &Camera {
        &self.camera
    }
}

/// The CPU [`Renderer`] — `begin` opens a [`CpuCanvas`] sized to the rect, `present`
/// rasterizes it to a [`Frame`]. Headless / device / CI; always available (no GPU).
pub struct CpuRenderer {
    /// Straight RGBA8 clear colour handed to every canvas opened by `begin`.
    background: [u8; 4],
    /// The canvas of the frame in flight: set by `begin`, taken by `present`.
    canvas: Option<CpuCanvas>,
}

impl CpuRenderer {
    /// A renderer that clears every frame to `background` (straight RGBA8). No
    /// canvas exists until `begin` is called.
    pub fn new(background: [u8; 4]) -> Self {
        Self { background, canvas: None }
    }
}

impl Default for CpuRenderer {
    /// A renderer with the opaque dark background `[12, 14, 20, 255]`.
    fn default() -> Self {
        Self::new([12, 14, 20, 255])
    }
}

impl Renderer for CpuRenderer {
    /// Open a fresh canvas of `width × height` under `camera`, replacing (and
    /// dropping) any canvas left over from a frame that was never presented.
    fn begin(&mut self, width: u32, height: u32, camera: Camera) -> &mut dyn Canvas {
        let fresh = CpuCanvas::new(width, height, camera, self.background);
        // `insert` stores the canvas and hands back a borrow of it in one step.
        self.canvas.insert(fresh)
    }

    /// Rasterize and consume the canvas opened by `begin`. Without a canvas in
    /// flight this returns an empty `0 × 0` frame.
    fn present(&mut self) -> Frame {
        match self.canvas.take() {
            Some(mut canvas) => canvas.rasterize(),
            None => Frame { width: 0, height: 0, rgba: Vec::new() },
        }
    }

    /// This lane's backend tag.
    fn backend(&self) -> Backend {
        Backend::CpuVello
    }
}

#[cfg(test)]
impl CpuCanvas {
    /// Test-only reference raster: a plain single-threaded loop over the rows that
    /// performs the same per-row work as the production path (background clear,
    /// then the y-bucketed composite) without going through the gatling kernel.
    /// The `parallel_raster_matches_sequential` test goldens the parallel path
    /// against this.
    fn rasterize_sequential(&mut self) -> Frame {
        let (lines, quads) = (std::mem::take(&mut self.lines), std::mem::take(&mut self.quads));
        let (w, h) = (self.width, self.height);
        let bg = self.background;
        let buckets = YBuckets::build(&lines, &quads, h);
        let data = self.pixmap.data_mut();
        for y in 0..h {
            // Carve out this row's `w` pixels from the row-major pixmap.
            let start = (y * w) as usize;
            let row = &mut data[start..][..w as usize];
            row.fill(bg);
            let (row_lines, row_quads) = buckets.row(y as usize);
            raster_row(row, y, w, &lines, &quads, row_lines, row_quads);
        }
        self.frame()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::render::prim::{shape, CircleInstance, LineInstance, MarkerInstance, RingInstance};

    /// INJECT-ASSERT (GATLING): the no-barrier row-parallel raster produces a
    /// **bit-identical** frame to the single-threaded reference, on a frame big
    /// enough to actually fan across cores, with overlapping instances (so blend
    /// order matters) — proving the scanline split preserves compositing order.
    #[test]
    fn parallel_raster_matches_sequential() {
        // 512×512 ⇒ above PARALLEL_PX_THRESHOLD ⇒ the gatling path runs.
        let (w, h) = (512u32, 512u32);
        // Builds an identically loaded canvas each time it is called, so the two
        // raster paths below start from the same batches.
        let mk = || {
            let mut c = CpuCanvas::new(w, h, Camera::default(), [9, 11, 16, 255]);
            // Many overlapping primitives so per-pixel blend order is load-bearing.
            let mut quads = Vec::new();
            let mut lines = Vec::new();
            for i in 0..400u32 {
                // Deterministic pseudo-scatter of positions and colours from `i`.
                let x = (i * 37 % 500 + 6) as f32;
                let y = (i * 53 % 500 + 6) as f32;
                let col = [(i % 7) as f32 / 7.0, (i % 5) as f32 / 5.0, (i % 3) as f32 / 3.0, 0.7];
                // Cycle through the three quad-backed shapes.
                if i % 3 == 0 {
                    quads.push(CircleInstance { center: [x, y], radius: 14.0, color: col, aa: 1.5 }.lower());
                } else if i % 3 == 1 {
                    quads.push(RingInstance { center: [x, y], radius: 16.0, inner: 8.0, color: col, aa: 1.5 }.lower());
                } else {
                    quads.push(MarkerInstance { center: [x, y], radius: 12.0, corner: 2.0, color: col, aa: 1.0, shape: shape::DIAMOND }.lower());
                }
                lines.push(LineInstance::round([x, y], [x + 40.0, y + 25.0], 3.0, 1.5, [col[0], col[1], col[2], 0.6]));
            }
            c.push_lines(&lines);
            c.push_quads(&quads);
            c
        };

        let parallel = mk().rasterize();
        let sequential = mk().rasterize_sequential();
        assert_eq!(parallel.width, sequential.width);
        assert_eq!(parallel.rgba.len(), sequential.rgba.len());
        assert_eq!(parallel.rgba, sequential.rgba, "GATLING parallel raster is bit-identical to sequential");
        // And it actually drew a substantial frame (not a degenerate match-on-blank).
        assert!(parallel.lit_px() > 50_000, "real content rastered, got {}", parallel.lit_px());
    }

    /// INJECT-ASSERT (y-bucket index): the y-bucketed raster (each row visits only
    /// the primitives whose span covers it) is **byte-identical** to a brute-force
    /// raster that tests every primitive against every row — including primitives
    /// that straddle the top (y<0) and bottom (y>h) frame edges, so the span clamp
    /// is exercised. This proves the O(rows×n)→O(touches) reject changed nothing
    /// the pixels see.
    #[test]
    fn ybucket_raster_matches_brute_force_all_primitives() {
        // 300×200 is below PARALLEL_PX_THRESHOLD, so the production path here runs
        // with the single-threaded minimum-rows setting.
        let (w, h) = (300u32, 200u32);
        let mut quads = Vec::new();
        let mut lines = Vec::new();
        for i in 0..120u32 {
            let x = (i * 41 % 290 + 4) as f32;
            // Deliberately push some centres above the top and below the bottom so
            // their spans clamp at 0 / h.
            let y = (i as f32 * 7.3) - 30.0;
            let col = [(i % 7) as f32 / 7.0, (i % 4) as f32 / 4.0, (i % 3) as f32 / 3.0, 0.65];
            quads.push(CircleInstance { center: [x, y], radius: 12.0, color: col, aa: 1.5 }.lower());
            lines.push(LineInstance::round([x, y], [x + 30.0, y + 50.0], 3.0, 1.5, [col[0], col[1], col[2], 0.5]));
        }

        // Bucketed path (production).
        let mut c = CpuCanvas::new(w, h, Camera::default(), [10, 12, 18, 255]);
        c.push_lines(&lines);
        c.push_quads(&quads);
        let bucketed = c.rasterize();

        // Brute-force reference: clear + composite ALL primitives on EVERY row (no
        // bucket index), using a full per-row index list `[0,1,2,…]`.
        let bg = premul([10, 12, 18, 255]);
        let mut pm = Pixmap::new(w as u16, h as u16);
        let all_l: Vec<u32> = (0..lines.len() as u32).collect();
        let all_q: Vec<u32> = (0..quads.len() as u32).collect();
        let data = pm.data_mut();
        for y in 0..h {
            let row = &mut data[(y * w) as usize..((y + 1) * w) as usize];
            for px in row.iter_mut() {
                *px = bg;
            }
            raster_row(row, y, w, &lines, &quads, &all_l, &all_q);
        }
        // Un-premultiply the reference with the same per-pixel helper the
        // production frame conversion uses, so only the bucketing differs.
        let brute: Vec<u8> = data
            .iter()
            .flat_map(|p| {
                let mut o = [0u8; 4];
                unpremul_into(p, &mut o);
                o
            })
            .collect();

        assert_eq!(bucketed.rgba.len(), brute.len());
        assert_eq!(bucketed.rgba, brute, "y-bucketed raster == brute-force-all-primitives");
        assert!(bucketed.lit_px() > 1_000, "real content drawn, got {}", bucketed.lit_px());
    }

    /// A red disc in the middle of a black 64×64 canvas lights the centre pixel
    /// red and leaves the top-left corner at the background.
    #[test]
    fn cpu_canvas_lights_pixels_inside_a_circle() {
        let mut canvas = CpuCanvas::new(64, 64, Camera::default(), [0, 0, 0, 255]);
        let disc = CircleInstance { center: [32.0, 32.0], radius: 10.0, color: [1.0, 0.0, 0.0, 1.0], aa: 1.0 };
        canvas.push_quads(&[disc.lower()]);
        let frame = canvas.rasterize();
        assert_eq!(frame.rgba.len(), 64 * 64 * 4);
        // Byte offset of pixel (32, 32): red is at `+0`, green at `+1`.
        let centre = (32 * 64 + 32) * 4;
        assert!(frame.rgba[centre] > 200 && frame.rgba[centre + 1] < 50, "centre is red");
        // Pixel (0, 0) is far outside the disc, so its red stays near zero.
        assert!(frame.rgba[0] < 10, "corner stays background");
    }

    /// Drives the full `begin` → push → `present` cycle through the renderer seam
    /// and spot-checks a green ring: its band is lit, its hole is background.
    #[test]
    fn cpu_renderer_round_trips_through_the_seam() {
        let mut renderer = CpuRenderer::new([0, 0, 0, 255]);
        let ring = RingInstance { center: [24.0, 24.0], radius: 12.0, inner: 6.0, color: [0.0, 1.0, 0.0, 1.0], aa: 1.0 };
        renderer.begin(48, 48, Camera::default()).push_quads(&[ring.lower()]);
        assert_eq!(renderer.backend(), Backend::CpuVello);
        let frame = renderer.present();
        // Green channel of pixel (x, y) in the 48-wide frame.
        let green_at = |x: usize, y: usize| frame.rgba[(y * 48 + x) * 4 + 1];
        // The ring band (≈9px out from the centre) is green; the hole is not.
        assert!(green_at(24 + 9, 24) > 200, "ring band green");
        assert!(green_at(24, 24) < 50, "ring hole is background");
    }
}