Skip to main content

facett_core/render/cpu/
mod.rs

1//! **The CPU render lane** (L0 fallback).
2//!
3//! - [`scissor`] — the rect scissor geometry (moved from map3d in Phase A).
4//! - [`sdf`] — the CPU SDF coverage + thick-AA-line raster math (the source of
5//!   truth the GPU `sdf.wgsl`/`line.wgsl` mirror).
6//! - [`CpuCanvas`] — collects [`QuadInstance`]/[`LineInstance`] batches and
7//!   rasterizes them onto a **`vello_cpu` [`Pixmap`]** via the [`sdf`] coverage
8//!   math, producing a straight-RGBA8 frame. It implements the L0
9//!   [`Canvas`](super::Canvas) seam; [`CpuRenderer`] implements
10//!   [`Renderer`](super::Renderer).
11
12pub mod scissor;
13pub mod sdf;
14
15pub use scissor::{clip_poly_to_rect, ink_outside_rect};
16
17use vello_cpu::color::PremulRgba8;
18use vello_cpu::Pixmap;
19
20use super::camera::Camera;
21use super::prim::{LineInstance, QuadInstance};
22use super::{Backend, Canvas, Frame, Renderer};
23
/// A CPU canvas: an off-screen [`Pixmap`] (the vello_cpu raster target, per the
/// CONS-CORE spec) plus the batched SDF instances drawn onto it. The host pushes
/// quads + lines, then [`CpuCanvas::rasterize`] evaluates the [`sdf`] coverage for
/// every instance over its bounding box and alpha-composites it into the pixmap.
///
/// Coverage math is **byte-for-byte** the same the GPU lane runs, so a CPU frame
/// matches a GPU frame (the `sdf_primitives` parity test pins this).
pub struct CpuCanvas {
    /// Premultiplied raster target that every batch is composited into.
    pixmap: Pixmap,
    /// Canvas width in pixels — the row length used when slicing the pixmap.
    width: u32,
    /// Canvas height in pixels — the number of scanline rows.
    height: u32,
    /// Camera handed back to callers through [`Canvas::camera`](super::Canvas).
    camera: Camera,
    /// Premultiplied background; painted per-row inside the parallel raster.
    background: PremulRgba8,
    /// Quads pushed since the last raster; composited *over* the lines.
    quads: Vec<QuadInstance>,
    /// Lines pushed since the last raster; composited *under* the quads.
    lines: Vec<LineInstance>,
}
41
42impl CpuCanvas {
43    /// A fresh `width × height` canvas cleared to `background` (straight RGBA8),
44    /// under `camera`.
45    ///
46    /// **No serial clear here:** the background is no longer painted up-front in a
47    /// single-threaded `for` over 1M pixels (that was a pure Amdahl serial tail).
48    /// Instead [`raster_batches`] fills each row's background **inside** the gatling
49    /// scanline kernel — the workers that own a row clear it before compositing, so
50    /// the clear scales across all cores with zero extra alloc.
51    pub fn new(width: u32, height: u32, camera: Camera, background: [u8; 4]) -> Self {
52        let pixmap = Pixmap::new(width as u16, height as u16);
53        Self {
54            pixmap,
55            width,
56            height,
57            camera,
58            background: premul(background),
59            quads: Vec::new(),
60            lines: Vec::new(),
61        }
62    }
63
    /// The raster target — the `vello_cpu` pixmap (spec §2: "CpuCanvas → vello_cpu
    /// pixmap"). Exposed so a host can hand its glyph/curve overlay (L1 vello) the
    /// same target later.
    ///
    /// Returns a shared borrow; the pixels are premultiplied and reflect whatever
    /// the most recent raster call wrote.
    pub fn pixmap(&self) -> &Pixmap {
        &self.pixmap
    }
70
    /// Rasterize every batched instance onto the pixmap (lines under quads, the
    /// graph convention: chips draw on top of edges). Returns the straight
    /// (un-premultiplied) RGBA8 [`Frame`].
    ///
    /// This is [`rasterize_with_workers`](Self::rasterize_with_workers) with a
    /// worker count of `0`, i.e. the production "one worker per core" setting.
    ///
    /// **GATLING multicore (LAW 2):** the raster fans across all cores by **scanline
    /// row** — each row is owned by exactly one thread, so the blends never contend,
    /// and the per-pixel compositing order (all lines in order, then all quads in
    /// order) is preserved bit-for-bit vs the sequential path (the
    /// `parallel_raster_matches_sequential` test pins this). Below a pixel-work
    /// threshold the frame stays single-threaded (zero pool overhead on small draws).
    pub fn rasterize(&mut self) -> Frame {
        self.rasterize_with_workers(0)
    }
84
85    /// [`rasterize`](Self::rasterize) with an explicit gatling worker count: `0` ⇒
86    /// one per core (the production path), `1` ⇒ the forced single-threaded path.
87    /// Exposed so a bench can time the **same** parallel region at 1 vs N workers and
88    /// report the real cores-busy / speedup (rather than guessing from a sweep
89    /// average). The output is identical for any worker count (the
90    /// `parallel_raster_matches_sequential` test pins bit-identity at N vs 1).
91    pub fn rasterize_with_workers(&mut self, workers: usize) -> Frame {
92        let lines = std::mem::take(&mut self.lines);
93        let quads = std::mem::take(&mut self.quads);
94        self.raster_batches(&lines, &quads, workers);
95        self.frame_with_workers(workers)
96    }
97
98    /// Run **only** the compositing raster (bg clear + y-bucketed SDF blend) at
99    /// `workers` gatling workers — the region the GATLING scanline kernel governs,
100    /// without the memory-bandwidth-bound un-premultiply ([`frame`](Self::frame)).
101    /// Exposed for the scaling bench so cores-busy can be measured on the part that
102    /// is CPU-bound, separate from the RAM-bandwidth-capped pixmap→RGBA8 conversion.
103    pub fn raster_only(&mut self, workers: usize) {
104        let lines = std::mem::take(&mut self.lines);
105        let quads = std::mem::take(&mut self.quads);
106        self.raster_batches(&lines, &quads, workers);
107    }
108
109    /// The pixel-work threshold above which the raster goes multicore. `width *
110    /// height` (the full frame) is a cheap upper bound on the work; tiny frames
111    /// (tooltips, sparklines) skip the gatling fan-out entirely.
112    const PARALLEL_PX_THRESHOLD: usize = 64 * 1024; // 256×256
113
114    /// Should this `w × h` frame fan across cores? (cheap upper bound on work.)
115    #[inline]
116    fn parallel_frame(w: u32, h: u32) -> bool {
117        (w as usize * h as usize) >= Self::PARALLEL_PX_THRESHOLD
118    }
119
120    /// Raster `lines` (under) then `quads` (over) onto the pixmap, row-parallel via
121    /// znippy's fork-join GATLING (rayon is forbidden). Each worker self-dispatches
122    /// whole scanlines and writes into that row's **disjoint** pixel slice — no
123    /// barrier, no wait-for-a-core, zero-copy (row index hand-off), zero-alloc in
124    /// the hot loop. Below the pixel-work threshold it stays single-threaded.
125    ///
126    /// Two scaling lifts live entirely inside this parallel region (so they raise the
127    /// per-core ceiling, they don't add serial tail):
128    /// - **Per-row background clear.** The worker that owns row `y` paints the bg into
129    ///   that row before compositing — the 1M-pixel clear is now N-way parallel, not a
130    ///   serial `for` in `new`.
131    /// - **Y-bucket reject.** A [`YBuckets`] index, built once per frame, lists only
132    ///   the primitives whose vertical span covers each row. `raster_row` then visits
133    ///   `O(prims_touching_row)` instead of `O(all_prims)` for every one of `h` rows
134    ///   (was `rows × n` rejects per frame — 200k quads × 1024 rows = 205M wasted
135    ///   span-tests; now ~one test per real touch).
136    fn raster_batches(&mut self, lines: &[LineInstance], quads: &[QuadInstance], workers: usize) {
137        let w = self.width;
138        let h = self.height;
139        if w == 0 || h == 0 {
140            return;
141        }
142        let bg = self.background;
143        // The empty case still must clear to bg (begin→present with no draws).
144        let buckets = YBuckets::build(lines, quads, h);
145        // Only fan out once the frame is big enough that thread hand-off pays off.
146        let min_rows = if Self::parallel_frame(w, h) { 1 } else { usize::MAX };
147        let data = self.pixmap.data_mut();
148        znippy_zoomies::gatling_forkjoin::gatling_scanlines(
149            data,
150            h as usize,
151            w as usize,
152            workers, // 0 ⇒ one worker per core
153            min_rows,
154            |y, row| {
155                // Clear this row to background, then composite only the primitives
156                // whose span covers it (both inside the parallel region).
157                for px in row.iter_mut() {
158                    *px = bg;
159                }
160                let (li, qi) = buckets.row(y);
161                raster_row(row, y as u32, w, lines, quads, li, qi);
162            },
163        );
164    }
165
    /// Snapshot the current pixmap as a straight-RGBA8 [`Frame`] (un-premultiplied)
    /// without consuming the canvas.
    ///
    /// This is [`frame_with_workers`](Self::frame_with_workers) with a worker
    /// count of `0` (one worker per core).
    ///
    /// **Parallel un-premultiply:** the per-pixel un-premultiply is embarrassingly
    /// parallel, so it runs through the same gatling scanline kernel into a
    /// pre-allocated output buffer (zero per-pixel alloc) instead of the old
    /// single-threaded `flat_map().collect()` over the whole framebuffer — that
    /// `collect` was the second half of the Amdahl serial tail.
    pub fn frame(&self) -> Frame {
        self.frame_with_workers(0)
    }
177
    /// [`frame`](Self::frame) with an explicit worker count (see
    /// [`rasterize_with_workers`](Self::rasterize_with_workers)).
    ///
    /// Reads the premultiplied pixmap row by row and writes the straight
    /// (un-premultiplied) `[r, g, b, a]` bytes into a freshly allocated buffer of
    /// `width * height * 4` bytes. A zero-area canvas yields a [`Frame`] with an
    /// empty `rgba` vector.
    pub fn frame_with_workers(&self, workers: usize) -> Frame {
        let w = self.width as usize;
        let h = self.height as usize;
        // Total output size in bytes: four channels per pixel.
        let len = w * h * 4;
        if w == 0 || h == 0 {
            return Frame { width: self.width, height: self.height, rgba: Vec::new() };
        }
        // Allocate the output **without zeroing**: the scanline pass below is
        // expected to write every byte of every row (4 bytes per pixel, `w` pixels,
        // all `h` rows), so zero-filling the buffer first would only be
        // overwritten again.
        let mut rgba: Vec<u8> = Vec::with_capacity(len);
        let src = self.pixmap.data();
        // Small frames pass `usize::MAX` as the minimum-rows argument so they are
        // not fanned out across workers.
        let min_rows = if Self::parallel_frame(self.width, self.height) { 1 } else { usize::MAX };
        {
            // The vector is empty, so its spare capacity is at least `len`
            // uninitialized bytes.
            let spare = rgba.spare_capacity_mut();
            // SAFETY: `spare` covers at least `len` bytes of the allocation, and the
            // callback below only assigns into the slice (never reads it).
            // NOTE(review): this relies on `gatling_scanlines` invoking the callback
            // for all `h` rows with a `w * 4`-byte row slice — confirm against the
            // kernel's contract.
            let buf = unsafe {
                std::slice::from_raw_parts_mut(spare.as_mut_ptr() as *mut u8, len)
            };
            // Output stride is `w * 4` bytes per row; each row maps src[y*w..] → dst.
            znippy_zoomies::gatling_forkjoin::gatling_scanlines(
                buf,
                h,
                w * 4,
                workers,
                min_rows,
                |y, out_row| {
                    // Source row `y` holds `w` premultiplied pixels; each one
                    // becomes one 4-byte chunk of the output row.
                    let src_row = &src[y * w..(y + 1) * w];
                    for (p, o) in src_row.iter().zip(out_row.chunks_exact_mut(4)) {
                        unpremul_into(p, o);
                    }
                },
            );
        }
        // SAFETY: all `len` bytes were written by the kernel above (see the note on
        // the kernel contract), and `len` does not exceed the reserved capacity.
        unsafe { rgba.set_len(len) };
        Frame { width: self.width, height: self.height, rgba }
    }
224}
225
226/// Un-premultiply one [`PremulRgba8`] into a straight `[r,g,b,a]` output slice.
227#[inline]
228fn unpremul_into(p: &PremulRgba8, o: &mut [u8]) {
229    let a = p.a;
230    if a == 0 {
231        o[0] = 0;
232        o[1] = 0;
233        o[2] = 0;
234        o[3] = 0;
235    } else {
236        let un = |c: u8| ((c as u32 * 255 + (a as u32) / 2) / a as u32).min(255) as u8;
237        o[0] = un(p.r);
238        o[1] = un(p.g);
239        o[2] = un(p.b);
240        o[3] = a;
241    }
242}
243
/// **Per-row primitive index** — built once per frame. For each scanline `y` it
/// holds the slice of line / quad indices whose vertical span covers that row, so
/// `raster_row` iterates only the primitives that can actually touch the row
/// instead of rejecting all `n` of them per row (the O(rows×n) tail).
///
/// Layout is a CSR-style flat index: `line_idx`/`quad_idx` are the concatenated
/// per-row index lists, and `line_off`/`quad_off` are the `h+1` row offsets into
/// them. It is built in two passes (count rows, then fill), so every allocation
/// happens up front in `build` — none in the per-row hot loop.
struct YBuckets {
    /// Concatenated per-row lists of indices into the frame's line batch.
    line_idx: Vec<u32>,
    /// `h + 1` offsets: row `y`'s lines are `line_idx[line_off[y]..line_off[y + 1]]`.
    line_off: Vec<u32>,
    /// Concatenated per-row lists of indices into the frame's quad batch.
    quad_idx: Vec<u32>,
    /// `h + 1` offsets: row `y`'s quads are `quad_idx[quad_off[y]..quad_off[y + 1]]`.
    quad_off: Vec<u32>,
}
259
260impl YBuckets {
261    /// The `[y0, y1)` integer row span a primitive's pixel-`y` range `[mny, mxy]`
262    /// can cover, clamped to `[0, h)`. Mirrors the per-row test in `raster_row`
263    /// (`py = y + 0.5` must lie in `[mny, mxy]`), so a row is included iff it would
264    /// pass that test — keeping the composite bit-identical.
265    #[inline]
266    fn span(mny: f32, mxy: f32, h: u32) -> (u32, u32) {
267        // py = y+0.5 ∈ [mny, mxy] ⇒ y ∈ [mny-0.5, mxy-0.5]. Clamp to [0, h); an
268        // off-screen primitive yields an empty (y0 >= y1) range and is skipped by
269        // the `y0..y1` loop with no special-case branch. The `.max(0.0)` before the
270        // `as u32` cast also tames NaN/negative inputs (NaN → 0).
271        let y0 = ((mny - 0.5).ceil().max(0.0) as u32).min(h);
272        let y1 = (((mxy - 0.5).floor() + 1.0).max(0.0) as u32).min(h); // exclusive
273        (y0, y1.max(y0))
274    }
275
276    fn build(lines: &[LineInstance], quads: &[QuadInstance], h: u32) -> Self {
277        let hu = h as usize;
278        let mut line_off = vec![0u32; hu + 1];
279        let mut quad_off = vec![0u32; hu + 1];
280        // Pass 1: per-row counts (stored shifted by +1 for the prefix sum).
281        for l in lines {
282            let (_, mny, _, mxy) = l.bounds();
283            let (y0, y1) = Self::span(mny, mxy, h);
284            for y in y0..y1 {
285                line_off[y as usize + 1] += 1;
286            }
287        }
288        for q in quads {
289            let cy = q.center[1];
290            let he = q.half_extent();
291            let (y0, y1) = Self::span(cy - he, cy + he, h);
292            for y in y0..y1 {
293                quad_off[y as usize + 1] += 1;
294            }
295        }
296        // Prefix-sum the counts into offsets.
297        for y in 0..hu {
298            line_off[y + 1] += line_off[y];
299            quad_off[y + 1] += quad_off[y];
300        }
301        let mut line_idx = vec![0u32; line_off[hu] as usize];
302        let mut quad_idx = vec![0u32; quad_off[hu] as usize];
303        // Pass 2: scatter primitive indices into each covered row's slot.
304        let mut cursor = line_off.clone();
305        for (i, l) in lines.iter().enumerate() {
306            let (_, mny, _, mxy) = l.bounds();
307            let (y0, y1) = Self::span(mny, mxy, h);
308            for y in y0..y1 {
309                let slot = &mut cursor[y as usize];
310                line_idx[*slot as usize] = i as u32;
311                *slot += 1;
312            }
313        }
314        let mut cursor = quad_off.clone();
315        for (i, q) in quads.iter().enumerate() {
316            let cy = q.center[1];
317            let he = q.half_extent();
318            let (y0, y1) = Self::span(cy - he, cy + he, h);
319            for y in y0..y1 {
320                let slot = &mut cursor[y as usize];
321                quad_idx[*slot as usize] = i as u32;
322                *slot += 1;
323            }
324        }
325        Self { line_idx, line_off, quad_idx, quad_off }
326    }
327
328    /// The `(line_indices, quad_indices)` that cover row `y`, **in original push
329    /// order** (the scatter preserves it), so the composite order is unchanged.
330    #[inline]
331    fn row(&self, y: usize) -> (&[u32], &[u32]) {
332        let l0 = self.line_off[y] as usize;
333        let l1 = self.line_off[y + 1] as usize;
334        let q0 = self.quad_off[y] as usize;
335        let q1 = self.quad_off[y + 1] as usize;
336        (&self.line_idx[l0..l1], &self.quad_idx[q0..q1])
337    }
338}
339
340/// Raster all `lines` (under) then all `quads` (over) onto a **single scanline**
341/// `row` (the `w` pixels at image row `y`). This is the parallel unit: a thread
342/// owns one row, so blends never contend, and replaying the same instance order
343/// per row keeps the composite bit-identical to the sequential path.
344fn raster_row(
345    row: &mut [PremulRgba8],
346    y: u32,
347    w: u32,
348    lines: &[LineInstance],
349    quads: &[QuadInstance],
350    line_idx: &[u32],
351    quad_idx: &[u32],
352) {
353    let py = y as f32 + 0.5;
354    // Lines first (edges under chips). `line_idx` already lists only the lines whose
355    // span covers this row, in push order — the per-row vertical reject is gone.
356    for &i in line_idx {
357        let l = &lines[i as usize];
358        let (mnx, _, mxx, _) = l.bounds();
359        let x0 = (mnx.floor()).max(0.0) as u32;
360        let x1 = (mxx.ceil()).min(w as f32) as u32;
361        for x in x0..x1 {
362            let cov = sdf::line_coverage(l, [x as f32 + 0.5, py]);
363            if cov > 0.0 {
364                blend_px(&mut row[x as usize], l.color, cov);
365            }
366        }
367    }
368    // Quads over (only those whose span covers this row, in push order).
369    for &i in quad_idx {
370        let q = &quads[i as usize];
371        let he = q.half_extent();
372        let (cx, cy) = (q.center[0], q.center[1]);
373        let x0 = ((cx - he).floor()).max(0.0) as u32;
374        let x1 = ((cx + he).ceil()).min(w as f32) as u32;
375        for x in x0..x1 {
376            let dx = x as f32 + 0.5 - cx;
377            let dy = py - cy;
378            let cov = sdf::quad_coverage(q, dx, dy);
379            if cov > 0.0 {
380                blend_px(&mut row[x as usize], q.color, cov);
381            }
382        }
383    }
384}
385
386/// Source-over alpha-composite a straight `[r,g,b,a] ∈ [0,1]` colour scaled by
387/// `coverage` onto one premultiplied destination pixel.
388#[inline]
389fn blend_px(dst: &mut PremulRgba8, color: [f32; 4], coverage: f32) {
390    let sa = (color[3] * coverage).clamp(0.0, 1.0);
391    // Premultiplied source.
392    let sr = color[0] * sa;
393    let sg = color[1] * sa;
394    let sb = color[2] * sa;
395    let da = dst.a as f32 / 255.0;
396    let dr = dst.r as f32 / 255.0;
397    let dg = dst.g as f32 / 255.0;
398    let db = dst.b as f32 / 255.0;
399    let inv = 1.0 - sa;
400    *dst = PremulRgba8 {
401        r: ((sr + dr * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
402        g: ((sg + dg * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
403        b: ((sb + db * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
404        a: ((sa + da * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
405    };
406}
407
408/// Premultiply a straight `[u8;4]` into a [`PremulRgba8`].
409fn premul(c: [u8; 4]) -> PremulRgba8 {
410    let a = c[3] as u32;
411    let m = |v: u8| ((v as u32 * a + 127) / 255) as u8;
412    PremulRgba8 { r: m(c[0]), g: m(c[1]), b: m(c[2]), a: c[3] }
413}
414
/// The L0 canvas seam: pushes only queue instances; nothing is drawn until one
/// of the raster methods on [`CpuCanvas`] runs.
impl Canvas for CpuCanvas {
    /// Append `quads` to the pending quad batch (composited over the lines).
    fn push_quads(&mut self, quads: &[QuadInstance]) {
        self.quads.extend_from_slice(quads);
    }
    /// Append `lines` to the pending line batch (composited under the quads).
    fn push_lines(&mut self, lines: &[LineInstance]) {
        self.lines.extend_from_slice(lines);
    }
    /// The camera this canvas was created with.
    fn camera(&self) -> &Camera {
        &self.camera
    }
}
426
/// The CPU [`Renderer`] — `begin` opens a [`CpuCanvas`] sized to the rect, `present`
/// rasterizes it to a [`Frame`]. Headless / device / CI; always available (no GPU).
pub struct CpuRenderer {
    /// Straight RGBA8 clear colour handed to every canvas opened by `begin`.
    background: [u8; 4],
    /// The canvas opened by the last `begin`, until `present` consumes it.
    canvas: Option<CpuCanvas>,
}
433
impl CpuRenderer {
    /// A renderer that clears each frame to `background` (straight RGBA8). No
    /// canvas is open until `begin` is called.
    pub fn new(background: [u8; 4]) -> Self {
        Self { background, canvas: None }
    }
}
439
impl Default for CpuRenderer {
    /// A renderer with an opaque dark background, `[12, 14, 20, 255]`.
    fn default() -> Self {
        Self::new([12, 14, 20, 255])
    }
}
445
446impl Renderer for CpuRenderer {
447    fn begin(&mut self, width: u32, height: u32, camera: Camera) -> &mut dyn Canvas {
448        self.canvas = Some(CpuCanvas::new(width, height, camera, self.background));
449        self.canvas.as_mut().unwrap()
450    }
451    fn present(&mut self) -> Frame {
452        let frame = self.canvas.take().map(|mut c| c.rasterize()).unwrap_or(Frame { width: 0, height: 0, rgba: Vec::new() });
453        // ── L0 CPU render lane emit ───────────────────────────────────────────
454        // Records that the CPU vello lane ran AND a real output invariant: the
455        // frame has the requested dimensions and drew lit pixels. A blank frame
456        // (lit_px == 0) on a non-empty canvas is a RED row. Unique component
457        // string so the inventory test can assert this exact lane produced a row.
458        #[cfg(feature = "testmatrix")]
459        {
460            let lit = frame.lit_px();
461            let total = (frame.width as usize) * (frame.height as usize);
462            // OK when the surface has area and produced at least the background
463            // composite (lit_px > 0 once a frame of non-zero size is rasterized).
464            let ok = total == 0 || lit > 0;
465            crate::testmatrix::emit(
466                "facett-core::render::CpuRenderer::present",
467                "cpu_render",
468                ok && frame.rgba.len() == total * 4,
469                &format!("backend=CpuVello w={} h={} lit_px={lit} total_px={total}", frame.width, frame.height),
470            );
471        }
472        frame
473    }
474    fn backend(&self) -> Backend {
475        Backend::CpuVello
476    }
477}
478
#[cfg(test)]
impl CpuCanvas {
    /// Test-only reference raster: a plain `for` over the rows on the calling
    /// thread, doing the same per-row work as the production path (background
    /// fill, bucket lookup, `raster_row`). The `parallel_raster_matches_sequential`
    /// test compares the GATLING parallel path against it.
    fn rasterize_sequential(&mut self) -> Frame {
        let (lines, quads) = (std::mem::take(&mut self.lines), std::mem::take(&mut self.quads));
        let (w, h, bg) = (self.width, self.height, self.background);
        let buckets = YBuckets::build(&lines, &quads, h);
        let data = self.pixmap.data_mut();
        let stride = w as usize;
        for y in 0..h {
            let start = (y * w) as usize;
            let row = &mut data[start..start + stride];
            row.fill(bg);
            let (line_hits, quad_hits) = buckets.row(y as usize);
            raster_row(row, y, w, &lines, &quads, line_hits, quad_hits);
        }
        self.frame()
    }
}
503
504#[cfg(test)]
505mod tests {
506    use super::*;
507    use crate::render::prim::{shape, CircleInstance, LineInstance, MarkerInstance, RingInstance};
508
    /// INJECT-ASSERT (GATLING): the no-barrier row-parallel raster produces a
    /// **bit-identical** frame to the single-threaded reference, on a frame big
    /// enough to actually fan across cores, with overlapping instances (so blend
    /// order matters) — proving the scanline split preserves compositing order.
    #[test]
    fn parallel_raster_matches_sequential() {
        // 512×512 ⇒ above PARALLEL_PX_THRESHOLD ⇒ the gatling path runs.
        let (w, h) = (512u32, 512u32);
        // Builds the same scene twice, so each raster path gets its own canvas.
        let mk = || {
            let mut c = CpuCanvas::new(w, h, Camera::default(), [9, 11, 16, 255]);
            // Many overlapping primitives so per-pixel blend order is load-bearing.
            let mut quads = Vec::new();
            let mut lines = Vec::new();
            for i in 0..400u32 {
                // Deterministic pseudo-scatter of positions and colours.
                let x = (i * 37 % 500 + 6) as f32;
                let y = (i * 53 % 500 + 6) as f32;
                let col = [(i % 7) as f32 / 7.0, (i % 5) as f32 / 5.0, (i % 3) as f32 / 3.0, 0.7];
                // Cycle through the three quad-lowered shapes.
                if i % 3 == 0 {
                    quads.push(CircleInstance { center: [x, y], radius: 14.0, color: col, aa: 1.5 }.lower());
                } else if i % 3 == 1 {
                    quads.push(RingInstance { center: [x, y], radius: 16.0, inner: 8.0, color: col, aa: 1.5 }.lower());
                } else {
                    quads.push(MarkerInstance { center: [x, y], radius: 12.0, corner: 2.0, color: col, aa: 1.0, shape: shape::DIAMOND }.lower());
                }
                lines.push(LineInstance::round([x, y], [x + 40.0, y + 25.0], 3.0, 1.5, [col[0], col[1], col[2], 0.6]));
            }
            c.push_lines(&lines);
            c.push_quads(&quads);
            c
        };

        let parallel = mk().rasterize();
        let sequential = mk().rasterize_sequential();
        assert_eq!(parallel.width, sequential.width);
        assert_eq!(parallel.rgba.len(), sequential.rgba.len());
        assert_eq!(parallel.rgba, sequential.rgba, "GATLING parallel raster is bit-identical to sequential");
        // And it actually drew a substantial frame (not a degenerate match-on-blank).
        assert!(parallel.lit_px() > 50_000, "real content rastered, got {}", parallel.lit_px());
    }
548
    /// INJECT-ASSERT (y-bucket index): the y-bucketed raster (each row visits only
    /// the primitives whose span covers it) is **byte-identical** to a brute-force
    /// raster that tests every primitive against every row — including primitives
    /// that straddle the top (y<0) and bottom (y>h) frame edges, so the span clamp
    /// is exercised. This proves the O(rows×n)→O(touches) reject changed nothing
    /// the pixels see.
    #[test]
    fn ybucket_raster_matches_brute_force_all_primitives() {
        let (w, h) = (300u32, 200u32);
        let mut quads = Vec::new();
        let mut lines = Vec::new();
        for i in 0..120u32 {
            let x = (i * 41 % 290 + 4) as f32;
            // Deliberately push some centres above the top and below the bottom so
            // their spans clamp at 0 / h.
            let y = (i as f32 * 7.3) - 30.0;
            let col = [(i % 7) as f32 / 7.0, (i % 4) as f32 / 4.0, (i % 3) as f32 / 3.0, 0.65];
            quads.push(CircleInstance { center: [x, y], radius: 12.0, color: col, aa: 1.5 }.lower());
            lines.push(LineInstance::round([x, y], [x + 30.0, y + 50.0], 3.0, 1.5, [col[0], col[1], col[2], 0.5]));
        }

        // Bucketed path (production).
        let mut c = CpuCanvas::new(w, h, Camera::default(), [10, 12, 18, 255]);
        c.push_lines(&lines);
        c.push_quads(&quads);
        let bucketed = c.rasterize();

        // Brute-force reference: clear + composite ALL primitives on EVERY row (no
        // bucket index), using a full per-row index list `[0,1,2,…]`.
        let bg = premul([10, 12, 18, 255]);
        let mut pm = Pixmap::new(w as u16, h as u16);
        let all_l: Vec<u32> = (0..lines.len() as u32).collect();
        let all_q: Vec<u32> = (0..quads.len() as u32).collect();
        let data = pm.data_mut();
        for y in 0..h {
            let row = &mut data[(y * w) as usize..((y + 1) * w) as usize];
            for px in row.iter_mut() {
                *px = bg;
            }
            raster_row(row, y, w, &lines, &quads, &all_l, &all_q);
        }
        // Un-premultiply the reference pixmap the same way the frame read-out does.
        let brute: Vec<u8> = data
            .iter()
            .flat_map(|p| {
                let mut o = [0u8; 4];
                unpremul_into(p, &mut o);
                o
            })
            .collect();

        assert_eq!(bucketed.rgba.len(), brute.len());
        assert_eq!(bucketed.rgba, brute, "y-bucketed raster == brute-force-all-primitives");
        // Guard against both paths agreeing on an empty frame.
        assert!(bucketed.lit_px() > 1_000, "real content drawn, got {}", bucketed.lit_px());
    }
603
    /// INJECT-ASSERT (blend_px, source-over correctness): the private compositing
    /// primitive obeys the Porter-Duff source-over algebra on premultiplied pixels.
    /// Return value (the dst pixel) proves it — this is chain link "blend".
    #[test]
    fn blend_px_is_source_over_correct() {
        // Opaque red over opaque black at full coverage → red.
        let mut dst = premul([0, 0, 0, 255]);
        blend_px(&mut dst, [1.0, 0.0, 0.0, 1.0], 1.0);
        assert_eq!((dst.r, dst.g, dst.b, dst.a), (255, 0, 0, 255), "opaque over → source");

        // Zero coverage leaves the destination untouched.
        let mut dst2 = premul([10, 20, 30, 255]);
        let before = (dst2.r, dst2.g, dst2.b, dst2.a);
        blend_px(&mut dst2, [1.0, 1.0, 1.0, 1.0], 0.0);
        assert_eq!((dst2.r, dst2.g, dst2.b, dst2.a), before, "zero coverage = no-op");

        // 50% white over black: source-over with sa=0.5 → 0.5 premultiplied →
        // straight ~128 on each channel (premultiplied store, opaque after).
        let mut dst3 = premul([0, 0, 0, 255]);
        blend_px(&mut dst3, [1.0, 1.0, 1.0, 1.0], 0.5);
        // out_premul.r = 1*0.5 + 0*(1-0.5) = 0.5 → 128; alpha = 0.5+1*0.5 = 1 → 255.
        assert!((dst3.r as i32 - 128).abs() <= 1, "half coverage white = ~128, got {}", dst3.r);
        assert_eq!(dst3.a, 255, "over an opaque dst stays opaque");

        // Two stacked half-covers ≈ a single 0.75-cover (monotone build-up). Only
        // the direction is asserted: the second layer must lighten the first.
        let mut a = premul([0, 0, 0, 255]);
        blend_px(&mut a, [1.0, 1.0, 1.0, 1.0], 0.5);
        blend_px(&mut a, [1.0, 1.0, 1.0, 1.0], 0.5);
        assert!(a.r > dst3.r, "stacking two half-covers lightens further (build-up)");
    }
634
635    /// INJECT-ASSERT (premul ↔ unpremul_into round-trip): the colour transforms at
636    /// the frame boundary are inverses (within rounding) for an opaque pixel, and
637    /// a zero-alpha pixel un-premultiplies to transparent black. Return value proves
638    /// it — chain links "premul" (clear) and "unpremul" (frame readout).
639    #[test]
640    fn premul_unpremul_round_trip() {
641        for &c in &[[255u8, 0, 0, 255], [10, 200, 30, 255], [123, 45, 67, 255]] {
642            let p = premul(c);
643            let mut out = [0u8; 4];
644            unpremul_into(&p, &mut out);
645            for k in 0..4 {
646                assert!((out[k] as i32 - c[k] as i32).abs() <= 1, "round-trip {c:?} -> {out:?} at {k}");
647            }
648        }
649        // Transparent → all zero.
650        let mut out = [9u8; 4];
651        unpremul_into(&PremulRgba8 { r: 0, g: 0, b: 0, a: 0 }, &mut out);
652        assert_eq!(out, [0, 0, 0, 0], "alpha 0 unpremuls to transparent black");
653    }
654
655    /// INJECT-ASSERT (YBuckets::span + row): the per-row index covers EXACTLY the
656    /// rows a primitive's pixel-y span touches — the same `py = y+0.5 ∈ [mny,mxy]`
657    /// test raster_row applies — and clamps off-screen spans to [0,h). Return value
658    /// (the (y0,y1) range + the row index slices) proves it; this is chain link
659    /// "y-bucket".
660    #[test]
661    fn ybuckets_span_covers_exactly_the_touched_rows() {
662        let h = 100u32;
663        // A primitive spanning y∈[20.3, 40.8]: rows whose centre py=y+0.5 lies in
664        // that band are y=20..=40 (py 20.5..40.5). y0=20, y1=41 (exclusive).
665        let (y0, y1) = YBuckets::span(20.3, 40.8, h);
666        assert_eq!((y0, y1), (20, 41), "span covers the centred rows exactly");
667        // Every row in [y0,y1) passes the raster_row test; the rows just outside fail.
668        for y in y0..y1 {
669            let py = y as f32 + 0.5;
670            assert!((20.3..=40.8).contains(&py), "row {y} centre {py} is inside the span");
671        }
672        assert!((y0 as f32 - 0.5) + 0.5 < 20.3 || y0 == 0, "row below y0 is excluded");
673
674        // Off-screen clamp: a span entirely above the top → empty range.
675        let (a0, a1) = YBuckets::span(-50.0, -10.0, h);
676        assert!(a0 >= a1, "above-frame span is empty");
677        // A span straddling the top clamps y0 to 0.
678        let (b0, _b1) = YBuckets::span(-5.0, 10.0, h);
679        assert_eq!(b0, 0, "straddling the top clamps to row 0");
680        // A span past the bottom clamps y1 to h.
681        let (_c0, c1) = YBuckets::span(90.0, 200.0, h);
682        assert_eq!(c1, h, "past the bottom clamps to h");
683
684        // The built index hands raster_row exactly those rows, in push order. Build
685        // a 2-quad index and assert each row lists only the quads covering it.
686        let q_top = CircleInstance { center: [10.0, 10.0], radius: 5.0, color: [1.0; 4], aa: 0.0 }.lower();
687        let q_bot = CircleInstance { center: [10.0, 80.0], radius: 5.0, color: [1.0; 4], aa: 0.0 }.lower();
688        let buckets = YBuckets::build(&[], &[q_top, q_bot], h);
689        let (_, qrow10) = buckets.row(10); // covered by q_top only
690        assert_eq!(qrow10, &[0u32], "row 10 lists only the top quad");
691        let (_, qrow80) = buckets.row(80); // covered by q_bot only
692        assert_eq!(qrow80, &[1u32], "row 80 lists only the bottom quad");
693        let (_, qrow50) = buckets.row(50); // covered by neither
694        assert!(qrow50.is_empty(), "row 50 lists no quads");
695    }
696
697    #[test]
698    fn cpu_canvas_lights_pixels_inside_a_circle() {
699        let cam = Camera::default();
700        let mut canvas = CpuCanvas::new(64, 64, cam, [0, 0, 0, 255]);
701        let c = CircleInstance { center: [32.0, 32.0], radius: 10.0, color: [1.0, 0.0, 0.0, 1.0], aa: 1.0 };
702        canvas.push_quads(&[c.lower()]);
703        let frame = canvas.rasterize();
704        assert_eq!(frame.rgba.len(), 64 * 64 * 4);
705        // Centre pixel is red.
706        let i = ((32 * 64 + 32) * 4) as usize;
707        assert!(frame.rgba[i] > 200 && frame.rgba[i + 1] < 50, "centre is red");
708        // A far corner is still background (black).
709        let c0 = 0;
710        assert!(frame.rgba[c0] < 10, "corner stays background");
711    }
712
713    #[test]
714    fn cpu_renderer_round_trips_through_the_seam() {
715        let mut r = CpuRenderer::new([0, 0, 0, 255]);
716        let canvas = r.begin(48, 48, Camera::default());
717        let ring = RingInstance { center: [24.0, 24.0], radius: 12.0, inner: 6.0, color: [0.0, 1.0, 0.0, 1.0], aa: 1.0 };
718        canvas.push_quads(&[ring.lower()]);
719        assert_eq!(r.backend(), Backend::CpuVello);
720        let frame = r.present();
721        // The ring band (≈9px out) is green; the hole centre is background.
722        let band = (((24) * 48 + (24 + 9)) * 4) as usize;
723        assert!(frame.rgba[band + 1] > 200, "ring band green");
724        let hole = ((24 * 48 + 24) * 4) as usize;
725        assert!(frame.rgba[hole + 1] < 50, "ring hole is background");
726    }
727}