Skip to main content

facett_core/render/cpu/
mod.rs

1//! **The CPU render lane** (L0 fallback).
2//!
3//! - [`scissor`] — the rect scissor geometry (moved from map3d in Phase A).
4//! - [`sdf`] — the CPU SDF coverage + thick-AA-line raster math (the source of
5//!   truth the GPU `sdf.wgsl`/`line.wgsl` mirror).
6//! - [`CpuCanvas`] — collects [`QuadInstance`]/[`LineInstance`] batches and
7//!   rasterizes them onto a **`vello_cpu` [`Pixmap`]** via the [`sdf`] coverage
8//!   math, producing a straight-RGBA8 frame. It implements the L0
9//!   [`Canvas`](super::Canvas) seam; [`CpuRenderer`] implements
10//!   [`Renderer`](super::Renderer).
11
12pub mod scissor;
13pub mod sdf;
14
15pub use scissor::{clip_poly_to_rect, ink_outside_rect};
16
17use vello_cpu::color::PremulRgba8;
18use vello_cpu::Pixmap;
19
20use super::camera::Camera;
21use super::prim::{LineInstance, QuadInstance};
22use super::{Backend, Canvas, Frame, Renderer};
23
/// A CPU canvas: an off-screen [`Pixmap`] (the vello_cpu raster target, per the
/// CONS-CORE spec) plus the batched SDF instances drawn onto it. The host pushes
/// quads + lines, then [`CpuCanvas::rasterize`] evaluates the [`sdf`] coverage for
/// every instance over its bounding box and alpha-composites it into the pixmap.
///
/// Coverage math is **byte-for-byte** the same the GPU lane runs, so a CPU frame
/// matches a GPU frame (the `sdf_primitives` parity test pins this).
pub struct CpuCanvas {
    /// The premultiplied raster target every scanline is composited into.
    pixmap: Pixmap,
    /// Canvas width in pixels; also the length of one scanline handed to the kernel.
    width: u32,
    /// Canvas height in pixels; also the number of scanlines handed to the kernel.
    height: u32,
    /// The camera this canvas was opened under (returned by `Canvas::camera`).
    camera: Camera,
    /// Premultiplied background; painted per-row inside the parallel raster.
    background: PremulRgba8,
    /// Quads queued by `push_quads`; drained (left empty) by the next raster.
    quads: Vec<QuadInstance>,
    /// Lines queued by `push_lines`; drained (left empty) by the next raster.
    lines: Vec<LineInstance>,
}
41
42impl CpuCanvas {
43    /// A fresh `width × height` canvas cleared to `background` (straight RGBA8),
44    /// under `camera`.
45    ///
46    /// **No serial clear here:** the background is no longer painted up-front in a
47    /// single-threaded `for` over 1M pixels (that was a pure Amdahl serial tail).
48    /// Instead [`raster_batches`] fills each row's background **inside** the gatling
49    /// scanline kernel — the workers that own a row clear it before compositing, so
50    /// the clear scales across all cores with zero extra alloc.
51    pub fn new(width: u32, height: u32, camera: Camera, background: [u8; 4]) -> Self {
52        let pixmap = Pixmap::new(width as u16, height as u16);
53        Self {
54            pixmap,
55            width,
56            height,
57            camera,
58            background: premul(background),
59            quads: Vec::new(),
60            lines: Vec::new(),
61        }
62    }
63
    /// The raster target — the `vello_cpu` pixmap (spec §2: "CpuCanvas → vello_cpu
    /// pixmap"). Exposed so a host can hand its glyph/curve overlay (L1 vello) the
    /// same target later.
    ///
    /// Read-only borrow; the pixels are premultiplied (see `frame` for the
    /// straight-RGBA8 conversion).
    pub fn pixmap(&self) -> &Pixmap {
        &self.pixmap
    }
70
    /// Rasterize every batched instance onto the pixmap (lines under quads, the
    /// graph convention: chips draw on top of edges). Returns the straight
    /// (un-premultiplied) RGBA8 [`Frame`].
    ///
    /// The queued batches are drained by this call: a second `rasterize` without
    /// new pushes composites nothing on top of the background.
    ///
    /// **GATLING multicore (LAW 2):** the raster fans across all cores by **scanline
    /// row** — each row is owned by exactly one thread, so the blends never contend,
    /// and the per-pixel compositing order (all lines in order, then all quads in
    /// order) is preserved bit-for-bit vs the sequential path (the
    /// `parallel_raster_matches_sequential` test pins this). Below a pixel-work
    /// threshold the frame stays single-threaded (zero pool overhead on small draws).
    pub fn rasterize(&mut self) -> Frame {
        // `0` workers ⇒ one per core (the production path).
        self.rasterize_with_workers(0)
    }
84
85    /// [`rasterize`](Self::rasterize) with an explicit gatling worker count: `0` ⇒
86    /// one per core (the production path), `1` ⇒ the forced single-threaded path.
87    /// Exposed so a bench can time the **same** parallel region at 1 vs N workers and
88    /// report the real cores-busy / speedup (rather than guessing from a sweep
89    /// average). The output is identical for any worker count (the
90    /// `parallel_raster_matches_sequential` test pins bit-identity at N vs 1).
91    pub fn rasterize_with_workers(&mut self, workers: usize) -> Frame {
92        let lines = std::mem::take(&mut self.lines);
93        let quads = std::mem::take(&mut self.quads);
94        self.raster_batches(&lines, &quads, workers);
95        self.frame_with_workers(workers)
96    }
97
98    /// Run **only** the compositing raster (bg clear + y-bucketed SDF blend) at
99    /// `workers` gatling workers — the region the GATLING scanline kernel governs,
100    /// without the memory-bandwidth-bound un-premultiply ([`frame`](Self::frame)).
101    /// Exposed for the scaling bench so cores-busy can be measured on the part that
102    /// is CPU-bound, separate from the RAM-bandwidth-capped pixmap→RGBA8 conversion.
103    pub fn raster_only(&mut self, workers: usize) {
104        let lines = std::mem::take(&mut self.lines);
105        let quads = std::mem::take(&mut self.quads);
106        self.raster_batches(&lines, &quads, workers);
107    }
108
109    /// The pixel-work threshold above which the raster goes multicore. `width *
110    /// height` (the full frame) is a cheap upper bound on the work; tiny frames
111    /// (tooltips, sparklines) skip the gatling fan-out entirely.
112    const PARALLEL_PX_THRESHOLD: usize = 64 * 1024; // 256×256
113
114    /// Should this `w × h` frame fan across cores? (cheap upper bound on work.)
115    #[inline]
116    fn parallel_frame(w: u32, h: u32) -> bool {
117        (w as usize * h as usize) >= Self::PARALLEL_PX_THRESHOLD
118    }
119
120    /// Raster `lines` (under) then `quads` (over) onto the pixmap, row-parallel via
121    /// znippy's fork-join GATLING (rayon is forbidden). Each worker self-dispatches
122    /// whole scanlines and writes into that row's **disjoint** pixel slice — no
123    /// barrier, no wait-for-a-core, zero-copy (row index hand-off), zero-alloc in
124    /// the hot loop. Below the pixel-work threshold it stays single-threaded.
125    ///
126    /// Two scaling lifts live entirely inside this parallel region (so they raise the
127    /// per-core ceiling, they don't add serial tail):
128    /// - **Per-row background clear.** The worker that owns row `y` paints the bg into
129    ///   that row before compositing — the 1M-pixel clear is now N-way parallel, not a
130    ///   serial `for` in `new`.
131    /// - **Y-bucket reject.** A [`YBuckets`] index, built once per frame, lists only
132    ///   the primitives whose vertical span covers each row. `raster_row` then visits
133    ///   `O(prims_touching_row)` instead of `O(all_prims)` for every one of `h` rows
134    ///   (was `rows × n` rejects per frame — 200k quads × 1024 rows = 205M wasted
135    ///   span-tests; now ~one test per real touch).
136    fn raster_batches(&mut self, lines: &[LineInstance], quads: &[QuadInstance], workers: usize) {
137        let w = self.width;
138        let h = self.height;
139        if w == 0 || h == 0 {
140            return;
141        }
142        let bg = self.background;
143        // The empty case still must clear to bg (begin→present with no draws).
144        let buckets = YBuckets::build(lines, quads, h);
145        // Only fan out once the frame is big enough that thread hand-off pays off.
146        let min_rows = if Self::parallel_frame(w, h) { 1 } else { usize::MAX };
147        let data = self.pixmap.data_mut();
148        znippy_zoomies::gatling_forkjoin::gatling_scanlines(
149            data,
150            h as usize,
151            w as usize,
152            workers, // 0 ⇒ one worker per core
153            min_rows,
154            |y, row| {
155                // Clear this row to background, then composite only the primitives
156                // whose span covers it (both inside the parallel region).
157                for px in row.iter_mut() {
158                    *px = bg;
159                }
160                let (li, qi) = buckets.row(y);
161                raster_row(row, y as u32, w, lines, quads, li, qi);
162            },
163        );
164    }
165
    /// Snapshot the current pixmap as a straight-RGBA8 [`Frame`] (un-premultiplied)
    /// without consuming the canvas.
    ///
    /// **Parallel un-premultiply:** the per-pixel un-premultiply is embarrassingly
    /// parallel, so it runs through the same gatling scanline kernel into a
    /// pre-allocated output buffer (zero per-pixel alloc) instead of the old
    /// single-threaded `flat_map().collect()` over the whole framebuffer — that
    /// `collect` was the second half of the Amdahl serial tail.
    pub fn frame(&self) -> Frame {
        // `0` workers ⇒ one per core (the production path).
        self.frame_with_workers(0)
    }
177
178    /// [`frame`](Self::frame) with an explicit worker count (see
179    /// [`rasterize_with_workers`](Self::rasterize_with_workers)).
180    pub fn frame_with_workers(&self, workers: usize) -> Frame {
181        let w = self.width as usize;
182        let h = self.height as usize;
183        let len = w * h * 4;
184        if w == 0 || h == 0 {
185            return Frame { width: self.width, height: self.height, rgba: Vec::new() };
186        }
187        // Allocate the output **without zeroing** (no `vec![0u8; 67MB]`): the gatling
188        // pass below writes every byte of every row (4 bytes × w per pixel, all h
189        // rows), so the buffer is fully initialized before we ever read it. Zeroing
190        // 67 MB only to overwrite it was a memory-bound serial tail that did not
191        // scale — skipping it lifts the un-premultiply's per-core ceiling.
192        let mut rgba: Vec<u8> = Vec::with_capacity(len);
193        let src = self.pixmap.data();
194        let min_rows = if Self::parallel_frame(self.width, self.height) { 1 } else { usize::MAX };
195        {
196            // SAFETY: `spare` is `len` uninitialized bytes; the kernel writes all of
197            // them (each row's `w*4` bytes are fully assigned in `unpremul_into`), so
198            // every byte is initialized before `set_len(len)` exposes them.
199            let spare = rgba.spare_capacity_mut();
200            // View the MaybeUninit slice as raw bytes for the byte-grid kernel — the
201            // worker assigns (never reads) each byte, so this is sound.
202            let buf = unsafe {
203                std::slice::from_raw_parts_mut(spare.as_mut_ptr() as *mut u8, len)
204            };
205            // Output stride is `w * 4` bytes per row; each row maps src[y*w..] → dst.
206            znippy_zoomies::gatling_forkjoin::gatling_scanlines(
207                buf,
208                h,
209                w * 4,
210                workers,
211                min_rows,
212                |y, out_row| {
213                    let src_row = &src[y * w..(y + 1) * w];
214                    for (p, o) in src_row.iter().zip(out_row.chunks_exact_mut(4)) {
215                        unpremul_into(p, o);
216                    }
217                },
218            );
219        }
220        // SAFETY: all `len` bytes were written by the kernel above.
221        unsafe { rgba.set_len(len) };
222        Frame { width: self.width, height: self.height, rgba }
223    }
224}
225
226/// Un-premultiply one [`PremulRgba8`] into a straight `[r,g,b,a]` output slice.
227#[inline]
228fn unpremul_into(p: &PremulRgba8, o: &mut [u8]) {
229    let a = p.a;
230    if a == 0 {
231        o[0] = 0;
232        o[1] = 0;
233        o[2] = 0;
234        o[3] = 0;
235    } else {
236        let un = |c: u8| ((c as u32 * 255 + (a as u32) / 2) / a as u32).min(255) as u8;
237        o[0] = un(p.r);
238        o[1] = un(p.g);
239        o[2] = un(p.b);
240        o[3] = a;
241    }
242}
243
/// **Per-row primitive index** — built once per frame. For each scanline `y` it
/// holds the slice of line / quad indices whose vertical span covers that row, so
/// `raster_row` iterates only the primitives that can actually touch the row
/// instead of rejecting all `n` of them per row (the O(rows×n) tail).
///
/// Layout is a CSR-style flat index: `line_idx`/`quad_idx` are the concatenated
/// per-row index lists, and `line_off`/`quad_off` are the `h+1` row offsets into
/// them. It is built in two passes (count rows, then fill). All of its
/// allocations (offset tables, index lists and the temporary fill cursors) happen
/// in `build`, before the per-row kernel runs, so the per-row lookup itself
/// allocates nothing.
struct YBuckets {
    /// Line indices, grouped by row; within a row they keep push order.
    line_idx: Vec<u32>,
    /// `h + 1` offsets: row `y`'s lines are `line_idx[line_off[y]..line_off[y + 1]]`.
    line_off: Vec<u32>,
    /// Quad indices, grouped by row; within a row they keep push order.
    quad_idx: Vec<u32>,
    /// `h + 1` offsets: row `y`'s quads are `quad_idx[quad_off[y]..quad_off[y + 1]]`.
    quad_off: Vec<u32>,
}
259
260impl YBuckets {
261    /// The `[y0, y1)` integer row span a primitive's pixel-`y` range `[mny, mxy]`
262    /// can cover, clamped to `[0, h)`. Mirrors the per-row test in `raster_row`
263    /// (`py = y + 0.5` must lie in `[mny, mxy]`), so a row is included iff it would
264    /// pass that test — keeping the composite bit-identical.
265    #[inline]
266    fn span(mny: f32, mxy: f32, h: u32) -> (u32, u32) {
267        // py = y+0.5 ∈ [mny, mxy] ⇒ y ∈ [mny-0.5, mxy-0.5]. Clamp to [0, h); an
268        // off-screen primitive yields an empty (y0 >= y1) range and is skipped by
269        // the `y0..y1` loop with no special-case branch. The `.max(0.0)` before the
270        // `as u32` cast also tames NaN/negative inputs (NaN → 0).
271        let y0 = ((mny - 0.5).ceil().max(0.0) as u32).min(h);
272        let y1 = (((mxy - 0.5).floor() + 1.0).max(0.0) as u32).min(h); // exclusive
273        (y0, y1.max(y0))
274    }
275
276    fn build(lines: &[LineInstance], quads: &[QuadInstance], h: u32) -> Self {
277        let hu = h as usize;
278        let mut line_off = vec![0u32; hu + 1];
279        let mut quad_off = vec![0u32; hu + 1];
280        // Pass 1: per-row counts (stored shifted by +1 for the prefix sum).
281        for l in lines {
282            let (_, mny, _, mxy) = l.bounds();
283            let (y0, y1) = Self::span(mny, mxy, h);
284            for y in y0..y1 {
285                line_off[y as usize + 1] += 1;
286            }
287        }
288        for q in quads {
289            let cy = q.center[1];
290            let he = q.half_extent();
291            let (y0, y1) = Self::span(cy - he, cy + he, h);
292            for y in y0..y1 {
293                quad_off[y as usize + 1] += 1;
294            }
295        }
296        // Prefix-sum the counts into offsets.
297        for y in 0..hu {
298            line_off[y + 1] += line_off[y];
299            quad_off[y + 1] += quad_off[y];
300        }
301        let mut line_idx = vec![0u32; line_off[hu] as usize];
302        let mut quad_idx = vec![0u32; quad_off[hu] as usize];
303        // Pass 2: scatter primitive indices into each covered row's slot.
304        let mut cursor = line_off.clone();
305        for (i, l) in lines.iter().enumerate() {
306            let (_, mny, _, mxy) = l.bounds();
307            let (y0, y1) = Self::span(mny, mxy, h);
308            for y in y0..y1 {
309                let slot = &mut cursor[y as usize];
310                line_idx[*slot as usize] = i as u32;
311                *slot += 1;
312            }
313        }
314        let mut cursor = quad_off.clone();
315        for (i, q) in quads.iter().enumerate() {
316            let cy = q.center[1];
317            let he = q.half_extent();
318            let (y0, y1) = Self::span(cy - he, cy + he, h);
319            for y in y0..y1 {
320                let slot = &mut cursor[y as usize];
321                quad_idx[*slot as usize] = i as u32;
322                *slot += 1;
323            }
324        }
325        Self { line_idx, line_off, quad_idx, quad_off }
326    }
327
328    /// The `(line_indices, quad_indices)` that cover row `y`, **in original push
329    /// order** (the scatter preserves it), so the composite order is unchanged.
330    #[inline]
331    fn row(&self, y: usize) -> (&[u32], &[u32]) {
332        let l0 = self.line_off[y] as usize;
333        let l1 = self.line_off[y + 1] as usize;
334        let q0 = self.quad_off[y] as usize;
335        let q1 = self.quad_off[y + 1] as usize;
336        (&self.line_idx[l0..l1], &self.quad_idx[q0..q1])
337    }
338}
339
340/// Raster all `lines` (under) then all `quads` (over) onto a **single scanline**
341/// `row` (the `w` pixels at image row `y`). This is the parallel unit: a thread
342/// owns one row, so blends never contend, and replaying the same instance order
343/// per row keeps the composite bit-identical to the sequential path.
344fn raster_row(
345    row: &mut [PremulRgba8],
346    y: u32,
347    w: u32,
348    lines: &[LineInstance],
349    quads: &[QuadInstance],
350    line_idx: &[u32],
351    quad_idx: &[u32],
352) {
353    let py = y as f32 + 0.5;
354    // Lines first (edges under chips). `line_idx` already lists only the lines whose
355    // span covers this row, in push order — the per-row vertical reject is gone.
356    for &i in line_idx {
357        let l = &lines[i as usize];
358        let (mnx, _, mxx, _) = l.bounds();
359        let x0 = (mnx.floor()).max(0.0) as u32;
360        let x1 = (mxx.ceil()).min(w as f32) as u32;
361        for x in x0..x1 {
362            let cov = sdf::line_coverage(l, [x as f32 + 0.5, py]);
363            if cov > 0.0 {
364                blend_px(&mut row[x as usize], l.color, cov);
365            }
366        }
367    }
368    // Quads over (only those whose span covers this row, in push order).
369    for &i in quad_idx {
370        let q = &quads[i as usize];
371        let he = q.half_extent();
372        let (cx, cy) = (q.center[0], q.center[1]);
373        let x0 = ((cx - he).floor()).max(0.0) as u32;
374        let x1 = ((cx + he).ceil()).min(w as f32) as u32;
375        for x in x0..x1 {
376            let dx = x as f32 + 0.5 - cx;
377            let dy = py - cy;
378            let cov = sdf::quad_coverage(q, dx, dy);
379            if cov > 0.0 {
380                blend_px(&mut row[x as usize], q.color, cov);
381            }
382        }
383    }
384}
385
386/// Source-over alpha-composite a straight `[r,g,b,a] ∈ [0,1]` colour scaled by
387/// `coverage` onto one premultiplied destination pixel.
388#[inline]
389fn blend_px(dst: &mut PremulRgba8, color: [f32; 4], coverage: f32) {
390    let sa = (color[3] * coverage).clamp(0.0, 1.0);
391    // Premultiplied source.
392    let sr = color[0] * sa;
393    let sg = color[1] * sa;
394    let sb = color[2] * sa;
395    let da = dst.a as f32 / 255.0;
396    let dr = dst.r as f32 / 255.0;
397    let dg = dst.g as f32 / 255.0;
398    let db = dst.b as f32 / 255.0;
399    let inv = 1.0 - sa;
400    *dst = PremulRgba8 {
401        r: ((sr + dr * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
402        g: ((sg + dg * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
403        b: ((sb + db * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
404        a: ((sa + da * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
405    };
406}
407
408/// Premultiply a straight `[u8;4]` into a [`PremulRgba8`].
409fn premul(c: [u8; 4]) -> PremulRgba8 {
410    let a = c[3] as u32;
411    let m = |v: u8| ((v as u32 * a + 127) / 255) as u8;
412    PremulRgba8 { r: m(c[0]), g: m(c[1]), b: m(c[2]), a: c[3] }
413}
414
/// The L0 [`Canvas`] seam for the CPU lane: pushes only queue instances; nothing
/// is drawn until the canvas is rasterized.
impl Canvas for CpuCanvas {
    /// Append `quads` to the pending quad batch (copied, push order preserved).
    fn push_quads(&mut self, quads: &[QuadInstance]) {
        self.quads.extend_from_slice(quads);
    }
    /// Append `lines` to the pending line batch (copied, push order preserved).
    fn push_lines(&mut self, lines: &[LineInstance]) {
        self.lines.extend_from_slice(lines);
    }
    /// The camera this canvas was created with.
    fn camera(&self) -> &Camera {
        &self.camera
    }
}
426
/// The CPU [`Renderer`] — `begin` opens a [`CpuCanvas`] sized to the rect, `present`
/// rasterizes it to a [`Frame`]. Headless / device / CI; always available (no GPU).
pub struct CpuRenderer {
    /// Straight RGBA8 background handed to every canvas opened by `begin`.
    background: [u8; 4],
    /// The canvas of the frame in flight: set by `begin`, consumed by `present`.
    canvas: Option<CpuCanvas>,
}
433
impl CpuRenderer {
    /// A renderer that clears every frame to `background` (straight RGBA8). No
    /// canvas exists until `begin` is called.
    pub fn new(background: [u8; 4]) -> Self {
        Self { background, canvas: None }
    }
}
439
impl Default for CpuRenderer {
    /// A renderer with an opaque dark background (`[12, 14, 20, 255]`).
    fn default() -> Self {
        Self::new([12, 14, 20, 255])
    }
}
445
446impl Renderer for CpuRenderer {
447    fn begin(&mut self, width: u32, height: u32, camera: Camera) -> &mut dyn Canvas {
448        self.canvas = Some(CpuCanvas::new(width, height, camera, self.background));
449        self.canvas.as_mut().unwrap()
450    }
451    fn present(&mut self) -> Frame {
452        self.canvas.take().map(|mut c| c.rasterize()).unwrap_or(Frame { width: 0, height: 0, rgba: Vec::new() })
453    }
454    fn backend(&self) -> Backend {
455        Backend::CpuVello
456    }
457}
458
#[cfg(test)]
impl CpuCanvas {
    /// Single-threaded reference raster: a plain loop over the rows that clears
    /// and composites each one without going through the gatling kernel. The
    /// `parallel_raster_matches_sequential` test goldens the parallel path
    /// against it.
    fn rasterize_sequential(&mut self) -> Frame {
        let (lines, quads) = (
            std::mem::take(&mut self.lines),
            std::mem::take(&mut self.quads),
        );
        let (w, h) = (self.width, self.height);
        let bg = self.background;
        let buckets = YBuckets::build(&lines, &quads, h);
        let data = self.pixmap.data_mut();
        for y in 0..h {
            // Row `y` occupies `w` consecutive pixels starting at `y * w`.
            let start = (y * w) as usize;
            let end = start + w as usize;
            let row = &mut data[start..end];
            row.fill(bg);
            let (line_ids, quad_ids) = buckets.row(y as usize);
            raster_row(row, y, w, &lines, &quads, line_ids, quad_ids);
        }
        self.frame()
    }
}
483
484#[cfg(test)]
485mod tests {
486    use super::*;
487    use crate::render::prim::{shape, CircleInstance, LineInstance, MarkerInstance, RingInstance};
488
    /// INJECT-ASSERT (GATLING): the no-barrier row-parallel raster produces a
    /// **bit-identical** frame to the single-threaded reference, on a frame big
    /// enough to actually fan across cores, with overlapping instances (so blend
    /// order matters) — proving the scanline split preserves compositing order.
    #[test]
    fn parallel_raster_matches_sequential() {
        // 512×512 ⇒ above PARALLEL_PX_THRESHOLD ⇒ the gatling path runs.
        let (w, h) = (512u32, 512u32);
        // Builds one fully-populated canvas; called twice so both raster paths
        // start from identical inputs.
        let mk = || {
            let mut c = CpuCanvas::new(w, h, Camera::default(), [9, 11, 16, 255]);
            // Many overlapping primitives so per-pixel blend order is load-bearing.
            let mut quads = Vec::new();
            let mut lines = Vec::new();
            for i in 0..400u32 {
                // Deterministic scatter of centres inside the frame.
                let x = (i * 37 % 500 + 6) as f32;
                let y = (i * 53 % 500 + 6) as f32;
                // Semi-transparent colours, so overlap order changes the result.
                let col = [(i % 7) as f32 / 7.0, (i % 5) as f32 / 5.0, (i % 3) as f32 / 3.0, 0.7];
                // Cycle through the three quad-backed shapes.
                if i % 3 == 0 {
                    quads.push(CircleInstance { center: [x, y], radius: 14.0, color: col, aa: 1.5 }.lower());
                } else if i % 3 == 1 {
                    quads.push(RingInstance { center: [x, y], radius: 16.0, inner: 8.0, color: col, aa: 1.5 }.lower());
                } else {
                    quads.push(MarkerInstance { center: [x, y], radius: 12.0, corner: 2.0, color: col, aa: 1.0, shape: shape::DIAMOND }.lower());
                }
                lines.push(LineInstance::round([x, y], [x + 40.0, y + 25.0], 3.0, 1.5, [col[0], col[1], col[2], 0.6]));
            }
            c.push_lines(&lines);
            c.push_quads(&quads);
            c
        };

        let parallel = mk().rasterize();
        let sequential = mk().rasterize_sequential();
        assert_eq!(parallel.width, sequential.width);
        assert_eq!(parallel.rgba.len(), sequential.rgba.len());
        assert_eq!(parallel.rgba, sequential.rgba, "GATLING parallel raster is bit-identical to sequential");
        // And it actually drew a substantial frame (not a degenerate match-on-blank).
        assert!(parallel.lit_px() > 50_000, "real content rastered, got {}", parallel.lit_px());
    }
528
    /// INJECT-ASSERT (y-bucket index): the y-bucketed raster (each row visits only
    /// the primitives whose span covers it) is **byte-identical** to a brute-force
    /// raster that tests every primitive against every row — including primitives
    /// that straddle the top (y<0) and bottom (y>h) frame edges, so the span clamp
    /// is exercised. This proves the O(rows×n)→O(touches) reject changed nothing
    /// the pixels see.
    #[test]
    fn ybucket_raster_matches_brute_force_all_primitives() {
        // 300×200 frame; the same primitives feed both the bucketed and the
        // brute-force raster below.
        let (w, h) = (300u32, 200u32);
        let mut quads = Vec::new();
        let mut lines = Vec::new();
        for i in 0..120u32 {
            let x = (i * 41 % 290 + 4) as f32;
            // Deliberately push some centres above the top and below the bottom so
            // their spans clamp at 0 / h.
            let y = (i as f32 * 7.3) - 30.0;
            let col = [(i % 7) as f32 / 7.0, (i % 4) as f32 / 4.0, (i % 3) as f32 / 3.0, 0.65];
            quads.push(CircleInstance { center: [x, y], radius: 12.0, color: col, aa: 1.5 }.lower());
            lines.push(LineInstance::round([x, y], [x + 30.0, y + 50.0], 3.0, 1.5, [col[0], col[1], col[2], 0.5]));
        }

        // Bucketed path (production).
        let mut c = CpuCanvas::new(w, h, Camera::default(), [10, 12, 18, 255]);
        c.push_lines(&lines);
        c.push_quads(&quads);
        let bucketed = c.rasterize();

        // Brute-force reference: clear + composite ALL primitives on EVERY row (no
        // bucket index), using a full per-row index list `[0,1,2,…]`.
        let bg = premul([10, 12, 18, 255]);
        let mut pm = Pixmap::new(w as u16, h as u16);
        let all_l: Vec<u32> = (0..lines.len() as u32).collect();
        let all_q: Vec<u32> = (0..quads.len() as u32).collect();
        let data = pm.data_mut();
        for y in 0..h {
            let row = &mut data[(y * w) as usize..((y + 1) * w) as usize];
            for px in row.iter_mut() {
                *px = bg;
            }
            raster_row(row, y, w, &lines, &quads, &all_l, &all_q);
        }
        // Convert the reference pixmap to straight RGBA8 with the same helper the
        // production frame conversion uses.
        let brute: Vec<u8> = data
            .iter()
            .flat_map(|p| {
                let mut o = [0u8; 4];
                unpremul_into(p, &mut o);
                o
            })
            .collect();

        assert_eq!(bucketed.rgba.len(), brute.len());
        assert_eq!(bucketed.rgba, brute, "y-bucketed raster == brute-force-all-primitives");
        assert!(bucketed.lit_px() > 1_000, "real content drawn, got {}", bucketed.lit_px());
    }
583
    /// Smoke test: one opaque red circle on a black 64×64 canvas lights its centre
    /// pixel and leaves a far corner at the background colour.
    #[test]
    fn cpu_canvas_lights_pixels_inside_a_circle() {
        let cam = Camera::default();
        let mut canvas = CpuCanvas::new(64, 64, cam, [0, 0, 0, 255]);
        let c = CircleInstance { center: [32.0, 32.0], radius: 10.0, color: [1.0, 0.0, 0.0, 1.0], aa: 1.0 };
        canvas.push_quads(&[c.lower()]);
        let frame = canvas.rasterize();
        // Four bytes per pixel.
        assert_eq!(frame.rgba.len(), 64 * 64 * 4);
        // Centre pixel is red.
        let i = ((32 * 64 + 32) * 4) as usize;
        assert!(frame.rgba[i] > 200 && frame.rgba[i + 1] < 50, "centre is red");
        // A far corner is still background (black).
        let c0 = 0;
        assert!(frame.rgba[c0] < 10, "corner stays background");
    }
599
    /// Drives the renderer only through the `Renderer`/`Canvas` seam
    /// (`begin` → `push_quads` → `present`) and checks a green ring's band and hole.
    #[test]
    fn cpu_renderer_round_trips_through_the_seam() {
        let mut r = CpuRenderer::new([0, 0, 0, 255]);
        let canvas = r.begin(48, 48, Camera::default());
        let ring = RingInstance { center: [24.0, 24.0], radius: 12.0, inner: 6.0, color: [0.0, 1.0, 0.0, 1.0], aa: 1.0 };
        canvas.push_quads(&[ring.lower()]);
        assert_eq!(r.backend(), Backend::CpuVello);
        let frame = r.present();
        // The ring band (≈9px out) is green; the hole centre is background.
        let band = (((24) * 48 + (24 + 9)) * 4) as usize;
        assert!(frame.rgba[band + 1] > 200, "ring band green");
        let hole = ((24 * 48 + 24) * 4) as usize;
        assert!(frame.rgba[hole + 1] < 50, "ring hole is background");
    }
614}