facett_core/render/cpu/mod.rs
1//! **The CPU render lane** (L0 fallback).
2//!
3//! - [`scissor`] — the rect scissor geometry (moved from map3d in Phase A).
4//! - [`sdf`] — the CPU SDF coverage + thick-AA-line raster math (the source of
5//! truth the GPU `sdf.wgsl`/`line.wgsl` mirror).
6//! - [`CpuCanvas`] — collects [`QuadInstance`]/[`LineInstance`] batches and
7//! rasterizes them onto a **`vello_cpu` [`Pixmap`]** via the [`sdf`] coverage
8//! math, producing a straight-RGBA8 frame. It implements the L0
9//! [`Canvas`](super::Canvas) seam; [`CpuRenderer`] implements
10//! [`Renderer`](super::Renderer).
11
12pub mod scissor;
13pub mod sdf;
14
15pub use scissor::{clip_poly_to_rect, ink_outside_rect};
16
17use vello_cpu::color::PremulRgba8;
18use vello_cpu::Pixmap;
19
20use super::camera::Camera;
21use super::prim::{LineInstance, QuadInstance};
22use super::{Backend, Canvas, Frame, Renderer};
23
24/// A CPU canvas: an off-screen [`Pixmap`] (the vello_cpu raster target, per the
25/// CONS-CORE spec) plus the batched SDF instances drawn onto it. The host pushes
26/// quads + lines, then [`CpuCanvas::rasterize`] evaluates the [`sdf`] coverage for
27/// every instance over its bounding box and alpha-composites it into the pixmap.
28///
29/// Coverage math is **byte-for-byte** the same the GPU lane runs, so a CPU frame
30/// matches a GPU frame (the `sdf_primitives` parity test pins this).
31pub struct CpuCanvas {
32 pixmap: Pixmap,
33 width: u32,
34 height: u32,
35 camera: Camera,
36 /// Premultiplied background; painted per-row inside the parallel raster.
37 background: PremulRgba8,
38 quads: Vec<QuadInstance>,
39 lines: Vec<LineInstance>,
40}
41
42impl CpuCanvas {
43 /// A fresh `width × height` canvas cleared to `background` (straight RGBA8),
44 /// under `camera`.
45 ///
46 /// **No serial clear here:** the background is no longer painted up-front in a
47 /// single-threaded `for` over 1M pixels (that was a pure Amdahl serial tail).
48 /// Instead [`raster_batches`] fills each row's background **inside** the gatling
49 /// scanline kernel — the workers that own a row clear it before compositing, so
50 /// the clear scales across all cores with zero extra alloc.
51 pub fn new(width: u32, height: u32, camera: Camera, background: [u8; 4]) -> Self {
52 let pixmap = Pixmap::new(width as u16, height as u16);
53 Self {
54 pixmap,
55 width,
56 height,
57 camera,
58 background: premul(background),
59 quads: Vec::new(),
60 lines: Vec::new(),
61 }
62 }
63
64 /// The raster target — the `vello_cpu` pixmap (spec §2: "CpuCanvas → vello_cpu
65 /// pixmap"). Exposed so a host can hand its glyph/curve overlay (L1 vello) the
66 /// same target later.
67 pub fn pixmap(&self) -> &Pixmap {
68 &self.pixmap
69 }
70
71 /// Rasterize every batched instance onto the pixmap (lines under quads, the
72 /// graph convention: chips draw on top of edges). Returns the straight
73 /// (un-premultiplied) RGBA8 [`Frame`].
74 ///
75 /// **GATLING multicore (LAW 2):** the raster fans across all cores by **scanline
76 /// row** — each row is owned by exactly one thread, so the blends never contend,
77 /// and the per-pixel compositing order (all lines in order, then all quads in
78 /// order) is preserved bit-for-bit vs the sequential path (the
79 /// `parallel_raster_matches_sequential` test pins this). Below a pixel-work
80 /// threshold the frame stays single-threaded (zero pool overhead on small draws).
81 pub fn rasterize(&mut self) -> Frame {
82 self.rasterize_with_workers(0)
83 }
84
85 /// [`rasterize`](Self::rasterize) with an explicit gatling worker count: `0` ⇒
86 /// one per core (the production path), `1` ⇒ the forced single-threaded path.
87 /// Exposed so a bench can time the **same** parallel region at 1 vs N workers and
88 /// report the real cores-busy / speedup (rather than guessing from a sweep
89 /// average). The output is identical for any worker count (the
90 /// `parallel_raster_matches_sequential` test pins bit-identity at N vs 1).
91 pub fn rasterize_with_workers(&mut self, workers: usize) -> Frame {
92 let lines = std::mem::take(&mut self.lines);
93 let quads = std::mem::take(&mut self.quads);
94 self.raster_batches(&lines, &quads, workers);
95 self.frame_with_workers(workers)
96 }
97
98 /// Run **only** the compositing raster (bg clear + y-bucketed SDF blend) at
99 /// `workers` gatling workers — the region the GATLING scanline kernel governs,
100 /// without the memory-bandwidth-bound un-premultiply ([`frame`](Self::frame)).
101 /// Exposed for the scaling bench so cores-busy can be measured on the part that
102 /// is CPU-bound, separate from the RAM-bandwidth-capped pixmap→RGBA8 conversion.
103 pub fn raster_only(&mut self, workers: usize) {
104 let lines = std::mem::take(&mut self.lines);
105 let quads = std::mem::take(&mut self.quads);
106 self.raster_batches(&lines, &quads, workers);
107 }
108
109 /// The pixel-work threshold above which the raster goes multicore. `width *
110 /// height` (the full frame) is a cheap upper bound on the work; tiny frames
111 /// (tooltips, sparklines) skip the gatling fan-out entirely.
112 const PARALLEL_PX_THRESHOLD: usize = 64 * 1024; // 256×256
113
114 /// Should this `w × h` frame fan across cores? (cheap upper bound on work.)
115 #[inline]
116 fn parallel_frame(w: u32, h: u32) -> bool {
117 (w as usize * h as usize) >= Self::PARALLEL_PX_THRESHOLD
118 }
119
120 /// Raster `lines` (under) then `quads` (over) onto the pixmap, row-parallel via
121 /// znippy's fork-join GATLING (rayon is forbidden). Each worker self-dispatches
122 /// whole scanlines and writes into that row's **disjoint** pixel slice — no
123 /// barrier, no wait-for-a-core, zero-copy (row index hand-off), zero-alloc in
124 /// the hot loop. Below the pixel-work threshold it stays single-threaded.
125 ///
126 /// Two scaling lifts live entirely inside this parallel region (so they raise the
127 /// per-core ceiling, they don't add serial tail):
128 /// - **Per-row background clear.** The worker that owns row `y` paints the bg into
129 /// that row before compositing — the 1M-pixel clear is now N-way parallel, not a
130 /// serial `for` in `new`.
131 /// - **Y-bucket reject.** A [`YBuckets`] index, built once per frame, lists only
132 /// the primitives whose vertical span covers each row. `raster_row` then visits
133 /// `O(prims_touching_row)` instead of `O(all_prims)` for every one of `h` rows
134 /// (was `rows × n` rejects per frame — 200k quads × 1024 rows = 205M wasted
135 /// span-tests; now ~one test per real touch).
136 fn raster_batches(&mut self, lines: &[LineInstance], quads: &[QuadInstance], workers: usize) {
137 let w = self.width;
138 let h = self.height;
139 if w == 0 || h == 0 {
140 return;
141 }
142 let bg = self.background;
143 // The empty case still must clear to bg (begin→present with no draws).
144 let buckets = YBuckets::build(lines, quads, h);
145 // Only fan out once the frame is big enough that thread hand-off pays off.
146 let min_rows = if Self::parallel_frame(w, h) { 1 } else { usize::MAX };
147 let data = self.pixmap.data_mut();
148 znippy_zoomies::gatling_forkjoin::gatling_scanlines(
149 data,
150 h as usize,
151 w as usize,
152 workers, // 0 ⇒ one worker per core
153 min_rows,
154 |y, row| {
155 // Clear this row to background, then composite only the primitives
156 // whose span covers it (both inside the parallel region).
157 for px in row.iter_mut() {
158 *px = bg;
159 }
160 let (li, qi) = buckets.row(y);
161 raster_row(row, y as u32, w, lines, quads, li, qi);
162 },
163 );
164 }
165
166 /// Snapshot the current pixmap as a straight-RGBA8 [`Frame`] (un-premultiplied)
167 /// without consuming the canvas.
168 ///
169 /// **Parallel un-premultiply:** the per-pixel un-premultiply is embarrassingly
170 /// parallel, so it runs through the same gatling scanline kernel into a
171 /// pre-allocated output buffer (zero per-pixel alloc) instead of the old
172 /// single-threaded `flat_map().collect()` over the whole framebuffer — that
173 /// `collect` was the second half of the Amdahl serial tail.
174 pub fn frame(&self) -> Frame {
175 self.frame_with_workers(0)
176 }
177
178 /// [`frame`](Self::frame) with an explicit worker count (see
179 /// [`rasterize_with_workers`](Self::rasterize_with_workers)).
180 pub fn frame_with_workers(&self, workers: usize) -> Frame {
181 let w = self.width as usize;
182 let h = self.height as usize;
183 let len = w * h * 4;
184 if w == 0 || h == 0 {
185 return Frame { width: self.width, height: self.height, rgba: Vec::new() };
186 }
187 // Allocate the output **without zeroing** (no `vec![0u8; 67MB]`): the gatling
188 // pass below writes every byte of every row (4 bytes × w per pixel, all h
189 // rows), so the buffer is fully initialized before we ever read it. Zeroing
190 // 67 MB only to overwrite it was a memory-bound serial tail that did not
191 // scale — skipping it lifts the un-premultiply's per-core ceiling.
192 let mut rgba: Vec<u8> = Vec::with_capacity(len);
193 let src = self.pixmap.data();
194 let min_rows = if Self::parallel_frame(self.width, self.height) { 1 } else { usize::MAX };
195 {
196 // SAFETY: `spare` is `len` uninitialized bytes; the kernel writes all of
197 // them (each row's `w*4` bytes are fully assigned in `unpremul_into`), so
198 // every byte is initialized before `set_len(len)` exposes them.
199 let spare = rgba.spare_capacity_mut();
200 // View the MaybeUninit slice as raw bytes for the byte-grid kernel — the
201 // worker assigns (never reads) each byte, so this is sound.
202 let buf = unsafe {
203 std::slice::from_raw_parts_mut(spare.as_mut_ptr() as *mut u8, len)
204 };
205 // Output stride is `w * 4` bytes per row; each row maps src[y*w..] → dst.
206 znippy_zoomies::gatling_forkjoin::gatling_scanlines(
207 buf,
208 h,
209 w * 4,
210 workers,
211 min_rows,
212 |y, out_row| {
213 let src_row = &src[y * w..(y + 1) * w];
214 for (p, o) in src_row.iter().zip(out_row.chunks_exact_mut(4)) {
215 unpremul_into(p, o);
216 }
217 },
218 );
219 }
220 // SAFETY: all `len` bytes were written by the kernel above.
221 unsafe { rgba.set_len(len) };
222 Frame { width: self.width, height: self.height, rgba }
223 }
224}
225
226/// Un-premultiply one [`PremulRgba8`] into a straight `[r,g,b,a]` output slice.
227#[inline]
228fn unpremul_into(p: &PremulRgba8, o: &mut [u8]) {
229 let a = p.a;
230 if a == 0 {
231 o[0] = 0;
232 o[1] = 0;
233 o[2] = 0;
234 o[3] = 0;
235 } else {
236 let un = |c: u8| ((c as u32 * 255 + (a as u32) / 2) / a as u32).min(255) as u8;
237 o[0] = un(p.r);
238 o[1] = un(p.g);
239 o[2] = un(p.b);
240 o[3] = a;
241 }
242}
243
/// Per-scanline primitive index, rebuilt for every raster pass.
///
/// For each row `y` it records which lines and which quads have a vertical span
/// covering that row, so `raster_row` only visits those primitives instead of
/// every queued one.
///
/// The layout is CSR-style: `*_idx` holds the per-row index lists back to back,
/// and `*_off` holds `h + 1` offsets into it, so row `y` owns
/// `idx[off[y]..off[y + 1]]`. The index is built by a count pass followed by a
/// fill pass, before the per-row raster closure runs.
struct YBuckets {
    /// Concatenated per-row lists of indices into the `lines` slice.
    line_idx: Vec<u32>,
    /// `h + 1` row offsets into `line_idx`.
    line_off: Vec<u32>,
    /// Concatenated per-row lists of indices into the `quads` slice.
    quad_idx: Vec<u32>,
    /// `h + 1` row offsets into `quad_idx`.
    quad_off: Vec<u32>,
}
259
260impl YBuckets {
261 /// The `[y0, y1)` integer row span a primitive's pixel-`y` range `[mny, mxy]`
262 /// can cover, clamped to `[0, h)`. Mirrors the per-row test in `raster_row`
263 /// (`py = y + 0.5` must lie in `[mny, mxy]`), so a row is included iff it would
264 /// pass that test — keeping the composite bit-identical.
265 #[inline]
266 fn span(mny: f32, mxy: f32, h: u32) -> (u32, u32) {
267 // py = y+0.5 ∈ [mny, mxy] ⇒ y ∈ [mny-0.5, mxy-0.5]. Clamp to [0, h); an
268 // off-screen primitive yields an empty (y0 >= y1) range and is skipped by
269 // the `y0..y1` loop with no special-case branch. The `.max(0.0)` before the
270 // `as u32` cast also tames NaN/negative inputs (NaN → 0).
271 let y0 = ((mny - 0.5).ceil().max(0.0) as u32).min(h);
272 let y1 = (((mxy - 0.5).floor() + 1.0).max(0.0) as u32).min(h); // exclusive
273 (y0, y1.max(y0))
274 }
275
276 fn build(lines: &[LineInstance], quads: &[QuadInstance], h: u32) -> Self {
277 let hu = h as usize;
278 let mut line_off = vec![0u32; hu + 1];
279 let mut quad_off = vec![0u32; hu + 1];
280 // Pass 1: per-row counts (stored shifted by +1 for the prefix sum).
281 for l in lines {
282 let (_, mny, _, mxy) = l.bounds();
283 let (y0, y1) = Self::span(mny, mxy, h);
284 for y in y0..y1 {
285 line_off[y as usize + 1] += 1;
286 }
287 }
288 for q in quads {
289 let cy = q.center[1];
290 let he = q.half_extent();
291 let (y0, y1) = Self::span(cy - he, cy + he, h);
292 for y in y0..y1 {
293 quad_off[y as usize + 1] += 1;
294 }
295 }
296 // Prefix-sum the counts into offsets.
297 for y in 0..hu {
298 line_off[y + 1] += line_off[y];
299 quad_off[y + 1] += quad_off[y];
300 }
301 let mut line_idx = vec![0u32; line_off[hu] as usize];
302 let mut quad_idx = vec![0u32; quad_off[hu] as usize];
303 // Pass 2: scatter primitive indices into each covered row's slot.
304 let mut cursor = line_off.clone();
305 for (i, l) in lines.iter().enumerate() {
306 let (_, mny, _, mxy) = l.bounds();
307 let (y0, y1) = Self::span(mny, mxy, h);
308 for y in y0..y1 {
309 let slot = &mut cursor[y as usize];
310 line_idx[*slot as usize] = i as u32;
311 *slot += 1;
312 }
313 }
314 let mut cursor = quad_off.clone();
315 for (i, q) in quads.iter().enumerate() {
316 let cy = q.center[1];
317 let he = q.half_extent();
318 let (y0, y1) = Self::span(cy - he, cy + he, h);
319 for y in y0..y1 {
320 let slot = &mut cursor[y as usize];
321 quad_idx[*slot as usize] = i as u32;
322 *slot += 1;
323 }
324 }
325 Self { line_idx, line_off, quad_idx, quad_off }
326 }
327
328 /// The `(line_indices, quad_indices)` that cover row `y`, **in original push
329 /// order** (the scatter preserves it), so the composite order is unchanged.
330 #[inline]
331 fn row(&self, y: usize) -> (&[u32], &[u32]) {
332 let l0 = self.line_off[y] as usize;
333 let l1 = self.line_off[y + 1] as usize;
334 let q0 = self.quad_off[y] as usize;
335 let q1 = self.quad_off[y + 1] as usize;
336 (&self.line_idx[l0..l1], &self.quad_idx[q0..q1])
337 }
338}
339
340/// Raster all `lines` (under) then all `quads` (over) onto a **single scanline**
341/// `row` (the `w` pixels at image row `y`). This is the parallel unit: a thread
342/// owns one row, so blends never contend, and replaying the same instance order
343/// per row keeps the composite bit-identical to the sequential path.
344fn raster_row(
345 row: &mut [PremulRgba8],
346 y: u32,
347 w: u32,
348 lines: &[LineInstance],
349 quads: &[QuadInstance],
350 line_idx: &[u32],
351 quad_idx: &[u32],
352) {
353 let py = y as f32 + 0.5;
354 // Lines first (edges under chips). `line_idx` already lists only the lines whose
355 // span covers this row, in push order — the per-row vertical reject is gone.
356 for &i in line_idx {
357 let l = &lines[i as usize];
358 let (mnx, _, mxx, _) = l.bounds();
359 let x0 = (mnx.floor()).max(0.0) as u32;
360 let x1 = (mxx.ceil()).min(w as f32) as u32;
361 for x in x0..x1 {
362 let cov = sdf::line_coverage(l, [x as f32 + 0.5, py]);
363 if cov > 0.0 {
364 blend_px(&mut row[x as usize], l.color, cov);
365 }
366 }
367 }
368 // Quads over (only those whose span covers this row, in push order).
369 for &i in quad_idx {
370 let q = &quads[i as usize];
371 let he = q.half_extent();
372 let (cx, cy) = (q.center[0], q.center[1]);
373 let x0 = ((cx - he).floor()).max(0.0) as u32;
374 let x1 = ((cx + he).ceil()).min(w as f32) as u32;
375 for x in x0..x1 {
376 let dx = x as f32 + 0.5 - cx;
377 let dy = py - cy;
378 let cov = sdf::quad_coverage(q, dx, dy);
379 if cov > 0.0 {
380 blend_px(&mut row[x as usize], q.color, cov);
381 }
382 }
383 }
384}
385
386/// Source-over alpha-composite a straight `[r,g,b,a] ∈ [0,1]` colour scaled by
387/// `coverage` onto one premultiplied destination pixel.
388#[inline]
389fn blend_px(dst: &mut PremulRgba8, color: [f32; 4], coverage: f32) {
390 let sa = (color[3] * coverage).clamp(0.0, 1.0);
391 // Premultiplied source.
392 let sr = color[0] * sa;
393 let sg = color[1] * sa;
394 let sb = color[2] * sa;
395 let da = dst.a as f32 / 255.0;
396 let dr = dst.r as f32 / 255.0;
397 let dg = dst.g as f32 / 255.0;
398 let db = dst.b as f32 / 255.0;
399 let inv = 1.0 - sa;
400 *dst = PremulRgba8 {
401 r: ((sr + dr * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
402 g: ((sg + dg * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
403 b: ((sb + db * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
404 a: ((sa + da * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
405 };
406}
407
408/// Premultiply a straight `[u8;4]` into a [`PremulRgba8`].
409fn premul(c: [u8; 4]) -> PremulRgba8 {
410 let a = c[3] as u32;
411 let m = |v: u8| ((v as u32 * a + 127) / 255) as u8;
412 PremulRgba8 { r: m(c[0]), g: m(c[1]), b: m(c[2]), a: c[3] }
413}
414
415impl Canvas for CpuCanvas {
416 fn push_quads(&mut self, quads: &[QuadInstance]) {
417 self.quads.extend_from_slice(quads);
418 }
419 fn push_lines(&mut self, lines: &[LineInstance]) {
420 self.lines.extend_from_slice(lines);
421 }
422 fn camera(&self) -> &Camera {
423 &self.camera
424 }
425}
426
427/// The CPU [`Renderer`] — `begin` opens a [`CpuCanvas`] sized to the rect, `present`
428/// rasterizes it to a [`Frame`]. Headless / device / CI; always available (no GPU).
429pub struct CpuRenderer {
430 background: [u8; 4],
431 canvas: Option<CpuCanvas>,
432}
433
434impl CpuRenderer {
435 pub fn new(background: [u8; 4]) -> Self {
436 Self { background, canvas: None }
437 }
438}
439
440impl Default for CpuRenderer {
441 fn default() -> Self {
442 Self::new([12, 14, 20, 255])
443 }
444}
445
446impl Renderer for CpuRenderer {
447 fn begin(&mut self, width: u32, height: u32, camera: Camera) -> &mut dyn Canvas {
448 self.canvas = Some(CpuCanvas::new(width, height, camera, self.background));
449 self.canvas.as_mut().unwrap()
450 }
451 fn present(&mut self) -> Frame {
452 let frame = self.canvas.take().map(|mut c| c.rasterize()).unwrap_or(Frame { width: 0, height: 0, rgba: Vec::new() });
453 // ── L0 CPU render lane emit ───────────────────────────────────────────
454 // Records that the CPU vello lane ran AND a real output invariant: the
455 // frame has the requested dimensions and drew lit pixels. A blank frame
456 // (lit_px == 0) on a non-empty canvas is a RED row. Unique component
457 // string so the inventory test can assert this exact lane produced a row.
458 #[cfg(feature = "testmatrix")]
459 {
460 let lit = frame.lit_px();
461 let total = (frame.width as usize) * (frame.height as usize);
462 // OK when the surface has area and produced at least the background
463 // composite (lit_px > 0 once a frame of non-zero size is rasterized).
464 let ok = total == 0 || lit > 0;
465 crate::testmatrix::emit(
466 "facett-core::render::CpuRenderer::present",
467 "cpu_render",
468 ok && frame.rgba.len() == total * 4,
469 &format!("backend=CpuVello w={} h={} lit_px={lit} total_px={total}", frame.width, frame.height),
470 );
471 }
472 frame
473 }
474 fn backend(&self) -> Backend {
475 Backend::CpuVello
476 }
477}
478
#[cfg(test)]
impl CpuCanvas {
    /// Single-threaded reference raster: a plain loop over the rows that bypasses
    /// the scanline kernel. The `parallel_raster_matches_sequential` test compares
    /// the production path against it.
    fn rasterize_sequential(&mut self) -> Frame {
        let lines = std::mem::take(&mut self.lines);
        let quads = std::mem::take(&mut self.quads);
        let (w, h, bg) = (self.width, self.height, self.background);
        let buckets = YBuckets::build(&lines, &quads, h);
        let stride = w as usize;
        let data = self.pixmap.data_mut();
        for y in 0..h {
            let start = y as usize * stride;
            let row = &mut data[start..start + stride];
            // Same per-row order as the production closure: clear, then composite.
            row.fill(bg);
            let (li, qi) = buckets.row(y as usize);
            raster_row(row, y, w, &lines, &quads, li, qi);
        }
        self.frame()
    }
}
503
#[cfg(test)]
mod tests {
    use super::*;
    use crate::render::prim::{shape, CircleInstance, LineInstance, MarkerInstance, RingInstance};

    /// The production (kernel-driven) raster must produce exactly the same bytes
    /// as the single-threaded reference, on a frame above the fan-out threshold
    /// and with heavily overlapping translucent instances so that per-pixel blend
    /// order affects the result.
    #[test]
    fn parallel_raster_matches_sequential() {
        // 512×512 is above PARALLEL_PX_THRESHOLD.
        let (w, h) = (512u32, 512u32);
        let build_canvas = || {
            let mut canvas = CpuCanvas::new(w, h, Camera::default(), [9, 11, 16, 255]);
            let mut quads = Vec::new();
            let mut lines = Vec::new();
            for i in 0..400u32 {
                let x = (i * 37 % 500 + 6) as f32;
                let y = (i * 53 % 500 + 6) as f32;
                let col = [(i % 7) as f32 / 7.0, (i % 5) as f32 / 5.0, (i % 3) as f32 / 3.0, 0.7];
                // Cycle through the three quad-lowered shapes.
                let quad = match i % 3 {
                    0 => CircleInstance { center: [x, y], radius: 14.0, color: col, aa: 1.5 }.lower(),
                    1 => RingInstance { center: [x, y], radius: 16.0, inner: 8.0, color: col, aa: 1.5 }.lower(),
                    _ => MarkerInstance { center: [x, y], radius: 12.0, corner: 2.0, color: col, aa: 1.0, shape: shape::DIAMOND }.lower(),
                };
                quads.push(quad);
                lines.push(LineInstance::round([x, y], [x + 40.0, y + 25.0], 3.0, 1.5, [col[0], col[1], col[2], 0.6]));
            }
            canvas.push_lines(&lines);
            canvas.push_quads(&quads);
            canvas
        };

        let parallel = build_canvas().rasterize();
        let sequential = build_canvas().rasterize_sequential();
        assert_eq!(parallel.width, sequential.width);
        assert_eq!(parallel.height, sequential.height);
        assert_eq!(parallel.rgba.len(), sequential.rgba.len());
        assert_eq!(parallel.rgba, sequential.rgba, "GATLING parallel raster is bit-identical to sequential");
        // Guard against a trivially equal pair of blank frames.
        assert!(parallel.lit_px() > 50_000, "real content rastered, got {}", parallel.lit_px());
    }

    /// The y-bucketed raster must equal a brute-force raster that offers every
    /// primitive to every row. Some primitives straddle the top and bottom frame
    /// edges so the span clamp is exercised too.
    #[test]
    fn ybucket_raster_matches_brute_force_all_primitives() {
        let (w, h) = (300u32, 200u32);
        let mut quads = Vec::new();
        let mut lines = Vec::new();
        for i in 0..120u32 {
            let x = (i * 41 % 290 + 4) as f32;
            // Centres run from above the top edge to well below the bottom edge.
            let y = (i as f32 * 7.3) - 30.0;
            let col = [(i % 7) as f32 / 7.0, (i % 4) as f32 / 4.0, (i % 3) as f32 / 3.0, 0.65];
            quads.push(CircleInstance { center: [x, y], radius: 12.0, color: col, aa: 1.5 }.lower());
            lines.push(LineInstance::round([x, y], [x + 30.0, y + 50.0], 3.0, 1.5, [col[0], col[1], col[2], 0.5]));
        }

        // Production path (bucketed).
        let mut canvas = CpuCanvas::new(w, h, Camera::default(), [10, 12, 18, 255]);
        canvas.push_lines(&lines);
        canvas.push_quads(&quads);
        let bucketed = canvas.rasterize();

        // Reference: clear each row, then hand raster_row the full index lists
        // `[0, 1, 2, …]` so no primitive is skipped on any row.
        let bg = premul([10, 12, 18, 255]);
        let mut pm = Pixmap::new(w as u16, h as u16);
        let all_l: Vec<u32> = (0..lines.len() as u32).collect();
        let all_q: Vec<u32> = (0..quads.len() as u32).collect();
        let data = pm.data_mut();
        for (y, row) in data.chunks_exact_mut(w as usize).enumerate() {
            row.fill(bg);
            raster_row(row, y as u32, w, &lines, &quads, &all_l, &all_q);
        }
        let brute: Vec<u8> = data
            .iter()
            .flat_map(|p| {
                let mut o = [0u8; 4];
                unpremul_into(p, &mut o);
                o
            })
            .collect();

        assert_eq!(bucketed.rgba.len(), brute.len());
        assert_eq!(bucketed.rgba, brute, "y-bucketed raster == brute-force-all-primitives");
        assert!(bucketed.lit_px() > 1_000, "real content drawn, got {}", bucketed.lit_px());
    }

    /// `blend_px` follows the source-over equation on premultiplied pixels.
    #[test]
    fn blend_px_is_source_over_correct() {
        // Opaque red over opaque black at full coverage → red.
        let mut dst = premul([0, 0, 0, 255]);
        blend_px(&mut dst, [1.0, 0.0, 0.0, 1.0], 1.0);
        assert_eq!((dst.r, dst.g, dst.b, dst.a), (255, 0, 0, 255), "opaque over → source");

        // Zero coverage leaves the destination untouched.
        let mut dst2 = premul([10, 20, 30, 255]);
        let before = (dst2.r, dst2.g, dst2.b, dst2.a);
        blend_px(&mut dst2, [1.0, 1.0, 1.0, 1.0], 0.0);
        assert_eq!((dst2.r, dst2.g, dst2.b, dst2.a), before, "zero coverage = no-op");

        // 50% white over black: out.r = 1*0.5 + 0*(1-0.5) = 0.5 → ~128, and
        // alpha = 0.5 + 1*0.5 = 1 → 255.
        let mut dst3 = premul([0, 0, 0, 255]);
        blend_px(&mut dst3, [1.0, 1.0, 1.0, 1.0], 0.5);
        assert!((dst3.r as i32 - 128).abs() <= 1, "half coverage white = ~128, got {}", dst3.r);
        assert_eq!(dst3.a, 255, "over an opaque dst stays opaque");

        // A second half-cover on top must lighten the pixel further.
        let mut stacked = premul([0, 0, 0, 255]);
        blend_px(&mut stacked, [1.0, 1.0, 1.0, 1.0], 0.5);
        blend_px(&mut stacked, [1.0, 1.0, 1.0, 1.0], 0.5);
        assert!(stacked.r > dst3.r, "stacking two half-covers lightens further (build-up)");
    }

    /// `premul` followed by `unpremul_into` returns an opaque colour to within one
    /// step per channel, and a zero-alpha pixel un-premultiplies to transparent
    /// black.
    #[test]
    fn premul_unpremul_round_trip() {
        for &c in &[[255u8, 0, 0, 255], [10, 200, 30, 255], [123, 45, 67, 255]] {
            let p = premul(c);
            let mut out = [0u8; 4];
            unpremul_into(&p, &mut out);
            for k in 0..4 {
                assert!((out[k] as i32 - c[k] as i32).abs() <= 1, "round-trip {c:?} -> {out:?} at {k}");
            }
        }
        // Transparent → all zero, overwriting whatever was in the output.
        let mut out = [9u8; 4];
        unpremul_into(&PremulRgba8 { r: 0, g: 0, b: 0, a: 0 }, &mut out);
        assert_eq!(out, [0, 0, 0, 0], "alpha 0 unpremuls to transparent black");
    }

    /// `YBuckets::span` covers exactly the rows whose centre `y + 0.5` lies in the
    /// primitive's `[mny, mxy]` band and clamps off-screen spans to `[0, h)`; the
    /// built index then lists, per row, only the primitives covering it.
    #[test]
    fn ybuckets_span_covers_exactly_the_touched_rows() {
        let h = 100u32;
        // Band y∈[20.3, 40.8]: row centres 20.5..=40.5 are inside, so rows
        // 20..=40, i.e. y0 = 20 and y1 = 41 (exclusive).
        let (y0, y1) = YBuckets::span(20.3, 40.8, h);
        assert_eq!((y0, y1), (20, 41), "span covers the centred rows exactly");
        for y in y0..y1 {
            let py = y as f32 + 0.5;
            assert!((20.3..=40.8).contains(&py), "row {y} centre {py} is inside the span");
        }
        // The rows immediately outside the range have centres outside the band.
        assert!(y0 == 0 || (y0 - 1) as f32 + 0.5 < 20.3, "row below y0 is excluded");
        assert!(y1 == h || y1 as f32 + 0.5 > 40.8, "row at y1 is excluded");

        // Entirely above the frame → empty range.
        let (a0, a1) = YBuckets::span(-50.0, -10.0, h);
        assert!(a0 >= a1, "above-frame span is empty");
        // Straddling the top → y0 clamps to 0.
        let (b0, _b1) = YBuckets::span(-5.0, 10.0, h);
        assert_eq!(b0, 0, "straddling the top clamps to row 0");
        // Running past the bottom → y1 clamps to h.
        let (_c0, c1) = YBuckets::span(90.0, 200.0, h);
        assert_eq!(c1, h, "past the bottom clamps to h");

        // Two well-separated quads: each row lists only the quad covering it.
        let q_top = CircleInstance { center: [10.0, 10.0], radius: 5.0, color: [1.0; 4], aa: 0.0 }.lower();
        let q_bot = CircleInstance { center: [10.0, 80.0], radius: 5.0, color: [1.0; 4], aa: 0.0 }.lower();
        let buckets = YBuckets::build(&[], &[q_top, q_bot], h);
        let (_, qrow10) = buckets.row(10); // covered by q_top only
        assert_eq!(qrow10, &[0u32], "row 10 lists only the top quad");
        let (_, qrow80) = buckets.row(80); // covered by q_bot only
        assert_eq!(qrow80, &[1u32], "row 80 lists only the bottom quad");
        let (_, qrow50) = buckets.row(50); // covered by neither
        assert!(qrow50.is_empty(), "row 50 lists no quads");
    }

    /// A red circle on a black canvas lights its centre and leaves a far corner
    /// at the background colour.
    #[test]
    fn cpu_canvas_lights_pixels_inside_a_circle() {
        let mut canvas = CpuCanvas::new(64, 64, Camera::default(), [0, 0, 0, 255]);
        let circle = CircleInstance { center: [32.0, 32.0], radius: 10.0, color: [1.0, 0.0, 0.0, 1.0], aa: 1.0 };
        canvas.push_quads(&[circle.lower()]);
        let frame = canvas.rasterize();
        assert_eq!(frame.rgba.len(), 64 * 64 * 4);
        // Centre pixel is red.
        let centre = ((32 * 64 + 32) * 4) as usize;
        assert!(frame.rgba[centre] > 200 && frame.rgba[centre + 1] < 50, "centre is red");
        // The top-left pixel is still background (black).
        assert!(frame.rgba[0] < 10, "corner stays background");
    }

    /// Drawing through the `Renderer`/`Canvas` seam: a green ring shows green in
    /// its band and background in its hole.
    #[test]
    fn cpu_renderer_round_trips_through_the_seam() {
        let mut r = CpuRenderer::new([0, 0, 0, 255]);
        let canvas = r.begin(48, 48, Camera::default());
        let ring = RingInstance { center: [24.0, 24.0], radius: 12.0, inner: 6.0, color: [0.0, 1.0, 0.0, 1.0], aa: 1.0 };
        canvas.push_quads(&[ring.lower()]);
        assert_eq!(r.backend(), Backend::CpuVello);
        let frame = r.present();
        // The ring band (≈9px out from the centre) is green; the hole is background.
        let band = (((24) * 48 + (24 + 9)) * 4) as usize;
        assert!(frame.rgba[band + 1] > 200, "ring band green");
        let hole = ((24 * 48 + 24) * 4) as usize;
        assert!(frame.rgba[hole + 1] < 50, "ring hole is background");
    }
}
727}