facett_core/render/cpu/mod.rs
1//! **The CPU render lane** (L0 fallback).
2//!
3//! - [`scissor`] — the rect scissor geometry (moved from map3d in Phase A).
4//! - [`sdf`] — the CPU SDF coverage + thick-AA-line raster math (the source of
5//! truth the GPU `sdf.wgsl`/`line.wgsl` mirror).
6//! - [`CpuCanvas`] — collects [`QuadInstance`]/[`LineInstance`] batches and
7//! rasterizes them onto a **`vello_cpu` [`Pixmap`]** via the [`sdf`] coverage
8//! math, producing a straight-RGBA8 frame. It implements the L0
9//! [`Canvas`](super::Canvas) seam; [`CpuRenderer`] implements
10//! [`Renderer`](super::Renderer).
11
12pub mod scissor;
13pub mod sdf;
14
15pub use scissor::{clip_poly_to_rect, ink_outside_rect};
16
17use vello_cpu::color::PremulRgba8;
18use vello_cpu::Pixmap;
19
20use super::camera::Camera;
21use super::prim::{LineInstance, QuadInstance};
22use super::{Backend, Canvas, Frame, Renderer};
23
/// A CPU canvas: an off-screen [`Pixmap`] (the vello_cpu raster target, per the
/// CONS-CORE spec) plus the batched SDF instances drawn onto it. The host pushes
/// quads + lines, then [`CpuCanvas::rasterize`] evaluates the [`sdf`] coverage for
/// every instance over its bounding box and alpha-composites it into the pixmap.
///
/// Coverage math is **byte-for-byte** the same as the GPU lane runs, so a CPU frame
/// matches a GPU frame (the `sdf_primitives` parity test pins this).
pub struct CpuCanvas {
    /// The raster target every batch is composited into.
    pixmap: Pixmap,
    /// Frame width in pixels (the length of one scanline).
    width: u32,
    /// Frame height in pixels (the number of scanlines).
    height: u32,
    /// The camera handed back through [`Canvas::camera`](super::Canvas::camera).
    camera: Camera,
    /// Premultiplied background; painted per-row inside the parallel raster.
    background: PremulRgba8,
    /// Pending quad batch; drained by the next raster and drawn over the lines.
    quads: Vec<QuadInstance>,
    /// Pending line batch; drained by the next raster and drawn under the quads.
    lines: Vec<LineInstance>,
}
41
42impl CpuCanvas {
43 /// A fresh `width × height` canvas cleared to `background` (straight RGBA8),
44 /// under `camera`.
45 ///
46 /// **No serial clear here:** the background is no longer painted up-front in a
47 /// single-threaded `for` over 1M pixels (that was a pure Amdahl serial tail).
48 /// Instead [`raster_batches`] fills each row's background **inside** the gatling
49 /// scanline kernel — the workers that own a row clear it before compositing, so
50 /// the clear scales across all cores with zero extra alloc.
51 pub fn new(width: u32, height: u32, camera: Camera, background: [u8; 4]) -> Self {
52 let pixmap = Pixmap::new(width as u16, height as u16);
53 Self {
54 pixmap,
55 width,
56 height,
57 camera,
58 background: premul(background),
59 quads: Vec::new(),
60 lines: Vec::new(),
61 }
62 }
63
64 /// The raster target — the `vello_cpu` pixmap (spec §2: "CpuCanvas → vello_cpu
65 /// pixmap"). Exposed so a host can hand its glyph/curve overlay (L1 vello) the
66 /// same target later.
67 pub fn pixmap(&self) -> &Pixmap {
68 &self.pixmap
69 }
70
71 /// Rasterize every batched instance onto the pixmap (lines under quads, the
72 /// graph convention: chips draw on top of edges). Returns the straight
73 /// (un-premultiplied) RGBA8 [`Frame`].
74 ///
75 /// **GATLING multicore (LAW 2):** the raster fans across all cores by **scanline
76 /// row** — each row is owned by exactly one thread, so the blends never contend,
77 /// and the per-pixel compositing order (all lines in order, then all quads in
78 /// order) is preserved bit-for-bit vs the sequential path (the
79 /// `parallel_raster_matches_sequential` test pins this). Below a pixel-work
80 /// threshold the frame stays single-threaded (zero pool overhead on small draws).
81 pub fn rasterize(&mut self) -> Frame {
82 self.rasterize_with_workers(0)
83 }
84
85 /// [`rasterize`](Self::rasterize) with an explicit gatling worker count: `0` ⇒
86 /// one per core (the production path), `1` ⇒ the forced single-threaded path.
87 /// Exposed so a bench can time the **same** parallel region at 1 vs N workers and
88 /// report the real cores-busy / speedup (rather than guessing from a sweep
89 /// average). The output is identical for any worker count (the
90 /// `parallel_raster_matches_sequential` test pins bit-identity at N vs 1).
91 pub fn rasterize_with_workers(&mut self, workers: usize) -> Frame {
92 let lines = std::mem::take(&mut self.lines);
93 let quads = std::mem::take(&mut self.quads);
94 self.raster_batches(&lines, &quads, workers);
95 self.frame_with_workers(workers)
96 }
97
98 /// Run **only** the compositing raster (bg clear + y-bucketed SDF blend) at
99 /// `workers` gatling workers — the region the GATLING scanline kernel governs,
100 /// without the memory-bandwidth-bound un-premultiply ([`frame`](Self::frame)).
101 /// Exposed for the scaling bench so cores-busy can be measured on the part that
102 /// is CPU-bound, separate from the RAM-bandwidth-capped pixmap→RGBA8 conversion.
103 pub fn raster_only(&mut self, workers: usize) {
104 let lines = std::mem::take(&mut self.lines);
105 let quads = std::mem::take(&mut self.quads);
106 self.raster_batches(&lines, &quads, workers);
107 }
108
109 /// The pixel-work threshold above which the raster goes multicore. `width *
110 /// height` (the full frame) is a cheap upper bound on the work; tiny frames
111 /// (tooltips, sparklines) skip the gatling fan-out entirely.
112 const PARALLEL_PX_THRESHOLD: usize = 64 * 1024; // 256×256
113
114 /// Should this `w × h` frame fan across cores? (cheap upper bound on work.)
115 #[inline]
116 fn parallel_frame(w: u32, h: u32) -> bool {
117 (w as usize * h as usize) >= Self::PARALLEL_PX_THRESHOLD
118 }
119
120 /// Raster `lines` (under) then `quads` (over) onto the pixmap, row-parallel via
121 /// znippy's fork-join GATLING (rayon is forbidden). Each worker self-dispatches
122 /// whole scanlines and writes into that row's **disjoint** pixel slice — no
123 /// barrier, no wait-for-a-core, zero-copy (row index hand-off), zero-alloc in
124 /// the hot loop. Below the pixel-work threshold it stays single-threaded.
125 ///
126 /// Two scaling lifts live entirely inside this parallel region (so they raise the
127 /// per-core ceiling, they don't add serial tail):
128 /// - **Per-row background clear.** The worker that owns row `y` paints the bg into
129 /// that row before compositing — the 1M-pixel clear is now N-way parallel, not a
130 /// serial `for` in `new`.
131 /// - **Y-bucket reject.** A [`YBuckets`] index, built once per frame, lists only
132 /// the primitives whose vertical span covers each row. `raster_row` then visits
133 /// `O(prims_touching_row)` instead of `O(all_prims)` for every one of `h` rows
134 /// (was `rows × n` rejects per frame — 200k quads × 1024 rows = 205M wasted
135 /// span-tests; now ~one test per real touch).
136 fn raster_batches(&mut self, lines: &[LineInstance], quads: &[QuadInstance], workers: usize) {
137 let w = self.width;
138 let h = self.height;
139 if w == 0 || h == 0 {
140 return;
141 }
142 let bg = self.background;
143 // The empty case still must clear to bg (begin→present with no draws).
144 let buckets = YBuckets::build(lines, quads, h);
145 // Only fan out once the frame is big enough that thread hand-off pays off.
146 let min_rows = if Self::parallel_frame(w, h) { 1 } else { usize::MAX };
147 let data = self.pixmap.data_mut();
148 znippy_zoomies::gatling_forkjoin::gatling_scanlines(
149 data,
150 h as usize,
151 w as usize,
152 workers, // 0 ⇒ one worker per core
153 min_rows,
154 |y, row| {
155 // Clear this row to background, then composite only the primitives
156 // whose span covers it (both inside the parallel region).
157 for px in row.iter_mut() {
158 *px = bg;
159 }
160 let (li, qi) = buckets.row(y);
161 raster_row(row, y as u32, w, lines, quads, li, qi);
162 },
163 );
164 }
165
166 /// Snapshot the current pixmap as a straight-RGBA8 [`Frame`] (un-premultiplied)
167 /// without consuming the canvas.
168 ///
169 /// **Parallel un-premultiply:** the per-pixel un-premultiply is embarrassingly
170 /// parallel, so it runs through the same gatling scanline kernel into a
171 /// pre-allocated output buffer (zero per-pixel alloc) instead of the old
172 /// single-threaded `flat_map().collect()` over the whole framebuffer — that
173 /// `collect` was the second half of the Amdahl serial tail.
174 pub fn frame(&self) -> Frame {
175 self.frame_with_workers(0)
176 }
177
178 /// [`frame`](Self::frame) with an explicit worker count (see
179 /// [`rasterize_with_workers`](Self::rasterize_with_workers)).
180 pub fn frame_with_workers(&self, workers: usize) -> Frame {
181 let w = self.width as usize;
182 let h = self.height as usize;
183 let len = w * h * 4;
184 if w == 0 || h == 0 {
185 return Frame { width: self.width, height: self.height, rgba: Vec::new() };
186 }
187 // Allocate the output **without zeroing** (no `vec![0u8; 67MB]`): the gatling
188 // pass below writes every byte of every row (4 bytes × w per pixel, all h
189 // rows), so the buffer is fully initialized before we ever read it. Zeroing
190 // 67 MB only to overwrite it was a memory-bound serial tail that did not
191 // scale — skipping it lifts the un-premultiply's per-core ceiling.
192 let mut rgba: Vec<u8> = Vec::with_capacity(len);
193 let src = self.pixmap.data();
194 let min_rows = if Self::parallel_frame(self.width, self.height) { 1 } else { usize::MAX };
195 {
196 // SAFETY: `spare` is `len` uninitialized bytes; the kernel writes all of
197 // them (each row's `w*4` bytes are fully assigned in `unpremul_into`), so
198 // every byte is initialized before `set_len(len)` exposes them.
199 let spare = rgba.spare_capacity_mut();
200 // View the MaybeUninit slice as raw bytes for the byte-grid kernel — the
201 // worker assigns (never reads) each byte, so this is sound.
202 let buf = unsafe {
203 std::slice::from_raw_parts_mut(spare.as_mut_ptr() as *mut u8, len)
204 };
205 // Output stride is `w * 4` bytes per row; each row maps src[y*w..] → dst.
206 znippy_zoomies::gatling_forkjoin::gatling_scanlines(
207 buf,
208 h,
209 w * 4,
210 workers,
211 min_rows,
212 |y, out_row| {
213 let src_row = &src[y * w..(y + 1) * w];
214 for (p, o) in src_row.iter().zip(out_row.chunks_exact_mut(4)) {
215 unpremul_into(p, o);
216 }
217 },
218 );
219 }
220 // SAFETY: all `len` bytes were written by the kernel above.
221 unsafe { rgba.set_len(len) };
222 Frame { width: self.width, height: self.height, rgba }
223 }
224}
225
226/// Un-premultiply one [`PremulRgba8`] into a straight `[r,g,b,a]` output slice.
227#[inline]
228fn unpremul_into(p: &PremulRgba8, o: &mut [u8]) {
229 let a = p.a;
230 if a == 0 {
231 o[0] = 0;
232 o[1] = 0;
233 o[2] = 0;
234 o[3] = 0;
235 } else {
236 let un = |c: u8| ((c as u32 * 255 + (a as u32) / 2) / a as u32).min(255) as u8;
237 o[0] = un(p.r);
238 o[1] = un(p.g);
239 o[2] = un(p.b);
240 o[3] = a;
241 }
242}
243
/// **Per-row primitive index** — built once per frame. For each scanline `y` it
/// holds the slice of line / quad indices whose vertical span covers that row, so
/// `raster_row` iterates only the primitives that can actually touch the row
/// instead of visiting all `n` of them per row (the O(rows×n) tail).
///
/// Layout is a CSR-style flat index: `line_idx`/`quad_idx` are the concatenated
/// per-row index lists, and `line_off`/`quad_off` are the `h+1` row offsets into
/// them. Built in two passes (count rows, then fill). The build allocates a fixed
/// number of vectors (the four below plus a scratch cursor per primitive kind),
/// all before the per-row raster starts.
struct YBuckets {
    /// Line indices (positions in the `lines` slice), grouped by row.
    line_idx: Vec<u32>,
    /// `h + 1` offsets: row `y`'s lines are `line_idx[line_off[y]..line_off[y + 1]]`.
    line_off: Vec<u32>,
    /// Quad indices (positions in the `quads` slice), grouped by row.
    quad_idx: Vec<u32>,
    /// `h + 1` offsets: row `y`'s quads are `quad_idx[quad_off[y]..quad_off[y + 1]]`.
    quad_off: Vec<u32>,
}
259
260impl YBuckets {
261 /// The `[y0, y1)` integer row span a primitive's pixel-`y` range `[mny, mxy]`
262 /// can cover, clamped to `[0, h)`. Mirrors the per-row test in `raster_row`
263 /// (`py = y + 0.5` must lie in `[mny, mxy]`), so a row is included iff it would
264 /// pass that test — keeping the composite bit-identical.
265 #[inline]
266 fn span(mny: f32, mxy: f32, h: u32) -> (u32, u32) {
267 // py = y+0.5 ∈ [mny, mxy] ⇒ y ∈ [mny-0.5, mxy-0.5]. Clamp to [0, h); an
268 // off-screen primitive yields an empty (y0 >= y1) range and is skipped by
269 // the `y0..y1` loop with no special-case branch. The `.max(0.0)` before the
270 // `as u32` cast also tames NaN/negative inputs (NaN → 0).
271 let y0 = ((mny - 0.5).ceil().max(0.0) as u32).min(h);
272 let y1 = (((mxy - 0.5).floor() + 1.0).max(0.0) as u32).min(h); // exclusive
273 (y0, y1.max(y0))
274 }
275
276 fn build(lines: &[LineInstance], quads: &[QuadInstance], h: u32) -> Self {
277 let hu = h as usize;
278 let mut line_off = vec![0u32; hu + 1];
279 let mut quad_off = vec![0u32; hu + 1];
280 // Pass 1: per-row counts (stored shifted by +1 for the prefix sum).
281 for l in lines {
282 let (_, mny, _, mxy) = l.bounds();
283 let (y0, y1) = Self::span(mny, mxy, h);
284 for y in y0..y1 {
285 line_off[y as usize + 1] += 1;
286 }
287 }
288 for q in quads {
289 let cy = q.center[1];
290 let he = q.half_extent();
291 let (y0, y1) = Self::span(cy - he, cy + he, h);
292 for y in y0..y1 {
293 quad_off[y as usize + 1] += 1;
294 }
295 }
296 // Prefix-sum the counts into offsets.
297 for y in 0..hu {
298 line_off[y + 1] += line_off[y];
299 quad_off[y + 1] += quad_off[y];
300 }
301 let mut line_idx = vec![0u32; line_off[hu] as usize];
302 let mut quad_idx = vec![0u32; quad_off[hu] as usize];
303 // Pass 2: scatter primitive indices into each covered row's slot.
304 let mut cursor = line_off.clone();
305 for (i, l) in lines.iter().enumerate() {
306 let (_, mny, _, mxy) = l.bounds();
307 let (y0, y1) = Self::span(mny, mxy, h);
308 for y in y0..y1 {
309 let slot = &mut cursor[y as usize];
310 line_idx[*slot as usize] = i as u32;
311 *slot += 1;
312 }
313 }
314 let mut cursor = quad_off.clone();
315 for (i, q) in quads.iter().enumerate() {
316 let cy = q.center[1];
317 let he = q.half_extent();
318 let (y0, y1) = Self::span(cy - he, cy + he, h);
319 for y in y0..y1 {
320 let slot = &mut cursor[y as usize];
321 quad_idx[*slot as usize] = i as u32;
322 *slot += 1;
323 }
324 }
325 Self { line_idx, line_off, quad_idx, quad_off }
326 }
327
328 /// The `(line_indices, quad_indices)` that cover row `y`, **in original push
329 /// order** (the scatter preserves it), so the composite order is unchanged.
330 #[inline]
331 fn row(&self, y: usize) -> (&[u32], &[u32]) {
332 let l0 = self.line_off[y] as usize;
333 let l1 = self.line_off[y + 1] as usize;
334 let q0 = self.quad_off[y] as usize;
335 let q1 = self.quad_off[y + 1] as usize;
336 (&self.line_idx[l0..l1], &self.quad_idx[q0..q1])
337 }
338}
339
340/// Raster all `lines` (under) then all `quads` (over) onto a **single scanline**
341/// `row` (the `w` pixels at image row `y`). This is the parallel unit: a thread
342/// owns one row, so blends never contend, and replaying the same instance order
343/// per row keeps the composite bit-identical to the sequential path.
344fn raster_row(
345 row: &mut [PremulRgba8],
346 y: u32,
347 w: u32,
348 lines: &[LineInstance],
349 quads: &[QuadInstance],
350 line_idx: &[u32],
351 quad_idx: &[u32],
352) {
353 let py = y as f32 + 0.5;
354 // Lines first (edges under chips). `line_idx` already lists only the lines whose
355 // span covers this row, in push order — the per-row vertical reject is gone.
356 for &i in line_idx {
357 let l = &lines[i as usize];
358 let (mnx, _, mxx, _) = l.bounds();
359 let x0 = (mnx.floor()).max(0.0) as u32;
360 let x1 = (mxx.ceil()).min(w as f32) as u32;
361 for x in x0..x1 {
362 let cov = sdf::line_coverage(l, [x as f32 + 0.5, py]);
363 if cov > 0.0 {
364 blend_px(&mut row[x as usize], l.color, cov);
365 }
366 }
367 }
368 // Quads over (only those whose span covers this row, in push order).
369 for &i in quad_idx {
370 let q = &quads[i as usize];
371 let he = q.half_extent();
372 let (cx, cy) = (q.center[0], q.center[1]);
373 let x0 = ((cx - he).floor()).max(0.0) as u32;
374 let x1 = ((cx + he).ceil()).min(w as f32) as u32;
375 for x in x0..x1 {
376 let dx = x as f32 + 0.5 - cx;
377 let dy = py - cy;
378 let cov = sdf::quad_coverage(q, dx, dy);
379 if cov > 0.0 {
380 blend_px(&mut row[x as usize], q.color, cov);
381 }
382 }
383 }
384}
385
386/// Source-over alpha-composite a straight `[r,g,b,a] ∈ [0,1]` colour scaled by
387/// `coverage` onto one premultiplied destination pixel.
388#[inline]
389fn blend_px(dst: &mut PremulRgba8, color: [f32; 4], coverage: f32) {
390 let sa = (color[3] * coverage).clamp(0.0, 1.0);
391 // Premultiplied source.
392 let sr = color[0] * sa;
393 let sg = color[1] * sa;
394 let sb = color[2] * sa;
395 let da = dst.a as f32 / 255.0;
396 let dr = dst.r as f32 / 255.0;
397 let dg = dst.g as f32 / 255.0;
398 let db = dst.b as f32 / 255.0;
399 let inv = 1.0 - sa;
400 *dst = PremulRgba8 {
401 r: ((sr + dr * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
402 g: ((sg + dg * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
403 b: ((sb + db * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
404 a: ((sa + da * inv) * 255.0).round().clamp(0.0, 255.0) as u8,
405 };
406}
407
408/// Premultiply a straight `[u8;4]` into a [`PremulRgba8`].
409fn premul(c: [u8; 4]) -> PremulRgba8 {
410 let a = c[3] as u32;
411 let m = |v: u8| ((v as u32 * a + 127) / 255) as u8;
412 PremulRgba8 { r: m(c[0]), g: m(c[1]), b: m(c[2]), a: c[3] }
413}
414
415impl Canvas for CpuCanvas {
416 fn push_quads(&mut self, quads: &[QuadInstance]) {
417 self.quads.extend_from_slice(quads);
418 }
419 fn push_lines(&mut self, lines: &[LineInstance]) {
420 self.lines.extend_from_slice(lines);
421 }
422 fn camera(&self) -> &Camera {
423 &self.camera
424 }
425}
426
/// The CPU [`Renderer`] — `begin` opens a [`CpuCanvas`] sized to the rect, `present`
/// rasterizes it to a [`Frame`]. Headless / device / CI; always available (no GPU).
pub struct CpuRenderer {
    /// Straight (un-premultiplied) RGBA8 background given to every canvas `begin` opens.
    background: [u8; 4],
    /// The canvas opened by the last `begin`, until `present` takes it; `None` otherwise.
    canvas: Option<CpuCanvas>,
}
433
434impl CpuRenderer {
435 pub fn new(background: [u8; 4]) -> Self {
436 Self { background, canvas: None }
437 }
438}
439
440impl Default for CpuRenderer {
441 fn default() -> Self {
442 Self::new([12, 14, 20, 255])
443 }
444}
445
446impl Renderer for CpuRenderer {
447 fn begin(&mut self, width: u32, height: u32, camera: Camera) -> &mut dyn Canvas {
448 self.canvas = Some(CpuCanvas::new(width, height, camera, self.background));
449 self.canvas.as_mut().unwrap()
450 }
451 fn present(&mut self) -> Frame {
452 self.canvas.take().map(|mut c| c.rasterize()).unwrap_or(Frame { width: 0, height: 0, rgba: Vec::new() })
453 }
454 fn backend(&self) -> Backend {
455 Backend::CpuVello
456 }
457}
458
#[cfg(test)]
impl CpuCanvas {
    /// Test-only reference raster: walks the rows in a plain `for` loop on the
    /// calling thread, doing the same per-row clear + bucketed composite as the
    /// gatling path. The `parallel_raster_matches_sequential` test compares the
    /// parallel output against this.
    fn rasterize_sequential(&mut self) -> Frame {
        let (lines, quads) = (std::mem::take(&mut self.lines), std::mem::take(&mut self.quads));
        let (width, height, clear) = (self.width, self.height, self.background);
        let index = YBuckets::build(&lines, &quads, height);
        let pixels = self.pixmap.data_mut();
        for y in 0..height {
            let first = (y * width) as usize;
            let row = &mut pixels[first..first + width as usize];
            row.fill(clear);
            let (row_lines, row_quads) = index.row(y as usize);
            raster_row(row, y, width, &lines, &quads, row_lines, row_quads);
        }
        self.frame()
    }
}
483
484#[cfg(test)]
485mod tests {
486 use super::*;
487 use crate::render::prim::{shape, CircleInstance, LineInstance, MarkerInstance, RingInstance};
488
    /// INJECT-ASSERT (GATLING): the no-barrier row-parallel raster produces a
    /// **bit-identical** frame to the single-threaded reference, on a frame big
    /// enough to actually fan across cores, with overlapping instances (so blend
    /// order matters) — proving the scanline split preserves compositing order.
    #[test]
    fn parallel_raster_matches_sequential() {
        // 512×512 ⇒ above PARALLEL_PX_THRESHOLD ⇒ the gatling path runs.
        let (w, h) = (512u32, 512u32);
        // Builds the same scene each time it is called, so both rasters start
        // from identical batches.
        let mk = || {
            let mut c = CpuCanvas::new(w, h, Camera::default(), [9, 11, 16, 255]);
            // Many overlapping primitives so per-pixel blend order is load-bearing.
            let mut quads = Vec::new();
            let mut lines = Vec::new();
            for i in 0..400u32 {
                let x = (i * 37 % 500 + 6) as f32;
                let y = (i * 53 % 500 + 6) as f32;
                let col = [(i % 7) as f32 / 7.0, (i % 5) as f32 / 5.0, (i % 3) as f32 / 3.0, 0.7];
                // Cycle through circle / ring / diamond marker.
                if i % 3 == 0 {
                    quads.push(CircleInstance { center: [x, y], radius: 14.0, color: col, aa: 1.5 }.lower());
                } else if i % 3 == 1 {
                    quads.push(RingInstance { center: [x, y], radius: 16.0, inner: 8.0, color: col, aa: 1.5 }.lower());
                } else {
                    quads.push(MarkerInstance { center: [x, y], radius: 12.0, corner: 2.0, color: col, aa: 1.0, shape: shape::DIAMOND }.lower());
                }
                lines.push(LineInstance::round([x, y], [x + 40.0, y + 25.0], 3.0, 1.5, [col[0], col[1], col[2], 0.6]));
            }
            c.push_lines(&lines);
            c.push_quads(&quads);
            c
        };

        let parallel = mk().rasterize();
        let sequential = mk().rasterize_sequential();
        assert_eq!(parallel.width, sequential.width);
        assert_eq!(parallel.rgba.len(), sequential.rgba.len());
        assert_eq!(parallel.rgba, sequential.rgba, "GATLING parallel raster is bit-identical to sequential");
        // And it actually drew a substantial frame (not a degenerate match-on-blank).
        assert!(parallel.lit_px() > 50_000, "real content rastered, got {}", parallel.lit_px());
    }
528
    /// INJECT-ASSERT (y-bucket index): the y-bucketed raster (each row visits only
    /// the primitives whose span covers it) is **byte-identical** to a brute-force
    /// raster that tests every primitive against every row — including primitives
    /// that straddle the top (y<0) and bottom (y>h) frame edges, so the span clamp
    /// is exercised. This proves the O(rows×n)→O(touches) reject changed nothing
    /// the pixels see.
    #[test]
    fn ybucket_raster_matches_brute_force_all_primitives() {
        // 300×200 = 60 000 px, below PARALLEL_PX_THRESHOLD.
        let (w, h) = (300u32, 200u32);
        let mut quads = Vec::new();
        let mut lines = Vec::new();
        for i in 0..120u32 {
            let x = (i * 41 % 290 + 4) as f32;
            // Deliberately push some centres above the top and below the bottom so
            // their spans clamp at 0 / h.
            let y = (i as f32 * 7.3) - 30.0;
            let col = [(i % 7) as f32 / 7.0, (i % 4) as f32 / 4.0, (i % 3) as f32 / 3.0, 0.65];
            quads.push(CircleInstance { center: [x, y], radius: 12.0, color: col, aa: 1.5 }.lower());
            lines.push(LineInstance::round([x, y], [x + 30.0, y + 50.0], 3.0, 1.5, [col[0], col[1], col[2], 0.5]));
        }

        // Bucketed path (production).
        let mut c = CpuCanvas::new(w, h, Camera::default(), [10, 12, 18, 255]);
        c.push_lines(&lines);
        c.push_quads(&quads);
        let bucketed = c.rasterize();

        // Brute-force reference: clear + composite ALL primitives on EVERY row (no
        // bucket index), using a full per-row index list `[0,1,2,…]`.
        let bg = premul([10, 12, 18, 255]);
        let mut pm = Pixmap::new(w as u16, h as u16);
        let all_l: Vec<u32> = (0..lines.len() as u32).collect();
        let all_q: Vec<u32> = (0..quads.len() as u32).collect();
        let data = pm.data_mut();
        for y in 0..h {
            let row = &mut data[(y * w) as usize..((y + 1) * w) as usize];
            for px in row.iter_mut() {
                *px = bg;
            }
            raster_row(row, y, w, &lines, &quads, &all_l, &all_q);
        }
        // Un-premultiply the reference the same way `frame` does, pixel by pixel.
        let brute: Vec<u8> = data
            .iter()
            .flat_map(|p| {
                let mut o = [0u8; 4];
                unpremul_into(p, &mut o);
                o
            })
            .collect();

        assert_eq!(bucketed.rgba.len(), brute.len());
        assert_eq!(bucketed.rgba, brute, "y-bucketed raster == brute-force-all-primitives");
        assert!(bucketed.lit_px() > 1_000, "real content drawn, got {}", bucketed.lit_px());
    }
583
    /// Smoke test: one opaque red circle on a black 64×64 canvas turns the centre
    /// pixel red and leaves the top-left corner at the background.
    #[test]
    fn cpu_canvas_lights_pixels_inside_a_circle() {
        let cam = Camera::default();
        let mut canvas = CpuCanvas::new(64, 64, cam, [0, 0, 0, 255]);
        let c = CircleInstance { center: [32.0, 32.0], radius: 10.0, color: [1.0, 0.0, 0.0, 1.0], aa: 1.0 };
        canvas.push_quads(&[c.lower()]);
        let frame = canvas.rasterize();
        assert_eq!(frame.rgba.len(), 64 * 64 * 4);
        // Centre pixel is red.
        let i = ((32 * 64 + 32) * 4) as usize;
        assert!(frame.rgba[i] > 200 && frame.rgba[i + 1] < 50, "centre is red");
        // A far corner is still background (black).
        let c0 = 0;
        assert!(frame.rgba[c0] < 10, "corner stays background");
    }
599
    /// Drives the renderer purely through the `Renderer` / `Canvas` traits
    /// (`begin` → `push_quads` → `present`) and checks a green ring shows up in
    /// the presented frame with its hole left at the background.
    #[test]
    fn cpu_renderer_round_trips_through_the_seam() {
        let mut r = CpuRenderer::new([0, 0, 0, 255]);
        let canvas = r.begin(48, 48, Camera::default());
        let ring = RingInstance { center: [24.0, 24.0], radius: 12.0, inner: 6.0, color: [0.0, 1.0, 0.0, 1.0], aa: 1.0 };
        canvas.push_quads(&[ring.lower()]);
        assert_eq!(r.backend(), Backend::CpuVello);
        let frame = r.present();
        // The ring band (≈9px out) is green; the hole centre is background.
        let band = (((24) * 48 + (24 + 9)) * 4) as usize;
        assert!(frame.rgba[band + 1] > 200, "ring band green");
        let hole = ((24 * 48 + 24) * 4) as usize;
        assert!(frame.rgba[hole + 1] < 50, "ring hole is background");
    }
613 }
614}