roxlap_core/scalar_rasterizer.rs
1//! Scalar `Rasterizer` implementation — port of voxlaptest's 4.7.5
2//! scalar `hrendzsse` / `vrendzsse` fallbacks (`voxlap5.c:1947` /
3//! `:2003` post-4.8.4 line numbers). Writes one `u32` ARGB pixel +
4//! one `f32` z-buffer entry per screen position from radar entries
5//! the (still-stubbed) `gline` will produce in R4.3.
6//!
7//! Per-pixel math (the SSE-batched form lives behind R5; this is the
8//! pre-batch shape):
9//!
10//! ```text
11//! col = scratch.angstart[plc >> 16] + j // signed offset into radar
12//! framebuffer[pixel] = radar[col].col // packed ARGB
13//! z = radar[col].dist / sqrt(dirx² + diry²) // f32 z-buffer entry
14//! dirx += strx; diry += stry; plc += incr
15//! ```
16//!
17//! The vertical scan additionally mutates `scratch.uurend[sx] +=
18//! scratch.uurend[sx + half_stride]` per pixel — that's why the
19//! Rasterizer trait now hands `&mut ScanScratch` to `vrend` /
20//! `hrend`. `gline` is a TODO stub; without it the angstart entries
21//! it would write are zero, so a freshly allocated radar yields all-
22//! zero pixels. Tests pre-fill the radar manually to verify the
23//! scanline rasterizer end of the pipeline.
24
25// Module-wide cast allows: the per-pixel arithmetic constantly
26// crosses signed/unsigned and i32/usize boundaries (loop counters,
27// signed offsets into radar, framebuffer indices). Annotating each
28// site individually buries the per-pixel logic in lint suppressions;
29// the cast-correctness invariants hold by construction.
30#![allow(
31 clippy::cast_sign_loss,
32 clippy::cast_possible_truncation,
33 clippy::cast_possible_wrap,
34 clippy::similar_names
35)]
36
37use std::marker::PhantomData;
38
39use crate::camera_math::CameraState;
40use crate::fixed::ftol;
41use crate::gline::derive_gline_frustum;
42use crate::grouscan::{grouscan_run, CfType, GrouscanInputs, CF_SEED_INDEX};
43use crate::opticast::camera_column_slice;
44use crate::opticast_prelude::{OpticastPrelude, PREC};
45use crate::rasterizer::{Rasterizer, ScanScratch};
46use crate::ray_step::RayStep;
47use crate::scan_loops::ScanContext;
48
49/// Borrowed view of the framebuffer + zbuffer as raw pointers.
50///
51/// R12.2.0 introduces this so the per-frame `ScalarRasterizer` can be
52/// `Copy` (for the per-thread fan-out R12.2.1 lands). Holding `&'a
53/// mut [u32]` / `&'a mut [f32]` directly forces exclusive borrows
54/// per instance, blocking the four quadrants from running on four
55/// threads even though their pixel writes are disjoint.
56///
57/// Constructed safely from exclusive slice borrows — the slices are
58/// consumed and re-exposed as raw pointers tied to lifetime `'a`
59/// via `PhantomData`. Once a `RasterTarget` exists, it is the sole
60/// path to the underlying memory; the slices cannot be used through
61/// any other channel for the duration of `'a`.
62///
63/// `Copy` lets opticast hand each quadrant thread its own copy of
64/// the same target. The four threads write disjoint pixels (top /
65/// bottom / left / right wedges of the screen, no overlap), so
66/// pointer aliasing is safe under the documented invariant.
67///
68/// # Safety contract for parallel use
69/// Callers that copy a `RasterTarget` and pass copies to multiple
70/// threads MUST guarantee that the threads collectively write to
71/// pairwise-disjoint pixel indices. opticast enforces this via the
72/// four-quadrant wedge geometry (see `scan_loops::{top,right,
73/// bottom,left}_quadrant`). Single-threaded callers (R12.1 default)
74/// hold one copy and trivially satisfy the invariant.
75#[derive(Clone, Copy, Debug)]
76pub struct RasterTarget<'a> {
77 fb_ptr: *mut u32,
78 fb_len: usize,
79 zb_ptr: *mut f32,
80 zb_len: usize,
81 _marker: PhantomData<&'a mut [u32]>,
82}
83
84// SAFETY: `RasterTarget` is morally a borrowed mutable slice pair —
85// the same shape `&'a mut [u32]` / `&'a mut [f32]` would have, both of
86// which are `Send` when `T: Send`. Multi-thread safety is enforced
87// by the wedge / strip-disjoint write invariant (see struct doc).
88unsafe impl Send for RasterTarget<'_> {}
89
90// SAFETY: sharing `&RasterTarget` across threads exposes only the
91// raw pointers + lengths. Reading a pointer field is itself free of
92// data races; concurrent writes through the pointer are gated by
93// the disjoint-write invariant the caller upholds. Required so
94// `ScalarRasterizer: Sync`, which `rayon::par_iter_mut` needs to
95// share `&rasterizer` across the strip-parallel closures (R12.3.1).
96unsafe impl Sync for RasterTarget<'_> {}
97
98impl<'a> RasterTarget<'a> {
99 /// Build a target from exclusive slice borrows. The slices are
100 /// consumed (their `&'a mut` reborrow is the load-bearing thing —
101 /// this constructor is the only way to mint a `RasterTarget`
102 /// from safe code).
103 #[must_use]
104 pub fn new(framebuffer: &'a mut [u32], zbuffer: &'a mut [f32]) -> Self {
105 Self {
106 fb_ptr: framebuffer.as_mut_ptr(),
107 fb_len: framebuffer.len(),
108 zb_ptr: zbuffer.as_mut_ptr(),
109 zb_len: zbuffer.len(),
110 _marker: PhantomData,
111 }
112 }
113
114 /// Framebuffer length in `u32` elements.
115 #[must_use]
116 pub fn fb_len(self) -> usize {
117 self.fb_len
118 }
119
120 /// Raw mutable framebuffer pointer. Used by SSE blocks that do
121 /// their own arithmetic + bounds reasoning.
122 ///
123 /// # Safety
124 /// Callers must respect `fb_len` and the parallel-use invariant.
125 #[must_use]
126 pub fn fb_ptr(self) -> *mut u32 {
127 self.fb_ptr
128 }
129
130 /// Raw mutable zbuffer pointer. Same contract as
131 /// [`Self::fb_ptr`].
132 #[must_use]
133 pub fn zb_ptr(self) -> *mut f32 {
134 self.zb_ptr
135 }
136
137 /// Write one ARGB pixel.
138 ///
139 /// # Safety
140 /// `idx < self.fb_len()`, plus the parallel-use invariant.
141 pub unsafe fn write_color(self, idx: usize, color: u32) {
142 debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
143 // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
144 unsafe { self.fb_ptr.add(idx).write(color) };
145 }
146
147 /// Write one z-buffer entry.
148 ///
149 /// # Safety
150 /// `idx < self.fb_len()` (zbuffer length matches fb), plus the
151 /// parallel-use invariant.
152 pub unsafe fn write_depth(self, idx: usize, z: f32) {
153 debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
154 // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
155 unsafe { self.zb_ptr.add(idx).write(z) };
156 }
157}
158
159// gcsub now lives on `ScanScratch::gcsub`; the host pokes it via
160// `ScanScratch::set_side_shades` per frame (mirrors voxlap's
161// `setsideshades` global). The default-state pattern
162// (`0x00ff00ff00ff00ff` per entry, == `setsideshades(0,…,0)`) is
163// what `ScanScratch::new_for_size` initialises to, so the oracle
164// stays bit-exact when no shading is configured.
165
166/// Per-channel fog blend — voxlap5.c:2052-2056 (and the matching
167/// hrend / vrend scalar tail). `col` is the source ARGB voxel
168/// colour; `dist` is the radar slot's depth (PREC-scaled, so
169/// `>> 20` gives the integer cell distance index into `foglut`).
170/// `foglut` empty short-circuits to "no fog" (returns `col`
171/// unchanged); OOB indices saturate to 32767 (full fog) since
172/// `set_fog` pads the table that way.
173//
174// The C form is one branchless expression per channel; we keep
175// it as-is for legibility against the source. `as i32` casts
176// match the C int32 arithmetic.
177#[allow(
178 clippy::cast_sign_loss,
179 clippy::cast_possible_wrap,
180 clippy::many_single_char_names
181)]
182fn fog_blend(col: i32, dist: i32, foglut: &[i32], fog_col: i32) -> i32 {
183 if foglut.is_empty() {
184 return col;
185 }
186 let idx = (dist >> 20) as usize;
187 let l = foglut.get(idx).copied().unwrap_or(32767) & 32767;
188 let k = col;
189 let fc = fog_col;
190 let r = (((fc & 255) - (k & 255)) * l) >> 15;
191 let g = (((((fc >> 8) & 255) - ((k >> 8) & 255)) * l) >> 15) << 8;
192 let b = (((((fc >> 16) & 255) - ((k >> 16) & 255)) * l) >> 15) << 16;
193 r + g + b + k
194}
195
196/// Per-ray sky-row update. Mirror of voxlap5.c:1236-1255.
197///
198/// On the first ray of a quadrant (`scratch.sky_cur_lng < 0`),
199/// initialise from the ray's `atan2(vy1, vx1)` mapped through
200/// `sky.lng_mul`. On subsequent rays, walk `sky.lng[]` forward
201/// (when `sky_cur_dir < 0`) or backward (`sky_cur_dir >= 0`)
202/// until the cross-product flips sign — voxlap's rotating-cursor
203/// trick that avoids re-running atan2 per ray. After the search,
204/// stamp `scratch.sky_off = sky_cur_lng * sky.bpl`, which
205/// `phase_startsky_textured` divides by 4 to land at the row's
206/// pixel-base index.
207fn sky_per_ray_update(
208 scratch: &mut crate::rasterizer::ScanScratch,
209 sky: &crate::sky::Sky,
210 vx1: f32,
211 vy1: f32,
212) {
213 let ysiz = sky.ysiz;
214 if scratch.sky_cur_lng < 0 {
215 // First-ray init — atan2 mapped to row index.
216 let ang = vy1.atan2(vx1) + std::f32::consts::PI;
217 let raw = ang * sky.lng_mul - 0.5;
218 let mut lng = ftol(raw);
219 // Voxlap's `(uint32_t)skycurlng >= skyysiz` clamp uses an
220 // uninitialised `j` for the corrective shift; we substitute
221 // `rem_euclid` for a deterministic in-range value. The
222 // rotating-cursor walk in subsequent rays will quickly
223 // converge whichever way we land.
224 if (lng as u32) >= (ysiz as u32) {
225 lng = lng.rem_euclid(ysiz);
226 }
227 scratch.sky_cur_lng = lng;
228 } else if scratch.sky_cur_dir < 0 {
229 // Walk forward (rotating).
230 let mut j = scratch.sky_cur_lng + 1;
231 if j >= ysiz {
232 j = 0;
233 }
234 loop {
235 let l = sky.lng[j as usize];
236 if l[0] * vy1 <= l[1] * vx1 {
237 break;
238 }
239 scratch.sky_cur_lng = j;
240 j += 1;
241 if j >= ysiz {
242 j = 0;
243 }
244 }
245 } else {
246 // Walk backward (rotating).
247 loop {
248 let l = sky.lng[scratch.sky_cur_lng as usize];
249 if l[0] * vy1 >= l[1] * vx1 {
250 break;
251 }
252 scratch.sky_cur_lng -= 1;
253 if scratch.sky_cur_lng < 0 {
254 scratch.sky_cur_lng = ysiz - 1;
255 }
256 }
257 }
258 // Voxlap: `skyoff = skycurlng * skybpl + nskypic`. We strip
259 // the `+ nskypic` (texture base address) — `phase_startsky`
260 // adds it implicitly by indexing `sky.pixels` directly.
261 scratch.sky_off = scratch.sky_cur_lng * sky.bpl;
262}
263
264/// Per-frame state cached on first `frame_setup` call. Owned here
265/// (vs. borrowed from `ScanContext`) because gline needs to read
266/// it across many calls without re-borrowing each time. The
267/// `prelude` clone copies one `Vec<i32>` (the `y_lookup` mip
268/// table) per frame — cheap.
269//
270// `Clone` is used by [`ScalarRasterizer`]'s 4-way fan-out (R12.2.1)
271// to mint one rasterizer per quadrant thread; each clone copies the
272// (~few KB) y_lookup Vec — small per-frame allocation cost.
273#[derive(Clone)]
274struct FrameCache {
275 ray_step: RayStep,
276 camera_state: CameraState,
277 prelude: OpticastPrelude,
278 gstartz0: i32,
279 gstartz1: i32,
280 /// Voxlap's `v - *ixy_sptr_col` — byte offset within the
281 /// camera's column to the slab whose top bounds the air gap
282 /// from below. `0` ⇒ column-top.
283 vptr_offset: usize,
284 /// S4B.6.e: chunk-z that owns `vptr_offset`. For the in-camera-
285 /// chunk case this equals `prelude.camera_chunk_idx[2]`. For
286 /// cross-chunk look-down it points to the chunk holding the
287 /// real floor — gline_seed routes state.column / slab_buf to
288 /// it so rays start walking that chunk directly.
289 seed_chunk_z: i32,
290}
291
292/// Scalar rasterizer that writes pixels and a z-buffer entry per
293/// screen position.
294///
295/// Borrows the framebuffer + zbuffer for the duration of one
296/// `opticast` call; SDL hosts allocate these once and reuse across
297/// frames, see `roxlap-host`.
298//
299// `grid` carries the per-frame voxel-world borrow that gline reads
300// to call `grouscan_run` per ray. S4B.0 collapsed the previous
301// four-field `(slab_buf, column_offsets, mip_base_offsets, vsid)`
302// shape into a single `GridView<'a>`; gline + frame_setup
303// destructure via `self.grid.<field>` reads.
304#[derive(Clone)]
305pub struct ScalarRasterizer<'a> {
306 /// Framebuffer + zbuffer raw-pointer view. Stripped from the
307 /// caller's `&mut [u32]` / `&mut [f32]` borrows at construction
308 /// (see [`Self::new`]) so the rasterizer can be `Copy` for the
309 /// per-thread quadrant fan-out R12.2.1 lands. Single-threaded
310 /// path holds one copy, parallel path will hold four (one per
311 /// quadrant — wedge-disjoint pixel writes; see
312 /// [`RasterTarget`]'s safety contract).
313 target: RasterTarget<'a>,
314 /// Row stride in `u32` / `f32` elements (== framebuffer width
315 /// for tightly-packed buffers; SDL streaming textures may add
316 /// trailing padding).
317 pitch_pixels: usize,
318 /// Per-frame world borrow. `Copy`, so the parallel branches'
319 /// per-thread rasterizer clones share the same backing slab /
320 /// column-offset data without an extra heap allocation.
321 grid: crate::grid_view::GridView<'a>,
322 /// Optional sky texture borrow. `None` ⇒ `phase_startsky`
323 /// solid-fills with `scratch.skycast`. `Some(_)` ⇒ gline's
324 /// per-ray frustum prep updates `scratch.sky_off`, and
325 /// `phase_startsky` runs the textured search-and-sample loop.
326 /// Set via [`Self::with_sky`] after construction; unset ⇒
327 /// engine's existing solid-sky behaviour, byte-stable for the
328 /// oracle.
329 sky: Option<&'a crate::sky::Sky>,
330 /// Per-frame state cache. `None` until the first `frame_setup`
331 /// call; gline panics if invoked before that.
332 frame: Option<FrameCache>,
333}
334
335// R12.2.1 / R12.3.1: opticast's parallel branches fan the rasterizer
336// across rayon-managed threads — each thread owns its own clone. The
337// clones share `target: RasterTarget` (raw pointers; safe under the
338// strip-disjoint pixel-write invariant documented on RasterTarget),
339// hold &-refs into the slab/column data (Sync), and have independent
340// FrameCache copies. Compile-time checks: this fails if any field
341// becomes non-Send/non-Sync so the parallel path can no longer hold.
342const _: fn() = || {
343 fn assert_send<T: Send>() {}
344 fn assert_sync<T: Sync>() {}
345 assert_send::<ScalarRasterizer<'_>>();
346 assert_sync::<ScalarRasterizer<'_>>();
347};
348
349impl<'a> ScalarRasterizer<'a> {
350 /// Create a rasterizer that will write into the supplied
351 /// framebuffer + zbuffer pair. `pitch_pixels` must satisfy
352 /// `pitch_pixels * height ≤ framebuffer.len()` for the height
353 /// the engine renders into; the `frame_setup` hook does not
354 /// validate sizes (it has no height to check against).
355 ///
356 /// `grid` describes the world the renderer reads from. Build
357 /// from a [`roxlap_formats::vxl::Vxl`] via
358 /// [`crate::grid_view::GridView::from_single_vxl`], or from raw
359 /// parts via [`crate::grid_view::GridView::from_parts`].
360 ///
361 /// `ray_step` is initialised to a zero placeholder; the real
362 /// values get stamped on the first [`Rasterizer::frame_setup`]
363 /// call before any per-pixel work runs.
364 #[must_use]
365 pub fn new(
366 framebuffer: &'a mut [u32],
367 zbuffer: &'a mut [f32],
368 pitch_pixels: usize,
369 grid: crate::grid_view::GridView<'a>,
370 ) -> Self {
371 Self {
372 target: RasterTarget::new(framebuffer, zbuffer),
373 pitch_pixels,
374 grid,
375 sky: None,
376 frame: None,
377 }
378 }
379
380 /// Bind a sky texture for the lifetime of this rasterizer
381 /// instance. Hosts call this when [`crate::Engine::sky`] is
382 /// `Some(_)`. Without it, the rasterizer keeps the legacy
383 /// solid-fill `skycast` behaviour.
384 #[must_use]
385 pub fn with_sky(mut self, sky: &'a crate::sky::Sky) -> Self {
386 self.sky = Some(sky);
387 self
388 }
389}
390
391impl Rasterizer for ScalarRasterizer<'_> {
392 fn frame_setup(&mut self, ctx: &ScanContext<'_>) {
393 // Cache everything per-frame so gline doesn't re-borrow on
394 // every call. Prelude is cloned (one Vec<i32> alloc per
395 // frame for y_lookup; small).
396 self.frame = Some(FrameCache {
397 ray_step: *ctx.rs,
398 camera_state: *ctx.camera_state,
399 prelude: ctx.prelude.clone(),
400 gstartz0: ctx.camera_gstartz0,
401 gstartz1: ctx.camera_gstartz1,
402 vptr_offset: ctx.camera_vptr_offset,
403 seed_chunk_z: ctx.camera_seed_chunk_z,
404 });
405 }
406
407 #[allow(clippy::too_many_lines)]
408 fn gline(
409 &mut self,
410 scratch: &mut ScanScratch,
411 length: u32,
412 x0: f32,
413 y0: f32,
414 x1: f32,
415 y1: f32,
416 ) {
417 // Voxlap's per-scanline ray-cast: derive the frustum, seed
418 // cf[128], stamp scratch globals, call grouscan. Mirror of
419 // voxlap5.c:gline (1146..1235).
420 let cache = self
421 .frame
422 .as_ref()
423 .expect("gline called before frame_setup");
424 let leng = length as i32;
425
426 // S1.3: resolve the camera-position-specific state once at
427 // the top so the rest of `gline` reads from locals. With no
428 // S1.Z: with the negative-index walk, `gline` reads
429 // camera-position-specific state directly from the cache —
430 // no per-scanline override needed. OOB cameras flow through
431 // the same path; their (cx, cy) signed coords carry into
432 // grouscan via the prelude and the column-step path skips
433 // OOB columns as empty.
434 let pos_xfrac = cache.prelude.pos_xfrac;
435 let pos_yfrac = cache.prelude.pos_yfrac;
436 let li_pos_xy = [cache.prelude.li_pos[0], cache.prelude.li_pos[1]];
437 let column_index = cache.prelude.column_index;
438 // S4B.6.c: cf seed `z0` / `z1` are already world-z (the
439 // air-gap lookup translates chunk-local to world by adding
440 // `camera_chunk_z * chunk_size_z`). For unstacked grids
441 // (`chunks_z == 1`, camera_chunk_z == 0) the world-z and
442 // chunk-local values coincide — byte-identical with the
443 // pre-S4B.6.c path.
444 let gstartz0 = cache.gstartz0;
445 let gstartz1 = cache.gstartz1;
446 let vptr_offset = cache.vptr_offset;
447
448 // 1. Project per-ray frustum (vd0/vd1/vz0/vx1/vy1/vz1 +
449 // gixy/gpz/gdz). voxlap5.c:1153-1175.
450 let f = derive_gline_frustum(
451 &cache.camera_state,
452 pos_xfrac,
453 pos_yfrac,
454 self.grid.vsid,
455 length,
456 x0,
457 y0,
458 x1,
459 y1,
460 );
461
462 // 2. Stamp ray-step globals onto scratch.
463 scratch.gixy = f.gixy;
464 scratch.gpz = f.gpz;
465 scratch.gdz = f.gdz;
466
467 // 3. cmprecip[leng] = CMPPREC / leng (voxlap precomputed
468 // table; voxlap5.c:12315 builds it as `CMPPREC/(float)i`).
469 // CMPPREC = 256*4096 = PREC. gi0 / gi1 are per-pixel ray-
470 // step coefficients in Q12.20 (= PREC); cx0/cy0/cx1/cy1
471 // are the cf[128] seed endpoints. voxlap5.c:1179-1190.
472 // The `as f32` casts here lose precision for very large leng
473 // (> 2²³), but realistic scanline lengths (a few thousand)
474 // are well below that.
475 #[allow(clippy::cast_precision_loss)]
476 let cmpprec = PREC as f32;
477 #[allow(clippy::cast_precision_loss)]
478 let cmprecip = if leng > 0 {
479 cmpprec / (leng as f32)
480 } else {
481 0.0
482 };
483 // ftol() routes float→i32 through i64 to mirror voxlap C's
484 // wrap-on-overflow `lrintf+(int32_t)cast`. The cf-seed
485 // products (vd ± vd) * cmprecip and vd * cmpprec land at
486 // the i32 boundary for world-coord magnitudes near VSID
487 // (= 2048) × PREC (= 2²⁰); Rust's `as i32` saturates and
488 // diverges for those edge cases.
489 let (gi0, gi1, cx0, cy0) = if cache.prelude.forward_z_sign < 0 {
490 (
491 ftol((f.vd1 - f.vd0) * cmprecip),
492 ftol((f.vz1 - f.vz0) * cmprecip),
493 ftol(f.vd0 * cmpprec),
494 ftol(f.vz0 * cmpprec),
495 )
496 } else {
497 (
498 ftol((f.vd0 - f.vd1) * cmprecip),
499 ftol((f.vz0 - f.vz1) * cmprecip),
500 ftol(f.vd1 * cmpprec),
501 ftol(f.vz1 * cmpprec),
502 )
503 };
504 let cx1 = leng.wrapping_mul(gi0).wrapping_add(cx0);
505 let cy1 = leng.wrapping_mul(gi1).wrapping_add(cy0);
506
507 scratch.gi0 = gi0;
508 scratch.gi1 = gi1;
509
510 // 4. Seed cf[128] with the radar range + air-gap z-bounds +
511 // Q12.20 ray endpoints. voxlap5.c:1176-1190.
512 let gscanptr_isize = scratch.gscanptr as isize;
513 scratch.cf[CF_SEED_INDEX] = CfType {
514 i0: gscanptr_isize,
515 i1: gscanptr_isize + leng as isize,
516 z0: gstartz0,
517 z1: gstartz1,
518 cx0,
519 cy0,
520 cx1,
521 cy1,
522 // S4B.6.l: chz_layer scaffold. Pre-l.2 (= multi-chz seed
523 // construction) every cf entry maps to the same chz —
524 // `seed_chunk_z`, the chunk-z the rasterizer reads voxel
525 // data from. For non-cross-chunk-look-down poses this
526 // equals `camera_chunk_idx[2]`.
527 chz_layer: cache.seed_chunk_z,
528 };
529
530 // 5. gxmax = min(gmaxscandist, frustum-edge clip per axis).
531 // voxlap5.c:1192-1228. Unsigned compare — voxlap's `q`
532 // is a uint64_t product that may exceed gmaxscandist or
533 // wrap negative.
534 //
535 // Also stamps `skycast.dist` per voxlap5.c:1209-1227:
536 // initialised to `gxmax` (the scan-distance ceiling),
537 // overwritten with `0x7FFFFFFF` if either frustum-edge
538 // clip fires (= ray hits world edge before scan-dist
539 // cap → "infinitely far" sky depth). startsky's solid-
540 // fill writes this into every drained radar slot's
541 // `dist`, which the z-buffer ends up carrying.
542 //
543 // S4B.2.c.3: the world-edge clip is per-axis voxel-distance
544 // from `li_pos` to the grid's AABB edge along the ray's
545 // step direction. Single-chunk grids (`chunk_grid: None`)
546 // have AABB = `[0, vsid)²`, byte-identical to today. Multi-
547 // chunk grids derive AABB from `chunk_grid.origin_chunk_xy +
548 // chunks_x/y * chunk_size_xy` so rays can walk across all
549 // chunks before hitting the world edge.
550 let mut gxmax = cache.prelude.max_scan_dist;
551 scratch.skycast.dist = gxmax;
552 // S4B.2.d: world-edge clip uses the grid's voxel AABB.
553 // Single-chunk grids get `([0, 0], [vsid, vsid])` — same as
554 // the historical world-edge math. Multi-chunk grids extend
555 // to cover every chunk's voxel extent.
556 let (aabb_min, aabb_max) = self.grid.aabb_xy();
557 let (world_xmin, world_xmax) = (aabb_min[0], aabb_max[0]);
558 let (world_ymin, world_ymax) = (aabb_min[1], aabb_max[1]);
559 let j0 = if f.gixy[0] < 0 {
560 li_pos_xy[0] - world_xmin
561 } else {
562 world_xmax - 1 - li_pos_xy[0]
563 };
564 let q0 = (i64::from(f.gdz[0]).wrapping_mul(i64::from(j0)))
565 .wrapping_add(i64::from(f.gpz[0] as u32));
566 if (q0 as u64) < u64::from(gxmax as u32) {
567 gxmax = q0 as i32;
568 scratch.skycast.dist = i32::MAX;
569 }
570 let j1 = if f.gixy[1] < 0 {
571 li_pos_xy[1] - world_ymin
572 } else {
573 world_ymax - 1 - li_pos_xy[1]
574 };
575 let q1 = (i64::from(f.gdz[1]).wrapping_mul(i64::from(j1)))
576 .wrapping_add(i64::from(f.gpz[1] as u32));
577 if (q1 as u64) < u64::from(gxmax as u32) {
578 gxmax = q1 as i32;
579 scratch.skycast.dist = i32::MAX;
580 }
581 scratch.gxmax = gxmax;
582
583 // 5b. Per-ray sky-row search. Mirror of voxlap5.c:1236-
584 // 1255. Walks `sky.lng[]` to find the texel-row whose
585 // longitude vector matches the ray's `(vx1, vy1)`
586 // direction; stamps `scratch.sky_off` so
587 // `phase_startsky` knows which row to sample. No-op
588 // when no sky texture is bound.
589 if let Some(sky) = self.sky {
590 sky_per_ray_update(scratch, sky, f.vx1, f.vy1);
591 }
592
593 // 6. Build inputs and call grouscan_run.
594 //
595 // S4B.2.c.2: dispatch the camera-column lookup via
596 // `chunk_at_xy(camera_chunk_idx)` so multi-chunk grids
597 // start the walk inside the camera's actual chunk. Single-
598 // chunk grids (`chunk_grid: None`) get
599 // `chunk_at_xy([0, 0]) == Some(Self)` — the flat-table
600 // lookup with the camera's `column_index`, byte-identical
601 // to the goldens.
602 //
603 // OOB cameras (`in_bounds_xy: false`) start with an empty
604 // column — the grouscan column-step walk picks up real
605 // chunks once `(cx, cy)` cross into the grid.
606 // S4B.6.b: dispatch via `chunk_at_xyz` so stacked grids
607 // start the walk inside the camera's `(chx, chy, chz)`. For
608 // `chunks_z == 1` grids `camera_chunk_idx[2] == 0` and the
609 // shortcut path returns the same chunk as the pre-S4B.6
610 // `chunk_at_xy` lookup — byte-identical for the goldens.
611 //
612 // S4B.6.e: use `cache.seed_chunk_z` for the chz coordinate
613 // (= the chunk that owns the cf-seed's vptr_offset). For
614 // the no-cross-chunk path `seed_chunk_z ==
615 // camera_chunk_idx[2]` and this is unchanged. For
616 // cross-chunk look-down (camera in all-air-bedrock column
617 // with terrain in a deeper chunk) `seed_chunk_z` points to
618 // the deeper chunk so the rasterizer reads its real slab
619 // data instead of the empty camera-chunk column.
620 let seed_chunk_xyz = [
621 cache.prelude.camera_chunk_idx[0],
622 cache.prelude.camera_chunk_idx[1],
623 cache.seed_chunk_z,
624 ];
625 let camera_chunk_opt = if cache.prelude.in_bounds_xy {
626 self.grid.chunk_at_xyz(seed_chunk_xyz)
627 } else {
628 None
629 };
630 let (column, chunk_slab, chunk_cols, chunk_mips, chunk_vsid) = match camera_chunk_opt {
631 Some(chunk) => {
632 #[allow(clippy::cast_sign_loss)]
633 let column_idx_in_chunk = (cache.prelude.camera_local_xyz[1] as u32)
634 .wrapping_mul(chunk.chunk_size_xy)
635 .wrapping_add(cache.prelude.camera_local_xyz[0] as u32);
636 let col =
637 camera_column_slice(chunk.slab_buf, chunk.column_offsets, column_idx_in_chunk)
638 .unwrap_or(&[]);
639 (
640 col,
641 chunk.slab_buf,
642 chunk.column_offsets,
643 chunk.mip_base_offsets,
644 chunk.vsid,
645 )
646 }
647 None => (
648 &[][..],
649 self.grid.slab_buf,
650 self.grid.column_offsets,
651 self.grid.mip_base_offsets,
652 self.grid.vsid,
653 ),
654 };
655 // Copy gcsub out of scratch so the GrouscanInputs immutable
656 // borrow doesn't collide with the `&mut scratch` grouscan_run
657 // takes below. `[i64; 9]` is 72 bytes — cheap.
658 let mut gcsub_local: [i64; 9] = scratch.gcsub;
659 // Voxlap5.c:1230-1234. Per-ray, populate the wall-side lanes
660 // (0/1) from the directional lanes (4/5 = left/right,
661 // 6/7 = up/down) according to the sign of `gixy`. Without
662 // this, `wall_lane` reads from the stale `0x00ff_00ff_00ff_00ff`
663 // baseline and wall faces get no directional darkening, even
664 // after the host calls `set_side_shades`.
665 if scratch.sideshademode {
666 let lane0_idx = if f.gixy[0] < 0 { 4 } else { 5 };
667 let lane1_idx = if f.gixy[1] < 0 { 6 } else { 7 };
668 gcsub_local[0] = gcsub_local[lane0_idx];
669 gcsub_local[1] = gcsub_local[lane1_idx];
670 }
671 // S4B.6.j: decouple the camera's PHYSICAL chz (= used by
672 // the column-step's chunk-XY DDA at every XY crossing)
673 // from the SEED chunk's chz (= where state.column starts).
674 // They match for the in-camera-chunk path; they differ
675 // when seed-time cross-chunk look-down stepped down to a
676 // deeper chunk's real floor. Without the decoupling, the
677 // pinned `seed_chunk_z` propagates through every chunk-XY
678 // swap and chz=`camera_chunk_z` content at OTHER columns
679 // (e.g. a mountain peak the camera's column doesn't share)
680 // becomes permanently invisible.
681 //
682 // Camera ABOVE the grid (camera_chunk_idx[2] < origin_chunk_z,
683 // world z<0 in voxlap z-down) clamps to origin_chunk_z so the
684 // column-step's `chunk_at_xyz(.., camera_chunk_z)` queries land
685 // in the grid's top chunk. S5.2-followup: also clamp UP for
686 // camera BELOW the grid (raw chz past max_chz), common for
687 // rotated small grids whose inverse-rotation lands the local
688 // camera past the grid's z extent. Without the symmetric
689 // upper clamp every XY column lookup returns None and the
690 // grid renders pure sky. Mirrors the seed-side clamp in
691 // `camera_chunk_air_gap`.
692 let raw_camera_chunk_z = cache.prelude.camera_chunk_idx[2];
693 let origin_chunk_z = self.grid.chunk_grid.map_or(0, |cg| cg.origin_chunk_z);
694 let chunks_z = self.grid.chunk_grid.map_or(1, |cg| cg.chunks_z) as i32;
695 let max_chz = origin_chunk_z + chunks_z - 1;
696 let camera_chunk_z = raw_camera_chunk_z.clamp(origin_chunk_z, max_chz);
697 let seed_chunk_z = cache.seed_chunk_z;
698 #[allow(clippy::cast_possible_wrap)]
699 let chunk_size_z_signed = self.grid.chunk_size_z as i32;
700 // chunk_world_z_base for the SEED chunk. State reads its
701 // slab data first; per-XY column-step resets this to
702 // `camera_chunk_z * chunk_size_z` for the chunks the DDA
703 // visits. OOB-XY: synthesised seed, no base offset.
704 let chunk_world_z_base = if cache.prelude.in_bounds_xy {
705 seed_chunk_z * chunk_size_z_signed
706 } else {
707 0
708 };
709 let inputs = GrouscanInputs {
710 column,
711 gylookup: &cache.prelude.y_lookup,
712 gcsub: &gcsub_local,
713 slab_buf: chunk_slab,
714 column_offsets: chunk_cols,
715 mip_base_offsets: chunk_mips,
716 vsid: chunk_vsid,
717 sky: self.sky.map(crate::grouscan::SkyRef::from_sky),
718 grid_view: self.grid,
719 // S4B.6.b: pin the camera's chz layer for the
720 // column-step's chunk-XY swap.
721 camera_chunk_z,
722 // S4B.6.c: cf seed + slab byte reads operate in world-z
723 // by adding `chunk_world_z_base` to chunk-local z reads.
724 chunk_world_z_base,
725 chunk_size_z: self.grid.chunk_size_z,
726 };
727 // gmipnum: min of (built mip levels in chunk) and
728 // (caller-requested cap from settings.mip_levels). Both
729 // matter — the chunk dictates whether mip-N data exists,
730 // while the cap dictates how big y_lookup is. Mismatch
731 // would let `phase_remiporend` advance gylookup past its
732 // tail and read garbage offsets. R4.5d's body still gates
733 // on `state.gmipcnt < gmipnum` so a chunk with no mips
734 // takes the early-out and renders mip-0 only.
735 let chunk_mips = u32::try_from(self.grid.mip_base_offsets.len().saturating_sub(1))
736 .expect("mip count fits in u32");
737 let gmipnum = chunk_mips.min(cache.prelude.mip_levels).max(1);
738 let _ = grouscan_run(
739 scratch,
740 &inputs,
741 vptr_offset,
742 column_index as usize,
743 cache.prelude.cx,
744 cache.prelude.cy,
745 cache.prelude.x_mip,
746 gmipnum,
747 );
748
749 // gscanptr is advanced by the opticast quadrant scan
750 // (`scan_loops.rs::top_quadrant` etc., voxlap5.c:2382 area)
751 // AFTER each gline call. Voxlap's `gline` itself does NOT
752 // touch gscanptr — advancing it here too created gaps of
753 // `leng+1` unwritten radar slots between consecutive glines,
754 // which read back as 0 in hrend → black pixels at the
755 // sphere position in diag_down / high_down.
756 }
757
758 #[allow(clippy::too_many_lines)]
759 fn hrend(
760 &mut self,
761 scratch: &mut ScanScratch,
762 sx: i32,
763 sy: i32,
764 p1: i32,
765 plc: i32,
766 incr: i32,
767 j: i32,
768 ) {
769 let rs = self
770 .frame
771 .as_ref()
772 .map(|f| f.ray_step)
773 .expect("hrend/vrend called before frame_setup");
774 // Per-frame setup gives strx/stry/heix/heiy/addx/addy; per-
775 // pixel direction = strx*sx + heix*sy + addx, advancing by
776 // strx in the inner loop.
777 #[allow(clippy::cast_precision_loss)]
778 let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
779 #[allow(clippy::cast_precision_loss)]
780 let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
781 let row_start = sy as usize * self.pitch_pixels;
782
783 let mut plc_local = plc;
784 let mut x = sx;
785
786 // R5.1: SSE2 4-pixel batch via `_mm_rsqrt_ps` — port of
787 // voxlaptest's `hrendzsse` (voxlap5.c:1947). 12-bit
788 // approximation, no Newton refine, matching the
789 // historical asm. The tail (0..3 leftover pixels)
790 // continues with the bit-exact scalar form below; the
791 // batch's z lanes will not match scalar 1/sqrt exactly,
792 // mirroring voxlap. SSE2 is x86_64 baseline so no
793 // runtime CPU-feature check is needed.
794 //
795 // `cast_ptr_alignment` is suppressed because we use
796 // `_mm_storeu_si128` / `_mm_storeu_ps` — the `u`-suffix
797 // variants explicitly support unaligned addresses, so a
798 // u32 pointer cast to `*mut __m128i` is sound.
799 #[cfg(target_arch = "x86_64")]
800 #[allow(clippy::cast_ptr_alignment)]
801 unsafe {
802 use core::arch::x86_64::{
803 __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
804 _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
805 };
806 let strx = rs.strx;
807 let stry = rs.stry;
808 let vstrx4 = _mm_set1_ps(strx * 4.0);
809 let vstry4 = _mm_set1_ps(stry * 4.0);
810 let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
811 let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
812 while p1 - x >= 4 {
813 // Gather 4 castdat hits — one per ray index.
814 let mut col = [0i32; 4];
815 let mut dst = [0i32; 4];
816 for k in 0..4 {
817 let ray_idx = (plc_local >> 16) as usize;
818 let cd_offset = scratch.angstart[ray_idx] + j as isize;
819 let cd = scratch.radar[cd_offset as usize];
820 col[k] = cd.col;
821 dst[k] = cd.dist;
822 plc_local = plc_local.wrapping_add(incr);
823 }
824 // R5.2: per-pixel fog blend (voxlap's `hrendzfogsse`).
825 // No-op when foglut is empty. Voxlap's MMX path used
826 // pmulhw with foglut as 4 packed int16 lanes; we
827 // mirror the scalar fallback the goldens use, which
828 // applies a single `l = foglut[..] & 32767` factor
829 // per pixel (one `l` per ray, all 3 channels).
830 if !scratch.foglut.is_empty() {
831 for k in 0..4 {
832 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
833 }
834 }
835 let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
836 let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
837 let vdst = _mm_cvtepi32_ps(vdsi);
838 let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
839 let vinv = _mm_rsqrt_ps(vsqr);
840 let vz = _mm_mul_ps(vdst, vinv);
841
842 let pixel_idx = row_start + x as usize;
843 _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
844 _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
845
846 vdx = _mm_add_ps(vdx, vstrx4);
847 vdy = _mm_add_ps(vdy, vstry4);
848 x += 4;
849 }
850 // Bring scalar dirx/diry up to where the batch left
851 // off — first lane of the post-step vdx/vdy.
852 dirx = _mm_cvtss_f32(vdx);
853 diry = _mm_cvtss_f32(vdy);
854 }
855
856 // R9: NEON 4-pixel batch — aarch64 equivalent of the SSE2
857 // path above. Uses `vrsqrteq_f32` + one Newton–Raphson step
858 // via `vrsqrtsq_f32` for ~16-bit precision (vs SSE2's ~12-bit
859 // without Newton). NEON is baseline on all AArch64 — no
860 // runtime feature check needed. Stores are naturally unaligned.
861 #[cfg(target_arch = "aarch64")]
862 unsafe {
863 use core::arch::aarch64::{
864 float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
865 vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
866 vst1q_u32,
867 };
868 let strx = rs.strx;
869 let stry = rs.stry;
870 let vstrx4 = vdupq_n_f32(strx * 4.0);
871 let vstry4 = vdupq_n_f32(stry * 4.0);
872 let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
873 let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
874 let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
875 let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
876 while p1 - x >= 4 {
877 // Scalar gather — same as SSE2 path.
878 let mut col = [0i32; 4];
879 let mut dst = [0i32; 4];
880 for k in 0..4 {
881 let ray_idx = (plc_local >> 16) as usize;
882 let cd_offset = scratch.angstart[ray_idx] + j as isize;
883 let cd = scratch.radar[cd_offset as usize];
884 col[k] = cd.col;
885 dst[k] = cd.dist;
886 plc_local = plc_local.wrapping_add(incr);
887 }
888 if !scratch.foglut.is_empty() {
889 for k in 0..4 {
890 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
891 }
892 }
893 let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
894 let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
895 let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
896 // One Newton–Raphson step: est * vrsqrts(x * est, est).
897 let est = vrsqrteq_f32(vsqr);
898 let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
899 let vz = vmulq_f32(vdst, vinv);
900
901 let pixel_idx = row_start + x as usize;
902 vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
903 vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
904
905 vdx = vaddq_f32(vdx, vstrx4);
906 vdy = vaddq_f32(vdy, vstry4);
907 x += 4;
908 }
909 dirx = vgetq_lane_f32(vdx, 0);
910 diry = vgetq_lane_f32(vdy, 0);
911 }
912
913 // R10.3: wasm SIMD 4-pixel batch — equivalent of the SSE2
914 // / NEON paths above. Uses `1.0 / sqrt(x)` (full-precision
915 // `f32x4_sqrt` + `f32x4_div`) where SSE2 had `_mm_rsqrt_ps`
916 // and NEON had `vrsqrteq_f32`+Newton, since wasm SIMD has
917 // no rsqrt approximation. Wasm bytes therefore differ
918 // from both x86 and aarch64 goldens — captured by R10.4's
919 // separate `wasm-hashes.txt`.
920 #[cfg(target_arch = "wasm32")]
921 unsafe {
922 use core::arch::wasm32::{
923 f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
924 f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
925 };
926 let strx = rs.strx;
927 let stry = rs.stry;
928 let vstrx4 = f32x4_splat(strx * 4.0);
929 let vstry4 = f32x4_splat(stry * 4.0);
930 let one = f32x4_splat(1.0);
931 let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
932 let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
933 while p1 - x >= 4 {
934 // Scalar gather — same shape as SSE2 / NEON paths.
935 let mut col = [0i32; 4];
936 let mut dst = [0i32; 4];
937 for k in 0..4 {
938 let ray_idx = (plc_local >> 16) as usize;
939 let cd_offset = scratch.angstart[ray_idx] + j as isize;
940 let cd = scratch.radar[cd_offset as usize];
941 col[k] = cd.col;
942 dst[k] = cd.dist;
943 plc_local = plc_local.wrapping_add(incr);
944 }
945 if !scratch.foglut.is_empty() {
946 for k in 0..4 {
947 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
948 }
949 }
950 let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
951 let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
952 let vdst = f32x4_convert_i32x4(vdsi);
953 let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
954 let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
955 let vz = f32x4_mul(vdst, vinv);
956
957 let pixel_idx = row_start + x as usize;
958 v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
959 v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
960
961 vdx = f32x4_add(vdx, vstrx4);
962 vdy = f32x4_add(vdy, vstry4);
963 x += 4;
964 }
965 dirx = f32x4_extract_lane::<0>(vdx);
966 diry = f32x4_extract_lane::<0>(vdy);
967 }
968
969 // Scalar tail — handles 0..3 leftover pixels on x86_64 /
970 // aarch64 / wasm32 and the full body on other targets.
971 while x < p1 {
972 // ray index = signed shift right (voxlap's `plc >> 16`).
973 let ray_idx = (plc_local >> 16) as usize;
974 let cd_offset = scratch.angstart[ray_idx] + j as isize;
975 let cd = scratch.radar[cd_offset as usize];
976 let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
977
978 let pixel_idx = row_start + x as usize;
979 #[allow(clippy::cast_precision_loss)]
980 let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
981 // SAFETY: pixel_idx = sy*pitch + x, with sy < yres and x < p1
982 // ≤ xres (loop guard); p1 ≤ ctx.xres in scan_loops::top_quadrant /
983 // bottom_quadrant. fb / zb were allocated at pitch*height by the
984 // caller (asserted in Engine::render's preamble); pixel_idx is
985 // therefore in-range. Wedge-disjoint invariant: top + bottom
986 // quadrants own disjoint sy ranges.
987 unsafe {
988 self.target.write_color(pixel_idx, col as u32);
989 self.target.write_depth(pixel_idx, z);
990 }
991
992 dirx += rs.strx;
993 diry += rs.stry;
994 plc_local = plc_local.wrapping_add(incr);
995 x += 1;
996 }
997 }
998
999 #[allow(clippy::too_many_lines)]
1000 fn vrend(
1001 &mut self,
1002 scratch: &mut ScanScratch,
1003 sx: i32,
1004 sy: i32,
1005 p1: i32,
1006 iplc: i32,
1007 iinc: i32,
1008 ) {
1009 let rs = self
1010 .frame
1011 .as_ref()
1012 .map(|f| f.ray_step)
1013 .expect("hrend/vrend called before frame_setup");
1014 #[allow(clippy::cast_precision_loss)]
1015 let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
1016 #[allow(clippy::cast_precision_loss)]
1017 let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
1018 let row_start = sy as usize * self.pitch_pixels;
1019 let half_stride = scratch.uurend_half_stride;
1020
1021 let mut iplc_local = iplc;
1022 let mut x = sx;
1023
1024 // R5.3: SSE2 4-pixel batch — port of voxlaptest's
1025 // `vrendzsse` (voxlap5.c:2083). The per-column
1026 // `uurend[sx] += uurend[sx + half_stride]` update is
1027 // parallel-safe: uurend[sx + half_stride..] is read-only
1028 // here, and uurend[sx..+3] are four distinct lanes.
1029 // Read OLD u/d values, do the SSE z math, then write
1030 // back four NEW u values. Plus fog blend (R5.2-style)
1031 // when foglut is non-empty.
1032 #[cfg(target_arch = "x86_64")]
1033 #[allow(clippy::cast_ptr_alignment)]
1034 unsafe {
1035 use core::arch::x86_64::{
1036 __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
1037 _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
1038 };
1039 let strx = rs.strx;
1040 let stry = rs.stry;
1041 let vstrx4 = _mm_set1_ps(strx * 4.0);
1042 let vstry4 = _mm_set1_ps(stry * 4.0);
1043 let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
1044 let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
1045 while p1 - x >= 4 {
1046 let xu = x as usize;
1047 // Read 4 OLD uurend pairs (u, d). u = current ray
1048 // index for column; d = per-pixel delta.
1049 let mut u = [0i32; 4];
1050 let mut d = [0i32; 4];
1051 for k in 0..4 {
1052 u[k] = scratch.uurend[xu + k];
1053 d[k] = scratch.uurend[xu + k + half_stride];
1054 }
1055 // Gather 4 castdat hits.
1056 let mut col = [0i32; 4];
1057 let mut dst = [0i32; 4];
1058 for k in 0..4 {
1059 let ray_idx = (u[k] >> 16) as usize;
1060 let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
1061 let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
1062 let cd = scratch.radar[cd_offset as usize];
1063 col[k] = cd.col;
1064 dst[k] = cd.dist;
1065 }
1066 if !scratch.foglut.is_empty() {
1067 for k in 0..4 {
1068 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
1069 }
1070 }
1071 let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
1072 let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
1073 let vdst = _mm_cvtepi32_ps(vdsi);
1074 let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
1075 let vinv = _mm_rsqrt_ps(vsqr);
1076 let vz = _mm_mul_ps(vdst, vinv);
1077
1078 let pixel_idx = row_start + xu;
1079 _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
1080 _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
1081
1082 // Write back NEW uurend values — u + d per lane.
1083 for k in 0..4 {
1084 scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
1085 }
1086
1087 vdx = _mm_add_ps(vdx, vstrx4);
1088 vdy = _mm_add_ps(vdy, vstry4);
1089 iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
1090 x += 4;
1091 }
1092 dirx = _mm_cvtss_f32(vdx);
1093 diry = _mm_cvtss_f32(vdy);
1094 }
1095
1096 // R9: NEON 4-pixel batch for vrend — aarch64 equivalent.
1097 // Same structure as hrend NEON: scalar gather + uurend
1098 // read/write, NEON rsqrt for z, vectorized store.
1099 #[cfg(target_arch = "aarch64")]
1100 unsafe {
1101 use core::arch::aarch64::{
1102 float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
1103 vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
1104 vst1q_u32,
1105 };
1106 let strx = rs.strx;
1107 let stry = rs.stry;
1108 let vstrx4 = vdupq_n_f32(strx * 4.0);
1109 let vstry4 = vdupq_n_f32(stry * 4.0);
1110 let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
1111 let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
1112 let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
1113 let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
1114 while p1 - x >= 4 {
1115 let xu = x as usize;
1116 // Read 4 OLD uurend pairs (u, d).
1117 let mut u = [0i32; 4];
1118 let mut d = [0i32; 4];
1119 for k in 0..4 {
1120 u[k] = scratch.uurend[xu + k];
1121 d[k] = scratch.uurend[xu + k + half_stride];
1122 }
1123 // Scalar gather — 4 castdat hits.
1124 let mut col = [0i32; 4];
1125 let mut dst = [0i32; 4];
1126 for k in 0..4 {
1127 let ray_idx = (u[k] >> 16) as usize;
1128 let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
1129 let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
1130 let cd = scratch.radar[cd_offset as usize];
1131 col[k] = cd.col;
1132 dst[k] = cd.dist;
1133 }
1134 if !scratch.foglut.is_empty() {
1135 for k in 0..4 {
1136 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
1137 }
1138 }
1139 let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
1140 let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
1141 let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
1142 let est = vrsqrteq_f32(vsqr);
1143 let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
1144 let vz = vmulq_f32(vdst, vinv);
1145
1146 let pixel_idx = row_start + xu;
1147 vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
1148 vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
1149
1150 // Write back NEW uurend values — u + d per lane.
1151 for k in 0..4 {
1152 scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
1153 }
1154
1155 vdx = vaddq_f32(vdx, vstrx4);
1156 vdy = vaddq_f32(vdy, vstry4);
1157 iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
1158 x += 4;
1159 }
1160 dirx = vgetq_lane_f32(vdx, 0);
1161 diry = vgetq_lane_f32(vdy, 0);
1162 }
1163
1164 // R10.3: wasm SIMD 4-pixel batch for vrend — equivalent of
1165 // the SSE2 / NEON paths above. Same scalar-gather + uurend
1166 // read/write structure; full-precision `1.0 / sqrt(x)` for
1167 // the inverse magnitude, since wasm SIMD has no rsqrt
1168 // approximation. Bytes diverge from the other arches —
1169 // R10.4's `wasm-hashes.txt` covers the divergence.
1170 #[cfg(target_arch = "wasm32")]
1171 unsafe {
1172 use core::arch::wasm32::{
1173 f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
1174 f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
1175 };
1176 let strx = rs.strx;
1177 let stry = rs.stry;
1178 let vstrx4 = f32x4_splat(strx * 4.0);
1179 let vstry4 = f32x4_splat(stry * 4.0);
1180 let one = f32x4_splat(1.0);
1181 let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
1182 let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
1183 while p1 - x >= 4 {
1184 let xu = x as usize;
1185 // Read 4 OLD uurend pairs (u, d).
1186 let mut u = [0i32; 4];
1187 let mut d = [0i32; 4];
1188 for k in 0..4 {
1189 u[k] = scratch.uurend[xu + k];
1190 d[k] = scratch.uurend[xu + k + half_stride];
1191 }
1192 // Scalar gather — 4 castdat hits.
1193 let mut col = [0i32; 4];
1194 let mut dst = [0i32; 4];
1195 for k in 0..4 {
1196 let ray_idx = (u[k] >> 16) as usize;
1197 let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
1198 let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
1199 let cd = scratch.radar[cd_offset as usize];
1200 col[k] = cd.col;
1201 dst[k] = cd.dist;
1202 }
1203 if !scratch.foglut.is_empty() {
1204 for k in 0..4 {
1205 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
1206 }
1207 }
1208 let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
1209 let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
1210 let vdst = f32x4_convert_i32x4(vdsi);
1211 let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
1212 let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
1213 let vz = f32x4_mul(vdst, vinv);
1214
1215 let pixel_idx = row_start + xu;
1216 v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
1217 v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
1218
1219 // Write back NEW uurend values — u + d per lane.
1220 for k in 0..4 {
1221 scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
1222 }
1223
1224 vdx = f32x4_add(vdx, vstrx4);
1225 vdy = f32x4_add(vdy, vstry4);
1226 iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
1227 x += 4;
1228 }
1229 dirx = f32x4_extract_lane::<0>(vdx);
1230 diry = f32x4_extract_lane::<0>(vdy);
1231 }
1232
1233 // Scalar tail — handles 0..3 leftover pixels on x86_64 /
1234 // aarch64 / wasm32 and the full body on other targets.
1235 while x < p1 {
1236 // Vertical scan reads the per-column ray index from
1237 // uurend[sx] (>>16 to drop the fractional bits).
1238 let xu = x as usize;
1239 let ray_idx = (scratch.uurend[xu] >> 16) as usize;
1240 let cd_offset = scratch.angstart[ray_idx] + iplc_local as isize;
1241 let cd = scratch.radar[cd_offset as usize];
1242 let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
1243
1244 let pixel_idx = row_start + xu;
1245 #[allow(clippy::cast_precision_loss)]
1246 let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
1247 // SAFETY: see hrend's matching write — pixel_idx is in-bounds
1248 // by the same scan_loops geometry argument; right + left
1249 // quadrants own disjoint sx ranges so cross-thread writes
1250 // are pairwise pixel-disjoint.
1251 unsafe {
1252 self.target.write_color(pixel_idx, col as u32);
1253 self.target.write_depth(pixel_idx, z);
1254 }
1255
1256 dirx += rs.strx;
1257 diry += rs.stry;
1258 // Advance per-column ray index. uurend[x] persists
1259 // across vrend calls — this state is what couples
1260 // consecutive scanlines through the same column.
1261 scratch.uurend[xu] = scratch.uurend[xu].wrapping_add(scratch.uurend[xu + half_stride]);
1262 x += 1;
1263 iplc_local = iplc_local.wrapping_add(iinc);
1264 }
1265 }
1266}
1267
1268#[cfg(test)]
1269mod tests {
1270 use super::*;
1271 use crate::rasterizer::CastDat;
1272
1273 /// Build owned per-frame state so tests can assemble a
1274 /// `ScanContext` with proper-lifetime borrows. Values aren't
1275 /// load-bearing for the scalar-fill behaviour tests; the real
1276 /// `gline` cares about them, hence `camera_state` joining the
1277 /// tuple.
1278 fn dummy_per_frame() -> (
1279 crate::camera_math::CameraState,
1280 crate::projection::ProjectionRect,
1281 crate::ray_step::RayStep,
1282 crate::opticast_prelude::OpticastPrelude,
1283 ) {
1284 let cam = crate::Camera {
1285 pos: [0.0, 0.0, 0.0],
1286 right: [1.0, 0.0, 0.0],
1287 down: [0.0, 1.0, 0.0],
1288 forward: [0.0, 0.0, 1.0],
1289 };
1290 let cs = crate::camera_math::derive(&cam, 64, 64, 32.0, 32.0, 32.0);
1291 let proj = crate::projection::derive_projection(&cs, 64, 64, 32.0, 32.0, 32.0, 1);
1292 let rs = crate::ray_step::derive_ray_step(&cs, proj.cx, proj.cy, 32.0);
1293 let prelude = crate::opticast_prelude::derive_prelude(&cs, 2048, 1, 4, 1024, 1);
1294 (cs, proj, rs, prelude)
1295 }
1296
1297 #[test]
1298 fn frame_setup_caches_ray_step() {
1299 let mut fb = vec![0u32; 64 * 64];
1300 let mut zb = vec![0.0f32; 64 * 64];
1301 let mip_base = [0usize, 0];
1302 let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1303 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1304 let (cs, proj, rs, prelude) = dummy_per_frame();
1305 let ctx = ScanContext {
1306 proj: &proj,
1307 rs: &rs,
1308 prelude: &prelude,
1309 xres: 64,
1310 y_start: 0,
1311 y_end: 64,
1312 anginc: 1,
1313 camera_state: &cs,
1314 camera_gstartz0: 0,
1315 camera_gstartz1: 0,
1316 camera_vptr_offset: 0,
1317 camera_seed_chunk_z: 0,
1318 };
1319 r.frame_setup(&ctx);
1320 let cached_rs = r.frame.as_ref().expect("frame populated").ray_step;
1321 assert_eq!(cached_rs.strx.to_bits(), rs.strx.to_bits());
1322 assert_eq!(cached_rs.stry.to_bits(), rs.stry.to_bits());
1323 assert_eq!(cached_rs.cx16, rs.cx16);
1324 assert_eq!(cached_rs.cy16, rs.cy16);
1325 }
1326
1327 #[cfg(target_arch = "x86_64")]
1328 #[test]
1329 fn hrend_sse_batch_writes_4_pixel_block() {
1330 // R5.1 smoke test: hrend's SSE batch fires for span len ≥
1331 // 4. Pre-fill radar with 4 distinct ARGB values, run hrend
1332 // over [10..14], assert the framebuffer carries the colour
1333 // bits (z lanes use rsqrtps approximation so are intent-
1334 // ionally not bit-checked).
1335 let mut fb = vec![0u32; 64 * 64];
1336 let mut zb = vec![0.0f32; 64 * 64];
1337 let mip_base = [0usize, 0];
1338 let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1339 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1340 let (cs, proj, rs, prelude) = dummy_per_frame();
1341 let ctx = ScanContext {
1342 proj: &proj,
1343 rs: &rs,
1344 prelude: &prelude,
1345 xres: 64,
1346 y_start: 0,
1347 y_end: 64,
1348 anginc: 1,
1349 camera_state: &cs,
1350 camera_gstartz0: 0,
1351 camera_gstartz1: 0,
1352 camera_vptr_offset: 0,
1353 camera_seed_chunk_z: 0,
1354 };
1355 r.frame_setup(&ctx);
1356
1357 let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1358 // 4 colour records so the batch reads each lane.
1359 for (i, slot) in scratch.radar.iter_mut().enumerate().take(4) {
1360 slot.col = 0x8000_0000_u32 as i32 | i as i32;
1361 slot.dist = 1024;
1362 }
1363 // angstart[i] = i so ray_idx i (= plc>>16) resolves to
1364 // radar[i + j] = radar[i] with j=0.
1365 for k in 0..4 {
1366 scratch.angstart[k] = k as isize;
1367 }
1368
1369 // sx=10, p1=14, j=0, plc=0, incr=1<<16 → plc>>16 steps
1370 // 0,1,2,3 over the four pixels → angstart[0..4].
1371 r.hrend(&mut scratch, 10, 5, 14, 0, 1 << 16, 0);
1372
1373 let row_off = 5 * 64;
1374 for k in 0..4 {
1375 let want = 0x8000_0000_u32 | k as u32;
1376 assert_eq!(
1377 fb[row_off + 10 + k],
1378 want,
1379 "fb[5][{}] = {:#010x}, expected {:#010x}",
1380 10 + k,
1381 fb[row_off + 10 + k],
1382 want,
1383 );
1384 // z lane non-zero (rsqrtps produced something).
1385 // Bit-compare to dodge clippy::float_cmp; we just want
1386 // to confirm the slot was written, not its precise value.
1387 assert_ne!(zb[row_off + 10 + k].to_bits(), 0u32);
1388 }
1389 }
1390
1391 #[test]
1392 fn fog_blend_disabled_returns_col_unchanged() {
1393 let foglut: Vec<i32> = Vec::new();
1394 let col = 0x0080_C040;
1395 assert_eq!(fog_blend(col, 0x1234_5678, &foglut, 0xFF_FFFF), col);
1396 }
1397
1398 #[test]
1399 fn fog_blend_full_fog_returns_fog_col_per_channel() {
1400 // l = 32767 → ((fog - col) * 32767) >> 15 = fog - col → final
1401 // is col + (fog - col) = fog (per channel; alpha untouched).
1402 let foglut = vec![32767; 2048];
1403 let col = 0x80_AA_BB_CC_u32 as i32;
1404 let fog = 0x00_11_22_33_i32;
1405 let blended = fog_blend(col, 0, &foglut, fog) as u32;
1406 // Low 24 bits = fog colour; alpha (high byte) survives from col.
1407 assert_eq!(blended & 0x00FF_FFFF, fog as u32 & 0x00FF_FFFF);
1408 assert_eq!(blended & 0xFF00_0000, col as u32 & 0xFF00_0000);
1409 }
1410
1411 #[test]
1412 fn set_fog_zero_distance_clears_table() {
1413 let mut s = ScanScratch::new_for_size(64, 64, 64);
1414 s.set_fog(0x1234_5678, 100);
1415 assert!(!s.foglut.is_empty());
1416 s.set_fog(0, 0);
1417 assert!(s.foglut.is_empty());
1418 }
1419
1420 #[test]
1421 fn set_fog_table_starts_at_zero_and_climbs() {
1422 let mut s = ScanScratch::new_for_size(64, 64, 64);
1423 s.set_fog(0xFF, 1024);
1424 // First entry: acc = 0 → hi16 = 0.
1425 assert_eq!(s.foglut[0], 0);
1426 // Last entry near the overflow boundary should be near 32767
1427 // (the saturate-fill value); voxlap's exact step makes
1428 // foglut[2047] either ~32766 (last walked entry) or 32767
1429 // (post-overflow padding) depending on max_scan_dist
1430 // divisibility.
1431 assert!(
1432 s.foglut[2047] > 30_000,
1433 "tail entry too low: {}",
1434 s.foglut[2047]
1435 );
1436 }
1437
1438 #[test]
1439 fn hrend_writes_pixel_per_column_from_radar() {
1440 // Pre-fill scratch.radar with a recognizable colour gradient
1441 // and pre-set scratch.angstart so hrend resolves to the
1442 // expected radar slot per pixel. Then call hrend manually
1443 // and verify the framebuffer received the colours.
1444 let mut fb = vec![0u32; 64 * 64];
1445 let mut zb = vec![0.0f32; 64 * 64];
1446 let mip_base = [0usize, 0];
1447 let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1448 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1449 let (cs, proj, rs, prelude) = dummy_per_frame();
1450 let ctx = ScanContext {
1451 proj: &proj,
1452 rs: &rs,
1453 prelude: &prelude,
1454 xres: 64,
1455 y_start: 0,
1456 y_end: 64,
1457 anginc: 1,
1458 camera_state: &cs,
1459 camera_gstartz0: 0,
1460 camera_gstartz1: 0,
1461 camera_vptr_offset: 0,
1462 camera_seed_chunk_z: 0,
1463 };
1464 r.frame_setup(&ctx);
1465
1466 let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1467 // Synthetic radar: 16 castdat entries with col = 0xAA...,
1468 // dist = 1 (zbuffer math will divide by sqrt of dir² so we
1469 // just pick a stable distance).
1470 for (i, slot) in scratch.radar.iter_mut().enumerate().take(16) {
1471 slot.col = 0x8000_0000_u32 as i32 | i as i32;
1472 slot.dist = 1024;
1473 }
1474 // angstart[0..4] all point at radar[0]; with j=0..3 and a
1475 // plc that increments by 0 (incr=0 holds plc>>16 at 0), the
1476 // pixel-by-pixel index lands at slots 0, 1, 2, 3 — i.e. j
1477 // selects the column.
1478 scratch.angstart[0] = 0;
1479
1480 // Render a span of 4 columns starting at sx=10, sy=5, with j
1481 // varying via the scan: but voxlap's hrend uses a single j
1482 // for the whole span (per-ray castdat column is fixed). We
1483 // pick j=2 and verify framebuffer[row*pitch + sx..sx+4] all
1484 // equal radar[0+2].col = 0x80000002.
1485 r.hrend(&mut scratch, 10, 5, 14, 0, 0, 2);
1486
1487 let row_off = 5 * 64;
1488 for x in 10..14 {
1489 let want = 0x8000_0000_u32 | 2;
1490 assert_eq!(
1491 fb[row_off + x],
1492 want,
1493 "fb[5][{x}] = {:#010x}, expected {:#010x}",
1494 fb[row_off + x],
1495 want,
1496 );
1497 }
1498 // Pixels outside the rendered span are untouched.
1499 assert_eq!(fb[row_off + 9], 0);
1500 assert_eq!(fb[row_off + 14], 0);
1501 }
1502
1503 #[test]
1504 fn end_to_end_opticast_runs_through_real_gline() {
1505 // Smoke test: with a valid above-the-slab camera and the
1506 // real R4.3a-rewire-3b gline, full opticast should run
1507 // without panicking and return `Rendered`. The synthetic
1508 // single-slab world has no voxel colour bytes so grouscan's
1509 // fill loops bail to startsky, which writes the configured
1510 // skycast into the radar — verify by setting a recognizable
1511 // skycast and asserting some pixels carry it.
1512 use crate::opticast as opticast_fn;
1513 use crate::rasterizer::ScratchPool;
1514 use crate::OpticastSettings;
1515
1516 let mut fb = vec![0u32; 640 * 480];
1517 let mut zb = vec![0.0f32; 640 * 480];
1518 let mut pool = ScratchPool::new(640, 480, 2048);
1519 // Recognizable sky colour — pixels filled by startsky's
1520 // solid-fill branch (the path the empty-colour-byte slab
1521 // ends up routing through) carry this.
1522 let sky_col = 0x80AB_CDEF_u32 as i32;
1523 pool.set_skycast(sky_col, 0x7FFF_FFFF);
1524
1525 // Single solid slab at z = 200..254. cz = 128 < 200 →
1526 // air-above-the-slab, opticast renders. Synthetic world:
1527 // only the camera's column (1024 * 2048 + 1024) holds the
1528 // slab; every other column is empty. Build world before
1529 // the rasterizer so it can borrow `&column` / `&column_offsets`.
1530 let column = vec![0u8, 200, 254, 0];
1531 let cam_idx = 1024usize * 2048 + 1024;
1532 let mut column_offsets = vec![0u32; 2048 * 2048 + 1];
1533 let column_len_u32 = u32::try_from(column.len()).expect("column fits u32");
1534 for offset in &mut column_offsets[(cam_idx + 1)..] {
1535 *offset = column_len_u32;
1536 }
1537
1538 let mip_base_offsets = [0usize, column_offsets.len()];
1539 let grid = crate::grid_view::GridView::from_parts(
1540 2048,
1541 &column,
1542 &column_offsets,
1543 &mip_base_offsets,
1544 );
1545 let mut rasterizer = ScalarRasterizer::new(&mut fb, &mut zb, 640, grid);
1546
1547 let cam = crate::Camera {
1548 pos: [1024.0, 1024.0, 128.0],
1549 right: [1.0, 0.0, 0.0],
1550 down: [0.0, 1.0, 0.0],
1551 forward: [0.0, 0.0, 1.0],
1552 };
1553 let settings = OpticastSettings::for_oracle_framebuffer(640, 480);
1554
1555 let outcome = opticast_fn(&mut rasterizer, &mut pool, &cam, &settings, grid);
1556 assert_eq!(outcome, crate::OpticastOutcome::Rendered);
1557
1558 // Wiring smoke test — gline → derive_gline_frustum →
1559 // grouscan_run chain executes for every ray without
1560 // panicking, opticast returns Rendered. The synthetic
1561 // single-slab world has no colour bytes (header only) so
1562 // grouscan's drawflor fill bails on the bounds check and
1563 // routes to predeletez → deletez → Done before reaching
1564 // startsky; the radar stays at default zeros, the
1565 // framebuffer ends up sky-blue (the host pre-fill). What
1566 // matters is that nothing crashed — the per-ray gline
1567 // arithmetic + cf[128] seeding + grouscan dispatch all
1568 // hold up under live ray geometry. Once R4.3a-rewire-4
1569 // loads a real `.vxl` with colour bytes, grouscan's fill
1570 // loops will write recognisable voxel colours and a
1571 // colour-presence assertion replaces this comment.
1572 let _ = sky_col; // suppress unused-let warning — kept as
1573 // scaffolding for the future assertion.
1574 }
1575
1576 #[cfg(target_arch = "x86_64")]
1577 #[test]
1578 fn vrend_sse_batch_writes_4_pixel_block() {
1579 // R5.3 smoke test: vrend's SSE batch fires for span len ≥
1580 // 4. Pre-fill 4 distinct radar entries, set angstart so
1581 // each lane's uurend[sx]>>16 indexes a different ray, run
1582 // vrend over [10..14], assert each column got the right
1583 // colour and uurend advanced.
1584 let mut fb = vec![0u32; 64 * 64];
1585 let mut zb = vec![0.0f32; 64 * 64];
1586 let mip_base = [0usize, 0];
1587 let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1588 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1589 let (cs, proj, rs, prelude) = dummy_per_frame();
1590 let ctx = ScanContext {
1591 proj: &proj,
1592 rs: &rs,
1593 prelude: &prelude,
1594 xres: 64,
1595 y_start: 0,
1596 y_end: 64,
1597 anginc: 1,
1598 camera_state: &cs,
1599 camera_gstartz0: 0,
1600 camera_gstartz1: 0,
1601 camera_vptr_offset: 0,
1602 camera_seed_chunk_z: 0,
1603 };
1604 r.frame_setup(&ctx);
1605
1606 let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1607 for k in 0..4 {
1608 scratch.radar[k] = CastDat {
1609 col: 0x8000_0000_u32 as i32 | k as i32,
1610 dist: 1024,
1611 };
1612 scratch.angstart[k] = k as isize;
1613 }
1614 // uurend[sx + k] >> 16 = k → ray_idx k → angstart[k] = k
1615 // → radar[k]. delta = 5 (so post-batch uurend[sx + k] =
1616 // (k << 16) + 5).
1617 let half = scratch.uurend_half_stride;
1618 for k in 0..4 {
1619 scratch.uurend[10 + k] = (k as i32) << 16;
1620 scratch.uurend[10 + k + half] = 5;
1621 }
1622
1623 r.vrend(&mut scratch, 10, 5, 14, 0, 0);
1624
1625 let row_off = 5 * 64;
1626 for k in 0..4 {
1627 let want = 0x8000_0000_u32 | k as u32;
1628 assert_eq!(fb[row_off + 10 + k], want, "fb col[{}]", 10 + k);
1629 assert!(zb[row_off + 10 + k].to_bits() != 0, "z[{}]", 10 + k);
1630 // Post-batch uurend = old_u + delta.
1631 assert_eq!(
1632 scratch.uurend[10 + k],
1633 ((k as i32) << 16) + 5,
1634 "uurend[{}]",
1635 10 + k
1636 );
1637 }
1638 }
1639
1640 #[test]
1641 fn vrend_advances_uurend_per_pixel() {
1642 // Verify the uurend[sx] += uurend[sx+half_stride] mutation
1643 // happens once per pixel.
1644 let mut fb = vec![0u32; 64 * 64];
1645 let mut zb = vec![0.0f32; 64 * 64];
1646 let mip_base = [0usize, 0];
1647 let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1648 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1649 let (cs, proj, rs, prelude) = dummy_per_frame();
1650 let ctx = ScanContext {
1651 proj: &proj,
1652 rs: &rs,
1653 prelude: &prelude,
1654 xres: 64,
1655 y_start: 0,
1656 y_end: 64,
1657 anginc: 1,
1658 camera_state: &cs,
1659 camera_gstartz0: 0,
1660 camera_gstartz1: 0,
1661 camera_vptr_offset: 0,
1662 camera_seed_chunk_z: 0,
1663 };
1664 r.frame_setup(&ctx);
1665
1666 let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1667 scratch.radar[0] = CastDat {
1668 col: 0x8033_4455_u32 as i32,
1669 dist: 1024,
1670 };
1671 // angstart[0] = 0 → all rays read radar[0 + iplc].
1672 scratch.angstart[0] = 0;
1673 // Pre-set uurend so ray_idx = uurend[sx] >> 16 = 0 for all
1674 // four columns (we want them to all hit angstart[0]).
1675 let half = scratch.uurend_half_stride;
1676 for sx in 10..14 {
1677 scratch.uurend[sx] = 0;
1678 scratch.uurend[sx + half] = 1; // delta added per pixel
1679 }
1680
1681 r.vrend(&mut scratch, 10, 5, 14, 0, 0);
1682
1683 // Each column's uurend should have advanced by the delta.
1684 for sx in 10..14 {
1685 assert_eq!(scratch.uurend[sx], 1, "uurend[{sx}] not advanced");
1686 }
1687 }
1688}