roxlap_core/scalar_rasterizer.rs
1//! Scalar `Rasterizer` implementation — port of voxlaptest's 4.7.5
2//! scalar `hrendzsse` / `vrendzsse` fallbacks (`voxlap5.c:1947` /
3//! `:2003` post-4.8.4 line numbers). Writes one `u32` ARGB pixel +
4//! one `f32` z-buffer entry per screen position from radar entries
//! the per-ray `gline` below populates (via `grouscan_run`).
6//!
//! Per-pixel math (the SIMD-batched paths in `hrend` / `vrend` below
//! follow the same shape, four pixels at a time):
9//!
10//! ```text
11//! col = scratch.angstart[plc >> 16] + j // signed offset into radar
12//! framebuffer[pixel] = radar[col].col // packed ARGB
13//! z = radar[col].dist / sqrt(dirx² + diry²) // f32 z-buffer entry
14//! dirx += strx; diry += stry; plc += incr
15//! ```
16//!
17//! The vertical scan additionally mutates `scratch.uurend[sx] +=
18//! scratch.uurend[sx + half_stride]` per pixel — that's why the
19//! Rasterizer trait now hands `&mut ScanScratch` to `vrend` /
//! `hrend`. `gline` is no longer a stub — it derives the per-ray
//! frustum and calls `grouscan_run`, which fills the radar slots the
//! scanline passes read back. The unit tests below still pre-fill the
//! radar manually so they exercise the `hrend` / `vrend` end of the
//! pipeline in isolation.
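//!
//! As a concrete instance of the per-pixel math above (illustrative
//! values): with `angstart[plc >> 16] = 120` and `j = 2` the pixel
//! reads `radar[122]`; if that slot holds `dist = 1024` and the ray
//! direction is `(dirx, diry) = (3.0, 4.0)`, the z-buffer entry is
//! `1024 / sqrt(3² + 4²) = 204.8`.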
24
25// Module-wide cast allows: the per-pixel arithmetic constantly
26// crosses signed/unsigned and i32/usize boundaries (loop counters,
27// signed offsets into radar, framebuffer indices). Annotating each
28// site individually buries the per-pixel logic in lint suppressions;
29// the cast-correctness invariants hold by construction.
30#![allow(
31 clippy::cast_sign_loss,
32 clippy::cast_possible_truncation,
33 clippy::cast_possible_wrap,
34 clippy::similar_names
35)]
36
37use std::marker::PhantomData;
38
39use crate::camera_math::CameraState;
40use crate::fixed::ftol;
41use crate::gline::derive_gline_frustum;
42use crate::grouscan::{grouscan_run, CfType, GrouscanInputs, CF_SEED_INDEX};
43use crate::opticast::camera_column_slice;
44use crate::opticast_prelude::{OpticastPrelude, PREC};
45use crate::rasterizer::{Rasterizer, ScanScratch};
46use crate::ray_step::RayStep;
47use crate::scan_loops::ScanContext;
48
49/// Borrowed view of the framebuffer + zbuffer as raw pointers.
50///
51/// R12.2.0 introduces this so the per-frame ScalarRasterizer can be
/// cloned per thread (for the fan-out R12.2.1 lands). Holding `&'a
53/// mut [u32]` / `&'a mut [f32]` directly forces exclusive borrows
54/// per instance, blocking the four quadrants from running on four
55/// threads even though their pixel writes are disjoint.
56///
57/// Constructed safely from exclusive slice borrows — the slices are
58/// consumed and re-exposed as raw pointers tied to lifetime `'a`
59/// via `PhantomData`. Once a `RasterTarget` exists, it is the sole
60/// path to the underlying memory; the slices cannot be used through
61/// any other channel for the duration of `'a`.
62///
63/// `Copy` lets opticast hand each quadrant thread its own copy of
64/// the same target. The four threads write disjoint pixels (top /
65/// bottom / left / right wedges of the screen, no overlap), so
66/// pointer aliasing is safe under the documented invariant.
67///
68/// # Safety contract for parallel use
69/// Callers that copy a `RasterTarget` and pass copies to multiple
70/// threads MUST guarantee that the threads collectively write to
71/// pairwise-disjoint pixel indices. opticast enforces this via the
72/// four-quadrant wedge geometry (see `scan_loops::{top,right,
73/// bottom,left}_quadrant`). Single-threaded callers (R12.1 default)
74/// hold one copy and trivially satisfy the invariant.
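///
/// # Example
///
/// A minimal single-threaded sketch (buffer sizes are illustrative;
/// marked `ignore` because the doctest path depends on the crate's
/// re-exports):
///
/// ```ignore
/// let mut fb = vec![0u32; 640 * 480];
/// let mut zb = vec![0.0f32; 640 * 480];
/// let target = RasterTarget::new(&mut fb, &mut zb);
/// // SAFETY: one copy, idx < fb_len() — the disjoint-write
/// // invariant holds trivially.
/// unsafe {
///     target.write_color(0, 0xFF00_00FF);
///     target.write_depth(0, 1.0);
/// }
/// ```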
75#[derive(Clone, Copy, Debug)]
76pub struct RasterTarget<'a> {
77 fb_ptr: *mut u32,
78 fb_len: usize,
79 zb_ptr: *mut f32,
80 zb_len: usize,
81 _marker: PhantomData<&'a mut [u32]>,
82}
83
84// SAFETY: `RasterTarget` is morally a borrowed mutable slice pair —
85// the same shape `&'a mut [u32]` / `&'a mut [f32]` would have, both of
86// which are `Send` when `T: Send`. Multi-thread safety is enforced
87// by the wedge / strip-disjoint write invariant (see struct doc).
88unsafe impl Send for RasterTarget<'_> {}
89
90// SAFETY: sharing `&RasterTarget` across threads exposes only the
91// raw pointers + lengths. Reading a pointer field is itself free of
92// data races; concurrent writes through the pointer are gated by
93// the disjoint-write invariant the caller upholds. Required so
94// `ScalarRasterizer: Sync`, which `rayon::par_iter_mut` needs to
95// share `&rasterizer` across the strip-parallel closures (R12.3.1).
96unsafe impl Sync for RasterTarget<'_> {}
97
98impl<'a> RasterTarget<'a> {
    /// Build a target from exclusive slice borrows. Consuming the
    /// `&'a mut` borrows is the load-bearing part: this constructor
    /// is the only safe-code way to mint a `RasterTarget`, so for
    /// the duration of `'a` no other safe path to the memory exists.
103 #[must_use]
104 pub fn new(framebuffer: &'a mut [u32], zbuffer: &'a mut [f32]) -> Self {
105 Self {
106 fb_ptr: framebuffer.as_mut_ptr(),
107 fb_len: framebuffer.len(),
108 zb_ptr: zbuffer.as_mut_ptr(),
109 zb_len: zbuffer.len(),
110 _marker: PhantomData,
111 }
112 }
113
114 /// Framebuffer length in `u32` elements.
115 #[must_use]
116 pub fn fb_len(self) -> usize {
117 self.fb_len
118 }
119
120 /// Raw mutable framebuffer pointer. Used by SSE blocks that do
121 /// their own arithmetic + bounds reasoning.
122 ///
123 /// # Safety
124 /// Callers must respect `fb_len` and the parallel-use invariant.
125 #[must_use]
126 pub fn fb_ptr(self) -> *mut u32 {
127 self.fb_ptr
128 }
129
130 /// Raw mutable zbuffer pointer. Same contract as
131 /// [`Self::fb_ptr`].
132 #[must_use]
133 pub fn zb_ptr(self) -> *mut f32 {
134 self.zb_ptr
135 }
136
137 /// Write one ARGB pixel.
138 ///
139 /// # Safety
140 /// `idx < self.fb_len()`, plus the parallel-use invariant.
141 pub unsafe fn write_color(self, idx: usize, color: u32) {
142 debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
143 // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
144 unsafe { self.fb_ptr.add(idx).write(color) };
145 }
146
147 /// Write one z-buffer entry.
148 ///
149 /// # Safety
    /// `idx` must be within the zbuffer length (which matches
    /// `fb_len()`), plus the
151 /// parallel-use invariant.
152 pub unsafe fn write_depth(self, idx: usize, z: f32) {
153 debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
154 // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
155 unsafe { self.zb_ptr.add(idx).write(z) };
156 }
157}
158
159// gcsub now lives on `ScanScratch::gcsub`; the host pokes it via
160// `ScanScratch::set_side_shades` per frame (mirrors voxlap's
161// `setsideshades` global). The default-state pattern
162// (`0x00ff00ff00ff00ff` per entry, == `setsideshades(0,…,0)`) is
163// what `ScanScratch::new_for_size` initialises to, so the oracle
164// stays bit-exact when no shading is configured.
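//
// Lane layout as consumed in this file (a sketch of what gline's
// `sideshademode` block below uses; the remaining lanes are not read
// here):
//
//   gcsub[0], gcsub[1]  — wall-side lanes that `wall_lane` reads
//   gcsub[4], gcsub[5]  — left / right directional shades
//   gcsub[6], gcsub[7]  — up / down directional shades
//
// Per ray, gline copies lane 4-or-5 into 0 and lane 6-or-7 into 1,
// according to the sign of `gixy`.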
165
166/// Per-channel fog blend — voxlap5.c:2052-2056 (and the matching
167/// hrend / vrend scalar tail). `col` is the source ARGB voxel
168/// colour; `dist` is the radar slot's depth (PREC-scaled, so
169/// `>> 20` gives the integer cell distance index into `foglut`).
170/// `foglut` empty short-circuits to "no fog" (returns `col`
171/// unchanged); OOB indices saturate to 32767 (full fog) since
172/// `set_fog` pads the table that way.
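///
/// Worked single-channel example (illustrative values): with
/// `col & 255 = 64`, `fog_col & 255 = 255` and `l = 16384` (half
/// fog), the red term is `((255 - 64) * 16384) >> 15 = 95`, so the
/// blended red channel is `64 + 95 = 159` — roughly halfway to the
/// fog colour, as expected.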
173//
174// The C form is one branchless expression per channel; we keep
175// it as-is for legibility against the source. `as i32` casts
176// match the C int32 arithmetic.
177#[allow(
178 clippy::cast_sign_loss,
179 clippy::cast_possible_wrap,
180 clippy::many_single_char_names
181)]
182fn fog_blend(col: i32, dist: i32, foglut: &[i32], fog_col: i32) -> i32 {
183 if foglut.is_empty() {
184 return col;
185 }
186 let idx = (dist >> 20) as usize;
187 let l = foglut.get(idx).copied().unwrap_or(32767) & 32767;
188 let k = col;
189 let fc = fog_col;
190 let r = (((fc & 255) - (k & 255)) * l) >> 15;
191 let g = (((((fc >> 8) & 255) - ((k >> 8) & 255)) * l) >> 15) << 8;
192 let b = (((((fc >> 16) & 255) - ((k >> 16) & 255)) * l) >> 15) << 16;
193 r + g + b + k
194}
195
196/// Per-ray sky-row update. Mirror of voxlap5.c:1236-1255.
197///
198/// On the first ray of a quadrant (`scratch.sky_cur_lng < 0`),
199/// initialise from the ray's `atan2(vy1, vx1)` mapped through
200/// `sky.lng_mul`. On subsequent rays, walk `sky.lng[]` forward
201/// (when `sky_cur_dir < 0`) or backward (`sky_cur_dir >= 0`)
202/// until the cross-product flips sign — voxlap's rotating-cursor
203/// trick that avoids re-running atan2 per ray. After the search,
204/// stamp `scratch.sky_off = sky_cur_lng * sky.bpl`, which
205/// `phase_startsky_textured` divides by 4 to land at the row's
206/// pixel-base index.
207fn sky_per_ray_update(
208 scratch: &mut crate::rasterizer::ScanScratch,
209 sky: &crate::sky::Sky,
210 vx1: f32,
211 vy1: f32,
212) {
213 let ysiz = sky.ysiz;
214 if scratch.sky_cur_lng < 0 {
215 // First-ray init — atan2 mapped to row index.
216 let ang = vy1.atan2(vx1) + std::f32::consts::PI;
217 let raw = ang * sky.lng_mul - 0.5;
218 let mut lng = ftol(raw);
219 // Voxlap's `(uint32_t)skycurlng >= skyysiz` clamp uses an
220 // uninitialised `j` for the corrective shift; we substitute
221 // `rem_euclid` for a deterministic in-range value. The
222 // rotating-cursor walk in subsequent rays will quickly
223 // converge whichever way we land.
224 if (lng as u32) >= (ysiz as u32) {
225 lng = lng.rem_euclid(ysiz);
226 }
227 scratch.sky_cur_lng = lng;
228 } else if scratch.sky_cur_dir < 0 {
229 // Walk forward (rotating).
230 let mut j = scratch.sky_cur_lng + 1;
231 if j >= ysiz {
232 j = 0;
233 }
234 loop {
235 let l = sky.lng[j as usize];
236 if l[0] * vy1 <= l[1] * vx1 {
237 break;
238 }
239 scratch.sky_cur_lng = j;
240 j += 1;
241 if j >= ysiz {
242 j = 0;
243 }
244 }
245 } else {
246 // Walk backward (rotating).
247 loop {
248 let l = sky.lng[scratch.sky_cur_lng as usize];
249 if l[0] * vy1 >= l[1] * vx1 {
250 break;
251 }
252 scratch.sky_cur_lng -= 1;
253 if scratch.sky_cur_lng < 0 {
254 scratch.sky_cur_lng = ysiz - 1;
255 }
256 }
257 }
258 // Voxlap: `skyoff = skycurlng * skybpl + nskypic`. We strip
259 // the `+ nskypic` (texture base address) — `phase_startsky`
260 // adds it implicitly by indexing `sky.pixels` directly.
261 scratch.sky_off = scratch.sky_cur_lng * sky.bpl;
262}
263
264/// Per-frame state cached on first `frame_setup` call. Owned here
265/// (vs. borrowed from `ScanContext`) because gline needs to read
266/// it across many calls without re-borrowing each time. The
267/// `prelude` clone copies one `Vec<i32>` (the `y_lookup` mip
268/// table) per frame — cheap.
269//
270// `Clone` is used by [`ScalarRasterizer`]'s 4-way fan-out (R12.2.1)
271// to mint one rasterizer per quadrant thread; each clone copies the
272// (~few KB) y_lookup Vec — small per-frame allocation cost.
273#[derive(Clone)]
274struct FrameCache {
275 ray_step: RayStep,
276 camera_state: CameraState,
277 prelude: OpticastPrelude,
278 gstartz0: i32,
279 gstartz1: i32,
280 /// Voxlap's `v - *ixy_sptr_col` — byte offset within the
281 /// camera's column to the slab whose top bounds the air gap
282 /// from below. `0` ⇒ column-top.
283 vptr_offset: usize,
284}
285
286/// Scalar rasterizer that writes pixels and a z-buffer entry per
287/// screen position.
288///
289/// Borrows the framebuffer + zbuffer for the duration of one
290/// `opticast` call; SDL hosts allocate these once and reuse across
291/// frames, see `roxlap-host`.
292//
// `slab_buf` / `column_offsets` / `vsid` started as R4.3a-rewire-2
// scaffolding; the real `gline` (R4.3a-rewire-3) now reads them to
// call `grouscan_run` per ray, so the `dead_code` allow below is
// largely historical.
297#[allow(dead_code)]
298#[derive(Clone)]
299pub struct ScalarRasterizer<'a> {
300 /// Framebuffer + zbuffer raw-pointer view. Stripped from the
301 /// caller's `&mut [u32]` / `&mut [f32]` borrows at construction
302 /// (see [`Self::new`]) so the rasterizer can be `Copy` for the
303 /// per-thread quadrant fan-out R12.2.1 lands. Single-threaded
304 /// path holds one copy, parallel path will hold four (one per
305 /// quadrant — wedge-disjoint pixel writes; see
306 /// [`RasterTarget`]'s safety contract).
307 target: RasterTarget<'a>,
308 /// Row stride in `u32` / `f32` elements (== framebuffer width
309 /// for tightly-packed buffers; SDL streaming textures may add
310 /// trailing padding).
311 pitch_pixels: usize,
312 /// World-level flat slab buffer (voxlap's malloc'd column
313 /// data). Re-borrowed from opticast's caller for the lifetime
314 /// of the rasterizer.
315 slab_buf: &'a [u8],
316 /// Per-column byte offsets into [`Self::slab_buf`], concatenated
317 /// across all built mip levels. The mip-0 sub-table prefix
318 /// (`vsid² + 1` entries) is what existing single-mip callers
319 /// pass; multi-mip callers pass the full concatenation and
320 /// declare boundaries via [`Self::mip_base_offsets`].
321 column_offsets: &'a [u32],
322 /// Per-mip column-offset sub-table base indices. Length
323 /// `mip_count + 1`; trailing sentinel equals
324 /// `column_offsets.len()`. Single-mip callers pass
325 /// `&[0, vsid² + 1]`. R4.5d's `phase_remiporend` indexes
326 /// this to land in mip-N+1's sub-table.
327 mip_base_offsets: &'a [usize],
328 /// World dimension. Combined with the prelude's `column_index`
329 /// and the column-step path in grouscan, this is what lets the
330 /// real gline walk the per-ray voxel-column traversal.
331 vsid: u32,
332 /// Optional sky texture borrow. `None` ⇒ `phase_startsky`
333 /// solid-fills with `scratch.skycast`. `Some(_)` ⇒ gline's
334 /// per-ray frustum prep updates `scratch.sky_off`, and
335 /// `phase_startsky` runs the textured search-and-sample loop.
336 /// Set via [`Self::with_sky`] after construction; unset ⇒
337 /// engine's existing solid-sky behaviour, byte-stable for the
338 /// oracle.
339 sky: Option<&'a crate::sky::Sky>,
340 /// Per-frame state cache. `None` until the first `frame_setup`
341 /// call; gline panics if invoked before that.
342 frame: Option<FrameCache>,
343}
344
345// R12.2.1 / R12.3.1: opticast's parallel branches fan the rasterizer
346// across rayon-managed threads — each thread owns its own clone. The
347// clones share `target: RasterTarget` (raw pointers; safe under the
348// strip-disjoint pixel-write invariant documented on RasterTarget),
349// hold &-refs into the slab/column data (Sync), and have independent
// FrameCache copies. The compile-time checks below fail to build if
// any field becomes non-Send / non-Sync, so the parallel path cannot
// silently regress.
352const _: fn() = || {
353 fn assert_send<T: Send>() {}
354 fn assert_sync<T: Sync>() {}
355 assert_send::<ScalarRasterizer<'_>>();
356 assert_sync::<ScalarRasterizer<'_>>();
357};
358
359impl<'a> ScalarRasterizer<'a> {
360 /// Create a rasterizer that will write into the supplied
361 /// framebuffer + zbuffer pair. `pitch_pixels` must satisfy
362 /// `pitch_pixels * height ≤ framebuffer.len()` for the height
363 /// the engine renders into; the `frame_setup` hook does not
364 /// validate sizes (it has no height to check against).
365 ///
366 /// `slab_buf` / `column_offsets` / `mip_base_offsets` / `vsid`
367 /// describe the world the renderer reads from. Pass the matching
368 /// fields from a [`roxlap_formats::vxl::Vxl`] (or, for tests,
369 /// `&[0, vsid² + 1]` as the single-mip placeholder).
370 ///
371 /// `ray_step` is initialised to a zero placeholder; the real
372 /// values get stamped on the first [`Rasterizer::frame_setup`]
373 /// call before any per-pixel work runs.
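    ///
    /// A minimal construction sketch (the sizes and empty world
    /// slices mirror this module's unit tests; real hosts pass the
    /// matching `Vxl` fields instead; marked `ignore` because the
    /// doctest path depends on the crate's re-exports):
    ///
    /// ```ignore
    /// let mut fb = vec![0u32; 64 * 64];
    /// let mut zb = vec![0.0f32; 64 * 64];
    /// let mip_base_offsets = [0usize, 0];
    /// let rasterizer = ScalarRasterizer::new(
    ///     &mut fb, &mut zb, 64,
    ///     &[], &[], &mip_base_offsets, 64,
    /// );
    /// // Optionally bind a sky texture:
    /// // let rasterizer = rasterizer.with_sky(&sky);
    /// ```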
374 #[must_use]
375 pub fn new(
376 framebuffer: &'a mut [u32],
377 zbuffer: &'a mut [f32],
378 pitch_pixels: usize,
379 slab_buf: &'a [u8],
380 column_offsets: &'a [u32],
381 mip_base_offsets: &'a [usize],
382 vsid: u32,
383 ) -> Self {
384 Self {
385 target: RasterTarget::new(framebuffer, zbuffer),
386 pitch_pixels,
387 slab_buf,
388 column_offsets,
389 mip_base_offsets,
390 vsid,
391 sky: None,
392 frame: None,
393 }
394 }
395
396 /// Bind a sky texture for the lifetime of this rasterizer
397 /// instance. Hosts call this when [`crate::Engine::sky`] is
398 /// `Some(_)`. Without it, the rasterizer keeps the legacy
399 /// solid-fill `skycast` behaviour.
400 #[must_use]
401 pub fn with_sky(mut self, sky: &'a crate::sky::Sky) -> Self {
402 self.sky = Some(sky);
403 self
404 }
405}
406
407impl Rasterizer for ScalarRasterizer<'_> {
408 fn frame_setup(&mut self, ctx: &ScanContext<'_>) {
409 // Cache everything per-frame so gline doesn't re-borrow on
410 // every call. Prelude is cloned (one Vec<i32> alloc per
411 // frame for y_lookup; small).
412 self.frame = Some(FrameCache {
413 ray_step: *ctx.rs,
414 camera_state: *ctx.camera_state,
415 prelude: ctx.prelude.clone(),
416 gstartz0: ctx.camera_gstartz0,
417 gstartz1: ctx.camera_gstartz1,
418 vptr_offset: ctx.camera_vptr_offset,
419 });
420 }
421
422 #[allow(clippy::too_many_lines)]
423 fn gline(
424 &mut self,
425 scratch: &mut ScanScratch,
426 length: u32,
427 x0: f32,
428 y0: f32,
429 x1: f32,
430 y1: f32,
431 ) {
432 // Voxlap's per-scanline ray-cast: derive the frustum, seed
433 // cf[128], stamp scratch globals, call grouscan. Mirror of
434 // voxlap5.c:gline (1146..1235).
435 let cache = self
436 .frame
437 .as_ref()
438 .expect("gline called before frame_setup");
439 let leng = length as i32;
440
441 // 1. Project per-ray frustum (vd0/vd1/vz0/vx1/vy1/vz1 +
442 // gixy/gpz/gdz). voxlap5.c:1153-1175.
443 let f = derive_gline_frustum(
444 &cache.camera_state,
445 &cache.prelude,
446 self.vsid,
447 length,
448 x0,
449 y0,
450 x1,
451 y1,
452 );
453
454 // 2. Stamp ray-step globals onto scratch.
455 scratch.gixy = f.gixy;
456 scratch.gpz = f.gpz;
457 scratch.gdz = f.gdz;
458
459 // 3. cmprecip[leng] = CMPPREC / leng (voxlap precomputed
460 // table; voxlap5.c:12315 builds it as `CMPPREC/(float)i`).
461 // CMPPREC = 256*4096 = PREC. gi0 / gi1 are per-pixel ray-
462 // step coefficients in Q12.20 (= PREC); cx0/cy0/cx1/cy1
463 // are the cf[128] seed endpoints. voxlap5.c:1179-1190.
464 // The `as f32` casts here lose precision for very large leng
465 // (> 2²³), but realistic scanline lengths (a few thousand)
466 // are well below that.
467 #[allow(clippy::cast_precision_loss)]
468 let cmpprec = PREC as f32;
469 #[allow(clippy::cast_precision_loss)]
470 let cmprecip = if leng > 0 {
471 cmpprec / (leng as f32)
472 } else {
473 0.0
474 };
475 // ftol() routes float→i32 through i64 to mirror voxlap C's
476 // wrap-on-overflow `lrintf+(int32_t)cast`. The cf-seed
477 // products (vd ± vd) * cmprecip and vd * cmpprec land at
478 // the i32 boundary for world-coord magnitudes near VSID
479 // (= 2048) × PREC (= 2²⁰); Rust's `as i32` saturates and
480 // diverges for those edge cases.
481 let (gi0, gi1, cx0, cy0) = if cache.prelude.forward_z_sign < 0 {
482 (
483 ftol((f.vd1 - f.vd0) * cmprecip),
484 ftol((f.vz1 - f.vz0) * cmprecip),
485 ftol(f.vd0 * cmpprec),
486 ftol(f.vz0 * cmpprec),
487 )
488 } else {
489 (
490 ftol((f.vd0 - f.vd1) * cmprecip),
491 ftol((f.vz0 - f.vz1) * cmprecip),
492 ftol(f.vd1 * cmpprec),
493 ftol(f.vz1 * cmpprec),
494 )
495 };
496 let cx1 = leng.wrapping_mul(gi0).wrapping_add(cx0);
497 let cy1 = leng.wrapping_mul(gi1).wrapping_add(cy0);
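        // Worked example (illustrative values, forward_z_sign < 0):
        // leng = 4, vd0 = 1.0, vd1 = 3.0 → cmprecip = PREC / 4 =
        // 262_144, gi0 = ftol(2.0 * 262_144) = 524_288 (0.5 in
        // Q12.20), cx0 = ftol(1.0 * PREC) = 1_048_576, and
        // cx1 = 4 * 524_288 + 1_048_576 = 3_145_728 = vd1 * PREC —
        // the seed endpoint lands exactly on the ray's far value.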
498
499 scratch.gi0 = gi0;
500 scratch.gi1 = gi1;
501
502 // 4. Seed cf[128] with the radar range + air-gap z-bounds +
503 // Q12.20 ray endpoints. voxlap5.c:1176-1190.
504 let gscanptr_isize = scratch.gscanptr as isize;
505 scratch.cf[CF_SEED_INDEX] = CfType {
506 i0: gscanptr_isize,
507 i1: gscanptr_isize + leng as isize,
508 z0: cache.gstartz0,
509 z1: cache.gstartz1,
510 cx0,
511 cy0,
512 cx1,
513 cy1,
514 };
515
516 // 5. gxmax = min(gmaxscandist, frustum-edge clip per axis).
517 // voxlap5.c:1192-1228. Unsigned compare — voxlap's `q`
518 // is a uint64_t product that may exceed gmaxscandist or
519 // wrap negative.
520 //
521 // Also stamps `skycast.dist` per voxlap5.c:1209-1227:
522 // initialised to `gxmax` (the scan-distance ceiling),
523 // overwritten with `0x7FFFFFFF` if either frustum-edge
524 // clip fires (= ray hits world edge before scan-dist
525 // cap → "infinitely far" sky depth). startsky's solid-
526 // fill writes this into every drained radar slot's
527 // `dist`, which the z-buffer ends up carrying.
528 let mut gxmax = cache.prelude.max_scan_dist;
529 scratch.skycast.dist = gxmax;
530 let li_pos = cache.prelude.li_pos;
531 let vsid_signed = self.vsid as i32;
532 let j0 = if f.gixy[0] < 0 {
533 li_pos[0]
534 } else {
535 vsid_signed - 1 - li_pos[0]
536 };
537 let q0 = (i64::from(f.gdz[0]).wrapping_mul(i64::from(j0)))
538 .wrapping_add(i64::from(f.gpz[0] as u32));
539 if (q0 as u64) < u64::from(gxmax as u32) {
540 gxmax = q0 as i32;
541 scratch.skycast.dist = i32::MAX;
542 }
543 let j1 = if f.gixy[1] < 0 {
544 li_pos[1]
545 } else {
546 vsid_signed - 1 - li_pos[1]
547 };
548 let q1 = (i64::from(f.gdz[1]).wrapping_mul(i64::from(j1)))
549 .wrapping_add(i64::from(f.gpz[1] as u32));
550 if (q1 as u64) < u64::from(gxmax as u32) {
551 gxmax = q1 as i32;
552 scratch.skycast.dist = i32::MAX;
553 }
554 scratch.gxmax = gxmax;
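        // Illustrative reading of the clip above: with gixy[0] < 0 and
        // li_pos[0] = 5 the ray crosses 5 more cell boundaries before
        // leaving the map along x, so q0 = gpz[0] + 5 * gdz[0] is the
        // depth at the world edge; when that is closer than
        // max_scan_dist, the scan stops there and the sky is treated
        // as infinitely far.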
555
556 // 5b. Per-ray sky-row search. Mirror of voxlap5.c:1236-
557 // 1255. Walks `sky.lng[]` to find the texel-row whose
558 // longitude vector matches the ray's `(vx1, vy1)`
559 // direction; stamps `scratch.sky_off` so
560 // `phase_startsky` knows which row to sample. No-op
561 // when no sky texture is bound.
562 if let Some(sky) = self.sky {
563 sky_per_ray_update(scratch, sky, f.vx1, f.vy1);
564 }
565
566 // 6. Build inputs and call grouscan_run. The starting
567 // column is the camera's column (column_index from the
568 // prelude); the slab walker handles the rest.
569 let column = camera_column_slice(
570 self.slab_buf,
571 self.column_offsets,
572 cache.prelude.column_index,
573 )
574 .unwrap_or(&[]);
575 // Copy gcsub out of scratch so the GrouscanInputs immutable
576 // borrow doesn't collide with the `&mut scratch` grouscan_run
577 // takes below. `[i64; 9]` is 72 bytes — cheap.
578 let mut gcsub_local: [i64; 9] = scratch.gcsub;
579 // Voxlap5.c:1230-1234. Per-ray, populate the wall-side lanes
580 // (0/1) from the directional lanes (4/5 = left/right,
581 // 6/7 = up/down) according to the sign of `gixy`. Without
582 // this, `wall_lane` reads from the stale `0x00ff_00ff_00ff_00ff`
583 // baseline and wall faces get no directional darkening, even
584 // after the host calls `set_side_shades`.
585 if scratch.sideshademode {
586 let lane0_idx = if f.gixy[0] < 0 { 4 } else { 5 };
587 let lane1_idx = if f.gixy[1] < 0 { 6 } else { 7 };
588 gcsub_local[0] = gcsub_local[lane0_idx];
589 gcsub_local[1] = gcsub_local[lane1_idx];
590 }
591 let inputs = GrouscanInputs {
592 column,
593 gylookup: &cache.prelude.y_lookup,
594 gcsub: &gcsub_local,
595 slab_buf: self.slab_buf,
596 column_offsets: self.column_offsets,
597 mip_base_offsets: self.mip_base_offsets,
598 vsid: self.vsid,
599 sky: self.sky.map(crate::grouscan::SkyRef::from_sky),
600 };
601 // gmipnum = number of built mip levels. R4.5d's
602 // `phase_remiporend` body will start incrementing
603 // `state.gmipcnt` once gmipnum > 1 and the column step's
604 // `gpz > ngxmax` overflow fires; until then a multi-mip
605 // world simply renders mip-0 only, byte-stable with the
606 // single-mip path.
607 let gmipnum = u32::try_from(self.mip_base_offsets.len().saturating_sub(1))
608 .expect("mip count fits in u32");
609 let _ = grouscan_run(
610 scratch,
611 &inputs,
612 cache.vptr_offset,
613 cache.prelude.column_index as usize,
614 cache.prelude.x_mip,
615 gmipnum.max(1),
616 );
617
618 // gscanptr is advanced by the opticast quadrant scan
619 // (`scan_loops.rs::top_quadrant` etc., voxlap5.c:2382 area)
620 // AFTER each gline call. Voxlap's `gline` itself does NOT
621 // touch gscanptr — advancing it here too created gaps of
622 // `leng+1` unwritten radar slots between consecutive glines,
623 // which read back as 0 in hrend → black pixels at the
624 // sphere position in diag_down / high_down.
625 }
626
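    // Horizontal scan: fills pixels x = sx..p1 on row sy. `plc` is a
    // Q16.16 ray index (advanced by `incr` per pixel); `j` is the
    // fixed offset into each ray's radar column for this scanline.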
627 fn hrend(
628 &mut self,
629 scratch: &mut ScanScratch,
630 sx: i32,
631 sy: i32,
632 p1: i32,
633 plc: i32,
634 incr: i32,
635 j: i32,
636 ) {
637 let rs = self
638 .frame
639 .as_ref()
640 .map(|f| f.ray_step)
641 .expect("hrend/vrend called before frame_setup");
642 // Per-frame setup gives strx/stry/heix/heiy/addx/addy; per-
643 // pixel direction = strx*sx + heix*sy + addx, advancing by
644 // strx in the inner loop.
645 #[allow(clippy::cast_precision_loss)]
646 let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
647 #[allow(clippy::cast_precision_loss)]
648 let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
649 let row_start = sy as usize * self.pitch_pixels;
650
651 let mut plc_local = plc;
652 let mut x = sx;
653
654 // R5.1: SSE2 4-pixel batch via `_mm_rsqrt_ps` — port of
655 // voxlaptest's `hrendzsse` (voxlap5.c:1947). 12-bit
656 // approximation, no Newton refine, matching the
657 // historical asm. The tail (0..3 leftover pixels)
658 // continues with the bit-exact scalar form below; the
659 // batch's z lanes will not match scalar 1/sqrt exactly,
660 // mirroring voxlap. SSE2 is x86_64 baseline so no
661 // runtime CPU-feature check is needed.
662 //
663 // `cast_ptr_alignment` is suppressed because we use
664 // `_mm_storeu_si128` / `_mm_storeu_ps` — the `u`-suffix
665 // variants explicitly support unaligned addresses, so a
666 // u32 pointer cast to `*mut __m128i` is sound.
667 #[cfg(target_arch = "x86_64")]
668 #[allow(clippy::cast_ptr_alignment)]
669 unsafe {
670 use core::arch::x86_64::{
671 __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
672 _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
673 };
674 let strx = rs.strx;
675 let stry = rs.stry;
676 let vstrx4 = _mm_set1_ps(strx * 4.0);
677 let vstry4 = _mm_set1_ps(stry * 4.0);
678 let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
679 let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
680 while p1 - x >= 4 {
681 // Gather 4 castdat hits — one per ray index.
682 let mut col = [0i32; 4];
683 let mut dst = [0i32; 4];
684 for k in 0..4 {
685 let ray_idx = (plc_local >> 16) as usize;
686 let cd_offset = scratch.angstart[ray_idx] + j as isize;
687 let cd = scratch.radar[cd_offset as usize];
688 col[k] = cd.col;
689 dst[k] = cd.dist;
690 plc_local = plc_local.wrapping_add(incr);
691 }
692 // R5.2: per-pixel fog blend (voxlap's `hrendzfogsse`).
693 // No-op when foglut is empty. Voxlap's MMX path used
694 // pmulhw with foglut as 4 packed int16 lanes; we
695 // mirror the scalar fallback the goldens use, which
696 // applies a single `l = foglut[..] & 32767` factor
697 // per pixel (one `l` per ray, all 3 channels).
698 if !scratch.foglut.is_empty() {
699 for k in 0..4 {
700 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
701 }
702 }
703 let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
704 let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
705 let vdst = _mm_cvtepi32_ps(vdsi);
706 let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
707 let vinv = _mm_rsqrt_ps(vsqr);
708 let vz = _mm_mul_ps(vdst, vinv);
709
710 let pixel_idx = row_start + x as usize;
711 _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
712 _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
713
714 vdx = _mm_add_ps(vdx, vstrx4);
715 vdy = _mm_add_ps(vdy, vstry4);
716 x += 4;
717 }
718 // Bring scalar dirx/diry up to where the batch left
719 // off — first lane of the post-step vdx/vdy.
720 dirx = _mm_cvtss_f32(vdx);
721 diry = _mm_cvtss_f32(vdy);
722 }
723
724 // R9: NEON 4-pixel batch — aarch64 equivalent of the SSE2
725 // path above. Uses `vrsqrteq_f32` + one Newton–Raphson step
726 // via `vrsqrtsq_f32` for ~16-bit precision (vs SSE2's ~12-bit
727 // without Newton). NEON is baseline on all AArch64 — no
728 // runtime feature check needed. Stores are naturally unaligned.
729 #[cfg(target_arch = "aarch64")]
730 unsafe {
731 use core::arch::aarch64::{
732 float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
733 vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
734 vst1q_u32,
735 };
736 let strx = rs.strx;
737 let stry = rs.stry;
738 let vstrx4 = vdupq_n_f32(strx * 4.0);
739 let vstry4 = vdupq_n_f32(stry * 4.0);
740 let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
741 let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
742 let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
743 let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
744 while p1 - x >= 4 {
745 // Scalar gather — same as SSE2 path.
746 let mut col = [0i32; 4];
747 let mut dst = [0i32; 4];
748 for k in 0..4 {
749 let ray_idx = (plc_local >> 16) as usize;
750 let cd_offset = scratch.angstart[ray_idx] + j as isize;
751 let cd = scratch.radar[cd_offset as usize];
752 col[k] = cd.col;
753 dst[k] = cd.dist;
754 plc_local = plc_local.wrapping_add(incr);
755 }
756 if !scratch.foglut.is_empty() {
757 for k in 0..4 {
758 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
759 }
760 }
761 let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
762 let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
763 let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
764 // One Newton–Raphson step: est * vrsqrts(x * est, est).
765 let est = vrsqrteq_f32(vsqr);
766 let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
767 let vz = vmulq_f32(vdst, vinv);
768
769 let pixel_idx = row_start + x as usize;
770 vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
771 vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
772
773 vdx = vaddq_f32(vdx, vstrx4);
774 vdy = vaddq_f32(vdy, vstry4);
775 x += 4;
776 }
777 dirx = vgetq_lane_f32(vdx, 0);
778 diry = vgetq_lane_f32(vdy, 0);
779 }
780
781 // R10.3: wasm SIMD 4-pixel batch — equivalent of the SSE2
782 // / NEON paths above. Uses `1.0 / sqrt(x)` (full-precision
783 // `f32x4_sqrt` + `f32x4_div`) where SSE2 had `_mm_rsqrt_ps`
784 // and NEON had `vrsqrteq_f32`+Newton, since wasm SIMD has
785 // no rsqrt approximation. Wasm bytes therefore differ
786 // from both x86 and aarch64 goldens — captured by R10.4's
787 // separate `wasm-hashes.txt`.
788 #[cfg(target_arch = "wasm32")]
789 unsafe {
790 use core::arch::wasm32::{
791 f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
792 f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
793 };
794 let strx = rs.strx;
795 let stry = rs.stry;
796 let vstrx4 = f32x4_splat(strx * 4.0);
797 let vstry4 = f32x4_splat(stry * 4.0);
798 let one = f32x4_splat(1.0);
799 let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
800 let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
801 while p1 - x >= 4 {
802 // Scalar gather — same shape as SSE2 / NEON paths.
803 let mut col = [0i32; 4];
804 let mut dst = [0i32; 4];
805 for k in 0..4 {
806 let ray_idx = (plc_local >> 16) as usize;
807 let cd_offset = scratch.angstart[ray_idx] + j as isize;
808 let cd = scratch.radar[cd_offset as usize];
809 col[k] = cd.col;
810 dst[k] = cd.dist;
811 plc_local = plc_local.wrapping_add(incr);
812 }
813 if !scratch.foglut.is_empty() {
814 for k in 0..4 {
815 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
816 }
817 }
818 let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
819 let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
820 let vdst = f32x4_convert_i32x4(vdsi);
821 let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
822 let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
823 let vz = f32x4_mul(vdst, vinv);
824
825 let pixel_idx = row_start + x as usize;
826 v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
827 v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
828
829 vdx = f32x4_add(vdx, vstrx4);
830 vdy = f32x4_add(vdy, vstry4);
831 x += 4;
832 }
833 dirx = f32x4_extract_lane::<0>(vdx);
834 diry = f32x4_extract_lane::<0>(vdy);
835 }
836
837 // Scalar tail — handles 0..3 leftover pixels on x86_64 /
838 // aarch64 / wasm32 and the full body on other targets.
839 while x < p1 {
840 // ray index = signed shift right (voxlap's `plc >> 16`).
841 let ray_idx = (plc_local >> 16) as usize;
842 let cd_offset = scratch.angstart[ray_idx] + j as isize;
843 let cd = scratch.radar[cd_offset as usize];
844 let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
845
846 let pixel_idx = row_start + x as usize;
847 #[allow(clippy::cast_precision_loss)]
848 let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
849 // SAFETY: pixel_idx = sy*pitch + x, with sy < yres and x < p1
850 // ≤ xres (loop guard); p1 ≤ ctx.xres in scan_loops::top_quadrant /
851 // bottom_quadrant. fb / zb were allocated at pitch*height by the
852 // caller (asserted in Engine::render's preamble); pixel_idx is
853 // therefore in-range. Wedge-disjoint invariant: top + bottom
854 // quadrants own disjoint sy ranges.
855 unsafe {
856 self.target.write_color(pixel_idx, col as u32);
857 self.target.write_depth(pixel_idx, z);
858 }
859
860 dirx += rs.strx;
861 diry += rs.stry;
862 plc_local = plc_local.wrapping_add(incr);
863 x += 1;
864 }
865 }
866
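    // Vertical scan: fills pixels x = sx..p1 on row sy, but each
    // column's Q16.16 ray index lives in `uurend[x]` (advanced by
    // `uurend[x + half_stride]` once per pixel); `iplc` is the radar
    // column offset, advanced by `iinc` per pixel.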
867 fn vrend(
868 &mut self,
869 scratch: &mut ScanScratch,
870 sx: i32,
871 sy: i32,
872 p1: i32,
873 iplc: i32,
874 iinc: i32,
875 ) {
876 let rs = self
877 .frame
878 .as_ref()
879 .map(|f| f.ray_step)
880 .expect("hrend/vrend called before frame_setup");
881 #[allow(clippy::cast_precision_loss)]
882 let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
883 #[allow(clippy::cast_precision_loss)]
884 let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
885 let row_start = sy as usize * self.pitch_pixels;
886 let half_stride = scratch.uurend_half_stride;
887
888 let mut iplc_local = iplc;
889 let mut x = sx;
890
891 // R5.3: SSE2 4-pixel batch — port of voxlaptest's
892 // `vrendzsse` (voxlap5.c:2083). The per-column
893 // `uurend[sx] += uurend[sx + half_stride]` update is
894 // parallel-safe: uurend[sx + half_stride..] is read-only
895 // here, and uurend[sx..+3] are four distinct lanes.
896 // Read OLD u/d values, do the SSE z math, then write
897 // back four NEW u values. Plus fog blend (R5.2-style)
898 // when foglut is non-empty.
899 #[cfg(target_arch = "x86_64")]
900 #[allow(clippy::cast_ptr_alignment)]
901 unsafe {
902 use core::arch::x86_64::{
903 __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
904 _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
905 };
906 let strx = rs.strx;
907 let stry = rs.stry;
908 let vstrx4 = _mm_set1_ps(strx * 4.0);
909 let vstry4 = _mm_set1_ps(stry * 4.0);
910 let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
911 let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
912 while p1 - x >= 4 {
913 let xu = x as usize;
914 // Read 4 OLD uurend pairs (u, d). u = current ray
915 // index for column; d = per-pixel delta.
916 let mut u = [0i32; 4];
917 let mut d = [0i32; 4];
918 for k in 0..4 {
919 u[k] = scratch.uurend[xu + k];
920 d[k] = scratch.uurend[xu + k + half_stride];
921 }
922 // Gather 4 castdat hits.
923 let mut col = [0i32; 4];
924 let mut dst = [0i32; 4];
925 for k in 0..4 {
926 let ray_idx = (u[k] >> 16) as usize;
927 let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
928 let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
929 let cd = scratch.radar[cd_offset as usize];
930 col[k] = cd.col;
931 dst[k] = cd.dist;
932 }
933 if !scratch.foglut.is_empty() {
934 for k in 0..4 {
935 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
936 }
937 }
938 let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
939 let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
940 let vdst = _mm_cvtepi32_ps(vdsi);
941 let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
942 let vinv = _mm_rsqrt_ps(vsqr);
943 let vz = _mm_mul_ps(vdst, vinv);
944
945 let pixel_idx = row_start + xu;
946 _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
947 _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
948
949 // Write back NEW uurend values — u + d per lane.
950 for k in 0..4 {
951 scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
952 }
953
954 vdx = _mm_add_ps(vdx, vstrx4);
955 vdy = _mm_add_ps(vdy, vstry4);
956 iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
957 x += 4;
958 }
959 dirx = _mm_cvtss_f32(vdx);
960 diry = _mm_cvtss_f32(vdy);
961 }
962
963 // R9: NEON 4-pixel batch for vrend — aarch64 equivalent.
964 // Same structure as hrend NEON: scalar gather + uurend
965 // read/write, NEON rsqrt for z, vectorized store.
966 #[cfg(target_arch = "aarch64")]
967 unsafe {
968 use core::arch::aarch64::{
969 float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
970 vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
971 vst1q_u32,
972 };
973 let strx = rs.strx;
974 let stry = rs.stry;
975 let vstrx4 = vdupq_n_f32(strx * 4.0);
976 let vstry4 = vdupq_n_f32(stry * 4.0);
977 let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
978 let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
979 let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
980 let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
981 while p1 - x >= 4 {
982 let xu = x as usize;
983 // Read 4 OLD uurend pairs (u, d).
984 let mut u = [0i32; 4];
985 let mut d = [0i32; 4];
986 for k in 0..4 {
987 u[k] = scratch.uurend[xu + k];
988 d[k] = scratch.uurend[xu + k + half_stride];
989 }
990 // Scalar gather — 4 castdat hits.
991 let mut col = [0i32; 4];
992 let mut dst = [0i32; 4];
993 for k in 0..4 {
994 let ray_idx = (u[k] >> 16) as usize;
995 let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
996 let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
997 let cd = scratch.radar[cd_offset as usize];
998 col[k] = cd.col;
999 dst[k] = cd.dist;
1000 }
1001 if !scratch.foglut.is_empty() {
1002 for k in 0..4 {
1003 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
1004 }
1005 }
1006 let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
1007 let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
1008 let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
1009 let est = vrsqrteq_f32(vsqr);
1010 let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
1011 let vz = vmulq_f32(vdst, vinv);
1012
1013 let pixel_idx = row_start + xu;
1014 vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
1015 vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
1016
1017 // Write back NEW uurend values — u + d per lane.
1018 for k in 0..4 {
1019 scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
1020 }
1021
1022 vdx = vaddq_f32(vdx, vstrx4);
1023 vdy = vaddq_f32(vdy, vstry4);
1024 iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
1025 x += 4;
1026 }
1027 dirx = vgetq_lane_f32(vdx, 0);
1028 diry = vgetq_lane_f32(vdy, 0);
1029 }
1030
1031 // R10.3: wasm SIMD 4-pixel batch for vrend — equivalent of
1032 // the SSE2 / NEON paths above. Same scalar-gather + uurend
1033 // read/write structure; full-precision `1.0 / sqrt(x)` for
1034 // the inverse magnitude, since wasm SIMD has no rsqrt
1035 // approximation. Bytes diverge from the other arches —
1036 // R10.4's `wasm-hashes.txt` covers the divergence.
1037 #[cfg(target_arch = "wasm32")]
1038 unsafe {
1039 use core::arch::wasm32::{
1040 f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
1041 f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
1042 };
1043 let strx = rs.strx;
1044 let stry = rs.stry;
1045 let vstrx4 = f32x4_splat(strx * 4.0);
1046 let vstry4 = f32x4_splat(stry * 4.0);
1047 let one = f32x4_splat(1.0);
1048 let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
1049 let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
1050 while p1 - x >= 4 {
1051 let xu = x as usize;
1052 // Read 4 OLD uurend pairs (u, d).
1053 let mut u = [0i32; 4];
1054 let mut d = [0i32; 4];
1055 for k in 0..4 {
1056 u[k] = scratch.uurend[xu + k];
1057 d[k] = scratch.uurend[xu + k + half_stride];
1058 }
1059 // Scalar gather — 4 castdat hits.
1060 let mut col = [0i32; 4];
1061 let mut dst = [0i32; 4];
1062 for k in 0..4 {
1063 let ray_idx = (u[k] >> 16) as usize;
1064 let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
1065 let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
1066 let cd = scratch.radar[cd_offset as usize];
1067 col[k] = cd.col;
1068 dst[k] = cd.dist;
1069 }
1070 if !scratch.foglut.is_empty() {
1071 for k in 0..4 {
1072 col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
1073 }
1074 }
1075 let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
1076 let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
1077 let vdst = f32x4_convert_i32x4(vdsi);
1078 let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
1079 let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
1080 let vz = f32x4_mul(vdst, vinv);
1081
1082 let pixel_idx = row_start + xu;
1083 v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
1084 v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
1085
1086 // Write back NEW uurend values — u + d per lane.
1087 for k in 0..4 {
1088 scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
1089 }
1090
1091 vdx = f32x4_add(vdx, vstrx4);
1092 vdy = f32x4_add(vdy, vstry4);
1093 iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
1094 x += 4;
1095 }
1096 dirx = f32x4_extract_lane::<0>(vdx);
1097 diry = f32x4_extract_lane::<0>(vdy);
1098 }
1099
1100 // Scalar tail — handles 0..3 leftover pixels on x86_64 /
1101 // aarch64 / wasm32 and the full body on other targets.
1102 while x < p1 {
1103 // Vertical scan reads the per-column ray index from
1104 // uurend[sx] (>>16 to drop the fractional bits).
1105 let xu = x as usize;
1106 let ray_idx = (scratch.uurend[xu] >> 16) as usize;
1107 let cd_offset = scratch.angstart[ray_idx] + iplc_local as isize;
1108 let cd = scratch.radar[cd_offset as usize];
1109 let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
1110
1111 let pixel_idx = row_start + xu;
1112 #[allow(clippy::cast_precision_loss)]
1113 let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
1114 // SAFETY: see hrend's matching write — pixel_idx is in-bounds
1115 // by the same scan_loops geometry argument; right + left
1116 // quadrants own disjoint sx ranges so cross-thread writes
1117 // are pairwise pixel-disjoint.
1118 unsafe {
1119 self.target.write_color(pixel_idx, col as u32);
1120 self.target.write_depth(pixel_idx, z);
1121 }
1122
1123 dirx += rs.strx;
1124 diry += rs.stry;
1125 // Advance per-column ray index. uurend[x] persists
1126 // across vrend calls — this state is what couples
1127 // consecutive scanlines through the same column.
1128 scratch.uurend[xu] = scratch.uurend[xu].wrapping_add(scratch.uurend[xu + half_stride]);
1129 x += 1;
1130 iplc_local = iplc_local.wrapping_add(iinc);
1131 }
1132 }
1133}
1134
1135#[cfg(test)]
1136mod tests {
1137 use super::*;
1138 use crate::rasterizer::CastDat;
1139
1140 /// Build owned per-frame state so tests can assemble a
1141 /// `ScanContext` with proper-lifetime borrows. Values aren't
1142 /// load-bearing for the scalar-fill behaviour tests; the real
1143 /// `gline` cares about them, hence `camera_state` joining the
1144 /// tuple.
1145 fn dummy_per_frame() -> (
1146 crate::camera_math::CameraState,
1147 crate::projection::ProjectionRect,
1148 crate::ray_step::RayStep,
1149 crate::opticast_prelude::OpticastPrelude,
1150 ) {
1151 let cam = crate::Camera {
1152 pos: [0.0, 0.0, 0.0],
1153 right: [1.0, 0.0, 0.0],
1154 down: [0.0, 1.0, 0.0],
1155 forward: [0.0, 0.0, 1.0],
1156 };
1157 let cs = crate::camera_math::derive(&cam, 64, 64, 32.0, 32.0, 32.0);
1158 let proj = crate::projection::derive_projection(&cs, 64, 64, 32.0, 32.0, 32.0, 1);
1159 let rs = crate::ray_step::derive_ray_step(&cs, proj.cx, proj.cy, 32.0);
1160 let prelude = crate::opticast_prelude::derive_prelude(&cs, 2048, 1, 4, 1024);
1161 (cs, proj, rs, prelude)
1162 }
1163
1164 #[test]
1165 fn frame_setup_caches_ray_step() {
1166 let mut fb = vec![0u32; 64 * 64];
1167 let mut zb = vec![0.0f32; 64 * 64];
1168 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1169 let (cs, proj, rs, prelude) = dummy_per_frame();
1170 let ctx = ScanContext {
1171 proj: &proj,
1172 rs: &rs,
1173 prelude: &prelude,
1174 xres: 64,
1175 y_start: 0,
1176 y_end: 64,
1177 anginc: 1,
1178 camera_state: &cs,
1179 camera_gstartz0: 0,
1180 camera_gstartz1: 0,
1181 camera_vptr_offset: 0,
1182 };
1183 r.frame_setup(&ctx);
1184 let cached_rs = r.frame.as_ref().expect("frame populated").ray_step;
1185 assert_eq!(cached_rs.strx.to_bits(), rs.strx.to_bits());
1186 assert_eq!(cached_rs.stry.to_bits(), rs.stry.to_bits());
1187 assert_eq!(cached_rs.cx16, rs.cx16);
1188 assert_eq!(cached_rs.cy16, rs.cy16);
1189 }
1190
1191 #[cfg(target_arch = "x86_64")]
1192 #[test]
1193 fn hrend_sse_batch_writes_4_pixel_block() {
1194 // R5.1 smoke test: hrend's SSE batch fires for span len ≥
1195 // 4. Pre-fill radar with 4 distinct ARGB values, run hrend
1196 // over [10..14], assert the framebuffer carries the colour
        // bits (z lanes use the rsqrtps approximation, so they are
        // intentionally not bit-checked).
1199 let mut fb = vec![0u32; 64 * 64];
1200 let mut zb = vec![0.0f32; 64 * 64];
1201 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1202 let (cs, proj, rs, prelude) = dummy_per_frame();
1203 let ctx = ScanContext {
1204 proj: &proj,
1205 rs: &rs,
1206 prelude: &prelude,
1207 xres: 64,
1208 y_start: 0,
1209 y_end: 64,
1210 anginc: 1,
1211 camera_state: &cs,
1212 camera_gstartz0: 0,
1213 camera_gstartz1: 0,
1214 camera_vptr_offset: 0,
1215 };
1216 r.frame_setup(&ctx);
1217
1218 let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1219 // 4 colour records so the batch reads each lane.
1220 for (i, slot) in scratch.radar.iter_mut().enumerate().take(4) {
1221 slot.col = 0x8000_0000_u32 as i32 | i as i32;
1222 slot.dist = 1024;
1223 }
1224 // angstart[i] = i so ray_idx i (= plc>>16) resolves to
1225 // radar[i + j] = radar[i] with j=0.
1226 for k in 0..4 {
1227 scratch.angstart[k] = k as isize;
1228 }
1229
1230 // sx=10, p1=14, j=0, plc=0, incr=1<<16 → plc>>16 steps
1231 // 0,1,2,3 over the four pixels → angstart[0..4].
1232 r.hrend(&mut scratch, 10, 5, 14, 0, 1 << 16, 0);
1233
1234 let row_off = 5 * 64;
1235 for k in 0..4 {
1236 let want = 0x8000_0000_u32 | k as u32;
1237 assert_eq!(
1238 fb[row_off + 10 + k],
1239 want,
1240 "fb[5][{}] = {:#010x}, expected {:#010x}",
1241 10 + k,
1242 fb[row_off + 10 + k],
1243 want,
1244 );
1245 // z lane non-zero (rsqrtps produced something).
1246 // Bit-compare to dodge clippy::float_cmp; we just want
1247 // to confirm the slot was written, not its precise value.
1248 assert_ne!(zb[row_off + 10 + k].to_bits(), 0u32);
1249 }
1250 }
1251
1252 #[test]
1253 fn fog_blend_disabled_returns_col_unchanged() {
1254 let foglut: Vec<i32> = Vec::new();
1255 let col = 0x0080_C040;
1256 assert_eq!(fog_blend(col, 0x1234_5678, &foglut, 0xFF_FFFF), col);
1257 }
1258
1259 #[test]
1260 fn fog_blend_full_fog_returns_fog_col_per_channel() {
1261 // l = 32767 → ((fog - col) * 32767) >> 15 = fog - col → final
1262 // is col + (fog - col) = fog (per channel; alpha untouched).
1263 let foglut = vec![32767; 2048];
1264 let col = 0x80_AA_BB_CC_u32 as i32;
1265 let fog = 0x00_11_22_33_i32;
1266 let blended = fog_blend(col, 0, &foglut, fog) as u32;
1267 // Low 24 bits = fog colour; alpha (high byte) survives from col.
1268 assert_eq!(blended & 0x00FF_FFFF, fog as u32 & 0x00FF_FFFF);
1269 assert_eq!(blended & 0xFF00_0000, col as u32 & 0xFF00_0000);
1270 }
1271
1272 #[test]
1273 fn set_fog_zero_distance_clears_table() {
1274 let mut s = ScanScratch::new_for_size(64, 64, 64);
1275 s.set_fog(0x1234_5678, 100);
1276 assert!(!s.foglut.is_empty());
1277 s.set_fog(0, 0);
1278 assert!(s.foglut.is_empty());
1279 }
1280
1281 #[test]
1282 fn set_fog_table_starts_at_zero_and_climbs() {
1283 let mut s = ScanScratch::new_for_size(64, 64, 64);
1284 s.set_fog(0xFF, 1024);
1285 // First entry: acc = 0 → hi16 = 0.
1286 assert_eq!(s.foglut[0], 0);
1287 // Last entry near the overflow boundary should be near 32767
1288 // (the saturate-fill value); voxlap's exact step makes
1289 // foglut[2047] either ~32766 (last walked entry) or 32767
1290 // (post-overflow padding) depending on max_scan_dist
1291 // divisibility.
1292 assert!(
1293 s.foglut[2047] > 30_000,
1294 "tail entry too low: {}",
1295 s.foglut[2047]
1296 );
1297 }
1298
1299 #[test]
1300 fn hrend_writes_pixel_per_column_from_radar() {
1301 // Pre-fill scratch.radar with a recognizable colour gradient
1302 // and pre-set scratch.angstart so hrend resolves to the
1303 // expected radar slot per pixel. Then call hrend manually
1304 // and verify the framebuffer received the colours.
1305 let mut fb = vec![0u32; 64 * 64];
1306 let mut zb = vec![0.0f32; 64 * 64];
1307 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1308 let (cs, proj, rs, prelude) = dummy_per_frame();
1309 let ctx = ScanContext {
1310 proj: &proj,
1311 rs: &rs,
1312 prelude: &prelude,
1313 xres: 64,
1314 y_start: 0,
1315 y_end: 64,
1316 anginc: 1,
1317 camera_state: &cs,
1318 camera_gstartz0: 0,
1319 camera_gstartz1: 0,
1320 camera_vptr_offset: 0,
1321 };
1322 r.frame_setup(&ctx);
1323
1324 let mut scratch = ScanScratch::new_for_size(64, 64, 64);
        // Synthetic radar: 16 castdat entries with col = 0x8000_0000 | i,
        // dist = 1024 (the zbuffer math divides by sqrt of dir², so any
        // stable distance works).
1328 for (i, slot) in scratch.radar.iter_mut().enumerate().take(16) {
1329 slot.col = 0x8000_0000_u32 as i32 | i as i32;
1330 slot.dist = 1024;
1331 }
        // angstart[0] = 0 is the only entry that matters here: incr = 0
        // holds plc >> 16 at ray index 0 for every pixel, so each pixel
        // resolves to radar[angstart[0] + j] — j selects the castdat
        // column within the ray.
1336 scratch.angstart[0] = 0;
1337
        // Render a span of 4 columns starting at sx=10, sy=5. Voxlap's
        // hrend uses a single j for the whole span (the per-ray castdat
        // column is fixed), so with j=2 every pixel in
        // framebuffer[row*pitch + 10..14] should equal
        // radar[0 + 2].col = 0x80000002.
1343 r.hrend(&mut scratch, 10, 5, 14, 0, 0, 2);
1344
1345 let row_off = 5 * 64;
1346 for x in 10..14 {
1347 let want = 0x8000_0000_u32 | 2;
1348 assert_eq!(
1349 fb[row_off + x],
1350 want,
1351 "fb[5][{x}] = {:#010x}, expected {:#010x}",
1352 fb[row_off + x],
1353 want,
1354 );
1355 }
1356 // Pixels outside the rendered span are untouched.
1357 assert_eq!(fb[row_off + 9], 0);
1358 assert_eq!(fb[row_off + 14], 0);
1359 }
1360
1361 #[test]
1362 fn end_to_end_opticast_runs_through_real_gline() {
1363 // Smoke test: with a valid above-the-slab camera and the
1364 // real R4.3a-rewire-3b gline, full opticast should run
        // without panicking and return `Rendered`. The synthetic
        // single-slab world has no voxel colour bytes, so no pixel
        // contents are asserted yet — see the trailing comment for
        // where grouscan's fill loops actually bail and what the
        // future colour assertion will check.
1370 use crate::opticast as opticast_fn;
1371 use crate::rasterizer::ScratchPool;
1372 use crate::OpticastSettings;
1373
1374 let mut fb = vec![0u32; 640 * 480];
1375 let mut zb = vec![0.0f32; 640 * 480];
1376 let mut pool = ScratchPool::new(640, 480, 2048);
        // Recognizable sky colour — kept as scaffolding for the future
        // colour-presence assertion (see the trailing comment); the
        // current synthetic world never reaches startsky's solid fill.
1380 let sky_col = 0x80AB_CDEF_u32 as i32;
1381 pool.set_skycast(sky_col, 0x7FFF_FFFF);
1382
1383 // Single solid slab at z = 200..254. cz = 128 < 200 →
1384 // air-above-the-slab, opticast renders. Synthetic world:
1385 // only the camera's column (1024 * 2048 + 1024) holds the
1386 // slab; every other column is empty. Build world before
1387 // the rasterizer so it can borrow `&column` / `&column_offsets`.
1388 let column = vec![0u8, 200, 254, 0];
1389 let cam_idx = 1024usize * 2048 + 1024;
1390 let mut column_offsets = vec![0u32; 2048 * 2048 + 1];
1391 let column_len_u32 = u32::try_from(column.len()).expect("column fits u32");
1392 for offset in &mut column_offsets[(cam_idx + 1)..] {
1393 *offset = column_len_u32;
1394 }
1395
1396 let mip_base_offsets = [0usize, column_offsets.len()];
1397 let mut rasterizer = ScalarRasterizer::new(
1398 &mut fb,
1399 &mut zb,
1400 640,
1401 &column,
1402 &column_offsets,
1403 &mip_base_offsets,
1404 2048,
1405 );
1406
1407 let cam = crate::Camera {
1408 pos: [1024.0, 1024.0, 128.0],
1409 right: [1.0, 0.0, 0.0],
1410 down: [0.0, 1.0, 0.0],
1411 forward: [0.0, 0.0, 1.0],
1412 };
1413 let settings = OpticastSettings::for_oracle_framebuffer(640, 480);
1414
1415 let outcome = opticast_fn(
1416 &mut rasterizer,
1417 &mut pool,
1418 &cam,
1419 &settings,
1420 2048,
1421 &column,
1422 &column_offsets,
1423 );
1424 assert_eq!(outcome, crate::OpticastOutcome::Rendered);
1425
1426 // Wiring smoke test — gline → derive_gline_frustum →
1427 // grouscan_run chain executes for every ray without
1428 // panicking, opticast returns Rendered. The synthetic
1429 // single-slab world has no colour bytes (header only) so
1430 // grouscan's drawflor fill bails on the bounds check and
1431 // routes to predeletez → deletez → Done before reaching
            // startsky; the radar stays at default zeros, so the
            // framebuffer carries only zero pixels. What
1434 // matters is that nothing crashed — the per-ray gline
1435 // arithmetic + cf[128] seeding + grouscan dispatch all
1436 // hold up under live ray geometry. Once R4.3a-rewire-4
1437 // loads a real `.vxl` with colour bytes, grouscan's fill
1438 // loops will write recognisable voxel colours and a
1439 // colour-presence assertion replaces this comment.
        let _ = sky_col; // suppress the unused-variable warning — kept
                         // as scaffolding for the future assertion.
1442 }
1443
1444 #[cfg(target_arch = "x86_64")]
1445 #[test]
1446 fn vrend_sse_batch_writes_4_pixel_block() {
1447 // R5.3 smoke test: vrend's SSE batch fires for span len ≥
1448 // 4. Pre-fill 4 distinct radar entries, set angstart so
1449 // each lane's uurend[sx]>>16 indexes a different ray, run
1450 // vrend over [10..14], assert each column got the right
1451 // colour and uurend advanced.
1452 let mut fb = vec![0u32; 64 * 64];
1453 let mut zb = vec![0.0f32; 64 * 64];
1454 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1455 let (cs, proj, rs, prelude) = dummy_per_frame();
1456 let ctx = ScanContext {
1457 proj: &proj,
1458 rs: &rs,
1459 prelude: &prelude,
1460 xres: 64,
1461 y_start: 0,
1462 y_end: 64,
1463 anginc: 1,
1464 camera_state: &cs,
1465 camera_gstartz0: 0,
1466 camera_gstartz1: 0,
1467 camera_vptr_offset: 0,
1468 };
1469 r.frame_setup(&ctx);
1470
1471 let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1472 for k in 0..4 {
1473 scratch.radar[k] = CastDat {
1474 col: 0x8000_0000_u32 as i32 | k as i32,
1475 dist: 1024,
1476 };
1477 scratch.angstart[k] = k as isize;
1478 }
1479 // uurend[sx + k] >> 16 = k → ray_idx k → angstart[k] = k
1480 // → radar[k]. delta = 5 (so post-batch uurend[sx + k] =
1481 // (k << 16) + 5).
1482 let half = scratch.uurend_half_stride;
1483 for k in 0..4 {
1484 scratch.uurend[10 + k] = (k as i32) << 16;
1485 scratch.uurend[10 + k + half] = 5;
1486 }
1487
1488 r.vrend(&mut scratch, 10, 5, 14, 0, 0);
1489
1490 let row_off = 5 * 64;
1491 for k in 0..4 {
1492 let want = 0x8000_0000_u32 | k as u32;
1493 assert_eq!(fb[row_off + 10 + k], want, "fb col[{}]", 10 + k);
1494 assert!(zb[row_off + 10 + k].to_bits() != 0, "z[{}]", 10 + k);
1495 // Post-batch uurend = old_u + delta.
1496 assert_eq!(
1497 scratch.uurend[10 + k],
1498 ((k as i32) << 16) + 5,
1499 "uurend[{}]",
1500 10 + k
1501 );
1502 }
1503 }
1504
1505 #[test]
1506 fn vrend_advances_uurend_per_pixel() {
1507 // Verify the uurend[sx] += uurend[sx+half_stride] mutation
1508 // happens once per pixel.
1509 let mut fb = vec![0u32; 64 * 64];
1510 let mut zb = vec![0.0f32; 64 * 64];
1511 let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1512 let (cs, proj, rs, prelude) = dummy_per_frame();
1513 let ctx = ScanContext {
1514 proj: &proj,
1515 rs: &rs,
1516 prelude: &prelude,
1517 xres: 64,
1518 y_start: 0,
1519 y_end: 64,
1520 anginc: 1,
1521 camera_state: &cs,
1522 camera_gstartz0: 0,
1523 camera_gstartz1: 0,
1524 camera_vptr_offset: 0,
1525 };
1526 r.frame_setup(&ctx);
1527
1528 let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1529 scratch.radar[0] = CastDat {
1530 col: 0x8033_4455_u32 as i32,
1531 dist: 1024,
1532 };
1533 // angstart[0] = 0 → all rays read radar[0 + iplc].
1534 scratch.angstart[0] = 0;
1535 // Pre-set uurend so ray_idx = uurend[sx] >> 16 = 0 for all
1536 // four columns (we want them to all hit angstart[0]).
1537 let half = scratch.uurend_half_stride;
1538 for sx in 10..14 {
1539 scratch.uurend[sx] = 0;
1540 scratch.uurend[sx + half] = 1; // delta added per pixel
1541 }
1542
1543 r.vrend(&mut scratch, 10, 5, 14, 0, 0);
1544
1545 // Each column's uurend should have advanced by the delta.
1546 for sx in 10..14 {
1547 assert_eq!(scratch.uurend[sx], 1, "uurend[{sx}] not advanced");
1548 }
1549 }
1550}