// roxlap_core/scalar_rasterizer.rs
1//! Scalar `Rasterizer` implementation — port of voxlaptest's 4.7.5
2//! scalar `hrendzsse` / `vrendzsse` fallbacks (`voxlap5.c:1947` /
3//! `:2003` post-4.8.4 line numbers). Writes one `u32` ARGB pixel +
4//! one `f32` z-buffer entry per screen position from radar entries
5//! the (still-stubbed) `gline` will produce in R4.3.
6//!
7//! Per-pixel math (the SSE-batched form lives behind R5; this is the
8//! pre-batch shape):
9//!
10//! ```text
11//! col = scratch.angstart[plc >> 16] + j     // signed offset into radar
12//! framebuffer[pixel] = radar[col].col       // packed ARGB
13//! z = radar[col].dist / sqrt(dirx² + diry²) // f32 z-buffer entry
14//! dirx += strx; diry += stry; plc += incr
15//! ```
16//!
17//! The vertical scan additionally mutates `scratch.uurend[sx] +=
18//! scratch.uurend[sx + half_stride]` per pixel — that's why the
19//! Rasterizer trait now hands `&mut ScanScratch` to `vrend` /
20//! `hrend`. `gline` is a TODO stub; without it the angstart entries
21//! it would write are zero, so a freshly allocated radar yields all-
22//! zero pixels. Tests pre-fill the radar manually to verify the
23//! scanline rasterizer end of the pipeline.
24
25// Module-wide cast allows: the per-pixel arithmetic constantly
26// crosses signed/unsigned and i32/usize boundaries (loop counters,
27// signed offsets into radar, framebuffer indices). Annotating each
28// site individually buries the per-pixel logic in lint suppressions;
29// the cast-correctness invariants hold by construction.
30#![allow(
31    clippy::cast_sign_loss,
32    clippy::cast_possible_truncation,
33    clippy::cast_possible_wrap,
34    clippy::similar_names
35)]
36
37use std::marker::PhantomData;
38
39use crate::camera_math::CameraState;
40use crate::fixed::ftol;
41use crate::gline::derive_gline_frustum;
42use crate::grouscan::{grouscan_run, CfType, GrouscanInputs, CF_SEED_INDEX};
43use crate::opticast::camera_column_slice;
44use crate::opticast_prelude::{OpticastPrelude, PREC};
45use crate::rasterizer::{Rasterizer, ScanScratch};
46use crate::ray_step::RayStep;
47use crate::scan_loops::ScanContext;
48
49/// Borrowed view of the framebuffer + zbuffer as raw pointers.
50///
51/// R12.2.0 introduces this so the per-frame ScalarRasterizer can be
52/// `Copy` (for the per-thread fan-out R12.2.1 lands). Holding `&'a
53/// mut [u32]` / `&'a mut [f32]` directly forces exclusive borrows
54/// per instance, blocking the four quadrants from running on four
55/// threads even though their pixel writes are disjoint.
56///
57/// Constructed safely from exclusive slice borrows — the slices are
58/// consumed and re-exposed as raw pointers tied to lifetime `'a`
59/// via `PhantomData`. Once a `RasterTarget` exists, it is the sole
60/// path to the underlying memory; the slices cannot be used through
61/// any other channel for the duration of `'a`.
62///
63/// `Copy` lets opticast hand each quadrant thread its own copy of
64/// the same target. The four threads write disjoint pixels (top /
65/// bottom / left / right wedges of the screen, no overlap), so
66/// pointer aliasing is safe under the documented invariant.
67///
68/// # Safety contract for parallel use
69/// Callers that copy a `RasterTarget` and pass copies to multiple
70/// threads MUST guarantee that the threads collectively write to
71/// pairwise-disjoint pixel indices. opticast enforces this via the
72/// four-quadrant wedge geometry (see `scan_loops::{top,right,
73/// bottom,left}_quadrant`). Single-threaded callers (R12.1 default)
74/// hold one copy and trivially satisfy the invariant.
#[derive(Clone, Copy, Debug)]
pub struct RasterTarget<'a> {
    /// Base pointer of the ARGB framebuffer.
    fb_ptr: *mut u32,
    /// Framebuffer length in `u32` elements; bound for `write_color`.
    fb_len: usize,
    /// Base pointer of the `f32` z-buffer.
    zb_ptr: *mut f32,
    /// Z-buffer length in `f32` elements; bound for `write_depth`.
    zb_len: usize,
    /// Ties the raw pointers to the source slices' exclusive borrow
    /// `'a`, so the slices cannot be touched while a target lives.
    _marker: PhantomData<&'a mut [u32]>,
}
83
// SAFETY: `RasterTarget` is morally a borrowed mutable slice pair —
// the same shape `&'a mut [u32]` / `&'a mut [f32]` would have, both of
// which are `Send` when `T: Send`. (The raw-pointer fields are what
// strip the auto `Send`/`Sync` impls, hence the manual ones here.)
// Multi-thread safety is enforced by the wedge / strip-disjoint
// write invariant (see struct doc).
unsafe impl Send for RasterTarget<'_> {}

// SAFETY: sharing `&RasterTarget` across threads exposes only the
// raw pointers + lengths. Reading a pointer field is itself free of
// data races; concurrent writes through the pointer are gated by
// the disjoint-write invariant the caller upholds. Required so
// `ScalarRasterizer: Sync`, which `rayon::par_iter_mut` needs to
// share `&rasterizer` across the strip-parallel closures (R12.3.1).
unsafe impl Sync for RasterTarget<'_> {}
97
98impl<'a> RasterTarget<'a> {
99    /// Build a target from exclusive slice borrows. The slices are
100    /// consumed (their `&'a mut` reborrow is the load-bearing thing —
101    /// this constructor is the only way to mint a `RasterTarget`
102    /// from safe code).
103    #[must_use]
104    pub fn new(framebuffer: &'a mut [u32], zbuffer: &'a mut [f32]) -> Self {
105        Self {
106            fb_ptr: framebuffer.as_mut_ptr(),
107            fb_len: framebuffer.len(),
108            zb_ptr: zbuffer.as_mut_ptr(),
109            zb_len: zbuffer.len(),
110            _marker: PhantomData,
111        }
112    }
113
114    /// Framebuffer length in `u32` elements.
115    #[must_use]
116    pub fn fb_len(self) -> usize {
117        self.fb_len
118    }
119
120    /// Raw mutable framebuffer pointer. Used by SSE blocks that do
121    /// their own arithmetic + bounds reasoning.
122    ///
123    /// # Safety
124    /// Callers must respect `fb_len` and the parallel-use invariant.
125    #[must_use]
126    pub fn fb_ptr(self) -> *mut u32 {
127        self.fb_ptr
128    }
129
130    /// Raw mutable zbuffer pointer. Same contract as
131    /// [`Self::fb_ptr`].
132    #[must_use]
133    pub fn zb_ptr(self) -> *mut f32 {
134        self.zb_ptr
135    }
136
137    /// Write one ARGB pixel.
138    ///
139    /// # Safety
140    /// `idx < self.fb_len()`, plus the parallel-use invariant.
141    pub unsafe fn write_color(self, idx: usize, color: u32) {
142        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
143        // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
144        unsafe { self.fb_ptr.add(idx).write(color) };
145    }
146
147    /// Write one z-buffer entry.
148    ///
149    /// # Safety
150    /// `idx < self.fb_len()` (zbuffer length matches fb), plus the
151    /// parallel-use invariant.
152    pub unsafe fn write_depth(self, idx: usize, z: f32) {
153        debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
154        // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
155        unsafe { self.zb_ptr.add(idx).write(z) };
156    }
157}
158
159// gcsub now lives on `ScanScratch::gcsub`; the host pokes it via
160// `ScanScratch::set_side_shades` per frame (mirrors voxlap's
161// `setsideshades` global). The default-state pattern
162// (`0x00ff00ff00ff00ff` per entry, == `setsideshades(0,…,0)`) is
163// what `ScanScratch::new_for_size` initialises to, so the oracle
164// stays bit-exact when no shading is configured.
165
/// Per-channel fog blend — voxlap5.c:2052-2056 (and the matching
/// hrend / vrend scalar tail). `col` is the source ARGB voxel
/// colour; `dist` is the radar slot's depth (PREC-scaled, so
/// `>> 20` gives the integer cell distance index into `foglut`).
/// An empty `foglut` short-circuits to "no fog" (returns `col`
/// unchanged); out-of-range indices saturate to 32767 (full fog)
/// since `set_fog` pads the table that way.
//
// The C original is one branchless expression per channel; the
// shared closure below factors out the per-channel shift without
// changing any intermediate i32 arithmetic, so results stay
// bit-identical to the C form.
fn fog_blend(col: i32, dist: i32, foglut: &[i32], fog_col: i32) -> i32 {
    if foglut.is_empty() {
        return col;
    }
    // Integer cell distance. A negative `dist` wraps to a huge
    // usize here and lands in the saturating OOB branch below.
    let cell = (dist >> 20) as usize;
    let level = foglut.get(cell).map_or(32767, |&v| v) & 32767;
    // One colour channel at bit offset `shift`: move col toward
    // fog_col by level/32768, in exactly the C int32 form.
    let channel = |shift: i32| -> i32 {
        let delta = ((fog_col >> shift) & 255) - ((col >> shift) & 255);
        ((delta * level) >> 15) << shift
    };
    channel(0) + channel(8) + channel(16) + col
}
195
196/// Per-ray sky-row update. Mirror of voxlap5.c:1236-1255.
197///
198/// On the first ray of a quadrant (`scratch.sky_cur_lng < 0`),
199/// initialise from the ray's `atan2(vy1, vx1)` mapped through
200/// `sky.lng_mul`. On subsequent rays, walk `sky.lng[]` forward
201/// (when `sky_cur_dir < 0`) or backward (`sky_cur_dir >= 0`)
202/// until the cross-product flips sign — voxlap's rotating-cursor
203/// trick that avoids re-running atan2 per ray. After the search,
204/// stamp `scratch.sky_off = sky_cur_lng * sky.bpl`, which
205/// `phase_startsky_textured` divides by 4 to land at the row's
206/// pixel-base index.
207fn sky_per_ray_update(
208    scratch: &mut crate::rasterizer::ScanScratch,
209    sky: &crate::sky::Sky,
210    vx1: f32,
211    vy1: f32,
212) {
213    let ysiz = sky.ysiz;
214    if scratch.sky_cur_lng < 0 {
215        // First-ray init — atan2 mapped to row index.
216        let ang = vy1.atan2(vx1) + std::f32::consts::PI;
217        let raw = ang * sky.lng_mul - 0.5;
218        let mut lng = ftol(raw);
219        // Voxlap's `(uint32_t)skycurlng >= skyysiz` clamp uses an
220        // uninitialised `j` for the corrective shift; we substitute
221        // `rem_euclid` for a deterministic in-range value. The
222        // rotating-cursor walk in subsequent rays will quickly
223        // converge whichever way we land.
224        if (lng as u32) >= (ysiz as u32) {
225            lng = lng.rem_euclid(ysiz);
226        }
227        scratch.sky_cur_lng = lng;
228    } else if scratch.sky_cur_dir < 0 {
229        // Walk forward (rotating).
230        let mut j = scratch.sky_cur_lng + 1;
231        if j >= ysiz {
232            j = 0;
233        }
234        loop {
235            let l = sky.lng[j as usize];
236            if l[0] * vy1 <= l[1] * vx1 {
237                break;
238            }
239            scratch.sky_cur_lng = j;
240            j += 1;
241            if j >= ysiz {
242                j = 0;
243            }
244        }
245    } else {
246        // Walk backward (rotating).
247        loop {
248            let l = sky.lng[scratch.sky_cur_lng as usize];
249            if l[0] * vy1 >= l[1] * vx1 {
250                break;
251            }
252            scratch.sky_cur_lng -= 1;
253            if scratch.sky_cur_lng < 0 {
254                scratch.sky_cur_lng = ysiz - 1;
255            }
256        }
257    }
258    // Voxlap: `skyoff = skycurlng * skybpl + nskypic`. We strip
259    // the `+ nskypic` (texture base address) — `phase_startsky`
260    // adds it implicitly by indexing `sky.pixels` directly.
261    scratch.sky_off = scratch.sky_cur_lng * sky.bpl;
262}
263
264/// Per-frame state cached on first `frame_setup` call. Owned here
265/// (vs. borrowed from `ScanContext`) because gline needs to read
266/// it across many calls without re-borrowing each time. The
267/// `prelude` clone copies one `Vec<i32>` (the `y_lookup` mip
268/// table) per frame — cheap.
269//
270// `Clone` is used by [`ScalarRasterizer`]'s 4-way fan-out (R12.2.1)
271// to mint one rasterizer per quadrant thread; each clone copies the
272// (~few KB) y_lookup Vec — small per-frame allocation cost.
#[derive(Clone)]
struct FrameCache {
    /// Per-frame ray-step coefficients (`strx`/`stry`/`heix`/
    /// `heiy`/`addx`/`addy`) that hrend/vrend read per pixel.
    ray_step: RayStep,
    /// Camera pose snapshot handed to `derive_gline_frustum` per ray.
    camera_state: CameraState,
    /// Cloned opticast prelude — `y_lookup`, `li_pos`,
    /// `max_scan_dist`, `column_index`, `forward_z_sign`, `x_mip`
    /// are all read inside `gline`.
    prelude: OpticastPrelude,
    /// Air-gap z lower bound seeded into `cf[CF_SEED_INDEX].z0`.
    gstartz0: i32,
    /// Air-gap z upper bound seeded into `cf[CF_SEED_INDEX].z1`.
    gstartz1: i32,
    /// Voxlap's `v - *ixy_sptr_col` — byte offset within the
    /// camera's column to the slab whose top bounds the air gap
    /// from below. `0` ⇒ column-top.
    vptr_offset: usize,
}
285
/// Scalar rasterizer that writes pixels and a z-buffer entry per
/// screen position.
///
/// Borrows the framebuffer + zbuffer for the duration of one
/// `opticast` call; SDL hosts allocate these once and reuse across
/// frames, see `roxlap-host`.
//
// `slab_buf` / `column_offsets` / `vsid` are R4.3a-rewire-2
// scaffolding. NOTE(review): `gline` below now reads slab_buf /
// column_offsets / mip_base_offsets / vsid (camera_column_slice +
// GrouscanInputs), so the dead_code allow may be stale — confirm
// which fields are still unread and drop it if none.
#[allow(dead_code)]
#[derive(Clone)]
pub struct ScalarRasterizer<'a> {
    /// Framebuffer + zbuffer raw-pointer view. Stripped from the
    /// caller's `&mut [u32]` / `&mut [f32]` borrows at construction
    /// (see [`Self::new`]) so the target is `Copy` and the
    /// rasterizer itself stays cheap to `Clone` for the per-thread
    /// quadrant fan-out R12.2.1 lands. Single-threaded path holds
    /// one copy, parallel path will hold four (one per quadrant —
    /// wedge-disjoint pixel writes; see [`RasterTarget`]'s safety
    /// contract).
    target: RasterTarget<'a>,
    /// Row stride in `u32` / `f32` elements (== framebuffer width
    /// for tightly-packed buffers; SDL streaming textures may add
    /// trailing padding).
    pitch_pixels: usize,
    /// World-level flat slab buffer (voxlap's malloc'd column
    /// data). Re-borrowed from opticast's caller for the lifetime
    /// of the rasterizer.
    slab_buf: &'a [u8],
    /// Per-column byte offsets into [`Self::slab_buf`], concatenated
    /// across all built mip levels. The mip-0 sub-table prefix
    /// (`vsid² + 1` entries) is what existing single-mip callers
    /// pass; multi-mip callers pass the full concatenation and
    /// declare boundaries via [`Self::mip_base_offsets`].
    column_offsets: &'a [u32],
    /// Per-mip column-offset sub-table base indices. Length
    /// `mip_count + 1`; trailing sentinel equals
    /// `column_offsets.len()`. Single-mip callers pass
    /// `&[0, vsid² + 1]`. R4.5d's `phase_remiporend` indexes
    /// this to land in mip-N+1's sub-table.
    mip_base_offsets: &'a [usize],
    /// World dimension. Combined with the prelude's `column_index`
    /// and the column-step path in grouscan, this is what lets the
    /// real gline walk the per-ray voxel-column traversal.
    vsid: u32,
    /// Optional sky texture borrow. `None` ⇒ `phase_startsky`
    /// solid-fills with `scratch.skycast`. `Some(_)` ⇒ gline's
    /// per-ray frustum prep updates `scratch.sky_off`, and
    /// `phase_startsky` runs the textured search-and-sample loop.
    /// Set via [`Self::with_sky`] after construction; unset ⇒
    /// engine's existing solid-sky behaviour, byte-stable for the
    /// oracle.
    sky: Option<&'a crate::sky::Sky>,
    /// Per-frame state cache. `None` until the first `frame_setup`
    /// call; gline panics if invoked before that.
    frame: Option<FrameCache>,
}
344
345// R12.2.1 / R12.3.1: opticast's parallel branches fan the rasterizer
346// across rayon-managed threads — each thread owns its own clone. The
347// clones share `target: RasterTarget` (raw pointers; safe under the
348// strip-disjoint pixel-write invariant documented on RasterTarget),
349// hold &-refs into the slab/column data (Sync), and have independent
350// FrameCache copies. Compile-time checks: this fails if any field
351// becomes non-Send/non-Sync so the parallel path can no longer hold.
352const _: fn() = || {
353    fn assert_send<T: Send>() {}
354    fn assert_sync<T: Sync>() {}
355    assert_send::<ScalarRasterizer<'_>>();
356    assert_sync::<ScalarRasterizer<'_>>();
357};
358
359impl<'a> ScalarRasterizer<'a> {
360    /// Create a rasterizer that will write into the supplied
361    /// framebuffer + zbuffer pair. `pitch_pixels` must satisfy
362    /// `pitch_pixels * height ≤ framebuffer.len()` for the height
363    /// the engine renders into; the `frame_setup` hook does not
364    /// validate sizes (it has no height to check against).
365    ///
366    /// `slab_buf` / `column_offsets` / `mip_base_offsets` / `vsid`
367    /// describe the world the renderer reads from. Pass the matching
368    /// fields from a [`roxlap_formats::vxl::Vxl`] (or, for tests,
369    /// `&[0, vsid² + 1]` as the single-mip placeholder).
370    ///
371    /// `ray_step` is initialised to a zero placeholder; the real
372    /// values get stamped on the first [`Rasterizer::frame_setup`]
373    /// call before any per-pixel work runs.
374    #[must_use]
375    pub fn new(
376        framebuffer: &'a mut [u32],
377        zbuffer: &'a mut [f32],
378        pitch_pixels: usize,
379        slab_buf: &'a [u8],
380        column_offsets: &'a [u32],
381        mip_base_offsets: &'a [usize],
382        vsid: u32,
383    ) -> Self {
384        Self {
385            target: RasterTarget::new(framebuffer, zbuffer),
386            pitch_pixels,
387            slab_buf,
388            column_offsets,
389            mip_base_offsets,
390            vsid,
391            sky: None,
392            frame: None,
393        }
394    }
395
396    /// Bind a sky texture for the lifetime of this rasterizer
397    /// instance. Hosts call this when [`crate::Engine::sky`] is
398    /// `Some(_)`. Without it, the rasterizer keeps the legacy
399    /// solid-fill `skycast` behaviour.
400    #[must_use]
401    pub fn with_sky(mut self, sky: &'a crate::sky::Sky) -> Self {
402        self.sky = Some(sky);
403        self
404    }
405}
406
407impl Rasterizer for ScalarRasterizer<'_> {
408    fn frame_setup(&mut self, ctx: &ScanContext<'_>) {
409        // Cache everything per-frame so gline doesn't re-borrow on
410        // every call. Prelude is cloned (one Vec<i32> alloc per
411        // frame for y_lookup; small).
412        self.frame = Some(FrameCache {
413            ray_step: *ctx.rs,
414            camera_state: *ctx.camera_state,
415            prelude: ctx.prelude.clone(),
416            gstartz0: ctx.camera_gstartz0,
417            gstartz1: ctx.camera_gstartz1,
418            vptr_offset: ctx.camera_vptr_offset,
419        });
420    }
421
    #[allow(clippy::too_many_lines)]
    fn gline(
        &mut self,
        scratch: &mut ScanScratch,
        length: u32,
        x0: f32,
        y0: f32,
        x1: f32,
        y1: f32,
    ) {
        // Voxlap's per-scanline ray-cast: derive the frustum, seed
        // cf[128], stamp scratch globals, call grouscan. Mirror of
        // voxlap5.c:gline (1146..1235).
        let cache = self
            .frame
            .as_ref()
            .expect("gline called before frame_setup");
        let leng = length as i32;

        // 1. Project per-ray frustum (vd0/vd1/vz0/vx1/vy1/vz1 +
        //    gixy/gpz/gdz). voxlap5.c:1153-1175.
        let f = derive_gline_frustum(
            &cache.camera_state,
            &cache.prelude,
            self.vsid,
            length,
            x0,
            y0,
            x1,
            y1,
        );

        // 2. Stamp ray-step globals onto scratch.
        scratch.gixy = f.gixy;
        scratch.gpz = f.gpz;
        scratch.gdz = f.gdz;

        // 3. cmprecip[leng] = CMPPREC / leng (voxlap precomputed
        //    table; voxlap5.c:12315 builds it as `CMPPREC/(float)i`).
        //    CMPPREC = 256*4096 = PREC. gi0 / gi1 are per-pixel ray-
        //    step coefficients in Q12.20 (= PREC); cx0/cy0/cx1/cy1
        //    are the cf[128] seed endpoints. voxlap5.c:1179-1190.
        // The `as f32` casts here lose precision for very large leng
        // (> 2²³), but realistic scanline lengths (a few thousand)
        // are well below that.
        #[allow(clippy::cast_precision_loss)]
        let cmpprec = PREC as f32;
        #[allow(clippy::cast_precision_loss)]
        let cmprecip = if leng > 0 {
            cmpprec / (leng as f32)
        } else {
            // Degenerate zero-length scanline: no per-pixel step.
            0.0
        };
        // ftol() routes float→i32 through i64 to mirror voxlap C's
        // wrap-on-overflow `lrintf+(int32_t)cast`. The cf-seed
        // products (vd ± vd) * cmprecip and vd * cmpprec land at
        // the i32 boundary for world-coord magnitudes near VSID
        // (= 2048) × PREC (= 2²⁰); Rust's `as i32` saturates and
        // diverges for those edge cases.
        let (gi0, gi1, cx0, cy0) = if cache.prelude.forward_z_sign < 0 {
            (
                ftol((f.vd1 - f.vd0) * cmprecip),
                ftol((f.vz1 - f.vz0) * cmprecip),
                ftol(f.vd0 * cmpprec),
                ftol(f.vz0 * cmpprec),
            )
        } else {
            (
                ftol((f.vd0 - f.vd1) * cmprecip),
                ftol((f.vz0 - f.vz1) * cmprecip),
                ftol(f.vd1 * cmpprec),
                ftol(f.vz1 * cmpprec),
            )
        };
        // Far end of the seed segment; wrapping matches C's int32
        // overflow behaviour.
        let cx1 = leng.wrapping_mul(gi0).wrapping_add(cx0);
        let cy1 = leng.wrapping_mul(gi1).wrapping_add(cy0);

        scratch.gi0 = gi0;
        scratch.gi1 = gi1;

        // 4. Seed cf[128] with the radar range + air-gap z-bounds +
        //    Q12.20 ray endpoints. voxlap5.c:1176-1190.
        // `gscanptr` is the radar slot this scanline writes from;
        // the quadrant scan advances it AFTER each gline (see the
        // note at the bottom of this function).
        let gscanptr_isize = scratch.gscanptr as isize;
        scratch.cf[CF_SEED_INDEX] = CfType {
            i0: gscanptr_isize,
            i1: gscanptr_isize + leng as isize,
            z0: cache.gstartz0,
            z1: cache.gstartz1,
            cx0,
            cy0,
            cx1,
            cy1,
        };

        // 5. gxmax = min(gmaxscandist, frustum-edge clip per axis).
        //    voxlap5.c:1192-1228. Unsigned compare — voxlap's `q`
        //    is a uint64_t product that may exceed gmaxscandist or
        //    wrap negative.
        //
        //    Also stamps `skycast.dist` per voxlap5.c:1209-1227:
        //    initialised to `gxmax` (the scan-distance ceiling),
        //    overwritten with `0x7FFFFFFF` if either frustum-edge
        //    clip fires (= ray hits world edge before scan-dist
        //    cap → "infinitely far" sky depth). startsky's solid-
        //    fill writes this into every drained radar slot's
        //    `dist`, which the z-buffer ends up carrying.
        let mut gxmax = cache.prelude.max_scan_dist;
        scratch.skycast.dist = gxmax;
        let li_pos = cache.prelude.li_pos;
        let vsid_signed = self.vsid as i32;
        // Cells remaining to the world edge along axis 0, in the
        // ray's travel direction.
        let j0 = if f.gixy[0] < 0 {
            li_pos[0]
        } else {
            vsid_signed - 1 - li_pos[0]
        };
        let q0 = (i64::from(f.gdz[0]).wrapping_mul(i64::from(j0)))
            .wrapping_add(i64::from(f.gpz[0] as u32));
        if (q0 as u64) < u64::from(gxmax as u32) {
            gxmax = q0 as i32;
            scratch.skycast.dist = i32::MAX;
        }
        // Same clip for axis 1.
        let j1 = if f.gixy[1] < 0 {
            li_pos[1]
        } else {
            vsid_signed - 1 - li_pos[1]
        };
        let q1 = (i64::from(f.gdz[1]).wrapping_mul(i64::from(j1)))
            .wrapping_add(i64::from(f.gpz[1] as u32));
        if (q1 as u64) < u64::from(gxmax as u32) {
            gxmax = q1 as i32;
            scratch.skycast.dist = i32::MAX;
        }
        scratch.gxmax = gxmax;

        // 5b. Per-ray sky-row search. Mirror of voxlap5.c:1236-
        //     1255. Walks `sky.lng[]` to find the texel-row whose
        //     longitude vector matches the ray's `(vx1, vy1)`
        //     direction; stamps `scratch.sky_off` so
        //     `phase_startsky` knows which row to sample. No-op
        //     when no sky texture is bound.
        if let Some(sky) = self.sky {
            sky_per_ray_update(scratch, sky, f.vx1, f.vy1);
        }

        // 6. Build inputs and call grouscan_run. The starting
        //    column is the camera's column (column_index from the
        //    prelude); the slab walker handles the rest.
        let column = camera_column_slice(
            self.slab_buf,
            self.column_offsets,
            cache.prelude.column_index,
        )
        .unwrap_or(&[]);
        // Copy gcsub out of scratch so the GrouscanInputs immutable
        // borrow doesn't collide with the `&mut scratch` grouscan_run
        // takes below. `[i64; 9]` is 72 bytes — cheap.
        let mut gcsub_local: [i64; 9] = scratch.gcsub;
        // Voxlap5.c:1230-1234. Per-ray, populate the wall-side lanes
        // (0/1) from the directional lanes (4/5 = left/right,
        // 6/7 = up/down) according to the sign of `gixy`. Without
        // this, `wall_lane` reads from the stale `0x00ff_00ff_00ff_00ff`
        // baseline and wall faces get no directional darkening, even
        // after the host calls `set_side_shades`.
        if scratch.sideshademode {
            let lane0_idx = if f.gixy[0] < 0 { 4 } else { 5 };
            let lane1_idx = if f.gixy[1] < 0 { 6 } else { 7 };
            gcsub_local[0] = gcsub_local[lane0_idx];
            gcsub_local[1] = gcsub_local[lane1_idx];
        }
        let inputs = GrouscanInputs {
            column,
            gylookup: &cache.prelude.y_lookup,
            gcsub: &gcsub_local,
            slab_buf: self.slab_buf,
            column_offsets: self.column_offsets,
            mip_base_offsets: self.mip_base_offsets,
            vsid: self.vsid,
            sky: self.sky.map(crate::grouscan::SkyRef::from_sky),
        };
        // gmipnum = number of built mip levels. R4.5d's
        // `phase_remiporend` body will start incrementing
        // `state.gmipcnt` once gmipnum > 1 and the column step's
        // `gpz > ngxmax` overflow fires; until then a multi-mip
        // world simply renders mip-0 only, byte-stable with the
        // single-mip path.
        let gmipnum = u32::try_from(self.mip_base_offsets.len().saturating_sub(1))
            .expect("mip count fits in u32");
        // `.max(1)` keeps at least mip-0 even for a degenerate
        // single-entry mip_base_offsets table.
        let _ = grouscan_run(
            scratch,
            &inputs,
            cache.vptr_offset,
            cache.prelude.column_index as usize,
            cache.prelude.x_mip,
            gmipnum.max(1),
        );

        // gscanptr is advanced by the opticast quadrant scan
        // (`scan_loops.rs::top_quadrant` etc., voxlap5.c:2382 area)
        // AFTER each gline call. Voxlap's `gline` itself does NOT
        // touch gscanptr — advancing it here too created gaps of
        // `leng+1` unwritten radar slots between consecutive glines,
        // which read back as 0 in hrend → black pixels at the
        // sphere position in diag_down / high_down.
    }
626
627    fn hrend(
628        &mut self,
629        scratch: &mut ScanScratch,
630        sx: i32,
631        sy: i32,
632        p1: i32,
633        plc: i32,
634        incr: i32,
635        j: i32,
636    ) {
637        let rs = self
638            .frame
639            .as_ref()
640            .map(|f| f.ray_step)
641            .expect("hrend/vrend called before frame_setup");
642        // Per-frame setup gives strx/stry/heix/heiy/addx/addy; per-
643        // pixel direction = strx*sx + heix*sy + addx, advancing by
644        // strx in the inner loop.
645        #[allow(clippy::cast_precision_loss)]
646        let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
647        #[allow(clippy::cast_precision_loss)]
648        let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
649        let row_start = sy as usize * self.pitch_pixels;
650
651        let mut plc_local = plc;
652        let mut x = sx;
653
654        // R5.1: SSE2 4-pixel batch via `_mm_rsqrt_ps` — port of
655        // voxlaptest's `hrendzsse` (voxlap5.c:1947). 12-bit
656        // approximation, no Newton refine, matching the
657        // historical asm. The tail (0..3 leftover pixels)
658        // continues with the bit-exact scalar form below; the
659        // batch's z lanes will not match scalar 1/sqrt exactly,
660        // mirroring voxlap. SSE2 is x86_64 baseline so no
661        // runtime CPU-feature check is needed.
662        //
663        // `cast_ptr_alignment` is suppressed because we use
664        // `_mm_storeu_si128` / `_mm_storeu_ps` — the `u`-suffix
665        // variants explicitly support unaligned addresses, so a
666        // u32 pointer cast to `*mut __m128i` is sound.
667        #[cfg(target_arch = "x86_64")]
668        #[allow(clippy::cast_ptr_alignment)]
669        unsafe {
670            use core::arch::x86_64::{
671                __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
672                _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
673            };
674            let strx = rs.strx;
675            let stry = rs.stry;
676            let vstrx4 = _mm_set1_ps(strx * 4.0);
677            let vstry4 = _mm_set1_ps(stry * 4.0);
678            let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
679            let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
680            while p1 - x >= 4 {
681                // Gather 4 castdat hits — one per ray index.
682                let mut col = [0i32; 4];
683                let mut dst = [0i32; 4];
684                for k in 0..4 {
685                    let ray_idx = (plc_local >> 16) as usize;
686                    let cd_offset = scratch.angstart[ray_idx] + j as isize;
687                    let cd = scratch.radar[cd_offset as usize];
688                    col[k] = cd.col;
689                    dst[k] = cd.dist;
690                    plc_local = plc_local.wrapping_add(incr);
691                }
692                // R5.2: per-pixel fog blend (voxlap's `hrendzfogsse`).
693                // No-op when foglut is empty. Voxlap's MMX path used
694                // pmulhw with foglut as 4 packed int16 lanes; we
695                // mirror the scalar fallback the goldens use, which
696                // applies a single `l = foglut[..] & 32767` factor
697                // per pixel (one `l` per ray, all 3 channels).
698                if !scratch.foglut.is_empty() {
699                    for k in 0..4 {
700                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
701                    }
702                }
703                let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
704                let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
705                let vdst = _mm_cvtepi32_ps(vdsi);
706                let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
707                let vinv = _mm_rsqrt_ps(vsqr);
708                let vz = _mm_mul_ps(vdst, vinv);
709
710                let pixel_idx = row_start + x as usize;
711                _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
712                _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
713
714                vdx = _mm_add_ps(vdx, vstrx4);
715                vdy = _mm_add_ps(vdy, vstry4);
716                x += 4;
717            }
718            // Bring scalar dirx/diry up to where the batch left
719            // off — first lane of the post-step vdx/vdy.
720            dirx = _mm_cvtss_f32(vdx);
721            diry = _mm_cvtss_f32(vdy);
722        }
723
724        // R9: NEON 4-pixel batch — aarch64 equivalent of the SSE2
725        // path above. Uses `vrsqrteq_f32` + one Newton–Raphson step
726        // via `vrsqrtsq_f32` for ~16-bit precision (vs SSE2's ~12-bit
727        // without Newton). NEON is baseline on all AArch64 — no
728        // runtime feature check needed. Stores are naturally unaligned.
729        #[cfg(target_arch = "aarch64")]
730        unsafe {
731            use core::arch::aarch64::{
732                float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
733                vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
734                vst1q_u32,
735            };
736            let strx = rs.strx;
737            let stry = rs.stry;
738            let vstrx4 = vdupq_n_f32(strx * 4.0);
739            let vstry4 = vdupq_n_f32(stry * 4.0);
740            let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
741            let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
742            let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
743            let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
744            while p1 - x >= 4 {
745                // Scalar gather — same as SSE2 path.
746                let mut col = [0i32; 4];
747                let mut dst = [0i32; 4];
748                for k in 0..4 {
749                    let ray_idx = (plc_local >> 16) as usize;
750                    let cd_offset = scratch.angstart[ray_idx] + j as isize;
751                    let cd = scratch.radar[cd_offset as usize];
752                    col[k] = cd.col;
753                    dst[k] = cd.dist;
754                    plc_local = plc_local.wrapping_add(incr);
755                }
756                if !scratch.foglut.is_empty() {
757                    for k in 0..4 {
758                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
759                    }
760                }
761                let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
762                let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
763                let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
764                // One Newton–Raphson step: est * vrsqrts(x * est, est).
765                let est = vrsqrteq_f32(vsqr);
766                let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
767                let vz = vmulq_f32(vdst, vinv);
768
769                let pixel_idx = row_start + x as usize;
770                vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
771                vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
772
773                vdx = vaddq_f32(vdx, vstrx4);
774                vdy = vaddq_f32(vdy, vstry4);
775                x += 4;
776            }
777            dirx = vgetq_lane_f32(vdx, 0);
778            diry = vgetq_lane_f32(vdy, 0);
779        }
780
781        // R10.3: wasm SIMD 4-pixel batch — equivalent of the SSE2
782        // / NEON paths above. Uses `1.0 / sqrt(x)` (full-precision
783        // `f32x4_sqrt` + `f32x4_div`) where SSE2 had `_mm_rsqrt_ps`
784        // and NEON had `vrsqrteq_f32`+Newton, since wasm SIMD has
785        // no rsqrt approximation. Wasm bytes therefore differ
786        // from both x86 and aarch64 goldens — captured by R10.4's
787        // separate `wasm-hashes.txt`.
788        #[cfg(target_arch = "wasm32")]
789        unsafe {
790            use core::arch::wasm32::{
791                f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
792                f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
793            };
794            let strx = rs.strx;
795            let stry = rs.stry;
796            let vstrx4 = f32x4_splat(strx * 4.0);
797            let vstry4 = f32x4_splat(stry * 4.0);
798            let one = f32x4_splat(1.0);
799            let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
800            let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
801            while p1 - x >= 4 {
802                // Scalar gather — same shape as SSE2 / NEON paths.
803                let mut col = [0i32; 4];
804                let mut dst = [0i32; 4];
805                for k in 0..4 {
806                    let ray_idx = (plc_local >> 16) as usize;
807                    let cd_offset = scratch.angstart[ray_idx] + j as isize;
808                    let cd = scratch.radar[cd_offset as usize];
809                    col[k] = cd.col;
810                    dst[k] = cd.dist;
811                    plc_local = plc_local.wrapping_add(incr);
812                }
813                if !scratch.foglut.is_empty() {
814                    for k in 0..4 {
815                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
816                    }
817                }
818                let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
819                let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
820                let vdst = f32x4_convert_i32x4(vdsi);
821                let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
822                let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
823                let vz = f32x4_mul(vdst, vinv);
824
825                let pixel_idx = row_start + x as usize;
826                v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
827                v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
828
829                vdx = f32x4_add(vdx, vstrx4);
830                vdy = f32x4_add(vdy, vstry4);
831                x += 4;
832            }
833            dirx = f32x4_extract_lane::<0>(vdx);
834            diry = f32x4_extract_lane::<0>(vdy);
835        }
836
837        // Scalar tail — handles 0..3 leftover pixels on x86_64 /
838        // aarch64 / wasm32 and the full body on other targets.
839        while x < p1 {
840            // ray index = signed shift right (voxlap's `plc >> 16`).
841            let ray_idx = (plc_local >> 16) as usize;
842            let cd_offset = scratch.angstart[ray_idx] + j as isize;
843            let cd = scratch.radar[cd_offset as usize];
844            let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
845
846            let pixel_idx = row_start + x as usize;
847            #[allow(clippy::cast_precision_loss)]
848            let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
849            // SAFETY: pixel_idx = sy*pitch + x, with sy < yres and x < p1
850            // ≤ xres (loop guard); p1 ≤ ctx.xres in scan_loops::top_quadrant /
851            // bottom_quadrant. fb / zb were allocated at pitch*height by the
852            // caller (asserted in Engine::render's preamble); pixel_idx is
853            // therefore in-range. Wedge-disjoint invariant: top + bottom
854            // quadrants own disjoint sy ranges.
855            unsafe {
856                self.target.write_color(pixel_idx, col as u32);
857                self.target.write_depth(pixel_idx, z);
858            }
859
860            dirx += rs.strx;
861            diry += rs.stry;
862            plc_local = plc_local.wrapping_add(incr);
863            x += 1;
864        }
865    }
866
    /// Vertical-scan rasterizer loop — port of voxlap's `vrendz` /
    /// `vrendzsse` (voxlap5.c:2083 for the SSE form). Fills columns
    /// `sx..p1` of screen row `sy`: one packed ARGB colour into the
    /// framebuffer and one f32 `dist / |dir|` value into the z-buffer
    /// per pixel. The per-column ray index is the high 16 bits of
    /// `scratch.uurend[x]`; after each pixel it is advanced by
    /// `scratch.uurend[x + half_stride]`, and that persisted mutation
    /// is what couples consecutive scanlines through the same column.
    ///
    /// * `sx` / `p1` — half-open pixel span along row `sy`.
    /// * `iplc` / `iinc` — starting radar-column offset and its
    ///   per-pixel increment (wrapping i32, mirroring voxlap's
    ///   fixed-point arithmetic).
    ///
    /// On x86_64 / aarch64 / wasm32 a 4-pixel SIMD batch runs first
    /// and the scalar tail picks up the 0..3 leftovers; on any other
    /// target the scalar tail covers the whole span.
    ///
    /// # Panics
    /// Panics if called before `frame_setup` has cached a `RayStep`,
    /// and (implicitly, via slice indexing) on out-of-range
    /// `angstart` / `radar` / `uurend` indices.
    fn vrend(
        &mut self,
        scratch: &mut ScanScratch,
        sx: i32,
        sy: i32,
        p1: i32,
        iplc: i32,
        iinc: i32,
    ) {
        let rs = self
            .frame
            .as_ref()
            .map(|f| f.ray_step)
            .expect("hrend/vrend called before frame_setup");
        // Ray direction at the span's first pixel; stepped by
        // (strx, stry) per pixel. Feeds only the z magnitude below.
        #[allow(clippy::cast_precision_loss)]
        let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
        #[allow(clippy::cast_precision_loss)]
        let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
        let row_start = sy as usize * self.pitch_pixels;
        let half_stride = scratch.uurend_half_stride;

        let mut iplc_local = iplc;
        let mut x = sx;

        // R5.3: SSE2 4-pixel batch — port of voxlaptest's
        // `vrendzsse` (voxlap5.c:2083). The per-column
        // `uurend[sx] += uurend[sx + half_stride]` update is
        // parallel-safe: uurend[sx + half_stride..] is read-only
        // here, and uurend[sx..+3] are four distinct lanes.
        // Read OLD u/d values, do the SSE z math, then write
        // back four NEW u values. Plus fog blend (R5.2-style)
        // when foglut is non-empty.
        #[cfg(target_arch = "x86_64")]
        #[allow(clippy::cast_ptr_alignment)]
        unsafe {
            use core::arch::x86_64::{
                __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
                _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
            };
            let strx = rs.strx;
            let stry = rs.stry;
            let vstrx4 = _mm_set1_ps(strx * 4.0);
            let vstry4 = _mm_set1_ps(stry * 4.0);
            let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
            let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
            while p1 - x >= 4 {
                let xu = x as usize;
                // Read 4 OLD uurend pairs (u, d). u = current ray
                // index for column; d = per-pixel delta.
                let mut u = [0i32; 4];
                let mut d = [0i32; 4];
                for k in 0..4 {
                    u[k] = scratch.uurend[xu + k];
                    d[k] = scratch.uurend[xu + k + half_stride];
                }
                // Gather 4 castdat hits.
                let mut col = [0i32; 4];
                let mut dst = [0i32; 4];
                for k in 0..4 {
                    let ray_idx = (u[k] >> 16) as usize;
                    let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
                    let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
                    let cd = scratch.radar[cd_offset as usize];
                    col[k] = cd.col;
                    dst[k] = cd.dist;
                }
                if !scratch.foglut.is_empty() {
                    for k in 0..4 {
                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                    }
                }
                let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
                let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
                let vdst = _mm_cvtepi32_ps(vdsi);
                let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
                let vinv = _mm_rsqrt_ps(vsqr);
                let vz = _mm_mul_ps(vdst, vinv);

                let pixel_idx = row_start + xu;
                // SAFETY: unaligned 4-lane stores covering
                // pixel_idx..pixel_idx+4 — in-bounds by the same
                // scan_loops geometry argument as the scalar tail's
                // write below (x + 3 < p1 by the loop guard).
                _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
                _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);

                // Write back NEW uurend values — u + d per lane.
                for k in 0..4 {
                    scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
                }

                vdx = _mm_add_ps(vdx, vstrx4);
                vdy = _mm_add_ps(vdy, vstry4);
                iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
                x += 4;
            }
            // Resume scalar dirx/diry from lane 0 of the post-step
            // vectors so the tail continues where the batch stopped.
            dirx = _mm_cvtss_f32(vdx);
            diry = _mm_cvtss_f32(vdy);
        }

        // R9: NEON 4-pixel batch for vrend — aarch64 equivalent.
        // Same structure as hrend NEON: scalar gather + uurend
        // read/write, NEON rsqrt for z, vectorized store.
        #[cfg(target_arch = "aarch64")]
        unsafe {
            use core::arch::aarch64::{
                float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
                vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
                vst1q_u32,
            };
            let strx = rs.strx;
            let stry = rs.stry;
            let vstrx4 = vdupq_n_f32(strx * 4.0);
            let vstry4 = vdupq_n_f32(stry * 4.0);
            let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
            let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
            let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
            let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
            while p1 - x >= 4 {
                let xu = x as usize;
                // Read 4 OLD uurend pairs (u, d).
                let mut u = [0i32; 4];
                let mut d = [0i32; 4];
                for k in 0..4 {
                    u[k] = scratch.uurend[xu + k];
                    d[k] = scratch.uurend[xu + k + half_stride];
                }
                // Scalar gather — 4 castdat hits.
                let mut col = [0i32; 4];
                let mut dst = [0i32; 4];
                for k in 0..4 {
                    let ray_idx = (u[k] >> 16) as usize;
                    let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
                    let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
                    let cd = scratch.radar[cd_offset as usize];
                    col[k] = cd.col;
                    dst[k] = cd.dist;
                }
                if !scratch.foglut.is_empty() {
                    for k in 0..4 {
                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                    }
                }
                let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
                let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
                let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
                // One Newton–Raphson step: est * vrsqrts(x * est, est).
                let est = vrsqrteq_f32(vsqr);
                let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
                let vz = vmulq_f32(vdst, vinv);

                let pixel_idx = row_start + xu;
                // SAFETY: 4-lane stores covering pixel_idx..pixel_idx+4
                // — in-bounds by the same scan_loops geometry argument
                // as the scalar tail's write below.
                vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
                vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);

                // Write back NEW uurend values — u + d per lane.
                for k in 0..4 {
                    scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
                }

                vdx = vaddq_f32(vdx, vstrx4);
                vdy = vaddq_f32(vdy, vstry4);
                iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
                x += 4;
            }
            // Resume scalar dirx/diry from lane 0 of the post-step
            // vectors so the tail continues where the batch stopped.
            dirx = vgetq_lane_f32(vdx, 0);
            diry = vgetq_lane_f32(vdy, 0);
        }

        // R10.3: wasm SIMD 4-pixel batch for vrend — equivalent of
        // the SSE2 / NEON paths above. Same scalar-gather + uurend
        // read/write structure; full-precision `1.0 / sqrt(x)` for
        // the inverse magnitude, since wasm SIMD has no rsqrt
        // approximation. Bytes diverge from the other arches —
        // R10.4's `wasm-hashes.txt` covers the divergence.
        #[cfg(target_arch = "wasm32")]
        unsafe {
            use core::arch::wasm32::{
                f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
                f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
            };
            let strx = rs.strx;
            let stry = rs.stry;
            let vstrx4 = f32x4_splat(strx * 4.0);
            let vstry4 = f32x4_splat(stry * 4.0);
            let one = f32x4_splat(1.0);
            let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
            let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
            while p1 - x >= 4 {
                let xu = x as usize;
                // Read 4 OLD uurend pairs (u, d).
                let mut u = [0i32; 4];
                let mut d = [0i32; 4];
                for k in 0..4 {
                    u[k] = scratch.uurend[xu + k];
                    d[k] = scratch.uurend[xu + k + half_stride];
                }
                // Scalar gather — 4 castdat hits.
                let mut col = [0i32; 4];
                let mut dst = [0i32; 4];
                for k in 0..4 {
                    let ray_idx = (u[k] >> 16) as usize;
                    let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
                    let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
                    let cd = scratch.radar[cd_offset as usize];
                    col[k] = cd.col;
                    dst[k] = cd.dist;
                }
                if !scratch.foglut.is_empty() {
                    for k in 0..4 {
                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
                    }
                }
                let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
                let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
                let vdst = f32x4_convert_i32x4(vdsi);
                let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
                let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
                let vz = f32x4_mul(vdst, vinv);

                let pixel_idx = row_start + xu;
                // SAFETY: 4-lane stores covering pixel_idx..pixel_idx+4
                // — in-bounds by the same scan_loops geometry argument
                // as the scalar tail's write below.
                v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
                v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);

                // Write back NEW uurend values — u + d per lane.
                for k in 0..4 {
                    scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
                }

                vdx = f32x4_add(vdx, vstrx4);
                vdy = f32x4_add(vdy, vstry4);
                iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
                x += 4;
            }
            // Resume scalar dirx/diry from lane 0 of the post-step
            // vectors so the tail continues where the batch stopped.
            dirx = f32x4_extract_lane::<0>(vdx);
            diry = f32x4_extract_lane::<0>(vdy);
        }

        // Scalar tail — handles 0..3 leftover pixels on x86_64 /
        // aarch64 / wasm32 and the full body on other targets.
        while x < p1 {
            // Vertical scan reads the per-column ray index from
            // uurend[sx] (>>16 to drop the fractional bits).
            let xu = x as usize;
            let ray_idx = (scratch.uurend[xu] >> 16) as usize;
            let cd_offset = scratch.angstart[ray_idx] + iplc_local as isize;
            let cd = scratch.radar[cd_offset as usize];
            let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);

            let pixel_idx = row_start + xu;
            #[allow(clippy::cast_precision_loss)]
            let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
            // SAFETY: see hrend's matching write — pixel_idx is in-bounds
            // by the same scan_loops geometry argument; right + left
            // quadrants own disjoint sx ranges so cross-thread writes
            // are pairwise pixel-disjoint.
            unsafe {
                self.target.write_color(pixel_idx, col as u32);
                self.target.write_depth(pixel_idx, z);
            }

            dirx += rs.strx;
            diry += rs.stry;
            // Advance per-column ray index. uurend[x] persists
            // across vrend calls — this state is what couples
            // consecutive scanlines through the same column.
            scratch.uurend[xu] = scratch.uurend[xu].wrapping_add(scratch.uurend[xu + half_stride]);
            x += 1;
            iplc_local = iplc_local.wrapping_add(iinc);
        }
    }
1133}
1134
1135#[cfg(test)]
1136mod tests {
1137    use super::*;
1138    use crate::rasterizer::CastDat;
1139
1140    /// Build owned per-frame state so tests can assemble a
1141    /// `ScanContext` with proper-lifetime borrows. Values aren't
1142    /// load-bearing for the scalar-fill behaviour tests; the real
1143    /// `gline` cares about them, hence `camera_state` joining the
1144    /// tuple.
1145    fn dummy_per_frame() -> (
1146        crate::camera_math::CameraState,
1147        crate::projection::ProjectionRect,
1148        crate::ray_step::RayStep,
1149        crate::opticast_prelude::OpticastPrelude,
1150    ) {
1151        let cam = crate::Camera {
1152            pos: [0.0, 0.0, 0.0],
1153            right: [1.0, 0.0, 0.0],
1154            down: [0.0, 1.0, 0.0],
1155            forward: [0.0, 0.0, 1.0],
1156        };
1157        let cs = crate::camera_math::derive(&cam, 64, 64, 32.0, 32.0, 32.0);
1158        let proj = crate::projection::derive_projection(&cs, 64, 64, 32.0, 32.0, 32.0, 1);
1159        let rs = crate::ray_step::derive_ray_step(&cs, proj.cx, proj.cy, 32.0);
1160        let prelude = crate::opticast_prelude::derive_prelude(&cs, 2048, 1, 4, 1024);
1161        (cs, proj, rs, prelude)
1162    }
1163
1164    #[test]
1165    fn frame_setup_caches_ray_step() {
1166        let mut fb = vec![0u32; 64 * 64];
1167        let mut zb = vec![0.0f32; 64 * 64];
1168        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1169        let (cs, proj, rs, prelude) = dummy_per_frame();
1170        let ctx = ScanContext {
1171            proj: &proj,
1172            rs: &rs,
1173            prelude: &prelude,
1174            xres: 64,
1175            y_start: 0,
1176            y_end: 64,
1177            anginc: 1,
1178            camera_state: &cs,
1179            camera_gstartz0: 0,
1180            camera_gstartz1: 0,
1181            camera_vptr_offset: 0,
1182        };
1183        r.frame_setup(&ctx);
1184        let cached_rs = r.frame.as_ref().expect("frame populated").ray_step;
1185        assert_eq!(cached_rs.strx.to_bits(), rs.strx.to_bits());
1186        assert_eq!(cached_rs.stry.to_bits(), rs.stry.to_bits());
1187        assert_eq!(cached_rs.cx16, rs.cx16);
1188        assert_eq!(cached_rs.cy16, rs.cy16);
1189    }
1190
1191    #[cfg(target_arch = "x86_64")]
1192    #[test]
1193    fn hrend_sse_batch_writes_4_pixel_block() {
1194        // R5.1 smoke test: hrend's SSE batch fires for span len ≥
1195        // 4. Pre-fill radar with 4 distinct ARGB values, run hrend
1196        // over [10..14], assert the framebuffer carries the colour
1197        // bits (z lanes use rsqrtps approximation so are intent-
1198        // ionally not bit-checked).
1199        let mut fb = vec![0u32; 64 * 64];
1200        let mut zb = vec![0.0f32; 64 * 64];
1201        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1202        let (cs, proj, rs, prelude) = dummy_per_frame();
1203        let ctx = ScanContext {
1204            proj: &proj,
1205            rs: &rs,
1206            prelude: &prelude,
1207            xres: 64,
1208            y_start: 0,
1209            y_end: 64,
1210            anginc: 1,
1211            camera_state: &cs,
1212            camera_gstartz0: 0,
1213            camera_gstartz1: 0,
1214            camera_vptr_offset: 0,
1215        };
1216        r.frame_setup(&ctx);
1217
1218        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1219        // 4 colour records so the batch reads each lane.
1220        for (i, slot) in scratch.radar.iter_mut().enumerate().take(4) {
1221            slot.col = 0x8000_0000_u32 as i32 | i as i32;
1222            slot.dist = 1024;
1223        }
1224        // angstart[i] = i so ray_idx i (= plc>>16) resolves to
1225        // radar[i + j] = radar[i] with j=0.
1226        for k in 0..4 {
1227            scratch.angstart[k] = k as isize;
1228        }
1229
1230        // sx=10, p1=14, j=0, plc=0, incr=1<<16 → plc>>16 steps
1231        // 0,1,2,3 over the four pixels → angstart[0..4].
1232        r.hrend(&mut scratch, 10, 5, 14, 0, 1 << 16, 0);
1233
1234        let row_off = 5 * 64;
1235        for k in 0..4 {
1236            let want = 0x8000_0000_u32 | k as u32;
1237            assert_eq!(
1238                fb[row_off + 10 + k],
1239                want,
1240                "fb[5][{}] = {:#010x}, expected {:#010x}",
1241                10 + k,
1242                fb[row_off + 10 + k],
1243                want,
1244            );
1245            // z lane non-zero (rsqrtps produced something).
1246            // Bit-compare to dodge clippy::float_cmp; we just want
1247            // to confirm the slot was written, not its precise value.
1248            assert_ne!(zb[row_off + 10 + k].to_bits(), 0u32);
1249        }
1250    }
1251
1252    #[test]
1253    fn fog_blend_disabled_returns_col_unchanged() {
1254        let foglut: Vec<i32> = Vec::new();
1255        let col = 0x0080_C040;
1256        assert_eq!(fog_blend(col, 0x1234_5678, &foglut, 0xFF_FFFF), col);
1257    }
1258
1259    #[test]
1260    fn fog_blend_full_fog_returns_fog_col_per_channel() {
1261        // l = 32767 → ((fog - col) * 32767) >> 15 = fog - col → final
1262        // is col + (fog - col) = fog (per channel; alpha untouched).
1263        let foglut = vec![32767; 2048];
1264        let col = 0x80_AA_BB_CC_u32 as i32;
1265        let fog = 0x00_11_22_33_i32;
1266        let blended = fog_blend(col, 0, &foglut, fog) as u32;
1267        // Low 24 bits = fog colour; alpha (high byte) survives from col.
1268        assert_eq!(blended & 0x00FF_FFFF, fog as u32 & 0x00FF_FFFF);
1269        assert_eq!(blended & 0xFF00_0000, col as u32 & 0xFF00_0000);
1270    }
1271
1272    #[test]
1273    fn set_fog_zero_distance_clears_table() {
1274        let mut s = ScanScratch::new_for_size(64, 64, 64);
1275        s.set_fog(0x1234_5678, 100);
1276        assert!(!s.foglut.is_empty());
1277        s.set_fog(0, 0);
1278        assert!(s.foglut.is_empty());
1279    }
1280
1281    #[test]
1282    fn set_fog_table_starts_at_zero_and_climbs() {
1283        let mut s = ScanScratch::new_for_size(64, 64, 64);
1284        s.set_fog(0xFF, 1024);
1285        // First entry: acc = 0 → hi16 = 0.
1286        assert_eq!(s.foglut[0], 0);
1287        // Last entry near the overflow boundary should be near 32767
1288        // (the saturate-fill value); voxlap's exact step makes
1289        // foglut[2047] either ~32766 (last walked entry) or 32767
1290        // (post-overflow padding) depending on max_scan_dist
1291        // divisibility.
1292        assert!(
1293            s.foglut[2047] > 30_000,
1294            "tail entry too low: {}",
1295            s.foglut[2047]
1296        );
1297    }
1298
1299    #[test]
1300    fn hrend_writes_pixel_per_column_from_radar() {
1301        // Pre-fill scratch.radar with a recognizable colour gradient
1302        // and pre-set scratch.angstart so hrend resolves to the
1303        // expected radar slot per pixel. Then call hrend manually
1304        // and verify the framebuffer received the colours.
1305        let mut fb = vec![0u32; 64 * 64];
1306        let mut zb = vec![0.0f32; 64 * 64];
1307        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1308        let (cs, proj, rs, prelude) = dummy_per_frame();
1309        let ctx = ScanContext {
1310            proj: &proj,
1311            rs: &rs,
1312            prelude: &prelude,
1313            xres: 64,
1314            y_start: 0,
1315            y_end: 64,
1316            anginc: 1,
1317            camera_state: &cs,
1318            camera_gstartz0: 0,
1319            camera_gstartz1: 0,
1320            camera_vptr_offset: 0,
1321        };
1322        r.frame_setup(&ctx);
1323
1324        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1325        // Synthetic radar: 16 castdat entries with col = 0xAA...,
1326        // dist = 1 (zbuffer math will divide by sqrt of dir² so we
1327        // just pick a stable distance).
1328        for (i, slot) in scratch.radar.iter_mut().enumerate().take(16) {
1329            slot.col = 0x8000_0000_u32 as i32 | i as i32;
1330            slot.dist = 1024;
1331        }
1332        // angstart[0..4] all point at radar[0]; with j=0..3 and a
1333        // plc that increments by 0 (incr=0 holds plc>>16 at 0), the
1334        // pixel-by-pixel index lands at slots 0, 1, 2, 3 — i.e. j
1335        // selects the column.
1336        scratch.angstart[0] = 0;
1337
1338        // Render a span of 4 columns starting at sx=10, sy=5, with j
1339        // varying via the scan: but voxlap's hrend uses a single j
1340        // for the whole span (per-ray castdat column is fixed). We
1341        // pick j=2 and verify framebuffer[row*pitch + sx..sx+4] all
1342        // equal radar[0+2].col = 0x80000002.
1343        r.hrend(&mut scratch, 10, 5, 14, 0, 0, 2);
1344
1345        let row_off = 5 * 64;
1346        for x in 10..14 {
1347            let want = 0x8000_0000_u32 | 2;
1348            assert_eq!(
1349                fb[row_off + x],
1350                want,
1351                "fb[5][{x}] = {:#010x}, expected {:#010x}",
1352                fb[row_off + x],
1353                want,
1354            );
1355        }
1356        // Pixels outside the rendered span are untouched.
1357        assert_eq!(fb[row_off + 9], 0);
1358        assert_eq!(fb[row_off + 14], 0);
1359    }
1360
1361    #[test]
1362    fn end_to_end_opticast_runs_through_real_gline() {
1363        // Smoke test: with a valid above-the-slab camera and the
1364        // real R4.3a-rewire-3b gline, full opticast should run
1365        // without panicking and return `Rendered`. The synthetic
1366        // single-slab world has no voxel colour bytes so grouscan's
1367        // fill loops bail to startsky, which writes the configured
1368        // skycast into the radar — verify by setting a recognizable
1369        // skycast and asserting some pixels carry it.
1370        use crate::opticast as opticast_fn;
1371        use crate::rasterizer::ScratchPool;
1372        use crate::OpticastSettings;
1373
1374        let mut fb = vec![0u32; 640 * 480];
1375        let mut zb = vec![0.0f32; 640 * 480];
1376        let mut pool = ScratchPool::new(640, 480, 2048);
1377        // Recognizable sky colour — pixels filled by startsky's
1378        // solid-fill branch (the path the empty-colour-byte slab
1379        // ends up routing through) carry this.
1380        let sky_col = 0x80AB_CDEF_u32 as i32;
1381        pool.set_skycast(sky_col, 0x7FFF_FFFF);
1382
1383        // Single solid slab at z = 200..254. cz = 128 < 200 →
1384        // air-above-the-slab, opticast renders. Synthetic world:
1385        // only the camera's column (1024 * 2048 + 1024) holds the
1386        // slab; every other column is empty. Build world before
1387        // the rasterizer so it can borrow `&column` / `&column_offsets`.
1388        let column = vec![0u8, 200, 254, 0];
1389        let cam_idx = 1024usize * 2048 + 1024;
1390        let mut column_offsets = vec![0u32; 2048 * 2048 + 1];
1391        let column_len_u32 = u32::try_from(column.len()).expect("column fits u32");
1392        for offset in &mut column_offsets[(cam_idx + 1)..] {
1393            *offset = column_len_u32;
1394        }
1395
1396        let mip_base_offsets = [0usize, column_offsets.len()];
1397        let mut rasterizer = ScalarRasterizer::new(
1398            &mut fb,
1399            &mut zb,
1400            640,
1401            &column,
1402            &column_offsets,
1403            &mip_base_offsets,
1404            2048,
1405        );
1406
1407        let cam = crate::Camera {
1408            pos: [1024.0, 1024.0, 128.0],
1409            right: [1.0, 0.0, 0.0],
1410            down: [0.0, 1.0, 0.0],
1411            forward: [0.0, 0.0, 1.0],
1412        };
1413        let settings = OpticastSettings::for_oracle_framebuffer(640, 480);
1414
1415        let outcome = opticast_fn(
1416            &mut rasterizer,
1417            &mut pool,
1418            &cam,
1419            &settings,
1420            2048,
1421            &column,
1422            &column_offsets,
1423        );
1424        assert_eq!(outcome, crate::OpticastOutcome::Rendered);
1425
1426        // Wiring smoke test — gline → derive_gline_frustum →
1427        // grouscan_run chain executes for every ray without
1428        // panicking, opticast returns Rendered. The synthetic
1429        // single-slab world has no colour bytes (header only) so
1430        // grouscan's drawflor fill bails on the bounds check and
1431        // routes to predeletez → deletez → Done before reaching
1432        // startsky; the radar stays at default zeros, the
1433        // framebuffer ends up sky-blue (the host pre-fill). What
1434        // matters is that nothing crashed — the per-ray gline
1435        // arithmetic + cf[128] seeding + grouscan dispatch all
1436        // hold up under live ray geometry. Once R4.3a-rewire-4
1437        // loads a real `.vxl` with colour bytes, grouscan's fill
1438        // loops will write recognisable voxel colours and a
1439        // colour-presence assertion replaces this comment.
1440        let _ = sky_col; // suppress unused-let warning — kept as
1441                         // scaffolding for the future assertion.
1442    }
1443
1444    #[cfg(target_arch = "x86_64")]
1445    #[test]
1446    fn vrend_sse_batch_writes_4_pixel_block() {
1447        // R5.3 smoke test: vrend's SSE batch fires for span len ≥
1448        // 4. Pre-fill 4 distinct radar entries, set angstart so
1449        // each lane's uurend[sx]>>16 indexes a different ray, run
1450        // vrend over [10..14], assert each column got the right
1451        // colour and uurend advanced.
1452        let mut fb = vec![0u32; 64 * 64];
1453        let mut zb = vec![0.0f32; 64 * 64];
1454        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1455        let (cs, proj, rs, prelude) = dummy_per_frame();
1456        let ctx = ScanContext {
1457            proj: &proj,
1458            rs: &rs,
1459            prelude: &prelude,
1460            xres: 64,
1461            y_start: 0,
1462            y_end: 64,
1463            anginc: 1,
1464            camera_state: &cs,
1465            camera_gstartz0: 0,
1466            camera_gstartz1: 0,
1467            camera_vptr_offset: 0,
1468        };
1469        r.frame_setup(&ctx);
1470
1471        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1472        for k in 0..4 {
1473            scratch.radar[k] = CastDat {
1474                col: 0x8000_0000_u32 as i32 | k as i32,
1475                dist: 1024,
1476            };
1477            scratch.angstart[k] = k as isize;
1478        }
1479        // uurend[sx + k] >> 16 = k → ray_idx k → angstart[k] = k
1480        // → radar[k]. delta = 5 (so post-batch uurend[sx + k] =
1481        // (k << 16) + 5).
1482        let half = scratch.uurend_half_stride;
1483        for k in 0..4 {
1484            scratch.uurend[10 + k] = (k as i32) << 16;
1485            scratch.uurend[10 + k + half] = 5;
1486        }
1487
1488        r.vrend(&mut scratch, 10, 5, 14, 0, 0);
1489
1490        let row_off = 5 * 64;
1491        for k in 0..4 {
1492            let want = 0x8000_0000_u32 | k as u32;
1493            assert_eq!(fb[row_off + 10 + k], want, "fb col[{}]", 10 + k);
1494            assert!(zb[row_off + 10 + k].to_bits() != 0, "z[{}]", 10 + k);
1495            // Post-batch uurend = old_u + delta.
1496            assert_eq!(
1497                scratch.uurend[10 + k],
1498                ((k as i32) << 16) + 5,
1499                "uurend[{}]",
1500                10 + k
1501            );
1502        }
1503    }
1504
1505    #[test]
1506    fn vrend_advances_uurend_per_pixel() {
1507        // Verify the uurend[sx] += uurend[sx+half_stride] mutation
1508        // happens once per pixel.
1509        let mut fb = vec![0u32; 64 * 64];
1510        let mut zb = vec![0.0f32; 64 * 64];
1511        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, &[], &[], &[0usize, 0], 64);
1512        let (cs, proj, rs, prelude) = dummy_per_frame();
1513        let ctx = ScanContext {
1514            proj: &proj,
1515            rs: &rs,
1516            prelude: &prelude,
1517            xres: 64,
1518            y_start: 0,
1519            y_end: 64,
1520            anginc: 1,
1521            camera_state: &cs,
1522            camera_gstartz0: 0,
1523            camera_gstartz1: 0,
1524            camera_vptr_offset: 0,
1525        };
1526        r.frame_setup(&ctx);
1527
1528        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1529        scratch.radar[0] = CastDat {
1530            col: 0x8033_4455_u32 as i32,
1531            dist: 1024,
1532        };
1533        // angstart[0] = 0 → all rays read radar[0 + iplc].
1534        scratch.angstart[0] = 0;
1535        // Pre-set uurend so ray_idx = uurend[sx] >> 16 = 0 for all
1536        // four columns (we want them to all hit angstart[0]).
1537        let half = scratch.uurend_half_stride;
1538        for sx in 10..14 {
1539            scratch.uurend[sx] = 0;
1540            scratch.uurend[sx + half] = 1; // delta added per pixel
1541        }
1542
1543        r.vrend(&mut scratch, 10, 5, 14, 0, 0);
1544
1545        // Each column's uurend should have advanced by the delta.
1546        for sx in 10..14 {
1547            assert_eq!(scratch.uurend[sx], 1, "uurend[{sx}] not advanced");
1548        }
1549    }
1550}