Skip to main content

roxlap_core/
scalar_rasterizer.rs

1//! Scalar `Rasterizer` implementation — port of voxlaptest's 4.7.5
2//! scalar `hrendzsse` / `vrendzsse` fallbacks (`voxlap5.c:1947` /
3//! `:2003` post-4.8.4 line numbers). Writes one `u32` ARGB pixel +
4//! one `f32` z-buffer entry per screen position from radar entries
5//! the (still-stubbed) `gline` will produce in R4.3.
6//!
7//! Per-pixel math (the SSE-batched form lives behind R5; this is the
8//! pre-batch shape):
9//!
10//! ```text
11//! col = scratch.angstart[plc >> 16] + j     // signed offset into radar
12//! framebuffer[pixel] = radar[col].col       // packed ARGB
13//! z = radar[col].dist / sqrt(dirx² + diry²) // f32 z-buffer entry
14//! dirx += strx; diry += stry; plc += incr
15//! ```
16//!
17//! The vertical scan additionally mutates `scratch.uurend[sx] +=
18//! scratch.uurend[sx + half_stride]` per pixel — that's why the
19//! Rasterizer trait now hands `&mut ScanScratch` to `vrend` /
20//! `hrend`. `gline` is a TODO stub; without it the angstart entries
21//! it would write are zero, so a freshly allocated radar yields all-
22//! zero pixels. Tests pre-fill the radar manually to verify the
23//! scanline rasterizer end of the pipeline.
24
25// Module-wide cast allows: the per-pixel arithmetic constantly
26// crosses signed/unsigned and i32/usize boundaries (loop counters,
27// signed offsets into radar, framebuffer indices). Annotating each
28// site individually buries the per-pixel logic in lint suppressions;
29// the cast-correctness invariants hold by construction.
30#![allow(
31    clippy::cast_sign_loss,
32    clippy::cast_possible_truncation,
33    clippy::cast_possible_wrap,
34    clippy::similar_names
35)]
36
37use std::marker::PhantomData;
38
39use crate::camera_math::CameraState;
40use crate::fixed::ftol;
41use crate::gline::derive_gline_frustum;
42use crate::grouscan::{grouscan_run, CfType, GrouscanInputs, CF_SEED_INDEX};
43use crate::opticast::camera_column_slice;
44use crate::opticast_prelude::{OpticastPrelude, PREC};
45use crate::rasterizer::{Rasterizer, ScanScratch};
46use crate::ray_step::RayStep;
47use crate::scan_loops::ScanContext;
48
49/// Borrowed view of the framebuffer + zbuffer as raw pointers.
50///
51/// R12.2.0 introduces this so the per-frame `ScalarRasterizer` can be
52/// `Copy` (for the per-thread fan-out R12.2.1 lands). Holding `&'a
53/// mut [u32]` / `&'a mut [f32]` directly forces exclusive borrows
54/// per instance, blocking the four quadrants from running on four
55/// threads even though their pixel writes are disjoint.
56///
57/// Constructed safely from exclusive slice borrows — the slices are
58/// consumed and re-exposed as raw pointers tied to lifetime `'a`
59/// via `PhantomData`. Once a `RasterTarget` exists, it is the sole
60/// path to the underlying memory; the slices cannot be used through
61/// any other channel for the duration of `'a`.
62///
63/// `Copy` lets opticast hand each quadrant thread its own copy of
64/// the same target. The four threads write disjoint pixels (top /
65/// bottom / left / right wedges of the screen, no overlap), so
66/// pointer aliasing is safe under the documented invariant.
67///
68/// # Safety contract for parallel use
69/// Callers that copy a `RasterTarget` and pass copies to multiple
70/// threads MUST guarantee that the threads collectively write to
71/// pairwise-disjoint pixel indices. opticast enforces this via the
72/// four-quadrant wedge geometry (see `scan_loops::{top,right,
73/// bottom,left}_quadrant`). Single-threaded callers (R12.1 default)
74/// hold one copy and trivially satisfy the invariant.
75#[derive(Clone, Copy, Debug)]
76pub struct RasterTarget<'a> {
77    fb_ptr: *mut u32,
78    fb_len: usize,
79    zb_ptr: *mut f32,
80    zb_len: usize,
81    _marker: PhantomData<&'a mut [u32]>,
82}
83
84// SAFETY: `RasterTarget` is morally a borrowed mutable slice pair —
85// the same shape `&'a mut [u32]` / `&'a mut [f32]` would have, both of
86// which are `Send` when `T: Send`. Multi-thread safety is enforced
87// by the wedge / strip-disjoint write invariant (see struct doc).
88unsafe impl Send for RasterTarget<'_> {}
89
90// SAFETY: sharing `&RasterTarget` across threads exposes only the
91// raw pointers + lengths. Reading a pointer field is itself free of
92// data races; concurrent writes through the pointer are gated by
93// the disjoint-write invariant the caller upholds. Required so
94// `ScalarRasterizer: Sync`, which `rayon::par_iter_mut` needs to
95// share `&rasterizer` across the strip-parallel closures (R12.3.1).
96unsafe impl Sync for RasterTarget<'_> {}
97
98impl<'a> RasterTarget<'a> {
99    /// Build a target from exclusive slice borrows. The slices are
100    /// consumed (their `&'a mut` reborrow is the load-bearing thing —
101    /// this constructor is the only way to mint a `RasterTarget`
102    /// from safe code).
103    #[must_use]
104    pub fn new(framebuffer: &'a mut [u32], zbuffer: &'a mut [f32]) -> Self {
105        Self {
106            fb_ptr: framebuffer.as_mut_ptr(),
107            fb_len: framebuffer.len(),
108            zb_ptr: zbuffer.as_mut_ptr(),
109            zb_len: zbuffer.len(),
110            _marker: PhantomData,
111        }
112    }
113
114    /// Framebuffer length in `u32` elements.
115    #[must_use]
116    pub fn fb_len(self) -> usize {
117        self.fb_len
118    }
119
120    /// Raw mutable framebuffer pointer. Used by SSE blocks that do
121    /// their own arithmetic + bounds reasoning.
122    ///
123    /// # Safety
124    /// Callers must respect `fb_len` and the parallel-use invariant.
125    #[must_use]
126    pub fn fb_ptr(self) -> *mut u32 {
127        self.fb_ptr
128    }
129
130    /// Raw mutable zbuffer pointer. Same contract as
131    /// [`Self::fb_ptr`].
132    #[must_use]
133    pub fn zb_ptr(self) -> *mut f32 {
134        self.zb_ptr
135    }
136
137    /// Write one ARGB pixel.
138    ///
139    /// # Safety
140    /// `idx < self.fb_len()`, plus the parallel-use invariant.
141    pub unsafe fn write_color(self, idx: usize, color: u32) {
142        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
143        // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
144        unsafe { self.fb_ptr.add(idx).write(color) };
145    }
146
147    /// Write one z-buffer entry.
148    ///
149    /// # Safety
150    /// `idx < self.fb_len()` (zbuffer length matches fb), plus the
151    /// parallel-use invariant.
152    pub unsafe fn write_depth(self, idx: usize, z: f32) {
153        debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
154        // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
155        unsafe { self.zb_ptr.add(idx).write(z) };
156    }
157}
158
159// gcsub now lives on `ScanScratch::gcsub`; the host pokes it via
160// `ScanScratch::set_side_shades` per frame (mirrors voxlap's
161// `setsideshades` global). The default-state pattern
162// (`0x00ff00ff00ff00ff` per entry, == `setsideshades(0,…,0)`) is
163// what `ScanScratch::new_for_size` initialises to, so the oracle
164// stays bit-exact when no shading is configured.
165
166/// Per-channel fog blend — voxlap5.c:2052-2056 (and the matching
167/// hrend / vrend scalar tail). `col` is the source ARGB voxel
168/// colour; `dist` is the radar slot's depth (PREC-scaled, so
169/// `>> 20` gives the integer cell distance index into `foglut`).
170/// `foglut` empty short-circuits to "no fog" (returns `col`
171/// unchanged); OOB indices saturate to 32767 (full fog) since
172/// `set_fog` pads the table that way.
173//
174// The C form is one branchless expression per channel; we keep
175// it as-is for legibility against the source. `as i32` casts
176// match the C int32 arithmetic.
177#[allow(
178    clippy::cast_sign_loss,
179    clippy::cast_possible_wrap,
180    clippy::many_single_char_names
181)]
182fn fog_blend(col: i32, dist: i32, foglut: &[i32], fog_col: i32) -> i32 {
183    if foglut.is_empty() {
184        return col;
185    }
186    let idx = (dist >> 20) as usize;
187    let l = foglut.get(idx).copied().unwrap_or(32767) & 32767;
188    let k = col;
189    let fc = fog_col;
190    let r = (((fc & 255) - (k & 255)) * l) >> 15;
191    let g = (((((fc >> 8) & 255) - ((k >> 8) & 255)) * l) >> 15) << 8;
192    let b = (((((fc >> 16) & 255) - ((k >> 16) & 255)) * l) >> 15) << 16;
193    r + g + b + k
194}
195
196/// Per-ray sky-row update. Mirror of voxlap5.c:1236-1255.
197///
198/// On the first ray of a quadrant (`scratch.sky_cur_lng < 0`),
199/// initialise from the ray's `atan2(vy1, vx1)` mapped through
200/// `sky.lng_mul`. On subsequent rays, walk `sky.lng[]` forward
201/// (when `sky_cur_dir < 0`) or backward (`sky_cur_dir >= 0`)
202/// until the cross-product flips sign — voxlap's rotating-cursor
203/// trick that avoids re-running atan2 per ray. After the search,
204/// stamp `scratch.sky_off = sky_cur_lng * sky.bpl`, which
205/// `phase_startsky_textured` divides by 4 to land at the row's
206/// pixel-base index.
207fn sky_per_ray_update(
208    scratch: &mut crate::rasterizer::ScanScratch,
209    sky: &crate::sky::Sky,
210    vx1: f32,
211    vy1: f32,
212) {
213    let ysiz = sky.ysiz;
214    if scratch.sky_cur_lng < 0 {
215        // First-ray init — atan2 mapped to row index.
216        let ang = vy1.atan2(vx1) + std::f32::consts::PI;
217        let raw = ang * sky.lng_mul - 0.5;
218        let mut lng = ftol(raw);
219        // Voxlap's `(uint32_t)skycurlng >= skyysiz` clamp uses an
220        // uninitialised `j` for the corrective shift; we substitute
221        // `rem_euclid` for a deterministic in-range value. The
222        // rotating-cursor walk in subsequent rays will quickly
223        // converge whichever way we land.
224        if (lng as u32) >= (ysiz as u32) {
225            lng = lng.rem_euclid(ysiz);
226        }
227        scratch.sky_cur_lng = lng;
228    } else if scratch.sky_cur_dir < 0 {
229        // Walk forward (rotating).
230        let mut j = scratch.sky_cur_lng + 1;
231        if j >= ysiz {
232            j = 0;
233        }
234        loop {
235            let l = sky.lng[j as usize];
236            if l[0] * vy1 <= l[1] * vx1 {
237                break;
238            }
239            scratch.sky_cur_lng = j;
240            j += 1;
241            if j >= ysiz {
242                j = 0;
243            }
244        }
245    } else {
246        // Walk backward (rotating).
247        loop {
248            let l = sky.lng[scratch.sky_cur_lng as usize];
249            if l[0] * vy1 >= l[1] * vx1 {
250                break;
251            }
252            scratch.sky_cur_lng -= 1;
253            if scratch.sky_cur_lng < 0 {
254                scratch.sky_cur_lng = ysiz - 1;
255            }
256        }
257    }
258    // Voxlap: `skyoff = skycurlng * skybpl + nskypic`. We strip
259    // the `+ nskypic` (texture base address) — `phase_startsky`
260    // adds it implicitly by indexing `sky.pixels` directly.
261    scratch.sky_off = scratch.sky_cur_lng * sky.bpl;
262}
263
264/// Per-frame state cached on first `frame_setup` call. Owned here
265/// (vs. borrowed from `ScanContext`) because gline needs to read
266/// it across many calls without re-borrowing each time. The
267/// `prelude` clone copies one `Vec<i32>` (the `y_lookup` mip
268/// table) per frame — cheap.
269//
270// `Clone` is used by [`ScalarRasterizer`]'s 4-way fan-out (R12.2.1)
271// to mint one rasterizer per quadrant thread; each clone copies the
272// (~few KB) y_lookup Vec — small per-frame allocation cost.
273#[derive(Clone)]
274struct FrameCache {
275    ray_step: RayStep,
276    camera_state: CameraState,
277    prelude: OpticastPrelude,
278    gstartz0: i32,
279    gstartz1: i32,
280    /// Voxlap's `v - *ixy_sptr_col` — byte offset within the
281    /// camera's column to the slab whose top bounds the air gap
282    /// from below. `0` ⇒ column-top.
283    vptr_offset: usize,
284    /// S4B.6.e: chunk-z that owns `vptr_offset`. For the in-camera-
285    /// chunk case this equals `prelude.camera_chunk_idx[2]`. For
286    /// cross-chunk look-down it points to the chunk holding the
287    /// real floor — gline_seed routes state.column / slab_buf to
288    /// it so rays start walking that chunk directly.
289    seed_chunk_z: i32,
290}
291
292/// Scalar rasterizer that writes pixels and a z-buffer entry per
293/// screen position.
294///
295/// Borrows the framebuffer + zbuffer for the duration of one
296/// `opticast` call; SDL hosts allocate these once and reuse across
297/// frames, see `roxlap-host`.
298//
299// `grid` carries the per-frame voxel-world borrow that gline reads
300// to call `grouscan_run` per ray. S4B.0 collapsed the previous
301// four-field `(slab_buf, column_offsets, mip_base_offsets, vsid)`
302// shape into a single `GridView<'a>`; gline + frame_setup
303// destructure via `self.grid.<field>` reads.
304#[derive(Clone)]
305pub struct ScalarRasterizer<'a> {
306    /// Framebuffer + zbuffer raw-pointer view. Stripped from the
307    /// caller's `&mut [u32]` / `&mut [f32]` borrows at construction
308    /// (see [`Self::new`]) so the rasterizer can be `Copy` for the
309    /// per-thread quadrant fan-out R12.2.1 lands. Single-threaded
310    /// path holds one copy, parallel path will hold four (one per
311    /// quadrant — wedge-disjoint pixel writes; see
312    /// [`RasterTarget`]'s safety contract).
313    target: RasterTarget<'a>,
314    /// Row stride in `u32` / `f32` elements (== framebuffer width
315    /// for tightly-packed buffers; SDL streaming textures may add
316    /// trailing padding).
317    pitch_pixels: usize,
318    /// Per-frame world borrow. `Copy`, so the parallel branches'
319    /// per-thread rasterizer clones share the same backing slab /
320    /// column-offset data without an extra heap allocation.
321    grid: crate::grid_view::GridView<'a>,
322    /// Optional sky texture borrow. `None` ⇒ `phase_startsky`
323    /// solid-fills with `scratch.skycast`. `Some(_)` ⇒ gline's
324    /// per-ray frustum prep updates `scratch.sky_off`, and
325    /// `phase_startsky` runs the textured search-and-sample loop.
326    /// Set via [`Self::with_sky`] after construction; unset ⇒
327    /// engine's existing solid-sky behaviour, byte-stable for the
328    /// oracle.
329    sky: Option<&'a crate::sky::Sky>,
330    /// Per-frame state cache. `None` until the first `frame_setup`
331    /// call; gline panics if invoked before that.
332    frame: Option<FrameCache>,
333}
334
335// R12.2.1 / R12.3.1: opticast's parallel branches fan the rasterizer
336// across rayon-managed threads — each thread owns its own clone. The
337// clones share `target: RasterTarget` (raw pointers; safe under the
338// strip-disjoint pixel-write invariant documented on RasterTarget),
339// hold &-refs into the slab/column data (Sync), and have independent
340// FrameCache copies. Compile-time checks: this fails if any field
341// becomes non-Send/non-Sync so the parallel path can no longer hold.
342const _: fn() = || {
343    fn assert_send<T: Send>() {}
344    fn assert_sync<T: Sync>() {}
345    assert_send::<ScalarRasterizer<'_>>();
346    assert_sync::<ScalarRasterizer<'_>>();
347};
348
349impl<'a> ScalarRasterizer<'a> {
350    /// Create a rasterizer that will write into the supplied
351    /// framebuffer + zbuffer pair. `pitch_pixels` must satisfy
352    /// `pitch_pixels * height ≤ framebuffer.len()` for the height
353    /// the engine renders into; the `frame_setup` hook does not
354    /// validate sizes (it has no height to check against).
355    ///
356    /// `grid` describes the world the renderer reads from. Build
357    /// from a [`roxlap_formats::vxl::Vxl`] via
358    /// [`crate::grid_view::GridView::from_single_vxl`], or from raw
359    /// parts via [`crate::grid_view::GridView::from_parts`].
360    ///
361    /// `ray_step` is initialised to a zero placeholder; the real
362    /// values get stamped on the first [`Rasterizer::frame_setup`]
363    /// call before any per-pixel work runs.
364    #[must_use]
365    pub fn new(
366        framebuffer: &'a mut [u32],
367        zbuffer: &'a mut [f32],
368        pitch_pixels: usize,
369        grid: crate::grid_view::GridView<'a>,
370    ) -> Self {
371        Self {
372            target: RasterTarget::new(framebuffer, zbuffer),
373            pitch_pixels,
374            grid,
375            sky: None,
376            frame: None,
377        }
378    }
379
380    /// Bind a sky texture for the lifetime of this rasterizer
381    /// instance. Hosts call this when [`crate::Engine::sky`] is
382    /// `Some(_)`. Without it, the rasterizer keeps the legacy
383    /// solid-fill `skycast` behaviour.
384    #[must_use]
385    pub fn with_sky(mut self, sky: &'a crate::sky::Sky) -> Self {
386        self.sky = Some(sky);
387        self
388    }
389}
390
391impl Rasterizer for ScalarRasterizer<'_> {
392    fn frame_setup(&mut self, ctx: &ScanContext<'_>) {
393        // Cache everything per-frame so gline doesn't re-borrow on
394        // every call. Prelude is cloned (one Vec<i32> alloc per
395        // frame for y_lookup; small).
396        self.frame = Some(FrameCache {
397            ray_step: *ctx.rs,
398            camera_state: *ctx.camera_state,
399            prelude: ctx.prelude.clone(),
400            gstartz0: ctx.camera_gstartz0,
401            gstartz1: ctx.camera_gstartz1,
402            vptr_offset: ctx.camera_vptr_offset,
403            seed_chunk_z: ctx.camera_seed_chunk_z,
404        });
405    }
406
407    #[allow(clippy::too_many_lines)]
408    fn gline(
409        &mut self,
410        scratch: &mut ScanScratch,
411        length: u32,
412        x0: f32,
413        y0: f32,
414        x1: f32,
415        y1: f32,
416    ) {
417        // Voxlap's per-scanline ray-cast: derive the frustum, seed
418        // cf[128], stamp scratch globals, call grouscan. Mirror of
419        // voxlap5.c:gline (1146..1235).
420        let cache = self
421            .frame
422            .as_ref()
423            .expect("gline called before frame_setup");
424        let leng = length as i32;
425
426        // S1.3: resolve the camera-position-specific state once at
427        // the top so the rest of `gline` reads from locals. With no
428        // S1.Z: with the negative-index walk, `gline` reads
429        // camera-position-specific state directly from the cache —
430        // no per-scanline override needed. OOB cameras flow through
431        // the same path; their (cx, cy) signed coords carry into
432        // grouscan via the prelude and the column-step path skips
433        // OOB columns as empty.
434        let pos_xfrac = cache.prelude.pos_xfrac;
435        let pos_yfrac = cache.prelude.pos_yfrac;
436        let li_pos_xy = [cache.prelude.li_pos[0], cache.prelude.li_pos[1]];
437        let column_index = cache.prelude.column_index;
438        // S4B.6.c: cf seed `z0` / `z1` are already world-z (the
439        // air-gap lookup translates chunk-local to world by adding
440        // `camera_chunk_z * chunk_size_z`). For unstacked grids
441        // (`chunks_z == 1`, camera_chunk_z == 0) the world-z and
442        // chunk-local values coincide — byte-identical with the
443        // pre-S4B.6.c path.
444        let gstartz0 = cache.gstartz0;
445        let gstartz1 = cache.gstartz1;
446        let vptr_offset = cache.vptr_offset;
447
448        // 1. Project per-ray frustum (vd0/vd1/vz0/vx1/vy1/vz1 +
449        //    gixy/gpz/gdz). voxlap5.c:1153-1175.
450        let f = derive_gline_frustum(
451            &cache.camera_state,
452            pos_xfrac,
453            pos_yfrac,
454            self.grid.vsid,
455            length,
456            x0,
457            y0,
458            x1,
459            y1,
460        );
461
462        // 2. Stamp ray-step globals onto scratch.
463        scratch.gixy = f.gixy;
464        scratch.gpz = f.gpz;
465        scratch.gdz = f.gdz;
466
467        // 3. cmprecip[leng] = CMPPREC / leng (voxlap precomputed
468        //    table; voxlap5.c:12315 builds it as `CMPPREC/(float)i`).
469        //    CMPPREC = 256*4096 = PREC. gi0 / gi1 are per-pixel ray-
470        //    step coefficients in Q12.20 (= PREC); cx0/cy0/cx1/cy1
471        //    are the cf[128] seed endpoints. voxlap5.c:1179-1190.
472        // The `as f32` casts here lose precision for very large leng
473        // (> 2²³), but realistic scanline lengths (a few thousand)
474        // are well below that.
475        #[allow(clippy::cast_precision_loss)]
476        let cmpprec = PREC as f32;
477        #[allow(clippy::cast_precision_loss)]
478        let cmprecip = if leng > 0 {
479            cmpprec / (leng as f32)
480        } else {
481            0.0
482        };
483        // ftol() routes float→i32 through i64 to mirror voxlap C's
484        // wrap-on-overflow `lrintf+(int32_t)cast`. The cf-seed
485        // products (vd ± vd) * cmprecip and vd * cmpprec land at
486        // the i32 boundary for world-coord magnitudes near VSID
487        // (= 2048) × PREC (= 2²⁰); Rust's `as i32` saturates and
488        // diverges for those edge cases.
489        let (gi0, gi1, cx0, cy0) = if cache.prelude.forward_z_sign < 0 {
490            (
491                ftol((f.vd1 - f.vd0) * cmprecip),
492                ftol((f.vz1 - f.vz0) * cmprecip),
493                ftol(f.vd0 * cmpprec),
494                ftol(f.vz0 * cmpprec),
495            )
496        } else {
497            (
498                ftol((f.vd0 - f.vd1) * cmprecip),
499                ftol((f.vz0 - f.vz1) * cmprecip),
500                ftol(f.vd1 * cmpprec),
501                ftol(f.vz1 * cmpprec),
502            )
503        };
504        let cx1 = leng.wrapping_mul(gi0).wrapping_add(cx0);
505        let cy1 = leng.wrapping_mul(gi1).wrapping_add(cy0);
506
507        scratch.gi0 = gi0;
508        scratch.gi1 = gi1;
509
510        // 4. Seed cf[128] with the radar range + air-gap z-bounds +
511        //    Q12.20 ray endpoints. voxlap5.c:1176-1190.
512        let gscanptr_isize = scratch.gscanptr as isize;
513        scratch.cf[CF_SEED_INDEX] = CfType {
514            i0: gscanptr_isize,
515            i1: gscanptr_isize + leng as isize,
516            z0: gstartz0,
517            z1: gstartz1,
518            cx0,
519            cy0,
520            cx1,
521            cy1,
522            // S4B.6.l: chz_layer scaffold. Pre-l.2 (= multi-chz seed
523            // construction) every cf entry maps to the same chz —
524            // `seed_chunk_z`, the chunk-z the rasterizer reads voxel
525            // data from. For non-cross-chunk-look-down poses this
526            // equals `camera_chunk_idx[2]`.
527            chz_layer: cache.seed_chunk_z,
528        };
529
530        // 5. gxmax = min(gmaxscandist, frustum-edge clip per axis).
531        //    voxlap5.c:1192-1228. Unsigned compare — voxlap's `q`
532        //    is a uint64_t product that may exceed gmaxscandist or
533        //    wrap negative.
534        //
535        //    Also stamps `skycast.dist` per voxlap5.c:1209-1227:
536        //    initialised to `gxmax` (the scan-distance ceiling),
537        //    overwritten with `0x7FFFFFFF` if either frustum-edge
538        //    clip fires (= ray hits world edge before scan-dist
539        //    cap → "infinitely far" sky depth). startsky's solid-
540        //    fill writes this into every drained radar slot's
541        //    `dist`, which the z-buffer ends up carrying.
542        //
543        // S4B.2.c.3: the world-edge clip is per-axis voxel-distance
544        // from `li_pos` to the grid's AABB edge along the ray's
545        // step direction. Single-chunk grids (`chunk_grid: None`)
546        // have AABB = `[0, vsid)²`, byte-identical to today. Multi-
547        // chunk grids derive AABB from `chunk_grid.origin_chunk_xy +
548        // chunks_x/y * chunk_size_xy` so rays can walk across all
549        // chunks before hitting the world edge.
550        let mut gxmax = cache.prelude.max_scan_dist;
551        scratch.skycast.dist = gxmax;
552        // S4B.2.d: world-edge clip uses the grid's voxel AABB.
553        // Single-chunk grids get `([0, 0], [vsid, vsid])` — same as
554        // the historical world-edge math. Multi-chunk grids extend
555        // to cover every chunk's voxel extent.
556        let (aabb_min, aabb_max) = self.grid.aabb_xy();
557        let (world_xmin, world_xmax) = (aabb_min[0], aabb_max[0]);
558        let (world_ymin, world_ymax) = (aabb_min[1], aabb_max[1]);
559        let j0 = if f.gixy[0] < 0 {
560            li_pos_xy[0] - world_xmin
561        } else {
562            world_xmax - 1 - li_pos_xy[0]
563        };
564        let q0 = (i64::from(f.gdz[0]).wrapping_mul(i64::from(j0)))
565            .wrapping_add(i64::from(f.gpz[0] as u32));
566        if (q0 as u64) < u64::from(gxmax as u32) {
567            gxmax = q0 as i32;
568            scratch.skycast.dist = i32::MAX;
569        }
570        let j1 = if f.gixy[1] < 0 {
571            li_pos_xy[1] - world_ymin
572        } else {
573            world_ymax - 1 - li_pos_xy[1]
574        };
575        let q1 = (i64::from(f.gdz[1]).wrapping_mul(i64::from(j1)))
576            .wrapping_add(i64::from(f.gpz[1] as u32));
577        if (q1 as u64) < u64::from(gxmax as u32) {
578            gxmax = q1 as i32;
579            scratch.skycast.dist = i32::MAX;
580        }
581        scratch.gxmax = gxmax;
582
583        // 5b. Per-ray sky-row search. Mirror of voxlap5.c:1236-
584        //     1255. Walks `sky.lng[]` to find the texel-row whose
585        //     longitude vector matches the ray's `(vx1, vy1)`
586        //     direction; stamps `scratch.sky_off` so
587        //     `phase_startsky` knows which row to sample. No-op
588        //     when no sky texture is bound.
589        if let Some(sky) = self.sky {
590            sky_per_ray_update(scratch, sky, f.vx1, f.vy1);
591        }
592
593        // 6. Build inputs and call grouscan_run.
594        //
595        // S4B.2.c.2: dispatch the camera-column lookup via
596        // `chunk_at_xy(camera_chunk_idx)` so multi-chunk grids
597        // start the walk inside the camera's actual chunk. Single-
598        // chunk grids (`chunk_grid: None`) get
599        // `chunk_at_xy([0, 0]) == Some(Self)` — the flat-table
600        // lookup with the camera's `column_index`, byte-identical
601        // to the goldens.
602        //
603        // OOB cameras (`in_bounds_xy: false`) start with an empty
604        // column — the grouscan column-step walk picks up real
605        // chunks once `(cx, cy)` cross into the grid.
606        // S4B.6.b: dispatch via `chunk_at_xyz` so stacked grids
607        // start the walk inside the camera's `(chx, chy, chz)`. For
608        // `chunks_z == 1` grids `camera_chunk_idx[2] == 0` and the
609        // shortcut path returns the same chunk as the pre-S4B.6
610        // `chunk_at_xy` lookup — byte-identical for the goldens.
611        //
612        // S4B.6.e: use `cache.seed_chunk_z` for the chz coordinate
613        // (= the chunk that owns the cf-seed's vptr_offset). For
614        // the no-cross-chunk path `seed_chunk_z ==
615        // camera_chunk_idx[2]` and this is unchanged. For
616        // cross-chunk look-down (camera in all-air-bedrock column
617        // with terrain in a deeper chunk) `seed_chunk_z` points to
618        // the deeper chunk so the rasterizer reads its real slab
619        // data instead of the empty camera-chunk column.
620        let seed_chunk_xyz = [
621            cache.prelude.camera_chunk_idx[0],
622            cache.prelude.camera_chunk_idx[1],
623            cache.seed_chunk_z,
624        ];
625        let camera_chunk_opt = if cache.prelude.in_bounds_xy {
626            self.grid.chunk_at_xyz(seed_chunk_xyz)
627        } else {
628            None
629        };
630        let (column, chunk_slab, chunk_cols, chunk_mips, chunk_vsid) = match camera_chunk_opt {
631            Some(chunk) => {
632                #[allow(clippy::cast_sign_loss)]
633                let column_idx_in_chunk = (cache.prelude.camera_local_xyz[1] as u32)
634                    .wrapping_mul(chunk.chunk_size_xy)
635                    .wrapping_add(cache.prelude.camera_local_xyz[0] as u32);
636                let col =
637                    camera_column_slice(chunk.slab_buf, chunk.column_offsets, column_idx_in_chunk)
638                        .unwrap_or(&[]);
639                (
640                    col,
641                    chunk.slab_buf,
642                    chunk.column_offsets,
643                    chunk.mip_base_offsets,
644                    chunk.vsid,
645                )
646            }
647            None => (
648                &[][..],
649                self.grid.slab_buf,
650                self.grid.column_offsets,
651                self.grid.mip_base_offsets,
652                self.grid.vsid,
653            ),
654        };
655        // Copy gcsub out of scratch so the GrouscanInputs immutable
656        // borrow doesn't collide with the `&mut scratch` grouscan_run
657        // takes below. `[i64; 9]` is 72 bytes — cheap.
658        let mut gcsub_local: [i64; 9] = scratch.gcsub;
659        // Voxlap5.c:1230-1234. Per-ray, populate the wall-side lanes
660        // (0/1) from the directional lanes (4/5 = left/right,
661        // 6/7 = up/down) according to the sign of `gixy`. Without
662        // this, `wall_lane` reads from the stale `0x00ff_00ff_00ff_00ff`
663        // baseline and wall faces get no directional darkening, even
664        // after the host calls `set_side_shades`.
665        if scratch.sideshademode {
666            let lane0_idx = if f.gixy[0] < 0 { 4 } else { 5 };
667            let lane1_idx = if f.gixy[1] < 0 { 6 } else { 7 };
668            gcsub_local[0] = gcsub_local[lane0_idx];
669            gcsub_local[1] = gcsub_local[lane1_idx];
670        }
671        // S4B.6.j: decouple the camera's PHYSICAL chz (= used by
672        // the column-step's chunk-XY DDA at every XY crossing)
673        // from the SEED chunk's chz (= where state.column starts).
674        // They match for the in-camera-chunk path; they differ
675        // when seed-time cross-chunk look-down stepped down to a
676        // deeper chunk's real floor. Without the decoupling, the
677        // pinned `seed_chunk_z` propagates through every chunk-XY
678        // swap and chz=`camera_chunk_z` content at OTHER columns
679        // (e.g. a mountain peak the camera's column doesn't share)
680        // becomes permanently invisible.
681        //
682        // Camera ABOVE the grid (camera_chunk_idx[2] < origin_chunk_z,
683        // world z<0 in voxlap z-down) clamps to origin_chunk_z so the
684        // column-step's `chunk_at_xyz(.., camera_chunk_z)` queries land
685        // in the grid's top chunk. S5.2-followup: also clamp UP for
686        // camera BELOW the grid (raw chz past max_chz), common for
687        // rotated small grids whose inverse-rotation lands the local
688        // camera past the grid's z extent. Without the symmetric
689        // upper clamp every XY column lookup returns None and the
690        // grid renders pure sky. Mirrors the seed-side clamp in
691        // `camera_chunk_air_gap`.
692        let raw_camera_chunk_z = cache.prelude.camera_chunk_idx[2];
693        let origin_chunk_z = self.grid.chunk_grid.map_or(0, |cg| cg.origin_chunk_z);
694        let chunks_z = self.grid.chunk_grid.map_or(1, |cg| cg.chunks_z) as i32;
695        let max_chz = origin_chunk_z + chunks_z - 1;
696        let camera_chunk_z = raw_camera_chunk_z.clamp(origin_chunk_z, max_chz);
697        let seed_chunk_z = cache.seed_chunk_z;
698        #[allow(clippy::cast_possible_wrap)]
699        let chunk_size_z_signed = self.grid.chunk_size_z as i32;
700        // chunk_world_z_base for the SEED chunk. State reads its
701        // slab data first; per-XY column-step resets this to
702        // `camera_chunk_z * chunk_size_z` for the chunks the DDA
703        // visits. OOB-XY: synthesised seed, no base offset.
704        let chunk_world_z_base = if cache.prelude.in_bounds_xy {
705            seed_chunk_z * chunk_size_z_signed
706        } else {
707            0
708        };
709        let inputs = GrouscanInputs {
710            column,
711            gylookup: &cache.prelude.y_lookup,
712            gcsub: &gcsub_local,
713            slab_buf: chunk_slab,
714            column_offsets: chunk_cols,
715            mip_base_offsets: chunk_mips,
716            vsid: chunk_vsid,
717            sky: self.sky.map(crate::grouscan::SkyRef::from_sky),
718            grid_view: self.grid,
719            // S4B.6.b: pin the camera's chz layer for the
720            // column-step's chunk-XY swap.
721            camera_chunk_z,
722            // S4B.6.c: cf seed + slab byte reads operate in world-z
723            // by adding `chunk_world_z_base` to chunk-local z reads.
724            chunk_world_z_base,
725            chunk_size_z: self.grid.chunk_size_z,
726        };
727        // gmipnum: min of (built mip levels in chunk) and
728        // (caller-requested cap from settings.mip_levels). Both
729        // matter — the chunk dictates whether mip-N data exists,
730        // while the cap dictates how big y_lookup is. Mismatch
731        // would let `phase_remiporend` advance gylookup past its
732        // tail and read garbage offsets. R4.5d's body still gates
733        // on `state.gmipcnt < gmipnum` so a chunk with no mips
734        // takes the early-out and renders mip-0 only.
735        let chunk_mips = u32::try_from(self.grid.mip_base_offsets.len().saturating_sub(1))
736            .expect("mip count fits in u32");
737        let gmipnum = chunk_mips.min(cache.prelude.mip_levels).max(1);
738        let _ = grouscan_run(
739            scratch,
740            &inputs,
741            vptr_offset,
742            column_index as usize,
743            cache.prelude.cx,
744            cache.prelude.cy,
745            cache.prelude.x_mip,
746            gmipnum,
747        );
748
749        // gscanptr is advanced by the opticast quadrant scan
750        // (`scan_loops.rs::top_quadrant` etc., voxlap5.c:2382 area)
751        // AFTER each gline call. Voxlap's `gline` itself does NOT
752        // touch gscanptr — advancing it here too created gaps of
753        // `leng+1` unwritten radar slots between consecutive glines,
754        // which read back as 0 in hrend → black pixels at the
755        // sphere position in diag_down / high_down.
756    }
757
758    #[allow(clippy::too_many_lines)]
759    fn hrend(
760        &mut self,
761        scratch: &mut ScanScratch,
762        sx: i32,
763        sy: i32,
764        p1: i32,
765        plc: i32,
766        incr: i32,
767        j: i32,
768    ) {
769        let rs = self
770            .frame
771            .as_ref()
772            .map(|f| f.ray_step)
773            .expect("hrend/vrend called before frame_setup");
774        // Per-frame setup gives strx/stry/heix/heiy/addx/addy; per-
775        // pixel direction = strx*sx + heix*sy + addx, advancing by
776        // strx in the inner loop.
777        #[allow(clippy::cast_precision_loss)]
778        let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
779        #[allow(clippy::cast_precision_loss)]
780        let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
781        let row_start = sy as usize * self.pitch_pixels;
782
783        let mut plc_local = plc;
784        let mut x = sx;
785
786        // R5.1: SSE2 4-pixel batch via `_mm_rsqrt_ps` — port of
787        // voxlaptest's `hrendzsse` (voxlap5.c:1947). 12-bit
788        // approximation, no Newton refine, matching the
789        // historical asm. The tail (0..3 leftover pixels)
790        // continues with the bit-exact scalar form below; the
791        // batch's z lanes will not match scalar 1/sqrt exactly,
792        // mirroring voxlap. SSE2 is x86_64 baseline so no
793        // runtime CPU-feature check is needed.
794        //
795        // `cast_ptr_alignment` is suppressed because we use
796        // `_mm_storeu_si128` / `_mm_storeu_ps` — the `u`-suffix
797        // variants explicitly support unaligned addresses, so a
798        // u32 pointer cast to `*mut __m128i` is sound.
799        #[cfg(target_arch = "x86_64")]
800        #[allow(clippy::cast_ptr_alignment)]
801        unsafe {
802            use core::arch::x86_64::{
803                __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
804                _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
805            };
806            let strx = rs.strx;
807            let stry = rs.stry;
808            let vstrx4 = _mm_set1_ps(strx * 4.0);
809            let vstry4 = _mm_set1_ps(stry * 4.0);
810            let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
811            let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
812            while p1 - x >= 4 {
813                // Gather 4 castdat hits — one per ray index.
814                let mut col = [0i32; 4];
815                let mut dst = [0i32; 4];
816                for k in 0..4 {
817                    let ray_idx = (plc_local >> 16) as usize;
818                    let cd_offset = scratch.angstart[ray_idx] + j as isize;
819                    let cd = scratch.radar[cd_offset as usize];
820                    col[k] = cd.col;
821                    dst[k] = cd.dist;
822                    plc_local = plc_local.wrapping_add(incr);
823                }
824                // R5.2: per-pixel fog blend (voxlap's `hrendzfogsse`).
825                // No-op when foglut is empty. Voxlap's MMX path used
826                // pmulhw with foglut as 4 packed int16 lanes; we
827                // mirror the scalar fallback the goldens use, which
828                // applies a single `l = foglut[..] & 32767` factor
829                // per pixel (one `l` per ray, all 3 channels).
830                if !scratch.foglut.is_empty() {
831                    for k in 0..4 {
832                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
833                    }
834                }
835                let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
836                let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
837                let vdst = _mm_cvtepi32_ps(vdsi);
838                let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
839                let vinv = _mm_rsqrt_ps(vsqr);
840                let vz = _mm_mul_ps(vdst, vinv);
841
842                let pixel_idx = row_start + x as usize;
843                _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
844                _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
845
846                vdx = _mm_add_ps(vdx, vstrx4);
847                vdy = _mm_add_ps(vdy, vstry4);
848                x += 4;
849            }
850            // Bring scalar dirx/diry up to where the batch left
851            // off — first lane of the post-step vdx/vdy.
852            dirx = _mm_cvtss_f32(vdx);
853            diry = _mm_cvtss_f32(vdy);
854        }
855
856        // R9: NEON 4-pixel batch — aarch64 equivalent of the SSE2
857        // path above. Uses `vrsqrteq_f32` + one Newton–Raphson step
858        // via `vrsqrtsq_f32` for ~16-bit precision (vs SSE2's ~12-bit
859        // without Newton). NEON is baseline on all AArch64 — no
860        // runtime feature check needed. Stores are naturally unaligned.
861        #[cfg(target_arch = "aarch64")]
862        unsafe {
863            use core::arch::aarch64::{
864                float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
865                vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
866                vst1q_u32,
867            };
868            let strx = rs.strx;
869            let stry = rs.stry;
870            let vstrx4 = vdupq_n_f32(strx * 4.0);
871            let vstry4 = vdupq_n_f32(stry * 4.0);
872            let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
873            let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
874            let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
875            let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
876            while p1 - x >= 4 {
877                // Scalar gather — same as SSE2 path.
878                let mut col = [0i32; 4];
879                let mut dst = [0i32; 4];
880                for k in 0..4 {
881                    let ray_idx = (plc_local >> 16) as usize;
882                    let cd_offset = scratch.angstart[ray_idx] + j as isize;
883                    let cd = scratch.radar[cd_offset as usize];
884                    col[k] = cd.col;
885                    dst[k] = cd.dist;
886                    plc_local = plc_local.wrapping_add(incr);
887                }
888                if !scratch.foglut.is_empty() {
889                    for k in 0..4 {
890                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
891                    }
892                }
893                let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
894                let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
895                let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
896                // One Newton–Raphson step: est * vrsqrts(x * est, est).
897                let est = vrsqrteq_f32(vsqr);
898                let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
899                let vz = vmulq_f32(vdst, vinv);
900
901                let pixel_idx = row_start + x as usize;
902                vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
903                vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
904
905                vdx = vaddq_f32(vdx, vstrx4);
906                vdy = vaddq_f32(vdy, vstry4);
907                x += 4;
908            }
909            dirx = vgetq_lane_f32(vdx, 0);
910            diry = vgetq_lane_f32(vdy, 0);
911        }
912
913        // R10.3: wasm SIMD 4-pixel batch — equivalent of the SSE2
914        // / NEON paths above. Uses `1.0 / sqrt(x)` (full-precision
915        // `f32x4_sqrt` + `f32x4_div`) where SSE2 had `_mm_rsqrt_ps`
916        // and NEON had `vrsqrteq_f32`+Newton, since wasm SIMD has
917        // no rsqrt approximation. Wasm bytes therefore differ
918        // from both x86 and aarch64 goldens — captured by R10.4's
919        // separate `wasm-hashes.txt`.
920        #[cfg(target_arch = "wasm32")]
921        unsafe {
922            use core::arch::wasm32::{
923                f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
924                f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
925            };
926            let strx = rs.strx;
927            let stry = rs.stry;
928            let vstrx4 = f32x4_splat(strx * 4.0);
929            let vstry4 = f32x4_splat(stry * 4.0);
930            let one = f32x4_splat(1.0);
931            let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
932            let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
933            while p1 - x >= 4 {
934                // Scalar gather — same shape as SSE2 / NEON paths.
935                let mut col = [0i32; 4];
936                let mut dst = [0i32; 4];
937                for k in 0..4 {
938                    let ray_idx = (plc_local >> 16) as usize;
939                    let cd_offset = scratch.angstart[ray_idx] + j as isize;
940                    let cd = scratch.radar[cd_offset as usize];
941                    col[k] = cd.col;
942                    dst[k] = cd.dist;
943                    plc_local = plc_local.wrapping_add(incr);
944                }
945                if !scratch.foglut.is_empty() {
946                    for k in 0..4 {
947                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
948                    }
949                }
950                let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
951                let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
952                let vdst = f32x4_convert_i32x4(vdsi);
953                let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
954                let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
955                let vz = f32x4_mul(vdst, vinv);
956
957                let pixel_idx = row_start + x as usize;
958                v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
959                v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
960
961                vdx = f32x4_add(vdx, vstrx4);
962                vdy = f32x4_add(vdy, vstry4);
963                x += 4;
964            }
965            dirx = f32x4_extract_lane::<0>(vdx);
966            diry = f32x4_extract_lane::<0>(vdy);
967        }
968
969        // Scalar tail — handles 0..3 leftover pixels on x86_64 /
970        // aarch64 / wasm32 and the full body on other targets.
971        while x < p1 {
972            // ray index = signed shift right (voxlap's `plc >> 16`).
973            let ray_idx = (plc_local >> 16) as usize;
974            let cd_offset = scratch.angstart[ray_idx] + j as isize;
975            let cd = scratch.radar[cd_offset as usize];
976            let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
977
978            let pixel_idx = row_start + x as usize;
979            #[allow(clippy::cast_precision_loss)]
980            let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
981            // SAFETY: pixel_idx = sy*pitch + x, with sy < yres and x < p1
982            // ≤ xres (loop guard); p1 ≤ ctx.xres in scan_loops::top_quadrant /
983            // bottom_quadrant. fb / zb were allocated at pitch*height by the
984            // caller (asserted in Engine::render's preamble); pixel_idx is
985            // therefore in-range. Wedge-disjoint invariant: top + bottom
986            // quadrants own disjoint sy ranges.
987            unsafe {
988                self.target.write_color(pixel_idx, col as u32);
989                self.target.write_depth(pixel_idx, z);
990            }
991
992            dirx += rs.strx;
993            diry += rs.stry;
994            plc_local = plc_local.wrapping_add(incr);
995            x += 1;
996        }
997    }
998
999    #[allow(clippy::too_many_lines)]
1000    fn vrend(
1001        &mut self,
1002        scratch: &mut ScanScratch,
1003        sx: i32,
1004        sy: i32,
1005        p1: i32,
1006        iplc: i32,
1007        iinc: i32,
1008    ) {
1009        let rs = self
1010            .frame
1011            .as_ref()
1012            .map(|f| f.ray_step)
1013            .expect("hrend/vrend called before frame_setup");
1014        #[allow(clippy::cast_precision_loss)]
1015        let mut dirx = rs.strx * sx as f32 + rs.heix * sy as f32 + rs.addx;
1016        #[allow(clippy::cast_precision_loss)]
1017        let mut diry = rs.stry * sx as f32 + rs.heiy * sy as f32 + rs.addy;
1018        let row_start = sy as usize * self.pitch_pixels;
1019        let half_stride = scratch.uurend_half_stride;
1020
1021        let mut iplc_local = iplc;
1022        let mut x = sx;
1023
1024        // R5.3: SSE2 4-pixel batch — port of voxlaptest's
1025        // `vrendzsse` (voxlap5.c:2083). The per-column
1026        // `uurend[sx] += uurend[sx + half_stride]` update is
1027        // parallel-safe: uurend[sx + half_stride..] is read-only
1028        // here, and uurend[sx..+3] are four distinct lanes.
1029        // Read OLD u/d values, do the SSE z math, then write
1030        // back four NEW u values. Plus fog blend (R5.2-style)
1031        // when foglut is non-empty.
1032        #[cfg(target_arch = "x86_64")]
1033        #[allow(clippy::cast_ptr_alignment)]
1034        unsafe {
1035            use core::arch::x86_64::{
1036                __m128i, _mm_add_ps, _mm_cvtepi32_ps, _mm_cvtss_f32, _mm_mul_ps, _mm_rsqrt_ps,
1037                _mm_set1_ps, _mm_setr_epi32, _mm_setr_ps, _mm_storeu_ps, _mm_storeu_si128,
1038            };
1039            let strx = rs.strx;
1040            let stry = rs.stry;
1041            let vstrx4 = _mm_set1_ps(strx * 4.0);
1042            let vstry4 = _mm_set1_ps(stry * 4.0);
1043            let mut vdx = _mm_setr_ps(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
1044            let mut vdy = _mm_setr_ps(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
1045            while p1 - x >= 4 {
1046                let xu = x as usize;
1047                // Read 4 OLD uurend pairs (u, d). u = current ray
1048                // index for column; d = per-pixel delta.
1049                let mut u = [0i32; 4];
1050                let mut d = [0i32; 4];
1051                for k in 0..4 {
1052                    u[k] = scratch.uurend[xu + k];
1053                    d[k] = scratch.uurend[xu + k + half_stride];
1054                }
1055                // Gather 4 castdat hits.
1056                let mut col = [0i32; 4];
1057                let mut dst = [0i32; 4];
1058                for k in 0..4 {
1059                    let ray_idx = (u[k] >> 16) as usize;
1060                    let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
1061                    let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
1062                    let cd = scratch.radar[cd_offset as usize];
1063                    col[k] = cd.col;
1064                    dst[k] = cd.dist;
1065                }
1066                if !scratch.foglut.is_empty() {
1067                    for k in 0..4 {
1068                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
1069                    }
1070                }
1071                let vcol = _mm_setr_epi32(col[0], col[1], col[2], col[3]);
1072                let vdsi = _mm_setr_epi32(dst[0], dst[1], dst[2], dst[3]);
1073                let vdst = _mm_cvtepi32_ps(vdsi);
1074                let vsqr = _mm_add_ps(_mm_mul_ps(vdx, vdx), _mm_mul_ps(vdy, vdy));
1075                let vinv = _mm_rsqrt_ps(vsqr);
1076                let vz = _mm_mul_ps(vdst, vinv);
1077
1078                let pixel_idx = row_start + xu;
1079                _mm_storeu_si128(self.target.fb_ptr().add(pixel_idx).cast::<__m128i>(), vcol);
1080                _mm_storeu_ps(self.target.zb_ptr().add(pixel_idx), vz);
1081
1082                // Write back NEW uurend values — u + d per lane.
1083                for k in 0..4 {
1084                    scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
1085                }
1086
1087                vdx = _mm_add_ps(vdx, vstrx4);
1088                vdy = _mm_add_ps(vdy, vstry4);
1089                iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
1090                x += 4;
1091            }
1092            dirx = _mm_cvtss_f32(vdx);
1093            diry = _mm_cvtss_f32(vdy);
1094        }
1095
1096        // R9: NEON 4-pixel batch for vrend — aarch64 equivalent.
1097        // Same structure as hrend NEON: scalar gather + uurend
1098        // read/write, NEON rsqrt for z, vectorized store.
1099        #[cfg(target_arch = "aarch64")]
1100        unsafe {
1101            use core::arch::aarch64::{
1102                float32x4_t, vaddq_f32, vcvtq_f32_s32, vdupq_n_f32, vgetq_lane_f32, vld1q_f32,
1103                vld1q_s32, vmulq_f32, vreinterpretq_u32_s32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32,
1104                vst1q_u32,
1105            };
1106            let strx = rs.strx;
1107            let stry = rs.stry;
1108            let vstrx4 = vdupq_n_f32(strx * 4.0);
1109            let vstry4 = vdupq_n_f32(stry * 4.0);
1110            let dx_arr: [f32; 4] = [dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx];
1111            let dy_arr: [f32; 4] = [diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry];
1112            let mut vdx: float32x4_t = vld1q_f32(dx_arr.as_ptr());
1113            let mut vdy: float32x4_t = vld1q_f32(dy_arr.as_ptr());
1114            while p1 - x >= 4 {
1115                let xu = x as usize;
1116                // Read 4 OLD uurend pairs (u, d).
1117                let mut u = [0i32; 4];
1118                let mut d = [0i32; 4];
1119                for k in 0..4 {
1120                    u[k] = scratch.uurend[xu + k];
1121                    d[k] = scratch.uurend[xu + k + half_stride];
1122                }
1123                // Scalar gather — 4 castdat hits.
1124                let mut col = [0i32; 4];
1125                let mut dst = [0i32; 4];
1126                for k in 0..4 {
1127                    let ray_idx = (u[k] >> 16) as usize;
1128                    let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
1129                    let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
1130                    let cd = scratch.radar[cd_offset as usize];
1131                    col[k] = cd.col;
1132                    dst[k] = cd.dist;
1133                }
1134                if !scratch.foglut.is_empty() {
1135                    for k in 0..4 {
1136                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
1137                    }
1138                }
1139                let vcol = vreinterpretq_u32_s32(vld1q_s32(col.as_ptr()));
1140                let vdst = vcvtq_f32_s32(vld1q_s32(dst.as_ptr()));
1141                let vsqr = vaddq_f32(vmulq_f32(vdx, vdx), vmulq_f32(vdy, vdy));
1142                let est = vrsqrteq_f32(vsqr);
1143                let vinv = vmulq_f32(est, vrsqrtsq_f32(vmulq_f32(vsqr, est), est));
1144                let vz = vmulq_f32(vdst, vinv);
1145
1146                let pixel_idx = row_start + xu;
1147                vst1q_u32(self.target.fb_ptr().add(pixel_idx), vcol);
1148                vst1q_f32(self.target.zb_ptr().add(pixel_idx), vz);
1149
1150                // Write back NEW uurend values — u + d per lane.
1151                for k in 0..4 {
1152                    scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
1153                }
1154
1155                vdx = vaddq_f32(vdx, vstrx4);
1156                vdy = vaddq_f32(vdy, vstry4);
1157                iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
1158                x += 4;
1159            }
1160            dirx = vgetq_lane_f32(vdx, 0);
1161            diry = vgetq_lane_f32(vdy, 0);
1162        }
1163
1164        // R10.3: wasm SIMD 4-pixel batch for vrend — equivalent of
1165        // the SSE2 / NEON paths above. Same scalar-gather + uurend
1166        // read/write structure; full-precision `1.0 / sqrt(x)` for
1167        // the inverse magnitude, since wasm SIMD has no rsqrt
1168        // approximation. Bytes diverge from the other arches —
1169        // R10.4's `wasm-hashes.txt` covers the divergence.
1170        #[cfg(target_arch = "wasm32")]
1171        unsafe {
1172            use core::arch::wasm32::{
1173                f32x4, f32x4_add, f32x4_convert_i32x4, f32x4_div, f32x4_extract_lane, f32x4_mul,
1174                f32x4_splat, f32x4_sqrt, i32x4, v128, v128_store,
1175            };
1176            let strx = rs.strx;
1177            let stry = rs.stry;
1178            let vstrx4 = f32x4_splat(strx * 4.0);
1179            let vstry4 = f32x4_splat(stry * 4.0);
1180            let one = f32x4_splat(1.0);
1181            let mut vdx = f32x4(dirx, dirx + strx, dirx + 2.0 * strx, dirx + 3.0 * strx);
1182            let mut vdy = f32x4(diry, diry + stry, diry + 2.0 * stry, diry + 3.0 * stry);
1183            while p1 - x >= 4 {
1184                let xu = x as usize;
1185                // Read 4 OLD uurend pairs (u, d).
1186                let mut u = [0i32; 4];
1187                let mut d = [0i32; 4];
1188                for k in 0..4 {
1189                    u[k] = scratch.uurend[xu + k];
1190                    d[k] = scratch.uurend[xu + k + half_stride];
1191                }
1192                // Scalar gather — 4 castdat hits.
1193                let mut col = [0i32; 4];
1194                let mut dst = [0i32; 4];
1195                for k in 0..4 {
1196                    let ray_idx = (u[k] >> 16) as usize;
1197                    let iplc_k = iplc_local.wrapping_add(iinc.wrapping_mul(k as i32));
1198                    let cd_offset = scratch.angstart[ray_idx] + iplc_k as isize;
1199                    let cd = scratch.radar[cd_offset as usize];
1200                    col[k] = cd.col;
1201                    dst[k] = cd.dist;
1202                }
1203                if !scratch.foglut.is_empty() {
1204                    for k in 0..4 {
1205                        col[k] = fog_blend(col[k], dst[k], &scratch.foglut, scratch.fog_col);
1206                    }
1207                }
1208                let vcol: v128 = i32x4(col[0], col[1], col[2], col[3]);
1209                let vdsi: v128 = i32x4(dst[0], dst[1], dst[2], dst[3]);
1210                let vdst = f32x4_convert_i32x4(vdsi);
1211                let vsqr = f32x4_add(f32x4_mul(vdx, vdx), f32x4_mul(vdy, vdy));
1212                let vinv = f32x4_div(one, f32x4_sqrt(vsqr));
1213                let vz = f32x4_mul(vdst, vinv);
1214
1215                let pixel_idx = row_start + xu;
1216                v128_store(self.target.fb_ptr().add(pixel_idx).cast::<v128>(), vcol);
1217                v128_store(self.target.zb_ptr().add(pixel_idx).cast::<v128>(), vz);
1218
1219                // Write back NEW uurend values — u + d per lane.
1220                for k in 0..4 {
1221                    scratch.uurend[xu + k] = u[k].wrapping_add(d[k]);
1222                }
1223
1224                vdx = f32x4_add(vdx, vstrx4);
1225                vdy = f32x4_add(vdy, vstry4);
1226                iplc_local = iplc_local.wrapping_add(iinc.wrapping_mul(4));
1227                x += 4;
1228            }
1229            dirx = f32x4_extract_lane::<0>(vdx);
1230            diry = f32x4_extract_lane::<0>(vdy);
1231        }
1232
1233        // Scalar tail — handles 0..3 leftover pixels on x86_64 /
1234        // aarch64 / wasm32 and the full body on other targets.
1235        while x < p1 {
1236            // Vertical scan reads the per-column ray index from
1237            // uurend[sx] (>>16 to drop the fractional bits).
1238            let xu = x as usize;
1239            let ray_idx = (scratch.uurend[xu] >> 16) as usize;
1240            let cd_offset = scratch.angstart[ray_idx] + iplc_local as isize;
1241            let cd = scratch.radar[cd_offset as usize];
1242            let col = fog_blend(cd.col, cd.dist, &scratch.foglut, scratch.fog_col);
1243
1244            let pixel_idx = row_start + xu;
1245            #[allow(clippy::cast_precision_loss)]
1246            let z = cd.dist as f32 / (dirx * dirx + diry * diry).sqrt();
1247            // SAFETY: see hrend's matching write — pixel_idx is in-bounds
1248            // by the same scan_loops geometry argument; right + left
1249            // quadrants own disjoint sx ranges so cross-thread writes
1250            // are pairwise pixel-disjoint.
1251            unsafe {
1252                self.target.write_color(pixel_idx, col as u32);
1253                self.target.write_depth(pixel_idx, z);
1254            }
1255
1256            dirx += rs.strx;
1257            diry += rs.stry;
1258            // Advance per-column ray index. uurend[x] persists
1259            // across vrend calls — this state is what couples
1260            // consecutive scanlines through the same column.
1261            scratch.uurend[xu] = scratch.uurend[xu].wrapping_add(scratch.uurend[xu + half_stride]);
1262            x += 1;
1263            iplc_local = iplc_local.wrapping_add(iinc);
1264        }
1265    }
1266}
1267
1268#[cfg(test)]
1269mod tests {
1270    use super::*;
1271    use crate::rasterizer::CastDat;
1272
1273    /// Build owned per-frame state so tests can assemble a
1274    /// `ScanContext` with proper-lifetime borrows. Values aren't
1275    /// load-bearing for the scalar-fill behaviour tests; the real
1276    /// `gline` cares about them, hence `camera_state` joining the
1277    /// tuple.
1278    fn dummy_per_frame() -> (
1279        crate::camera_math::CameraState,
1280        crate::projection::ProjectionRect,
1281        crate::ray_step::RayStep,
1282        crate::opticast_prelude::OpticastPrelude,
1283    ) {
1284        let cam = crate::Camera {
1285            pos: [0.0, 0.0, 0.0],
1286            right: [1.0, 0.0, 0.0],
1287            down: [0.0, 1.0, 0.0],
1288            forward: [0.0, 0.0, 1.0],
1289        };
1290        let cs = crate::camera_math::derive(&cam, 64, 64, 32.0, 32.0, 32.0);
1291        let proj = crate::projection::derive_projection(&cs, 64, 64, 32.0, 32.0, 32.0, 1);
1292        let rs = crate::ray_step::derive_ray_step(&cs, proj.cx, proj.cy, 32.0);
1293        let prelude = crate::opticast_prelude::derive_prelude(&cs, 2048, 1, 4, 1024, 1);
1294        (cs, proj, rs, prelude)
1295    }
1296
1297    #[test]
1298    fn frame_setup_caches_ray_step() {
1299        let mut fb = vec![0u32; 64 * 64];
1300        let mut zb = vec![0.0f32; 64 * 64];
1301        let mip_base = [0usize, 0];
1302        let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1303        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1304        let (cs, proj, rs, prelude) = dummy_per_frame();
1305        let ctx = ScanContext {
1306            proj: &proj,
1307            rs: &rs,
1308            prelude: &prelude,
1309            xres: 64,
1310            y_start: 0,
1311            y_end: 64,
1312            anginc: 1,
1313            camera_state: &cs,
1314            camera_gstartz0: 0,
1315            camera_gstartz1: 0,
1316            camera_vptr_offset: 0,
1317            camera_seed_chunk_z: 0,
1318        };
1319        r.frame_setup(&ctx);
1320        let cached_rs = r.frame.as_ref().expect("frame populated").ray_step;
1321        assert_eq!(cached_rs.strx.to_bits(), rs.strx.to_bits());
1322        assert_eq!(cached_rs.stry.to_bits(), rs.stry.to_bits());
1323        assert_eq!(cached_rs.cx16, rs.cx16);
1324        assert_eq!(cached_rs.cy16, rs.cy16);
1325    }
1326
1327    #[cfg(target_arch = "x86_64")]
1328    #[test]
1329    fn hrend_sse_batch_writes_4_pixel_block() {
1330        // R5.1 smoke test: hrend's SSE batch fires for span len ≥
1331        // 4. Pre-fill radar with 4 distinct ARGB values, run hrend
1332        // over [10..14], assert the framebuffer carries the colour
1333        // bits (z lanes use rsqrtps approximation so are intent-
1334        // ionally not bit-checked).
1335        let mut fb = vec![0u32; 64 * 64];
1336        let mut zb = vec![0.0f32; 64 * 64];
1337        let mip_base = [0usize, 0];
1338        let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1339        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1340        let (cs, proj, rs, prelude) = dummy_per_frame();
1341        let ctx = ScanContext {
1342            proj: &proj,
1343            rs: &rs,
1344            prelude: &prelude,
1345            xres: 64,
1346            y_start: 0,
1347            y_end: 64,
1348            anginc: 1,
1349            camera_state: &cs,
1350            camera_gstartz0: 0,
1351            camera_gstartz1: 0,
1352            camera_vptr_offset: 0,
1353            camera_seed_chunk_z: 0,
1354        };
1355        r.frame_setup(&ctx);
1356
1357        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1358        // 4 colour records so the batch reads each lane.
1359        for (i, slot) in scratch.radar.iter_mut().enumerate().take(4) {
1360            slot.col = 0x8000_0000_u32 as i32 | i as i32;
1361            slot.dist = 1024;
1362        }
1363        // angstart[i] = i so ray_idx i (= plc>>16) resolves to
1364        // radar[i + j] = radar[i] with j=0.
1365        for k in 0..4 {
1366            scratch.angstart[k] = k as isize;
1367        }
1368
1369        // sx=10, p1=14, j=0, plc=0, incr=1<<16 → plc>>16 steps
1370        // 0,1,2,3 over the four pixels → angstart[0..4].
1371        r.hrend(&mut scratch, 10, 5, 14, 0, 1 << 16, 0);
1372
1373        let row_off = 5 * 64;
1374        for k in 0..4 {
1375            let want = 0x8000_0000_u32 | k as u32;
1376            assert_eq!(
1377                fb[row_off + 10 + k],
1378                want,
1379                "fb[5][{}] = {:#010x}, expected {:#010x}",
1380                10 + k,
1381                fb[row_off + 10 + k],
1382                want,
1383            );
1384            // z lane non-zero (rsqrtps produced something).
1385            // Bit-compare to dodge clippy::float_cmp; we just want
1386            // to confirm the slot was written, not its precise value.
1387            assert_ne!(zb[row_off + 10 + k].to_bits(), 0u32);
1388        }
1389    }
1390
1391    #[test]
1392    fn fog_blend_disabled_returns_col_unchanged() {
1393        let foglut: Vec<i32> = Vec::new();
1394        let col = 0x0080_C040;
1395        assert_eq!(fog_blend(col, 0x1234_5678, &foglut, 0xFF_FFFF), col);
1396    }
1397
1398    #[test]
1399    fn fog_blend_full_fog_returns_fog_col_per_channel() {
1400        // l = 32767 → ((fog - col) * 32767) >> 15 = fog - col → final
1401        // is col + (fog - col) = fog (per channel; alpha untouched).
1402        let foglut = vec![32767; 2048];
1403        let col = 0x80_AA_BB_CC_u32 as i32;
1404        let fog = 0x00_11_22_33_i32;
1405        let blended = fog_blend(col, 0, &foglut, fog) as u32;
1406        // Low 24 bits = fog colour; alpha (high byte) survives from col.
1407        assert_eq!(blended & 0x00FF_FFFF, fog as u32 & 0x00FF_FFFF);
1408        assert_eq!(blended & 0xFF00_0000, col as u32 & 0xFF00_0000);
1409    }
1410
1411    #[test]
1412    fn set_fog_zero_distance_clears_table() {
1413        let mut s = ScanScratch::new_for_size(64, 64, 64);
1414        s.set_fog(0x1234_5678, 100);
1415        assert!(!s.foglut.is_empty());
1416        s.set_fog(0, 0);
1417        assert!(s.foglut.is_empty());
1418    }
1419
1420    #[test]
1421    fn set_fog_table_starts_at_zero_and_climbs() {
1422        let mut s = ScanScratch::new_for_size(64, 64, 64);
1423        s.set_fog(0xFF, 1024);
1424        // First entry: acc = 0 → hi16 = 0.
1425        assert_eq!(s.foglut[0], 0);
1426        // Last entry near the overflow boundary should be near 32767
1427        // (the saturate-fill value); voxlap's exact step makes
1428        // foglut[2047] either ~32766 (last walked entry) or 32767
1429        // (post-overflow padding) depending on max_scan_dist
1430        // divisibility.
1431        assert!(
1432            s.foglut[2047] > 30_000,
1433            "tail entry too low: {}",
1434            s.foglut[2047]
1435        );
1436    }
1437
1438    #[test]
1439    fn hrend_writes_pixel_per_column_from_radar() {
1440        // Pre-fill scratch.radar with a recognizable colour gradient
1441        // and pre-set scratch.angstart so hrend resolves to the
1442        // expected radar slot per pixel. Then call hrend manually
1443        // and verify the framebuffer received the colours.
1444        let mut fb = vec![0u32; 64 * 64];
1445        let mut zb = vec![0.0f32; 64 * 64];
1446        let mip_base = [0usize, 0];
1447        let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1448        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1449        let (cs, proj, rs, prelude) = dummy_per_frame();
1450        let ctx = ScanContext {
1451            proj: &proj,
1452            rs: &rs,
1453            prelude: &prelude,
1454            xres: 64,
1455            y_start: 0,
1456            y_end: 64,
1457            anginc: 1,
1458            camera_state: &cs,
1459            camera_gstartz0: 0,
1460            camera_gstartz1: 0,
1461            camera_vptr_offset: 0,
1462            camera_seed_chunk_z: 0,
1463        };
1464        r.frame_setup(&ctx);
1465
1466        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1467        // Synthetic radar: 16 castdat entries with col = 0xAA...,
1468        // dist = 1 (zbuffer math will divide by sqrt of dir² so we
1469        // just pick a stable distance).
1470        for (i, slot) in scratch.radar.iter_mut().enumerate().take(16) {
1471            slot.col = 0x8000_0000_u32 as i32 | i as i32;
1472            slot.dist = 1024;
1473        }
1474        // angstart[0..4] all point at radar[0]; with j=0..3 and a
1475        // plc that increments by 0 (incr=0 holds plc>>16 at 0), the
1476        // pixel-by-pixel index lands at slots 0, 1, 2, 3 — i.e. j
1477        // selects the column.
1478        scratch.angstart[0] = 0;
1479
1480        // Render a span of 4 columns starting at sx=10, sy=5, with j
1481        // varying via the scan: but voxlap's hrend uses a single j
1482        // for the whole span (per-ray castdat column is fixed). We
1483        // pick j=2 and verify framebuffer[row*pitch + sx..sx+4] all
1484        // equal radar[0+2].col = 0x80000002.
1485        r.hrend(&mut scratch, 10, 5, 14, 0, 0, 2);
1486
1487        let row_off = 5 * 64;
1488        for x in 10..14 {
1489            let want = 0x8000_0000_u32 | 2;
1490            assert_eq!(
1491                fb[row_off + x],
1492                want,
1493                "fb[5][{x}] = {:#010x}, expected {:#010x}",
1494                fb[row_off + x],
1495                want,
1496            );
1497        }
1498        // Pixels outside the rendered span are untouched.
1499        assert_eq!(fb[row_off + 9], 0);
1500        assert_eq!(fb[row_off + 14], 0);
1501    }
1502
1503    #[test]
1504    fn end_to_end_opticast_runs_through_real_gline() {
1505        // Smoke test: with a valid above-the-slab camera and the
1506        // real R4.3a-rewire-3b gline, full opticast should run
1507        // without panicking and return `Rendered`. The synthetic
1508        // single-slab world has no voxel colour bytes so grouscan's
1509        // fill loops bail to startsky, which writes the configured
1510        // skycast into the radar — verify by setting a recognizable
1511        // skycast and asserting some pixels carry it.
1512        use crate::opticast as opticast_fn;
1513        use crate::rasterizer::ScratchPool;
1514        use crate::OpticastSettings;
1515
1516        let mut fb = vec![0u32; 640 * 480];
1517        let mut zb = vec![0.0f32; 640 * 480];
1518        let mut pool = ScratchPool::new(640, 480, 2048);
1519        // Recognizable sky colour — pixels filled by startsky's
1520        // solid-fill branch (the path the empty-colour-byte slab
1521        // ends up routing through) carry this.
1522        let sky_col = 0x80AB_CDEF_u32 as i32;
1523        pool.set_skycast(sky_col, 0x7FFF_FFFF);
1524
1525        // Single solid slab at z = 200..254. cz = 128 < 200 →
1526        // air-above-the-slab, opticast renders. Synthetic world:
1527        // only the camera's column (1024 * 2048 + 1024) holds the
1528        // slab; every other column is empty. Build world before
1529        // the rasterizer so it can borrow `&column` / `&column_offsets`.
1530        let column = vec![0u8, 200, 254, 0];
1531        let cam_idx = 1024usize * 2048 + 1024;
1532        let mut column_offsets = vec![0u32; 2048 * 2048 + 1];
1533        let column_len_u32 = u32::try_from(column.len()).expect("column fits u32");
1534        for offset in &mut column_offsets[(cam_idx + 1)..] {
1535            *offset = column_len_u32;
1536        }
1537
1538        let mip_base_offsets = [0usize, column_offsets.len()];
1539        let grid = crate::grid_view::GridView::from_parts(
1540            2048,
1541            &column,
1542            &column_offsets,
1543            &mip_base_offsets,
1544        );
1545        let mut rasterizer = ScalarRasterizer::new(&mut fb, &mut zb, 640, grid);
1546
1547        let cam = crate::Camera {
1548            pos: [1024.0, 1024.0, 128.0],
1549            right: [1.0, 0.0, 0.0],
1550            down: [0.0, 1.0, 0.0],
1551            forward: [0.0, 0.0, 1.0],
1552        };
1553        let settings = OpticastSettings::for_oracle_framebuffer(640, 480);
1554
1555        let outcome = opticast_fn(&mut rasterizer, &mut pool, &cam, &settings, grid);
1556        assert_eq!(outcome, crate::OpticastOutcome::Rendered);
1557
1558        // Wiring smoke test — gline → derive_gline_frustum →
1559        // grouscan_run chain executes for every ray without
1560        // panicking, opticast returns Rendered. The synthetic
1561        // single-slab world has no colour bytes (header only) so
1562        // grouscan's drawflor fill bails on the bounds check and
1563        // routes to predeletez → deletez → Done before reaching
1564        // startsky; the radar stays at default zeros, the
1565        // framebuffer ends up sky-blue (the host pre-fill). What
1566        // matters is that nothing crashed — the per-ray gline
1567        // arithmetic + cf[128] seeding + grouscan dispatch all
1568        // hold up under live ray geometry. Once R4.3a-rewire-4
1569        // loads a real `.vxl` with colour bytes, grouscan's fill
1570        // loops will write recognisable voxel colours and a
1571        // colour-presence assertion replaces this comment.
1572        let _ = sky_col; // suppress unused-let warning — kept as
1573                         // scaffolding for the future assertion.
1574    }
1575
1576    #[cfg(target_arch = "x86_64")]
1577    #[test]
1578    fn vrend_sse_batch_writes_4_pixel_block() {
1579        // R5.3 smoke test: vrend's SSE batch fires for span len ≥
1580        // 4. Pre-fill 4 distinct radar entries, set angstart so
1581        // each lane's uurend[sx]>>16 indexes a different ray, run
1582        // vrend over [10..14], assert each column got the right
1583        // colour and uurend advanced.
1584        let mut fb = vec![0u32; 64 * 64];
1585        let mut zb = vec![0.0f32; 64 * 64];
1586        let mip_base = [0usize, 0];
1587        let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1588        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1589        let (cs, proj, rs, prelude) = dummy_per_frame();
1590        let ctx = ScanContext {
1591            proj: &proj,
1592            rs: &rs,
1593            prelude: &prelude,
1594            xres: 64,
1595            y_start: 0,
1596            y_end: 64,
1597            anginc: 1,
1598            camera_state: &cs,
1599            camera_gstartz0: 0,
1600            camera_gstartz1: 0,
1601            camera_vptr_offset: 0,
1602            camera_seed_chunk_z: 0,
1603        };
1604        r.frame_setup(&ctx);
1605
1606        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1607        for k in 0..4 {
1608            scratch.radar[k] = CastDat {
1609                col: 0x8000_0000_u32 as i32 | k as i32,
1610                dist: 1024,
1611            };
1612            scratch.angstart[k] = k as isize;
1613        }
1614        // uurend[sx + k] >> 16 = k → ray_idx k → angstart[k] = k
1615        // → radar[k]. delta = 5 (so post-batch uurend[sx + k] =
1616        // (k << 16) + 5).
1617        let half = scratch.uurend_half_stride;
1618        for k in 0..4 {
1619            scratch.uurend[10 + k] = (k as i32) << 16;
1620            scratch.uurend[10 + k + half] = 5;
1621        }
1622
1623        r.vrend(&mut scratch, 10, 5, 14, 0, 0);
1624
1625        let row_off = 5 * 64;
1626        for k in 0..4 {
1627            let want = 0x8000_0000_u32 | k as u32;
1628            assert_eq!(fb[row_off + 10 + k], want, "fb col[{}]", 10 + k);
1629            assert!(zb[row_off + 10 + k].to_bits() != 0, "z[{}]", 10 + k);
1630            // Post-batch uurend = old_u + delta.
1631            assert_eq!(
1632                scratch.uurend[10 + k],
1633                ((k as i32) << 16) + 5,
1634                "uurend[{}]",
1635                10 + k
1636            );
1637        }
1638    }
1639
1640    #[test]
1641    fn vrend_advances_uurend_per_pixel() {
1642        // Verify the uurend[sx] += uurend[sx+half_stride] mutation
1643        // happens once per pixel.
1644        let mut fb = vec![0u32; 64 * 64];
1645        let mut zb = vec![0.0f32; 64 * 64];
1646        let mip_base = [0usize, 0];
1647        let grid = crate::grid_view::GridView::from_parts(64, &[], &[], &mip_base);
1648        let mut r = ScalarRasterizer::new(&mut fb, &mut zb, 64, grid);
1649        let (cs, proj, rs, prelude) = dummy_per_frame();
1650        let ctx = ScanContext {
1651            proj: &proj,
1652            rs: &rs,
1653            prelude: &prelude,
1654            xres: 64,
1655            y_start: 0,
1656            y_end: 64,
1657            anginc: 1,
1658            camera_state: &cs,
1659            camera_gstartz0: 0,
1660            camera_gstartz1: 0,
1661            camera_vptr_offset: 0,
1662            camera_seed_chunk_z: 0,
1663        };
1664        r.frame_setup(&ctx);
1665
1666        let mut scratch = ScanScratch::new_for_size(64, 64, 64);
1667        scratch.radar[0] = CastDat {
1668            col: 0x8033_4455_u32 as i32,
1669            dist: 1024,
1670        };
1671        // angstart[0] = 0 → all rays read radar[0 + iplc].
1672        scratch.angstart[0] = 0;
1673        // Pre-set uurend so ray_idx = uurend[sx] >> 16 = 0 for all
1674        // four columns (we want them to all hit angstart[0]).
1675        let half = scratch.uurend_half_stride;
1676        for sx in 10..14 {
1677            scratch.uurend[sx] = 0;
1678            scratch.uurend[sx + half] = 1; // delta added per pixel
1679        }
1680
1681        r.vrend(&mut scratch, 10, 5, 14, 0, 0);
1682
1683        // Each column's uurend should have advanced by the delta.
1684        for sx in 10..14 {
1685            assert_eq!(scratch.uurend[sx], 1, "uurend[{sx}] not advanced");
1686        }
1687    }
1688}