Skip to main content

roxlap_core/
sprite.rs

1//! KV6 sprite type + the `draw_sprite` dispatcher.
2//!
3//! Mirror of voxlap's `vx5sprite` (voxlap5.h:63-79) plus the
4//! `drawsprite` entry point (voxlap5.c:9818). For R6.1 the
5//! dispatcher is a stub — just enough API surface for the host to
6//! plumb a sprite reference through. R6.2-R6.4 fill in the actual
7//! kv6 frustum-cull + per-voxel rasterization behind it.
8//!
9//! Voxlap's vx5sprite is a 64-byte struct:
10//!
11//! ```text
12//! point3d p;       // position
13//! int32_t flags;   // bit 0: 0=normal shading
14//!                  // bit 1: 0=kv6data, 1=kfatype  (oracle uses 0)
15//!                  // bit 2: 0=normal, 1=invisible
16//! point3d s;       // x-basis (kv6data.xsiz direction)
17//! kv6data *voxnum; // (or kfatype *kfaptr if flag bit 1 set)
18//! point3d h;       // y-basis
19//! int32_t kfatim;
20//! point3d f;       // z-basis
21//! int32_t okfatim;
22//! ```
23//!
24//! For R6 we only handle kv6 sprites with `flags = 0` (the four
25//! oracle sprite poses all use this). KFA animation + the no-z and
26//! invisible flags are deferred.
27
28// The kv6draw port is pointer-arithmetic-heavy; the casts mirror C's
29// implicit i32/u32/usize narrowings. Loop bounds are clamped via
30// `lbound` so sign-loss / wrap is guarded at the type-system edge.
31// kv.{xsiz,ysiz,zsiz} are u32 with realistic max ≤ 256 (file format
32// limit) — well within f32's 24-bit mantissa.
33#![allow(
34    clippy::cast_possible_truncation,
35    clippy::cast_possible_wrap,
36    clippy::cast_sign_loss,
37    clippy::cast_precision_loss,
38    clippy::similar_names,
39    clippy::too_many_arguments,
40    clippy::too_many_lines,
41    clippy::cast_ptr_alignment, // _mm_loadl_epi64 / _mm_storeu_si128 are intentionally unaligned
42    clippy::doc_markdown,
43    clippy::no_effect_underscore_binding, // SSE intrinsic side-effect-only stores
44    clippy::no_effect, // the discarded pmaddwd intermediate
45    clippy::ref_as_ptr,
46    clippy::float_cmp_const,
47    clippy::float_cmp,
48)]
49
50use roxlap_formats::kv6::{Kv6, Voxel};
51use roxlap_formats::sprite::{Sprite, SPRITE_FLAG_INVISIBLE, SPRITE_FLAG_KFA, SPRITE_FLAG_NO_Z};
52
53use crate::camera_math::CameraState;
54use crate::engine::{Engine, LightSrc, DEFAULT_KV6COL};
55use crate::equivec::iunivec;
56use crate::fixed::ftol;
57use crate::opticast::OpticastSettings;
58use crate::ptfaces16::PTFACES16;
59
/// Upper bound on simultaneously active point lights, matching voxlap's
/// `MAXLIGHTS` (`voxlap5.c`). The lightmode≥2 branch of `update_reflects`
/// sizes its `lightlist` scratch as `MAX_LIGHTS + 1`: up to this many
/// real lights followed by one ambient slot.
const MAX_LIGHTS: usize = 16;
64
/// Default for voxlap's `vx5.kv6mipfactor` (`voxlap5.c:12335`): the
/// distance threshold, in voxlap's "ftol-of-forward-projected" estimate
/// units, past which kv6draw starts walking the lowermip chain.
///
/// `roxlap-formats::Kv6` does not model the lowermip chain yet, so the
/// mip descent in `kv6_draw_prepare` keeps voxlap's structure but is
/// effectively a no-op until that support lands.
pub(crate) const KV6_MIPFACTOR_DEFAULT: i32 = 128;
72
73/// Post-cull state derived from a sprite + camera pair — what the
74/// per-voxel iteration in R6.3+ needs to start its setup. Borrows
75/// the mip-selected kv6 from the sprite.
76///
77/// Voxlap doesn't materialise this struct (it operates on local
78/// variables inside `kv6draw`); roxlap factors the cull out so it's
79/// independently testable without staging the rest of the
80/// rasterizer.
81#[derive(Debug, Clone)]
82#[allow(dead_code)] // R6.3+ will read these fields.
83pub(crate) struct Kv6DrawSetup<'a> {
84    /// Mip-selected kv6. For the base-mip case (always, today),
85    /// this is just `&sprite.kv6`.
86    pub kv: &'a Kv6,
87    /// Mip-scaled basis vectors. For the base mip these equal
88    /// `sprite.s/h/f`; if a future lowermip walk runs, each is
89    /// scaled by `2^mip`.
90    pub ts: [f32; 3],
91    pub th: [f32; 3],
92    pub tf: [f32; 3],
93    /// 0 for the base mip; reserved for lowermip support.
94    pub mip: u32,
95}
96
97/// Mip-LOD descent + 4-plane frustum cull, mirror of voxlap5.c:8832-
98/// 8875. Returns `None` if the sprite's bound cube is fully behind
99/// any of the four view-frustum edge planes (`CameraState::nor`),
100/// `Some(setup)` otherwise with the post-cull state R6.3 needs.
101///
102/// # Cull math
103///
104/// The bound cube has centre `npos` (in camera-relative coords) and
105/// three half-extent vectors `nstr`, `nhei`, `nfor` (each = the
106/// kv6-axis basis vector scaled by the corresponding half-extent).
107/// For each frustum-edge normal `n`, voxlap tests:
108///
109/// ```text
110/// |nstr · n| + |nhei · n| + |nfor · n| + npos · n < 0
111/// ```
112///
113/// — i.e. the cube's closest-point projection onto `n` is still
114/// behind the plane. Any plane satisfying this culls the sprite.
115pub(crate) fn kv6_draw_prepare<'a>(
116    sprite: &'a Sprite,
117    cam: &CameraState,
118) -> Option<Kv6DrawSetup<'a>> {
119    let kv = &sprite.kv6;
120
121    // Voxlap's quick-and-dirty distance estimate (voxlap5.c:8835):
122    //   y = ftol((spr->p - gipos) · gifor)
123    // Used by the lowermip descent loop. Roxlap-formats `Kv6` doesn't
124    // model lowermip yet, so the loop never runs and this value is
125    // unused — computed for symmetry with voxlap and to lock the
126    // path for a future mip-chain port.
127    let dx = sprite.p[0] - cam.pos[0];
128    let dy = sprite.p[1] - cam.pos[1];
129    let dz = sprite.p[2] - cam.pos[2];
130    let dist_estimate = ftol(dx * cam.forward[0] + dy * cam.forward[1] + dz * cam.forward[2]);
131    let _ = (dist_estimate, KV6_MIPFACTOR_DEFAULT);
132    let mip = 0u32;
133    let ts = sprite.s;
134    let th = sprite.h;
135    let tf = sprite.f;
136
137    // Bound-cube centre + half-extents in camera-relative coords.
138    // (voxlap5.c:8852-8860; tp is centre offset from pivot, tp2 is
139    // axis half-extent.) kv->xsiz/ysiz/zsiz fit f32 exactly for
140    // any realistic kv6 (≤ 256³ per the file format limit).
141    #[allow(clippy::cast_precision_loss)]
142    let half_x = kv.xsiz as f32 * 0.5;
143    #[allow(clippy::cast_precision_loss)]
144    let half_y = kv.ysiz as f32 * 0.5;
145    #[allow(clippy::cast_precision_loss)]
146    let half_z = kv.zsiz as f32 * 0.5;
147    let off_x = half_x - kv.xpiv;
148    let off_y = half_y - kv.ypiv;
149    let off_z = half_z - kv.zpiv;
150    let npos = [
151        off_x * ts[0] + off_y * th[0] + off_z * tf[0] + dx,
152        off_x * ts[1] + off_y * th[1] + off_z * tf[1] + dy,
153        off_x * ts[2] + off_y * th[2] + off_z * tf[2] + dz,
154    ];
155    let nstr = [ts[0] * half_x, ts[1] * half_x, ts[2] * half_x];
156    let nhei = [th[0] * half_y, th[1] * half_y, th[2] * half_y];
157    let nfor = [tf[0] * half_z, tf[1] * half_z, tf[2] * half_z];
158
159    // 4-plane cull (voxlap5.c:8861-8875, walked z=3..0).
160    for n in &cam.nor {
161        let proj_str = (nstr[0] * n[0] + nstr[1] * n[1] + nstr[2] * n[2]).abs();
162        let proj_hei = (nhei[0] * n[0] + nhei[1] * n[1] + nhei[2] * n[2]).abs();
163        let proj_for = (nfor[0] * n[0] + nfor[1] * n[1] + nfor[2] * n[2]).abs();
164        let proj_pos = npos[0] * n[0] + npos[1] * n[1] + npos[2] * n[2];
165        if proj_str + proj_hei + proj_for + proj_pos < 0.0 {
166            return None;
167        }
168    }
169
170    Some(Kv6DrawSetup {
171        kv,
172        ts,
173        th,
174        tf,
175        mip,
176    })
177}
178
/// Compose two affine transforms, each given as three basis columns
/// plus an origin; port of voxlap's `mat2` (voxlap5.c:9619).
///
/// `(a_s, a_h, a_f, a_o)` is the camera transform and
/// `(b_s, b_h, b_f, b_o)` the sprite basis. The result
/// `(c_s, c_h, c_f, c_o)` is the sprite basis in camera-relative terms:
///
/// - `c_s = a_s * b_s.x + a_h * b_s.y + a_f * b_s.z` (likewise `c_h`,
///   `c_f` from `b_h`, `b_f`);
/// - `c_o` applies the same linear map to `b_o` and then adds `a_o`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn mat2(
    a_s: [f32; 3],
    a_h: [f32; 3],
    a_f: [f32; 3],
    a_o: [f32; 3],
    b_s: [f32; 3],
    b_h: [f32; 3],
    b_f: [f32; 3],
    b_o: [f32; 3],
) -> ([f32; 3], [f32; 3], [f32; 3], [f32; 3]) {
    // Linear part of `a` applied to one vector; each output row sums
    // the three products left to right.
    let rotate = |v: [f32; 3]| -> [f32; 3] {
        std::array::from_fn(|row| a_s[row] * v[0] + a_h[row] * v[1] + a_f[row] * v[2])
    };

    // The origin additionally receives a's translation, added last.
    let moved = rotate(b_o);
    let c_o = [moved[0] + a_o[0], moved[1] + a_o[1], moved[2] + a_o[2]];

    (rotate(b_s), rotate(b_h), rotate(b_f), c_o)
}
219
/// Clamp `a` into the inclusive range `[b, c]`; counterpart of voxlap's
/// `lbound(a, b, c)` (voxlap5.c:406).
///
/// # Panics
///
/// Panics if `b > c`; callers must pass an ordered range.
#[inline]
fn lbound(a: i32, b: i32, c: i32) -> i32 {
    assert!(b <= c);
    if a < b {
        b
    } else if a > c {
        c
    } else {
        a
    }
}
226
227/// State derived from `Kv6DrawSetup` + `CameraState` that the
228/// per-voxel iteration consumes. Voxlap holds these on the stack
229/// inside `kv6draw`; roxlap factors them out so the iteration loop
230/// can be tested independently.
231#[derive(Debug, Clone)]
232#[allow(dead_code)] // R6.4+ reads scisdist / qsum0 / cadd / etc.
233pub(crate) struct Kv6IterState<'a> {
234    pub kv: &'a Kv6,
235    /// Camera origin expressed in kv6-local voxel coordinates,
236    /// clamped to `[-1, kv.xsiz]` etc. by voxlap's `lbound`. Splits
237    /// the voxel grid into the 4 + 1 quadrants the iteration walks
238    /// in different orders so that for each (x, y) column the inner
239    /// z-loop visits voxels closer to the camera first (= correct
240    /// painter's-style ordering for the rasterizer in R6.4).
241    pub inx: i32,
242    pub iny: i32,
243    pub inz: i32,
244    /// `vx5.xplanemin` / `vx5.xplanemax` mirror — voxlap defaults
245    /// to `[0, INT_MAX]` (no x-clipping). Roxlap doesn't yet expose
246    /// a public knob for these; pinning to the defaults matches the
247    /// oracle and any caller that doesn't care.
248    pub nxplanemin: i32,
249    pub nxplanemax: i32,
250}
251
252/// Full per-frame rasterizer state for one sprite — what
253/// `drawboundcubesse` reads via voxlap's globals.
254///
255/// Built by [`kv6_compute_full_state`] from the post-cull
256/// `Kv6DrawSetup` + the camera's projection params. Mirror of the
257/// voxlap5.c:8915-8973 setup block + the qsum1/qbplbpp framebuffer
258/// state from `voxsetframebuffer` (voxlap5.c:11119-11122) +
259/// kv6colmul/kv6coladd from `updatereflects` (voxlap5.c:8466).
260#[derive(Debug, Clone)]
261pub(crate) struct Kv6FullState<'a> {
262    pub iter: Kv6IterState<'a>,
263    /// 8 cube-vertex offsets, gihz-scaled. `cadd4[k]` for `k = 0..7`
264    /// is the offset of cube vertex `k` from the voxel origin, where
265    /// bit 0 = +x, bit 1 = +z (post-swap == old +z), bit 2 = +y
266    /// (post-swap == old -y). `cadd4[0]` is `[0; 4]`. Lane 3 of
267    /// each entry duplicates lane 2 (z) — voxlap's SSE convenience.
268    pub cadd4: [[f32; 4]; 8],
269    /// Per-z step table: `ztab4_per_z[z] = z * cadd4[2]`. Length =
270    /// `kv.zsiz`. Indexed by `v.z` in `drawboundcubesse`.
271    pub ztab4_per_z: Vec<[f32; 4]>,
272    /// Initial r1 — the x=0 column base after voxlap's "ANNOYING
273    /// HACK" pre-decrement. = `(npos*gihz with z2=npos.z) -
274    /// cadd4[4]`. Iterates by `cadd4[1]` per x and (via r0) by
275    /// `cadd4[4]` per y.
276    pub r1_initial: [f32; 4],
277    /// `r2 = -ysiz * cadd4[4]`. Used to reset r0 between forward-y
278    /// and reverse-y phases inside one x column.
279    pub r2: [f32; 4],
280    /// Near-plane scissor distance (camera-space Z).
281    /// `voxlap5.c:8953-8956` — equals the negative sum of any
282    /// negative components of post-swap `nstr.z` / `nhei.z` /
283    /// `nfor.z`. `0.0` if all three are non-negative.
284    pub scisdist: f32,
285    /// Viewport-clip biases (voxlap5.c:8947-8948). Used by the SSE2
286    /// path's `paddsw` / `pmaxsw` AABB clipping; the scalar port clips
287    /// directly against `target.width` / `target.height`.
288    #[allow(dead_code)]
289    pub qsum0: [i16; 4],
290    /// Viewport-clip floor (voxlap5.c:11120).
291    #[allow(dead_code)]
292    pub qsum1: [i16; 4],
293    /// Framebuffer pixel-stride packed for `pmaddwd` (voxlap5.c:11121).
294    #[allow(dead_code)]
295    pub qbplbpp: [i16; 4],
296    /// Per-direction colour modulation table built by
297    /// [`update_reflects`]. Indexed by `v.dir` (256 entries). Each
298    /// entry packs four `u16` modulation factors (one per byte
299    /// channel) used by `_mm_mulhi_epu16` against the unpacked
300    /// voxel colour.
301    pub kv6colmul: Box<[u64; 256]>,
302    /// Fog bias added after the colour modulate. Zero when fog is
303    /// disabled (the oracle case).
304    pub kv6coladd: u64,
305}
306
/// Framebuffer + zbuffer pair, borrowed for the per-voxel rasterizer to
/// draw into.
///
/// Plays the role of voxlap's `kv6frameplace` + `zbuffermem`, but
/// addressed by row-major pixel index instead of byte pointer. `width`
/// / `height` have to agree with the `OpticastSettings.xres` / `yres`
/// that the per-frame `Kv6FullState` was built with, because the bounds
/// derived from `qsum0` / `qsum1` assume that geometry.
///
/// The buffers are held as raw pointers (in the manner of
/// [`crate::scalar_rasterizer::RasterTarget`]), which makes the type
/// `Copy + Send + Sync` so that the R12.4.2 [`draw_sprites_parallel`]
/// entry point can give every rayon worker closure its own copy.
/// Concurrent sprite draws contend for pixels through the z-test:
/// sprites that do not overlap never race, while overlapping pixels
/// with tied z may leak a race (visually indistinguishable, but the
/// frame hash becomes non-deterministic).
#[derive(Debug, Clone, Copy)]
pub struct DrawTarget<'a> {
    fb_ptr: *mut u32,
    fb_len: usize,
    zb_ptr: *mut f32,
    zb_len: usize,
    /// Row stride in pixels.
    pub pitch_pixels: usize,
    /// Visible width in pixels.
    pub width: u32,
    /// Visible height in pixels.
    pub height: u32,
    _marker: std::marker::PhantomData<&'a mut [u32]>,
}

// SAFETY: the struct stands in for the (`&'a mut [u32]`,
// `&'a mut [f32]`) pair taken by the constructor, and both of those are
// auto-`Send` for `T: Send`. For [`draw_sprites_parallel`] the aliasing
// contract is "the z-test arbitrates concurrent writes"; a tied-z race
// affects determinism, not memory safety.
unsafe impl Send for DrawTarget<'_> {}
unsafe impl Sync for DrawTarget<'_> {}

impl<'a> DrawTarget<'a> {
    /// Wrap exclusive borrows of a framebuffer and a zbuffer together
    /// with the framebuffer geometry.
    ///
    /// Both slices are consumed: their `&'a mut` borrow is what pins
    /// the lifetime, and all later access goes through the raw pointers
    /// stored in the struct.
    #[must_use]
    pub fn new(
        framebuffer: &'a mut [u32],
        zbuffer: &'a mut [f32],
        pitch_pixels: usize,
        width: u32,
        height: u32,
    ) -> Self {
        let (fb_len, zb_len) = (framebuffer.len(), zbuffer.len());
        DrawTarget {
            fb_ptr: framebuffer.as_mut_ptr(),
            fb_len,
            zb_ptr: zbuffer.as_mut_ptr(),
            zb_len,
            pitch_pixels,
            width,
            height,
            _marker: std::marker::PhantomData,
        }
    }

    /// Store `color` at pixel `idx` with no depth test. Intended for
    /// sequential 2D blitters (`drawtile`) that do not use the zbuffer.
    ///
    /// # Safety
    /// `idx` must be `< self.fb_len`. When several `Copy` instances of
    /// one `DrawTarget` are live on different threads, the caller must
    /// also keep their writes disjoint, because this method does NOT
    /// arbitrate through the z-test.
    #[inline]
    pub unsafe fn fb_write(self, idx: usize, color: u32) {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: the caller guarantees `idx` is in bounds and, for
        // parallel use, that writes are disjoint.
        unsafe {
            *self.fb_ptr.add(idx) = color;
        }
    }

    /// Load the pixel at `idx`. Serves read-modify-write alpha-blend
    /// paths (`drawtile` modulate-and-blend).
    ///
    /// # Safety
    /// `idx` must be `< self.fb_len`. A concurrent write to the same
    /// `idx` from another thread invalidates the read; sequential blits
    /// are race-free.
    #[inline]
    #[must_use]
    pub unsafe fn fb_read(self, idx: usize) -> u32 {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: the caller guarantees `idx` is in bounds.
        unsafe { *self.fb_ptr.add(idx) }
    }

    /// Depth-tested pixel write. When `z < zbuffer[idx]`, both the
    /// colour and the depth at `idx` are overwritten and `true` is
    /// returned; otherwise the buffers are left untouched and the
    /// result is `false`.
    ///
    /// # Safety
    /// `idx` must be `< self.fb_len` and `< self.zb_len`, since both
    /// buffers are accessed. Parallel callers are additionally bound by
    /// the wedge / z-test arbitration contract described on
    /// [`DrawTarget`].
    #[inline]
    #[must_use]
    pub unsafe fn z_test_write(self, idx: usize, color: u32, z: f32) -> bool {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
        // SAFETY: the caller guarantees `idx` is in bounds for both
        // buffers and upholds the concurrent-write contract.
        unsafe {
            let depth_slot = self.zb_ptr.add(idx);
            let closer = z < *depth_slot;
            if !closer {
                return false;
            }
            *depth_slot = z;
            *self.fb_ptr.add(idx) = color;
            true
        }
    }
}
426
/// Lane-wise sum of two 4-float vectors.
#[inline]
fn vec4_add(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    std::array::from_fn(|lane| a[lane] + b[lane])
}
431
/// Lane-wise difference `a - b` of two 4-float vectors.
#[inline]
fn vec4_sub(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    std::array::from_fn(|lane| a[lane] - b[lane])
}
436
/// Multiply every lane of a 4-float vector by the scalar `s`.
#[inline]
fn vec4_scale(a: [f32; 4], s: f32) -> [f32; 4] {
    a.map(|lane| lane * s)
}
441
442/// Sprite lighting + colour state — the subset of voxlap's
443/// `vx5` global that `updatereflects` reads. Built once per
444/// frame from [`Engine`] state and passed to [`draw_sprite`].
445///
446/// All fields mirror voxlap names:
447/// - `kv6col` ↔ `vx5.kv6col`
448/// - `lightmode` ↔ `vx5.lightmode`
449/// - `lights` ↔ `vx5.lightsrc[0..vx5.numlights]`
450///
451/// The `vx5.fogcol`/`ofogdist` fog plumbing is deferred — sprite
452/// fog stays off for now, matching the oracle path
453/// (`vx5.fogcol < 0` ⇒ `ofogdist == -1` in voxlap C, no fog).
454#[derive(Debug, Clone, Copy)]
455pub struct SpriteLighting<'a> {
456    /// Material colour. R==G==B triggers the cheaper nolighta path
457    /// in `update_reflects`; arbitrary RGB takes the per-channel
458    /// nolightb path; lightmode≥2 ignores the R==G==B fast path
459    /// and always does per-channel modulation.
460    pub kv6col: u32,
461    /// `0` / `1` → directional surface tint (lightmode<2 paths).
462    /// `2` → per-light shadow-side modulation against `lights`.
463    pub lightmode: u32,
464    /// Active point lights — voxlap's `vx5.lightsrc[..vx5.numlights]`.
465    /// Empty for lightmode<2; populated for lightmode≥2.
466    pub lights: &'a [LightSrc],
467}
468
469impl<'a> SpriteLighting<'a> {
470    /// Snapshot the lighting + colour subset of an [`Engine`].
471    /// Use this once per frame in the host so the sprite render
472    /// reflects engine setters made between frames.
473    #[must_use]
474    pub fn from_engine(engine: &'a Engine) -> Self {
475        Self {
476            kv6col: engine.kv6col(),
477            lightmode: engine.lightmode(),
478            lights: engine.lights(),
479        }
480    }
481}
482
483impl SpriteLighting<'static> {
484    /// Default oracle config — grey `kv6col`, lightmode 0, no
485    /// lights. Used by `roxlap-oracle` so the four sprite golden
486    /// hashes stay byte-stable: this is the exact state voxlap C's
487    /// oracle has when it calls `drawsprite`.
488    #[must_use]
489    pub fn default_oracle() -> Self {
490        Self {
491            kv6col: DEFAULT_KV6COL,
492            lightmode: 0,
493            lights: &[],
494        }
495    }
496}
497
498/// Builds `kv6colmul[256]` + `kv6coladd[0]` from the engine's
499/// sprite lighting state. Mirror of voxlap's `updatereflects`
500/// (`voxlap5.c:8466-8750`).
501///
502/// Branches:
503/// - `lightmode < 2` + R==G==B `kv6col` → nolighta (cheap
504///   single-multiplier path, voxlap5.c:8553-8584).
505/// - `lightmode < 2` + arbitrary `kv6col` → nolightb (per-channel
506///   path, voxlap5.c:8587-8629).
507/// - `lightmode >= 2` → per-light shadow-side modulation
508///   (voxlap5.c:8631-8750), iterating the active `lights`.
509///
510/// `flags & 1` (disable shading) and the active-fog path remain
511/// deferred — neither is exercised by the oracle's four sprite
512/// poses, and adding them is a follow-up that doesn't change the
513/// already-frozen hashes.
514///
515fn update_reflects(sprite: &Sprite, lighting: &SpriteLighting<'_>) -> (Box<[u64; 256]>, u64) {
516    // Sprite fog plumbing is a follow-up — `vx5.fogcol < 0` (voxlap
517    // C oracle's set_fogcol(BR(...)) state) means ofogdist stays -1,
518    // fogmul = 0, kv6coladd[0] = 0. We pin to that here.
519    let fogmul_lo: u32 = 0;
520    let kv6coladd: u64 = 0;
521
522    let kv6col = lighting.kv6col;
523
524    // g = ((fogmul & 32767) ^ 32767) * (16*8/65536). With fogmul=0:
525    //   g = 32767 * (128/65536) ≈ 63.998.
526    let g_pre = ((((fogmul_lo & 0x7fff) ^ 0x7fff) as i32) as f32) * (16.0 * 8.0 / 65536.0);
527
528    let mut kv6colmul = Box::new([0u64; 256]);
529
530    if lighting.lightmode < 2 {
531        // (voxlap5.c:8538-8543) fx=fy=fz=1.0; tp = sum of basis vectors.
532        let tp_x = sprite.s[0] + sprite.h[0] + sprite.f[0];
533        let tp_y = sprite.s[1] + sprite.h[1] + sprite.f[1];
534        let tp_z = sprite.s[2] + sprite.h[2] + sprite.f[2];
535
536        let f0 = 64.0_f32 / (tp_x * tp_x + tp_y * tp_y + tp_z * tp_z).sqrt();
537
538        // R==G==B test: ((kv6col & 0xffff) << 8) ^ (kv6col & 0xffff00)
539        //   == 0  iff  R == G and G == B.
540        let lo16 = kv6col & 0xffff;
541        let mid24 = kv6col & 0x00ff_ff00;
542        let is_grey = ((lo16 << 8) ^ mid24) == 0;
543
544        if is_grey {
545            // Nolighta path (voxlap5.c:8553-8584): grey kv6col absorbs
546            // into a single multiplier per direction.
547            let g = g_pre * (((kv6col & 0xff) as f32) / 256.0);
548            let f = f0 * g;
549
550            let l0 = (tp_x * f) as i16; // (short)(...) is C truncating cast
551            let l1 = (tp_y * f) as i16;
552            let l2 = (tp_z * f) as i16;
553            let l3 = (g * 128.0) as i16;
554
555            let iu = iunivec();
556            for k in 0..256 {
557                let w = dot_iunivec_i16x4(iu[k], [l0, l1, l2, l3]);
558                let w64 = u64::from(w);
559                kv6colmul[k] = w64 | (w64 << 16) | (w64 << 32) | (w64 << 48);
560            }
561        } else {
562            // Nolightb path (voxlap5.c:8587-8629). Per-channel
563            // modulation factor M_k = (kv6col_byte_k << 8) → mulhi_pu16
564            // by the per-direction dot. Same dot derivation as nolighta.
565            let f = f0 * g_pre;
566
567            let l0 = (tp_x * f) as i16;
568            let l1 = (tp_y * f) as i16;
569            let l2 = (tp_z * f) as i16;
570            let l3 = (g_pre * 128.0) as i16;
571
572            let m = kv6col_channel_mods(kv6col);
573
574            let iu = iunivec();
575            for k in 0..256 {
576                let w = dot_iunivec_i16x4(iu[k], [l0, l1, l2, l3]);
577                kv6colmul[k] = pack_modulated_word(w, m);
578            }
579        }
580    } else {
581        // Lightmode≥2 path (voxlap5.c:8631-8750): per-sprite point
582        // lighting from `lighting.lights`. Each light projects onto
583        // the sprite's normalised basis; per-direction kv6colmul[i]
584        // starts from a synthetic ambient slot and subtracts shadow
585        // contributions from each light's "negative" lanes.
586        let m = kv6col_channel_mods(kv6col);
587        build_kv6colmul_lightmode2(sprite, lighting.lights, &mut kv6colmul, fogmul_lo, m);
588    }
589
590    (kv6colmul, kv6coladd)
591}
592
/// Scalar model of voxlap's `pmaddwd(iunivec[k], lightlist)` reduction:
/// the four i16×i16 products are summed modulo 2^32 and the high 16
/// bits of the sum are returned. The result is the `u16` modulation
/// factor before any per-channel packing.
#[inline]
fn dot_iunivec_i16x4(u: [i16; 4], l: [i16; 4]) -> u16 {
    let sum = u.iter().zip(l.iter()).fold(0u32, |acc, (&a, &b)| {
        // Each product fits i32; reinterpret it as u32 so the
        // accumulation wraps the way the asm's dword adds do.
        acc.wrapping_add(i32::from(a).wrapping_mul(i32::from(b)) as u32)
    });
    (sum >> 16) as u16
}
608
/// Split `kv6col` into its four bytes (lowest byte first) and shift
/// each into the top half of a `u16`, giving the `M_k =
/// kv6col_byte_k << 8` factors that the nolightb and lightmode≥2 paths
/// multiply against the per-direction dot via `pmulhuw`.
#[inline]
fn kv6col_channel_mods(kv6col: u32) -> [u16; 4] {
    kv6col.to_le_bytes().map(|byte| u16::from(byte) << 8)
}
621
/// Build one `kv6colmul[k]` entry: for each channel `c` compute
/// `(W * M_c) >> 16` and pack the four 16-bit results into a `u64`,
/// with channel 0 in the lowest word.
#[inline]
fn pack_modulated_word(w_dot: u16, m: [u16; 4]) -> u64 {
    let w = u32::from(w_dot);
    // Walk the channels from highest to lowest so that each shift moves
    // the earlier words up and channel 0 lands in bits 0..16.
    m.iter().rev().fold(0u64, |packed, &factor| {
        let word = ((w * u32::from(factor)) >> 16) as u16;
        (packed << 16) | u64::from(word)
    })
}
633
634/// Lightmode≥2 path body — voxlap5.c:8631-8750. Builds the full
635/// `kv6colmul[256]` from the active light list.
636///
637/// Steps:
638/// 1. Normalise each sprite-basis axis (`sprs`/`sprh`/`sprf`).
639/// 2. For each light within `r2` of the sprite, compute its
640///    intensity falloff `h` and project the world-space delta onto
641///    the normalised sprite basis → store in `lightlist[k]`.
642/// 3. Append a synthetic ambient slot (voxlap's hardcoded
643///    `(fx, fy, fz) = (0, 0.5, 1.0)` direction) at
644///    `lightlist[lightcnt]`.
645/// 4. For each direction `idx ∈ 0..256`:
646///    - `base = ambient_slot · iunivec[idx]` (treated as one u32).
647///    - For each real light `k`: compute `dot = light_k ·
648///      iunivec[idx]`, split into low/high i16 lanes (asm-faithful
649///      "16-bits-is-ugly-but-ok-here" quirk); subtract the negative
650///      lanes from `base` (= shadow side of the surface).
651///    - `W = base >> 16`, then per-channel modulate against `M_c`
652///      and pack into `kv6colmul[idx]`.
653fn build_kv6colmul_lightmode2(
654    sprite: &Sprite,
655    lights: &[LightSrc],
656    kv6colmul: &mut [u64; 256],
657    fogmul_lo: u32,
658    m: [u16; 4],
659) {
660    // (voxlap5.c:8638-8643) Normalise sprite basis. WARNING from
661    // voxlap: only correct for orthonormal sprite-bases; non-
662    // orthogonal bases (e.g. shears) drift. The four oracle sprite
663    // poses are all orthonormal so this matches voxlap's behaviour.
664    let sprs = normalise(sprite.s);
665    let sprh = normalise(sprite.h);
666    let sprf = normalise(sprite.f);
667
668    // hh = ((fogmul & 32767) ^ 32767) / 65536 * 2 (voxlap5.c:8645).
669    // With fogmul=0 → hh = 32767 / 65536 * 2 ≈ 1.0. This is a
670    // distinct scaling from `g_pre` (= same numerator * 128/65536
671    // for the lightmode<2 path) — they differ by a factor of 64.
672    // An earlier port mistakenly derived hh from g_pre / 128 = 0.5,
673    // giving sprites half the intended ambient brightness.
674    let hh_initial = ((((fogmul_lo & 0x7fff) ^ 0x7fff) as i32) as f32) * (2.0 / 65536.0);
675
676    // Project each in-range light onto the sprite basis.
677    let mut lightlist: [[i16; 4]; MAX_LIGHTS + 1] = [[0; 4]; MAX_LIGHTS + 1];
678    let mut lightcnt: usize = 0;
679    for light in lights.iter().rev() {
680        if lightcnt >= MAX_LIGHTS {
681            break;
682        }
683        let fx = light.pos[0] - sprite.p[0];
684        let fy = light.pos[1] - sprite.p[1];
685        let fz = light.pos[2] - sprite.p[2];
686        let gg = fx * fx + fy * fy + fz * fz;
687        let ff = light.r2;
688        // Voxlap's `*(int32_t *)&gg < *(int32_t *)&ff` is a bit-
689        // pattern compare. For non-negative finite floats the bit
690        // order matches the magnitude order, so `gg < ff` is
691        // equivalent (and safer in the presence of NaN: NaN !< x
692        // for any x, matching voxlap's float-bit-cast trick).
693        if gg >= ff || gg <= 0.0 {
694            continue;
695        }
696        let f = ff.sqrt();
697        let g = gg.sqrt();
698        // h = (f*ff - g*gg) / (f*ff*g*gg) * sc * 16
699        let mut h = (f * ff - g * gg) / (f * ff * g * gg) * light.sc * 16.0;
700        if g * h > 4096.0 {
701            h = 4096.0 / g; // saturation clip
702        }
703        h *= hh_initial;
704        let l0 = (fx * sprs[0] + fy * sprs[1] + fz * sprs[2]) * h;
705        let l1 = (fx * sprh[0] + fy * sprh[1] + fz * sprh[2]) * h;
706        let l2 = (fx * sprf[0] + fy * sprf[1] + fz * sprf[2]) * h;
707        lightlist[lightcnt] = [l0 as i16, l1 as i16, l2 as i16, 0];
708        lightcnt += 1;
709    }
710
711    // Synthetic ambient slot: voxlap's hardcoded direction
712    // (fx, fy, fz) = (0, 0.5, 1.0) projected onto the sprite basis,
713    // scaled by `hh * 16*16*8/2 = hh * 1024`. The lane-3 bias is
714    // `hh * 48 / 16 = hh * 3`.
715    let amb_fx = 0.0_f32;
716    let amb_fy = 0.5_f32;
717    let amb_fz = 1.0_f32;
718    let hh = hh_initial * (16.0 * 16.0 * 8.0 / 2.0);
719    let al0 = (sprs[0] * amb_fx + sprs[1] * amb_fy + sprs[2] * amb_fz) * hh;
720    let al1 = (sprh[0] * amb_fx + sprh[1] * amb_fy + sprh[2] * amb_fz) * hh;
721    let al2 = (sprf[0] * amb_fx + sprf[1] * amb_fy + sprf[2] * amb_fz) * hh;
722    let al3 = hh * (48.0 / 16.0);
723    lightlist[lightcnt] = [al0 as i16, al1 as i16, al2 as i16, al3 as i16];
724
725    let iu = iunivec();
726    for idx in 0..256 {
727        let u = iu[idx];
728        // Ambient base = lightlist[lightcnt] · iunivec[idx], in u32
729        // wrapping arithmetic (asm summed the pmaddwd dword lanes
730        // mod 2^32).
731        let u0 = i32::from(u[0]);
732        let u1 = i32::from(u[1]);
733        let u2 = i32::from(u[2]);
734        let u3 = i32::from(u[3]);
735        let amb = lightlist[lightcnt];
736        let base_lo = (u0.wrapping_mul(i32::from(amb[0]))) as u32;
737        let base_lo = base_lo.wrapping_add((u1.wrapping_mul(i32::from(amb[1]))) as u32);
738        let base_hi = (u2.wrapping_mul(i32::from(amb[2]))) as u32;
739        let base_hi = base_hi.wrapping_add((u3.wrapping_mul(i32::from(amb[3]))) as u32);
740        let mut base = base_lo.wrapping_add(base_hi);
741
742        // For each real light, compute dot, then subtract its
743        // "negative" half-lanes from `base` (= shadow side).
744        for k in (0..lightcnt).rev() {
745            let l = lightlist[k];
746            let klo = (u0.wrapping_mul(i32::from(l[0]))) as u32;
747            let klo = klo.wrapping_add((u1.wrapping_mul(i32::from(l[1]))) as u32);
748            let khi = (u2.wrapping_mul(i32::from(l[2]))) as u32;
749            let khi = khi.wrapping_add((u3.wrapping_mul(i32::from(l[3]))) as u32);
750            let dot = klo.wrapping_add(khi);
751            // Voxlap quirk: 32-bit dot but pminsw is per-i16 lane.
752            // Light magnitudes stay clamped enough that the
753            // mixed-lane behaviour is benign — port faithfully.
754            let lo16 = (dot & 0xffff) as i16;
755            let hi16 = ((dot >> 16) & 0xffff) as i16;
756            let lo16c: u16 = if lo16 < 0 { lo16 as u16 } else { 0 };
757            let hi16c: u16 = if hi16 < 0 { hi16 as u16 } else { 0 };
758            let sub = (u32::from(hi16c) << 16) | u32::from(lo16c);
759            base = base.wrapping_sub(sub);
760        }
761
762        let w_dot = (base >> 16) as u16;
763        kv6colmul[idx] = pack_modulated_word(w_dot, m);
764    }
765}
766
/// Scale a 3-vector to unit length.
///
/// When the squared length evaluates to zero (or less) the input is
/// handed back untouched instead of dividing by zero. Voxlap's
/// `1.0 / sqrt(...)` would produce NaN for a zero basis axis, but the
/// C code is never handed one.
#[inline]
fn normalise(v: [f32; 3]) -> [f32; 3] {
    let [x, y, z] = v;
    let len_sq = x * x + y * y + z * z;
    // Keep the `<=` form: a NaN length must fall through to the
    // division branch, exactly like a positive one.
    if len_sq <= 0.0 {
        v
    } else {
        let inv = 1.0 / len_sq.sqrt();
        v.map(|component| component * inv)
    }
}
780
781/// Full setup: mat2 + Cramer's + nfor↔nhei swap + cadd4/ztab4/r1/r2/
782/// scisdist/qsum0 init. Mirror of voxlap5.c:8915-8973.
783pub(crate) fn kv6_compute_full_state<'a>(
784    setup: &Kv6DrawSetup<'a>,
785    sprite: &Sprite,
786    lighting: &SpriteLighting<'_>,
787    cam: &CameraState,
788    settings: &OpticastSettings,
789    fb_width: u32,
790    fb_height: u32,
791    fb_pitch_pixels: usize,
792) -> Kv6FullState<'a> {
793    let sprite_pos = sprite.p;
794    let kv = setup.kv;
795
796    // Transform sprite basis from world to camera-relative
797    // screen-axis coords (voxlap5.c:8916). `(gixs, giys, gizs)` is
798    // the transposed camera basis; `giadd` is the translation half.
799    let (nstr, mut nhei, mut nfor, mut npos) = mat2(
800        cam.xs, cam.ys, cam.zs, cam.add, setup.ts, setup.th, setup.tf, sprite_pos,
801    );
802
803    // Shift `npos` so it points at the kv6 origin (corner [0,0,0])
804    // rather than the pivot point — Cramer's rule below solves for
805    // the camera origin in kv6-local voxel coords, which only makes
806    // sense relative to the corner. (voxlap5.c:8917-8919)
807    npos[0] -= kv.xpiv * nstr[0] + kv.ypiv * nhei[0] + kv.zpiv * nfor[0];
808    npos[1] -= kv.xpiv * nstr[1] + kv.ypiv * nhei[1] + kv.zpiv * nfor[1];
809    npos[2] -= kv.xpiv * nstr[2] + kv.ypiv * nhei[2] + kv.zpiv * nfor[2];
810
811    // Cramer's rule for `nstr * X + nhei * Y + nfor * Z + npos = 0`.
812    // (voxlap5.c:8923-8936)
813    let tp = [
814        nhei[1] * nfor[2] - nfor[1] * nhei[2],
815        nfor[1] * nstr[2] - nstr[1] * nfor[2],
816        nstr[1] * nhei[2] - nhei[1] * nstr[2],
817    ];
818    let det = nstr[0] * tp[0] + nhei[0] * tp[1] + nfor[0] * tp[2];
819    // Float-bit comparison against zero: matches voxlap's
820    // `if (f != 0)` and dodges clippy::float_cmp.
821    let (raw_inx, raw_iny, raw_inz) = if det.to_bits() & 0x7fff_ffff != 0 {
822        let f_inv = -1.0 / det;
823        let tp2 = [
824            npos[1] * nfor[2] - nfor[1] * npos[2],
825            nhei[1] * npos[2] - npos[1] * nhei[2],
826            npos[1] * nstr[2] - nstr[1] * npos[2],
827        ];
828        (
829            ftol((npos[0] * tp[0] - nhei[0] * tp2[0] - nfor[0] * tp2[1]) * f_inv),
830            ftol((npos[0] * tp[1] + nstr[0] * tp2[0] - nfor[0] * tp2[2]) * f_inv),
831            ftol((npos[0] * tp[2] + nstr[0] * tp2[1] + nhei[0] * tp2[2]) * f_inv),
832        )
833    } else {
834        (-1, -1, -1)
835    };
836
837    let xsiz_i = kv.xsiz as i32;
838    let ysiz_i = kv.ysiz as i32;
839    let zsiz_i = kv.zsiz as i32;
840    let iter = Kv6IterState {
841        kv,
842        inx: lbound(raw_inx, -1, xsiz_i),
843        iny: lbound(raw_iny, -1, ysiz_i),
844        inz: lbound(raw_inz, -1, zsiz_i),
845        // Voxlap default `vx5.xplanemin = 0`, `xplanemax = 0x7fffffff`.
846        nxplanemin: 0,
847        nxplanemax: i32::MAX,
848    };
849
850    // Swap `nhei` ↔ `nfor` with sign flip on the new `nfor`
851    // (voxlap5.c:8942-8944). Equivalent to a 90° rotation that lines
852    // the basis up with cadd4's bit-encoded vertex offsets:
853    //   cadd4[1] = +x  (post-swap nstr direction)
854    //   cadd4[2] = +z  (post-swap nhei direction == original +z)
855    //   cadd4[4] = +y  (post-swap nfor direction == original -y)
856    // After this point `nfor` / `nhei` carry the post-swap values.
857    let swap_x = nhei[0];
858    nhei[0] = nfor[0];
859    nfor[0] = -swap_x;
860    let swap_y = nhei[1];
861    nhei[1] = nfor[1];
862    nfor[1] = -swap_y;
863    let swap_z = nhei[2];
864    nhei[2] = nfor[2];
865    nfor[2] = -swap_z;
866
867    // qsum0 (voxlap5.c:8947-8948). The `0x7fff - (xres - hx)`
868    // form sets the bias such that adding it to a screen-space
869    // bound makes the bound saturate-positive when it lands
870    // inside the viewport.
871    let xres_i = settings.xres as i32;
872    let yres_i = settings.yres as i32;
873    let hx_i = ftol(settings.hx);
874    let hy_i = ftol(settings.hy);
875    let qsum0_x = (0x7fff - (xres_i - hx_i)) as i16;
876    let qsum0_y = (0x7fff - (yres_i - hy_i)) as i16;
877    let qsum0 = [qsum0_x, qsum0_y, qsum0_x, qsum0_y];
878
879    // scisdist (voxlap5.c:8953-8956). Voxlap's `*(int32_t *)&f < 0`
880    // bit-trick: a positive-finite float has bit-pattern >= 0;
881    // only *negative* floats land < 0 as signed int. So this loop
882    // sums the absolute value of any negative-z post-swap basis
883    // component into a near-plane bias.
884    let mut scisdist = 0.0f32;
885    if (nstr[2].to_bits() as i32) < 0 {
886        scisdist -= nstr[2];
887    }
888    if (nhei[2].to_bits() as i32) < 0 {
889        scisdist -= nhei[2];
890    }
891    if (nfor[2].to_bits() as i32) < 0 {
892        scisdist -= nfor[2];
893    }
894
895    // cadd4 step table (voxlap5.c:8958-8961). cadd4[1/2/4] are the
896    // three primary axis steps (x / z / y, post-swap); cadd4[3/5/6/7]
897    // are bit-OR sums (3 = 1+2, 5 = 1+4, 6 = 2+4, 7 = 3+4).
898    let gihz = settings.hz;
899    let cadd1 = [nstr[0] * gihz, nstr[1] * gihz, nstr[2], nstr[2]];
900    let cadd2 = [nhei[0] * gihz, nhei[1] * gihz, nhei[2], nhei[2]];
901    let cadd4_axis = [nfor[0] * gihz, nfor[1] * gihz, nfor[2], nfor[2]];
902    let cadd3 = vec4_add(cadd1, cadd2);
903    let cadd5 = vec4_add(cadd1, cadd4_axis);
904    let cadd6 = vec4_add(cadd2, cadd4_axis);
905    let cadd7 = vec4_add(cadd3, cadd4_axis);
906    let cadd4 = [
907        [0.0; 4], cadd1, cadd2, cadd3, cadd4_axis, cadd5, cadd6, cadd7,
908    ];
909
910    // ztab4 per-z step table (voxlap5.c:8973). ztab4[z] = z * cadd4[2]
911    // built incrementally by addps so per-step rounding matches.
912    let zsiz = kv.zsiz as usize;
913    let mut ztab4_per_z = Vec::with_capacity(zsiz);
914    if zsiz > 0 {
915        ztab4_per_z.push([0.0f32; 4]);
916        for i in 1..zsiz {
917            let prev = ztab4_per_z[i - 1];
918            ztab4_per_z.push(vec4_add(prev, cadd4[2]));
919        }
920    }
921
922    // r1 init (voxlap5.c:8961, 8976). Post-mat2 npos becomes the
923    // raw column-base; gihz-scale x/y; z lane keeps unscaled npos.z;
924    // z2 lane (lane 3) duplicates z. Then "ANNOYING HACK"
925    // pre-decrement by cadd4[4].
926    let r1_pre = [npos[0] * gihz, npos[1] * gihz, npos[2], npos[2]];
927    let r1_initial = vec4_sub(r1_pre, cadd4[4]);
928
929    // r2 = -ysiz * cadd4[4] (voxlap5.c:8974). intss + mulps in voxlap.
930    let r2 = vec4_scale(cadd4[4], -(ysiz_i as f32));
931
932    // qsum1 + qbplbpp from voxsetframebuffer (voxlap5.c:11119-11122).
933    // The framebuffer geometry is independent of the camera projection
934    // — these are derived from `(width, height, pitch_bytes)`.
935    let pitch_bytes = (fb_pitch_pixels as i32).saturating_mul(4);
936    let qsum1_x = 0x7fff_i32 - fb_width as i32;
937    let qsum1_y = 0x7fff_i32 - fb_height as i32;
938    let qsum1 = [
939        qsum1_x as i16,
940        qsum1_y as i16,
941        qsum1_x as i16,
942        qsum1_y as i16,
943    ];
944    let qbplbpp = [4i16, pitch_bytes as i16, 4, pitch_bytes as i16];
945
946    let (kv6colmul, kv6coladd) = update_reflects(sprite, lighting);
947
948    Kv6FullState {
949        iter,
950        cadd4,
951        ztab4_per_z,
952        r1_initial,
953        r2,
954        scisdist,
955        qsum0,
956        qsum1,
957        qbplbpp,
958        kv6colmul,
959        kv6coladd,
960    }
961}
962
963/// Per-voxel rasterizer (R6.4 complete).
964///
965/// Mirror of `voxlap5.c:8179-8320` (`drawboundcubesse`). For each
966/// voxel:
967/// 1. `effmask = mask & v.vis` early-out.
968/// 2. `origin = r0 + ztab4_per_z[v.z]`; scissor on `origin.z`.
969/// 3. Look up `ptfaces16[effmask]` — `face[0]` = 4 or 6 vertex
970///    count, `face[1..7]` = byte offsets into `caddasm` (the
971///    `cadd4[8]` array, each entry 16 bytes).
972/// 4. For each vertex pair (a, b), compute the projected screen
973///    coords as `(cadd4[a] + origin).xy / (cadd4[a] + origin).z`
974///    via `_mm_rcp_ps`.
975/// 5. Pack the 4 (or 6) projected vertices to int16, min/max-reduce
976///    to a single screen-AABB, viewport-clip via `qsum0` /
977///    `qsum1`, and early-out on degenerate rect.
978/// 6. Compute the per-voxel colour via the `mm5` cross-call tail +
979///    `kv6colmul[v.dir]` + `kv6coladd[0]` modulation.
980/// 7. Fill the screen rectangle with z-test + framebuffer write.
981///
982/// Returns the number of pixels actually written (z-test passing).
983/// Tests use this as a sanity gate; production callers ignore it.
984///
985/// `mm5_tail` is voxlap's static cross-call register tail
986/// (voxlap5.c:8170-8177). It carries one byte of contribution from
987/// the previous voxel's colour into the current; bit-equality with
988/// the asm requires preserving it across calls within one sprite.
989///
990/// Currently x86_64-only — relies on `_mm_rcp_ps` for bit-equality
991/// with voxlap C. NEON / wasm ports will need their own goldens
992/// (see `PORTING-RUST.md` R9 / R10).
993#[cfg(target_arch = "x86_64")]
994#[allow(clippy::trivially_copy_pass_by_ref)] // hot loop; matches voxlap's pointer-passed v.
995pub(crate) fn drawboundcubesse(
996    v: &Voxel,
997    mask: u32,
998    state: &Kv6FullState<'_>,
999    r0: [f32; 4],
1000    mm5_tail: &mut u32,
1001    target: &mut DrawTarget<'_>,
1002) -> u32 {
1003    use core::arch::x86_64::{
1004        __m128, __m128i, _mm_add_epi16, _mm_add_ps, _mm_adds_epi16, _mm_cvtsi128_si32,
1005        _mm_cvtsi32_si128, _mm_cvttps_epi32, _mm_loadl_epi64, _mm_loadu_ps, _mm_madd_epi16,
1006        _mm_max_epi16, _mm_min_epi16, _mm_movehl_ps, _mm_movelh_ps, _mm_mul_ps, _mm_mulhi_epu16,
1007        _mm_packs_epi32, _mm_packus_epi16, _mm_rcp_ps, _mm_setzero_si128, _mm_shufflelo_epi16,
1008        _mm_storeu_ps, _mm_storeu_si128, _mm_subs_epu16, _mm_unpackhi_epi64, _mm_unpacklo_epi32,
1009        _mm_unpacklo_epi8,
1010    };
1011
1012    let effmask = (mask & u32::from(v.vis)) as usize;
1013    if effmask == 0 || effmask >= PTFACES16.len() {
1014        return 0;
1015    }
1016    let face = PTFACES16[effmask];
1017    if face[0] == 0 {
1018        return 0;
1019    }
1020
1021    // origin = r0 + ztab4_per_z[v.z] (4 f32 lanes, [x*hz, y*hz, z, z]).
1022    let z_idx = v.z as usize;
1023    if z_idx >= state.ztab4_per_z.len() {
1024        return 0;
1025    }
1026    let ztep = state.ztab4_per_z[z_idx];
1027    // SAFETY: `_mm_loadu_ps` reads 16 unaligned bytes from a 4-f32
1028    // array (which is 16 bytes); subsequent intrinsics are SSE2
1029    // baseline on x86_64.
1030    unsafe {
1031        let r0_v = _mm_loadu_ps(r0.as_ptr());
1032        let ztep_v = _mm_loadu_ps(ztep.as_ptr());
1033        let origin_v: __m128 = _mm_add_ps(r0_v, ztep_v);
1034        let mut origin_arr = [0.0f32; 4];
1035        _mm_storeu_ps(origin_arr.as_mut_ptr(), origin_v);
1036        if origin_arr[2] < state.scisdist {
1037            return 0;
1038        }
1039
1040        // Project vertex pair (a, b). Returns __m128 with lanes:
1041        //   [b.x_proj, b.y_proj, a.x_proj, a.y_proj]
1042        // The byte offsets in face[k] index `caddasm` (= bytes into a
1043        // [point4d; 8] = [[f32; 4]; 8]); divide by 16 (= sizeof point4d)
1044        // to land back at the cadd4 index.
1045        let project = |off_a: u8, off_b: u8| -> __m128 {
1046            let a = state.cadd4[(off_a >> 4) as usize];
1047            let b = state.cadd4[(off_b >> 4) as usize];
1048            let wva = _mm_add_ps(_mm_loadu_ps(a.as_ptr()), origin_v);
1049            let wvb = _mm_add_ps(_mm_loadu_ps(b.as_ptr()), origin_v);
1050            let wv0 = _mm_movehl_ps(wva, wvb); // [b.z, b.z, a.z, a.z]
1051            let wv1 = _mm_movelh_ps(wvb, wva); // [b.x, b.y, a.x, a.y]
1052            let wv0_inv = _mm_rcp_ps(wv0);
1053            _mm_mul_ps(wv0_inv, wv1)
1054        };
1055
1056        let pair01 = project(face[1], face[2]);
1057        let pair23 = project(face[3], face[4]);
1058
1059        // Convert to int32 (truncate-toward-zero), pack to int16.
1060        // pack01_int16 lanes 0..3 = [v1x, v1y, v0x, v0y]
1061        // pack01_int16 lanes 4..7 = [v3x, v3y, v2x, v2y]
1062        let p01_i32 = _mm_cvttps_epi32(pair01);
1063        let p23_i32 = _mm_cvttps_epi32(pair23);
1064        let pack_lo = _mm_packs_epi32(p01_i32, p23_i32);
1065        let pack01 = pack_lo;
1066        let pack23 = _mm_unpackhi_epi64(pack_lo, _mm_setzero_si128());
1067        let mut mm_min = _mm_min_epi16(pack01, pack23);
1068        let mut mm_max = _mm_max_epi16(pack01, pack23);
1069
1070        if face[0] != 4 {
1071            let pair45 = project(face[5], face[6]);
1072            let p45_i32 = _mm_cvttps_epi32(pair45);
1073            let pack45 = _mm_packs_epi32(p45_i32, _mm_setzero_si128());
1074            mm_min = _mm_min_epi16(mm_min, pack45);
1075            mm_max = _mm_max_epi16(mm_max, pack45);
1076        }
1077
1078        // shufflelo(_, 0x0e) brings high half (lanes 2..3) into low
1079        // half so min/max collapses across all 4 (or 6) vertices.
1080        let mm_min_hi = _mm_shufflelo_epi16(mm_min, 0x0e);
1081        let mm_max_hi = _mm_shufflelo_epi16(mm_max, 0x0e);
1082        let mm_min_red = _mm_min_epi16(mm_min, mm_min_hi);
1083        let mm_max_red = _mm_max_epi16(mm_max, mm_max_hi);
1084
1085        // bounds = unpacklo(mm_min, mm_max) lanes 0..3 (i16)
1086        //        = [min_x, max_x, min_y, max_y]  ?
1087        // Actually: _mm_unpacklo_epi32 interleaves 32-bit lanes.
1088        // Low 32 of mm_min = (mm_min[0], mm_min[1]) i.e. (min_x, min_y).
1089        // Low 32 of mm_max similarly. After unpacklo_epi32:
1090        //   lanes_32[0] = mm_min low32, lanes_32[1] = mm_max low32
1091        //   → 4 i16: [min_x, min_y, max_x, max_y]
1092        let bounds = _mm_unpacklo_epi32(mm_min_red, mm_max_red);
1093
1094        // Apply qsum0 (saturated add) + qsum1 (max-floor). Both are
1095        // 8-byte values loaded into the low 64 bits of __m128i.
1096        let qsum0_v = _mm_loadl_epi64(state.qsum0.as_ptr().cast::<__m128i>());
1097        let qsum1_v = _mm_loadl_epi64(state.qsum1.as_ptr().cast::<__m128i>());
1098        let bounds = _mm_adds_epi16(bounds, qsum0_v);
1099        let bounds = _mm_max_epi16(bounds, qsum1_v);
1100
1101        // dxdy = subs_epu16(bounds_hi, bounds) — saturating unsigned
1102        // subtract, with bounds_hi being lanes [2,3,2,3] of bounds.
1103        let bounds_hi = _mm_shufflelo_epi16(bounds, 0xee);
1104        let dxdy = _mm_subs_epu16(bounds_hi, bounds);
1105        let dxdy_low = _mm_cvtsi128_si32(dxdy) as u32;
1106        let dx = (dxdy_low & 0xffff) as i32;
1107        if dx == 0 {
1108            return 0;
1109        }
1110        let dy = ((dxdy_low >> 16) as i32) - 1;
1111        if dy < 0 {
1112            return 0;
1113        }
1114
1115        // Recover pixel coords from bounds + qsum1. Bounds[0/1] are
1116        // currently in the saturated [0x7fff - res, 0x7fff] range;
1117        // pixel = bounds - qsum1.
1118        let mut bounds_arr = [0i16; 8];
1119        _mm_storeu_si128(bounds_arr.as_mut_ptr().cast::<__m128i>(), bounds);
1120        let pixel_min_x = i32::from(bounds_arr[0]) - i32::from(state.qsum1[0]);
1121        let pixel_min_y = i32::from(bounds_arr[1]) - i32::from(state.qsum1[1]);
1122
1123        // pmaddwd is consumed for completeness so the asm-equivalent
1124        // pixel-byte-offset is computable; not strictly needed since
1125        // we index directly via (pixel_min_x, pixel_min_y).
1126        let qbplbpp_v = _mm_loadl_epi64(state.qbplbpp.as_ptr().cast::<__m128i>());
1127        let _ = _mm_madd_epi16(bounds, qbplbpp_v);
1128
1129        // Colour modulation with mm5 cross-call tail.
1130        let tail_in = *mm5_tail;
1131        let mm5 = _mm_cvtsi32_si128(tail_in as i32);
1132        let col_v = _mm_cvtsi32_si128(v.col as i32);
1133        let mm5 = _mm_unpacklo_epi8(mm5, col_v);
1134        let kvm = state.kv6colmul[v.dir as usize];
1135        let kvm_v = _mm_loadl_epi64(std::ptr::addr_of!(kvm).cast::<__m128i>());
1136        let mm5 = _mm_mulhi_epu16(mm5, kvm_v);
1137        let kva_v = _mm_loadl_epi64(std::ptr::addr_of!(state.kv6coladd).cast::<__m128i>());
1138        let mm5 = _mm_add_epi16(mm5, kva_v);
1139        let mm5 = _mm_packus_epi16(mm5, mm5);
1140        let color = _mm_cvtsi128_si32(mm5) as u32;
1141        *mm5_tail = color;
1142
1143        // Fill rectangle [pixel_min_x .. +dx) × [pixel_min_y .. +dy+1).
1144        // The qsum0/qsum1 clip + saturating sub guarantee the rect
1145        // sits inside the framebuffer, so no per-pixel bounds check
1146        // needed beyond DrawTarget's debug_assert.
1147        let z_val = origin_arr[2];
1148        let pitch = target.pitch_pixels;
1149        let x0 = pixel_min_x as usize;
1150        let x_end = x0 + dx as usize;
1151        let mut written: u32 = 0;
1152        for row in 0..=(dy as usize) {
1153            let y = pixel_min_y as usize + row;
1154            let row_start = y * pitch;
1155            for x in x0..x_end {
1156                let idx = row_start + x;
1157                // SAFETY: idx < pitch * height by qsum0/qsum1 clip;
1158                // concurrent-write contract gated by z_test_write.
1159                // (Outer `unsafe` block in this fn covers the call.)
1160                if target.z_test_write(idx, color, z_val) {
1161                    written += 1;
1162                }
1163            }
1164        }
1165        written
1166    }
1167}
1168
/// R9: scalar rasterizer for non-x86_64 targets (aarch64 / wasm).
///
/// Follows the SSE2 version's steps but projects with an IEEE 754
/// `1.0 / z` rather than `_mm_rcp_ps`, so screen-space vertex
/// positions — and therefore per-arch goldens — differ by ±1 pixel at
/// edges. The colour path reproduces the `_mm_mulhi_epu16` +
/// `_mm_packus_epi16` byte arithmetic exactly.
///
/// Returns the number of pixels that passed the z-test.
#[cfg(not(target_arch = "x86_64"))]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub(crate) fn drawboundcubesse(
    v: &Voxel,
    mask: u32,
    state: &Kv6FullState<'_>,
    r0: [f32; 4],
    mm5_tail: &mut u32,
    target: &mut DrawTarget<'_>,
) -> u32 {
    let visible = (mask & u32::from(v.vis)) as usize;
    if visible == 0 || visible >= PTFACES16.len() {
        return 0;
    }
    let face = PTFACES16[visible];
    if face[0] == 0 {
        return 0;
    }

    // origin = r0 + ztab4_per_z[v.z], scissored on its z lane.
    let z_slot = v.z as usize;
    if z_slot >= state.ztab4_per_z.len() {
        return 0;
    }
    let origin = vec4_add(r0, state.ztab4_per_z[z_slot]);
    let depth = origin[2];
    if depth < state.scisdist {
        return 0;
    }

    // The SSE2 path folds the screen-centre offset (hx, hy) into its
    // qsum0/qsum1 clip; recover it as their difference so vertices can
    // be projected straight into screen coordinates.
    let hx = (i32::from(state.qsum0[0]) - i32::from(state.qsum1[0])) as f32;
    let hy = (i32::from(state.qsum0[1]) - i32::from(state.qsum1[1])) as f32;

    // screen_xy = (cadd4[idx] + origin).xy / .z + (hx, hy)
    let project = |off: u8| -> (f32, f32) {
        let cam = vec4_add(state.cadd4[(off >> 4) as usize], origin);
        let inv_z = 1.0 / cam[2];
        (cam[0] * inv_z + hx, cam[1] * inv_z + hy)
    };

    // First four vertices: float min/max, then one truncation.
    let [(x0, y0), (x1, y1), (x2, y2), (x3, y3)] =
        [face[1], face[2], face[3], face[4]].map(&project);
    let mut left = x0.min(x1).min(x2).min(x3) as i32;
    let mut top = y0.min(y1).min(y2).min(y3) as i32;
    let mut right = x0.max(x1).max(x2).max(x3) as i32;
    let mut bottom = y0.max(y1).max(y2).max(y3) as i32;

    // Six-vertex silhouettes: the extra pair is truncated first and
    // merged with integer min/max.
    if face[0] != 4 {
        for (px, py) in [project(face[5]), project(face[6])] {
            let (px, py) = (px as i32, py as i32);
            left = left.min(px);
            top = top.min(py);
            right = right.max(px);
            bottom = bottom.max(py);
        }
    }

    // Viewport clip in direct screen coordinates (stands in for the
    // SSE2 path's qsum0 saturating add + qsum1 max).
    left = left.max(0);
    top = top.max(0);
    right = right.min(target.width as i32 - 1);
    bottom = bottom.min(target.height as i32 - 1);
    if left > right || top > bottom {
        return 0;
    }

    // Colour modulation, lane by lane, matching the SSE2 sequence:
    //   word  = unpacklo_epi8(tail, col)   → (col byte << 8) | tail byte
    //   value = mulhi_epu16(word, kv6colmul[dir]) + kv6coladd
    //   byte  = packus_epi16(value)        → signed-saturate to 0..=255
    let tail = mm5_tail.to_le_bytes();
    let col = v.col.to_le_bytes();
    let kvm = state.kv6colmul[v.dir as usize];
    let kva = state.kv6coladd;
    let mut color_bytes = [0u8; 4];
    for (lane, out) in color_bytes.iter_mut().enumerate() {
        let word = (u16::from(col[lane]) << 8) | u16::from(tail[lane]);
        let shift = lane * 16;
        let mul = ((kvm >> shift) & 0xffff) as u16;
        let add = ((kva >> shift) & 0xffff) as u16;
        let high = ((u32::from(word) * u32::from(mul)) >> 16) as u16;
        *out = (high.wrapping_add(add) as i16).clamp(0, 255) as u8;
    }
    let color = u32::from_le_bytes(color_bytes);
    *mm5_tail = color;

    // Fill the inclusive rectangle through the z-test.
    let pitch = target.pitch_pixels;
    let (x_first, x_last) = (left as usize, right as usize);
    let mut written: u32 = 0;
    for y in top..=bottom {
        let row_base = y as usize * pitch;
        for idx in row_base + x_first..=row_base + x_last {
            // SAFETY: viewport clip above guarantees idx < pitch * height.
            unsafe {
                if target.z_test_write(idx, color, depth) {
                    written += 1;
                }
            }
        }
    }
    written
}
1291
1292/// One iteration of voxlap's `DRAWBOUNDCUBELINE` macro
1293/// (voxlap5.c:8809-8812). Walks the voxel range `[range_start,
1294/// range_end)` (one (x, y) column's voxels) in three phases:
1295///
1296/// 1. Forward through voxels with `z < inz`, calling
1297///    `callback(voxel, base_mask | 0x20, r0)`.
1298/// 2. Backward through voxels with `z > inz`, calling
1299///    `callback(voxel, base_mask | 0x10, r0)`.
1300/// 3. If a single voxel remains with `z == inz`, call
1301///    `callback(voxel, base_mask | 0x00, r0)`.
1302///
1303/// Each (x, y) column is visited exactly once. `r0` is the screen-
1304/// space origin for *this* column — voxlap stores it as
1305/// `ztab4[MAXZSIZ]` and `drawboundcubesse` reads it via that index.
1306fn draw_boundcube_line<F: FnMut(&Voxel, u32, [f32; 4])>(
1307    voxels: &[Voxel],
1308    range_start: usize,
1309    range_end: usize,
1310    inz: i32,
1311    base_mask: u32,
1312    r0: [f32; 4],
1313    callback: &mut F,
1314) {
1315    if range_end <= range_start {
1316        return;
1317    }
1318    let mut v0 = range_start;
1319    let mut v1_excl = range_end;
1320
1321    // Phase 1: forward while voxels[v0].z < inz.
1322    while v0 < v1_excl && i32::from(voxels[v0].z) < inz {
1323        callback(&voxels[v0], base_mask | 0x20, r0);
1324        v0 += 1;
1325    }
1326    // Phase 2: backward while voxels[v1_excl - 1].z > inz.
1327    while v0 < v1_excl && i32::from(voxels[v1_excl - 1].z) > inz {
1328        callback(&voxels[v1_excl - 1], base_mask | 0x10, r0);
1329        v1_excl -= 1;
1330    }
1331    // Phase 3: single voxel left with z == inz.
1332    if v0 + 1 == v1_excl {
1333        callback(&voxels[v0], base_mask, r0);
1334    }
1335}
1336
1337/// 9-arm per-(x, y) column iteration walking the kv6's voxel
1338/// grid in painter's-back-to-front order around the camera-split
1339/// point (`inx`, `iny`, `inz`). Mirror of voxlap5.c:8982-9062.
1340///
1341/// Tracks `r1` (current x-column base) and `r0` (current (x, y)
1342/// origin) the same way voxlap mutates them with addps/subps,
1343/// passing `r0` to each per-voxel callback. `r0` evolves as
1344/// `r0[x][y] = r1_initial + x * cadd4[1] - y * cadd4[4]` (with
1345/// the floating-point operations applied in voxlap's order so the
1346/// per-step rounding matches bit-for-bit).
1347///
1348/// Each (x, y) column is visited exactly once.
1349#[allow(clippy::too_many_lines)]
1350pub(crate) fn kv6_iterate<F: FnMut(&Voxel, u32, [f32; 4])>(
1351    state: &Kv6FullState<'_>,
1352    mut callback: F,
1353) {
1354    let kv = state.iter.kv;
1355    let xsiz = kv.xsiz as i32;
1356    let ysiz = kv.ysiz as i32;
1357    let inx = state.iter.inx;
1358    let iny = state.iter.iny;
1359    let inz = state.iter.inz;
1360    let nxplanemin = state.iter.nxplanemin;
1361    let nxplanemax = state.iter.nxplanemax;
1362    let cadd1 = state.cadd4[1];
1363    let cadd_y = state.cadd4[4];
1364    let r2 = state.r2;
1365
1366    let mut xv: usize = 0;
1367    let mut r1 = state.r1_initial;
1368
1369    // First half: x = 0..inx. Top-half quadrants (masks 0xa, 0x6, 0x2).
1370    let mut x: i32 = 0;
1371    while x < inx {
1372        let xu = x as usize;
1373        let xlen = kv.xlen[xu] as usize;
1374        if x < nxplanemin || x >= nxplanemax {
1375            xv += xlen;
1376            r1 = vec4_add(r1, cadd1);
1377            x += 1;
1378            continue;
1379        }
1380        let yv_initial = xv + xlen;
1381        let mut r0 = r1; // movps r0, r1
1382
1383        // Forward y: 0..iny  -> mask 0xa.
1384        let mut xv_local = xv;
1385        let mut y: i32 = 0;
1386        while y < iny {
1387            let yu = y as usize;
1388            let len = kv.ylen[xu][yu] as usize;
1389            let v0 = xv_local;
1390            xv_local += len;
1391            draw_boundcube_line(&kv.voxels, v0, xv_local, inz, 0xa, r0, &mut callback);
1392            r0 = vec4_sub(r0, cadd_y); // r0 -= cadd4[4]
1393            y += 1;
1394        }
1395
1396        // Setup for reverse y: r0 = r1 + r2 (= base + (-ysiz)*cadd4[4]),
1397        // then r1 += cadd4[1] for the next x column.
1398        let mut yv_local = yv_initial;
1399        r0 = vec4_add(r1, r2);
1400        r1 = vec4_add(r1, cadd1);
1401
1402        // Reverse y: ysiz-1..iny  -> mask 0x6.
1403        let mut y = ysiz - 1;
1404        while y > iny {
1405            r0 = vec4_add(r0, cadd_y); // r0 += cadd4[4]
1406            let yu = y as usize;
1407            let len = kv.ylen[xu][yu] as usize;
1408            let v1_excl = yv_local;
1409            yv_local -= len;
1410            draw_boundcube_line(&kv.voxels, yv_local, v1_excl, inz, 0x6, r0, &mut callback);
1411            y -= 1;
1412        }
1413
1414        // Edge y == iny  -> mask 0x2.
1415        if iny >= 0 && (iny as u32) < kv.ysiz {
1416            r0 = vec4_add(r0, cadd_y);
1417            let yu = iny as usize;
1418            let len = kv.ylen[xu][yu] as usize;
1419            let v1_excl = yv_local;
1420            yv_local -= len;
1421            draw_boundcube_line(&kv.voxels, yv_local, v1_excl, inz, 0x2, r0, &mut callback);
1422        }
1423
1424        xv += xlen;
1425        x += 1;
1426    }
1427
1428    // Setup for second half (voxlap5.c:9011): jump r1 to past-end.
1429    // r1 += (xsiz - x) * cadd4[1]  with x = post-first-half value.
1430    let dx_remain = (xsiz - x) as f32;
1431    r1 = vec4_add(r1, vec4_scale(cadd1, dx_remain));
1432
1433    // Second half: x = xsiz-1..inx (reverse). Bot-half quadrants
1434    // (masks 0x5, 0x9, 0x1).
1435    let mut xv2: usize = kv.voxels.len();
1436    let mut x = xsiz - 1;
1437    while x > inx {
1438        let xu = x as usize;
1439        let xlen = kv.xlen[xu] as usize;
1440        if x < nxplanemin || x >= nxplanemax {
1441            xv2 -= xlen;
1442            r1 = vec4_sub(r1, cadd1);
1443            x -= 1;
1444            continue;
1445        }
1446        let yv_initial = xv2 - xlen;
1447        // Voxlap order: r1 -= cadd1 first, then r0 = r1 + r2.
1448        r1 = vec4_sub(r1, cadd1);
1449        let mut r0 = vec4_add(r1, r2);
1450
1451        // Reverse y: ysiz-1..iny  -> mask 0x5.
1452        let mut xv_local = xv2;
1453        let mut y = ysiz - 1;
1454        while y > iny {
1455            r0 = vec4_add(r0, cadd_y);
1456            let yu = y as usize;
1457            let len = kv.ylen[xu][yu] as usize;
1458            let v1_excl = xv_local;
1459            xv_local -= len;
1460            draw_boundcube_line(&kv.voxels, xv_local, v1_excl, inz, 0x5, r0, &mut callback);
1461            y -= 1;
1462        }
1463
1464        // After reverse y: r0 = r1 (movps r0, r1).
1465        let mut yv_local = yv_initial;
1466        r0 = r1;
1467
1468        // Forward y: 0..iny  -> mask 0x9.
1469        let mut y: i32 = 0;
1470        while y < iny {
1471            let yu = y as usize;
1472            let len = kv.ylen[xu][yu] as usize;
1473            let v0 = yv_local;
1474            yv_local += len;
1475            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x9, r0, &mut callback);
1476            r0 = vec4_sub(r0, cadd_y);
1477            y += 1;
1478        }
1479
1480        // Edge y == iny  -> mask 0x1.
1481        if iny >= 0 && (iny as u32) < kv.ysiz {
1482            let yu = iny as usize;
1483            let len = kv.ylen[xu][yu] as usize;
1484            let v0 = yv_local;
1485            yv_local += len;
1486            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x1, r0, &mut callback);
1487        }
1488
1489        xv2 -= xlen;
1490        x -= 1;
1491    }
1492
1493    // Edge x == inx (middle column). Masks 0x4, 0x8, 0x0.
1494    if inx >= 0 && (inx as u32) < kv.xsiz {
1495        let xu = inx as usize;
1496        if inx < nxplanemin || inx >= nxplanemax {
1497            return;
1498        }
1499        let xlen = kv.xlen[xu] as usize;
1500        let yv_initial = xv2 - xlen;
1501        r1 = vec4_sub(r1, cadd1);
1502        let mut r0 = vec4_add(r1, r2);
1503
1504        // Reverse y -> mask 0x4.
1505        let mut xv_local = xv2;
1506        let mut y = ysiz - 1;
1507        while y > iny {
1508            r0 = vec4_add(r0, cadd_y);
1509            let yu = y as usize;
1510            let len = kv.ylen[xu][yu] as usize;
1511            let v1_excl = xv_local;
1512            xv_local -= len;
1513            draw_boundcube_line(&kv.voxels, xv_local, v1_excl, inz, 0x4, r0, &mut callback);
1514            y -= 1;
1515        }
1516
1517        // After reverse y: r0 = r1.
1518        let mut yv_local = yv_initial;
1519        r0 = r1;
1520
1521        // Forward y -> mask 0x8.
1522        let mut y: i32 = 0;
1523        while y < iny {
1524            let yu = y as usize;
1525            let len = kv.ylen[xu][yu] as usize;
1526            let v0 = yv_local;
1527            yv_local += len;
1528            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x8, r0, &mut callback);
1529            r0 = vec4_sub(r0, cadd_y);
1530            y += 1;
1531        }
1532
1533        // Edge y == iny -> mask 0x0.
1534        if iny >= 0 && (iny as u32) < kv.ysiz {
1535            let yu = iny as usize;
1536            let len = kv.ylen[xu][yu] as usize;
1537            let v0 = yv_local;
1538            yv_local += len;
1539            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x0, r0, &mut callback);
1540        }
1541    }
1542}
1543
1544/// Draw a sprite into a framebuffer + z-buffer.
1545///
1546/// Top-level dispatcher mirroring voxlap5.c:9818-9828:
1547/// - Skips on `flags & INVISIBLE`.
1548/// - Skips on `flags & KFA` (animation path; out of scope for R6).
1549/// - Skips on `flags & NO_Z` (handled by `drawboundcubenozsse`,
1550///   not yet ported — the four oracle sprite poses all use z-tested
1551///   rendering).
1552///
1553/// Otherwise: cull → setup math → 9-arm per-voxel iteration →
1554/// per-voxel rasterize via the R6.4 `drawboundcubesse` port.
1555///
1556/// Returns the total number of pixels written across all voxels of
1557/// the sprite (== sum of z-test passes). Zero means the sprite
1558/// produced no visible pixels (culled, fully behind near plane, or
1559/// totally occluded).
1560/// Render a batch of sprites in parallel via `rayon::par_iter`.
1561///
1562/// Each sprite runs its own [`draw_sprite`] pass on its own thread,
1563/// writing to the shared [`DrawTarget`] (raw pointers;
1564/// `Copy + Send + Sync`) under the z-test arbitration contract: a
1565/// pixel write only fires when the new sprite's z is strictly less
1566/// than the current zbuffer value. For non-overlapping sprites the
1567/// writes are pairwise-disjoint and the output is byte-identical
1568/// to a sequential pass over the same sprite list. For overlapping
1569/// pixels, two sprites at exactly tied z-values produce a
1570/// non-deterministic last-writer-wins outcome — visually
1571/// indistinguishable but hash-non-deterministic.
1572///
1573/// Returns the sum of `draw_sprite` return values (total pixels
1574/// written across all sprites).
1575///
1576/// `RAYON_NUM_THREADS=1` (or no parallelism worth) ⇒ effectively
1577/// sequential; rayon falls back to running each closure on the
1578/// calling thread without contention.
1579///
1580/// Use this for engine scenes with dozens-to-hundreds of sprites;
1581/// the per-sprite overhead amortises well past ~4 sprites on
1582/// consumer-class hardware.
1583#[allow(clippy::module_name_repetitions)]
1584#[must_use]
1585pub fn draw_sprites_parallel(
1586    target: DrawTarget<'_>,
1587    cam: &CameraState,
1588    settings: &OpticastSettings,
1589    lighting: &SpriteLighting<'_>,
1590    sprites: &[Sprite],
1591) -> u32 {
1592    use rayon::prelude::*;
1593
1594    let render_one = |sprite: &Sprite| {
1595        // `target` is `Copy`, so each closure captures its own
1596        // copy of the (raw fb / zb pointer) view. `cam`,
1597        // `settings`, `lighting` are `&` borrows — Sync.
1598        let mut t = target;
1599        draw_sprite(&mut t, cam, settings, lighting, sprite)
1600    };
1601
1602    sprites.par_iter().map(render_one).sum()
1603}
1604
1605pub fn draw_sprite(
1606    target: &mut DrawTarget<'_>,
1607    cam: &CameraState,
1608    settings: &OpticastSettings,
1609    lighting: &SpriteLighting<'_>,
1610    sprite: &Sprite,
1611) -> u32 {
1612    if sprite.flags & SPRITE_FLAG_INVISIBLE != 0 {
1613        return 0;
1614    }
1615    if sprite.flags & SPRITE_FLAG_KFA != 0 {
1616        return 0;
1617    }
1618    if sprite.flags & SPRITE_FLAG_NO_Z != 0 {
1619        // drawboundcubenozsse port deferred; oracle doesn't exercise it.
1620        return 0;
1621    }
1622    let Some(setup) = kv6_draw_prepare(sprite, cam) else {
1623        return 0;
1624    };
1625    let state = kv6_compute_full_state(
1626        &setup,
1627        sprite,
1628        lighting,
1629        cam,
1630        settings,
1631        target.width,
1632        target.height,
1633        target.pitch_pixels,
1634    );
1635    let mut mm5_tail: u32 = 0;
1636    let mut total_written: u32 = 0;
1637    kv6_iterate(&state, |voxel, mask, r0| {
1638        total_written += drawboundcubesse(voxel, mask, &state, r0, &mut mm5_tail, target);
1639    });
1640    total_written
1641}
1642
#[cfg(test)]
mod tests {
    use super::*;
    use crate::camera_math;
    use crate::Camera;
    use roxlap_formats::kv6::Kv6;

    /// 1×1×1 kv6 with no voxels — the cheapest valid model for tests
    /// that only care about the sprite's pose / flags.
    fn empty_kv6() -> Kv6 {
        Kv6 {
            xsiz: 1,
            ysiz: 1,
            zsiz: 1,
            xpiv: 0.5,
            ypiv: 0.5,
            zpiv: 0.5,
            voxels: Vec::new(),
            xlen: vec![0],
            ylen: vec![vec![0]],
            palette: None,
        }
    }

    /// 17×17×17 kv6 with pivot at the centre — same dimensions as
    /// the meltsphere oracle sprite so the cull test exercises a
    /// realistic bound cube rather than a 1-voxel point. Contains no
    /// voxels; only the bounds matter to the callers.
    fn cube_kv6() -> Kv6 {
        Kv6 {
            xsiz: 17,
            ysiz: 17,
            zsiz: 17,
            xpiv: 8.5,
            ypiv: 8.5,
            zpiv: 8.5,
            voxels: Vec::new(),
            xlen: vec![0; 17],
            ylen: vec![vec![0; 17]; 17],
            palette: None,
        }
    }

    /// `CameraState` matching the oracle's `sprite_front` pose:
    /// pos=(1020,1050,175), yaw=0, pitch=0 → forward = +x.
    fn oracle_sprite_front_camera() -> camera_math::CameraState {
        let camera = Camera {
            pos: [1020.0, 1050.0, 175.0],
            // From oracle.c set_camera_yaw_pitch with yaw=0, pitch=0:
            //   ifor = [1, 0, 0], istr = [0, 1, 0], ihei = [0, 0, 1].
            right: [0.0, 1.0, 0.0],
            down: [0.0, 0.0, 1.0],
            forward: [1.0, 0.0, 0.0],
        };
        camera_math::derive(&camera, 640, 480, 320.0, 240.0, 320.0)
    }

    /// Opticast settings for the oracle's 640×480 framebuffer.
    fn oracle_settings() -> OpticastSettings {
        OpticastSettings::for_oracle_framebuffer(640, 480)
    }

    /// Test-only ergonomic shim: build a Kv6FullState with the
    /// oracle 640×480 framebuffer geometry. Mirrors the
    /// pre-R6.4 signature so tests don't have to spell out
    /// width/height/pitch every time.
    fn compute_state_for_test<'a>(
        setup: &Kv6DrawSetup<'a>,
        sprite: &Sprite,
        cam: &camera_math::CameraState,
    ) -> Kv6FullState<'a> {
        let lighting = SpriteLighting::default_oracle();
        kv6_compute_full_state(
            setup,
            sprite,
            &lighting,
            cam,
            &oracle_settings(),
            640,
            480,
            640,
        )
    }

    /// Allocate a 640×480 framebuffer + zbuffer (zbuffer pre-filled
    /// with f32::INFINITY so any voxel passes the z-test on first
    /// write).
    fn alloc_target() -> (Vec<u32>, Vec<f32>) {
        let pixels = 640usize * 480usize;
        (vec![0u32; pixels], vec![f32::INFINITY; pixels])
    }

    /// Wrap buffers from [`alloc_target`] in a `DrawTarget` using the
    /// same 640 / 640 / 480 geometry the other helpers assume.
    fn make_target<'a>(fb: &'a mut [u32], zb: &'a mut [f32]) -> DrawTarget<'a> {
        DrawTarget::new(fb, zb, 640, 640, 480)
    }

    /// Bit-pattern compare for two `[f32; 4]` vectors. The setup
    /// math produces these via deterministic IEEE-754 ops, so
    /// bit-equality is well-defined and dodges `clippy::float_cmp`.
    fn bits4(a: [f32; 4]) -> [u32; 4] {
        a.map(f32::to_bits)
    }

    /// Run `kv6_iterate` over `state` and count how often each entry
    /// of `voxels` is handed to the callback (`result[i]` = number of
    /// visits of `voxels[i]`).
    ///
    /// `voxels` must be the voxel storage of the kv6 that `state` was
    /// built from; the index is recovered from the callback's
    /// reference by pointer offset.
    fn count_voxel_visits(state: &Kv6FullState<'_>, voxels: &[Voxel]) -> Vec<u32> {
        let base = voxels.as_ptr();
        let mut visits = vec![0u32; voxels.len()];
        kv6_iterate(state, |v, _mask, _r0| {
            // SAFETY: per this helper's contract the callback receives
            // a borrow of an entry of `voxels`, so both pointers lie
            // in the same allocation and the offset is well-defined.
            let idx = unsafe { std::ptr::from_ref::<Voxel>(v).offset_from(base) } as usize;
            visits[idx] += 1;
        });
        visits
    }

    /// Bytes of the dumped C-oracle meltsphere sprite — used by all
    /// the kv6-load tests below. Module-scope `const` keeps clippy's
    /// `items_after_statements` happy.
    const SPRITE_MELTSPHERE_KV6: &[u8] = include_bytes!("../tests/fixtures/sprite_meltsphere.kv6");

    #[test]
    fn axis_aligned_sets_identity_basis() {
        // Compare bit patterns: these are integer-valued floats so
        // bit-equality is well-defined and dodges clippy::float_cmp.
        let bits = |a: [f32; 3]| a.map(f32::to_bits);
        let s = Sprite::axis_aligned(empty_kv6(), [10.0, 20.0, 30.0]);
        assert_eq!(bits(s.p), bits([10.0, 20.0, 30.0]));
        assert_eq!(bits(s.s), bits([1.0, 0.0, 0.0]));
        assert_eq!(bits(s.h), bits([0.0, 1.0, 0.0]));
        assert_eq!(bits(s.f), bits([0.0, 0.0, 1.0]));
        assert_eq!(s.flags, 0);
    }

    #[test]
    fn invisible_flag_skips_dispatch() {
        let cam = oracle_sprite_front_camera();
        let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
        s.flags = SPRITE_FLAG_INVISIBLE;
        let (mut fb, mut zb) = alloc_target();
        let mut target = make_target(&mut fb, &mut zb);
        let lighting = SpriteLighting::default_oracle();
        assert_eq!(
            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
            0
        );
    }

    #[test]
    fn kfa_flag_skips_dispatch() {
        let cam = oracle_sprite_front_camera();
        let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
        s.flags = SPRITE_FLAG_KFA;
        let (mut fb, mut zb) = alloc_target();
        let mut target = make_target(&mut fb, &mut zb);
        let lighting = SpriteLighting::default_oracle();
        assert_eq!(
            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
            0
        );
    }

    #[test]
    fn no_z_flag_skips_dispatch() {
        // Third early-out of the dispatcher: the no-z path is not
        // ported, so such sprites must be skipped rather than drawn
        // through the z-tested rasterizer.
        let cam = oracle_sprite_front_camera();
        let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
        s.flags = SPRITE_FLAG_NO_Z;
        let (mut fb, mut zb) = alloc_target();
        let mut target = make_target(&mut fb, &mut zb);
        let lighting = SpriteLighting::default_oracle();
        assert_eq!(
            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
            0
        );
    }

    #[test]
    fn cull_keeps_oracle_sprite_in_front_of_camera() {
        // Oracle's `sprite_front` pose: camera at (1020,1050,175)
        // looking +x; sprite at (1050,1050,175). Sprite is 30
        // units forward, on-axis — clearly inside the frustum.
        let cam = oracle_sprite_front_camera();
        let s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
        assert!(
            kv6_draw_prepare(&s, &cam).is_some(),
            "front-of-camera sprite must NOT be culled"
        );
    }

    #[test]
    fn cull_removes_sprite_far_behind_camera() {
        // Same camera; sprite far in the -forward direction
        // (= behind the camera).
        let cam = oracle_sprite_front_camera();
        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
        assert!(
            kv6_draw_prepare(&s, &cam).is_none(),
            "behind-camera sprite must be culled"
        );
    }

    #[test]
    fn cull_removes_sprite_far_to_the_right() {
        // Camera looks +x; sprite far in the +y direction (right
        // axis), far enough that the bound cube is fully outside
        // the right-edge frustum plane.
        let cam = oracle_sprite_front_camera();
        // 30 units forward, 200 units right — well outside the 90°
        // FOV's right edge.
        let s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0 + 200.0, 175.0]);
        assert!(
            kv6_draw_prepare(&s, &cam).is_none(),
            "far-right sprite must be culled"
        );
    }

    #[test]
    fn cull_keeps_sprite_at_camera_position() {
        // Sprite centred on the camera — bound cube straddles the
        // camera, so by definition it's not fully outside any
        // frustum plane and must NOT be culled.
        let cam = oracle_sprite_front_camera();
        let s = Sprite::axis_aligned(cube_kv6(), cam.pos);
        assert!(
            kv6_draw_prepare(&s, &cam).is_some(),
            "sprite at camera position must not be culled"
        );
    }

    #[test]
    fn iterate_visits_each_voxel_exactly_once() {
        // Build a synthetic 3×3×3 kv6 with one voxel per (x, y)
        // column at z = (x + y) mod 3 — 9 voxels in total. Then
        // iterate and check (a) total callback fires == 9 = numvoxs,
        // and (b) every voxel index 0..9 was visited exactly once.
        let xsiz: u32 = 3;
        let ysiz: u32 = 3;
        let zsiz: u32 = 3;
        let mut voxels = Vec::new();
        let mut xlen = vec![0u32; xsiz as usize];
        let mut ylen = vec![vec![0u16; ysiz as usize]; xsiz as usize];
        for x in 0..xsiz {
            for y in 0..ysiz {
                let z = ((x + y) % 3) as u16;
                voxels.push(Voxel {
                    col: 0x0080_0000,
                    z,
                    vis: 63,
                    dir: 0,
                });
                xlen[x as usize] += 1;
                ylen[x as usize][y as usize] = 1;
            }
        }
        let kv = Kv6 {
            xsiz,
            ysiz,
            zsiz,
            xpiv: 1.5,
            ypiv: 1.5,
            zpiv: 1.5,
            voxels,
            xlen,
            ylen,
            palette: None,
        };
        // Hand-built setup (identity basis, mip 0) so the iteration
        // runs over `kv` without going through the frustum cull.
        let setup = Kv6DrawSetup {
            kv: &kv,
            ts: [1.0, 0.0, 0.0],
            th: [0.0, 1.0, 0.0],
            tf: [0.0, 0.0, 1.0],
            mip: 0,
        };
        let cam = oracle_sprite_front_camera();
        let synth_sprite = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
        let state = compute_state_for_test(&setup, &synth_sprite, &cam);

        // Every voxel index must fire exactly once; identity is
        // recovered by pointer offset into `kv.voxels`.
        let visited = count_voxel_visits(&state, &kv.voxels);
        let total: u32 = visited.iter().sum();
        assert_eq!(total as usize, kv.voxels.len(), "total callback fires");
        for (i, &n) in visited.iter().enumerate() {
            assert_eq!(n, 1, "voxel {i} visited {n} times (want 1)");
        }
    }

    #[test]
    fn iterate_meltsphere_oracle_visits_each_voxel_once() {
        // Load the dumped voxlap-C meltsphere fixture (R6.0e) and
        // run the iteration against the oracle's sprite_front
        // camera + sprite pose. Expected: every voxel hit exactly
        // once, total fires == kv.voxels.len() (= 401).
        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
        assert_eq!(kv.voxels.len(), 401, "fixture voxel count");

        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
        let cam = oracle_sprite_front_camera();
        let setup = kv6_draw_prepare(&sprite, &cam).expect("oracle sprite must pass cull");
        let state = compute_state_for_test(&setup, &sprite, &cam);

        let visited = count_voxel_visits(&state, &sprite.kv6.voxels);
        let total: u32 = visited.iter().sum();
        assert_eq!(total, 401);
        let max = visited.iter().copied().max().unwrap();
        let min = visited.iter().copied().min().unwrap();
        assert_eq!(max, 1, "no voxel may be visited twice");
        assert_eq!(min, 1, "no voxel may be skipped");
    }

    #[test]
    fn full_state_basic_invariants() {
        // For the oracle sprite_front pose, sanity-check the setup
        // values: ztab4_per_z[0] is zero, ztab4_per_z[k] equals
        // ztab4_per_z[k-1] + cadd4[2], cadd4[0] is zero, cadd4[3],
        // cadd4[5], cadd4[6], cadd4[7] are the pairwise sums listed
        // below, and r2 = -ysiz * cadd4[4].
        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
        let cam = oracle_sprite_front_camera();
        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
        let state = compute_state_for_test(&setup, &sprite, &cam);

        // ztab4_per_z[0] = [0; 4].
        assert_eq!(bits4(state.ztab4_per_z[0]), bits4([0.0; 4]));

        // For each subsequent z, ztab4_per_z[z] = ztab4_per_z[z-1] + cadd4[2].
        for z in 1..state.ztab4_per_z.len() {
            let want = vec4_add(state.ztab4_per_z[z - 1], state.cadd4[2]);
            assert_eq!(bits4(state.ztab4_per_z[z]), bits4(want), "ztab4_per_z[{z}]");
        }

        // cadd4[3] = cadd4[1] + cadd4[2]; cadd4[5] = cadd4[1] + cadd4[4];
        // cadd4[6] = cadd4[2] + cadd4[4]; cadd4[7] = cadd4[3] + cadd4[4].
        assert_eq!(
            bits4(state.cadd4[3]),
            bits4(vec4_add(state.cadd4[1], state.cadd4[2]))
        );
        assert_eq!(
            bits4(state.cadd4[5]),
            bits4(vec4_add(state.cadd4[1], state.cadd4[4]))
        );
        assert_eq!(
            bits4(state.cadd4[6]),
            bits4(vec4_add(state.cadd4[2], state.cadd4[4]))
        );
        assert_eq!(
            bits4(state.cadd4[7]),
            bits4(vec4_add(state.cadd4[3], state.cadd4[4]))
        );
        assert_eq!(bits4(state.cadd4[0]), bits4([0.0; 4]));

        // r2 = -ysiz * cadd4[4].
        let want_r2 = vec4_scale(state.cadd4[4], -(state.iter.kv.ysiz as f32));
        assert_eq!(bits4(state.r2), bits4(want_r2));
    }

    #[test]
    fn drawboundcubesse_culls_invisible_face_mask() {
        // Synthetic voxel with vis=0: no face is visible, so the
        // rasterizer must take its early-out and report zero pixels
        // without reaching the scissor branch.
        let v = Voxel {
            col: 0,
            z: 0,
            vis: 0,
            dir: 0,
        };
        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
        let cam = oracle_sprite_front_camera();
        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
        let state = compute_state_for_test(&setup, &sprite, &cam);
        let (mut fb, mut zb) = alloc_target();
        let mut target = make_target(&mut fb, &mut zb);
        let mut tail = 0u32;
        assert_eq!(
            drawboundcubesse(
                &v,
                0xff,
                &state,
                [0.0, 0.0, 100.0, 100.0],
                &mut tail,
                &mut target,
            ),
            0
        );
    }

    #[test]
    fn drawboundcubesse_culls_voxel_behind_near_plane() {
        // Trigger the near-plane cull by passing an r0 with a very
        // negative origin.z. The cull fires when scisdist > origin.z;
        // for the oracle sprite_front pose `scisdist` is some small
        // non-negative number (sum of any negative post-swap basis-z
        // components), so an r0 with z = -1000 will cull.
        let v = Voxel {
            col: 0xff,
            z: 0,
            vis: 0xff,
            dir: 0,
        };
        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
        let cam = oracle_sprite_front_camera();
        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
        let state = compute_state_for_test(&setup, &sprite, &cam);
        // r0.z = -1000 makes origin.z = -1000 + ztab4_per_z[0].z = -1000.
        // scisdist >= 0; -1000 < scisdist → cull.
        let r0 = [0.0, 0.0, -1000.0, -1000.0];
        let (mut fb, mut zb) = alloc_target();
        let mut target = make_target(&mut fb, &mut zb);
        let mut tail = 0u32;
        assert_eq!(
            drawboundcubesse(&v, 0xff, &state, r0, &mut tail, &mut target),
            0
        );
    }

    #[test]
    fn iterate_no_voxels_when_culled() {
        // Sprite far behind camera → cull. draw_sprite bails out
        // before kv6_iterate, so no voxel can be rasterized: the
        // return value is zero and the z-buffer keeps its initial
        // +INFINITY fill everywhere.
        let cam = oracle_sprite_front_camera();
        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
        // Cull catches it before iteration.
        assert!(kv6_draw_prepare(&s, &cam).is_none());

        let (mut fb, mut zb) = alloc_target();
        let mut target = make_target(&mut fb, &mut zb);
        let lighting = SpriteLighting::default_oracle();
        let written = draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s);
        assert_eq!(written, 0, "culled sprite must not report written pixels");
        assert!(
            zb.iter().all(|z| z.is_infinite() && z.is_sign_positive()),
            "culled sprite must leave the z-buffer untouched"
        );
    }

    #[test]
    fn draw_sprite_writes_pixels_for_oracle_meltsphere() {
        // R6.4 end-to-end: load the meltsphere fixture, run
        // draw_sprite at the sprite_front pose. Expect a non-zero
        // pixel count and at least one non-zero framebuffer entry.
        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
        let cam = oracle_sprite_front_camera();
        let (mut fb, mut zb) = alloc_target();
        let mut target = make_target(&mut fb, &mut zb);
        let lighting = SpriteLighting::default_oracle();
        let written = draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &sprite);
        assert!(written > 0, "expected some pixels to be written");
        assert!(
            fb.iter().any(|&p| p != 0),
            "expected at least one non-zero framebuffer entry"
        );
        // Z-buffer must have shrunk somewhere from f32::INFINITY.
        assert!(
            zb.iter().any(|&z| z.is_finite()),
            "expected at least one finite zbuffer entry"
        );
    }

    #[test]
    fn draw_sprite_returns_zero_for_culled_sprite() {
        let cam = oracle_sprite_front_camera();
        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
        let (mut fb, mut zb) = alloc_target();
        let mut target = make_target(&mut fb, &mut zb);
        let lighting = SpriteLighting::default_oracle();
        assert_eq!(
            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
            0
        );
        assert!(fb.iter().all(|&p| p == 0));
    }

    #[test]
    fn draw_sprites_parallel_empty_batch_writes_nothing() {
        // No sprites → the parallel sum is over an empty iterator:
        // zero pixels reported and the framebuffer stays cleared.
        let cam = oracle_sprite_front_camera();
        let settings = oracle_settings();
        let lighting = SpriteLighting::default_oracle();
        let (mut fb, mut zb) = alloc_target();
        let target = make_target(&mut fb, &mut zb);
        let written = draw_sprites_parallel(target, &cam, &settings, &lighting, &[]);
        assert_eq!(written, 0);
        assert!(fb.iter().all(|&p| p == 0));
    }

    #[test]
    fn draw_sprites_parallel_matches_sequential_for_single_sprite() {
        // With exactly one sprite there is nothing to race against,
        // so the parallel entry point must reproduce a plain
        // draw_sprite pass: same pixel count, same framebuffer, same
        // z-buffer (compared by bit pattern to dodge float_cmp).
        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
        let cam = oracle_sprite_front_camera();
        let settings = oracle_settings();
        let lighting = SpriteLighting::default_oracle();

        let (mut fb_seq, mut zb_seq) = alloc_target();
        let mut target_seq = make_target(&mut fb_seq, &mut zb_seq);
        let written_seq = draw_sprite(&mut target_seq, &cam, &settings, &lighting, &sprite);

        let (mut fb_par, mut zb_par) = alloc_target();
        let target_par = make_target(&mut fb_par, &mut zb_par);
        let written_par = draw_sprites_parallel(
            target_par,
            &cam,
            &settings,
            &lighting,
            std::slice::from_ref(&sprite),
        );

        assert_eq!(written_par, written_seq, "pixel counts must agree");
        assert!(fb_par == fb_seq, "framebuffers must be byte-identical");
        assert!(
            zb_par
                .iter()
                .copied()
                .map(f32::to_bits)
                .eq(zb_seq.iter().copied().map(f32::to_bits)),
            "z-buffers must be bit-identical"
        );
    }

    /// `update_reflects` for the oracle sprite_front pose hits the
    /// nolighta path (R==G==B kv6col, no fog, lightmode<2). All
    /// kv6colmul[k] entries must repeat one u16 modulation factor
    /// across all 4 lanes.
    #[test]
    fn update_reflects_nolighta_lanes_match() {
        let s = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
        let lighting = SpriteLighting::default_oracle();
        let (cm, ca) = update_reflects(&s, &lighting);
        assert_eq!(ca, 0, "kv6coladd must be zero (no fog)");
        for (k, e) in cm.iter().enumerate() {
            // Split the 64-bit entry into its four 16-bit lanes.
            let l0 = (e & 0xffff) as u16;
            let l1 = ((e >> 16) & 0xffff) as u16;
            let l2 = ((e >> 32) & 0xffff) as u16;
            let l3 = ((e >> 48) & 0xffff) as u16;
            assert_eq!(l0, l1, "kv6colmul[{k}] lane0 != lane1");
            assert_eq!(l0, l2, "kv6colmul[{k}] lane0 != lane2");
            assert_eq!(l0, l3, "kv6colmul[{k}] lane0 != lane3");
        }
    }

    /// Non-grey kv6col forces the nolightb path. Lanes 0..3 of each
    /// `kv6colmul[k]` come from per-channel modulators built from
    /// the kv6col bytes — they should NOT all match unless the
    /// channels themselves match.
    #[test]
    fn update_reflects_nolightb_lanes_diverge_for_tinted_kv6col() {
        let s = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
        let lighting = SpriteLighting {
            kv6col: 0x0040_8040, // channel bytes 0x40, 0x80, 0x40 — the middle one differs
            lightmode: 0,
            lights: &[],
        };
        let (cm, _) = update_reflects(&s, &lighting);
        // Find any direction where the dot is non-zero (most are
        // non-zero); that direction's lanes must vary by channel.
        let mut saw_divergence = false;
        for e in cm.iter() {
            let l0 = (e & 0xffff) as u16;
            let l1 = ((e >> 16) & 0xffff) as u16;
            let l2 = ((e >> 32) & 0xffff) as u16;
            if l0 != l1 || l0 != l2 {
                saw_divergence = true;
                break;
            }
        }
        assert!(
            saw_divergence,
            "non-grey kv6col must produce per-channel divergence in some kv6colmul slot"
        );
    }

    /// Lightmode-2 with one point light + grey kv6col still
    /// produces R==G==B lanes (because the per-channel modulators
    /// are all 0x80<<8 = 0x8000). It must produce a non-uniform
    /// kv6colmul (some directions face the light, others away),
    /// which differs from lightmode<2 where every direction has the
    /// same dot magnitude regardless of position.
    #[test]
    fn update_reflects_lightmode2_produces_directional_shading() {
        let s = Sprite::axis_aligned(empty_kv6(), [100.0, 100.0, 100.0]);
        // Single light 10 units along +x from the sprite.
        let lights = [LightSrc {
            pos: [110.0, 100.0, 100.0],
            r2: 100.0,
            sc: 16.0,
        }];
        let lighting = SpriteLighting {
            kv6col: DEFAULT_KV6COL,
            lightmode: 2,
            lights: &lights,
        };
        let (cm, _) = update_reflects(&s, &lighting);
        // Some directions must darken (shadow side) while others
        // brighten (light side) — the spread between min and max
        // tells us shading is happening.
        let mut min_w = u16::MAX;
        let mut max_w = 0u16;
        for e in cm.iter() {
            let l0 = (e & 0xffff) as u16;
            min_w = min_w.min(l0);
            max_w = max_w.max(l0);
        }
        assert!(
            max_w > min_w + 16,
            "lightmode-2 should produce directional shading: min={min_w} max={max_w}"
        );
    }

    /// Lightmode-2 with no lights → ambient-only. Should still
    /// produce some non-zero kv6colmul (the synthetic ambient slot
    /// is non-trivial).
    #[test]
    fn update_reflects_lightmode2_no_lights_falls_back_to_ambient() {
        let s = Sprite::axis_aligned(empty_kv6(), [100.0, 100.0, 100.0]);
        let lighting = SpriteLighting {
            kv6col: DEFAULT_KV6COL,
            lightmode: 2,
            lights: &[],
        };
        let (cm, _) = update_reflects(&s, &lighting);
        let any_nonzero = cm.iter().any(|&e| e != 0);
        assert!(
            any_nonzero,
            "lightmode-2 with no lights should still emit ambient shading"
        );
    }
}