Skip to main content

roxlap_core/
sprite.rs

1//! KV6 sprite type + the `draw_sprite` dispatcher.
2//!
3//! Mirror of voxlap's `vx5sprite` (voxlap5.h:63-79) plus the
4//! `drawsprite` entry point (voxlap5.c:9818). For R6.1 the
5//! dispatcher is a stub — just enough API surface for the host to
6//! plumb a sprite reference through. R6.2-R6.4 fill in the actual
7//! kv6 frustum-cull + per-voxel rasterization behind it.
8//!
9//! Voxlap's vx5sprite is a 64-byte struct:
10//!
11//! ```text
12//! point3d p;       // position
13//! int32_t flags;   // bit 0: 0=normal shading
14//!                  // bit 1: 0=kv6data, 1=kfatype  (oracle uses 0)
15//!                  // bit 2: 0=normal, 1=invisible
16//! point3d s;       // x-basis (kv6data.xsiz direction)
17//! kv6data *voxnum; // (or kfatype *kfaptr if flag bit 1 set)
18//! point3d h;       // y-basis
19//! int32_t kfatim;
20//! point3d f;       // z-basis
21//! int32_t okfatim;
22//! ```
23//!
24//! For R6 we only handle kv6 sprites with `flags = 0` (the four
25//! oracle sprite poses all use this). KFA animation + the no-z and
26//! invisible flags are deferred.
27
28// The kv6draw port is pointer-arithmetic-heavy; the casts mirror C's
29// implicit i32/u32/usize narrowings. Loop bounds are clamped via
30// `lbound` so sign-loss / wrap is guarded at the type-system edge.
31// kv.{xsiz,ysiz,zsiz} are u32 with realistic max ≤ 256 (file format
32// limit) — well within f32's 24-bit mantissa.
33#![allow(
34    clippy::cast_possible_truncation,
35    clippy::cast_possible_wrap,
36    clippy::cast_sign_loss,
37    clippy::cast_precision_loss,
38    clippy::similar_names,
39    clippy::too_many_arguments,
40    clippy::too_many_lines,
41    clippy::cast_ptr_alignment, // _mm_loadl_epi64 / _mm_storeu_si128 are intentionally unaligned
42    clippy::doc_markdown,
43    clippy::no_effect_underscore_binding, // SSE intrinsic side-effect-only stores
44    clippy::no_effect, // the discarded pmaddwd intermediate
45    clippy::ref_as_ptr,
46    clippy::float_cmp_const,
47    clippy::float_cmp,
48)]
49
50use roxlap_formats::kv6::{Kv6, Voxel};
51use roxlap_formats::sprite::{Sprite, SPRITE_FLAG_INVISIBLE, SPRITE_FLAG_KFA, SPRITE_FLAG_NO_Z};
52
53use crate::camera_math::CameraState;
54use crate::engine::{Engine, LightSrc, DEFAULT_KV6COL};
55use crate::equivec::iunivec;
56use crate::fixed::ftol;
57use crate::opticast::OpticastSettings;
58use crate::ptfaces16::PTFACES16;
59
/// Voxlap's `MAXLIGHTS` cap (`voxlap5.c`): the most point lights a
/// single sprite's lighting table takes into account.
///
/// Sizes the `lightlist` scratch array in
/// `build_kv6colmul_lightmode2` (reached from `update_reflects`'s
/// lightmode≥2 branch), which holds up to `MAX_LIGHTS` projected
/// lights plus one trailing synthetic-ambient slot, and bounds how
/// many in-range lights that function collects.
const MAX_LIGHTS: usize = 16;
64
/// Voxlap's `vx5.kv6mipfactor` default (`voxlap5.c:12335`).
///
/// Threshold distance (in voxlap's "ftol-of-forward-projected"
/// estimate units) above which kv6draw walks the lowermip chain.
/// Roxlap doesn't yet model the lowermip chain in
/// `roxlap-formats::Kv6`, so `kv6_draw_prepare` only references
/// this constant in a discarded binding: the mip descent is
/// effectively a no-op until that lands.
pub(crate) const KV6_MIPFACTOR_DEFAULT: i32 = 128;
72
/// Post-cull state derived from a sprite + camera pair — what the
/// per-voxel iteration in R6.3+ needs to start its setup. Borrows
/// the mip-selected kv6 from the sprite.
///
/// Produced by `kv6_draw_prepare` when the sprite survives the
/// frustum cull.
///
/// Voxlap doesn't materialise this struct (it operates on local
/// variables inside `kv6draw`); roxlap factors the cull out so it's
/// independently testable without staging the rest of the
/// rasterizer.
#[derive(Debug, Clone)]
#[allow(dead_code)] // R6.3+ will read these fields.
pub(crate) struct Kv6DrawSetup<'a> {
    /// Mip-selected kv6. For the base-mip case (always, today),
    /// this is just `&sprite.kv6`.
    pub kv: &'a Kv6,
    /// Mip-scaled x-basis vector. For the base mip this equals
    /// `sprite.s`; if a future lowermip walk runs, it is scaled by
    /// `2^mip`.
    pub ts: [f32; 3],
    /// Mip-scaled y-basis vector (`sprite.h` at the base mip); same
    /// scaling rule as `ts`.
    pub th: [f32; 3],
    /// Mip-scaled z-basis vector (`sprite.f` at the base mip); same
    /// scaling rule as `ts`.
    pub tf: [f32; 3],
    /// 0 for the base mip; reserved for lowermip support.
    pub mip: u32,
}
96
97/// Mip-LOD descent + 4-plane frustum cull, mirror of voxlap5.c:8832-
98/// 8875. Returns `None` if the sprite's bound cube is fully behind
99/// any of the four view-frustum edge planes (`CameraState::nor`),
100/// `Some(setup)` otherwise with the post-cull state R6.3 needs.
101///
102/// # Cull math
103///
104/// The bound cube has centre `npos` (in camera-relative coords) and
105/// three half-extent vectors `nstr`, `nhei`, `nfor` (each = the
106/// kv6-axis basis vector scaled by the corresponding half-extent).
107/// For each frustum-edge normal `n`, voxlap tests:
108///
109/// ```text
110/// |nstr · n| + |nhei · n| + |nfor · n| + npos · n < 0
111/// ```
112///
113/// — i.e. the cube's closest-point projection onto `n` is still
114/// behind the plane. Any plane satisfying this culls the sprite.
115pub(crate) fn kv6_draw_prepare<'a>(
116    sprite: &'a Sprite,
117    cam: &CameraState,
118) -> Option<Kv6DrawSetup<'a>> {
119    let kv = &sprite.kv6;
120
121    // Voxlap's quick-and-dirty distance estimate (voxlap5.c:8835):
122    //   y = ftol((spr->p - gipos) · gifor)
123    // Used by the lowermip descent loop. Roxlap-formats `Kv6` doesn't
124    // model lowermip yet, so the loop never runs and this value is
125    // unused — computed for symmetry with voxlap and to lock the
126    // path for a future mip-chain port.
127    let dx = sprite.p[0] - cam.pos[0];
128    let dy = sprite.p[1] - cam.pos[1];
129    let dz = sprite.p[2] - cam.pos[2];
130    let dist_estimate = ftol(dx * cam.forward[0] + dy * cam.forward[1] + dz * cam.forward[2]);
131    let _ = (dist_estimate, KV6_MIPFACTOR_DEFAULT);
132    let mip = 0u32;
133    let ts = sprite.s;
134    let th = sprite.h;
135    let tf = sprite.f;
136
137    // Bound-cube centre + half-extents in camera-relative coords.
138    // (voxlap5.c:8852-8860; tp is centre offset from pivot, tp2 is
139    // axis half-extent.) kv->xsiz/ysiz/zsiz fit f32 exactly for
140    // any realistic kv6 (≤ 256³ per the file format limit).
141    #[allow(clippy::cast_precision_loss)]
142    let half_x = kv.xsiz as f32 * 0.5;
143    #[allow(clippy::cast_precision_loss)]
144    let half_y = kv.ysiz as f32 * 0.5;
145    #[allow(clippy::cast_precision_loss)]
146    let half_z = kv.zsiz as f32 * 0.5;
147    let off_x = half_x - kv.xpiv;
148    let off_y = half_y - kv.ypiv;
149    let off_z = half_z - kv.zpiv;
150    let npos = [
151        off_x * ts[0] + off_y * th[0] + off_z * tf[0] + dx,
152        off_x * ts[1] + off_y * th[1] + off_z * tf[1] + dy,
153        off_x * ts[2] + off_y * th[2] + off_z * tf[2] + dz,
154    ];
155    let nstr = [ts[0] * half_x, ts[1] * half_x, ts[2] * half_x];
156    let nhei = [th[0] * half_y, th[1] * half_y, th[2] * half_y];
157    let nfor = [tf[0] * half_z, tf[1] * half_z, tf[2] * half_z];
158
159    // 4-plane cull (voxlap5.c:8861-8875, walked z=3..0).
160    for n in &cam.nor {
161        let proj_str = (nstr[0] * n[0] + nstr[1] * n[1] + nstr[2] * n[2]).abs();
162        let proj_hei = (nhei[0] * n[0] + nhei[1] * n[1] + nhei[2] * n[2]).abs();
163        let proj_for = (nfor[0] * n[0] + nfor[1] * n[1] + nfor[2] * n[2]).abs();
164        let proj_pos = npos[0] * n[0] + npos[1] * n[1] + npos[2] * n[2];
165        if proj_str + proj_hei + proj_for + proj_pos < 0.0 {
166            return None;
167        }
168    }
169
170    Some(Kv6DrawSetup {
171        kv,
172        ts,
173        th,
174        tf,
175        mip,
176    })
177}
178
/// Affine transform composition — port of voxlap's `mat2`
/// (voxlap5.c:9619).
///
/// Treats `(a_s, a_h, a_f)` as the columns of a 3×3 matrix `A` with
/// translation `a_o` (the camera transform) and pushes the sprite
/// basis `(b_s, b_h, b_f)` and origin `b_o` through it:
///
/// - `c_s = a_s * b_s.x + a_h * b_s.y + a_f * b_s.z`, and likewise
///   for `c_h` / `c_f`;
/// - `c_o` is the same linear form applied to `b_o`, plus `a_o`.
///
/// Returns `(c_s, c_h, c_f, c_o)`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn mat2(
    a_s: [f32; 3],
    a_h: [f32; 3],
    a_f: [f32; 3],
    a_o: [f32; 3],
    b_s: [f32; 3],
    b_h: [f32; 3],
    b_f: [f32; 3],
    b_o: [f32; 3],
) -> ([f32; 3], [f32; 3], [f32; 3], [f32; 3]) {
    // Linear part of `A` applied to a single vector.
    let apply = |v: [f32; 3]| -> [f32; 3] {
        [
            a_s[0] * v[0] + a_h[0] * v[1] + a_f[0] * v[2],
            a_s[1] * v[0] + a_h[1] * v[1] + a_f[1] * v[2],
            a_s[2] * v[0] + a_h[2] * v[1] + a_f[2] * v[2],
        ]
    };

    // The origin additionally picks up A's translation.
    let [ox, oy, oz] = apply(b_o);
    let c_o = [ox + a_o[0], oy + a_o[1], oz + a_o[2]];

    (apply(b_s), apply(b_h), apply(b_f), c_o)
}
219
/// Voxlap's `lbound(a, b, c)` (voxlap5.c:406): returns `a` limited
/// to the inclusive range `[b, c]` — `b` when `a` is below it, `c`
/// when `a` is above it, `a` itself otherwise.
///
/// # Panics
///
/// Panics if `b > c`; the lower bound must not exceed the upper one.
#[inline]
fn lbound(a: i32, b: i32, c: i32) -> i32 {
    assert!(b <= c);
    if a < b {
        b
    } else if a > c {
        c
    } else {
        a
    }
}
226
/// State derived from `Kv6DrawSetup` + `CameraState` that the
/// per-voxel iteration consumes. Voxlap holds these on the stack
/// inside `kv6draw`; roxlap factors them out so the iteration loop
/// can be tested independently.
#[derive(Debug, Clone)]
#[allow(dead_code)] // Not every field is read by the stages ported so far.
pub(crate) struct Kv6IterState<'a> {
    /// The kv6 model being iterated (borrowed from the sprite).
    pub kv: &'a Kv6,
    /// Camera origin expressed in kv6-local voxel coordinates,
    /// clamped to `[-1, kv.xsiz]` etc. by voxlap's `lbound`. Splits
    /// the voxel grid into the 4 + 1 quadrants the iteration walks
    /// in different orders so that for each (x, y) column the inner
    /// z-loop visits voxels closer to the camera first (= correct
    /// painter's-style ordering for the rasterizer in R6.4).
    pub inx: i32,
    /// Y counterpart of `inx`.
    pub iny: i32,
    /// Z counterpart of `inx`.
    pub inz: i32,
    /// `vx5.xplanemin` mirror — lower x-clipping plane. Voxlap's
    /// default pair is `[0, INT_MAX]` (no x-clipping). Roxlap
    /// doesn't yet expose a public knob for these; pinning to the
    /// defaults matches the oracle and any caller that doesn't care.
    pub nxplanemin: i32,
    /// `vx5.xplanemax` mirror — upper x-clipping plane; see
    /// `nxplanemin`.
    pub nxplanemax: i32,
}
251
/// Full per-frame rasterizer state for one sprite — what
/// `drawboundcubesse` reads via voxlap's globals.
///
/// Built by [`kv6_compute_full_state`] from the post-cull
/// `Kv6DrawSetup` + the camera's projection params. Mirror of the
/// voxlap5.c:8915-8973 setup block + the qsum1/qbplbpp framebuffer
/// state from `voxsetframebuffer` (voxlap5.c:11119-11122) +
/// kv6colmul/kv6coladd from `updatereflects` (voxlap5.c:8466).
#[derive(Debug, Clone)]
pub(crate) struct Kv6FullState<'a> {
    /// Traversal-order state (camera position in voxel space and
    /// x-plane limits) consumed by the per-voxel iteration.
    pub iter: Kv6IterState<'a>,
    /// 8 cube-vertex offsets, gihz-scaled. `cadd4[k]` for `k = 0..7`
    /// is the offset of cube vertex `k` from the voxel origin, where
    /// bit 0 = +x, bit 1 = +z (post-swap == old +z), bit 2 = +y
    /// (post-swap == old -y). `cadd4[0]` is `[0; 4]`. Lane 3 of
    /// each entry duplicates lane 2 (z) — voxlap's SSE convenience.
    pub cadd4: [[f32; 4]; 8],
    /// Per-z step table: `ztab4_per_z[z] = z * cadd4[2]`. Length =
    /// `kv.zsiz`. Indexed by `v.z` in `drawboundcubesse`.
    pub ztab4_per_z: Vec<[f32; 4]>,
    /// Initial r1 — the x=0 column base after voxlap's "ANNOYING
    /// HACK" pre-decrement. = `(npos*gihz with z2=npos.z) -
    /// cadd4[4]`. Iterates by `cadd4[1]` per x and (via r0) by
    /// `cadd4[4]` per y.
    pub r1_initial: [f32; 4],
    /// `r2 = -ysiz * cadd4[4]`. Used to reset r0 between forward-y
    /// and reverse-y phases inside one x column.
    pub r2: [f32; 4],
    /// Near-plane scissor distance (camera-space Z).
    /// `voxlap5.c:8953-8956` — equals the negative sum of any
    /// negative components of post-swap `nstr.z` / `nhei.z` /
    /// `nfor.z`. `0.0` if all three are non-negative.
    pub scisdist: f32,
    /// Viewport-clip biases (voxlap5.c:8947-8948). Used by the SSE2
    /// path's `paddsw` / `pmaxsw` AABB clipping; the scalar port clips
    /// directly against `target.width` / `target.height`.
    #[allow(dead_code)]
    pub qsum0: [i16; 4],
    /// Viewport-clip floor (voxlap5.c:11120).
    #[allow(dead_code)]
    pub qsum1: [i16; 4],
    /// Framebuffer pixel-stride packed for `pmaddwd` (voxlap5.c:11121).
    #[allow(dead_code)]
    pub qbplbpp: [i16; 4],
    /// Per-direction colour modulation table built by
    /// [`update_reflects`]. Indexed by `v.dir` (256 entries). Each
    /// entry packs four `u16` modulation factors (one per byte
    /// channel, lowest channel in the lowest 16 bits) used by
    /// `_mm_mulhi_epu16` against the unpacked voxel colour.
    pub kv6colmul: Box<[u64; 256]>,
    /// Fog bias added after the colour modulate. Zero when fog is
    /// disabled (the oracle case); [`update_reflects`] currently
    /// always produces zero.
    pub kv6coladd: u64,
}
306
/// Borrowed framebuffer + zbuffer the per-voxel rasterizer fills.
///
/// Mirrors voxlap's `kv6frameplace` + `zbuffermem` but in
/// row-major-pixel form rather than byte-pointer form. `width` /
/// `height` must match the `OpticastSettings.xres` / `yres` used
/// when the per-frame `Kv6FullState` was built — the bounds derived
/// from `qsum0` / `qsum1` assume that geometry.
///
/// Internally a raw-pointer view (similar to
/// [`crate::scalar_rasterizer::RasterTarget`]) so the type is `Copy
/// + Send + Sync` and the R12.4.2 [`draw_sprites_parallel`] entry
/// point can hand per-thread copies into rayon worker closures.
/// Each parallel sprite-draw competes for the framebuffer / zbuffer
/// via z-test; for non-overlapping sprites this is race-free, for
/// overlapping pixels a tied-z race may leak (visually
/// indistinguishable, hash non-deterministic).
///
/// All pixel accessors are `unsafe` and index-based: the caller is
/// responsible for keeping `idx` inside both buffers (the lengths
/// are recorded at construction and only checked by
/// `debug_assert!`).
#[derive(Clone, Copy, Debug)]
pub struct DrawTarget<'a> {
    /// Start of the framebuffer slice handed to [`DrawTarget::new`].
    fb_ptr: *mut u32,
    /// Length of that slice, in pixels.
    fb_len: usize,
    /// Start of the zbuffer slice handed to [`DrawTarget::new`].
    zb_ptr: *mut f32,
    /// Length of that slice, in entries.
    zb_len: usize,
    /// Row stride in pixels.
    pub pitch_pixels: usize,
    /// Visible width in pixels.
    pub width: u32,
    /// Visible height in pixels.
    pub height: u32,
    /// Ties the raw pointers to the lifetime of the exclusive
    /// borrows they were created from.
    _marker: std::marker::PhantomData<&'a mut [u32]>,
}

// SAFETY: same shape as the (`&'a mut [u32]`, `&'a mut [f32]`) pair
// the constructor consumed; both are auto-`Send` for `T: Send`. The
// pointer-aliasing safety contract for [`draw_sprites_parallel`] is
// "z-test arbitrates concurrent writes" — a tied-z race is a
// determinism issue, not a memory-safety issue.
//
// NOTE(review): the accessors below use plain (non-atomic) pointer
// reads and writes. Under Rust's memory model, two threads touching
// the same `idx` without synchronisation where at least one writes
// is a data race, which is undefined behaviour regardless of the
// z values involved. Parallel callers therefore need to guarantee
// disjoint pixels (or the buffers need atomics) — confirm against
// `draw_sprites_parallel`.
unsafe impl Send for DrawTarget<'_> {}
unsafe impl Sync for DrawTarget<'_> {}

impl<'a> DrawTarget<'a> {
    /// Build a target from exclusive slice borrows + framebuffer
    /// dimensions. The slices are consumed (their `&'a mut`
    /// re-borrow is what gates lifetime); subsequent access happens
    /// via the raw pointers held in the struct.
    ///
    /// No consistency check is made between `pitch_pixels` /
    /// `width` / `height` and the slice lengths; the `unsafe`
    /// accessors rely on the caller passing in-bounds indices.
    #[must_use]
    pub fn new(
        framebuffer: &'a mut [u32],
        zbuffer: &'a mut [f32],
        pitch_pixels: usize,
        width: u32,
        height: u32,
    ) -> Self {
        let (fb_len, zb_len) = (framebuffer.len(), zbuffer.len());
        Self {
            fb_ptr: framebuffer.as_mut_ptr(),
            fb_len,
            zb_ptr: zbuffer.as_mut_ptr(),
            zb_len,
            pitch_pixels,
            width,
            height,
            _marker: std::marker::PhantomData,
        }
    }

    /// Unconditional framebuffer write. Used by sequential 2D
    /// blitters (`drawtile`) that don't engage z-testing.
    ///
    /// # Safety
    /// `idx < self.fb_len`. The disjoint-write contract still
    /// applies if multiple `Copy` instances of `DrawTarget` are in
    /// flight across threads — this method does NOT arbitrate via
    /// z-test.
    #[inline]
    pub unsafe fn fb_write(self, idx: usize, color: u32) {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: caller asserts in-bounds + (for parallel use)
        // disjoint writes.
        unsafe { self.fb_ptr.add(idx).write(color) };
    }

    /// Read one framebuffer pixel. Used by alpha-blend paths
    /// (`drawtile` modulate-and-blend) that read-modify-write.
    ///
    /// # Safety
    /// `idx < self.fb_len`. Concurrent writers to the same `idx`
    /// from another thread invalidate the read; sequential blits
    /// are race-free.
    #[inline]
    #[must_use]
    pub unsafe fn fb_read(self, idx: usize) -> u32 {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: caller asserts in-bounds.
        unsafe { self.fb_ptr.add(idx).read() }
    }

    /// Z-tested pixel write. If `z < zbuffer[idx]`, the new color +
    /// z stamp the buffers and `true` is returned; otherwise nothing
    /// changes and the result is `false`. A NaN `z` never passes the
    /// test.
    ///
    /// # Safety
    /// `idx < self.fb_len` **and** `idx < self.zb_len` — both
    /// buffers are accessed at `idx`. For parallel callers, the
    /// wedge / z-test arbitration contract on [`DrawTarget`] applies
    /// (see struct doc).
    #[inline]
    pub unsafe fn z_test_write(self, idx: usize, color: u32, z: f32) -> bool {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
        // SAFETY: caller asserts `idx` is in bounds of both buffers
        // and upholds the concurrent-write contract.
        unsafe {
            let z_slot = self.zb_ptr.add(idx);
            let closer = z < z_slot.read();
            if closer {
                z_slot.write(z);
                self.fb_ptr.add(idx).write(color);
            }
            closer
        }
    }
}
425
/// Lane-wise sum of two 4-float vectors (`a + b`).
#[inline]
fn vec4_add(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    std::array::from_fn(|lane| a[lane] + b[lane])
}
430
/// Lane-wise difference of two 4-float vectors (`a - b`).
#[inline]
fn vec4_sub(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    std::array::from_fn(|lane| a[lane] - b[lane])
}
435
/// Multiplies every lane of a 4-float vector by the scalar `s`.
#[inline]
fn vec4_scale(a: [f32; 4], s: f32) -> [f32; 4] {
    a.map(|lane| lane * s)
}
440
441/// Sprite lighting + colour state — the subset of voxlap's
442/// `vx5` global that `updatereflects` reads. Built once per
443/// frame from [`Engine`] state and passed to [`draw_sprite`].
444///
445/// All fields mirror voxlap names:
446/// - `kv6col` ↔ `vx5.kv6col`
447/// - `lightmode` ↔ `vx5.lightmode`
448/// - `lights` ↔ `vx5.lightsrc[0..vx5.numlights]`
449///
450/// The `vx5.fogcol`/`ofogdist` fog plumbing is deferred — sprite
451/// fog stays off for now, matching the oracle path
452/// (`vx5.fogcol < 0` ⇒ `ofogdist == -1` in voxlap C, no fog).
453#[derive(Debug, Clone, Copy)]
454pub struct SpriteLighting<'a> {
455    /// Material colour. R==G==B triggers the cheaper nolighta path
456    /// in `update_reflects`; arbitrary RGB takes the per-channel
457    /// nolightb path; lightmode≥2 ignores the R==G==B fast path
458    /// and always does per-channel modulation.
459    pub kv6col: u32,
460    /// `0` / `1` → directional surface tint (lightmode<2 paths).
461    /// `2` → per-light shadow-side modulation against `lights`.
462    pub lightmode: u32,
463    /// Active point lights — voxlap's `vx5.lightsrc[..vx5.numlights]`.
464    /// Empty for lightmode<2; populated for lightmode≥2.
465    pub lights: &'a [LightSrc],
466}
467
468impl<'a> SpriteLighting<'a> {
469    /// Snapshot the lighting + colour subset of an [`Engine`].
470    /// Use this once per frame in the host so the sprite render
471    /// reflects engine setters made between frames.
472    #[must_use]
473    pub fn from_engine(engine: &'a Engine) -> Self {
474        Self {
475            kv6col: engine.kv6col(),
476            lightmode: engine.lightmode(),
477            lights: engine.lights(),
478        }
479    }
480}
481
482impl SpriteLighting<'static> {
483    /// Default oracle config — grey `kv6col`, lightmode 0, no
484    /// lights. Used by `roxlap-oracle` so the four sprite golden
485    /// hashes stay byte-stable: this is the exact state voxlap C's
486    /// oracle has when it calls `drawsprite`.
487    #[must_use]
488    pub fn default_oracle() -> Self {
489        Self {
490            kv6col: DEFAULT_KV6COL,
491            lightmode: 0,
492            lights: &[],
493        }
494    }
495}
496
497/// Builds `kv6colmul[256]` + `kv6coladd[0]` from the engine's
498/// sprite lighting state. Mirror of voxlap's `updatereflects`
499/// (`voxlap5.c:8466-8750`).
500///
501/// Branches:
502/// - `lightmode < 2` + R==G==B `kv6col` → nolighta (cheap
503///   single-multiplier path, voxlap5.c:8553-8584).
504/// - `lightmode < 2` + arbitrary `kv6col` → nolightb (per-channel
505///   path, voxlap5.c:8587-8629).
506/// - `lightmode >= 2` → per-light shadow-side modulation
507///   (voxlap5.c:8631-8750), iterating the active `lights`.
508///
509/// `flags & 1` (disable shading) and the active-fog path remain
510/// deferred — neither is exercised by the oracle's four sprite
511/// poses, and adding them is a follow-up that doesn't change the
512/// already-frozen hashes.
513///
514fn update_reflects(sprite: &Sprite, lighting: &SpriteLighting<'_>) -> (Box<[u64; 256]>, u64) {
515    // Sprite fog plumbing is a follow-up — `vx5.fogcol < 0` (voxlap
516    // C oracle's set_fogcol(BR(...)) state) means ofogdist stays -1,
517    // fogmul = 0, kv6coladd[0] = 0. We pin to that here.
518    let fogmul_lo: u32 = 0;
519    let kv6coladd: u64 = 0;
520
521    let kv6col = lighting.kv6col;
522
523    // g = ((fogmul & 32767) ^ 32767) * (16*8/65536). With fogmul=0:
524    //   g = 32767 * (128/65536) ≈ 63.998.
525    let g_pre = ((((fogmul_lo & 0x7fff) ^ 0x7fff) as i32) as f32) * (16.0 * 8.0 / 65536.0);
526
527    let mut kv6colmul = Box::new([0u64; 256]);
528
529    if lighting.lightmode < 2 {
530        // (voxlap5.c:8538-8543) fx=fy=fz=1.0; tp = sum of basis vectors.
531        let tp_x = sprite.s[0] + sprite.h[0] + sprite.f[0];
532        let tp_y = sprite.s[1] + sprite.h[1] + sprite.f[1];
533        let tp_z = sprite.s[2] + sprite.h[2] + sprite.f[2];
534
535        let f0 = 64.0_f32 / (tp_x * tp_x + tp_y * tp_y + tp_z * tp_z).sqrt();
536
537        // R==G==B test: ((kv6col & 0xffff) << 8) ^ (kv6col & 0xffff00)
538        //   == 0  iff  R == G and G == B.
539        let lo16 = kv6col & 0xffff;
540        let mid24 = kv6col & 0x00ff_ff00;
541        let is_grey = ((lo16 << 8) ^ mid24) == 0;
542
543        if is_grey {
544            // Nolighta path (voxlap5.c:8553-8584): grey kv6col absorbs
545            // into a single multiplier per direction.
546            let g = g_pre * (((kv6col & 0xff) as f32) / 256.0);
547            let f = f0 * g;
548
549            let l0 = (tp_x * f) as i16; // (short)(...) is C truncating cast
550            let l1 = (tp_y * f) as i16;
551            let l2 = (tp_z * f) as i16;
552            let l3 = (g * 128.0) as i16;
553
554            let iu = iunivec();
555            for k in 0..256 {
556                let w = dot_iunivec_i16x4(iu[k], [l0, l1, l2, l3]);
557                let w64 = u64::from(w);
558                kv6colmul[k] = w64 | (w64 << 16) | (w64 << 32) | (w64 << 48);
559            }
560        } else {
561            // Nolightb path (voxlap5.c:8587-8629). Per-channel
562            // modulation factor M_k = (kv6col_byte_k << 8) → mulhi_pu16
563            // by the per-direction dot. Same dot derivation as nolighta.
564            let f = f0 * g_pre;
565
566            let l0 = (tp_x * f) as i16;
567            let l1 = (tp_y * f) as i16;
568            let l2 = (tp_z * f) as i16;
569            let l3 = (g_pre * 128.0) as i16;
570
571            let m = kv6col_channel_mods(kv6col);
572
573            let iu = iunivec();
574            for k in 0..256 {
575                let w = dot_iunivec_i16x4(iu[k], [l0, l1, l2, l3]);
576                kv6colmul[k] = pack_modulated_word(w, m);
577            }
578        }
579    } else {
580        // Lightmode≥2 path (voxlap5.c:8631-8750): per-sprite point
581        // lighting from `lighting.lights`. Each light projects onto
582        // the sprite's normalised basis; per-direction kv6colmul[i]
583        // starts from a synthetic ambient slot and subtracts shadow
584        // contributions from each light's "negative" lanes.
585        let m = kv6col_channel_mods(kv6col);
586        build_kv6colmul_lightmode2(sprite, lighting.lights, &mut kv6colmul, fogmul_lo, m);
587    }
588
589    (kv6colmul, kv6coladd)
590}
591
/// Scalar model of voxlap's `pmaddwd(iunivec[k], lightlist)`
/// reduction: the four signed 16×16-bit lane products are summed
/// modulo 2^32 and the high 16 bits of that sum are returned — the
/// `u16` modulation factor before any per-channel packing.
#[inline]
fn dot_iunivec_i16x4(u: [i16; 4], l: [i16; 4]) -> u16 {
    let sum = u.iter().zip(l.iter()).fold(0u32, |acc, (&a, &b)| {
        acc.wrapping_add(i32::from(a).wrapping_mul(i32::from(b)) as u32)
    });
    (sum >> 16) as u16
}
607
/// Splits `kv6col` into its four bytes (least-significant first)
/// and shifts each into the high byte of a `u16`, giving the
/// `M_k = kv6col_byte_k << 8` factors that the nolightb /
/// lightmode≥2 paths multiply against the per-direction dot via
/// `pmulhuw`.
#[inline]
fn kv6col_channel_mods(kv6col: u32) -> [u16; 4] {
    kv6col.to_le_bytes().map(|channel| u16::from(channel) << 8)
}
620
/// Builds one direction's `kv6colmul[k]` entry: for each channel
/// `c` the 16-bit word `(W * M_c) >> 16` is placed in bits
/// `16*c .. 16*c + 15` of the returned `u64`.
#[inline]
fn pack_modulated_word(w_dot: u16, m: [u16; 4]) -> u64 {
    let w = u32::from(w_dot);
    m.iter().enumerate().fold(0u64, |packed, (lane, &mc)| {
        // The product of two u16 values fits in u32, and its high
        // half always fits back into 16 bits.
        let word = ((w * u32::from(mc)) >> 16) as u16;
        packed | (u64::from(word) << (16 * lane))
    })
}
632
633/// Lightmode≥2 path body — voxlap5.c:8631-8750. Builds the full
634/// `kv6colmul[256]` from the active light list.
635///
636/// Steps:
637/// 1. Normalise each sprite-basis axis (`sprs`/`sprh`/`sprf`).
638/// 2. For each light within `r2` of the sprite, compute its
639///    intensity falloff `h` and project the world-space delta onto
640///    the normalised sprite basis → store in `lightlist[k]`.
641/// 3. Append a synthetic ambient slot (voxlap's hardcoded
642///    `(fx, fy, fz) = (0, 0.5, 1.0)` direction) at
643///    `lightlist[lightcnt]`.
644/// 4. For each direction `idx ∈ 0..256`:
645///    - `base = ambient_slot · iunivec[idx]` (treated as one u32).
646///    - For each real light `k`: compute `dot = light_k ·
647///      iunivec[idx]`, split into low/high i16 lanes (asm-faithful
648///      "16-bits-is-ugly-but-ok-here" quirk); subtract the negative
649///      lanes from `base` (= shadow side of the surface).
650///    - `W = base >> 16`, then per-channel modulate against `M_c`
651///      and pack into `kv6colmul[idx]`.
652fn build_kv6colmul_lightmode2(
653    sprite: &Sprite,
654    lights: &[LightSrc],
655    kv6colmul: &mut [u64; 256],
656    fogmul_lo: u32,
657    m: [u16; 4],
658) {
659    // (voxlap5.c:8638-8643) Normalise sprite basis. WARNING from
660    // voxlap: only correct for orthonormal sprite-bases; non-
661    // orthogonal bases (e.g. shears) drift. The four oracle sprite
662    // poses are all orthonormal so this matches voxlap's behaviour.
663    let sprs = normalise(sprite.s);
664    let sprh = normalise(sprite.h);
665    let sprf = normalise(sprite.f);
666
667    // hh = ((fogmul & 32767) ^ 32767) / 65536 * 2 (voxlap5.c:8645).
668    // With fogmul=0 → hh = 32767 / 65536 * 2 ≈ 1.0. This is a
669    // distinct scaling from `g_pre` (= same numerator * 128/65536
670    // for the lightmode<2 path) — they differ by a factor of 64.
671    // An earlier port mistakenly derived hh from g_pre / 128 = 0.5,
672    // giving sprites half the intended ambient brightness.
673    let hh_initial = ((((fogmul_lo & 0x7fff) ^ 0x7fff) as i32) as f32) * (2.0 / 65536.0);
674
675    // Project each in-range light onto the sprite basis.
676    let mut lightlist: [[i16; 4]; MAX_LIGHTS + 1] = [[0; 4]; MAX_LIGHTS + 1];
677    let mut lightcnt: usize = 0;
678    for light in lights.iter().rev() {
679        if lightcnt >= MAX_LIGHTS {
680            break;
681        }
682        let fx = light.pos[0] - sprite.p[0];
683        let fy = light.pos[1] - sprite.p[1];
684        let fz = light.pos[2] - sprite.p[2];
685        let gg = fx * fx + fy * fy + fz * fz;
686        let ff = light.r2;
687        // Voxlap's `*(int32_t *)&gg < *(int32_t *)&ff` is a bit-
688        // pattern compare. For non-negative finite floats the bit
689        // order matches the magnitude order, so `gg < ff` is
690        // equivalent (and safer in the presence of NaN: NaN !< x
691        // for any x, matching voxlap's float-bit-cast trick).
692        if gg >= ff || gg <= 0.0 {
693            continue;
694        }
695        let f = ff.sqrt();
696        let g = gg.sqrt();
697        // h = (f*ff - g*gg) / (f*ff*g*gg) * sc * 16
698        let mut h = (f * ff - g * gg) / (f * ff * g * gg) * light.sc * 16.0;
699        if g * h > 4096.0 {
700            h = 4096.0 / g; // saturation clip
701        }
702        h *= hh_initial;
703        let l0 = (fx * sprs[0] + fy * sprs[1] + fz * sprs[2]) * h;
704        let l1 = (fx * sprh[0] + fy * sprh[1] + fz * sprh[2]) * h;
705        let l2 = (fx * sprf[0] + fy * sprf[1] + fz * sprf[2]) * h;
706        lightlist[lightcnt] = [l0 as i16, l1 as i16, l2 as i16, 0];
707        lightcnt += 1;
708    }
709
710    // Synthetic ambient slot: voxlap's hardcoded direction
711    // (fx, fy, fz) = (0, 0.5, 1.0) projected onto the sprite basis,
712    // scaled by `hh * 16*16*8/2 = hh * 1024`. The lane-3 bias is
713    // `hh * 48 / 16 = hh * 3`.
714    let amb_fx = 0.0_f32;
715    let amb_fy = 0.5_f32;
716    let amb_fz = 1.0_f32;
717    let hh = hh_initial * (16.0 * 16.0 * 8.0 / 2.0);
718    let al0 = (sprs[0] * amb_fx + sprs[1] * amb_fy + sprs[2] * amb_fz) * hh;
719    let al1 = (sprh[0] * amb_fx + sprh[1] * amb_fy + sprh[2] * amb_fz) * hh;
720    let al2 = (sprf[0] * amb_fx + sprf[1] * amb_fy + sprf[2] * amb_fz) * hh;
721    let al3 = hh * (48.0 / 16.0);
722    lightlist[lightcnt] = [al0 as i16, al1 as i16, al2 as i16, al3 as i16];
723
724    let iu = iunivec();
725    for idx in 0..256 {
726        let u = iu[idx];
727        // Ambient base = lightlist[lightcnt] · iunivec[idx], in u32
728        // wrapping arithmetic (asm summed the pmaddwd dword lanes
729        // mod 2^32).
730        let u0 = i32::from(u[0]);
731        let u1 = i32::from(u[1]);
732        let u2 = i32::from(u[2]);
733        let u3 = i32::from(u[3]);
734        let amb = lightlist[lightcnt];
735        let base_lo = (u0.wrapping_mul(i32::from(amb[0]))) as u32;
736        let base_lo = base_lo.wrapping_add((u1.wrapping_mul(i32::from(amb[1]))) as u32);
737        let base_hi = (u2.wrapping_mul(i32::from(amb[2]))) as u32;
738        let base_hi = base_hi.wrapping_add((u3.wrapping_mul(i32::from(amb[3]))) as u32);
739        let mut base = base_lo.wrapping_add(base_hi);
740
741        // For each real light, compute dot, then subtract its
742        // "negative" half-lanes from `base` (= shadow side).
743        for k in (0..lightcnt).rev() {
744            let l = lightlist[k];
745            let klo = (u0.wrapping_mul(i32::from(l[0]))) as u32;
746            let klo = klo.wrapping_add((u1.wrapping_mul(i32::from(l[1]))) as u32);
747            let khi = (u2.wrapping_mul(i32::from(l[2]))) as u32;
748            let khi = khi.wrapping_add((u3.wrapping_mul(i32::from(l[3]))) as u32);
749            let dot = klo.wrapping_add(khi);
750            // Voxlap quirk: 32-bit dot but pminsw is per-i16 lane.
751            // Light magnitudes stay clamped enough that the
752            // mixed-lane behaviour is benign — port faithfully.
753            let lo16 = (dot & 0xffff) as i16;
754            let hi16 = ((dot >> 16) & 0xffff) as i16;
755            let lo16c: u16 = if lo16 < 0 { lo16 as u16 } else { 0 };
756            let hi16c: u16 = if hi16 < 0 { hi16 as u16 } else { 0 };
757            let sub = (u32::from(hi16c) << 16) | u32::from(lo16c);
758            base = base.wrapping_sub(sub);
759        }
760
761        let w_dot = (base >> 16) as u16;
762        kv6colmul[idx] = pack_modulated_word(w_dot, m);
763    }
764}
765
/// Scale a 3-vector to unit length.
///
/// A vector whose squared length is `<= 0.0` is handed back
/// untouched, so a zero basis axis never turns into NaNs through a
/// division by zero (voxlap's bare `1.0 / sqrt(...)` has no such
/// guard, but the C code is never given a zero axis).
#[inline]
fn normalise(v: [f32; 3]) -> [f32; 3] {
    let [x, y, z] = v;
    let len_sq = x * x + y * y + z * z;
    if len_sq <= 0.0 {
        // Degenerate input: nothing sensible to scale by.
        return v;
    }
    // `recip()` is `1.0 / self`; one reciprocal, three multiplies.
    let scale = len_sq.sqrt().recip();
    [x * scale, y * scale, z * scale]
}
779
780/// Full setup: mat2 + Cramer's + nfor↔nhei swap + cadd4/ztab4/r1/r2/
781/// scisdist/qsum0 init. Mirror of voxlap5.c:8915-8973.
782pub(crate) fn kv6_compute_full_state<'a>(
783    setup: &Kv6DrawSetup<'a>,
784    sprite: &Sprite,
785    lighting: &SpriteLighting<'_>,
786    cam: &CameraState,
787    settings: &OpticastSettings,
788    fb_width: u32,
789    fb_height: u32,
790    fb_pitch_pixels: usize,
791) -> Kv6FullState<'a> {
792    let sprite_pos = sprite.p;
793    let kv = setup.kv;
794
795    // Transform sprite basis from world to camera-relative
796    // screen-axis coords (voxlap5.c:8916). `(gixs, giys, gizs)` is
797    // the transposed camera basis; `giadd` is the translation half.
798    let (nstr, mut nhei, mut nfor, mut npos) = mat2(
799        cam.xs, cam.ys, cam.zs, cam.add, setup.ts, setup.th, setup.tf, sprite_pos,
800    );
801
802    // Shift `npos` so it points at the kv6 origin (corner [0,0,0])
803    // rather than the pivot point — Cramer's rule below solves for
804    // the camera origin in kv6-local voxel coords, which only makes
805    // sense relative to the corner. (voxlap5.c:8917-8919)
806    npos[0] -= kv.xpiv * nstr[0] + kv.ypiv * nhei[0] + kv.zpiv * nfor[0];
807    npos[1] -= kv.xpiv * nstr[1] + kv.ypiv * nhei[1] + kv.zpiv * nfor[1];
808    npos[2] -= kv.xpiv * nstr[2] + kv.ypiv * nhei[2] + kv.zpiv * nfor[2];
809
810    // Cramer's rule for `nstr * X + nhei * Y + nfor * Z + npos = 0`.
811    // (voxlap5.c:8923-8936)
812    let tp = [
813        nhei[1] * nfor[2] - nfor[1] * nhei[2],
814        nfor[1] * nstr[2] - nstr[1] * nfor[2],
815        nstr[1] * nhei[2] - nhei[1] * nstr[2],
816    ];
817    let det = nstr[0] * tp[0] + nhei[0] * tp[1] + nfor[0] * tp[2];
818    // Float-bit comparison against zero: matches voxlap's
819    // `if (f != 0)` and dodges clippy::float_cmp.
820    let (raw_inx, raw_iny, raw_inz) = if det.to_bits() & 0x7fff_ffff != 0 {
821        let f_inv = -1.0 / det;
822        let tp2 = [
823            npos[1] * nfor[2] - nfor[1] * npos[2],
824            nhei[1] * npos[2] - npos[1] * nhei[2],
825            npos[1] * nstr[2] - nstr[1] * npos[2],
826        ];
827        (
828            ftol((npos[0] * tp[0] - nhei[0] * tp2[0] - nfor[0] * tp2[1]) * f_inv),
829            ftol((npos[0] * tp[1] + nstr[0] * tp2[0] - nfor[0] * tp2[2]) * f_inv),
830            ftol((npos[0] * tp[2] + nstr[0] * tp2[1] + nhei[0] * tp2[2]) * f_inv),
831        )
832    } else {
833        (-1, -1, -1)
834    };
835
836    let xsiz_i = kv.xsiz as i32;
837    let ysiz_i = kv.ysiz as i32;
838    let zsiz_i = kv.zsiz as i32;
839    let iter = Kv6IterState {
840        kv,
841        inx: lbound(raw_inx, -1, xsiz_i),
842        iny: lbound(raw_iny, -1, ysiz_i),
843        inz: lbound(raw_inz, -1, zsiz_i),
844        // Voxlap default `vx5.xplanemin = 0`, `xplanemax = 0x7fffffff`.
845        nxplanemin: 0,
846        nxplanemax: i32::MAX,
847    };
848
849    // Swap `nhei` ↔ `nfor` with sign flip on the new `nfor`
850    // (voxlap5.c:8942-8944). Equivalent to a 90° rotation that lines
851    // the basis up with cadd4's bit-encoded vertex offsets:
852    //   cadd4[1] = +x  (post-swap nstr direction)
853    //   cadd4[2] = +z  (post-swap nhei direction == original +z)
854    //   cadd4[4] = +y  (post-swap nfor direction == original -y)
855    // After this point `nfor` / `nhei` carry the post-swap values.
856    let swap_x = nhei[0];
857    nhei[0] = nfor[0];
858    nfor[0] = -swap_x;
859    let swap_y = nhei[1];
860    nhei[1] = nfor[1];
861    nfor[1] = -swap_y;
862    let swap_z = nhei[2];
863    nhei[2] = nfor[2];
864    nfor[2] = -swap_z;
865
866    // qsum0 (voxlap5.c:8947-8948). The `0x7fff - (xres - hx)`
867    // form sets the bias such that adding it to a screen-space
868    // bound makes the bound saturate-positive when it lands
869    // inside the viewport.
870    let xres_i = settings.xres as i32;
871    let yres_i = settings.yres as i32;
872    let hx_i = ftol(settings.hx);
873    let hy_i = ftol(settings.hy);
874    let qsum0_x = (0x7fff - (xres_i - hx_i)) as i16;
875    let qsum0_y = (0x7fff - (yres_i - hy_i)) as i16;
876    let qsum0 = [qsum0_x, qsum0_y, qsum0_x, qsum0_y];
877
878    // scisdist (voxlap5.c:8953-8956). Voxlap's `*(int32_t *)&f < 0`
879    // bit-trick: a positive-finite float has bit-pattern >= 0;
880    // only *negative* floats land < 0 as signed int. So this loop
881    // sums the absolute value of any negative-z post-swap basis
882    // component into a near-plane bias.
883    let mut scisdist = 0.0f32;
884    if (nstr[2].to_bits() as i32) < 0 {
885        scisdist -= nstr[2];
886    }
887    if (nhei[2].to_bits() as i32) < 0 {
888        scisdist -= nhei[2];
889    }
890    if (nfor[2].to_bits() as i32) < 0 {
891        scisdist -= nfor[2];
892    }
893
894    // cadd4 step table (voxlap5.c:8958-8961). cadd4[1/2/4] are the
895    // three primary axis steps (x / z / y, post-swap); cadd4[3/5/6/7]
896    // are bit-OR sums (3 = 1+2, 5 = 1+4, 6 = 2+4, 7 = 3+4).
897    let gihz = settings.hz;
898    let cadd1 = [nstr[0] * gihz, nstr[1] * gihz, nstr[2], nstr[2]];
899    let cadd2 = [nhei[0] * gihz, nhei[1] * gihz, nhei[2], nhei[2]];
900    let cadd4_axis = [nfor[0] * gihz, nfor[1] * gihz, nfor[2], nfor[2]];
901    let cadd3 = vec4_add(cadd1, cadd2);
902    let cadd5 = vec4_add(cadd1, cadd4_axis);
903    let cadd6 = vec4_add(cadd2, cadd4_axis);
904    let cadd7 = vec4_add(cadd3, cadd4_axis);
905    let cadd4 = [
906        [0.0; 4], cadd1, cadd2, cadd3, cadd4_axis, cadd5, cadd6, cadd7,
907    ];
908
909    // ztab4 per-z step table (voxlap5.c:8973). ztab4[z] = z * cadd4[2]
910    // built incrementally by addps so per-step rounding matches.
911    let zsiz = kv.zsiz as usize;
912    let mut ztab4_per_z = Vec::with_capacity(zsiz);
913    if zsiz > 0 {
914        ztab4_per_z.push([0.0f32; 4]);
915        for i in 1..zsiz {
916            let prev = ztab4_per_z[i - 1];
917            ztab4_per_z.push(vec4_add(prev, cadd4[2]));
918        }
919    }
920
921    // r1 init (voxlap5.c:8961, 8976). Post-mat2 npos becomes the
922    // raw column-base; gihz-scale x/y; z lane keeps unscaled npos.z;
923    // z2 lane (lane 3) duplicates z. Then "ANNOYING HACK"
924    // pre-decrement by cadd4[4].
925    let r1_pre = [npos[0] * gihz, npos[1] * gihz, npos[2], npos[2]];
926    let r1_initial = vec4_sub(r1_pre, cadd4[4]);
927
928    // r2 = -ysiz * cadd4[4] (voxlap5.c:8974). intss + mulps in voxlap.
929    let r2 = vec4_scale(cadd4[4], -(ysiz_i as f32));
930
931    // qsum1 + qbplbpp from voxsetframebuffer (voxlap5.c:11119-11122).
932    // The framebuffer geometry is independent of the camera projection
933    // — these are derived from `(width, height, pitch_bytes)`.
934    let pitch_bytes = (fb_pitch_pixels as i32).saturating_mul(4);
935    let qsum1_x = 0x7fff_i32 - fb_width as i32;
936    let qsum1_y = 0x7fff_i32 - fb_height as i32;
937    let qsum1 = [
938        qsum1_x as i16,
939        qsum1_y as i16,
940        qsum1_x as i16,
941        qsum1_y as i16,
942    ];
943    let qbplbpp = [4i16, pitch_bytes as i16, 4, pitch_bytes as i16];
944
945    let (kv6colmul, kv6coladd) = update_reflects(sprite, lighting);
946
947    Kv6FullState {
948        iter,
949        cadd4,
950        ztab4_per_z,
951        r1_initial,
952        r2,
953        scisdist,
954        qsum0,
955        qsum1,
956        qbplbpp,
957        kv6colmul,
958        kv6coladd,
959    }
960}
961
962/// Per-voxel rasterizer (R6.4 complete).
963///
964/// Mirror of `voxlap5.c:8179-8320` (`drawboundcubesse`). For each
965/// voxel:
966/// 1. `effmask = mask & v.vis` early-out.
967/// 2. `origin = r0 + ztab4_per_z[v.z]`; scissor on `origin.z`.
968/// 3. Look up `ptfaces16[effmask]` — `face[0]` = 4 or 6 vertex
969///    count, `face[1..7]` = byte offsets into `caddasm` (the
970///    `cadd4[8]` array, each entry 16 bytes).
971/// 4. For each vertex pair (a, b), compute the projected screen
972///    coords as `(cadd4[a] + origin).xy / (cadd4[a] + origin).z`
973///    via `_mm_rcp_ps`.
974/// 5. Pack the 4 (or 6) projected vertices to int16, min/max-reduce
975///    to a single screen-AABB, viewport-clip via `qsum0` /
976///    `qsum1`, and early-out on degenerate rect.
977/// 6. Compute the per-voxel colour via the `mm5` cross-call tail +
978///    `kv6colmul[v.dir]` + `kv6coladd[0]` modulation.
979/// 7. Fill the screen rectangle with z-test + framebuffer write.
980///
981/// Returns the number of pixels actually written (z-test passing).
982/// Tests use this as a sanity gate; production callers ignore it.
983///
984/// `mm5_tail` is voxlap's static cross-call register tail
985/// (voxlap5.c:8170-8177). It carries one byte of contribution from
986/// the previous voxel's colour into the current; bit-equality with
987/// the asm requires preserving it across calls within one sprite.
988///
989/// Currently x86_64-only — relies on `_mm_rcp_ps` for bit-equality
990/// with voxlap C. NEON / wasm ports will need their own goldens
991/// (see `PORTING-RUST.md` R9 / R10).
992#[cfg(target_arch = "x86_64")]
993#[allow(clippy::trivially_copy_pass_by_ref)] // hot loop; matches voxlap's pointer-passed v.
994pub(crate) fn drawboundcubesse(
995    v: &Voxel,
996    mask: u32,
997    state: &Kv6FullState<'_>,
998    r0: [f32; 4],
999    mm5_tail: &mut u32,
1000    target: &mut DrawTarget<'_>,
1001) -> u32 {
1002    use core::arch::x86_64::{
1003        __m128, __m128i, _mm_add_epi16, _mm_add_ps, _mm_adds_epi16, _mm_cvtsi128_si32,
1004        _mm_cvtsi32_si128, _mm_cvttps_epi32, _mm_loadl_epi64, _mm_loadu_ps, _mm_madd_epi16,
1005        _mm_max_epi16, _mm_min_epi16, _mm_movehl_ps, _mm_movelh_ps, _mm_mul_ps, _mm_mulhi_epu16,
1006        _mm_packs_epi32, _mm_packus_epi16, _mm_rcp_ps, _mm_setzero_si128, _mm_shufflelo_epi16,
1007        _mm_storeu_ps, _mm_storeu_si128, _mm_subs_epu16, _mm_unpackhi_epi64, _mm_unpacklo_epi32,
1008        _mm_unpacklo_epi8,
1009    };
1010
1011    let effmask = (mask & u32::from(v.vis)) as usize;
1012    if effmask == 0 || effmask >= PTFACES16.len() {
1013        return 0;
1014    }
1015    let face = PTFACES16[effmask];
1016    if face[0] == 0 {
1017        return 0;
1018    }
1019
1020    // origin = r0 + ztab4_per_z[v.z] (4 f32 lanes, [x*hz, y*hz, z, z]).
1021    let z_idx = v.z as usize;
1022    if z_idx >= state.ztab4_per_z.len() {
1023        return 0;
1024    }
1025    let ztep = state.ztab4_per_z[z_idx];
1026    // SAFETY: `_mm_loadu_ps` reads 16 unaligned bytes from a 4-f32
1027    // array (which is 16 bytes); subsequent intrinsics are SSE2
1028    // baseline on x86_64.
1029    unsafe {
1030        let r0_v = _mm_loadu_ps(r0.as_ptr());
1031        let ztep_v = _mm_loadu_ps(ztep.as_ptr());
1032        let origin_v: __m128 = _mm_add_ps(r0_v, ztep_v);
1033        let mut origin_arr = [0.0f32; 4];
1034        _mm_storeu_ps(origin_arr.as_mut_ptr(), origin_v);
1035        if origin_arr[2] < state.scisdist {
1036            return 0;
1037        }
1038
1039        // Project vertex pair (a, b). Returns __m128 with lanes:
1040        //   [b.x_proj, b.y_proj, a.x_proj, a.y_proj]
1041        // The byte offsets in face[k] index `caddasm` (= bytes into a
1042        // [point4d; 8] = [[f32; 4]; 8]); divide by 16 (= sizeof point4d)
1043        // to land back at the cadd4 index.
1044        let project = |off_a: u8, off_b: u8| -> __m128 {
1045            let a = state.cadd4[(off_a >> 4) as usize];
1046            let b = state.cadd4[(off_b >> 4) as usize];
1047            let wva = _mm_add_ps(_mm_loadu_ps(a.as_ptr()), origin_v);
1048            let wvb = _mm_add_ps(_mm_loadu_ps(b.as_ptr()), origin_v);
1049            let wv0 = _mm_movehl_ps(wva, wvb); // [b.z, b.z, a.z, a.z]
1050            let wv1 = _mm_movelh_ps(wvb, wva); // [b.x, b.y, a.x, a.y]
1051            let wv0_inv = _mm_rcp_ps(wv0);
1052            _mm_mul_ps(wv0_inv, wv1)
1053        };
1054
1055        let pair01 = project(face[1], face[2]);
1056        let pair23 = project(face[3], face[4]);
1057
1058        // Convert to int32 (truncate-toward-zero), pack to int16.
1059        // pack01_int16 lanes 0..3 = [v1x, v1y, v0x, v0y]
1060        // pack01_int16 lanes 4..7 = [v3x, v3y, v2x, v2y]
1061        let p01_i32 = _mm_cvttps_epi32(pair01);
1062        let p23_i32 = _mm_cvttps_epi32(pair23);
1063        let pack_lo = _mm_packs_epi32(p01_i32, p23_i32);
1064        let pack01 = pack_lo;
1065        let pack23 = _mm_unpackhi_epi64(pack_lo, _mm_setzero_si128());
1066        let mut mm_min = _mm_min_epi16(pack01, pack23);
1067        let mut mm_max = _mm_max_epi16(pack01, pack23);
1068
1069        if face[0] != 4 {
1070            let pair45 = project(face[5], face[6]);
1071            let p45_i32 = _mm_cvttps_epi32(pair45);
1072            let pack45 = _mm_packs_epi32(p45_i32, _mm_setzero_si128());
1073            mm_min = _mm_min_epi16(mm_min, pack45);
1074            mm_max = _mm_max_epi16(mm_max, pack45);
1075        }
1076
1077        // shufflelo(_, 0x0e) brings high half (lanes 2..3) into low
1078        // half so min/max collapses across all 4 (or 6) vertices.
1079        let mm_min_hi = _mm_shufflelo_epi16(mm_min, 0x0e);
1080        let mm_max_hi = _mm_shufflelo_epi16(mm_max, 0x0e);
1081        let mm_min_red = _mm_min_epi16(mm_min, mm_min_hi);
1082        let mm_max_red = _mm_max_epi16(mm_max, mm_max_hi);
1083
1084        // bounds = unpacklo(mm_min, mm_max) lanes 0..3 (i16)
1085        //        = [min_x, max_x, min_y, max_y]  ?
1086        // Actually: _mm_unpacklo_epi32 interleaves 32-bit lanes.
1087        // Low 32 of mm_min = (mm_min[0], mm_min[1]) i.e. (min_x, min_y).
1088        // Low 32 of mm_max similarly. After unpacklo_epi32:
1089        //   lanes_32[0] = mm_min low32, lanes_32[1] = mm_max low32
1090        //   → 4 i16: [min_x, min_y, max_x, max_y]
1091        let bounds = _mm_unpacklo_epi32(mm_min_red, mm_max_red);
1092
1093        // Apply qsum0 (saturated add) + qsum1 (max-floor). Both are
1094        // 8-byte values loaded into the low 64 bits of __m128i.
1095        let qsum0_v = _mm_loadl_epi64(state.qsum0.as_ptr().cast::<__m128i>());
1096        let qsum1_v = _mm_loadl_epi64(state.qsum1.as_ptr().cast::<__m128i>());
1097        let bounds = _mm_adds_epi16(bounds, qsum0_v);
1098        let bounds = _mm_max_epi16(bounds, qsum1_v);
1099
1100        // dxdy = subs_epu16(bounds_hi, bounds) — saturating unsigned
1101        // subtract, with bounds_hi being lanes [2,3,2,3] of bounds.
1102        let bounds_hi = _mm_shufflelo_epi16(bounds, 0xee);
1103        let dxdy = _mm_subs_epu16(bounds_hi, bounds);
1104        let dxdy_low = _mm_cvtsi128_si32(dxdy) as u32;
1105        let dx = (dxdy_low & 0xffff) as i32;
1106        if dx == 0 {
1107            return 0;
1108        }
1109        let dy = ((dxdy_low >> 16) as i32) - 1;
1110        if dy < 0 {
1111            return 0;
1112        }
1113
1114        // Recover pixel coords from bounds + qsum1. Bounds[0/1] are
1115        // currently in the saturated [0x7fff - res, 0x7fff] range;
1116        // pixel = bounds - qsum1.
1117        let mut bounds_arr = [0i16; 8];
1118        _mm_storeu_si128(bounds_arr.as_mut_ptr().cast::<__m128i>(), bounds);
1119        let pixel_min_x = i32::from(bounds_arr[0]) - i32::from(state.qsum1[0]);
1120        let pixel_min_y = i32::from(bounds_arr[1]) - i32::from(state.qsum1[1]);
1121
1122        // pmaddwd is consumed for completeness so the asm-equivalent
1123        // pixel-byte-offset is computable; not strictly needed since
1124        // we index directly via (pixel_min_x, pixel_min_y).
1125        let qbplbpp_v = _mm_loadl_epi64(state.qbplbpp.as_ptr().cast::<__m128i>());
1126        let _ = _mm_madd_epi16(bounds, qbplbpp_v);
1127
1128        // Colour modulation with mm5 cross-call tail.
1129        let tail_in = *mm5_tail;
1130        let mm5 = _mm_cvtsi32_si128(tail_in as i32);
1131        let col_v = _mm_cvtsi32_si128(v.col as i32);
1132        let mm5 = _mm_unpacklo_epi8(mm5, col_v);
1133        let kvm = state.kv6colmul[v.dir as usize];
1134        let kvm_v = _mm_loadl_epi64(std::ptr::addr_of!(kvm).cast::<__m128i>());
1135        let mm5 = _mm_mulhi_epu16(mm5, kvm_v);
1136        let kva_v = _mm_loadl_epi64(std::ptr::addr_of!(state.kv6coladd).cast::<__m128i>());
1137        let mm5 = _mm_add_epi16(mm5, kva_v);
1138        let mm5 = _mm_packus_epi16(mm5, mm5);
1139        let color = _mm_cvtsi128_si32(mm5) as u32;
1140        *mm5_tail = color;
1141
1142        // Fill rectangle [pixel_min_x .. +dx) × [pixel_min_y .. +dy+1).
1143        // The qsum0/qsum1 clip + saturating sub guarantee the rect
1144        // sits inside the framebuffer, so no per-pixel bounds check
1145        // needed beyond DrawTarget's debug_assert.
1146        let z_val = origin_arr[2];
1147        let pitch = target.pitch_pixels;
1148        let x0 = pixel_min_x as usize;
1149        let x_end = x0 + dx as usize;
1150        let mut written: u32 = 0;
1151        for row in 0..=(dy as usize) {
1152            let y = pixel_min_y as usize + row;
1153            let row_start = y * pitch;
1154            for x in x0..x_end {
1155                let idx = row_start + x;
1156                // SAFETY: idx < pitch * height by qsum0/qsum1 clip;
1157                // concurrent-write contract gated by z_test_write.
1158                // (Outer `unsafe` block in this fn covers the call.)
1159                if target.z_test_write(idx, color, z_val) {
1160                    written += 1;
1161                }
1162            }
1163        }
1164        written
1165    }
1166}
1167
/// R9: scalar port of `drawboundcubesse` for non-x86_64 targets
/// (aarch64 / wasm).
///
/// Follows the SSE2 version step by step, with one intentional
/// difference: perspective projection uses IEEE 754 `1.0 / z`
/// instead of the `_mm_rcp_ps` approximation, so a projected vertex
/// can land one pixel away from the x86_64 result and per-arch
/// goldens differ at edges.
///
/// Everything else mirrors the SSE2 path's integer semantics:
/// - each projected coordinate is screen-centre-relative and is
///   truncated toward zero, then saturated to the i16 range
///   (`cvttps2dq` + `packssdw`);
/// - the AABB is biased by the screen centre and clipped to
///   `[0, width] × [0, height]` (the `qsum0` saturating add followed
///   by the `qsum1` floor);
/// - the rectangle is half-open, `[min, max)` on both axes, and an
///   empty rectangle returns *before* the colour step, leaving
///   `mm5_tail` untouched;
/// - colour modulation replicates the `_mm_mulhi_epu16` +
///   `_mm_add_epi16` + `_mm_packus_epi16` byte arithmetic exactly.
///
/// Non-finite projections (a vertex with `z == 0`) follow Rust's
/// saturating float-to-int cast and are not bit-matched to SSE2.
///
/// Returns how many `z_test_write` calls reported success.
#[cfg(not(target_arch = "x86_64"))]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub(crate) fn drawboundcubesse(
    v: &Voxel,
    mask: u32,
    state: &Kv6FullState<'_>,
    r0: [f32; 4],
    mm5_tail: &mut u32,
    target: &mut DrawTarget<'_>,
) -> u32 {
    /// Truncate toward zero and saturate into the i16 range, the
    /// scalar counterpart of `cvttps2dq` followed by `packssdw`.
    fn trunc_to_i16_lane(c: f32) -> i32 {
        (c as i32).clamp(i32::from(i16::MIN), i32::from(i16::MAX))
    }

    let effmask = (mask & u32::from(v.vis)) as usize;
    if effmask == 0 || effmask >= PTFACES16.len() {
        return 0;
    }
    let face = PTFACES16[effmask];
    if face[0] == 0 {
        return 0;
    }

    // origin = r0 + ztab4_per_z[v.z]; near-plane scissor on z.
    let z_idx = v.z as usize;
    if z_idx >= state.ztab4_per_z.len() {
        return 0;
    }
    let origin = vec4_add(r0, state.ztab4_per_z[z_idx]);
    if origin[2] < state.scisdist {
        return 0;
    }

    // Project one vertex to centre-relative integer screen coords.
    // `off` is a byte offset into the cadd4 table (16 bytes/entry).
    let project = |off: u8| -> (i32, i32) {
        let wv = vec4_add(state.cadd4[(off >> 4) as usize], origin);
        let inv_z = 1.0 / wv[2];
        (
            trunc_to_i16_lane(wv[0] * inv_z),
            trunc_to_i16_lane(wv[1] * inv_z),
        )
    };

    // AABB over the 4 (face[0] == 4) or 6 projected vertices.
    let vertex_count = if face[0] == 4 { 4 } else { 6 };
    let (mut min_x, mut min_y) = (i32::MAX, i32::MAX);
    let (mut max_x, mut max_y) = (i32::MIN, i32::MIN);
    for &off in &face[1..=vertex_count] {
        let (sx, sy) = project(off);
        min_x = min_x.min(sx);
        min_y = min_y.min(sy);
        max_x = max_x.max(sx);
        max_y = max_y.max(sy);
    }

    // The SSE2 path folds the screen-centre offset into the
    // qsum0/qsum1 pair; `qsum0 - qsum1` recovers it as an integer.
    let hx = i32::from(state.qsum0[0]) - i32::from(state.qsum1[0]);
    let hy = i32::from(state.qsum0[1]) - i32::from(state.qsum1[1]);

    // Bias to pixel space and clip to [0, limit]: cap high first,
    // then floor at 0, matching the saturating add followed by the
    // qsum1 max.
    let fb_w = target.width as i32;
    let fb_h = target.height as i32;
    let to_pixel = |c: i32, bias: i32, limit: i32| (c + bias).min(limit).max(0);
    let x0 = to_pixel(min_x, hx, fb_w);
    let x1 = to_pixel(max_x, hx, fb_w);
    let y0 = to_pixel(min_y, hy, fb_h);
    let y1 = to_pixel(max_y, hy, fb_h);

    // Half-open rect: zero width or height draws nothing and, as in
    // the SSE2 path, must not advance the colour tail.
    if x1 <= x0 || y1 <= y0 {
        return 0;
    }

    // Colour modulation, replicating the SSE2 byte arithmetic:
    //   interleave = unpacklo_epi8(tail, col)  → 4 × u16
    //   result     = mulhi_epu16(interleave, kv6colmul[dir]) + kv6coladd
    //   color      = packus_epi16(result)      → 4 × u8 → u32
    let t = mm5_tail.to_le_bytes();
    let c = v.col.to_le_bytes();
    let interleaved: [u16; 4] = [
        (u16::from(c[0]) << 8) | u16::from(t[0]),
        (u16::from(c[1]) << 8) | u16::from(t[1]),
        (u16::from(c[2]) << 8) | u16::from(t[2]),
        (u16::from(c[3]) << 8) | u16::from(t[3]),
    ];
    let kvm = state.kv6colmul[v.dir as usize];
    let kva = state.kv6coladd;
    let mut color_bytes = [0u8; 4];
    for i in 0..4 {
        let km = ((kvm >> (i * 16)) & 0xffff) as u16;
        let ka = ((kva >> (i * 16)) & 0xffff) as u16;
        let hi = ((u32::from(interleaved[i]) * u32::from(km)) >> 16) as u16;
        // Wrapping 16-bit add, then signed → unsigned-byte saturation.
        let val = hi.wrapping_add(ka) as i16;
        color_bytes[i] = val.clamp(0, 255) as u8;
    }
    let color = u32::from_le_bytes(color_bytes);
    *mm5_tail = color;

    // Fill [x0, x1) × [y0, y1) with z-test.
    let z_val = origin[2];
    let pitch = target.pitch_pixels;
    let mut written: u32 = 0;
    for y in y0..y1 {
        let row_start = y as usize * pitch;
        for x in x0..x1 {
            let idx = row_start + x as usize;
            // SAFETY: the clip above keeps 0 <= x < width and
            // 0 <= y < height, so idx < pitch * height provided
            // pitch >= width.
            unsafe {
                if target.z_test_write(idx, color, z_val) {
                    written += 1;
                }
            }
        }
    }
    written
}
1290
1291/// One iteration of voxlap's `DRAWBOUNDCUBELINE` macro
1292/// (voxlap5.c:8809-8812). Walks the voxel range `[range_start,
1293/// range_end)` (one (x, y) column's voxels) in three phases:
1294///
1295/// 1. Forward through voxels with `z < inz`, calling
1296///    `callback(voxel, base_mask | 0x20, r0)`.
1297/// 2. Backward through voxels with `z > inz`, calling
1298///    `callback(voxel, base_mask | 0x10, r0)`.
1299/// 3. If a single voxel remains with `z == inz`, call
1300///    `callback(voxel, base_mask | 0x00, r0)`.
1301///
1302/// Each (x, y) column is visited exactly once. `r0` is the screen-
1303/// space origin for *this* column — voxlap stores it as
1304/// `ztab4[MAXZSIZ]` and `drawboundcubesse` reads it via that index.
1305fn draw_boundcube_line<F: FnMut(&Voxel, u32, [f32; 4])>(
1306    voxels: &[Voxel],
1307    range_start: usize,
1308    range_end: usize,
1309    inz: i32,
1310    base_mask: u32,
1311    r0: [f32; 4],
1312    callback: &mut F,
1313) {
1314    if range_end <= range_start {
1315        return;
1316    }
1317    let mut v0 = range_start;
1318    let mut v1_excl = range_end;
1319
1320    // Phase 1: forward while voxels[v0].z < inz.
1321    while v0 < v1_excl && i32::from(voxels[v0].z) < inz {
1322        callback(&voxels[v0], base_mask | 0x20, r0);
1323        v0 += 1;
1324    }
1325    // Phase 2: backward while voxels[v1_excl - 1].z > inz.
1326    while v0 < v1_excl && i32::from(voxels[v1_excl - 1].z) > inz {
1327        callback(&voxels[v1_excl - 1], base_mask | 0x10, r0);
1328        v1_excl -= 1;
1329    }
1330    // Phase 3: single voxel left with z == inz.
1331    if v0 + 1 == v1_excl {
1332        callback(&voxels[v0], base_mask, r0);
1333    }
1334}
1335
1336/// 9-arm per-(x, y) column iteration walking the kv6's voxel
1337/// grid in painter's-back-to-front order around the camera-split
1338/// point (`inx`, `iny`, `inz`). Mirror of voxlap5.c:8982-9062.
1339///
1340/// Tracks `r1` (current x-column base) and `r0` (current (x, y)
1341/// origin) the same way voxlap mutates them with addps/subps,
1342/// passing `r0` to each per-voxel callback. `r0` evolves as
1343/// `r0[x][y] = r1_initial + x * cadd4[1] - y * cadd4[4]` (with
1344/// the floating-point operations applied in voxlap's order so the
1345/// per-step rounding matches bit-for-bit).
1346///
1347/// Each (x, y) column is visited exactly once.
1348#[allow(clippy::too_many_lines)]
1349pub(crate) fn kv6_iterate<F: FnMut(&Voxel, u32, [f32; 4])>(
1350    state: &Kv6FullState<'_>,
1351    mut callback: F,
1352) {
1353    let kv = state.iter.kv;
1354    let xsiz = kv.xsiz as i32;
1355    let ysiz = kv.ysiz as i32;
1356    let inx = state.iter.inx;
1357    let iny = state.iter.iny;
1358    let inz = state.iter.inz;
1359    let nxplanemin = state.iter.nxplanemin;
1360    let nxplanemax = state.iter.nxplanemax;
1361    let cadd1 = state.cadd4[1];
1362    let cadd_y = state.cadd4[4];
1363    let r2 = state.r2;
1364
1365    let mut xv: usize = 0;
1366    let mut r1 = state.r1_initial;
1367
1368    // First half: x = 0..inx. Top-half quadrants (masks 0xa, 0x6, 0x2).
1369    let mut x: i32 = 0;
1370    while x < inx {
1371        let xu = x as usize;
1372        let xlen = kv.xlen[xu] as usize;
1373        if x < nxplanemin || x >= nxplanemax {
1374            xv += xlen;
1375            r1 = vec4_add(r1, cadd1);
1376            x += 1;
1377            continue;
1378        }
1379        let yv_initial = xv + xlen;
1380        let mut r0 = r1; // movps r0, r1
1381
1382        // Forward y: 0..iny  -> mask 0xa.
1383        let mut xv_local = xv;
1384        let mut y: i32 = 0;
1385        while y < iny {
1386            let yu = y as usize;
1387            let len = kv.ylen[xu][yu] as usize;
1388            let v0 = xv_local;
1389            xv_local += len;
1390            draw_boundcube_line(&kv.voxels, v0, xv_local, inz, 0xa, r0, &mut callback);
1391            r0 = vec4_sub(r0, cadd_y); // r0 -= cadd4[4]
1392            y += 1;
1393        }
1394
1395        // Setup for reverse y: r0 = r1 + r2 (= base + (-ysiz)*cadd4[4]),
1396        // then r1 += cadd4[1] for the next x column.
1397        let mut yv_local = yv_initial;
1398        r0 = vec4_add(r1, r2);
1399        r1 = vec4_add(r1, cadd1);
1400
1401        // Reverse y: ysiz-1..iny  -> mask 0x6.
1402        let mut y = ysiz - 1;
1403        while y > iny {
1404            r0 = vec4_add(r0, cadd_y); // r0 += cadd4[4]
1405            let yu = y as usize;
1406            let len = kv.ylen[xu][yu] as usize;
1407            let v1_excl = yv_local;
1408            yv_local -= len;
1409            draw_boundcube_line(&kv.voxels, yv_local, v1_excl, inz, 0x6, r0, &mut callback);
1410            y -= 1;
1411        }
1412
1413        // Edge y == iny  -> mask 0x2.
1414        if iny >= 0 && (iny as u32) < kv.ysiz {
1415            r0 = vec4_add(r0, cadd_y);
1416            let yu = iny as usize;
1417            let len = kv.ylen[xu][yu] as usize;
1418            let v1_excl = yv_local;
1419            yv_local -= len;
1420            draw_boundcube_line(&kv.voxels, yv_local, v1_excl, inz, 0x2, r0, &mut callback);
1421        }
1422
1423        xv += xlen;
1424        x += 1;
1425    }
1426
1427    // Setup for second half (voxlap5.c:9011): jump r1 to past-end.
1428    // r1 += (xsiz - x) * cadd4[1]  with x = post-first-half value.
1429    let dx_remain = (xsiz - x) as f32;
1430    r1 = vec4_add(r1, vec4_scale(cadd1, dx_remain));
1431
1432    // Second half: x = xsiz-1..inx (reverse). Bot-half quadrants
1433    // (masks 0x5, 0x9, 0x1).
1434    let mut xv2: usize = kv.voxels.len();
1435    let mut x = xsiz - 1;
1436    while x > inx {
1437        let xu = x as usize;
1438        let xlen = kv.xlen[xu] as usize;
1439        if x < nxplanemin || x >= nxplanemax {
1440            xv2 -= xlen;
1441            r1 = vec4_sub(r1, cadd1);
1442            x -= 1;
1443            continue;
1444        }
1445        let yv_initial = xv2 - xlen;
1446        // Voxlap order: r1 -= cadd1 first, then r0 = r1 + r2.
1447        r1 = vec4_sub(r1, cadd1);
1448        let mut r0 = vec4_add(r1, r2);
1449
1450        // Reverse y: ysiz-1..iny  -> mask 0x5.
1451        let mut xv_local = xv2;
1452        let mut y = ysiz - 1;
1453        while y > iny {
1454            r0 = vec4_add(r0, cadd_y);
1455            let yu = y as usize;
1456            let len = kv.ylen[xu][yu] as usize;
1457            let v1_excl = xv_local;
1458            xv_local -= len;
1459            draw_boundcube_line(&kv.voxels, xv_local, v1_excl, inz, 0x5, r0, &mut callback);
1460            y -= 1;
1461        }
1462
1463        // After reverse y: r0 = r1 (movps r0, r1).
1464        let mut yv_local = yv_initial;
1465        r0 = r1;
1466
1467        // Forward y: 0..iny  -> mask 0x9.
1468        let mut y: i32 = 0;
1469        while y < iny {
1470            let yu = y as usize;
1471            let len = kv.ylen[xu][yu] as usize;
1472            let v0 = yv_local;
1473            yv_local += len;
1474            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x9, r0, &mut callback);
1475            r0 = vec4_sub(r0, cadd_y);
1476            y += 1;
1477        }
1478
1479        // Edge y == iny  -> mask 0x1.
1480        if iny >= 0 && (iny as u32) < kv.ysiz {
1481            let yu = iny as usize;
1482            let len = kv.ylen[xu][yu] as usize;
1483            let v0 = yv_local;
1484            yv_local += len;
1485            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x1, r0, &mut callback);
1486        }
1487
1488        xv2 -= xlen;
1489        x -= 1;
1490    }
1491
1492    // Edge x == inx (middle column). Masks 0x4, 0x8, 0x0.
1493    if inx >= 0 && (inx as u32) < kv.xsiz {
1494        let xu = inx as usize;
1495        if inx < nxplanemin || inx >= nxplanemax {
1496            return;
1497        }
1498        let xlen = kv.xlen[xu] as usize;
1499        let yv_initial = xv2 - xlen;
1500        r1 = vec4_sub(r1, cadd1);
1501        let mut r0 = vec4_add(r1, r2);
1502
1503        // Reverse y -> mask 0x4.
1504        let mut xv_local = xv2;
1505        let mut y = ysiz - 1;
1506        while y > iny {
1507            r0 = vec4_add(r0, cadd_y);
1508            let yu = y as usize;
1509            let len = kv.ylen[xu][yu] as usize;
1510            let v1_excl = xv_local;
1511            xv_local -= len;
1512            draw_boundcube_line(&kv.voxels, xv_local, v1_excl, inz, 0x4, r0, &mut callback);
1513            y -= 1;
1514        }
1515
1516        // After reverse y: r0 = r1.
1517        let mut yv_local = yv_initial;
1518        r0 = r1;
1519
1520        // Forward y -> mask 0x8.
1521        let mut y: i32 = 0;
1522        while y < iny {
1523            let yu = y as usize;
1524            let len = kv.ylen[xu][yu] as usize;
1525            let v0 = yv_local;
1526            yv_local += len;
1527            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x8, r0, &mut callback);
1528            r0 = vec4_sub(r0, cadd_y);
1529            y += 1;
1530        }
1531
1532        // Edge y == iny -> mask 0x0.
1533        if iny >= 0 && (iny as u32) < kv.ysiz {
1534            let yu = iny as usize;
1535            let len = kv.ylen[xu][yu] as usize;
1536            let v0 = yv_local;
1537            yv_local += len;
1538            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x0, r0, &mut callback);
1539        }
1540    }
1541}
1542
1543/// Draw a sprite into a framebuffer + z-buffer.
1544///
1545/// Top-level dispatcher mirroring voxlap5.c:9818-9828:
1546/// - Skips on `flags & INVISIBLE`.
1547/// - Skips on `flags & KFA` (animation path; out of scope for R6).
1548/// - Skips on `flags & NO_Z` (handled by `drawboundcubenozsse`,
1549///   not yet ported — the four oracle sprite poses all use z-tested
1550///   rendering).
1551///
1552/// Otherwise: cull → setup math → 9-arm per-voxel iteration →
1553/// per-voxel rasterize via the R6.4 `drawboundcubesse` port.
1554///
1555/// Returns the total number of pixels written across all voxels of
1556/// the sprite (== sum of z-test passes). Zero means the sprite
1557/// produced no visible pixels (culled, fully behind near plane, or
1558/// totally occluded).
1559/// Render a batch of sprites in parallel via `rayon::par_iter`.
1560///
1561/// Each sprite runs its own [`draw_sprite`] pass on its own thread,
1562/// writing to the shared [`DrawTarget`] (raw pointers; `Copy + Send
1563/// + Sync`) under the z-test arbitration contract: a pixel write
1564/// only fires when the new sprite's z is strictly less than the
1565/// current zbuffer value. For non-overlapping sprites the writes
1566/// are pairwise-disjoint and the output is byte-identical to a
1567/// sequential pass over the same sprite list. For overlapping
1568/// pixels, two sprites at exactly tied z-values produce a
1569/// non-deterministic last-writer-wins outcome — visually
1570/// indistinguishable but hash-non-deterministic.
1571///
1572/// Returns the sum of `draw_sprite` return values (total pixels
1573/// written across all sprites).
1574///
1575/// `RAYON_NUM_THREADS=1` (or no parallelism worth) ⇒ effectively
1576/// sequential; rayon falls back to running each closure on the
1577/// calling thread without contention.
1578///
1579/// Use this for engine scenes with dozens-to-hundreds of sprites;
1580/// the per-sprite overhead amortises well past ~4 sprites on
1581/// consumer-class hardware.
1582#[allow(clippy::module_name_repetitions)]
1583pub fn draw_sprites_parallel(
1584    target: DrawTarget<'_>,
1585    cam: &CameraState,
1586    settings: &OpticastSettings,
1587    lighting: &SpriteLighting<'_>,
1588    sprites: &[Sprite],
1589) -> u32 {
1590    let render_one = |sprite: &Sprite| {
1591        // `target` is `Copy`, so each closure captures its own
1592        // copy of the (raw fb / zb pointer) view. `cam`,
1593        // `settings`, `lighting` are `&` borrows — Sync.
1594        let mut t = target;
1595        draw_sprite(&mut t, cam, settings, lighting, sprite)
1596    };
1597
1598    use rayon::prelude::*;
1599    sprites.par_iter().map(render_one).sum()
1600}
1601
1602pub fn draw_sprite(
1603    target: &mut DrawTarget<'_>,
1604    cam: &CameraState,
1605    settings: &OpticastSettings,
1606    lighting: &SpriteLighting<'_>,
1607    sprite: &Sprite,
1608) -> u32 {
1609    if sprite.flags & SPRITE_FLAG_INVISIBLE != 0 {
1610        return 0;
1611    }
1612    if sprite.flags & SPRITE_FLAG_KFA != 0 {
1613        return 0;
1614    }
1615    if sprite.flags & SPRITE_FLAG_NO_Z != 0 {
1616        // drawboundcubenozsse port deferred; oracle doesn't exercise it.
1617        return 0;
1618    }
1619    let Some(setup) = kv6_draw_prepare(sprite, cam) else {
1620        return 0;
1621    };
1622    let state = kv6_compute_full_state(
1623        &setup,
1624        sprite,
1625        lighting,
1626        cam,
1627        settings,
1628        target.width,
1629        target.height,
1630        target.pitch_pixels,
1631    );
1632    let mut mm5_tail: u32 = 0;
1633    let mut total_written: u32 = 0;
1634    kv6_iterate(&state, |voxel, mask, r0| {
1635        total_written += drawboundcubesse(voxel, mask, &state, r0, &mut mm5_tail, target);
1636    });
1637    total_written
1638}
1639
1640#[cfg(test)]
1641mod tests {
1642    use super::*;
1643    use crate::camera_math;
1644    use crate::Camera;
1645    use roxlap_formats::kv6::Kv6;
1646
1647    fn empty_kv6() -> Kv6 {
1648        Kv6 {
1649            xsiz: 1,
1650            ysiz: 1,
1651            zsiz: 1,
1652            xpiv: 0.5,
1653            ypiv: 0.5,
1654            zpiv: 0.5,
1655            voxels: Vec::new(),
1656            xlen: vec![0],
1657            ylen: vec![vec![0]],
1658            palette: None,
1659        }
1660    }
1661
1662    /// 17×17×17 kv6 with pivot at the centre — same dimensions as
1663    /// the meltsphere oracle sprite so the cull test exercises a
1664    /// realistic bound cube rather than a 1-voxel point.
1665    fn cube_kv6() -> Kv6 {
1666        Kv6 {
1667            xsiz: 17,
1668            ysiz: 17,
1669            zsiz: 17,
1670            xpiv: 8.5,
1671            ypiv: 8.5,
1672            zpiv: 8.5,
1673            voxels: Vec::new(),
1674            xlen: vec![0; 17],
1675            ylen: vec![vec![0; 17]; 17],
1676            palette: None,
1677        }
1678    }
1679
1680    /// `CameraState` matching the oracle's `sprite_front` pose:
1681    /// pos=(1020,1050,175), yaw=0, pitch=0 → forward = +x.
1682    fn oracle_sprite_front_camera() -> camera_math::CameraState {
1683        let camera = Camera {
1684            pos: [1020.0, 1050.0, 175.0],
1685            // From oracle.c set_camera_yaw_pitch with yaw=0, pitch=0:
1686            //   ifor = [1, 0, 0], istr = [0, 1, 0], ihei = [0, 0, 1].
1687            right: [0.0, 1.0, 0.0],
1688            down: [0.0, 0.0, 1.0],
1689            forward: [1.0, 0.0, 0.0],
1690        };
1691        camera_math::derive(&camera, 640, 480, 320.0, 240.0, 320.0)
1692    }
1693
1694    fn oracle_settings() -> OpticastSettings {
1695        OpticastSettings::for_oracle_framebuffer(640, 480)
1696    }
1697
1698    /// Test-only ergonomic shim: build a Kv6FullState with the
1699    /// oracle 640×480 framebuffer geometry. Mirrors the
1700    /// pre-R6.4 signature so tests don't have to spell out
1701    /// width/height/pitch every time.
1702    fn compute_state_for_test<'a>(
1703        setup: &Kv6DrawSetup<'a>,
1704        sprite: &Sprite,
1705        cam: &camera_math::CameraState,
1706    ) -> Kv6FullState<'a> {
1707        let lighting = SpriteLighting::default_oracle();
1708        kv6_compute_full_state(
1709            setup,
1710            sprite,
1711            &lighting,
1712            cam,
1713            &oracle_settings(),
1714            640,
1715            480,
1716            640,
1717        )
1718    }
1719
1720    /// Allocate a 640×480 framebuffer + zbuffer (zbuffer pre-filled
1721    /// with f32::INFINITY so any voxel passes the z-test on first
1722    /// write).
1723    fn alloc_target() -> (Vec<u32>, Vec<f32>) {
1724        let pixels = 640usize * 480usize;
1725        (vec![0u32; pixels], vec![f32::INFINITY; pixels])
1726    }
1727
1728    fn make_target<'a>(fb: &'a mut [u32], zb: &'a mut [f32]) -> DrawTarget<'a> {
1729        DrawTarget::new(fb, zb, 640, 640, 480)
1730    }
1731
1732    /// Bit-pattern compare for two `[f32; 4]` vectors. The setup
1733    /// math produces these via deterministic IEEE-754 ops, so
1734    /// bit-equality is well-defined and dodges `clippy::float_cmp`.
1735    fn bits4(a: [f32; 4]) -> [u32; 4] {
1736        a.map(f32::to_bits)
1737    }
1738
    /// Raw bytes of the dumped C-oracle meltsphere sprite, embedded at
    /// compile time from `tests/fixtures/sprite_meltsphere.kv6` (the
    /// `include_bytes!` path is resolved relative to this source file).
    /// Shared by every test below that parses the fixture. Kept as a
    /// module-scope `const` so clippy's `items_after_statements` stays
    /// happy.
    const SPRITE_MELTSPHERE_KV6: &[u8] = include_bytes!("../tests/fixtures/sprite_meltsphere.kv6");
1743
1744    #[test]
1745    fn axis_aligned_sets_identity_basis() {
1746        // Compare bit patterns: these are integer-valued floats so
1747        // bit-equality is well-defined and dodges clippy::float_cmp.
1748        let bits = |a: [f32; 3]| a.map(f32::to_bits);
1749        let s = Sprite::axis_aligned(empty_kv6(), [10.0, 20.0, 30.0]);
1750        assert_eq!(bits(s.p), bits([10.0, 20.0, 30.0]));
1751        assert_eq!(bits(s.s), bits([1.0, 0.0, 0.0]));
1752        assert_eq!(bits(s.h), bits([0.0, 1.0, 0.0]));
1753        assert_eq!(bits(s.f), bits([0.0, 0.0, 1.0]));
1754        assert_eq!(s.flags, 0);
1755    }
1756
1757    #[test]
1758    fn invisible_flag_skips_dispatch() {
1759        let cam = oracle_sprite_front_camera();
1760        let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1761        s.flags = SPRITE_FLAG_INVISIBLE;
1762        let (mut fb, mut zb) = alloc_target();
1763        let mut target = make_target(&mut fb, &mut zb);
1764        let lighting = SpriteLighting::default_oracle();
1765        assert_eq!(
1766            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
1767            0
1768        );
1769    }
1770
1771    #[test]
1772    fn kfa_flag_skips_dispatch() {
1773        let cam = oracle_sprite_front_camera();
1774        let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1775        s.flags = SPRITE_FLAG_KFA;
1776        let (mut fb, mut zb) = alloc_target();
1777        let mut target = make_target(&mut fb, &mut zb);
1778        let lighting = SpriteLighting::default_oracle();
1779        assert_eq!(
1780            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
1781            0
1782        );
1783    }
1784
1785    #[test]
1786    fn cull_keeps_oracle_sprite_in_front_of_camera() {
1787        // Oracle's `sprite_front` pose: camera at (1020,1050,175)
1788        // looking +x; sprite at (1050,1050,175). Sprite is 30
1789        // units forward, on-axis — clearly inside the frustum.
1790        let cam = oracle_sprite_front_camera();
1791        let s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1792        assert!(
1793            kv6_draw_prepare(&s, &cam).is_some(),
1794            "front-of-camera sprite must NOT be culled"
1795        );
1796    }
1797
1798    #[test]
1799    fn cull_removes_sprite_far_behind_camera() {
1800        // Same camera; sprite far in the -forward direction
1801        // (= behind the camera).
1802        let cam = oracle_sprite_front_camera();
1803        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
1804        assert!(
1805            kv6_draw_prepare(&s, &cam).is_none(),
1806            "behind-camera sprite must be culled"
1807        );
1808    }
1809
1810    #[test]
1811    fn cull_removes_sprite_far_to_the_right() {
1812        // Camera looks +x; sprite far in the +y direction (right
1813        // axis), far enough that the bound cube is fully outside
1814        // the right-edge frustum plane.
1815        let cam = oracle_sprite_front_camera();
1816        // 30 units forward, 200 units right — well outside the 90°
1817        // FOV's right edge.
1818        let s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0 + 200.0, 175.0]);
1819        assert!(
1820            kv6_draw_prepare(&s, &cam).is_none(),
1821            "far-right sprite must be culled"
1822        );
1823    }
1824
1825    #[test]
1826    fn cull_keeps_sprite_at_camera_position() {
1827        // Sprite centred on the camera — bound cube straddles the
1828        // camera, so by definition it's not fully outside any
1829        // frustum plane and must NOT be culled.
1830        let cam = oracle_sprite_front_camera();
1831        let s = Sprite::axis_aligned(cube_kv6(), cam.pos);
1832        assert!(
1833            kv6_draw_prepare(&s, &cam).is_some(),
1834            "sprite at camera position must not be culled"
1835        );
1836    }
1837
1838    #[test]
1839    fn iterate_visits_each_voxel_exactly_once() {
1840        // Build a synthetic 3×3×3 kv6 with one voxel per (x, y)
1841        // column at z = x + y mod 3. Then iterate and check
1842        // (a) total callback fires == 27 = numvoxs, and (b) every
1843        // voxel index 0..27 was visited exactly once.
1844        let xsiz: u32 = 3;
1845        let ysiz: u32 = 3;
1846        let zsiz: u32 = 3;
1847        let mut voxels = Vec::new();
1848        let mut xlen = vec![0u32; xsiz as usize];
1849        let mut ylen = vec![vec![0u16; ysiz as usize]; xsiz as usize];
1850        for x in 0..xsiz {
1851            for y in 0..ysiz {
1852                let z = ((x + y) % 3) as u16;
1853                voxels.push(Voxel {
1854                    col: 0x0080_0000,
1855                    z,
1856                    vis: 63,
1857                    dir: 0,
1858                });
1859                xlen[x as usize] += 1;
1860                ylen[x as usize][y as usize] = 1;
1861            }
1862        }
1863        let kv = Kv6 {
1864            xsiz,
1865            ysiz,
1866            zsiz,
1867            xpiv: 1.5,
1868            ypiv: 1.5,
1869            zpiv: 1.5,
1870            voxels,
1871            xlen,
1872            ylen,
1873            palette: None,
1874        };
1875        let setup = Kv6DrawSetup {
1876            kv: &kv,
1877            ts: [1.0, 0.0, 0.0],
1878            th: [0.0, 1.0, 0.0],
1879            tf: [0.0, 0.0, 1.0],
1880            mip: 0,
1881        };
1882        let cam = oracle_sprite_front_camera();
1883        let synth_sprite = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
1884        let state = compute_state_for_test(&setup, &synth_sprite, &cam);
1885
1886        // Every voxel index must fire exactly once. We use a
1887        // by-pointer identity check via .as_ptr() offsets.
1888        let voxels_ptr = kv.voxels.as_ptr();
1889        let mut visited = vec![0u32; kv.voxels.len()];
1890        let mut total: u32 = 0;
1891        kv6_iterate(&state, |v, _mask, _r0| {
1892            // SAFETY: callback receives a borrow of an entry of
1893            // `kv.voxels`; computing the offset is well-defined.
1894            let idx = unsafe { std::ptr::from_ref::<Voxel>(v).offset_from(voxels_ptr) } as usize;
1895            visited[idx] += 1;
1896            total += 1;
1897        });
1898        assert_eq!(total as usize, kv.voxels.len(), "total callback fires");
1899        for (i, &n) in visited.iter().enumerate() {
1900            assert_eq!(n, 1, "voxel {i} visited {n} times (want 1)");
1901        }
1902    }
1903
1904    #[test]
1905    fn iterate_meltsphere_oracle_visits_each_voxel_once() {
1906        // Load the dumped voxlap-C meltsphere fixture (R6.0e) and
1907        // run the iteration against the oracle's sprite_front
1908        // camera + sprite pose. Expected: every voxel hit exactly
1909        // once, total fires == kv.voxels.len() (= 401).
1910        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
1911        assert_eq!(kv.voxels.len(), 401, "fixture voxel count");
1912
1913        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
1914        let cam = oracle_sprite_front_camera();
1915        let setup = kv6_draw_prepare(&sprite, &cam).expect("oracle sprite must pass cull");
1916        let state = compute_state_for_test(&setup, &sprite, &cam);
1917
1918        let voxels_ptr = sprite.kv6.voxels.as_ptr();
1919        let mut visited = vec![0u32; sprite.kv6.voxels.len()];
1920        let mut total: u32 = 0;
1921        kv6_iterate(&state, |v, _mask, _r0| {
1922            let idx = unsafe { std::ptr::from_ref::<Voxel>(v).offset_from(voxels_ptr) } as usize;
1923            visited[idx] += 1;
1924            total += 1;
1925        });
1926        assert_eq!(total, 401);
1927        let max = visited.iter().copied().max().unwrap();
1928        let min = visited.iter().copied().min().unwrap();
1929        assert_eq!(max, 1, "no voxel may be visited twice");
1930        assert_eq!(min, 1, "no voxel may be skipped");
1931    }
1932
1933    #[test]
1934    fn full_state_basic_invariants() {
1935        // For the oracle sprite_front pose, sanity-check the setup
1936        // values: ztab4_per_z[0] is zero, ztab4_per_z[k] - ztab4_per_z[k-1]
1937        // equals cadd4[2], cadd4[3] = cadd4[1] + cadd4[2], cadd4[7] is
1938        // the 7-bit-OR sum, and r1_initial = (npos*gihz with z2=npos.z)
1939        // - cadd4[4].
1940        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
1941        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
1942        let cam = oracle_sprite_front_camera();
1943        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
1944        let state = compute_state_for_test(&setup, &sprite, &cam);
1945
1946        // ztab4_per_z[0] = [0; 4].
1947        assert_eq!(bits4(state.ztab4_per_z[0]), bits4([0.0; 4]));
1948
1949        // For each subsequent z, ztab4_per_z[z] = ztab4_per_z[z-1] + cadd4[2].
1950        for z in 1..state.ztab4_per_z.len() {
1951            let want = vec4_add(state.ztab4_per_z[z - 1], state.cadd4[2]);
1952            assert_eq!(bits4(state.ztab4_per_z[z]), bits4(want), "ztab4_per_z[{z}]");
1953        }
1954
1955        // cadd4[3] = cadd4[1] + cadd4[2]; cadd4[5] = cadd4[1] + cadd4[4];
1956        // cadd4[6] = cadd4[2] + cadd4[4]; cadd4[7] = cadd4[3] + cadd4[4].
1957        assert_eq!(
1958            bits4(state.cadd4[3]),
1959            bits4(vec4_add(state.cadd4[1], state.cadd4[2]))
1960        );
1961        assert_eq!(
1962            bits4(state.cadd4[5]),
1963            bits4(vec4_add(state.cadd4[1], state.cadd4[4]))
1964        );
1965        assert_eq!(
1966            bits4(state.cadd4[6]),
1967            bits4(vec4_add(state.cadd4[2], state.cadd4[4]))
1968        );
1969        assert_eq!(
1970            bits4(state.cadd4[7]),
1971            bits4(vec4_add(state.cadd4[3], state.cadd4[4]))
1972        );
1973        assert_eq!(bits4(state.cadd4[0]), bits4([0.0; 4]));
1974
1975        // r2 = -ysiz * cadd4[4].
1976        let want_r2 = vec4_scale(state.cadd4[4], -(state.iter.kv.ysiz as f32));
1977        assert_eq!(bits4(state.r2), bits4(want_r2));
1978    }
1979
1980    #[test]
1981    fn drawboundcubesse_culls_invisible_face_mask() {
1982        // Synthetic voxel with vis=0 must short-circuit the
1983        // early-out and not consume the scissor branch.
1984        let v = Voxel {
1985            col: 0,
1986            z: 0,
1987            vis: 0,
1988            dir: 0,
1989        };
1990        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
1991        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
1992        let cam = oracle_sprite_front_camera();
1993        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
1994        let state = compute_state_for_test(&setup, &sprite, &cam);
1995        let (mut fb, mut zb) = alloc_target();
1996        let mut target = make_target(&mut fb, &mut zb);
1997        let mut tail = 0u32;
1998        assert_eq!(
1999            drawboundcubesse(
2000                &v,
2001                0xff,
2002                &state,
2003                [0.0, 0.0, 100.0, 100.0],
2004                &mut tail,
2005                &mut target,
2006            ),
2007            0
2008        );
2009    }
2010
2011    #[test]
2012    fn drawboundcubesse_culls_voxel_behind_near_plane() {
2013        // Force scisdist > 0 by passing an r0 with very small
2014        // origin.z. Only triggers if scisdist > origin.z; for the
2015        // oracle sprite_front pose `scisdist` is some small
2016        // positive number (sum of any negative post-swap basis-z
2017        // components), so a r0 with z = -1 will cull.
2018        let v = Voxel {
2019            col: 0xff,
2020            z: 0,
2021            vis: 0xff,
2022            dir: 0,
2023        };
2024        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
2025        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
2026        let cam = oracle_sprite_front_camera();
2027        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
2028        let state = compute_state_for_test(&setup, &sprite, &cam);
2029        // r0.z = -1000 makes origin.z = -1000 + ztab4_per_z[0].z = -1000.
2030        // scisdist >= 0; -1000 < scisdist → cull.
2031        let r0 = [0.0, 0.0, -1000.0, -1000.0];
2032        let (mut fb, mut zb) = alloc_target();
2033        let mut target = make_target(&mut fb, &mut zb);
2034        let mut tail = 0u32;
2035        assert_eq!(
2036            drawboundcubesse(&v, 0xff, &state, r0, &mut tail, &mut target),
2037            0
2038        );
2039    }
2040
2041    #[test]
2042    fn iterate_no_voxels_when_culled() {
2043        // Sprite far behind camera → cull. draw_sprite never
2044        // reaches kv6_iterate, so no callback fires.
2045        let cam = oracle_sprite_front_camera();
2046        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
2047        // Cull catches it before iteration.
2048        assert!(kv6_draw_prepare(&s, &cam).is_none());
2049    }
2050
2051    #[test]
2052    fn draw_sprite_writes_pixels_for_oracle_meltsphere() {
2053        // R6.4 end-to-end: load the meltsphere fixture, run
2054        // draw_sprite at the sprite_front pose. Expect a non-zero
2055        // pixel count and at least one non-zero framebuffer entry.
2056        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
2057        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
2058        let cam = oracle_sprite_front_camera();
2059        let (mut fb, mut zb) = alloc_target();
2060        let mut target = make_target(&mut fb, &mut zb);
2061        let lighting = SpriteLighting::default_oracle();
2062        let written = draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &sprite);
2063        assert!(written > 0, "expected some pixels to be written");
2064        assert!(
2065            fb.iter().any(|&p| p != 0),
2066            "expected at least one non-zero framebuffer entry"
2067        );
2068        // Z-buffer must have shrunk somewhere from f32::INFINITY.
2069        assert!(
2070            zb.iter().any(|&z| z.is_finite()),
2071            "expected at least one finite zbuffer entry"
2072        );
2073    }
2074
2075    #[test]
2076    fn draw_sprite_returns_zero_for_culled_sprite() {
2077        let cam = oracle_sprite_front_camera();
2078        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
2079        let (mut fb, mut zb) = alloc_target();
2080        let mut target = make_target(&mut fb, &mut zb);
2081        let lighting = SpriteLighting::default_oracle();
2082        assert_eq!(
2083            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
2084            0
2085        );
2086        assert!(fb.iter().all(|&p| p == 0));
2087    }
2088
2089    /// `update_reflects` for the oracle sprite_front pose hits the
2090    /// nolighta path (R==G==B kv6col, no fog, lightmode<2). All
2091    /// kv6colmul[k] entries must repeat one u16 modulation factor
2092    /// across all 4 lanes.
2093    #[test]
2094    fn update_reflects_nolighta_lanes_match() {
2095        let s = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
2096        let lighting = SpriteLighting::default_oracle();
2097        let (cm, ca) = update_reflects(&s, &lighting);
2098        assert_eq!(ca, 0, "kv6coladd must be zero (no fog)");
2099        for (k, e) in cm.iter().enumerate() {
2100            let l0 = (e & 0xffff) as u16;
2101            let l1 = ((e >> 16) & 0xffff) as u16;
2102            let l2 = ((e >> 32) & 0xffff) as u16;
2103            let l3 = ((e >> 48) & 0xffff) as u16;
2104            assert_eq!(l0, l1, "kv6colmul[{k}] lane0 != lane1");
2105            assert_eq!(l0, l2, "kv6colmul[{k}] lane0 != lane2");
2106            assert_eq!(l0, l3, "kv6colmul[{k}] lane0 != lane3");
2107        }
2108    }
2109
2110    /// Non-grey kv6col forces the nolightb path. Lanes 0..3 of each
2111    /// `kv6colmul[k]` come from per-channel modulators built from
2112    /// the kv6col bytes — they should NOT all match unless the
2113    /// channels themselves match.
2114    #[test]
2115    fn update_reflects_nolightb_lanes_diverge_for_tinted_kv6col() {
2116        let s = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
2117        let lighting = SpriteLighting {
2118            kv6col: 0x0040_8040, // R != G != B
2119            lightmode: 0,
2120            lights: &[],
2121        };
2122        let (cm, _) = update_reflects(&s, &lighting);
2123        // Find any direction where the dot is non-zero (most are
2124        // non-zero); that direction's lanes must vary by channel.
2125        let mut saw_divergence = false;
2126        for e in cm.iter() {
2127            let l0 = (e & 0xffff) as u16;
2128            let l1 = ((e >> 16) & 0xffff) as u16;
2129            let l2 = ((e >> 32) & 0xffff) as u16;
2130            if l0 != l1 || l0 != l2 {
2131                saw_divergence = true;
2132                break;
2133            }
2134        }
2135        assert!(
2136            saw_divergence,
2137            "non-grey kv6col must produce per-channel divergence in some kv6colmul slot"
2138        );
2139    }
2140
2141    /// Lightmode-2 with one point light + grey kv6col still
2142    /// produces R==G==B lanes (because the per-channel modulators
2143    /// are all 0x80<<8 = 0x8000). It must produce a non-uniform
2144    /// kv6colmul (some directions face the light, others away),
2145    /// which differs from lightmode<2 where every direction has the
2146    /// same dot magnitude regardless of position.
2147    #[test]
2148    fn update_reflects_lightmode2_produces_directional_shading() {
2149        let s = Sprite::axis_aligned(empty_kv6(), [100.0, 100.0, 100.0]);
2150        let lights = [LightSrc {
2151            pos: [110.0, 100.0, 100.0],
2152            r2: 100.0,
2153            sc: 16.0,
2154        }];
2155        let lighting = SpriteLighting {
2156            kv6col: DEFAULT_KV6COL,
2157            lightmode: 2,
2158            lights: &lights,
2159        };
2160        let (cm, _) = update_reflects(&s, &lighting);
2161        // Some directions must darken (shadow side) while others
2162        // brighten (light side) — the spread between min and max
2163        // tells us shading is happening.
2164        let mut min_w = u16::MAX;
2165        let mut max_w = 0u16;
2166        for e in cm.iter() {
2167            let l0 = (e & 0xffff) as u16;
2168            min_w = min_w.min(l0);
2169            max_w = max_w.max(l0);
2170        }
2171        assert!(
2172            max_w > min_w + 16,
2173            "lightmode-2 should produce directional shading: min={min_w} max={max_w}"
2174        );
2175    }
2176
2177    /// Lightmode-2 with no lights → ambient-only. Should still
2178    /// produce some non-zero kv6colmul (the synthetic ambient slot
2179    /// is non-trivial).
2180    #[test]
2181    fn update_reflects_lightmode2_no_lights_falls_back_to_ambient() {
2182        let s = Sprite::axis_aligned(empty_kv6(), [100.0, 100.0, 100.0]);
2183        let lighting = SpriteLighting {
2184            kv6col: DEFAULT_KV6COL,
2185            lightmode: 2,
2186            lights: &[],
2187        };
2188        let (cm, _) = update_reflects(&s, &lighting);
2189        let any_nonzero = cm.iter().any(|&e| e != 0);
2190        assert!(
2191            any_nonzero,
2192            "lightmode-2 with no lights should still emit ambient shading"
2193        );
2194    }
2195}