Skip to main content

roxlap_core/
sprite.rs

1//! KV6 sprite type + the `draw_sprite` dispatcher.
2//!
3//! Mirror of voxlap's `vx5sprite` (voxlap5.h:63-79) plus the
4//! `drawsprite` entry point (voxlap5.c:9818). For R6.1 the
5//! dispatcher is a stub — just enough API surface for the host to
6//! plumb a sprite reference through. R6.2-R6.4 fill in the actual
7//! kv6 frustum-cull + per-voxel rasterization behind it.
8//!
9//! Voxlap's vx5sprite is a 64-byte struct:
10//!
11//! ```text
12//! point3d p;       // position
13//! int32_t flags;   // bit 0: 0=normal shading
14//!                  // bit 1: 0=kv6data, 1=kfatype  (oracle uses 0)
15//!                  // bit 2: 0=normal, 1=invisible
16//! point3d s;       // x-basis (kv6data.xsiz direction)
17//! kv6data *voxnum; // (or kfatype *kfaptr if flag bit 1 set)
18//! point3d h;       // y-basis
19//! int32_t kfatim;
20//! point3d f;       // z-basis
21//! int32_t okfatim;
22//! ```
23//!
24//! For R6 we only handle kv6 sprites with `flags = 0` (the four
25//! oracle sprite poses all use this). KFA animation + the no-z and
26//! invisible flags are deferred.
27
28// The kv6draw port is pointer-arithmetic-heavy; the casts mirror C's
29// implicit i32/u32/usize narrowings. Loop bounds are clamped via
30// `lbound` so sign-loss / wrap is guarded at the type-system edge.
31// kv.{xsiz,ysiz,zsiz} are u32 with realistic max ≤ 256 (file format
32// limit) — well within f32's 24-bit mantissa.
33#![allow(
34    clippy::cast_possible_truncation,
35    clippy::cast_possible_wrap,
36    clippy::cast_sign_loss,
37    clippy::cast_precision_loss,
38    clippy::similar_names,
39    clippy::too_many_arguments,
40    clippy::too_many_lines,
41    clippy::cast_ptr_alignment, // _mm_loadl_epi64 / _mm_storeu_si128 are intentionally unaligned
42    clippy::doc_markdown,
43    clippy::no_effect_underscore_binding, // SSE intrinsic side-effect-only stores
44    clippy::no_effect, // the discarded pmaddwd intermediate
45    clippy::ref_as_ptr,
46    clippy::float_cmp_const,
47    clippy::float_cmp,
48)]
49
50use roxlap_formats::kv6::{Kv6, Voxel};
51use roxlap_formats::sprite::{Sprite, SPRITE_FLAG_INVISIBLE, SPRITE_FLAG_KFA, SPRITE_FLAG_NO_Z};
52
53use crate::camera_math::CameraState;
54use crate::engine::{Engine, LightSrc, DEFAULT_KV6COL};
55use crate::equivec::iunivec;
56use crate::fixed::ftol;
57use crate::opticast::OpticastSettings;
58use crate::ptfaces16::PTFACES16;
59
/// Voxlap's `MAXLIGHTS` cap (`voxlap5.c`). Sizes the
/// ambient-plus-N-lights `lightlist` scratch array (`MAX_LIGHTS + 1`
/// slots) that `build_kv6colmul_lightmode2` fills on behalf of
/// `update_reflects`'s lightmode≥2 branch.
const MAX_LIGHTS: usize = 16;
64
/// Voxlap's `vx5.kv6mipfactor` default (`voxlap5.c:12335`). Threshold
/// distance (in voxlap's "ftol-of-forward-projected" estimate units)
/// above which kv6draw walks the lowermip chain. Roxlap doesn't yet
/// model the lowermip chain in `roxlap-formats::Kv6`, so the mip
/// descent in `kv6_draw_prepare` is structurally faithful but
/// effectively a no-op until that lands.
pub(crate) const KV6_MIPFACTOR_DEFAULT: i32 = 128;
72
73/// Post-cull state derived from a sprite + camera pair — what the
74/// per-voxel iteration in R6.3+ needs to start its setup. Borrows
75/// the mip-selected kv6 from the sprite.
76///
77/// Voxlap doesn't materialise this struct (it operates on local
78/// variables inside `kv6draw`); roxlap factors the cull out so it's
79/// independently testable without staging the rest of the
80/// rasterizer.
81#[derive(Debug, Clone)]
82#[allow(dead_code)] // R6.3+ will read these fields.
83pub(crate) struct Kv6DrawSetup<'a> {
84    /// Mip-selected kv6. For the base-mip case (always, today),
85    /// this is just `&sprite.kv6`.
86    pub kv: &'a Kv6,
87    /// Mip-scaled basis vectors. For the base mip these equal
88    /// `sprite.s/h/f`; if a future lowermip walk runs, each is
89    /// scaled by `2^mip`.
90    pub ts: [f32; 3],
91    pub th: [f32; 3],
92    pub tf: [f32; 3],
93    /// 0 for the base mip; reserved for lowermip support.
94    pub mip: u32,
95}
96
97/// Mip-LOD descent + 4-plane frustum cull, mirror of voxlap5.c:8832-
98/// 8875. Returns `None` if the sprite's bound cube is fully behind
99/// any of the four view-frustum edge planes (`CameraState::nor`),
100/// `Some(setup)` otherwise with the post-cull state R6.3 needs.
101///
102/// # Cull math
103///
104/// The bound cube has centre `npos` (in camera-relative coords) and
105/// three half-extent vectors `nstr`, `nhei`, `nfor` (each = the
106/// kv6-axis basis vector scaled by the corresponding half-extent).
107/// For each frustum-edge normal `n`, voxlap tests:
108///
109/// ```text
110/// |nstr · n| + |nhei · n| + |nfor · n| + npos · n < 0
111/// ```
112///
113/// — i.e. the cube's closest-point projection onto `n` is still
114/// behind the plane. Any plane satisfying this culls the sprite.
115pub(crate) fn kv6_draw_prepare<'a>(
116    sprite: &'a Sprite,
117    cam: &CameraState,
118) -> Option<Kv6DrawSetup<'a>> {
119    let kv = &sprite.kv6;
120
121    // Voxlap's quick-and-dirty distance estimate (voxlap5.c:8835):
122    //   y = ftol((spr->p - gipos) · gifor)
123    // Used by the lowermip descent loop. Roxlap-formats `Kv6` doesn't
124    // model lowermip yet, so the loop never runs and this value is
125    // unused — computed for symmetry with voxlap and to lock the
126    // path for a future mip-chain port.
127    let dx = sprite.p[0] - cam.pos[0];
128    let dy = sprite.p[1] - cam.pos[1];
129    let dz = sprite.p[2] - cam.pos[2];
130    let dist_estimate = ftol(dx * cam.forward[0] + dy * cam.forward[1] + dz * cam.forward[2]);
131    let _ = (dist_estimate, KV6_MIPFACTOR_DEFAULT);
132    let mip = 0u32;
133    let ts = sprite.s;
134    let th = sprite.h;
135    let tf = sprite.f;
136
137    // Bound-cube centre + half-extents in camera-relative coords.
138    // (voxlap5.c:8852-8860; tp is centre offset from pivot, tp2 is
139    // axis half-extent.) kv->xsiz/ysiz/zsiz fit f32 exactly for
140    // any realistic kv6 (≤ 256³ per the file format limit).
141    #[allow(clippy::cast_precision_loss)]
142    let half_x = kv.xsiz as f32 * 0.5;
143    #[allow(clippy::cast_precision_loss)]
144    let half_y = kv.ysiz as f32 * 0.5;
145    #[allow(clippy::cast_precision_loss)]
146    let half_z = kv.zsiz as f32 * 0.5;
147    let off_x = half_x - kv.xpiv;
148    let off_y = half_y - kv.ypiv;
149    let off_z = half_z - kv.zpiv;
150    let npos = [
151        off_x * ts[0] + off_y * th[0] + off_z * tf[0] + dx,
152        off_x * ts[1] + off_y * th[1] + off_z * tf[1] + dy,
153        off_x * ts[2] + off_y * th[2] + off_z * tf[2] + dz,
154    ];
155    let nstr = [ts[0] * half_x, ts[1] * half_x, ts[2] * half_x];
156    let nhei = [th[0] * half_y, th[1] * half_y, th[2] * half_y];
157    let nfor = [tf[0] * half_z, tf[1] * half_z, tf[2] * half_z];
158
159    // 4-plane cull (voxlap5.c:8861-8875, walked z=3..0).
160    for n in &cam.nor {
161        let proj_str = (nstr[0] * n[0] + nstr[1] * n[1] + nstr[2] * n[2]).abs();
162        let proj_hei = (nhei[0] * n[0] + nhei[1] * n[1] + nhei[2] * n[2]).abs();
163        let proj_for = (nfor[0] * n[0] + nfor[1] * n[1] + nfor[2] * n[2]).abs();
164        let proj_pos = npos[0] * n[0] + npos[1] * n[1] + npos[2] * n[2];
165        if proj_str + proj_hei + proj_for + proj_pos < 0.0 {
166            return None;
167        }
168    }
169
170    Some(Kv6DrawSetup {
171        kv,
172        ts,
173        th,
174        tf,
175        mip,
176    })
177}
178
/// 3×3 + translation matrix multiply, port of voxlap's `mat2`
/// (voxlap5.c:9619). Composes camera transform `(a_s, a_h, a_f, a_o)`
/// with sprite basis `(b_s, b_h, b_f, b_o)` into camera-relative
/// sprite basis `(c_s, c_h, c_f, c_o)`.
///
/// `c_s = a_s * b_s.x + a_h * b_s.y + a_f * b_s.z`, similarly for
/// `c_h` / `c_f`. `c_o` applies the same form to `b_o` and then adds
/// the translation `a_o`.
///
/// Returns the tuple `(c_s, c_h, c_f, c_o)`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn mat2(
    a_s: [f32; 3],
    a_h: [f32; 3],
    a_f: [f32; 3],
    a_o: [f32; 3],
    b_s: [f32; 3],
    b_h: [f32; 3],
    b_f: [f32; 3],
    b_o: [f32; 3],
) -> ([f32; 3], [f32; 3], [f32; 3], [f32; 3]) {
    // Linear part of the `a` transform applied to one vector. The
    // summation order (s, then h, then f) is kept fixed so results
    // stay bit-identical across all four outputs.
    let rotate = |v: [f32; 3]| -> [f32; 3] {
        [
            a_s[0] * v[0] + a_h[0] * v[1] + a_f[0] * v[2],
            a_s[1] * v[0] + a_h[1] * v[1] + a_f[1] * v[2],
            a_s[2] * v[0] + a_h[2] * v[1] + a_f[2] * v[2],
        ]
    };

    let c_s = rotate(b_s);
    let c_h = rotate(b_h);
    let c_f = rotate(b_f);
    // The origin is rotated like the basis vectors, then translated.
    let r_o = rotate(b_o);
    let c_o = [r_o[0] + a_o[0], r_o[1] + a_o[1], r_o[2] + a_o[2]];
    (c_s, c_h, c_f, c_o)
}
219
/// Voxlap's `lbound(a, b, c)` (voxlap5.c:406): clamp `a` into the
/// inclusive range `[b, c]`.
///
/// # Panics
///
/// Panics if `b > c`, which is the precondition of the underlying
/// `i32::clamp`.
#[inline]
fn lbound(a: i32, b: i32, c: i32) -> i32 {
    a.clamp(b, c)
}
226
227/// State derived from `Kv6DrawSetup` + `CameraState` that the
228/// per-voxel iteration consumes. Voxlap holds these on the stack
229/// inside `kv6draw`; roxlap factors them out so the iteration loop
230/// can be tested independently.
231#[derive(Debug, Clone)]
232#[allow(dead_code)] // R6.4+ reads scisdist / qsum0 / cadd / etc.
233pub(crate) struct Kv6IterState<'a> {
234    pub kv: &'a Kv6,
235    /// Camera origin expressed in kv6-local voxel coordinates,
236    /// clamped to `[-1, kv.xsiz]` etc. by voxlap's `lbound`. Splits
237    /// the voxel grid into the 4 + 1 quadrants the iteration walks
238    /// in different orders so that for each (x, y) column the inner
239    /// z-loop visits voxels closer to the camera first (= correct
240    /// painter's-style ordering for the rasterizer in R6.4).
241    pub inx: i32,
242    pub iny: i32,
243    pub inz: i32,
244    /// `vx5.xplanemin` / `vx5.xplanemax` mirror — voxlap defaults
245    /// to `[0, INT_MAX]` (no x-clipping). Roxlap doesn't yet expose
246    /// a public knob for these; pinning to the defaults matches the
247    /// oracle and any caller that doesn't care.
248    pub nxplanemin: i32,
249    pub nxplanemax: i32,
250}
251
252/// Full per-frame rasterizer state for one sprite — what
253/// `drawboundcubesse` reads via voxlap's globals.
254///
255/// Built by [`kv6_compute_full_state`] from the post-cull
256/// `Kv6DrawSetup` + the camera's projection params. Mirror of the
257/// voxlap5.c:8915-8973 setup block + the qsum1/qbplbpp framebuffer
258/// state from `voxsetframebuffer` (voxlap5.c:11119-11122) +
259/// kv6colmul/kv6coladd from `updatereflects` (voxlap5.c:8466).
260#[derive(Debug, Clone)]
261pub(crate) struct Kv6FullState<'a> {
262    pub iter: Kv6IterState<'a>,
263    /// 8 cube-vertex offsets, gihz-scaled. `cadd4[k]` for `k = 0..7`
264    /// is the offset of cube vertex `k` from the voxel origin, where
265    /// bit 0 = +x, bit 1 = +z (post-swap == old +z), bit 2 = +y
266    /// (post-swap == old -y). `cadd4[0]` is `[0; 4]`. Lane 3 of
267    /// each entry duplicates lane 2 (z) — voxlap's SSE convenience.
268    pub cadd4: [[f32; 4]; 8],
269    /// Per-z step table: `ztab4_per_z[z] = z * cadd4[2]`. Length =
270    /// `kv.zsiz`. Indexed by `v.z` in `drawboundcubesse`.
271    pub ztab4_per_z: Vec<[f32; 4]>,
272    /// Initial r1 — the x=0 column base after voxlap's "ANNOYING
273    /// HACK" pre-decrement. = `(npos*gihz with z2=npos.z) -
274    /// cadd4[4]`. Iterates by `cadd4[1]` per x and (via r0) by
275    /// `cadd4[4]` per y.
276    pub r1_initial: [f32; 4],
277    /// `r2 = -ysiz * cadd4[4]`. Used to reset r0 between forward-y
278    /// and reverse-y phases inside one x column.
279    pub r2: [f32; 4],
280    /// Near-plane scissor distance (camera-space Z).
281    /// `voxlap5.c:8953-8956` — equals the negative sum of any
282    /// negative components of post-swap `nstr.z` / `nhei.z` /
283    /// `nfor.z`. `0.0` if all three are non-negative.
284    pub scisdist: f32,
285    /// Viewport-clip biases (voxlap5.c:8947-8948). Used by the SSE2
286    /// path's `paddsw` / `pmaxsw` AABB clipping; the scalar port clips
287    /// directly against `target.width` / `target.height`.
288    #[allow(dead_code)]
289    pub qsum0: [i16; 4],
290    /// Viewport-clip floor (voxlap5.c:11120).
291    #[allow(dead_code)]
292    pub qsum1: [i16; 4],
293    /// Framebuffer pixel-stride packed for `pmaddwd` (voxlap5.c:11121).
294    #[allow(dead_code)]
295    pub qbplbpp: [i16; 4],
296    /// Per-direction colour modulation table built by
297    /// [`update_reflects`]. Indexed by `v.dir` (256 entries). Each
298    /// entry packs four `u16` modulation factors (one per byte
299    /// channel) used by `_mm_mulhi_epu16` against the unpacked
300    /// voxel colour.
301    pub kv6colmul: Box<[u64; 256]>,
302    /// Fog bias added after the colour modulate. Zero when fog is
303    /// disabled (the oracle case).
304    pub kv6coladd: u64,
305}
306
/// Borrowed framebuffer + zbuffer the per-voxel rasterizer fills.
///
/// Mirrors voxlap's `kv6frameplace` + `zbuffermem` but in
/// row-major-pixel form rather than byte-pointer form. `width` /
/// `height` must match the `OpticastSettings.xres` / `yres` used
/// when the per-frame `Kv6FullState` was built — the bounds derived from
/// `qsum0` / `qsum1` assume that geometry.
///
/// Internally a raw-pointer view (similar to
/// [`crate::scalar_rasterizer::RasterTarget`]) so the type is
/// `Copy + Send + Sync` and the R12.4.2 [`draw_sprites_parallel`]
/// entry point can hand per-thread copies into rayon worker
/// closures. Each parallel sprite-draw competes for the
/// framebuffer / zbuffer via z-test; for non-overlapping sprites
/// this is race-free, for overlapping pixels a tied-z race may
/// leak (visually indistinguishable, hash non-deterministic).
#[derive(Clone, Copy, Debug)]
pub struct DrawTarget<'a> {
    /// Start of the framebuffer slice handed to [`DrawTarget::new`].
    fb_ptr: *mut u32,
    /// Element count of the framebuffer slice.
    fb_len: usize,
    /// Start of the zbuffer slice handed to [`DrawTarget::new`].
    zb_ptr: *mut f32,
    /// Element count of the zbuffer slice.
    zb_len: usize,
    /// Row stride in pixels.
    pub pitch_pixels: usize,
    /// Framebuffer width in pixels.
    pub width: u32,
    /// Framebuffer height in pixels.
    pub height: u32,
    /// Ties the raw pointers to the `'a` exclusive borrow taken by
    /// the constructor.
    _marker: std::marker::PhantomData<&'a mut [u32]>,
}

// SAFETY: same shape as the (`&'a mut [u32]`, `&'a mut [f32]`) pair
// the constructor consumed; both are auto-`Send` for `T: Send`. The
// pointer-aliasing contract for [`draw_sprites_parallel`] is
// "z-test arbitrates concurrent writes".
//
// NOTE(review): the accessors below use plain (non-atomic) pointer
// reads/writes. Under Rust's memory model, two threads touching the
// same element without synchronisation is a data race, which is
// undefined behaviour rather than only a determinism issue — confirm
// that parallel callers never hit the same pixel concurrently, or
// move the buffers to atomics.
unsafe impl Send for DrawTarget<'_> {}
unsafe impl Sync for DrawTarget<'_> {}

impl<'a> DrawTarget<'a> {
    /// Build a target from exclusive slice borrows + framebuffer
    /// dimensions. The slices are consumed (their `&'a mut`
    /// re-borrow is what gates lifetime); subsequent access happens
    /// via the raw pointers held in the struct.
    ///
    /// No relationship between the slice lengths, `pitch_pixels`,
    /// `width` and `height` is checked here; callers of the `unsafe`
    /// accessors are responsible for staying in bounds.
    #[must_use]
    pub fn new(
        framebuffer: &'a mut [u32],
        zbuffer: &'a mut [f32],
        pitch_pixels: usize,
        width: u32,
        height: u32,
    ) -> Self {
        Self {
            fb_ptr: framebuffer.as_mut_ptr(),
            fb_len: framebuffer.len(),
            zb_ptr: zbuffer.as_mut_ptr(),
            zb_len: zbuffer.len(),
            pitch_pixels,
            width,
            height,
            _marker: std::marker::PhantomData,
        }
    }

    /// Unconditional framebuffer write. Used by sequential 2D
    /// blitters (`drawtile`) that don't engage z-testing.
    ///
    /// # Safety
    /// `idx < self.fb_len`. The disjoint-write contract still
    /// applies if multiple `Copy` instances of `DrawTarget` are in
    /// flight across threads — this method does NOT arbitrate via
    /// z-test.
    #[inline]
    pub unsafe fn fb_write(self, idx: usize, color: u32) {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: caller asserts in-bounds + (for parallel use)
        // disjoint writes.
        unsafe { self.fb_ptr.add(idx).write(color) };
    }

    /// Read one framebuffer pixel. Used by alpha-blend paths
    /// (`drawtile` modulate-and-blend) that read-modify-write.
    ///
    /// # Safety
    /// `idx < self.fb_len`. Concurrent writers to the same `idx`
    /// from another thread invalidate the read; sequential blits
    /// are race-free.
    #[inline]
    #[must_use]
    pub unsafe fn fb_read(self, idx: usize) -> u32 {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: caller asserts in-bounds.
        unsafe { self.fb_ptr.add(idx).read() }
    }

    /// Z-tested pixel write. If `z < zbuffer[idx]`, the new color +
    /// z stamp the buffers and `true` is returned; otherwise nothing
    /// changes and `false` is returned. A NaN `z` never passes the
    /// `<` comparison, so it is never written.
    ///
    /// # Safety
    /// `idx < self.fb_len` **and** `idx < self.zb_len` — both buffers
    /// are dereferenced at `idx`. For parallel callers, the wedge /
    /// z-test arbitration contract on [`DrawTarget`] applies (see
    /// struct doc).
    #[inline]
    #[must_use]
    pub unsafe fn z_test_write(self, idx: usize, color: u32, z: f32) -> bool {
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
        // SAFETY: caller asserts in-bounds for both buffers +
        // concurrent-write contract.
        unsafe {
            let zp = self.zb_ptr.add(idx);
            let cur_z = zp.read();
            if z < cur_z {
                zp.write(z);
                self.fb_ptr.add(idx).write(color);
                true
            } else {
                false
            }
        }
    }
}
426
/// Lane-wise sum of two 4-wide float vectors: `out[i] = a[i] + b[i]`.
#[inline]
fn vec4_add(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]
}
431
/// Lane-wise difference of two 4-wide float vectors:
/// `out[i] = a[i] - b[i]`.
#[inline]
fn vec4_sub(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [a[0] - b[0], a[1] - b[1], a[2] - b[2], a[3] - b[3]]
}
436
/// Multiply every lane of a 4-wide float vector by the scalar `s`.
#[inline]
fn vec4_scale(a: [f32; 4], s: f32) -> [f32; 4] {
    [a[0] * s, a[1] * s, a[2] * s, a[3] * s]
}
441
442/// Sprite lighting + colour state — the subset of voxlap's
443/// `vx5` global that `updatereflects` reads. Built once per
444/// frame from [`Engine`] state and passed to [`draw_sprite`].
445///
446/// All fields mirror voxlap names:
447/// - `kv6col` ↔ `vx5.kv6col`
448/// - `lightmode` ↔ `vx5.lightmode`
449/// - `lights` ↔ `vx5.lightsrc[0..vx5.numlights]`
450///
451/// The `vx5.fogcol`/`ofogdist` fog plumbing is deferred — sprite
452/// fog stays off for now, matching the oracle path
453/// (`vx5.fogcol < 0` ⇒ `ofogdist == -1` in voxlap C, no fog).
454#[derive(Debug, Clone, Copy)]
455pub struct SpriteLighting<'a> {
456    /// Material colour. R==G==B triggers the cheaper nolighta path
457    /// in `update_reflects`; arbitrary RGB takes the per-channel
458    /// nolightb path; lightmode≥2 ignores the R==G==B fast path
459    /// and always does per-channel modulation.
460    pub kv6col: u32,
461    /// `0` / `1` → directional surface tint (lightmode<2 paths).
462    /// `2` → per-light shadow-side modulation against `lights`.
463    pub lightmode: u32,
464    /// Active point lights — voxlap's `vx5.lightsrc[..vx5.numlights]`.
465    /// Empty for lightmode<2; populated for lightmode≥2.
466    pub lights: &'a [LightSrc],
467}
468
469impl<'a> SpriteLighting<'a> {
470    /// Snapshot the lighting + colour subset of an [`Engine`].
471    /// Use this once per frame in the host so the sprite render
472    /// reflects engine setters made between frames.
473    #[must_use]
474    pub fn from_engine(engine: &'a Engine) -> Self {
475        Self {
476            kv6col: engine.kv6col(),
477            lightmode: engine.lightmode(),
478            lights: engine.lights(),
479        }
480    }
481}
482
483impl SpriteLighting<'static> {
484    /// Default oracle config — grey `kv6col`, lightmode 0, no
485    /// lights. Used by `roxlap-oracle` so the four sprite golden
486    /// hashes stay byte-stable: this is the exact state voxlap C's
487    /// oracle has when it calls `drawsprite`.
488    #[must_use]
489    pub fn default_oracle() -> Self {
490        Self {
491            kv6col: DEFAULT_KV6COL,
492            lightmode: 0,
493            lights: &[],
494        }
495    }
496}
497
498/// Builds `kv6colmul[256]` + `kv6coladd[0]` from the engine's
499/// sprite lighting state. Mirror of voxlap's `updatereflects`
500/// (`voxlap5.c:8466-8750`).
501///
502/// Branches:
503/// - `lightmode < 2` + R==G==B `kv6col` → nolighta (cheap
504///   single-multiplier path, voxlap5.c:8553-8584).
505/// - `lightmode < 2` + arbitrary `kv6col` → nolightb (per-channel
506///   path, voxlap5.c:8587-8629).
507/// - `lightmode >= 2` → per-light shadow-side modulation
508///   (voxlap5.c:8631-8750), iterating the active `lights`.
509///
510/// `flags & 1` (disable shading) and the active-fog path remain
511/// deferred — neither is exercised by the oracle's four sprite
512/// poses, and adding them is a follow-up that doesn't change the
513/// already-frozen hashes.
514///
515fn update_reflects(sprite: &Sprite, lighting: &SpriteLighting<'_>) -> (Box<[u64; 256]>, u64) {
516    // Sprite fog plumbing is a follow-up — `vx5.fogcol < 0` (voxlap
517    // C oracle's set_fogcol(BR(...)) state) means ofogdist stays -1,
518    // fogmul = 0, kv6coladd[0] = 0. We pin to that here.
519    let fogmul_lo: u32 = 0;
520    let kv6coladd: u64 = 0;
521
522    let kv6col = lighting.kv6col;
523
524    // g = ((fogmul & 32767) ^ 32767) * (16*8/65536). With fogmul=0:
525    //   g = 32767 * (128/65536) ≈ 63.998.
526    let g_pre = ((((fogmul_lo & 0x7fff) ^ 0x7fff) as i32) as f32) * (16.0 * 8.0 / 65536.0);
527
528    let mut kv6colmul = Box::new([0u64; 256]);
529
530    if lighting.lightmode < 2 {
531        // (voxlap5.c:8538-8543) fx=fy=fz=1.0; tp = sum of basis vectors.
532        let tp_x = sprite.s[0] + sprite.h[0] + sprite.f[0];
533        let tp_y = sprite.s[1] + sprite.h[1] + sprite.f[1];
534        let tp_z = sprite.s[2] + sprite.h[2] + sprite.f[2];
535
536        let f0 = 64.0_f32 / (tp_x * tp_x + tp_y * tp_y + tp_z * tp_z).sqrt();
537
538        // R==G==B test: ((kv6col & 0xffff) << 8) ^ (kv6col & 0xffff00)
539        //   == 0  iff  R == G and G == B.
540        let lo16 = kv6col & 0xffff;
541        let mid24 = kv6col & 0x00ff_ff00;
542        let is_grey = ((lo16 << 8) ^ mid24) == 0;
543
544        if is_grey {
545            // Nolighta path (voxlap5.c:8553-8584): grey kv6col absorbs
546            // into a single multiplier per direction.
547            let g = g_pre * (((kv6col & 0xff) as f32) / 256.0);
548            let f = f0 * g;
549
550            let l0 = (tp_x * f) as i16; // (short)(...) is C truncating cast
551            let l1 = (tp_y * f) as i16;
552            let l2 = (tp_z * f) as i16;
553            let l3 = (g * 128.0) as i16;
554
555            let iu = iunivec();
556            for k in 0..256 {
557                let w = dot_iunivec_i16x4(iu[k], [l0, l1, l2, l3]);
558                let w64 = u64::from(w);
559                kv6colmul[k] = w64 | (w64 << 16) | (w64 << 32) | (w64 << 48);
560            }
561        } else {
562            // Nolightb path (voxlap5.c:8587-8629). Per-channel
563            // modulation factor M_k = (kv6col_byte_k << 8) → mulhi_pu16
564            // by the per-direction dot. Same dot derivation as nolighta.
565            let f = f0 * g_pre;
566
567            let l0 = (tp_x * f) as i16;
568            let l1 = (tp_y * f) as i16;
569            let l2 = (tp_z * f) as i16;
570            let l3 = (g_pre * 128.0) as i16;
571
572            let m = kv6col_channel_mods(kv6col);
573
574            let iu = iunivec();
575            for k in 0..256 {
576                let w = dot_iunivec_i16x4(iu[k], [l0, l1, l2, l3]);
577                kv6colmul[k] = pack_modulated_word(w, m);
578            }
579        }
580    } else {
581        // Lightmode≥2 path (voxlap5.c:8631-8750): per-sprite point
582        // lighting from `lighting.lights`. Each light projects onto
583        // the sprite's normalised basis; per-direction kv6colmul[i]
584        // starts from a synthetic ambient slot and subtracts shadow
585        // contributions from each light's "negative" lanes.
586        let m = kv6col_channel_mods(kv6col);
587        build_kv6colmul_lightmode2(sprite, lighting.lights, &mut kv6colmul, fogmul_lo, m);
588    }
589
590    (kv6colmul, kv6coladd)
591}
592
593/// Build voxlap's per-surface-normal colour-modulation table for a
594/// sprite under the given lighting — the `kv6colmul[256]` (one packed
595/// u64 per `Voxel::dir`, four 16-bit channel multipliers) plus the
596/// `kv6coladd` bias `draw_sprite` adds. This is exactly the table the
597/// CPU rasteriser uses (`update_reflects`); exposed so other backends
598/// (the GPU sprite pass) can shade KV6 sprites with identical math
599/// rather than re-deriving voxlap's lighting in a shader.
600///
601/// Per voxel, the final colour is, per channel `c`:
602/// `clamp(((rgb[c] << 8) * (kv6colmul[dir] >> 16*c & 0xffff)) >> 16
603///  + (kv6coladd >> 16*c & 0xffff), 0, 255)` — i.e. an
604/// `_mm_mulhi_epu16` + `_mm_add_epi16` + `_mm_packus_epi16`.
605#[must_use]
606pub fn sprite_colmul(sprite: &Sprite, lighting: &SpriteLighting<'_>) -> ([u64; 256], u64) {
607    let (mul, add) = update_reflects(sprite, lighting);
608    (*mul, add)
609}
610
/// Voxlap's `pmaddwd(iunivec[k], lightlist) summed across two
/// dword lanes mod 2^32, take high 16` reduction. Returns the
/// `u16` modulation factor before any per-channel packing.
///
/// Each `i16 × i16` product fits in an `i32` (magnitude ≤ 2^30), so
/// only the accumulation needs wrapping arithmetic; the sum of four
/// products can reach 2^32 and is reduced mod 2^32 like the asm.
#[inline]
fn dot_iunivec_i16x4(u: [i16; 4], l: [i16; 4]) -> u16 {
    let sum = u.iter().zip(l.iter()).fold(0u32, |acc, (&a, &b)| {
        acc.wrapping_add((i32::from(a) * i32::from(b)) as u32)
    });
    (sum >> 16) as u16
}
626
/// `(kv6col_byte_k << 8)` per channel — the four `M_k` factors the
/// nolightb / lightmode≥2 paths multiply against the per-direction
/// dot via `pmulhuw`.
///
/// Index 0 holds the least-significant byte of `kv6col`, index 3 the
/// most-significant one.
#[inline]
fn kv6col_channel_mods(kv6col: u32) -> [u16; 4] {
    // Little-endian byte order gives exactly the low-byte-first
    // channel order; each byte is promoted into the high half of a
    // 16-bit lane.
    kv6col.to_le_bytes().map(|byte| u16::from(byte) << 8)
}
639
/// Pack one direction's `kv6colmul[k]` u64: per-channel
/// `(W * M_c) >> 16` words concatenated, channel 0 in the lowest
/// 16 bits and channel 3 in the highest.
///
/// The `u16 × u16` product fits in a `u32`, and its high half is at
/// most `0xfffe`, so every lane stays within 16 bits.
#[inline]
fn pack_modulated_word(w_dot: u16, m: [u16; 4]) -> u64 {
    let w = u32::from(w_dot);
    // High 16 bits of the 32-bit product for channel `c`.
    let lane = |c: usize| u64::from((w * u32::from(m[c])) >> 16);
    lane(0) | (lane(1) << 16) | (lane(2) << 32) | (lane(3) << 48)
}
651
652/// Lightmode≥2 path body — voxlap5.c:8631-8750. Builds the full
653/// `kv6colmul[256]` from the active light list.
654///
655/// Steps:
656/// 1. Normalise each sprite-basis axis (`sprs`/`sprh`/`sprf`).
657/// 2. For each light within `r2` of the sprite, compute its
658///    intensity falloff `h` and project the world-space delta onto
659///    the normalised sprite basis → store in `lightlist[k]`.
660/// 3. Append a synthetic ambient slot (voxlap's hardcoded
661///    `(fx, fy, fz) = (0, 0.5, 1.0)` direction) at
662///    `lightlist[lightcnt]`.
663/// 4. For each direction `idx ∈ 0..256`:
664///    - `base = ambient_slot · iunivec[idx]` (treated as one u32).
665///    - For each real light `k`: compute `dot = light_k ·
666///      iunivec[idx]`, split into low/high i16 lanes (asm-faithful
667///      "16-bits-is-ugly-but-ok-here" quirk); subtract the negative
668///      lanes from `base` (= shadow side of the surface).
669///    - `W = base >> 16`, then per-channel modulate against `M_c`
670///      and pack into `kv6colmul[idx]`.
671fn build_kv6colmul_lightmode2(
672    sprite: &Sprite,
673    lights: &[LightSrc],
674    kv6colmul: &mut [u64; 256],
675    fogmul_lo: u32,
676    m: [u16; 4],
677) {
678    // (voxlap5.c:8638-8643) Normalise sprite basis. WARNING from
679    // voxlap: only correct for orthonormal sprite-bases; non-
680    // orthogonal bases (e.g. shears) drift. The four oracle sprite
681    // poses are all orthonormal so this matches voxlap's behaviour.
682    let sprs = normalise(sprite.s);
683    let sprh = normalise(sprite.h);
684    let sprf = normalise(sprite.f);
685
686    // hh = ((fogmul & 32767) ^ 32767) / 65536 * 2 (voxlap5.c:8645).
687    // With fogmul=0 → hh = 32767 / 65536 * 2 ≈ 1.0. This is a
688    // distinct scaling from `g_pre` (= same numerator * 128/65536
689    // for the lightmode<2 path) — they differ by a factor of 64.
690    // An earlier port mistakenly derived hh from g_pre / 128 = 0.5,
691    // giving sprites half the intended ambient brightness.
692    let hh_initial = ((((fogmul_lo & 0x7fff) ^ 0x7fff) as i32) as f32) * (2.0 / 65536.0);
693
694    // Project each in-range light onto the sprite basis.
695    let mut lightlist: [[i16; 4]; MAX_LIGHTS + 1] = [[0; 4]; MAX_LIGHTS + 1];
696    let mut lightcnt: usize = 0;
697    for light in lights.iter().rev() {
698        if lightcnt >= MAX_LIGHTS {
699            break;
700        }
701        let fx = light.pos[0] - sprite.p[0];
702        let fy = light.pos[1] - sprite.p[1];
703        let fz = light.pos[2] - sprite.p[2];
704        let gg = fx * fx + fy * fy + fz * fz;
705        let ff = light.r2;
706        // Voxlap's `*(int32_t *)&gg < *(int32_t *)&ff` is a bit-
707        // pattern compare. For non-negative finite floats the bit
708        // order matches the magnitude order, so `gg < ff` is
709        // equivalent (and safer in the presence of NaN: NaN !< x
710        // for any x, matching voxlap's float-bit-cast trick).
711        if gg >= ff || gg <= 0.0 {
712            continue;
713        }
714        let f = ff.sqrt();
715        let g = gg.sqrt();
716        // h = (f*ff - g*gg) / (f*ff*g*gg) * sc * 16
717        let mut h = (f * ff - g * gg) / (f * ff * g * gg) * light.sc * 16.0;
718        if g * h > 4096.0 {
719            h = 4096.0 / g; // saturation clip
720        }
721        h *= hh_initial;
722        let l0 = (fx * sprs[0] + fy * sprs[1] + fz * sprs[2]) * h;
723        let l1 = (fx * sprh[0] + fy * sprh[1] + fz * sprh[2]) * h;
724        let l2 = (fx * sprf[0] + fy * sprf[1] + fz * sprf[2]) * h;
725        lightlist[lightcnt] = [l0 as i16, l1 as i16, l2 as i16, 0];
726        lightcnt += 1;
727    }
728
729    // Synthetic ambient slot: voxlap's hardcoded direction
730    // (fx, fy, fz) = (0, 0.5, 1.0) projected onto the sprite basis,
731    // scaled by `hh * 16*16*8/2 = hh * 1024`. The lane-3 bias is
732    // `hh * 48 / 16 = hh * 3`.
733    let amb_fx = 0.0_f32;
734    let amb_fy = 0.5_f32;
735    let amb_fz = 1.0_f32;
736    let hh = hh_initial * (16.0 * 16.0 * 8.0 / 2.0);
737    let al0 = (sprs[0] * amb_fx + sprs[1] * amb_fy + sprs[2] * amb_fz) * hh;
738    let al1 = (sprh[0] * amb_fx + sprh[1] * amb_fy + sprh[2] * amb_fz) * hh;
739    let al2 = (sprf[0] * amb_fx + sprf[1] * amb_fy + sprf[2] * amb_fz) * hh;
740    let al3 = hh * (48.0 / 16.0);
741    lightlist[lightcnt] = [al0 as i16, al1 as i16, al2 as i16, al3 as i16];
742
743    let iu = iunivec();
744    for idx in 0..256 {
745        let u = iu[idx];
746        // Ambient base = lightlist[lightcnt] · iunivec[idx], in u32
747        // wrapping arithmetic (asm summed the pmaddwd dword lanes
748        // mod 2^32).
749        let u0 = i32::from(u[0]);
750        let u1 = i32::from(u[1]);
751        let u2 = i32::from(u[2]);
752        let u3 = i32::from(u[3]);
753        let amb = lightlist[lightcnt];
754        let base_lo = (u0.wrapping_mul(i32::from(amb[0]))) as u32;
755        let base_lo = base_lo.wrapping_add((u1.wrapping_mul(i32::from(amb[1]))) as u32);
756        let base_hi = (u2.wrapping_mul(i32::from(amb[2]))) as u32;
757        let base_hi = base_hi.wrapping_add((u3.wrapping_mul(i32::from(amb[3]))) as u32);
758        let mut base = base_lo.wrapping_add(base_hi);
759
760        // For each real light, compute dot, then subtract its
761        // "negative" half-lanes from `base` (= shadow side).
762        for k in (0..lightcnt).rev() {
763            let l = lightlist[k];
764            let klo = (u0.wrapping_mul(i32::from(l[0]))) as u32;
765            let klo = klo.wrapping_add((u1.wrapping_mul(i32::from(l[1]))) as u32);
766            let khi = (u2.wrapping_mul(i32::from(l[2]))) as u32;
767            let khi = khi.wrapping_add((u3.wrapping_mul(i32::from(l[3]))) as u32);
768            let dot = klo.wrapping_add(khi);
769            // Voxlap quirk: 32-bit dot but pminsw is per-i16 lane.
770            // Light magnitudes stay clamped enough that the
771            // mixed-lane behaviour is benign — port faithfully.
772            let lo16 = (dot & 0xffff) as i16;
773            let hi16 = ((dot >> 16) & 0xffff) as i16;
774            let lo16c: u16 = if lo16 < 0 { lo16 as u16 } else { 0 };
775            let hi16c: u16 = if hi16 < 0 { hi16 as u16 } else { 0 };
776            let sub = (u32::from(hi16c) << 16) | u32::from(lo16c);
777            base = base.wrapping_sub(sub);
778        }
779
780        let w_dot = (base >> 16) as u16;
781        kv6colmul[idx] = pack_modulated_word(w_dot, m);
782    }
783}
784
/// Scale a 3-vector to unit length.
///
/// The squared length is accumulated as `x*x + y*y + z*z`. When it
/// compares `<= 0.0` (the zero vector) the input is handed back
/// untouched, so the reciprocal of a zero length is never formed —
/// voxlap's unconditional `1.0 / sqrt(...)` would NaN out there, but
/// the C code is never given a zero basis axis. A NaN squared length
/// fails the `<= 0.0` comparison and therefore still goes through the
/// scaling path.
#[inline]
fn normalise(v: [f32; 3]) -> [f32; 3] {
    let len_sq = v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
    if len_sq <= 0.0 {
        v
    } else {
        // `recip()` is `1.0 / x`; every component is then multiplied
        // by the same reciprocal.
        let inv = len_sq.sqrt().recip();
        v.map(|component| component * inv)
    }
}
798
799/// Full setup: mat2 + Cramer's + nfor↔nhei swap + cadd4/ztab4/r1/r2/
800/// scisdist/qsum0 init. Mirror of voxlap5.c:8915-8973.
801pub(crate) fn kv6_compute_full_state<'a>(
802    setup: &Kv6DrawSetup<'a>,
803    sprite: &Sprite,
804    lighting: &SpriteLighting<'_>,
805    cam: &CameraState,
806    settings: &OpticastSettings,
807    fb_width: u32,
808    fb_height: u32,
809    fb_pitch_pixels: usize,
810) -> Kv6FullState<'a> {
811    let sprite_pos = sprite.p;
812    let kv = setup.kv;
813
814    // Transform sprite basis from world to camera-relative
815    // screen-axis coords (voxlap5.c:8916). `(gixs, giys, gizs)` is
816    // the transposed camera basis; `giadd` is the translation half.
817    let (nstr, mut nhei, mut nfor, mut npos) = mat2(
818        cam.xs, cam.ys, cam.zs, cam.add, setup.ts, setup.th, setup.tf, sprite_pos,
819    );
820
821    // Shift `npos` so it points at the kv6 origin (corner [0,0,0])
822    // rather than the pivot point — Cramer's rule below solves for
823    // the camera origin in kv6-local voxel coords, which only makes
824    // sense relative to the corner. (voxlap5.c:8917-8919)
825    npos[0] -= kv.xpiv * nstr[0] + kv.ypiv * nhei[0] + kv.zpiv * nfor[0];
826    npos[1] -= kv.xpiv * nstr[1] + kv.ypiv * nhei[1] + kv.zpiv * nfor[1];
827    npos[2] -= kv.xpiv * nstr[2] + kv.ypiv * nhei[2] + kv.zpiv * nfor[2];
828
829    // Cramer's rule for `nstr * X + nhei * Y + nfor * Z + npos = 0`.
830    // (voxlap5.c:8923-8936)
831    let tp = [
832        nhei[1] * nfor[2] - nfor[1] * nhei[2],
833        nfor[1] * nstr[2] - nstr[1] * nfor[2],
834        nstr[1] * nhei[2] - nhei[1] * nstr[2],
835    ];
836    let det = nstr[0] * tp[0] + nhei[0] * tp[1] + nfor[0] * tp[2];
837    // Float-bit comparison against zero: matches voxlap's
838    // `if (f != 0)` and dodges clippy::float_cmp.
839    let (raw_inx, raw_iny, raw_inz) = if det.to_bits() & 0x7fff_ffff != 0 {
840        let f_inv = -1.0 / det;
841        let tp2 = [
842            npos[1] * nfor[2] - nfor[1] * npos[2],
843            nhei[1] * npos[2] - npos[1] * nhei[2],
844            npos[1] * nstr[2] - nstr[1] * npos[2],
845        ];
846        (
847            ftol((npos[0] * tp[0] - nhei[0] * tp2[0] - nfor[0] * tp2[1]) * f_inv),
848            ftol((npos[0] * tp[1] + nstr[0] * tp2[0] - nfor[0] * tp2[2]) * f_inv),
849            ftol((npos[0] * tp[2] + nstr[0] * tp2[1] + nhei[0] * tp2[2]) * f_inv),
850        )
851    } else {
852        (-1, -1, -1)
853    };
854
855    let xsiz_i = kv.xsiz as i32;
856    let ysiz_i = kv.ysiz as i32;
857    let zsiz_i = kv.zsiz as i32;
858    let iter = Kv6IterState {
859        kv,
860        inx: lbound(raw_inx, -1, xsiz_i),
861        iny: lbound(raw_iny, -1, ysiz_i),
862        inz: lbound(raw_inz, -1, zsiz_i),
863        // Voxlap default `vx5.xplanemin = 0`, `xplanemax = 0x7fffffff`.
864        nxplanemin: 0,
865        nxplanemax: i32::MAX,
866    };
867
868    // Swap `nhei` ↔ `nfor` with sign flip on the new `nfor`
869    // (voxlap5.c:8942-8944). Equivalent to a 90° rotation that lines
870    // the basis up with cadd4's bit-encoded vertex offsets:
871    //   cadd4[1] = +x  (post-swap nstr direction)
872    //   cadd4[2] = +z  (post-swap nhei direction == original +z)
873    //   cadd4[4] = +y  (post-swap nfor direction == original -y)
874    // After this point `nfor` / `nhei` carry the post-swap values.
875    let swap_x = nhei[0];
876    nhei[0] = nfor[0];
877    nfor[0] = -swap_x;
878    let swap_y = nhei[1];
879    nhei[1] = nfor[1];
880    nfor[1] = -swap_y;
881    let swap_z = nhei[2];
882    nhei[2] = nfor[2];
883    nfor[2] = -swap_z;
884
885    // qsum0 (voxlap5.c:8947-8948). The `0x7fff - (xres - hx)`
886    // form sets the bias such that adding it to a screen-space
887    // bound makes the bound saturate-positive when it lands
888    // inside the viewport.
889    let xres_i = settings.xres as i32;
890    let yres_i = settings.yres as i32;
891    let hx_i = ftol(settings.hx);
892    let hy_i = ftol(settings.hy);
893    let qsum0_x = (0x7fff - (xres_i - hx_i)) as i16;
894    let qsum0_y = (0x7fff - (yres_i - hy_i)) as i16;
895    let qsum0 = [qsum0_x, qsum0_y, qsum0_x, qsum0_y];
896
897    // scisdist (voxlap5.c:8953-8956). Voxlap's `*(int32_t *)&f < 0`
898    // bit-trick: a positive-finite float has bit-pattern >= 0;
899    // only *negative* floats land < 0 as signed int. So this loop
900    // sums the absolute value of any negative-z post-swap basis
901    // component into a near-plane bias.
902    let mut scisdist = 0.0f32;
903    if (nstr[2].to_bits() as i32) < 0 {
904        scisdist -= nstr[2];
905    }
906    if (nhei[2].to_bits() as i32) < 0 {
907        scisdist -= nhei[2];
908    }
909    if (nfor[2].to_bits() as i32) < 0 {
910        scisdist -= nfor[2];
911    }
912
913    // cadd4 step table (voxlap5.c:8958-8961). cadd4[1/2/4] are the
914    // three primary axis steps (x / z / y, post-swap); cadd4[3/5/6/7]
915    // are bit-OR sums (3 = 1+2, 5 = 1+4, 6 = 2+4, 7 = 3+4).
916    let gihz = settings.hz;
917    let cadd1 = [nstr[0] * gihz, nstr[1] * gihz, nstr[2], nstr[2]];
918    let cadd2 = [nhei[0] * gihz, nhei[1] * gihz, nhei[2], nhei[2]];
919    let cadd4_axis = [nfor[0] * gihz, nfor[1] * gihz, nfor[2], nfor[2]];
920    let cadd3 = vec4_add(cadd1, cadd2);
921    let cadd5 = vec4_add(cadd1, cadd4_axis);
922    let cadd6 = vec4_add(cadd2, cadd4_axis);
923    let cadd7 = vec4_add(cadd3, cadd4_axis);
924    let cadd4 = [
925        [0.0; 4], cadd1, cadd2, cadd3, cadd4_axis, cadd5, cadd6, cadd7,
926    ];
927
928    // ztab4 per-z step table (voxlap5.c:8973). ztab4[z] = z * cadd4[2]
929    // built incrementally by addps so per-step rounding matches.
930    let zsiz = kv.zsiz as usize;
931    let mut ztab4_per_z = Vec::with_capacity(zsiz);
932    if zsiz > 0 {
933        ztab4_per_z.push([0.0f32; 4]);
934        for i in 1..zsiz {
935            let prev = ztab4_per_z[i - 1];
936            ztab4_per_z.push(vec4_add(prev, cadd4[2]));
937        }
938    }
939
940    // r1 init (voxlap5.c:8961, 8976). Post-mat2 npos becomes the
941    // raw column-base; gihz-scale x/y; z lane keeps unscaled npos.z;
942    // z2 lane (lane 3) duplicates z. Then "ANNOYING HACK"
943    // pre-decrement by cadd4[4].
944    let r1_pre = [npos[0] * gihz, npos[1] * gihz, npos[2], npos[2]];
945    let r1_initial = vec4_sub(r1_pre, cadd4[4]);
946
947    // r2 = -ysiz * cadd4[4] (voxlap5.c:8974). intss + mulps in voxlap.
948    let r2 = vec4_scale(cadd4[4], -(ysiz_i as f32));
949
950    // qsum1 + qbplbpp from voxsetframebuffer (voxlap5.c:11119-11122).
951    // The framebuffer geometry is independent of the camera projection
952    // — these are derived from `(width, height, pitch_bytes)`.
953    let pitch_bytes = (fb_pitch_pixels as i32).saturating_mul(4);
954    let qsum1_x = 0x7fff_i32 - fb_width as i32;
955    let qsum1_y = 0x7fff_i32 - fb_height as i32;
956    let qsum1 = [
957        qsum1_x as i16,
958        qsum1_y as i16,
959        qsum1_x as i16,
960        qsum1_y as i16,
961    ];
962    let qbplbpp = [4i16, pitch_bytes as i16, 4, pitch_bytes as i16];
963
964    let (kv6colmul, kv6coladd) = update_reflects(sprite, lighting);
965
966    Kv6FullState {
967        iter,
968        cadd4,
969        ztab4_per_z,
970        r1_initial,
971        r2,
972        scisdist,
973        qsum0,
974        qsum1,
975        qbplbpp,
976        kv6colmul,
977        kv6coladd,
978    }
979}
980
/// Per-voxel rasterizer (R6.4 complete), SSE2 flavour.
///
/// Mirror of `voxlap5.c:8179-8320` (`drawboundcubesse`). For each
/// voxel:
/// 1. `effmask = mask & v.vis` early-out (zero, out of table range,
///    or a `PTFACES16` entry whose vertex count is 0).
/// 2. `origin = r0 + ztab4_per_z[v.z]`; scissor on `origin.z` against
///    `state.scisdist`.
/// 3. Look up `ptfaces16[effmask]` — `face[0]` = 4 or 6 vertex
///    count, `face[1..7]` = byte offsets into `caddasm` (the
///    `cadd4[8]` array, each entry 16 bytes).
/// 4. For each vertex pair (a, b), compute the projected screen
///    coords as `(cadd4[a] + origin).xy / (cadd4[a] + origin).z`
///    via `_mm_rcp_ps`.
/// 5. Pack the 4 (or 6) projected vertices to int16, min/max-reduce
///    to a single screen-AABB, viewport-clip via `qsum0` /
///    `qsum1`, and early-out on degenerate rect.
/// 6. Compute the per-voxel colour via the `mm5` cross-call tail +
///    `kv6colmul[v.dir]` + `kv6coladd` modulation.
/// 7. Fill the screen rectangle through `target.z_test_write`.
///
/// # Parameters
/// - `v`: the voxel to draw (`vis`, `z`, `col` and `dir` are read).
/// - `mask`: face-visibility mask, ANDed with `v.vis`.
/// - `state`: per-sprite tables (`cadd4`, `ztab4_per_z`, `scisdist`,
///   `qsum0`, `qsum1`, `qbplbpp`, `kv6colmul`, `kv6coladd`).
/// - `r0`: screen-space origin of the voxel's (x, y) column.
/// - `mm5_tail`: voxlap's static cross-call register tail
///   (voxlap5.c:8170-8177). It is read as the low half of each
///   interleaved colour word and overwritten with the colour computed
///   here; bit-equality with the asm requires preserving it across
///   calls within one sprite.
/// - `target`: framebuffer / z-buffer wrapper that receives the writes.
///
/// # Returns
/// The number of `z_test_write` calls that returned `true` (pixels
/// actually written). Tests use this as a sanity gate; production
/// callers ignore it.
///
/// Currently x86_64-only — relies on `_mm_rcp_ps` for bit-equality
/// with voxlap C. NEON / wasm ports will need their own goldens
/// (see `PORTING-RUST.md` R9 / R10).
#[cfg(target_arch = "x86_64")]
#[allow(clippy::trivially_copy_pass_by_ref)] // hot loop; matches voxlap's pointer-passed v.
pub(crate) fn drawboundcubesse(
    v: &Voxel,
    mask: u32,
    state: &Kv6FullState<'_>,
    r0: [f32; 4],
    mm5_tail: &mut u32,
    target: &mut DrawTarget<'_>,
) -> u32 {
    use core::arch::x86_64::{
        __m128, __m128i, _mm_add_epi16, _mm_add_ps, _mm_adds_epi16, _mm_cvtsi128_si32,
        _mm_cvtsi32_si128, _mm_cvttps_epi32, _mm_loadl_epi64, _mm_loadu_ps, _mm_madd_epi16,
        _mm_max_epi16, _mm_min_epi16, _mm_movehl_ps, _mm_movelh_ps, _mm_mul_ps, _mm_mulhi_epu16,
        _mm_packs_epi32, _mm_packus_epi16, _mm_rcp_ps, _mm_setzero_si128, _mm_shufflelo_epi16,
        _mm_storeu_ps, _mm_storeu_si128, _mm_subs_epu16, _mm_unpackhi_epi64, _mm_unpacklo_epi32,
        _mm_unpacklo_epi8,
    };

    // Nothing to draw when no face survives the mask, when the mask
    // falls outside the face table, or when the table entry is empty.
    let effmask = (mask & u32::from(v.vis)) as usize;
    if effmask == 0 || effmask >= PTFACES16.len() {
        return 0;
    }
    let face = PTFACES16[effmask];
    if face[0] == 0 {
        return 0;
    }

    // origin = r0 + ztab4_per_z[v.z] (4 f32 lanes, [x*hz, y*hz, z, z]).
    // A voxel z beyond the table is skipped rather than indexed.
    let z_idx = v.z as usize;
    if z_idx >= state.ztab4_per_z.len() {
        return 0;
    }
    let ztep = state.ztab4_per_z[z_idx];
    // SAFETY: every `_mm_loadu_ps` / `_mm_storeu_ps` below touches a
    // 4-f32 (16-byte) array, and the `_mm_loadl_epi64` loads on
    // `qsum0` / `qsum1` / `qbplbpp` read 8 bytes from 4-i16 arrays;
    // the intrinsics themselves are SSE2 baseline on x86_64. The
    // `z_test_write` calls at the bottom rely on the viewport clip
    // (see the comment at the fill loop).
    // NOTE(review): the 8-byte loads of `kv6colmul[..]` and
    // `kv6coladd` assume those values are at least 64 bits wide —
    // confirm against the `Kv6FullState` definition.
    unsafe {
        let r0_v = _mm_loadu_ps(r0.as_ptr());
        let ztep_v = _mm_loadu_ps(ztep.as_ptr());
        let origin_v: __m128 = _mm_add_ps(r0_v, ztep_v);
        let mut origin_arr = [0.0f32; 4];
        _mm_storeu_ps(origin_arr.as_mut_ptr(), origin_v);
        // Near-plane scissor on the voxel origin's z lane.
        if origin_arr[2] < state.scisdist {
            return 0;
        }

        // Project vertex pair (a, b). Returns __m128 with lanes:
        //   [b.x_proj, b.y_proj, a.x_proj, a.y_proj]
        // The byte offsets in face[k] index `caddasm` (= bytes into a
        // [point4d; 8] = [[f32; 4]; 8]); divide by 16 (= sizeof point4d)
        // to land back at the cadd4 index.
        let project = |off_a: u8, off_b: u8| -> __m128 {
            let a = state.cadd4[(off_a >> 4) as usize];
            let b = state.cadd4[(off_b >> 4) as usize];
            let wva = _mm_add_ps(_mm_loadu_ps(a.as_ptr()), origin_v);
            let wvb = _mm_add_ps(_mm_loadu_ps(b.as_ptr()), origin_v);
            let wv0 = _mm_movehl_ps(wva, wvb); // [b.z, b.z, a.z, a.z]
            let wv1 = _mm_movelh_ps(wvb, wva); // [b.x, b.y, a.x, a.y]
            // Approximate reciprocal, then multiply: xy * (1 / z).
            let wv0_inv = _mm_rcp_ps(wv0);
            _mm_mul_ps(wv0_inv, wv1)
        };

        let pair01 = project(face[1], face[2]);
        let pair23 = project(face[3], face[4]);

        // Convert to int32 (truncate-toward-zero), pack to int16
        // with signed saturation.
        // pack_lo i16 lanes 0..3 = pair01 = [v1x, v1y, v0x, v0y]
        // pack_lo i16 lanes 4..7 = pair23 = [v3x, v3y, v2x, v2y]
        // `pack23` moves the pair23 half down to lanes 0..3 so one
        // min / max per lane compares the two pairs; only lanes 0..3
        // are consumed from here on.
        let p01_i32 = _mm_cvttps_epi32(pair01);
        let p23_i32 = _mm_cvttps_epi32(pair23);
        let pack_lo = _mm_packs_epi32(p01_i32, p23_i32);
        let pack01 = pack_lo;
        let pack23 = _mm_unpackhi_epi64(pack_lo, _mm_setzero_si128());
        let mut mm_min = _mm_min_epi16(pack01, pack23);
        let mut mm_max = _mm_max_epi16(pack01, pack23);

        // Six-vertex silhouettes fold a third pair into the running
        // min / max.
        if face[0] != 4 {
            let pair45 = project(face[5], face[6]);
            let p45_i32 = _mm_cvttps_epi32(pair45);
            let pack45 = _mm_packs_epi32(p45_i32, _mm_setzero_si128());
            mm_min = _mm_min_epi16(mm_min, pack45);
            mm_max = _mm_max_epi16(mm_max, pack45);
        }

        // shufflelo(_, 0x0e) copies i16 lanes 2..3 into lanes 0..1,
        // so the following min / max leaves the reduction over all
        // 4 (or 6) vertices in lanes 0..1 = (x, y).
        let mm_min_hi = _mm_shufflelo_epi16(mm_min, 0x0e);
        let mm_max_hi = _mm_shufflelo_epi16(mm_max, 0x0e);
        let mm_min_red = _mm_min_epi16(mm_min, mm_min_hi);
        let mm_max_red = _mm_max_epi16(mm_max, mm_max_hi);

        // `_mm_unpacklo_epi32` interleaves 32-bit lanes: the low 32
        // bits of `mm_min_red` hold (min_x, min_y) and the low 32 bits
        // of `mm_max_red` hold (max_x, max_y), so the result's i16
        // lanes 0..3 are [min_x, min_y, max_x, max_y].
        let bounds = _mm_unpacklo_epi32(mm_min_red, mm_max_red);

        // Apply qsum0 (saturated add) + qsum1 (max-floor). Both are
        // 8-byte values loaded into the low 64 bits of __m128i.
        let qsum0_v = _mm_loadl_epi64(state.qsum0.as_ptr().cast::<__m128i>());
        let qsum1_v = _mm_loadl_epi64(state.qsum1.as_ptr().cast::<__m128i>());
        let bounds = _mm_adds_epi16(bounds, qsum0_v);
        let bounds = _mm_max_epi16(bounds, qsum1_v);

        // dxdy = subs_epu16(bounds_hi, bounds) — saturating unsigned
        // subtract, with bounds_hi being lanes [2,3,2,3] of bounds,
        // i.e. lane 0 = max_x - min_x and lane 1 = max_y - min_y.
        // `dx` is used as an exclusive width; `dy` is pre-decremented
        // and used as an inclusive row count, so both axes cover
        // `max - min` pixels and an empty extent on either axis
        // returns early.
        let bounds_hi = _mm_shufflelo_epi16(bounds, 0xee);
        let dxdy = _mm_subs_epu16(bounds_hi, bounds);
        let dxdy_low = _mm_cvtsi128_si32(dxdy) as u32;
        let dx = (dxdy_low & 0xffff) as i32;
        if dx == 0 {
            return 0;
        }
        let dy = ((dxdy_low >> 16) as i32) - 1;
        if dy < 0 {
            return 0;
        }

        // Recover pixel coords from bounds + qsum1. Bounds[0/1] are
        // currently in the saturated [0x7fff - res, 0x7fff] range;
        // pixel = bounds - qsum1.
        let mut bounds_arr = [0i16; 8];
        _mm_storeu_si128(bounds_arr.as_mut_ptr().cast::<__m128i>(), bounds);
        let pixel_min_x = i32::from(bounds_arr[0]) - i32::from(state.qsum1[0]);
        let pixel_min_y = i32::from(bounds_arr[1]) - i32::from(state.qsum1[1]);

        // The asm turns `bounds` into a pixel byte offset with
        // pmaddwd against qbplbpp. The product is formed here only to
        // mirror that step and is discarded; the fill loop indexes
        // directly via (pixel_min_x, pixel_min_y).
        let qbplbpp_v = _mm_loadl_epi64(state.qbplbpp.as_ptr().cast::<__m128i>());
        let _ = _mm_madd_epi16(bounds, qbplbpp_v);

        // Colour modulation with mm5 cross-call tail:
        //   unpacklo_epi8(tail, col) -> u16 words, col byte high,
        //                               tail byte low
        //   mulhi_epu16(.., kv6colmul[dir]) -> high 16 bits of product
        //   add_epi16(.., kv6coladd)        -> wrapping per-lane add
        //   packus_epi16                    -> saturate each lane to u8
        // The low 32 bits are the colour, and also the next call's tail.
        let tail_in = *mm5_tail;
        let mm5 = _mm_cvtsi32_si128(tail_in as i32);
        let col_v = _mm_cvtsi32_si128(v.col as i32);
        let mm5 = _mm_unpacklo_epi8(mm5, col_v);
        let kvm = state.kv6colmul[v.dir as usize];
        let kvm_v = _mm_loadl_epi64(std::ptr::addr_of!(kvm).cast::<__m128i>());
        let mm5 = _mm_mulhi_epu16(mm5, kvm_v);
        let kva_v = _mm_loadl_epi64(std::ptr::addr_of!(state.kv6coladd).cast::<__m128i>());
        let mm5 = _mm_add_epi16(mm5, kva_v);
        let mm5 = _mm_packus_epi16(mm5, mm5);
        let color = _mm_cvtsi128_si32(mm5) as u32;
        *mm5_tail = color;

        // Fill rectangle [pixel_min_x .. +dx) × [pixel_min_y .. +dy+1).
        // The qsum0/qsum1 clip + saturating sub keep the rect inside
        // the dimensions `state.qsum1` was built from, so no per-pixel
        // bounds check is done here beyond DrawTarget's debug_assert.
        // NOTE(review): this relies on `state` having been built for
        // the same framebuffer as `target` — confirm at the call site.
        let z_val = origin_arr[2];
        let pitch = target.pitch_pixels;
        let x0 = pixel_min_x as usize;
        let x_end = x0 + dx as usize;
        let mut written: u32 = 0;
        for row in 0..=(dy as usize) {
            let y = pixel_min_y as usize + row;
            let row_start = y * pitch;
            for x in x0..x_end {
                let idx = row_start + x;
                // SAFETY: idx < pitch * height by qsum0/qsum1 clip;
                // concurrent-write contract gated by z_test_write.
                // (Outer `unsafe` block in this fn covers the call.)
                if target.z_test_write(idx, color, z_val) {
                    written += 1;
                }
            }
        }
        written
    }
}
1186
/// R9: scalar per-voxel rasterizer for non-x86_64 targets (aarch64 /
/// wasm).
///
/// Follows the same steps as the SSE2 version — face-mask early-out,
/// `origin = r0 + ztab4_per_z[v.z]`, near-plane scissor, vertex
/// projection, screen AABB, viewport clip, colour modulation, z-tested
/// fill — but projects with IEEE 754 `1.0 / z` instead of
/// `_mm_rcp_ps`, so screen-space vertex positions (and therefore the
/// per-arch goldens) differ by ±1 pixel at edges. The colour
/// modulation reproduces the `_mm_mulhi_epu16` + `_mm_packus_epi16`
/// byte arithmetic exactly.
///
/// `mm5_tail` is read as the low byte of each interleaved colour word
/// and overwritten with the colour computed here. Returns the number
/// of `z_test_write` calls that reported a write.
#[cfg(not(target_arch = "x86_64"))]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub(crate) fn drawboundcubesse(
    v: &Voxel,
    mask: u32,
    state: &Kv6FullState<'_>,
    r0: [f32; 4],
    mm5_tail: &mut u32,
    target: &mut DrawTarget<'_>,
) -> u32 {
    // No surviving face, mask outside the table, or an empty table
    // entry: nothing to draw.
    let effmask = (mask & u32::from(v.vis)) as usize;
    let face = match PTFACES16.get(effmask) {
        Some(entry) if effmask != 0 => *entry,
        _ => return 0,
    };
    if face[0] == 0 {
        return 0;
    }

    // origin = r0 + ztab4_per_z[v.z]; a z beyond the table is skipped.
    let z_step = match state.ztab4_per_z.get(v.z as usize) {
        Some(step) => *step,
        None => return 0,
    };
    let origin = vec4_add(r0, z_step);
    if origin[2] < state.scisdist {
        return 0;
    }

    // The SSE2 path folds the screen-centre offset (hx, hy) into its
    // qsum0 / qsum1 viewport clip; `qsum0 - qsum1` recovers it so the
    // projection below can emit screen coordinates directly.
    let hx = (i32::from(state.qsum0[0]) - i32::from(state.qsum1[0])) as f32;
    let hy = (i32::from(state.qsum0[1]) - i32::from(state.qsum1[1])) as f32;

    // screen_xy = (cadd4[off / 16] + origin).xy / .z + (hx, hy)
    let project = |off: u8| -> (f32, f32) {
        let vertex = vec4_add(state.cadd4[(off >> 4) as usize], origin);
        let inv_z = 1.0 / vertex[2];
        (vertex[0] * inv_z + hx, vertex[1] * inv_z + hy)
    };

    // First four vertices: min / max chained in float, in vertex
    // order, then truncated to i32.
    let quad = [
        project(face[1]),
        project(face[2]),
        project(face[3]),
        project(face[4]),
    ];
    let (mut lo_x, mut lo_y) = quad[0];
    let (mut hi_x, mut hi_y) = quad[0];
    for &(px, py) in &quad[1..] {
        lo_x = lo_x.min(px);
        lo_y = lo_y.min(py);
        hi_x = hi_x.max(px);
        hi_y = hi_y.max(py);
    }
    let mut left = lo_x as i32;
    let mut top = lo_y as i32;
    let mut right = hi_x as i32;
    let mut bottom = hi_y as i32;

    // Six-vertex silhouettes: the extra pair is truncated per vertex
    // and merged with integer min / max.
    if face[0] != 4 {
        for off in [face[5], face[6]] {
            let (px, py) = project(off);
            let (ix, iy) = (px as i32, py as i32);
            left = left.min(ix);
            top = top.min(iy);
            right = right.max(ix);
            bottom = bottom.max(iy);
        }
    }

    // Viewport clip, in direct screen coords (the counterpart of the
    // SSE2 path's qsum0 saturating-add + qsum1 max sequence).
    let fb_w = target.width as i32;
    let fb_h = target.height as i32;
    left = left.max(0);
    top = top.max(0);
    right = right.min(fb_w - 1);
    bottom = bottom.min(fb_h - 1);
    if left > right || top > bottom {
        return 0;
    }

    // Colour modulation, lane by lane — same byte arithmetic as SSE2:
    //   word   = unpacklo_epi8(tail, col)   (col byte high, tail low)
    //   result = mulhi_epu16(word, kv6colmul[dir]) + kv6coladd
    //   byte   = packus_epi16(result)       (signed → saturated u8)
    let tail_bytes = mm5_tail.to_le_bytes();
    let col_bytes = v.col.to_le_bytes();
    let kvm = state.kv6colmul[v.dir as usize];
    let kva = state.kv6coladd;
    let mut color_bytes = [0u8; 4];
    for (lane, out) in color_bytes.iter_mut().enumerate() {
        let word = (u16::from(col_bytes[lane]) << 8) | u16::from(tail_bytes[lane]);
        let km = ((kvm >> (lane * 16)) & 0xffff) as u16;
        let ka = ((kva >> (lane * 16)) & 0xffff) as u16;
        let hi = ((u32::from(word) * u32::from(km)) >> 16) as u16;
        let summed = hi.wrapping_add(ka) as i16;
        *out = summed.clamp(0, 255) as u8;
    }
    let color = u32::from_le_bytes(color_bytes);
    *mm5_tail = color;

    // Fill the clipped rectangle (both edges inclusive) with z-test.
    let z_val = origin[2];
    let pitch = target.pitch_pixels;
    let mut written: u32 = 0;
    for y in top..=bottom {
        let row_start = y as usize * pitch;
        for x in left..=right {
            // SAFETY: viewport clip above guarantees idx < pitch * height.
            let hit = unsafe { target.z_test_write(row_start + x as usize, color, z_val) };
            if hit {
                written += 1;
            }
        }
    }
    written
}
1309
1310/// One iteration of voxlap's `DRAWBOUNDCUBELINE` macro
1311/// (voxlap5.c:8809-8812). Walks the voxel range `[range_start,
1312/// range_end)` (one (x, y) column's voxels) in three phases:
1313///
1314/// 1. Forward through voxels with `z < inz`, calling
1315///    `callback(voxel, base_mask | 0x20, r0)`.
1316/// 2. Backward through voxels with `z > inz`, calling
1317///    `callback(voxel, base_mask | 0x10, r0)`.
1318/// 3. If a single voxel remains with `z == inz`, call
1319///    `callback(voxel, base_mask | 0x00, r0)`.
1320///
1321/// Each (x, y) column is visited exactly once. `r0` is the screen-
1322/// space origin for *this* column — voxlap stores it as
1323/// `ztab4[MAXZSIZ]` and `drawboundcubesse` reads it via that index.
1324fn draw_boundcube_line<F: FnMut(&Voxel, u32, [f32; 4])>(
1325    voxels: &[Voxel],
1326    range_start: usize,
1327    range_end: usize,
1328    inz: i32,
1329    base_mask: u32,
1330    r0: [f32; 4],
1331    callback: &mut F,
1332) {
1333    if range_end <= range_start {
1334        return;
1335    }
1336    let mut v0 = range_start;
1337    let mut v1_excl = range_end;
1338
1339    // Phase 1: forward while voxels[v0].z < inz.
1340    while v0 < v1_excl && i32::from(voxels[v0].z) < inz {
1341        callback(&voxels[v0], base_mask | 0x20, r0);
1342        v0 += 1;
1343    }
1344    // Phase 2: backward while voxels[v1_excl - 1].z > inz.
1345    while v0 < v1_excl && i32::from(voxels[v1_excl - 1].z) > inz {
1346        callback(&voxels[v1_excl - 1], base_mask | 0x10, r0);
1347        v1_excl -= 1;
1348    }
1349    // Phase 3: single voxel left with z == inz.
1350    if v0 + 1 == v1_excl {
1351        callback(&voxels[v0], base_mask, r0);
1352    }
1353}
1354
1355/// 9-arm per-(x, y) column iteration walking the kv6's voxel
1356/// grid in painter's-back-to-front order around the camera-split
1357/// point (`inx`, `iny`, `inz`). Mirror of voxlap5.c:8982-9062.
1358///
1359/// Tracks `r1` (current x-column base) and `r0` (current (x, y)
1360/// origin) the same way voxlap mutates them with addps/subps,
1361/// passing `r0` to each per-voxel callback. `r0` evolves as
1362/// `r0[x][y] = r1_initial + x * cadd4[1] - y * cadd4[4]` (with
1363/// the floating-point operations applied in voxlap's order so the
1364/// per-step rounding matches bit-for-bit).
1365///
1366/// Each (x, y) column is visited exactly once.
1367#[allow(clippy::too_many_lines)]
1368pub(crate) fn kv6_iterate<F: FnMut(&Voxel, u32, [f32; 4])>(
1369    state: &Kv6FullState<'_>,
1370    mut callback: F,
1371) {
1372    let kv = state.iter.kv;
1373    let xsiz = kv.xsiz as i32;
1374    let ysiz = kv.ysiz as i32;
1375    let inx = state.iter.inx;
1376    let iny = state.iter.iny;
1377    let inz = state.iter.inz;
1378    let nxplanemin = state.iter.nxplanemin;
1379    let nxplanemax = state.iter.nxplanemax;
1380    let cadd1 = state.cadd4[1];
1381    let cadd_y = state.cadd4[4];
1382    let r2 = state.r2;
1383
1384    let mut xv: usize = 0;
1385    let mut r1 = state.r1_initial;
1386
1387    // First half: x = 0..inx. Top-half quadrants (masks 0xa, 0x6, 0x2).
1388    let mut x: i32 = 0;
1389    while x < inx {
1390        let xu = x as usize;
1391        let xlen = kv.xlen[xu] as usize;
1392        if x < nxplanemin || x >= nxplanemax {
1393            xv += xlen;
1394            r1 = vec4_add(r1, cadd1);
1395            x += 1;
1396            continue;
1397        }
1398        let yv_initial = xv + xlen;
1399        let mut r0 = r1; // movps r0, r1
1400
1401        // Forward y: 0..iny  -> mask 0xa.
1402        let mut xv_local = xv;
1403        let mut y: i32 = 0;
1404        while y < iny {
1405            let yu = y as usize;
1406            let len = kv.ylen[xu][yu] as usize;
1407            let v0 = xv_local;
1408            xv_local += len;
1409            draw_boundcube_line(&kv.voxels, v0, xv_local, inz, 0xa, r0, &mut callback);
1410            r0 = vec4_sub(r0, cadd_y); // r0 -= cadd4[4]
1411            y += 1;
1412        }
1413
1414        // Setup for reverse y: r0 = r1 + r2 (= base + (-ysiz)*cadd4[4]),
1415        // then r1 += cadd4[1] for the next x column.
1416        let mut yv_local = yv_initial;
1417        r0 = vec4_add(r1, r2);
1418        r1 = vec4_add(r1, cadd1);
1419
1420        // Reverse y: ysiz-1..iny  -> mask 0x6.
1421        let mut y = ysiz - 1;
1422        while y > iny {
1423            r0 = vec4_add(r0, cadd_y); // r0 += cadd4[4]
1424            let yu = y as usize;
1425            let len = kv.ylen[xu][yu] as usize;
1426            let v1_excl = yv_local;
1427            yv_local -= len;
1428            draw_boundcube_line(&kv.voxels, yv_local, v1_excl, inz, 0x6, r0, &mut callback);
1429            y -= 1;
1430        }
1431
1432        // Edge y == iny  -> mask 0x2.
1433        if iny >= 0 && (iny as u32) < kv.ysiz {
1434            r0 = vec4_add(r0, cadd_y);
1435            let yu = iny as usize;
1436            let len = kv.ylen[xu][yu] as usize;
1437            let v1_excl = yv_local;
1438            yv_local -= len;
1439            draw_boundcube_line(&kv.voxels, yv_local, v1_excl, inz, 0x2, r0, &mut callback);
1440        }
1441
1442        xv += xlen;
1443        x += 1;
1444    }
1445
1446    // Setup for second half (voxlap5.c:9011): jump r1 to past-end.
1447    // r1 += (xsiz - x) * cadd4[1]  with x = post-first-half value.
1448    let dx_remain = (xsiz - x) as f32;
1449    r1 = vec4_add(r1, vec4_scale(cadd1, dx_remain));
1450
1451    // Second half: x = xsiz-1..inx (reverse). Bot-half quadrants
1452    // (masks 0x5, 0x9, 0x1).
1453    let mut xv2: usize = kv.voxels.len();
1454    let mut x = xsiz - 1;
1455    while x > inx {
1456        let xu = x as usize;
1457        let xlen = kv.xlen[xu] as usize;
1458        if x < nxplanemin || x >= nxplanemax {
1459            xv2 -= xlen;
1460            r1 = vec4_sub(r1, cadd1);
1461            x -= 1;
1462            continue;
1463        }
1464        let yv_initial = xv2 - xlen;
1465        // Voxlap order: r1 -= cadd1 first, then r0 = r1 + r2.
1466        r1 = vec4_sub(r1, cadd1);
1467        let mut r0 = vec4_add(r1, r2);
1468
1469        // Reverse y: ysiz-1..iny  -> mask 0x5.
1470        let mut xv_local = xv2;
1471        let mut y = ysiz - 1;
1472        while y > iny {
1473            r0 = vec4_add(r0, cadd_y);
1474            let yu = y as usize;
1475            let len = kv.ylen[xu][yu] as usize;
1476            let v1_excl = xv_local;
1477            xv_local -= len;
1478            draw_boundcube_line(&kv.voxels, xv_local, v1_excl, inz, 0x5, r0, &mut callback);
1479            y -= 1;
1480        }
1481
1482        // After reverse y: r0 = r1 (movps r0, r1).
1483        let mut yv_local = yv_initial;
1484        r0 = r1;
1485
1486        // Forward y: 0..iny  -> mask 0x9.
1487        let mut y: i32 = 0;
1488        while y < iny {
1489            let yu = y as usize;
1490            let len = kv.ylen[xu][yu] as usize;
1491            let v0 = yv_local;
1492            yv_local += len;
1493            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x9, r0, &mut callback);
1494            r0 = vec4_sub(r0, cadd_y);
1495            y += 1;
1496        }
1497
1498        // Edge y == iny  -> mask 0x1.
1499        if iny >= 0 && (iny as u32) < kv.ysiz {
1500            let yu = iny as usize;
1501            let len = kv.ylen[xu][yu] as usize;
1502            let v0 = yv_local;
1503            yv_local += len;
1504            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x1, r0, &mut callback);
1505        }
1506
1507        xv2 -= xlen;
1508        x -= 1;
1509    }
1510
1511    // Edge x == inx (middle column). Masks 0x4, 0x8, 0x0.
1512    if inx >= 0 && (inx as u32) < kv.xsiz {
1513        let xu = inx as usize;
1514        if inx < nxplanemin || inx >= nxplanemax {
1515            return;
1516        }
1517        let xlen = kv.xlen[xu] as usize;
1518        let yv_initial = xv2 - xlen;
1519        r1 = vec4_sub(r1, cadd1);
1520        let mut r0 = vec4_add(r1, r2);
1521
1522        // Reverse y -> mask 0x4.
1523        let mut xv_local = xv2;
1524        let mut y = ysiz - 1;
1525        while y > iny {
1526            r0 = vec4_add(r0, cadd_y);
1527            let yu = y as usize;
1528            let len = kv.ylen[xu][yu] as usize;
1529            let v1_excl = xv_local;
1530            xv_local -= len;
1531            draw_boundcube_line(&kv.voxels, xv_local, v1_excl, inz, 0x4, r0, &mut callback);
1532            y -= 1;
1533        }
1534
1535        // After reverse y: r0 = r1.
1536        let mut yv_local = yv_initial;
1537        r0 = r1;
1538
1539        // Forward y -> mask 0x8.
1540        let mut y: i32 = 0;
1541        while y < iny {
1542            let yu = y as usize;
1543            let len = kv.ylen[xu][yu] as usize;
1544            let v0 = yv_local;
1545            yv_local += len;
1546            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x8, r0, &mut callback);
1547            r0 = vec4_sub(r0, cadd_y);
1548            y += 1;
1549        }
1550
1551        // Edge y == iny -> mask 0x0.
1552        if iny >= 0 && (iny as u32) < kv.ysiz {
1553            let yu = iny as usize;
1554            let len = kv.ylen[xu][yu] as usize;
1555            let v0 = yv_local;
1556            yv_local += len;
1557            draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x0, r0, &mut callback);
1558        }
1559    }
1560}
1561
1562/// Draw a sprite into a framebuffer + z-buffer.
1563///
1564/// Top-level dispatcher mirroring voxlap5.c:9818-9828:
1565/// - Skips on `flags & INVISIBLE`.
1566/// - Skips on `flags & KFA` (animation path; out of scope for R6).
1567/// - Skips on `flags & NO_Z` (handled by `drawboundcubenozsse`,
1568///   not yet ported — the four oracle sprite poses all use z-tested
1569///   rendering).
1570///
1571/// Otherwise: cull → setup math → 9-arm per-voxel iteration →
1572/// per-voxel rasterize via the R6.4 `drawboundcubesse` port.
1573///
1574/// Returns the total number of pixels written across all voxels of
1575/// the sprite (== sum of z-test passes). Zero means the sprite
1576/// produced no visible pixels (culled, fully behind near plane, or
1577/// totally occluded).
1578/// Render a batch of sprites in parallel via `rayon::par_iter`.
1579///
1580/// Each sprite runs its own [`draw_sprite`] pass on its own thread,
1581/// writing to the shared [`DrawTarget`] (raw pointers;
1582/// `Copy + Send + Sync`) under the z-test arbitration contract: a
1583/// pixel write only fires when the new sprite's z is strictly less
1584/// than the current zbuffer value. For non-overlapping sprites the
1585/// writes are pairwise-disjoint and the output is byte-identical
1586/// to a sequential pass over the same sprite list. For overlapping
1587/// pixels, two sprites at exactly tied z-values produce a
1588/// non-deterministic last-writer-wins outcome — visually
1589/// indistinguishable but hash-non-deterministic.
1590///
1591/// Returns the sum of `draw_sprite` return values (total pixels
1592/// written across all sprites).
1593///
1594/// `RAYON_NUM_THREADS=1` (or no parallelism worth) ⇒ effectively
1595/// sequential; rayon falls back to running each closure on the
1596/// calling thread without contention.
1597///
1598/// Use this for engine scenes with dozens-to-hundreds of sprites;
1599/// the per-sprite overhead amortises well past ~4 sprites on
1600/// consumer-class hardware.
1601#[allow(clippy::module_name_repetitions)]
1602#[must_use]
1603pub fn draw_sprites_parallel(
1604    target: DrawTarget<'_>,
1605    cam: &CameraState,
1606    settings: &OpticastSettings,
1607    lighting: &SpriteLighting<'_>,
1608    sprites: &[Sprite],
1609) -> u32 {
1610    use rayon::prelude::*;
1611
1612    let render_one = |sprite: &Sprite| {
1613        // `target` is `Copy`, so each closure captures its own
1614        // copy of the (raw fb / zb pointer) view. `cam`,
1615        // `settings`, `lighting` are `&` borrows — Sync.
1616        let mut t = target;
1617        draw_sprite(&mut t, cam, settings, lighting, sprite)
1618    };
1619
1620    sprites.par_iter().map(render_one).sum()
1621}
1622
1623pub fn draw_sprite(
1624    target: &mut DrawTarget<'_>,
1625    cam: &CameraState,
1626    settings: &OpticastSettings,
1627    lighting: &SpriteLighting<'_>,
1628    sprite: &Sprite,
1629) -> u32 {
1630    if sprite.flags & SPRITE_FLAG_INVISIBLE != 0 {
1631        return 0;
1632    }
1633    if sprite.flags & SPRITE_FLAG_KFA != 0 {
1634        return 0;
1635    }
1636    if sprite.flags & SPRITE_FLAG_NO_Z != 0 {
1637        // drawboundcubenozsse port deferred; oracle doesn't exercise it.
1638        return 0;
1639    }
1640    let Some(setup) = kv6_draw_prepare(sprite, cam) else {
1641        return 0;
1642    };
1643    let state = kv6_compute_full_state(
1644        &setup,
1645        sprite,
1646        lighting,
1647        cam,
1648        settings,
1649        target.width,
1650        target.height,
1651        target.pitch_pixels,
1652    );
1653    let mut mm5_tail: u32 = 0;
1654    let mut total_written: u32 = 0;
1655    kv6_iterate(&state, |voxel, mask, r0| {
1656        total_written += drawboundcubesse(voxel, mask, &state, r0, &mut mm5_tail, target);
1657    });
1658    total_written
1659}
1660
1661#[cfg(test)]
1662mod tests {
1663    use super::*;
1664    use crate::camera_math;
1665    use crate::Camera;
1666    use roxlap_formats::kv6::Kv6;
1667
1668    fn empty_kv6() -> Kv6 {
1669        Kv6 {
1670            xsiz: 1,
1671            ysiz: 1,
1672            zsiz: 1,
1673            xpiv: 0.5,
1674            ypiv: 0.5,
1675            zpiv: 0.5,
1676            voxels: Vec::new(),
1677            xlen: vec![0],
1678            ylen: vec![vec![0]],
1679            palette: None,
1680        }
1681    }
1682
1683    /// 17×17×17 kv6 with pivot at the centre — same dimensions as
1684    /// the meltsphere oracle sprite so the cull test exercises a
1685    /// realistic bound cube rather than a 1-voxel point.
1686    fn cube_kv6() -> Kv6 {
1687        Kv6 {
1688            xsiz: 17,
1689            ysiz: 17,
1690            zsiz: 17,
1691            xpiv: 8.5,
1692            ypiv: 8.5,
1693            zpiv: 8.5,
1694            voxels: Vec::new(),
1695            xlen: vec![0; 17],
1696            ylen: vec![vec![0; 17]; 17],
1697            palette: None,
1698        }
1699    }
1700
1701    /// `CameraState` matching the oracle's `sprite_front` pose:
1702    /// pos=(1020,1050,175), yaw=0, pitch=0 → forward = +x.
1703    fn oracle_sprite_front_camera() -> camera_math::CameraState {
1704        let camera = Camera {
1705            pos: [1020.0, 1050.0, 175.0],
1706            // From oracle.c set_camera_yaw_pitch with yaw=0, pitch=0:
1707            //   ifor = [1, 0, 0], istr = [0, 1, 0], ihei = [0, 0, 1].
1708            right: [0.0, 1.0, 0.0],
1709            down: [0.0, 0.0, 1.0],
1710            forward: [1.0, 0.0, 0.0],
1711        };
1712        camera_math::derive(&camera, 640, 480, 320.0, 240.0, 320.0)
1713    }
1714
1715    fn oracle_settings() -> OpticastSettings {
1716        OpticastSettings::for_oracle_framebuffer(640, 480)
1717    }
1718
1719    /// Test-only ergonomic shim: build a Kv6FullState with the
1720    /// oracle 640×480 framebuffer geometry. Mirrors the
1721    /// pre-R6.4 signature so tests don't have to spell out
1722    /// width/height/pitch every time.
1723    fn compute_state_for_test<'a>(
1724        setup: &Kv6DrawSetup<'a>,
1725        sprite: &Sprite,
1726        cam: &camera_math::CameraState,
1727    ) -> Kv6FullState<'a> {
1728        let lighting = SpriteLighting::default_oracle();
1729        kv6_compute_full_state(
1730            setup,
1731            sprite,
1732            &lighting,
1733            cam,
1734            &oracle_settings(),
1735            640,
1736            480,
1737            640,
1738        )
1739    }
1740
1741    /// Allocate a 640×480 framebuffer + zbuffer (zbuffer pre-filled
1742    /// with f32::INFINITY so any voxel passes the z-test on first
1743    /// write).
1744    fn alloc_target() -> (Vec<u32>, Vec<f32>) {
1745        let pixels = 640usize * 480usize;
1746        (vec![0u32; pixels], vec![f32::INFINITY; pixels])
1747    }
1748
1749    fn make_target<'a>(fb: &'a mut [u32], zb: &'a mut [f32]) -> DrawTarget<'a> {
1750        DrawTarget::new(fb, zb, 640, 640, 480)
1751    }
1752
1753    /// Bit-pattern compare for two `[f32; 4]` vectors. The setup
1754    /// math produces these via deterministic IEEE-754 ops, so
1755    /// bit-equality is well-defined and dodges `clippy::float_cmp`.
1756    fn bits4(a: [f32; 4]) -> [u32; 4] {
1757        a.map(f32::to_bits)
1758    }
1759
1760    /// Bytes of the dumped C-oracle meltsphere sprite — used by all
1761    /// the kv6-load tests below. Module-scope `const` keeps clippy's
1762    /// `items_after_statements` happy.
1763    const SPRITE_MELTSPHERE_KV6: &[u8] = include_bytes!("../tests/fixtures/sprite_meltsphere.kv6");
1764
1765    #[test]
1766    fn axis_aligned_sets_identity_basis() {
1767        // Compare bit patterns: these are integer-valued floats so
1768        // bit-equality is well-defined and dodges clippy::float_cmp.
1769        let bits = |a: [f32; 3]| a.map(f32::to_bits);
1770        let s = Sprite::axis_aligned(empty_kv6(), [10.0, 20.0, 30.0]);
1771        assert_eq!(bits(s.p), bits([10.0, 20.0, 30.0]));
1772        assert_eq!(bits(s.s), bits([1.0, 0.0, 0.0]));
1773        assert_eq!(bits(s.h), bits([0.0, 1.0, 0.0]));
1774        assert_eq!(bits(s.f), bits([0.0, 0.0, 1.0]));
1775        assert_eq!(s.flags, 0);
1776    }
1777
1778    #[test]
1779    fn invisible_flag_skips_dispatch() {
1780        let cam = oracle_sprite_front_camera();
1781        let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1782        s.flags = SPRITE_FLAG_INVISIBLE;
1783        let (mut fb, mut zb) = alloc_target();
1784        let mut target = make_target(&mut fb, &mut zb);
1785        let lighting = SpriteLighting::default_oracle();
1786        assert_eq!(
1787            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
1788            0
1789        );
1790    }
1791
1792    #[test]
1793    fn kfa_flag_skips_dispatch() {
1794        let cam = oracle_sprite_front_camera();
1795        let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1796        s.flags = SPRITE_FLAG_KFA;
1797        let (mut fb, mut zb) = alloc_target();
1798        let mut target = make_target(&mut fb, &mut zb);
1799        let lighting = SpriteLighting::default_oracle();
1800        assert_eq!(
1801            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
1802            0
1803        );
1804    }
1805
1806    #[test]
1807    fn cull_keeps_oracle_sprite_in_front_of_camera() {
1808        // Oracle's `sprite_front` pose: camera at (1020,1050,175)
1809        // looking +x; sprite at (1050,1050,175). Sprite is 30
1810        // units forward, on-axis — clearly inside the frustum.
1811        let cam = oracle_sprite_front_camera();
1812        let s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1813        assert!(
1814            kv6_draw_prepare(&s, &cam).is_some(),
1815            "front-of-camera sprite must NOT be culled"
1816        );
1817    }
1818
1819    #[test]
1820    fn cull_removes_sprite_far_behind_camera() {
1821        // Same camera; sprite far in the -forward direction
1822        // (= behind the camera).
1823        let cam = oracle_sprite_front_camera();
1824        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
1825        assert!(
1826            kv6_draw_prepare(&s, &cam).is_none(),
1827            "behind-camera sprite must be culled"
1828        );
1829    }
1830
1831    #[test]
1832    fn cull_removes_sprite_far_to_the_right() {
1833        // Camera looks +x; sprite far in the +y direction (right
1834        // axis), far enough that the bound cube is fully outside
1835        // the right-edge frustum plane.
1836        let cam = oracle_sprite_front_camera();
1837        // 30 units forward, 200 units right — well outside the 90°
1838        // FOV's right edge.
1839        let s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0 + 200.0, 175.0]);
1840        assert!(
1841            kv6_draw_prepare(&s, &cam).is_none(),
1842            "far-right sprite must be culled"
1843        );
1844    }
1845
1846    #[test]
1847    fn cull_keeps_sprite_at_camera_position() {
1848        // Sprite centred on the camera — bound cube straddles the
1849        // camera, so by definition it's not fully outside any
1850        // frustum plane and must NOT be culled.
1851        let cam = oracle_sprite_front_camera();
1852        let s = Sprite::axis_aligned(cube_kv6(), cam.pos);
1853        assert!(
1854            kv6_draw_prepare(&s, &cam).is_some(),
1855            "sprite at camera position must not be culled"
1856        );
1857    }
1858
1859    #[test]
1860    fn iterate_visits_each_voxel_exactly_once() {
1861        // Build a synthetic 3×3×3 kv6 with one voxel per (x, y)
1862        // column at z = x + y mod 3. Then iterate and check
1863        // (a) total callback fires == 27 = numvoxs, and (b) every
1864        // voxel index 0..27 was visited exactly once.
1865        let xsiz: u32 = 3;
1866        let ysiz: u32 = 3;
1867        let zsiz: u32 = 3;
1868        let mut voxels = Vec::new();
1869        let mut xlen = vec![0u32; xsiz as usize];
1870        let mut ylen = vec![vec![0u16; ysiz as usize]; xsiz as usize];
1871        for x in 0..xsiz {
1872            for y in 0..ysiz {
1873                let z = ((x + y) % 3) as u16;
1874                voxels.push(Voxel {
1875                    col: 0x0080_0000,
1876                    z,
1877                    vis: 63,
1878                    dir: 0,
1879                });
1880                xlen[x as usize] += 1;
1881                ylen[x as usize][y as usize] = 1;
1882            }
1883        }
1884        let kv = Kv6 {
1885            xsiz,
1886            ysiz,
1887            zsiz,
1888            xpiv: 1.5,
1889            ypiv: 1.5,
1890            zpiv: 1.5,
1891            voxels,
1892            xlen,
1893            ylen,
1894            palette: None,
1895        };
1896        let setup = Kv6DrawSetup {
1897            kv: &kv,
1898            ts: [1.0, 0.0, 0.0],
1899            th: [0.0, 1.0, 0.0],
1900            tf: [0.0, 0.0, 1.0],
1901            mip: 0,
1902        };
1903        let cam = oracle_sprite_front_camera();
1904        let synth_sprite = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
1905        let state = compute_state_for_test(&setup, &synth_sprite, &cam);
1906
1907        // Every voxel index must fire exactly once. We use a
1908        // by-pointer identity check via .as_ptr() offsets.
1909        let voxels_ptr = kv.voxels.as_ptr();
1910        let mut visited = vec![0u32; kv.voxels.len()];
1911        let mut total: u32 = 0;
1912        kv6_iterate(&state, |v, _mask, _r0| {
1913            // SAFETY: callback receives a borrow of an entry of
1914            // `kv.voxels`; computing the offset is well-defined.
1915            let idx = unsafe { std::ptr::from_ref::<Voxel>(v).offset_from(voxels_ptr) } as usize;
1916            visited[idx] += 1;
1917            total += 1;
1918        });
1919        assert_eq!(total as usize, kv.voxels.len(), "total callback fires");
1920        for (i, &n) in visited.iter().enumerate() {
1921            assert_eq!(n, 1, "voxel {i} visited {n} times (want 1)");
1922        }
1923    }
1924
1925    #[test]
1926    fn iterate_meltsphere_oracle_visits_each_voxel_once() {
1927        // Load the dumped voxlap-C meltsphere fixture (R6.0e) and
1928        // run the iteration against the oracle's sprite_front
1929        // camera + sprite pose. Expected: every voxel hit exactly
1930        // once, total fires == kv.voxels.len() (= 401).
1931        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
1932        assert_eq!(kv.voxels.len(), 401, "fixture voxel count");
1933
1934        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
1935        let cam = oracle_sprite_front_camera();
1936        let setup = kv6_draw_prepare(&sprite, &cam).expect("oracle sprite must pass cull");
1937        let state = compute_state_for_test(&setup, &sprite, &cam);
1938
1939        let voxels_ptr = sprite.kv6.voxels.as_ptr();
1940        let mut visited = vec![0u32; sprite.kv6.voxels.len()];
1941        let mut total: u32 = 0;
1942        kv6_iterate(&state, |v, _mask, _r0| {
1943            let idx = unsafe { std::ptr::from_ref::<Voxel>(v).offset_from(voxels_ptr) } as usize;
1944            visited[idx] += 1;
1945            total += 1;
1946        });
1947        assert_eq!(total, 401);
1948        let max = visited.iter().copied().max().unwrap();
1949        let min = visited.iter().copied().min().unwrap();
1950        assert_eq!(max, 1, "no voxel may be visited twice");
1951        assert_eq!(min, 1, "no voxel may be skipped");
1952    }
1953
1954    #[test]
1955    fn full_state_basic_invariants() {
1956        // For the oracle sprite_front pose, sanity-check the setup
1957        // values: ztab4_per_z[0] is zero, ztab4_per_z[k] - ztab4_per_z[k-1]
1958        // equals cadd4[2], cadd4[3] = cadd4[1] + cadd4[2], cadd4[7] is
1959        // the 7-bit-OR sum, and r1_initial = (npos*gihz with z2=npos.z)
1960        // - cadd4[4].
1961        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
1962        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
1963        let cam = oracle_sprite_front_camera();
1964        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
1965        let state = compute_state_for_test(&setup, &sprite, &cam);
1966
1967        // ztab4_per_z[0] = [0; 4].
1968        assert_eq!(bits4(state.ztab4_per_z[0]), bits4([0.0; 4]));
1969
1970        // For each subsequent z, ztab4_per_z[z] = ztab4_per_z[z-1] + cadd4[2].
1971        for z in 1..state.ztab4_per_z.len() {
1972            let want = vec4_add(state.ztab4_per_z[z - 1], state.cadd4[2]);
1973            assert_eq!(bits4(state.ztab4_per_z[z]), bits4(want), "ztab4_per_z[{z}]");
1974        }
1975
1976        // cadd4[3] = cadd4[1] + cadd4[2]; cadd4[5] = cadd4[1] + cadd4[4];
1977        // cadd4[6] = cadd4[2] + cadd4[4]; cadd4[7] = cadd4[3] + cadd4[4].
1978        assert_eq!(
1979            bits4(state.cadd4[3]),
1980            bits4(vec4_add(state.cadd4[1], state.cadd4[2]))
1981        );
1982        assert_eq!(
1983            bits4(state.cadd4[5]),
1984            bits4(vec4_add(state.cadd4[1], state.cadd4[4]))
1985        );
1986        assert_eq!(
1987            bits4(state.cadd4[6]),
1988            bits4(vec4_add(state.cadd4[2], state.cadd4[4]))
1989        );
1990        assert_eq!(
1991            bits4(state.cadd4[7]),
1992            bits4(vec4_add(state.cadd4[3], state.cadd4[4]))
1993        );
1994        assert_eq!(bits4(state.cadd4[0]), bits4([0.0; 4]));
1995
1996        // r2 = -ysiz * cadd4[4].
1997        let want_r2 = vec4_scale(state.cadd4[4], -(state.iter.kv.ysiz as f32));
1998        assert_eq!(bits4(state.r2), bits4(want_r2));
1999    }
2000
2001    #[test]
2002    fn drawboundcubesse_culls_invisible_face_mask() {
2003        // Synthetic voxel with vis=0 must short-circuit the
2004        // early-out and not consume the scissor branch.
2005        let v = Voxel {
2006            col: 0,
2007            z: 0,
2008            vis: 0,
2009            dir: 0,
2010        };
2011        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
2012        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
2013        let cam = oracle_sprite_front_camera();
2014        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
2015        let state = compute_state_for_test(&setup, &sprite, &cam);
2016        let (mut fb, mut zb) = alloc_target();
2017        let mut target = make_target(&mut fb, &mut zb);
2018        let mut tail = 0u32;
2019        assert_eq!(
2020            drawboundcubesse(
2021                &v,
2022                0xff,
2023                &state,
2024                [0.0, 0.0, 100.0, 100.0],
2025                &mut tail,
2026                &mut target,
2027            ),
2028            0
2029        );
2030    }
2031
2032    #[test]
2033    fn drawboundcubesse_culls_voxel_behind_near_plane() {
2034        // Force scisdist > 0 by passing an r0 with very small
2035        // origin.z. Only triggers if scisdist > origin.z; for the
2036        // oracle sprite_front pose `scisdist` is some small
2037        // positive number (sum of any negative post-swap basis-z
2038        // components), so a r0 with z = -1 will cull.
2039        let v = Voxel {
2040            col: 0xff,
2041            z: 0,
2042            vis: 0xff,
2043            dir: 0,
2044        };
2045        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
2046        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
2047        let cam = oracle_sprite_front_camera();
2048        let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
2049        let state = compute_state_for_test(&setup, &sprite, &cam);
2050        // r0.z = -1000 makes origin.z = -1000 + ztab4_per_z[0].z = -1000.
2051        // scisdist >= 0; -1000 < scisdist → cull.
2052        let r0 = [0.0, 0.0, -1000.0, -1000.0];
2053        let (mut fb, mut zb) = alloc_target();
2054        let mut target = make_target(&mut fb, &mut zb);
2055        let mut tail = 0u32;
2056        assert_eq!(
2057            drawboundcubesse(&v, 0xff, &state, r0, &mut tail, &mut target),
2058            0
2059        );
2060    }
2061
2062    #[test]
2063    fn iterate_no_voxels_when_culled() {
2064        // Sprite far behind camera → cull. draw_sprite never
2065        // reaches kv6_iterate, so no callback fires.
2066        let cam = oracle_sprite_front_camera();
2067        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
2068        // Cull catches it before iteration.
2069        assert!(kv6_draw_prepare(&s, &cam).is_none());
2070    }
2071
2072    #[test]
2073    fn draw_sprite_writes_pixels_for_oracle_meltsphere() {
2074        // R6.4 end-to-end: load the meltsphere fixture, run
2075        // draw_sprite at the sprite_front pose. Expect a non-zero
2076        // pixel count and at least one non-zero framebuffer entry.
2077        let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
2078        let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
2079        let cam = oracle_sprite_front_camera();
2080        let (mut fb, mut zb) = alloc_target();
2081        let mut target = make_target(&mut fb, &mut zb);
2082        let lighting = SpriteLighting::default_oracle();
2083        let written = draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &sprite);
2084        assert!(written > 0, "expected some pixels to be written");
2085        assert!(
2086            fb.iter().any(|&p| p != 0),
2087            "expected at least one non-zero framebuffer entry"
2088        );
2089        // Z-buffer must have shrunk somewhere from f32::INFINITY.
2090        assert!(
2091            zb.iter().any(|&z| z.is_finite()),
2092            "expected at least one finite zbuffer entry"
2093        );
2094    }
2095
2096    #[test]
2097    fn draw_sprite_returns_zero_for_culled_sprite() {
2098        let cam = oracle_sprite_front_camera();
2099        let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
2100        let (mut fb, mut zb) = alloc_target();
2101        let mut target = make_target(&mut fb, &mut zb);
2102        let lighting = SpriteLighting::default_oracle();
2103        assert_eq!(
2104            draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
2105            0
2106        );
2107        assert!(fb.iter().all(|&p| p == 0));
2108    }
2109
2110    /// `update_reflects` for the oracle sprite_front pose hits the
2111    /// nolighta path (R==G==B kv6col, no fog, lightmode<2). All
2112    /// kv6colmul[k] entries must repeat one u16 modulation factor
2113    /// across all 4 lanes.
2114    #[test]
2115    fn update_reflects_nolighta_lanes_match() {
2116        let s = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
2117        let lighting = SpriteLighting::default_oracle();
2118        let (cm, ca) = update_reflects(&s, &lighting);
2119        assert_eq!(ca, 0, "kv6coladd must be zero (no fog)");
2120        for (k, e) in cm.iter().enumerate() {
2121            let l0 = (e & 0xffff) as u16;
2122            let l1 = ((e >> 16) & 0xffff) as u16;
2123            let l2 = ((e >> 32) & 0xffff) as u16;
2124            let l3 = ((e >> 48) & 0xffff) as u16;
2125            assert_eq!(l0, l1, "kv6colmul[{k}] lane0 != lane1");
2126            assert_eq!(l0, l2, "kv6colmul[{k}] lane0 != lane2");
2127            assert_eq!(l0, l3, "kv6colmul[{k}] lane0 != lane3");
2128        }
2129    }
2130
2131    /// Non-grey kv6col forces the nolightb path. Lanes 0..3 of each
2132    /// `kv6colmul[k]` come from per-channel modulators built from
2133    /// the kv6col bytes — they should NOT all match unless the
2134    /// channels themselves match.
2135    #[test]
2136    fn update_reflects_nolightb_lanes_diverge_for_tinted_kv6col() {
2137        let s = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
2138        let lighting = SpriteLighting {
2139            kv6col: 0x0040_8040, // R != G != B
2140            lightmode: 0,
2141            lights: &[],
2142        };
2143        let (cm, _) = update_reflects(&s, &lighting);
2144        // Find any direction where the dot is non-zero (most are
2145        // non-zero); that direction's lanes must vary by channel.
2146        let mut saw_divergence = false;
2147        for e in cm.iter() {
2148            let l0 = (e & 0xffff) as u16;
2149            let l1 = ((e >> 16) & 0xffff) as u16;
2150            let l2 = ((e >> 32) & 0xffff) as u16;
2151            if l0 != l1 || l0 != l2 {
2152                saw_divergence = true;
2153                break;
2154            }
2155        }
2156        assert!(
2157            saw_divergence,
2158            "non-grey kv6col must produce per-channel divergence in some kv6colmul slot"
2159        );
2160    }
2161
2162    /// Lightmode-2 with one point light + grey kv6col still
2163    /// produces R==G==B lanes (because the per-channel modulators
2164    /// are all 0x80<<8 = 0x8000). It must produce a non-uniform
2165    /// kv6colmul (some directions face the light, others away),
2166    /// which differs from lightmode<2 where every direction has the
2167    /// same dot magnitude regardless of position.
2168    #[test]
2169    fn update_reflects_lightmode2_produces_directional_shading() {
2170        let s = Sprite::axis_aligned(empty_kv6(), [100.0, 100.0, 100.0]);
2171        let lights = [LightSrc {
2172            pos: [110.0, 100.0, 100.0],
2173            r2: 100.0,
2174            sc: 16.0,
2175        }];
2176        let lighting = SpriteLighting {
2177            kv6col: DEFAULT_KV6COL,
2178            lightmode: 2,
2179            lights: &lights,
2180        };
2181        let (cm, _) = update_reflects(&s, &lighting);
2182        // Some directions must darken (shadow side) while others
2183        // brighten (light side) — the spread between min and max
2184        // tells us shading is happening.
2185        let mut min_w = u16::MAX;
2186        let mut max_w = 0u16;
2187        for e in cm.iter() {
2188            let l0 = (e & 0xffff) as u16;
2189            min_w = min_w.min(l0);
2190            max_w = max_w.max(l0);
2191        }
2192        assert!(
2193            max_w > min_w + 16,
2194            "lightmode-2 should produce directional shading: min={min_w} max={max_w}"
2195        );
2196    }
2197
2198    /// Lightmode-2 with no lights → ambient-only. Should still
2199    /// produce some non-zero kv6colmul (the synthetic ambient slot
2200    /// is non-trivial).
2201    #[test]
2202    fn update_reflects_lightmode2_no_lights_falls_back_to_ambient() {
2203        let s = Sprite::axis_aligned(empty_kv6(), [100.0, 100.0, 100.0]);
2204        let lighting = SpriteLighting {
2205            kv6col: DEFAULT_KV6COL,
2206            lightmode: 2,
2207            lights: &[],
2208        };
2209        let (cm, _) = update_reflects(&s, &lighting);
2210        let any_nonzero = cm.iter().any(|&e| e != 0);
2211        assert!(
2212            any_nonzero,
2213            "lightmode-2 with no lights should still emit ambient shading"
2214        );
2215    }
2216}