roxlap_core/sprite.rs
1//! KV6 sprite type + the `draw_sprite` dispatcher.
2//!
3//! Mirror of voxlap's `vx5sprite` (voxlap5.h:63-79) plus the
4//! `drawsprite` entry point (voxlap5.c:9818). For R6.1 the
5//! dispatcher is a stub — just enough API surface for the host to
6//! plumb a sprite reference through. R6.2-R6.4 fill in the actual
7//! kv6 frustum-cull + per-voxel rasterization behind it.
8//!
9//! Voxlap's vx5sprite is a 64-byte struct:
10//!
11//! ```text
12//! point3d p; // position
//! int32_t flags; // bit 0: 0=normal shading, 1=disable shading
14//! // bit 1: 0=kv6data, 1=kfatype (oracle uses 0)
15//! // bit 2: 0=normal, 1=invisible
16//! point3d s; // x-basis (kv6data.xsiz direction)
17//! kv6data *voxnum; // (or kfatype *kfaptr if flag bit 1 set)
18//! point3d h; // y-basis
19//! int32_t kfatim;
20//! point3d f; // z-basis
21//! int32_t okfatim;
22//! ```
23//!
24//! For R6 we only handle kv6 sprites with `flags = 0` (the four
25//! oracle sprite poses all use this). KFA animation + the no-z and
26//! invisible flags are deferred.
27
28// The kv6draw port is pointer-arithmetic-heavy; the casts mirror C's
29// implicit i32/u32/usize narrowings. Loop bounds are clamped via
30// `lbound` so sign-loss / wrap is guarded at the type-system edge.
31// kv.{xsiz,ysiz,zsiz} are u32 with realistic max ≤ 256 (file format
32// limit) — well within f32's 24-bit mantissa.
33#![allow(
34 clippy::cast_possible_truncation,
35 clippy::cast_possible_wrap,
36 clippy::cast_sign_loss,
37 clippy::cast_precision_loss,
38 clippy::similar_names,
39 clippy::too_many_arguments,
40 clippy::too_many_lines,
41 clippy::cast_ptr_alignment, // _mm_loadl_epi64 / _mm_storeu_si128 are intentionally unaligned
42 clippy::doc_markdown,
43 clippy::no_effect_underscore_binding, // SSE intrinsic side-effect-only stores
44 clippy::no_effect, // the discarded pmaddwd intermediate
45 clippy::ref_as_ptr,
46 clippy::float_cmp_const,
47 clippy::float_cmp,
48)]
49
50use roxlap_formats::kv6::{Kv6, Voxel};
51use roxlap_formats::sprite::{Sprite, SPRITE_FLAG_INVISIBLE, SPRITE_FLAG_KFA, SPRITE_FLAG_NO_Z};
52
53use crate::camera_math::CameraState;
54use crate::engine::{Engine, LightSrc, DEFAULT_KV6COL};
55use crate::equivec::iunivec;
56use crate::fixed::ftol;
57use crate::opticast::OpticastSettings;
58use crate::ptfaces16::PTFACES16;
59
/// Voxlap's `MAXLIGHTS` cap (`voxlap5.c`).
///
/// Sizes the `lightlist` scratch array built by
/// `build_kv6colmul_lightmode2` (the lightmode≥2 branch reached from
/// `update_reflects`): that array holds `MAX_LIGHTS + 1` entries, the
/// extra slot being the synthetic ambient entry. The same routine also
/// stops accepting lights once `MAX_LIGHTS` in-range lights have been
/// stored.
const MAX_LIGHTS: usize = 16;
64
/// Voxlap's `vx5.kv6mipfactor` default (`voxlap5.c:12335`). Threshold
/// distance (in voxlap's "ftol-of-forward-projected" estimate units)
/// above which kv6draw walks the lowermip chain. Roxlap doesn't yet
/// model the lowermip chain in `roxlap-formats::Kv6`, so the mip
/// descent loop in `kv6_draw_prepare` is structurally faithful but
/// effectively a no-op until that lands: today `kv6_draw_prepare`
/// only names this constant in a discarded `let _ = ...` binding.
pub(crate) const KV6_MIPFACTOR_DEFAULT: i32 = 128;
72
/// Post-cull state derived from a sprite + camera pair — what the
/// per-voxel iteration in R6.3+ needs to start its setup. Borrows
/// the mip-selected kv6 from the sprite.
///
/// Voxlap doesn't materialise this struct (it operates on local
/// variables inside `kv6draw`); roxlap factors the cull out so it's
/// independently testable without staging the rest of the
/// rasterizer.
///
/// Produced by `kv6_draw_prepare`, which returns `None` instead of a
/// setup when the sprite is culled.
#[derive(Debug, Clone)]
#[allow(dead_code)] // R6.3+ will read these fields.
pub(crate) struct Kv6DrawSetup<'a> {
    /// Mip-selected kv6. For the base-mip case (always, today),
    /// this is just `&sprite.kv6`.
    pub kv: &'a Kv6,
    /// Mip-scaled x-basis vector. For the base mip this equals
    /// `sprite.s`; if a future lowermip walk runs, it is scaled by
    /// `2^mip`.
    pub ts: [f32; 3],
    /// Mip-scaled y-basis vector (`sprite.h` for the base mip), same
    /// scaling rule as `ts`.
    pub th: [f32; 3],
    /// Mip-scaled z-basis vector (`sprite.f` for the base mip), same
    /// scaling rule as `ts`.
    pub tf: [f32; 3],
    /// 0 for the base mip; reserved for lowermip support.
    pub mip: u32,
}
96
97/// Mip-LOD descent + 4-plane frustum cull, mirror of voxlap5.c:8832-
98/// 8875. Returns `None` if the sprite's bound cube is fully behind
99/// any of the four view-frustum edge planes (`CameraState::nor`),
100/// `Some(setup)` otherwise with the post-cull state R6.3 needs.
101///
102/// # Cull math
103///
104/// The bound cube has centre `npos` (in camera-relative coords) and
105/// three half-extent vectors `nstr`, `nhei`, `nfor` (each = the
106/// kv6-axis basis vector scaled by the corresponding half-extent).
107/// For each frustum-edge normal `n`, voxlap tests:
108///
109/// ```text
110/// |nstr · n| + |nhei · n| + |nfor · n| + npos · n < 0
111/// ```
112///
113/// — i.e. the cube's closest-point projection onto `n` is still
114/// behind the plane. Any plane satisfying this culls the sprite.
115pub(crate) fn kv6_draw_prepare<'a>(
116 sprite: &'a Sprite,
117 cam: &CameraState,
118) -> Option<Kv6DrawSetup<'a>> {
119 let kv = &sprite.kv6;
120
121 // Voxlap's quick-and-dirty distance estimate (voxlap5.c:8835):
122 // y = ftol((spr->p - gipos) · gifor)
123 // Used by the lowermip descent loop. Roxlap-formats `Kv6` doesn't
124 // model lowermip yet, so the loop never runs and this value is
125 // unused — computed for symmetry with voxlap and to lock the
126 // path for a future mip-chain port.
127 let dx = sprite.p[0] - cam.pos[0];
128 let dy = sprite.p[1] - cam.pos[1];
129 let dz = sprite.p[2] - cam.pos[2];
130 let dist_estimate = ftol(dx * cam.forward[0] + dy * cam.forward[1] + dz * cam.forward[2]);
131 let _ = (dist_estimate, KV6_MIPFACTOR_DEFAULT);
132 let mip = 0u32;
133 let ts = sprite.s;
134 let th = sprite.h;
135 let tf = sprite.f;
136
137 // Bound-cube centre + half-extents in camera-relative coords.
138 // (voxlap5.c:8852-8860; tp is centre offset from pivot, tp2 is
139 // axis half-extent.) kv->xsiz/ysiz/zsiz fit f32 exactly for
140 // any realistic kv6 (≤ 256³ per the file format limit).
141 #[allow(clippy::cast_precision_loss)]
142 let half_x = kv.xsiz as f32 * 0.5;
143 #[allow(clippy::cast_precision_loss)]
144 let half_y = kv.ysiz as f32 * 0.5;
145 #[allow(clippy::cast_precision_loss)]
146 let half_z = kv.zsiz as f32 * 0.5;
147 let off_x = half_x - kv.xpiv;
148 let off_y = half_y - kv.ypiv;
149 let off_z = half_z - kv.zpiv;
150 let npos = [
151 off_x * ts[0] + off_y * th[0] + off_z * tf[0] + dx,
152 off_x * ts[1] + off_y * th[1] + off_z * tf[1] + dy,
153 off_x * ts[2] + off_y * th[2] + off_z * tf[2] + dz,
154 ];
155 let nstr = [ts[0] * half_x, ts[1] * half_x, ts[2] * half_x];
156 let nhei = [th[0] * half_y, th[1] * half_y, th[2] * half_y];
157 let nfor = [tf[0] * half_z, tf[1] * half_z, tf[2] * half_z];
158
159 // 4-plane cull (voxlap5.c:8861-8875, walked z=3..0).
160 for n in &cam.nor {
161 let proj_str = (nstr[0] * n[0] + nstr[1] * n[1] + nstr[2] * n[2]).abs();
162 let proj_hei = (nhei[0] * n[0] + nhei[1] * n[1] + nhei[2] * n[2]).abs();
163 let proj_for = (nfor[0] * n[0] + nfor[1] * n[1] + nfor[2] * n[2]).abs();
164 let proj_pos = npos[0] * n[0] + npos[1] * n[1] + npos[2] * n[2];
165 if proj_str + proj_hei + proj_for + proj_pos < 0.0 {
166 return None;
167 }
168 }
169
170 Some(Kv6DrawSetup {
171 kv,
172 ts,
173 th,
174 tf,
175 mip,
176 })
177}
178
/// Affine composition (3×3 basis plus translation), ported from
/// voxlap's `mat2` (voxlap5.c:9619).
///
/// `(a_s, a_h, a_f, a_o)` is the outer transform (the camera, in the
/// sprite pipeline) and `(b_s, b_h, b_f, b_o)` the inner one (the
/// sprite basis + origin). Each inner basis vector `v` maps to
/// `a_s * v.x + a_h * v.y + a_f * v.z`; the inner origin is mapped the
/// same way and then offset by `a_o`.
///
/// Returns `(c_s, c_h, c_f, c_o)`.
#[allow(clippy::too_many_arguments)]
pub(crate) fn mat2(
    a_s: [f32; 3],
    a_h: [f32; 3],
    a_f: [f32; 3],
    a_o: [f32; 3],
    b_s: [f32; 3],
    b_h: [f32; 3],
    b_f: [f32; 3],
    b_o: [f32; 3],
) -> ([f32; 3], [f32; 3], [f32; 3], [f32; 3]) {
    // Linear part of the outer transform applied to one inner vector.
    let rotate = |v: [f32; 3]| -> [f32; 3] {
        [
            a_s[0] * v[0] + a_h[0] * v[1] + a_f[0] * v[2],
            a_s[1] * v[0] + a_h[1] * v[1] + a_f[1] * v[2],
            a_s[2] * v[0] + a_h[2] * v[1] + a_f[2] * v[2],
        ]
    };

    // The origin gets the translation added after the linear part.
    let [ox, oy, oz] = rotate(b_o);
    let origin = [ox + a_o[0], oy + a_o[1], oz + a_o[2]];

    (rotate(b_s), rotate(b_h), rotate(b_f), origin)
}
219
/// Port of voxlap's `lbound(a, b, c)` (voxlap5.c:406): returns `a`
/// limited to the inclusive interval `[b, c]`.
///
/// The caller must pass `b <= c`; the underlying clamp panics
/// otherwise.
#[inline]
fn lbound(a: i32, b: i32, c: i32) -> i32 {
    let (lo, hi) = (b, c);
    Ord::clamp(a, lo, hi)
}
226
/// State derived from `Kv6DrawSetup` + `CameraState` that the
/// per-voxel iteration consumes. Voxlap holds these on the stack
/// inside `kv6draw`; roxlap factors them out so the iteration loop
/// can be tested independently.
#[derive(Debug, Clone)]
#[allow(dead_code)] // R6.4+ reads scisdist / qsum0 / cadd / etc.
pub(crate) struct Kv6IterState<'a> {
    /// The kv6 model being iterated (borrowed for `'a`).
    pub kv: &'a Kv6,
    /// Camera origin expressed in kv6-local voxel coordinates,
    /// clamped to `[-1, kv.xsiz]` etc. by voxlap's `lbound`. Splits
    /// the voxel grid into the 4 + 1 quadrants the iteration walks
    /// in different orders so that for each (x, y) column the inner
    /// z-loop visits voxels closer to the camera first (= correct
    /// painter's-style ordering for the rasterizer in R6.4).
    ///
    /// `inx` is the x component of that clamped camera position.
    pub inx: i32,
    /// y component of the clamped camera position (see `inx`).
    pub iny: i32,
    /// z component of the clamped camera position (see `inx`).
    pub inz: i32,
    /// `vx5.xplanemin` / `vx5.xplanemax` mirror — voxlap defaults
    /// to `[0, INT_MAX]` (no x-clipping). Roxlap doesn't yet expose
    /// a public knob for these; pinning to the defaults matches the
    /// oracle and any caller that doesn't care.
    ///
    /// `nxplanemin` is the lower bound of that x range.
    pub nxplanemin: i32,
    /// Upper bound of the x-plane range (see `nxplanemin`).
    pub nxplanemax: i32,
}
251
/// Full per-frame rasterizer state for one sprite — what
/// `drawboundcubesse` reads via voxlap's globals.
///
/// Built by [`kv6_compute_full_state`] from the post-cull
/// `Kv6DrawSetup` + the camera's projection params. Mirror of the
/// voxlap5.c:8915-8973 setup block + the qsum1/qbplbpp framebuffer
/// state from `voxsetframebuffer` (voxlap5.c:11119-11122) +
/// kv6colmul/kv6coladd from `updatereflects` (voxlap5.c:8466).
#[derive(Debug, Clone)]
pub(crate) struct Kv6FullState<'a> {
    /// Iteration-order state: the kv6 reference, the clamped camera
    /// position in kv6-local voxel coordinates and the x-plane range
    /// (see [`Kv6IterState`]).
    pub iter: Kv6IterState<'a>,
    /// 8 cube-vertex offsets, gihz-scaled. `cadd4[k]` for `k = 0..7`
    /// is the offset of cube vertex `k` from the voxel origin, where
    /// bit 0 = +x, bit 1 = +z (post-swap == old +z), bit 2 = +y
    /// (post-swap == old -y). `cadd4[0]` is `[0; 4]`. Lane 3 of
    /// each entry duplicates lane 2 (z) — voxlap's SSE convenience.
    pub cadd4: [[f32; 4]; 8],
    /// Per-z step table: `ztab4_per_z[z] = z * cadd4[2]`. Length =
    /// `kv.zsiz`. Indexed by `v.z` in `drawboundcubesse`.
    pub ztab4_per_z: Vec<[f32; 4]>,
    /// Initial r1 — the x=0 column base after voxlap's "ANNOYING
    /// HACK" pre-decrement. = `(npos*gihz with z2=npos.z) -
    /// cadd4[4]`. Iterates by `cadd4[1]` per x and (via r0) by
    /// `cadd4[4]` per y.
    pub r1_initial: [f32; 4],
    /// `r2 = -ysiz * cadd4[4]`. Used to reset r0 between forward-y
    /// and reverse-y phases inside one x column.
    pub r2: [f32; 4],
    /// Near-plane scissor distance (camera-space Z).
    /// `voxlap5.c:8953-8956` — equals the negative sum of any
    /// negative components of post-swap `nstr.z` / `nhei.z` /
    /// `nfor.z`. `0.0` if all three are non-negative.
    pub scisdist: f32,
    /// Viewport-clip biases (voxlap5.c:8947-8948). Used by the SSE2
    /// path's `paddsw` / `pmaxsw` AABB clipping; the scalar port clips
    /// directly against `target.width` / `target.height`.
    #[allow(dead_code)]
    pub qsum0: [i16; 4],
    /// Viewport-clip floor (voxlap5.c:11120).
    #[allow(dead_code)]
    pub qsum1: [i16; 4],
    /// Framebuffer pixel-stride packed for `pmaddwd` (voxlap5.c:11121).
    #[allow(dead_code)]
    pub qbplbpp: [i16; 4],
    /// Per-direction colour modulation table built by
    /// [`update_reflects`]. Indexed by `v.dir` (256 entries). Each
    /// entry packs four `u16` modulation factors (one per byte
    /// channel) used by `_mm_mulhi_epu16` against the unpacked
    /// voxel colour. Boxed, so the 256 × 8-byte table is held behind
    /// a pointer rather than inline in this struct.
    pub kv6colmul: Box<[u64; 256]>,
    /// Fog bias added after the colour modulate, packed as four
    /// 16-bit lanes like a `kv6colmul` entry. Zero when fog is
    /// disabled (the oracle case).
    pub kv6coladd: u64,
}
306
/// Borrowed framebuffer + zbuffer the per-voxel rasterizer fills.
///
/// Mirrors voxlap's `kv6frameplace` + `zbuffermem` but in
/// row-major-pixel form rather than byte-pointer form. `width` /
/// `height` must match the `OpticastSettings.xres` / `yres` used
/// when the per-frame `Kv6FullState` was built — the bounds derived from
/// `qsum0` / `qsum1` assume that geometry.
///
/// Internally a raw-pointer view (similar to
/// [`crate::scalar_rasterizer::RasterTarget`]) so the type is
/// `Copy + Send + Sync` and the R12.4.2 [`draw_sprites_parallel`]
/// entry point can hand per-thread copies into rayon worker
/// closures. Each parallel sprite-draw competes for the
/// framebuffer / zbuffer via z-test; for non-overlapping sprites
/// this is race-free, for overlapping pixels a tied-z race may
/// leak (visually indistinguishable, hash non-deterministic).
///
/// Because the struct is `Copy`, every copy aliases the same two
/// buffers; all pixel access goes through the `unsafe` accessor
/// methods, whose callers are responsible for bounds and aliasing.
#[derive(Clone, Copy, Debug)]
pub struct DrawTarget<'a> {
    /// Start of the framebuffer slice handed to [`DrawTarget::new`].
    fb_ptr: *mut u32,
    /// Element count of the framebuffer slice (pixels, not bytes).
    fb_len: usize,
    /// Start of the zbuffer slice handed to [`DrawTarget::new`].
    zb_ptr: *mut f32,
    /// Element count of the zbuffer slice.
    zb_len: usize,
    /// Row stride in pixels.
    pub pitch_pixels: usize,
    /// Framebuffer width; expected to match `OpticastSettings.xres`
    /// (see the struct doc).
    pub width: u32,
    /// Framebuffer height; expected to match `OpticastSettings.yres`
    /// (see the struct doc).
    pub height: u32,
    /// Ties the raw pointers to the `'a` lifetime of the exclusive
    /// borrows they were created from.
    _marker: std::marker::PhantomData<&'a mut [u32]>,
}
335
// SAFETY: same shape as the (`&'a mut [u32]`, `&'a mut [f32]`) pair
// the constructor consumed; both are auto-`Send` for `T: Send`. The
// pointer-aliasing safety contract for [`draw_sprites_parallel`] is
// "z-test arbitrates concurrent writes" — a tied-z race is a
// determinism issue, not a memory-safety issue.
//
// NOTE(review): the accessors on `DrawTarget` use plain (non-atomic)
// raw-pointer reads and writes. Under the Rust memory model two
// threads touching the same pixel without synchronisation is a data
// race, which is undefined behaviour regardless of which value
// "wins" — the z-test does not make that sound on its own. Confirm
// that `draw_sprites_parallel` guarantees disjoint pixels per thread,
// or that this is a knowingly accepted deviation.
unsafe impl Send for DrawTarget<'_> {}
unsafe impl Sync for DrawTarget<'_> {}
343
impl<'a> DrawTarget<'a> {
    /// Build a target from exclusive slice borrows + framebuffer
    /// dimensions. The slices are consumed (their `&'a mut`
    /// re-borrow is what gates lifetime); subsequent access happens
    /// via the raw pointers held in the struct.
    ///
    /// No validation is performed here: `pitch_pixels`, `width` and
    /// `height` are stored as given and are not checked against the
    /// slice lengths. Callers of the `unsafe` accessors must keep
    /// their indices inside the recorded lengths.
    #[must_use]
    pub fn new(
        framebuffer: &'a mut [u32],
        zbuffer: &'a mut [f32],
        pitch_pixels: usize,
        width: u32,
        height: u32,
    ) -> Self {
        Self {
            fb_ptr: framebuffer.as_mut_ptr(),
            fb_len: framebuffer.len(),
            zb_ptr: zbuffer.as_mut_ptr(),
            zb_len: zbuffer.len(),
            pitch_pixels,
            width,
            height,
            _marker: std::marker::PhantomData,
        }
    }

    /// Unconditional framebuffer write. Used by sequential 2D
    /// blitters (`drawtile`) that don't engage z-testing.
    ///
    /// # Safety
    /// `idx < self.fb_len`. The disjoint-write contract still
    /// applies if multiple `Copy` instances of `DrawTarget` are in
    /// flight across threads — this method does NOT arbitrate via
    /// z-test.
    #[inline]
    pub unsafe fn fb_write(self, idx: usize, color: u32) {
        // Bounds are only checked in debug builds; release builds rely
        // entirely on the caller's contract.
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: caller asserts in-bounds + (for parallel use)
        // disjoint writes.
        unsafe { self.fb_ptr.add(idx).write(color) };
    }

    /// Read one framebuffer pixel. Used by alpha-blend paths
    /// (`drawtile` modulate-and-blend) that read-modify-write.
    ///
    /// # Safety
    /// `idx < self.fb_len`. Concurrent writers to the same `idx`
    /// from another thread invalidate the read; sequential blits
    /// are race-free.
    #[inline]
    #[must_use]
    pub unsafe fn fb_read(self, idx: usize) -> u32 {
        // Debug-only bounds check, as in `fb_write`.
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        // SAFETY: caller asserts in-bounds.
        unsafe { self.fb_ptr.add(idx).read() }
    }

    /// Z-tested pixel write. If `z < zbuffer[idx]`, the new color +
    /// z stamp the buffers; otherwise nothing changes.
    ///
    /// Returns `true` when the pixel was written and `false` when the
    /// existing depth won. A NaN on either side of the comparison
    /// makes `z < cur_z` false, so nothing is written in that case.
    ///
    /// # Safety
    /// `idx < self.fb_len` **and** `idx < self.zb_len` — both buffers
    /// are accessed at `idx`. For parallel callers, the wedge / z-test
    /// arbitration contract on [`DrawTarget`] applies (see struct
    /// doc).
    #[inline]
    #[must_use]
    pub unsafe fn z_test_write(self, idx: usize, color: u32, z: f32) -> bool {
        // Debug-only bounds checks for both buffers.
        debug_assert!(idx < self.fb_len, "fb idx {} >= len {}", idx, self.fb_len);
        debug_assert!(idx < self.zb_len, "zb idx {} >= len {}", idx, self.zb_len);
        // SAFETY: caller asserts in-bounds + concurrent-write contract.
        unsafe {
            let zp = self.zb_ptr.add(idx);
            let cur_z = zp.read();
            if z < cur_z {
                // Depth is stamped first, then the colour.
                zp.write(z);
                self.fb_ptr.add(idx).write(color);
                true
            } else {
                false
            }
        }
    }
}
426
/// Lane-wise sum of two 4-float vectors.
#[inline]
fn vec4_add(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let [ax, ay, az, aw] = a;
    let [bx, by, bz, bw] = b;
    [ax + bx, ay + by, az + bz, aw + bw]
}
431
/// Lane-wise difference `a - b` of two 4-float vectors.
#[inline]
fn vec4_sub(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let [ax, ay, az, aw] = a;
    let [bx, by, bz, bw] = b;
    [ax - bx, ay - by, az - bz, aw - bw]
}
436
/// Multiplies every lane of a 4-float vector by the scalar `s`.
#[inline]
fn vec4_scale(a: [f32; 4], s: f32) -> [f32; 4] {
    a.map(|lane| lane * s)
}
441
/// Sprite lighting + colour state — the subset of voxlap's
/// `vx5` global that `updatereflects` reads. Built once per
/// frame from [`Engine`] state and passed to [`draw_sprite`].
///
/// All fields mirror voxlap names:
/// - `kv6col` ↔ `vx5.kv6col`
/// - `lightmode` ↔ `vx5.lightmode`
/// - `lights` ↔ `vx5.lightsrc[0..vx5.numlights]`
///
/// The `vx5.fogcol`/`ofogdist` fog plumbing is deferred — sprite
/// fog stays off for now, matching the oracle path
/// (`vx5.fogcol < 0` ⇒ `ofogdist == -1` in voxlap C, no fog).
#[derive(Debug, Clone, Copy)]
pub struct SpriteLighting<'a> {
    /// Material colour. R==G==B triggers the cheaper nolighta path
    /// in `update_reflects`; arbitrary RGB takes the per-channel
    /// nolightb path; lightmode≥2 ignores the R==G==B fast path
    /// and always does per-channel modulation.
    pub kv6col: u32,
    /// `0` / `1` → directional surface tint (lightmode<2 paths).
    /// `2` → per-light shadow-side modulation against `lights`.
    /// `update_reflects` only tests `lightmode < 2`, so any value of
    /// 2 or more selects the per-light path.
    pub lightmode: u32,
    /// Active point lights — voxlap's `vx5.lightsrc[..vx5.numlights]`.
    /// Empty for lightmode<2; populated for lightmode≥2. Only the
    /// lightmode≥2 path reads this slice; it walks it from the last
    /// entry to the first and keeps at most `MAX_LIGHTS` in-range
    /// lights.
    pub lights: &'a [LightSrc],
}
468
469impl<'a> SpriteLighting<'a> {
470 /// Snapshot the lighting + colour subset of an [`Engine`].
471 /// Use this once per frame in the host so the sprite render
472 /// reflects engine setters made between frames.
473 #[must_use]
474 pub fn from_engine(engine: &'a Engine) -> Self {
475 Self {
476 kv6col: engine.kv6col(),
477 lightmode: engine.lightmode(),
478 lights: engine.lights(),
479 }
480 }
481}
482
483impl SpriteLighting<'static> {
484 /// Default oracle config — grey `kv6col`, lightmode 0, no
485 /// lights. Used by `roxlap-oracle` so the four sprite golden
486 /// hashes stay byte-stable: this is the exact state voxlap C's
487 /// oracle has when it calls `drawsprite`.
488 #[must_use]
489 pub fn default_oracle() -> Self {
490 Self {
491 kv6col: DEFAULT_KV6COL,
492 lightmode: 0,
493 lights: &[],
494 }
495 }
496}
497
498/// Builds `kv6colmul[256]` + `kv6coladd[0]` from the engine's
499/// sprite lighting state. Mirror of voxlap's `updatereflects`
500/// (`voxlap5.c:8466-8750`).
501///
502/// Branches:
503/// - `lightmode < 2` + R==G==B `kv6col` → nolighta (cheap
504/// single-multiplier path, voxlap5.c:8553-8584).
505/// - `lightmode < 2` + arbitrary `kv6col` → nolightb (per-channel
506/// path, voxlap5.c:8587-8629).
507/// - `lightmode >= 2` → per-light shadow-side modulation
508/// (voxlap5.c:8631-8750), iterating the active `lights`.
509///
510/// `flags & 1` (disable shading) and the active-fog path remain
511/// deferred — neither is exercised by the oracle's four sprite
512/// poses, and adding them is a follow-up that doesn't change the
513/// already-frozen hashes.
514///
515fn update_reflects(sprite: &Sprite, lighting: &SpriteLighting<'_>) -> (Box<[u64; 256]>, u64) {
516 // Sprite fog plumbing is a follow-up — `vx5.fogcol < 0` (voxlap
517 // C oracle's set_fogcol(BR(...)) state) means ofogdist stays -1,
518 // fogmul = 0, kv6coladd[0] = 0. We pin to that here.
519 let fogmul_lo: u32 = 0;
520 let kv6coladd: u64 = 0;
521
522 let kv6col = lighting.kv6col;
523
524 // g = ((fogmul & 32767) ^ 32767) * (16*8/65536). With fogmul=0:
525 // g = 32767 * (128/65536) ≈ 63.998.
526 let g_pre = ((((fogmul_lo & 0x7fff) ^ 0x7fff) as i32) as f32) * (16.0 * 8.0 / 65536.0);
527
528 let mut kv6colmul = Box::new([0u64; 256]);
529
530 if lighting.lightmode < 2 {
531 // (voxlap5.c:8538-8543) fx=fy=fz=1.0; tp = sum of basis vectors.
532 let tp_x = sprite.s[0] + sprite.h[0] + sprite.f[0];
533 let tp_y = sprite.s[1] + sprite.h[1] + sprite.f[1];
534 let tp_z = sprite.s[2] + sprite.h[2] + sprite.f[2];
535
536 let f0 = 64.0_f32 / (tp_x * tp_x + tp_y * tp_y + tp_z * tp_z).sqrt();
537
538 // R==G==B test: ((kv6col & 0xffff) << 8) ^ (kv6col & 0xffff00)
539 // == 0 iff R == G and G == B.
540 let lo16 = kv6col & 0xffff;
541 let mid24 = kv6col & 0x00ff_ff00;
542 let is_grey = ((lo16 << 8) ^ mid24) == 0;
543
544 if is_grey {
545 // Nolighta path (voxlap5.c:8553-8584): grey kv6col absorbs
546 // into a single multiplier per direction.
547 let g = g_pre * (((kv6col & 0xff) as f32) / 256.0);
548 let f = f0 * g;
549
550 let l0 = (tp_x * f) as i16; // (short)(...) is C truncating cast
551 let l1 = (tp_y * f) as i16;
552 let l2 = (tp_z * f) as i16;
553 let l3 = (g * 128.0) as i16;
554
555 let iu = iunivec();
556 for k in 0..256 {
557 let w = dot_iunivec_i16x4(iu[k], [l0, l1, l2, l3]);
558 let w64 = u64::from(w);
559 kv6colmul[k] = w64 | (w64 << 16) | (w64 << 32) | (w64 << 48);
560 }
561 } else {
562 // Nolightb path (voxlap5.c:8587-8629). Per-channel
563 // modulation factor M_k = (kv6col_byte_k << 8) → mulhi_pu16
564 // by the per-direction dot. Same dot derivation as nolighta.
565 let f = f0 * g_pre;
566
567 let l0 = (tp_x * f) as i16;
568 let l1 = (tp_y * f) as i16;
569 let l2 = (tp_z * f) as i16;
570 let l3 = (g_pre * 128.0) as i16;
571
572 let m = kv6col_channel_mods(kv6col);
573
574 let iu = iunivec();
575 for k in 0..256 {
576 let w = dot_iunivec_i16x4(iu[k], [l0, l1, l2, l3]);
577 kv6colmul[k] = pack_modulated_word(w, m);
578 }
579 }
580 } else {
581 // Lightmode≥2 path (voxlap5.c:8631-8750): per-sprite point
582 // lighting from `lighting.lights`. Each light projects onto
583 // the sprite's normalised basis; per-direction kv6colmul[i]
584 // starts from a synthetic ambient slot and subtracts shadow
585 // contributions from each light's "negative" lanes.
586 let m = kv6col_channel_mods(kv6col);
587 build_kv6colmul_lightmode2(sprite, lighting.lights, &mut kv6colmul, fogmul_lo, m);
588 }
589
590 (kv6colmul, kv6coladd)
591}
592
593/// Build voxlap's per-surface-normal colour-modulation table for a
594/// sprite under the given lighting — the `kv6colmul[256]` (one packed
595/// u64 per `Voxel::dir`, four 16-bit channel multipliers) plus the
596/// `kv6coladd` bias `draw_sprite` adds. This is exactly the table the
597/// CPU rasteriser uses (`update_reflects`); exposed so other backends
598/// (the GPU sprite pass) can shade KV6 sprites with identical math
599/// rather than re-deriving voxlap's lighting in a shader.
600///
601/// Per voxel, the final colour is, per channel `c`:
602/// `clamp(((rgb[c] << 8) * (kv6colmul[dir] >> 16*c & 0xffff)) >> 16
603/// + (kv6coladd >> 16*c & 0xffff), 0, 255)` — i.e. an
604/// `_mm_mulhi_epu16` + `_mm_add_epi16` + `_mm_packus_epi16`.
605#[must_use]
606pub fn sprite_colmul(sprite: &Sprite, lighting: &SpriteLighting<'_>) -> ([u64; 256], u64) {
607 let (mul, add) = update_reflects(sprite, lighting);
608 (*mul, add)
609}
610
/// Scalar stand-in for voxlap's `pmaddwd(iunivec[k], lightlist)`
/// reduction: the four signed 16×16-bit lane products are summed
/// modulo 2^32 and the high 16 bits of that sum are returned.
///
/// The result is the raw `u16` modulation factor, before any
/// per-channel packing.
#[inline]
fn dot_iunivec_i16x4(u: [i16; 4], l: [i16; 4]) -> u16 {
    let mut acc: u32 = 0;
    for lane in 0..4 {
        // An i16 × i16 product always fits in an i32; only the running
        // sum is allowed to wrap.
        let product = i32::from(u[lane]) * i32::from(l[lane]);
        acc = acc.wrapping_add(product as u32);
    }
    (acc >> 16) as u16
}
626
/// Splits `kv6col` into its four bytes (lowest byte first) and moves
/// each into the high half of a `u16`, i.e. `byte << 8`.
///
/// These are the `M_k` factors the nolightb / lightmode≥2 paths
/// multiply against the per-direction dot via `pmulhuw`.
#[inline]
fn kv6col_channel_mods(kv6col: u32) -> [u16; 4] {
    let channel = |shift: u32| -> u16 { (((kv6col >> shift) & 0xff) << 8) as u16 };
    [channel(0), channel(8), channel(16), channel(24)]
}
639
/// Builds one `kv6colmul[k]` entry: for each channel `c` the word
/// `(w_dot * m[c]) >> 16` is placed in bits `16*c .. 16*c + 15` of the
/// returned `u64`.
#[inline]
fn pack_modulated_word(w_dot: u16, m: [u16; 4]) -> u64 {
    let dot = u32::from(w_dot);
    let mut packed = 0u64;
    for (lane, &factor) in m.iter().enumerate() {
        // u16 × u16 fits in a u32, and the high half fits in a u16.
        let word = ((dot * u32::from(factor)) >> 16) as u16;
        packed |= u64::from(word) << (16 * lane);
    }
    packed
}
651
652/// Lightmode≥2 path body — voxlap5.c:8631-8750. Builds the full
653/// `kv6colmul[256]` from the active light list.
654///
655/// Steps:
656/// 1. Normalise each sprite-basis axis (`sprs`/`sprh`/`sprf`).
657/// 2. For each light within `r2` of the sprite, compute its
658/// intensity falloff `h` and project the world-space delta onto
659/// the normalised sprite basis → store in `lightlist[k]`.
660/// 3. Append a synthetic ambient slot (voxlap's hardcoded
661/// `(fx, fy, fz) = (0, 0.5, 1.0)` direction) at
662/// `lightlist[lightcnt]`.
663/// 4. For each direction `idx ∈ 0..256`:
664/// - `base = ambient_slot · iunivec[idx]` (treated as one u32).
665/// - For each real light `k`: compute `dot = light_k ·
666/// iunivec[idx]`, split into low/high i16 lanes (asm-faithful
667/// "16-bits-is-ugly-but-ok-here" quirk); subtract the negative
668/// lanes from `base` (= shadow side of the surface).
669/// - `W = base >> 16`, then per-channel modulate against `M_c`
670/// and pack into `kv6colmul[idx]`.
671fn build_kv6colmul_lightmode2(
672 sprite: &Sprite,
673 lights: &[LightSrc],
674 kv6colmul: &mut [u64; 256],
675 fogmul_lo: u32,
676 m: [u16; 4],
677) {
678 // (voxlap5.c:8638-8643) Normalise sprite basis. WARNING from
679 // voxlap: only correct for orthonormal sprite-bases; non-
680 // orthogonal bases (e.g. shears) drift. The four oracle sprite
681 // poses are all orthonormal so this matches voxlap's behaviour.
682 let sprs = normalise(sprite.s);
683 let sprh = normalise(sprite.h);
684 let sprf = normalise(sprite.f);
685
686 // hh = ((fogmul & 32767) ^ 32767) / 65536 * 2 (voxlap5.c:8645).
687 // With fogmul=0 → hh = 32767 / 65536 * 2 ≈ 1.0. This is a
688 // distinct scaling from `g_pre` (= same numerator * 128/65536
689 // for the lightmode<2 path) — they differ by a factor of 64.
690 // An earlier port mistakenly derived hh from g_pre / 128 = 0.5,
691 // giving sprites half the intended ambient brightness.
692 let hh_initial = ((((fogmul_lo & 0x7fff) ^ 0x7fff) as i32) as f32) * (2.0 / 65536.0);
693
694 // Project each in-range light onto the sprite basis.
695 let mut lightlist: [[i16; 4]; MAX_LIGHTS + 1] = [[0; 4]; MAX_LIGHTS + 1];
696 let mut lightcnt: usize = 0;
697 for light in lights.iter().rev() {
698 if lightcnt >= MAX_LIGHTS {
699 break;
700 }
701 let fx = light.pos[0] - sprite.p[0];
702 let fy = light.pos[1] - sprite.p[1];
703 let fz = light.pos[2] - sprite.p[2];
704 let gg = fx * fx + fy * fy + fz * fz;
705 let ff = light.r2;
706 // Voxlap's `*(int32_t *)&gg < *(int32_t *)&ff` is a bit-
707 // pattern compare. For non-negative finite floats the bit
708 // order matches the magnitude order, so `gg < ff` is
709 // equivalent (and safer in the presence of NaN: NaN !< x
710 // for any x, matching voxlap's float-bit-cast trick).
711 if gg >= ff || gg <= 0.0 {
712 continue;
713 }
714 let f = ff.sqrt();
715 let g = gg.sqrt();
716 // h = (f*ff - g*gg) / (f*ff*g*gg) * sc * 16
717 let mut h = (f * ff - g * gg) / (f * ff * g * gg) * light.sc * 16.0;
718 if g * h > 4096.0 {
719 h = 4096.0 / g; // saturation clip
720 }
721 h *= hh_initial;
722 let l0 = (fx * sprs[0] + fy * sprs[1] + fz * sprs[2]) * h;
723 let l1 = (fx * sprh[0] + fy * sprh[1] + fz * sprh[2]) * h;
724 let l2 = (fx * sprf[0] + fy * sprf[1] + fz * sprf[2]) * h;
725 lightlist[lightcnt] = [l0 as i16, l1 as i16, l2 as i16, 0];
726 lightcnt += 1;
727 }
728
729 // Synthetic ambient slot: voxlap's hardcoded direction
730 // (fx, fy, fz) = (0, 0.5, 1.0) projected onto the sprite basis,
731 // scaled by `hh * 16*16*8/2 = hh * 1024`. The lane-3 bias is
732 // `hh * 48 / 16 = hh * 3`.
733 let amb_fx = 0.0_f32;
734 let amb_fy = 0.5_f32;
735 let amb_fz = 1.0_f32;
736 let hh = hh_initial * (16.0 * 16.0 * 8.0 / 2.0);
737 let al0 = (sprs[0] * amb_fx + sprs[1] * amb_fy + sprs[2] * amb_fz) * hh;
738 let al1 = (sprh[0] * amb_fx + sprh[1] * amb_fy + sprh[2] * amb_fz) * hh;
739 let al2 = (sprf[0] * amb_fx + sprf[1] * amb_fy + sprf[2] * amb_fz) * hh;
740 let al3 = hh * (48.0 / 16.0);
741 lightlist[lightcnt] = [al0 as i16, al1 as i16, al2 as i16, al3 as i16];
742
743 let iu = iunivec();
744 for idx in 0..256 {
745 let u = iu[idx];
746 // Ambient base = lightlist[lightcnt] · iunivec[idx], in u32
747 // wrapping arithmetic (asm summed the pmaddwd dword lanes
748 // mod 2^32).
749 let u0 = i32::from(u[0]);
750 let u1 = i32::from(u[1]);
751 let u2 = i32::from(u[2]);
752 let u3 = i32::from(u[3]);
753 let amb = lightlist[lightcnt];
754 let base_lo = (u0.wrapping_mul(i32::from(amb[0]))) as u32;
755 let base_lo = base_lo.wrapping_add((u1.wrapping_mul(i32::from(amb[1]))) as u32);
756 let base_hi = (u2.wrapping_mul(i32::from(amb[2]))) as u32;
757 let base_hi = base_hi.wrapping_add((u3.wrapping_mul(i32::from(amb[3]))) as u32);
758 let mut base = base_lo.wrapping_add(base_hi);
759
760 // For each real light, compute dot, then subtract its
761 // "negative" half-lanes from `base` (= shadow side).
762 for k in (0..lightcnt).rev() {
763 let l = lightlist[k];
764 let klo = (u0.wrapping_mul(i32::from(l[0]))) as u32;
765 let klo = klo.wrapping_add((u1.wrapping_mul(i32::from(l[1]))) as u32);
766 let khi = (u2.wrapping_mul(i32::from(l[2]))) as u32;
767 let khi = khi.wrapping_add((u3.wrapping_mul(i32::from(l[3]))) as u32);
768 let dot = klo.wrapping_add(khi);
769 // Voxlap quirk: 32-bit dot but pminsw is per-i16 lane.
770 // Light magnitudes stay clamped enough that the
771 // mixed-lane behaviour is benign — port faithfully.
772 let lo16 = (dot & 0xffff) as i16;
773 let hi16 = ((dot >> 16) & 0xffff) as i16;
774 let lo16c: u16 = if lo16 < 0 { lo16 as u16 } else { 0 };
775 let hi16c: u16 = if hi16 < 0 { hi16 as u16 } else { 0 };
776 let sub = (u32::from(hi16c) << 16) | u32::from(lo16c);
777 base = base.wrapping_sub(sub);
778 }
779
780 let w_dot = (base >> 16) as u16;
781 kv6colmul[idx] = pack_modulated_word(w_dot, m);
782 }
783}
784
/// Scales a 3-vector to unit length.
///
/// A vector whose squared length is not positive (the zero vector) is
/// handed back untouched rather than divided by zero — voxlap's
/// `1.0 / sqrt(...)` would produce NaN for a zero basis axis, though
/// the C code is never given one.
#[inline]
fn normalise(v: [f32; 3]) -> [f32; 3] {
    let [x, y, z] = v;
    let len_sq = x * x + y * y + z * z;
    if len_sq <= 0.0 {
        v
    } else {
        let scale = 1.0 / len_sq.sqrt();
        v.map(|component| component * scale)
    }
}
798
799/// Full setup: mat2 + Cramer's + nfor↔nhei swap + cadd4/ztab4/r1/r2/
800/// scisdist/qsum0 init. Mirror of voxlap5.c:8915-8973.
801pub(crate) fn kv6_compute_full_state<'a>(
802 setup: &Kv6DrawSetup<'a>,
803 sprite: &Sprite,
804 lighting: &SpriteLighting<'_>,
805 cam: &CameraState,
806 settings: &OpticastSettings,
807 fb_width: u32,
808 fb_height: u32,
809 fb_pitch_pixels: usize,
810) -> Kv6FullState<'a> {
811 let sprite_pos = sprite.p;
812 let kv = setup.kv;
813
814 // Transform sprite basis from world to camera-relative
815 // screen-axis coords (voxlap5.c:8916). `(gixs, giys, gizs)` is
816 // the transposed camera basis; `giadd` is the translation half.
817 let (nstr, mut nhei, mut nfor, mut npos) = mat2(
818 cam.xs, cam.ys, cam.zs, cam.add, setup.ts, setup.th, setup.tf, sprite_pos,
819 );
820
821 // Shift `npos` so it points at the kv6 origin (corner [0,0,0])
822 // rather than the pivot point — Cramer's rule below solves for
823 // the camera origin in kv6-local voxel coords, which only makes
824 // sense relative to the corner. (voxlap5.c:8917-8919)
825 npos[0] -= kv.xpiv * nstr[0] + kv.ypiv * nhei[0] + kv.zpiv * nfor[0];
826 npos[1] -= kv.xpiv * nstr[1] + kv.ypiv * nhei[1] + kv.zpiv * nfor[1];
827 npos[2] -= kv.xpiv * nstr[2] + kv.ypiv * nhei[2] + kv.zpiv * nfor[2];
828
829 // Cramer's rule for `nstr * X + nhei * Y + nfor * Z + npos = 0`.
830 // (voxlap5.c:8923-8936)
831 let tp = [
832 nhei[1] * nfor[2] - nfor[1] * nhei[2],
833 nfor[1] * nstr[2] - nstr[1] * nfor[2],
834 nstr[1] * nhei[2] - nhei[1] * nstr[2],
835 ];
836 let det = nstr[0] * tp[0] + nhei[0] * tp[1] + nfor[0] * tp[2];
837 // Float-bit comparison against zero: matches voxlap's
838 // `if (f != 0)` and dodges clippy::float_cmp.
839 let (raw_inx, raw_iny, raw_inz) = if det.to_bits() & 0x7fff_ffff != 0 {
840 let f_inv = -1.0 / det;
841 let tp2 = [
842 npos[1] * nfor[2] - nfor[1] * npos[2],
843 nhei[1] * npos[2] - npos[1] * nhei[2],
844 npos[1] * nstr[2] - nstr[1] * npos[2],
845 ];
846 (
847 ftol((npos[0] * tp[0] - nhei[0] * tp2[0] - nfor[0] * tp2[1]) * f_inv),
848 ftol((npos[0] * tp[1] + nstr[0] * tp2[0] - nfor[0] * tp2[2]) * f_inv),
849 ftol((npos[0] * tp[2] + nstr[0] * tp2[1] + nhei[0] * tp2[2]) * f_inv),
850 )
851 } else {
852 (-1, -1, -1)
853 };
854
855 let xsiz_i = kv.xsiz as i32;
856 let ysiz_i = kv.ysiz as i32;
857 let zsiz_i = kv.zsiz as i32;
858 let iter = Kv6IterState {
859 kv,
860 inx: lbound(raw_inx, -1, xsiz_i),
861 iny: lbound(raw_iny, -1, ysiz_i),
862 inz: lbound(raw_inz, -1, zsiz_i),
863 // Voxlap default `vx5.xplanemin = 0`, `xplanemax = 0x7fffffff`.
864 nxplanemin: 0,
865 nxplanemax: i32::MAX,
866 };
867
868 // Swap `nhei` ↔ `nfor` with sign flip on the new `nfor`
869 // (voxlap5.c:8942-8944). Equivalent to a 90° rotation that lines
870 // the basis up with cadd4's bit-encoded vertex offsets:
871 // cadd4[1] = +x (post-swap nstr direction)
872 // cadd4[2] = +z (post-swap nhei direction == original +z)
873 // cadd4[4] = +y (post-swap nfor direction == original -y)
874 // After this point `nfor` / `nhei` carry the post-swap values.
875 let swap_x = nhei[0];
876 nhei[0] = nfor[0];
877 nfor[0] = -swap_x;
878 let swap_y = nhei[1];
879 nhei[1] = nfor[1];
880 nfor[1] = -swap_y;
881 let swap_z = nhei[2];
882 nhei[2] = nfor[2];
883 nfor[2] = -swap_z;
884
885 // qsum0 (voxlap5.c:8947-8948). The `0x7fff - (xres - hx)`
886 // form sets the bias such that adding it to a screen-space
887 // bound makes the bound saturate-positive when it lands
888 // inside the viewport.
889 let xres_i = settings.xres as i32;
890 let yres_i = settings.yres as i32;
891 let hx_i = ftol(settings.hx);
892 let hy_i = ftol(settings.hy);
893 let qsum0_x = (0x7fff - (xres_i - hx_i)) as i16;
894 let qsum0_y = (0x7fff - (yres_i - hy_i)) as i16;
895 let qsum0 = [qsum0_x, qsum0_y, qsum0_x, qsum0_y];
896
897 // scisdist (voxlap5.c:8953-8956). Voxlap's `*(int32_t *)&f < 0`
898 // bit-trick: a positive-finite float has bit-pattern >= 0;
899 // only *negative* floats land < 0 as signed int. So this loop
900 // sums the absolute value of any negative-z post-swap basis
901 // component into a near-plane bias.
902 let mut scisdist = 0.0f32;
903 if (nstr[2].to_bits() as i32) < 0 {
904 scisdist -= nstr[2];
905 }
906 if (nhei[2].to_bits() as i32) < 0 {
907 scisdist -= nhei[2];
908 }
909 if (nfor[2].to_bits() as i32) < 0 {
910 scisdist -= nfor[2];
911 }
912
913 // cadd4 step table (voxlap5.c:8958-8961). cadd4[1/2/4] are the
914 // three primary axis steps (x / z / y, post-swap); cadd4[3/5/6/7]
915 // are bit-OR sums (3 = 1+2, 5 = 1+4, 6 = 2+4, 7 = 3+4).
916 let gihz = settings.hz;
917 let cadd1 = [nstr[0] * gihz, nstr[1] * gihz, nstr[2], nstr[2]];
918 let cadd2 = [nhei[0] * gihz, nhei[1] * gihz, nhei[2], nhei[2]];
919 let cadd4_axis = [nfor[0] * gihz, nfor[1] * gihz, nfor[2], nfor[2]];
920 let cadd3 = vec4_add(cadd1, cadd2);
921 let cadd5 = vec4_add(cadd1, cadd4_axis);
922 let cadd6 = vec4_add(cadd2, cadd4_axis);
923 let cadd7 = vec4_add(cadd3, cadd4_axis);
924 let cadd4 = [
925 [0.0; 4], cadd1, cadd2, cadd3, cadd4_axis, cadd5, cadd6, cadd7,
926 ];
927
928 // ztab4 per-z step table (voxlap5.c:8973). ztab4[z] = z * cadd4[2]
929 // built incrementally by addps so per-step rounding matches.
930 let zsiz = kv.zsiz as usize;
931 let mut ztab4_per_z = Vec::with_capacity(zsiz);
932 if zsiz > 0 {
933 ztab4_per_z.push([0.0f32; 4]);
934 for i in 1..zsiz {
935 let prev = ztab4_per_z[i - 1];
936 ztab4_per_z.push(vec4_add(prev, cadd4[2]));
937 }
938 }
939
940 // r1 init (voxlap5.c:8961, 8976). Post-mat2 npos becomes the
941 // raw column-base; gihz-scale x/y; z lane keeps unscaled npos.z;
942 // z2 lane (lane 3) duplicates z. Then "ANNOYING HACK"
943 // pre-decrement by cadd4[4].
944 let r1_pre = [npos[0] * gihz, npos[1] * gihz, npos[2], npos[2]];
945 let r1_initial = vec4_sub(r1_pre, cadd4[4]);
946
947 // r2 = -ysiz * cadd4[4] (voxlap5.c:8974). intss + mulps in voxlap.
948 let r2 = vec4_scale(cadd4[4], -(ysiz_i as f32));
949
950 // qsum1 + qbplbpp from voxsetframebuffer (voxlap5.c:11119-11122).
951 // The framebuffer geometry is independent of the camera projection
952 // — these are derived from `(width, height, pitch_bytes)`.
953 let pitch_bytes = (fb_pitch_pixels as i32).saturating_mul(4);
954 let qsum1_x = 0x7fff_i32 - fb_width as i32;
955 let qsum1_y = 0x7fff_i32 - fb_height as i32;
956 let qsum1 = [
957 qsum1_x as i16,
958 qsum1_y as i16,
959 qsum1_x as i16,
960 qsum1_y as i16,
961 ];
962 let qbplbpp = [4i16, pitch_bytes as i16, 4, pitch_bytes as i16];
963
964 let (kv6colmul, kv6coladd) = update_reflects(sprite, lighting);
965
966 Kv6FullState {
967 iter,
968 cadd4,
969 ztab4_per_z,
970 r1_initial,
971 r2,
972 scisdist,
973 qsum0,
974 qsum1,
975 qbplbpp,
976 kv6colmul,
977 kv6coladd,
978 }
979}
980
981/// Per-voxel rasterizer (R6.4 complete).
982///
983/// Mirror of `voxlap5.c:8179-8320` (`drawboundcubesse`). For each
984/// voxel:
985/// 1. `effmask = mask & v.vis` early-out.
986/// 2. `origin = r0 + ztab4_per_z[v.z]`; scissor on `origin.z`.
987/// 3. Look up `ptfaces16[effmask]` — `face[0]` = 4 or 6 vertex
988/// count, `face[1..7]` = byte offsets into `caddasm` (the
989/// `cadd4[8]` array, each entry 16 bytes).
990/// 4. For each vertex pair (a, b), compute the projected screen
991/// coords as `(cadd4[a] + origin).xy / (cadd4[a] + origin).z`
992/// via `_mm_rcp_ps`.
993/// 5. Pack the 4 (or 6) projected vertices to int16, min/max-reduce
994/// to a single screen-AABB, viewport-clip via `qsum0` /
995/// `qsum1`, and early-out on degenerate rect.
996/// 6. Compute the per-voxel colour via the `mm5` cross-call tail +
997/// `kv6colmul[v.dir]` + `kv6coladd[0]` modulation.
998/// 7. Fill the screen rectangle with z-test + framebuffer write.
999///
1000/// Returns the number of pixels actually written (z-test passing).
1001/// Tests use this as a sanity gate; production callers ignore it.
1002///
1003/// `mm5_tail` is voxlap's static cross-call register tail
1004/// (voxlap5.c:8170-8177). It carries one byte of contribution from
1005/// the previous voxel's colour into the current; bit-equality with
1006/// the asm requires preserving it across calls within one sprite.
1007///
1008/// Currently x86_64-only — relies on `_mm_rcp_ps` for bit-equality
1009/// with voxlap C. NEON / wasm ports will need their own goldens
1010/// (see `PORTING-RUST.md` R9 / R10).
1011#[cfg(target_arch = "x86_64")]
1012#[allow(clippy::trivially_copy_pass_by_ref)] // hot loop; matches voxlap's pointer-passed v.
1013pub(crate) fn drawboundcubesse(
1014 v: &Voxel,
1015 mask: u32,
1016 state: &Kv6FullState<'_>,
1017 r0: [f32; 4],
1018 mm5_tail: &mut u32,
1019 target: &mut DrawTarget<'_>,
1020) -> u32 {
1021 use core::arch::x86_64::{
1022 __m128, __m128i, _mm_add_epi16, _mm_add_ps, _mm_adds_epi16, _mm_cvtsi128_si32,
1023 _mm_cvtsi32_si128, _mm_cvttps_epi32, _mm_loadl_epi64, _mm_loadu_ps, _mm_madd_epi16,
1024 _mm_max_epi16, _mm_min_epi16, _mm_movehl_ps, _mm_movelh_ps, _mm_mul_ps, _mm_mulhi_epu16,
1025 _mm_packs_epi32, _mm_packus_epi16, _mm_rcp_ps, _mm_setzero_si128, _mm_shufflelo_epi16,
1026 _mm_storeu_ps, _mm_storeu_si128, _mm_subs_epu16, _mm_unpackhi_epi64, _mm_unpacklo_epi32,
1027 _mm_unpacklo_epi8,
1028 };
1029
1030 let effmask = (mask & u32::from(v.vis)) as usize;
1031 if effmask == 0 || effmask >= PTFACES16.len() {
1032 return 0;
1033 }
1034 let face = PTFACES16[effmask];
1035 if face[0] == 0 {
1036 return 0;
1037 }
1038
1039 // origin = r0 + ztab4_per_z[v.z] (4 f32 lanes, [x*hz, y*hz, z, z]).
1040 let z_idx = v.z as usize;
1041 if z_idx >= state.ztab4_per_z.len() {
1042 return 0;
1043 }
1044 let ztep = state.ztab4_per_z[z_idx];
1045 // SAFETY: `_mm_loadu_ps` reads 16 unaligned bytes from a 4-f32
1046 // array (which is 16 bytes); subsequent intrinsics are SSE2
1047 // baseline on x86_64.
1048 unsafe {
1049 let r0_v = _mm_loadu_ps(r0.as_ptr());
1050 let ztep_v = _mm_loadu_ps(ztep.as_ptr());
1051 let origin_v: __m128 = _mm_add_ps(r0_v, ztep_v);
1052 let mut origin_arr = [0.0f32; 4];
1053 _mm_storeu_ps(origin_arr.as_mut_ptr(), origin_v);
1054 if origin_arr[2] < state.scisdist {
1055 return 0;
1056 }
1057
1058 // Project vertex pair (a, b). Returns __m128 with lanes:
1059 // [b.x_proj, b.y_proj, a.x_proj, a.y_proj]
1060 // The byte offsets in face[k] index `caddasm` (= bytes into a
1061 // [point4d; 8] = [[f32; 4]; 8]); divide by 16 (= sizeof point4d)
1062 // to land back at the cadd4 index.
1063 let project = |off_a: u8, off_b: u8| -> __m128 {
1064 let a = state.cadd4[(off_a >> 4) as usize];
1065 let b = state.cadd4[(off_b >> 4) as usize];
1066 let wva = _mm_add_ps(_mm_loadu_ps(a.as_ptr()), origin_v);
1067 let wvb = _mm_add_ps(_mm_loadu_ps(b.as_ptr()), origin_v);
1068 let wv0 = _mm_movehl_ps(wva, wvb); // [b.z, b.z, a.z, a.z]
1069 let wv1 = _mm_movelh_ps(wvb, wva); // [b.x, b.y, a.x, a.y]
1070 let wv0_inv = _mm_rcp_ps(wv0);
1071 _mm_mul_ps(wv0_inv, wv1)
1072 };
1073
1074 let pair01 = project(face[1], face[2]);
1075 let pair23 = project(face[3], face[4]);
1076
1077 // Convert to int32 (truncate-toward-zero), pack to int16.
1078 // pack01_int16 lanes 0..3 = [v1x, v1y, v0x, v0y]
1079 // pack01_int16 lanes 4..7 = [v3x, v3y, v2x, v2y]
1080 let p01_i32 = _mm_cvttps_epi32(pair01);
1081 let p23_i32 = _mm_cvttps_epi32(pair23);
1082 let pack_lo = _mm_packs_epi32(p01_i32, p23_i32);
1083 let pack01 = pack_lo;
1084 let pack23 = _mm_unpackhi_epi64(pack_lo, _mm_setzero_si128());
1085 let mut mm_min = _mm_min_epi16(pack01, pack23);
1086 let mut mm_max = _mm_max_epi16(pack01, pack23);
1087
1088 if face[0] != 4 {
1089 let pair45 = project(face[5], face[6]);
1090 let p45_i32 = _mm_cvttps_epi32(pair45);
1091 let pack45 = _mm_packs_epi32(p45_i32, _mm_setzero_si128());
1092 mm_min = _mm_min_epi16(mm_min, pack45);
1093 mm_max = _mm_max_epi16(mm_max, pack45);
1094 }
1095
1096 // shufflelo(_, 0x0e) brings high half (lanes 2..3) into low
1097 // half so min/max collapses across all 4 (or 6) vertices.
1098 let mm_min_hi = _mm_shufflelo_epi16(mm_min, 0x0e);
1099 let mm_max_hi = _mm_shufflelo_epi16(mm_max, 0x0e);
1100 let mm_min_red = _mm_min_epi16(mm_min, mm_min_hi);
1101 let mm_max_red = _mm_max_epi16(mm_max, mm_max_hi);
1102
1103 // bounds = unpacklo(mm_min, mm_max) lanes 0..3 (i16)
1104 // = [min_x, max_x, min_y, max_y] ?
1105 // Actually: _mm_unpacklo_epi32 interleaves 32-bit lanes.
1106 // Low 32 of mm_min = (mm_min[0], mm_min[1]) i.e. (min_x, min_y).
1107 // Low 32 of mm_max similarly. After unpacklo_epi32:
1108 // lanes_32[0] = mm_min low32, lanes_32[1] = mm_max low32
1109 // → 4 i16: [min_x, min_y, max_x, max_y]
1110 let bounds = _mm_unpacklo_epi32(mm_min_red, mm_max_red);
1111
1112 // Apply qsum0 (saturated add) + qsum1 (max-floor). Both are
1113 // 8-byte values loaded into the low 64 bits of __m128i.
1114 let qsum0_v = _mm_loadl_epi64(state.qsum0.as_ptr().cast::<__m128i>());
1115 let qsum1_v = _mm_loadl_epi64(state.qsum1.as_ptr().cast::<__m128i>());
1116 let bounds = _mm_adds_epi16(bounds, qsum0_v);
1117 let bounds = _mm_max_epi16(bounds, qsum1_v);
1118
1119 // dxdy = subs_epu16(bounds_hi, bounds) — saturating unsigned
1120 // subtract, with bounds_hi being lanes [2,3,2,3] of bounds.
1121 let bounds_hi = _mm_shufflelo_epi16(bounds, 0xee);
1122 let dxdy = _mm_subs_epu16(bounds_hi, bounds);
1123 let dxdy_low = _mm_cvtsi128_si32(dxdy) as u32;
1124 let dx = (dxdy_low & 0xffff) as i32;
1125 if dx == 0 {
1126 return 0;
1127 }
1128 let dy = ((dxdy_low >> 16) as i32) - 1;
1129 if dy < 0 {
1130 return 0;
1131 }
1132
1133 // Recover pixel coords from bounds + qsum1. Bounds[0/1] are
1134 // currently in the saturated [0x7fff - res, 0x7fff] range;
1135 // pixel = bounds - qsum1.
1136 let mut bounds_arr = [0i16; 8];
1137 _mm_storeu_si128(bounds_arr.as_mut_ptr().cast::<__m128i>(), bounds);
1138 let pixel_min_x = i32::from(bounds_arr[0]) - i32::from(state.qsum1[0]);
1139 let pixel_min_y = i32::from(bounds_arr[1]) - i32::from(state.qsum1[1]);
1140
1141 // pmaddwd is consumed for completeness so the asm-equivalent
1142 // pixel-byte-offset is computable; not strictly needed since
1143 // we index directly via (pixel_min_x, pixel_min_y).
1144 let qbplbpp_v = _mm_loadl_epi64(state.qbplbpp.as_ptr().cast::<__m128i>());
1145 let _ = _mm_madd_epi16(bounds, qbplbpp_v);
1146
1147 // Colour modulation with mm5 cross-call tail.
1148 let tail_in = *mm5_tail;
1149 let mm5 = _mm_cvtsi32_si128(tail_in as i32);
1150 let col_v = _mm_cvtsi32_si128(v.col as i32);
1151 let mm5 = _mm_unpacklo_epi8(mm5, col_v);
1152 let kvm = state.kv6colmul[v.dir as usize];
1153 let kvm_v = _mm_loadl_epi64(std::ptr::addr_of!(kvm).cast::<__m128i>());
1154 let mm5 = _mm_mulhi_epu16(mm5, kvm_v);
1155 let kva_v = _mm_loadl_epi64(std::ptr::addr_of!(state.kv6coladd).cast::<__m128i>());
1156 let mm5 = _mm_add_epi16(mm5, kva_v);
1157 let mm5 = _mm_packus_epi16(mm5, mm5);
1158 let color = _mm_cvtsi128_si32(mm5) as u32;
1159 *mm5_tail = color;
1160
1161 // Fill rectangle [pixel_min_x .. +dx) × [pixel_min_y .. +dy+1).
1162 // The qsum0/qsum1 clip + saturating sub guarantee the rect
1163 // sits inside the framebuffer, so no per-pixel bounds check
1164 // needed beyond DrawTarget's debug_assert.
1165 let z_val = origin_arr[2];
1166 let pitch = target.pitch_pixels;
1167 let x0 = pixel_min_x as usize;
1168 let x_end = x0 + dx as usize;
1169 let mut written: u32 = 0;
1170 for row in 0..=(dy as usize) {
1171 let y = pixel_min_y as usize + row;
1172 let row_start = y * pitch;
1173 for x in x0..x_end {
1174 let idx = row_start + x;
1175 // SAFETY: idx < pitch * height by qsum0/qsum1 clip;
1176 // concurrent-write contract gated by z_test_write.
1177 // (Outer `unsafe` block in this fn covers the call.)
1178 if target.z_test_write(idx, color, z_val) {
1179 written += 1;
1180 }
1181 }
1182 }
1183 written
1184 }
1185}
1186
/// R9: scalar rasterizer for non-x86_64 targets (aarch64 / wasm).
///
/// Same steps as the SSE2 version, but the perspective divide uses
/// IEEE 754 `1.0 / z` instead of `_mm_rcp_ps`, so screen-space vertex
/// positions (and therefore per-arch goldens) differ by ±1 pixel at
/// edges. The colour modulation reproduces the `_mm_mulhi_epu16` +
/// `_mm_packus_epi16` byte arithmetic exactly.
///
/// Returns the number of pixels that passed the z-test; `mm5_tail` is
/// read for the colour mix and then overwritten with this voxel's
/// final colour.
#[cfg(not(target_arch = "x86_64"))]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub(crate) fn drawboundcubesse(
    v: &Voxel,
    mask: u32,
    state: &Kv6FullState<'_>,
    r0: [f32; 4],
    mm5_tail: &mut u32,
    target: &mut DrawTarget<'_>,
) -> u32 {
    // Visible-face lookup; an empty or out-of-table mask draws nothing.
    let visible_faces = (mask & u32::from(v.vis)) as usize;
    if visible_faces == 0 {
        return 0;
    }
    let Some(&face) = PTFACES16.get(visible_faces) else {
        return 0;
    };
    if face[0] == 0 {
        return 0;
    }

    // origin = r0 + ztab4_per_z[v.z], scissored on the z lane.
    let Some(&z_step) = state.ztab4_per_z.get(v.z as usize) else {
        return 0;
    };
    let origin = vec4_add(r0, z_step);
    let depth = origin[2];
    if depth < state.scisdist {
        return 0;
    }

    // The SSE2 path folds the screen-centre offset (hx, hy) into its
    // qsum0/qsum1 clip; recover it as their difference so vertices can
    // be projected straight into screen coordinates.
    let centre_x = (i32::from(state.qsum0[0]) - i32::from(state.qsum1[0])) as f32;
    let centre_y = (i32::from(state.qsum0[1]) - i32::from(state.qsum1[1])) as f32;

    // screen_xy = (cadd4[off / 16] + origin).xy / .z + centre
    let project = |off: u8| -> (f32, f32) {
        let world = vec4_add(state.cadd4[(off >> 4) as usize], origin);
        let inv_z = 1.0 / world[2];
        (world[0] * inv_z + centre_x, world[1] * inv_z + centre_y)
    };

    // First four vertices: reduce in float, then truncate to i32.
    let quad = [
        project(face[1]),
        project(face[2]),
        project(face[3]),
        project(face[4]),
    ];
    let (first_x, first_y) = quad[0];
    let (lo_x, lo_y, hi_x, hi_y) = quad[1..].iter().fold(
        (first_x, first_y, first_x, first_y),
        |(lx, ly, hx, hy), &(px, py)| (lx.min(px), ly.min(py), hx.max(px), hy.max(py)),
    );
    let mut min_x = lo_x as i32;
    let mut min_y = lo_y as i32;
    let mut max_x = hi_x as i32;
    let mut max_y = hi_y as i32;

    // Six-vertex silhouettes: the extra pair is truncated first and
    // merged as integers.
    if face[0] != 4 {
        for (px, py) in [project(face[5]), project(face[6])] {
            min_x = min_x.min(px as i32);
            min_y = min_y.min(py as i32);
            max_x = max_x.max(px as i32);
            max_y = max_y.max(py as i32);
        }
    }

    // Viewport clip, done directly in screen coordinates (the SSE2
    // path does this with its qsum0 saturating add + qsum1 max).
    min_x = min_x.max(0);
    min_y = min_y.max(0);
    max_x = max_x.min(target.width as i32 - 1);
    max_y = max_y.min(target.height as i32 - 1);
    if min_x > max_x || min_y > max_y {
        return 0;
    }

    // Colour modulation, replicating the SSE2 byte arithmetic lane by
    // lane:
    //   word   = unpacklo_epi8(tail, col)   -> (col byte << 8) | tail byte
    //   result = mulhi_epu16(word, kv6colmul[dir]) + kv6coladd
    //   colour = packus_epi16(result)       -> signed saturate to u8
    let tail = mm5_tail.to_le_bytes();
    let voxel_col = v.col.to_le_bytes();
    let kvm = state.kv6colmul[v.dir as usize];
    let kva = state.kv6coladd;
    let mut color_bytes = [0u8; 4];
    for (lane, out) in color_bytes.iter_mut().enumerate() {
        let word = (u16::from(voxel_col[lane]) << 8) | u16::from(tail[lane]);
        let mul = ((kvm >> (lane * 16)) & 0xffff) as u16;
        let add = ((kva >> (lane * 16)) & 0xffff) as u16;
        let high = ((u32::from(word) * u32::from(mul)) >> 16) as u16;
        let lit = high.wrapping_add(add) as i16;
        *out = lit.clamp(0, 255) as u8;
    }
    let color = u32::from_le_bytes(color_bytes);
    *mm5_tail = color;

    // Z-tested fill of the inclusive rectangle
    // [min_x ..= max_x] × [min_y ..= max_y].
    // NOTE(review): the SSE2 path fills a max-exclusive rectangle
    // (`dx` columns, `dy + 1` rows with `dy = height - 1`), so this
    // path covers one extra column and row. Confirm whether that is
    // intended before aligning the two.
    let pitch = target.pitch_pixels;
    let mut written: u32 = 0;
    for y in min_y..=max_y {
        let row_base = y as usize * pitch;
        for x in min_x..=max_x {
            // SAFETY: the viewport clip above keeps (x, y) inside
            // [0, width) × [0, height), so the index stays below
            // pitch * height.
            let hit = unsafe { target.z_test_write(row_base + x as usize, color, depth) };
            if hit {
                written += 1;
            }
        }
    }
    written
}
1309
1310/// One iteration of voxlap's `DRAWBOUNDCUBELINE` macro
1311/// (voxlap5.c:8809-8812). Walks the voxel range `[range_start,
1312/// range_end)` (one (x, y) column's voxels) in three phases:
1313///
1314/// 1. Forward through voxels with `z < inz`, calling
1315/// `callback(voxel, base_mask | 0x20, r0)`.
1316/// 2. Backward through voxels with `z > inz`, calling
1317/// `callback(voxel, base_mask | 0x10, r0)`.
1318/// 3. If a single voxel remains with `z == inz`, call
1319/// `callback(voxel, base_mask | 0x00, r0)`.
1320///
1321/// Each (x, y) column is visited exactly once. `r0` is the screen-
1322/// space origin for *this* column — voxlap stores it as
1323/// `ztab4[MAXZSIZ]` and `drawboundcubesse` reads it via that index.
1324fn draw_boundcube_line<F: FnMut(&Voxel, u32, [f32; 4])>(
1325 voxels: &[Voxel],
1326 range_start: usize,
1327 range_end: usize,
1328 inz: i32,
1329 base_mask: u32,
1330 r0: [f32; 4],
1331 callback: &mut F,
1332) {
1333 if range_end <= range_start {
1334 return;
1335 }
1336 let mut v0 = range_start;
1337 let mut v1_excl = range_end;
1338
1339 // Phase 1: forward while voxels[v0].z < inz.
1340 while v0 < v1_excl && i32::from(voxels[v0].z) < inz {
1341 callback(&voxels[v0], base_mask | 0x20, r0);
1342 v0 += 1;
1343 }
1344 // Phase 2: backward while voxels[v1_excl - 1].z > inz.
1345 while v0 < v1_excl && i32::from(voxels[v1_excl - 1].z) > inz {
1346 callback(&voxels[v1_excl - 1], base_mask | 0x10, r0);
1347 v1_excl -= 1;
1348 }
1349 // Phase 3: single voxel left with z == inz.
1350 if v0 + 1 == v1_excl {
1351 callback(&voxels[v0], base_mask, r0);
1352 }
1353}
1354
1355/// 9-arm per-(x, y) column iteration walking the kv6's voxel
1356/// grid in painter's-back-to-front order around the camera-split
1357/// point (`inx`, `iny`, `inz`). Mirror of voxlap5.c:8982-9062.
1358///
1359/// Tracks `r1` (current x-column base) and `r0` (current (x, y)
1360/// origin) the same way voxlap mutates them with addps/subps,
1361/// passing `r0` to each per-voxel callback. `r0` evolves as
1362/// `r0[x][y] = r1_initial + x * cadd4[1] - y * cadd4[4]` (with
1363/// the floating-point operations applied in voxlap's order so the
1364/// per-step rounding matches bit-for-bit).
1365///
1366/// Each (x, y) column is visited exactly once.
1367#[allow(clippy::too_many_lines)]
1368pub(crate) fn kv6_iterate<F: FnMut(&Voxel, u32, [f32; 4])>(
1369 state: &Kv6FullState<'_>,
1370 mut callback: F,
1371) {
1372 let kv = state.iter.kv;
1373 let xsiz = kv.xsiz as i32;
1374 let ysiz = kv.ysiz as i32;
1375 let inx = state.iter.inx;
1376 let iny = state.iter.iny;
1377 let inz = state.iter.inz;
1378 let nxplanemin = state.iter.nxplanemin;
1379 let nxplanemax = state.iter.nxplanemax;
1380 let cadd1 = state.cadd4[1];
1381 let cadd_y = state.cadd4[4];
1382 let r2 = state.r2;
1383
1384 let mut xv: usize = 0;
1385 let mut r1 = state.r1_initial;
1386
1387 // First half: x = 0..inx. Top-half quadrants (masks 0xa, 0x6, 0x2).
1388 let mut x: i32 = 0;
1389 while x < inx {
1390 let xu = x as usize;
1391 let xlen = kv.xlen[xu] as usize;
1392 if x < nxplanemin || x >= nxplanemax {
1393 xv += xlen;
1394 r1 = vec4_add(r1, cadd1);
1395 x += 1;
1396 continue;
1397 }
1398 let yv_initial = xv + xlen;
1399 let mut r0 = r1; // movps r0, r1
1400
1401 // Forward y: 0..iny -> mask 0xa.
1402 let mut xv_local = xv;
1403 let mut y: i32 = 0;
1404 while y < iny {
1405 let yu = y as usize;
1406 let len = kv.ylen[xu][yu] as usize;
1407 let v0 = xv_local;
1408 xv_local += len;
1409 draw_boundcube_line(&kv.voxels, v0, xv_local, inz, 0xa, r0, &mut callback);
1410 r0 = vec4_sub(r0, cadd_y); // r0 -= cadd4[4]
1411 y += 1;
1412 }
1413
1414 // Setup for reverse y: r0 = r1 + r2 (= base + (-ysiz)*cadd4[4]),
1415 // then r1 += cadd4[1] for the next x column.
1416 let mut yv_local = yv_initial;
1417 r0 = vec4_add(r1, r2);
1418 r1 = vec4_add(r1, cadd1);
1419
1420 // Reverse y: ysiz-1..iny -> mask 0x6.
1421 let mut y = ysiz - 1;
1422 while y > iny {
1423 r0 = vec4_add(r0, cadd_y); // r0 += cadd4[4]
1424 let yu = y as usize;
1425 let len = kv.ylen[xu][yu] as usize;
1426 let v1_excl = yv_local;
1427 yv_local -= len;
1428 draw_boundcube_line(&kv.voxels, yv_local, v1_excl, inz, 0x6, r0, &mut callback);
1429 y -= 1;
1430 }
1431
1432 // Edge y == iny -> mask 0x2.
1433 if iny >= 0 && (iny as u32) < kv.ysiz {
1434 r0 = vec4_add(r0, cadd_y);
1435 let yu = iny as usize;
1436 let len = kv.ylen[xu][yu] as usize;
1437 let v1_excl = yv_local;
1438 yv_local -= len;
1439 draw_boundcube_line(&kv.voxels, yv_local, v1_excl, inz, 0x2, r0, &mut callback);
1440 }
1441
1442 xv += xlen;
1443 x += 1;
1444 }
1445
1446 // Setup for second half (voxlap5.c:9011): jump r1 to past-end.
1447 // r1 += (xsiz - x) * cadd4[1] with x = post-first-half value.
1448 let dx_remain = (xsiz - x) as f32;
1449 r1 = vec4_add(r1, vec4_scale(cadd1, dx_remain));
1450
1451 // Second half: x = xsiz-1..inx (reverse). Bot-half quadrants
1452 // (masks 0x5, 0x9, 0x1).
1453 let mut xv2: usize = kv.voxels.len();
1454 let mut x = xsiz - 1;
1455 while x > inx {
1456 let xu = x as usize;
1457 let xlen = kv.xlen[xu] as usize;
1458 if x < nxplanemin || x >= nxplanemax {
1459 xv2 -= xlen;
1460 r1 = vec4_sub(r1, cadd1);
1461 x -= 1;
1462 continue;
1463 }
1464 let yv_initial = xv2 - xlen;
1465 // Voxlap order: r1 -= cadd1 first, then r0 = r1 + r2.
1466 r1 = vec4_sub(r1, cadd1);
1467 let mut r0 = vec4_add(r1, r2);
1468
1469 // Reverse y: ysiz-1..iny -> mask 0x5.
1470 let mut xv_local = xv2;
1471 let mut y = ysiz - 1;
1472 while y > iny {
1473 r0 = vec4_add(r0, cadd_y);
1474 let yu = y as usize;
1475 let len = kv.ylen[xu][yu] as usize;
1476 let v1_excl = xv_local;
1477 xv_local -= len;
1478 draw_boundcube_line(&kv.voxels, xv_local, v1_excl, inz, 0x5, r0, &mut callback);
1479 y -= 1;
1480 }
1481
1482 // After reverse y: r0 = r1 (movps r0, r1).
1483 let mut yv_local = yv_initial;
1484 r0 = r1;
1485
1486 // Forward y: 0..iny -> mask 0x9.
1487 let mut y: i32 = 0;
1488 while y < iny {
1489 let yu = y as usize;
1490 let len = kv.ylen[xu][yu] as usize;
1491 let v0 = yv_local;
1492 yv_local += len;
1493 draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x9, r0, &mut callback);
1494 r0 = vec4_sub(r0, cadd_y);
1495 y += 1;
1496 }
1497
1498 // Edge y == iny -> mask 0x1.
1499 if iny >= 0 && (iny as u32) < kv.ysiz {
1500 let yu = iny as usize;
1501 let len = kv.ylen[xu][yu] as usize;
1502 let v0 = yv_local;
1503 yv_local += len;
1504 draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x1, r0, &mut callback);
1505 }
1506
1507 xv2 -= xlen;
1508 x -= 1;
1509 }
1510
1511 // Edge x == inx (middle column). Masks 0x4, 0x8, 0x0.
1512 if inx >= 0 && (inx as u32) < kv.xsiz {
1513 let xu = inx as usize;
1514 if inx < nxplanemin || inx >= nxplanemax {
1515 return;
1516 }
1517 let xlen = kv.xlen[xu] as usize;
1518 let yv_initial = xv2 - xlen;
1519 r1 = vec4_sub(r1, cadd1);
1520 let mut r0 = vec4_add(r1, r2);
1521
1522 // Reverse y -> mask 0x4.
1523 let mut xv_local = xv2;
1524 let mut y = ysiz - 1;
1525 while y > iny {
1526 r0 = vec4_add(r0, cadd_y);
1527 let yu = y as usize;
1528 let len = kv.ylen[xu][yu] as usize;
1529 let v1_excl = xv_local;
1530 xv_local -= len;
1531 draw_boundcube_line(&kv.voxels, xv_local, v1_excl, inz, 0x4, r0, &mut callback);
1532 y -= 1;
1533 }
1534
1535 // After reverse y: r0 = r1.
1536 let mut yv_local = yv_initial;
1537 r0 = r1;
1538
1539 // Forward y -> mask 0x8.
1540 let mut y: i32 = 0;
1541 while y < iny {
1542 let yu = y as usize;
1543 let len = kv.ylen[xu][yu] as usize;
1544 let v0 = yv_local;
1545 yv_local += len;
1546 draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x8, r0, &mut callback);
1547 r0 = vec4_sub(r0, cadd_y);
1548 y += 1;
1549 }
1550
1551 // Edge y == iny -> mask 0x0.
1552 if iny >= 0 && (iny as u32) < kv.ysiz {
1553 let yu = iny as usize;
1554 let len = kv.ylen[xu][yu] as usize;
1555 let v0 = yv_local;
1556 yv_local += len;
1557 draw_boundcube_line(&kv.voxels, v0, yv_local, inz, 0x0, r0, &mut callback);
1558 }
1559 }
1560}
1561
1562/// Draw a sprite into a framebuffer + z-buffer.
1563///
1564/// Top-level dispatcher mirroring voxlap5.c:9818-9828:
1565/// - Skips on `flags & INVISIBLE`.
1566/// - Skips on `flags & KFA` (animation path; out of scope for R6).
1567/// - Skips on `flags & NO_Z` (handled by `drawboundcubenozsse`,
1568/// not yet ported — the four oracle sprite poses all use z-tested
1569/// rendering).
1570///
1571/// Otherwise: cull → setup math → 9-arm per-voxel iteration →
1572/// per-voxel rasterize via the R6.4 `drawboundcubesse` port.
1573///
1574/// Returns the total number of pixels written across all voxels of
1575/// the sprite (== sum of z-test passes). Zero means the sprite
1576/// produced no visible pixels (culled, fully behind near plane, or
1577/// totally occluded).
1578/// Render a batch of sprites in parallel via `rayon::par_iter`.
1579///
1580/// Each sprite runs its own [`draw_sprite`] pass on its own thread,
1581/// writing to the shared [`DrawTarget`] (raw pointers;
1582/// `Copy + Send + Sync`) under the z-test arbitration contract: a
1583/// pixel write only fires when the new sprite's z is strictly less
1584/// than the current zbuffer value. For non-overlapping sprites the
1585/// writes are pairwise-disjoint and the output is byte-identical
1586/// to a sequential pass over the same sprite list. For overlapping
1587/// pixels, two sprites at exactly tied z-values produce a
1588/// non-deterministic last-writer-wins outcome — visually
1589/// indistinguishable but hash-non-deterministic.
1590///
1591/// Returns the sum of `draw_sprite` return values (total pixels
1592/// written across all sprites).
1593///
1594/// `RAYON_NUM_THREADS=1` (or no parallelism worth) ⇒ effectively
1595/// sequential; rayon falls back to running each closure on the
1596/// calling thread without contention.
1597///
1598/// Use this for engine scenes with dozens-to-hundreds of sprites;
1599/// the per-sprite overhead amortises well past ~4 sprites on
1600/// consumer-class hardware.
1601#[allow(clippy::module_name_repetitions)]
1602#[must_use]
1603pub fn draw_sprites_parallel(
1604 target: DrawTarget<'_>,
1605 cam: &CameraState,
1606 settings: &OpticastSettings,
1607 lighting: &SpriteLighting<'_>,
1608 sprites: &[Sprite],
1609) -> u32 {
1610 use rayon::prelude::*;
1611
1612 let render_one = |sprite: &Sprite| {
1613 // `target` is `Copy`, so each closure captures its own
1614 // copy of the (raw fb / zb pointer) view. `cam`,
1615 // `settings`, `lighting` are `&` borrows — Sync.
1616 let mut t = target;
1617 draw_sprite(&mut t, cam, settings, lighting, sprite)
1618 };
1619
1620 sprites.par_iter().map(render_one).sum()
1621}
1622
1623pub fn draw_sprite(
1624 target: &mut DrawTarget<'_>,
1625 cam: &CameraState,
1626 settings: &OpticastSettings,
1627 lighting: &SpriteLighting<'_>,
1628 sprite: &Sprite,
1629) -> u32 {
1630 if sprite.flags & SPRITE_FLAG_INVISIBLE != 0 {
1631 return 0;
1632 }
1633 if sprite.flags & SPRITE_FLAG_KFA != 0 {
1634 return 0;
1635 }
1636 if sprite.flags & SPRITE_FLAG_NO_Z != 0 {
1637 // drawboundcubenozsse port deferred; oracle doesn't exercise it.
1638 return 0;
1639 }
1640 let Some(setup) = kv6_draw_prepare(sprite, cam) else {
1641 return 0;
1642 };
1643 let state = kv6_compute_full_state(
1644 &setup,
1645 sprite,
1646 lighting,
1647 cam,
1648 settings,
1649 target.width,
1650 target.height,
1651 target.pitch_pixels,
1652 );
1653 let mut mm5_tail: u32 = 0;
1654 let mut total_written: u32 = 0;
1655 kv6_iterate(&state, |voxel, mask, r0| {
1656 total_written += drawboundcubesse(voxel, mask, &state, r0, &mut mm5_tail, target);
1657 });
1658 total_written
1659}
1660
1661#[cfg(test)]
1662mod tests {
1663 use super::*;
1664 use crate::camera_math;
1665 use crate::Camera;
1666 use roxlap_formats::kv6::Kv6;
1667
1668 fn empty_kv6() -> Kv6 {
1669 Kv6 {
1670 xsiz: 1,
1671 ysiz: 1,
1672 zsiz: 1,
1673 xpiv: 0.5,
1674 ypiv: 0.5,
1675 zpiv: 0.5,
1676 voxels: Vec::new(),
1677 xlen: vec![0],
1678 ylen: vec![vec![0]],
1679 palette: None,
1680 }
1681 }
1682
1683 /// 17×17×17 kv6 with pivot at the centre — same dimensions as
1684 /// the meltsphere oracle sprite so the cull test exercises a
1685 /// realistic bound cube rather than a 1-voxel point.
1686 fn cube_kv6() -> Kv6 {
1687 Kv6 {
1688 xsiz: 17,
1689 ysiz: 17,
1690 zsiz: 17,
1691 xpiv: 8.5,
1692 ypiv: 8.5,
1693 zpiv: 8.5,
1694 voxels: Vec::new(),
1695 xlen: vec![0; 17],
1696 ylen: vec![vec![0; 17]; 17],
1697 palette: None,
1698 }
1699 }
1700
1701 /// `CameraState` matching the oracle's `sprite_front` pose:
1702 /// pos=(1020,1050,175), yaw=0, pitch=0 → forward = +x.
1703 fn oracle_sprite_front_camera() -> camera_math::CameraState {
1704 let camera = Camera {
1705 pos: [1020.0, 1050.0, 175.0],
1706 // From oracle.c set_camera_yaw_pitch with yaw=0, pitch=0:
1707 // ifor = [1, 0, 0], istr = [0, 1, 0], ihei = [0, 0, 1].
1708 right: [0.0, 1.0, 0.0],
1709 down: [0.0, 0.0, 1.0],
1710 forward: [1.0, 0.0, 0.0],
1711 };
1712 camera_math::derive(&camera, 640, 480, 320.0, 240.0, 320.0)
1713 }
1714
1715 fn oracle_settings() -> OpticastSettings {
1716 OpticastSettings::for_oracle_framebuffer(640, 480)
1717 }
1718
1719 /// Test-only ergonomic shim: build a Kv6FullState with the
1720 /// oracle 640×480 framebuffer geometry. Mirrors the
1721 /// pre-R6.4 signature so tests don't have to spell out
1722 /// width/height/pitch every time.
1723 fn compute_state_for_test<'a>(
1724 setup: &Kv6DrawSetup<'a>,
1725 sprite: &Sprite,
1726 cam: &camera_math::CameraState,
1727 ) -> Kv6FullState<'a> {
1728 let lighting = SpriteLighting::default_oracle();
1729 kv6_compute_full_state(
1730 setup,
1731 sprite,
1732 &lighting,
1733 cam,
1734 &oracle_settings(),
1735 640,
1736 480,
1737 640,
1738 )
1739 }
1740
1741 /// Allocate a 640×480 framebuffer + zbuffer (zbuffer pre-filled
1742 /// with f32::INFINITY so any voxel passes the z-test on first
1743 /// write).
1744 fn alloc_target() -> (Vec<u32>, Vec<f32>) {
1745 let pixels = 640usize * 480usize;
1746 (vec![0u32; pixels], vec![f32::INFINITY; pixels])
1747 }
1748
1749 fn make_target<'a>(fb: &'a mut [u32], zb: &'a mut [f32]) -> DrawTarget<'a> {
1750 DrawTarget::new(fb, zb, 640, 640, 480)
1751 }
1752
1753 /// Bit-pattern compare for two `[f32; 4]` vectors. The setup
1754 /// math produces these via deterministic IEEE-754 ops, so
1755 /// bit-equality is well-defined and dodges `clippy::float_cmp`.
1756 fn bits4(a: [f32; 4]) -> [u32; 4] {
1757 a.map(f32::to_bits)
1758 }
1759
    /// Raw bytes of the meltsphere sprite dumped from the C oracle,
    /// embedded at compile time and shared by every kv6-load test in
    /// this module. It lives at module scope (rather than as a local
    /// `const` inside each test) so clippy's `items_after_statements`
    /// lint stays quiet.
    const SPRITE_MELTSPHERE_KV6: &[u8] = include_bytes!("../tests/fixtures/sprite_meltsphere.kv6");
1764
1765 #[test]
1766 fn axis_aligned_sets_identity_basis() {
1767 // Compare bit patterns: these are integer-valued floats so
1768 // bit-equality is well-defined and dodges clippy::float_cmp.
1769 let bits = |a: [f32; 3]| a.map(f32::to_bits);
1770 let s = Sprite::axis_aligned(empty_kv6(), [10.0, 20.0, 30.0]);
1771 assert_eq!(bits(s.p), bits([10.0, 20.0, 30.0]));
1772 assert_eq!(bits(s.s), bits([1.0, 0.0, 0.0]));
1773 assert_eq!(bits(s.h), bits([0.0, 1.0, 0.0]));
1774 assert_eq!(bits(s.f), bits([0.0, 0.0, 1.0]));
1775 assert_eq!(s.flags, 0);
1776 }
1777
1778 #[test]
1779 fn invisible_flag_skips_dispatch() {
1780 let cam = oracle_sprite_front_camera();
1781 let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1782 s.flags = SPRITE_FLAG_INVISIBLE;
1783 let (mut fb, mut zb) = alloc_target();
1784 let mut target = make_target(&mut fb, &mut zb);
1785 let lighting = SpriteLighting::default_oracle();
1786 assert_eq!(
1787 draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
1788 0
1789 );
1790 }
1791
1792 #[test]
1793 fn kfa_flag_skips_dispatch() {
1794 let cam = oracle_sprite_front_camera();
1795 let mut s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1796 s.flags = SPRITE_FLAG_KFA;
1797 let (mut fb, mut zb) = alloc_target();
1798 let mut target = make_target(&mut fb, &mut zb);
1799 let lighting = SpriteLighting::default_oracle();
1800 assert_eq!(
1801 draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
1802 0
1803 );
1804 }
1805
1806 #[test]
1807 fn cull_keeps_oracle_sprite_in_front_of_camera() {
1808 // Oracle's `sprite_front` pose: camera at (1020,1050,175)
1809 // looking +x; sprite at (1050,1050,175). Sprite is 30
1810 // units forward, on-axis — clearly inside the frustum.
1811 let cam = oracle_sprite_front_camera();
1812 let s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0, 175.0]);
1813 assert!(
1814 kv6_draw_prepare(&s, &cam).is_some(),
1815 "front-of-camera sprite must NOT be culled"
1816 );
1817 }
1818
1819 #[test]
1820 fn cull_removes_sprite_far_behind_camera() {
1821 // Same camera; sprite far in the -forward direction
1822 // (= behind the camera).
1823 let cam = oracle_sprite_front_camera();
1824 let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
1825 assert!(
1826 kv6_draw_prepare(&s, &cam).is_none(),
1827 "behind-camera sprite must be culled"
1828 );
1829 }
1830
1831 #[test]
1832 fn cull_removes_sprite_far_to_the_right() {
1833 // Camera looks +x; sprite far in the +y direction (right
1834 // axis), far enough that the bound cube is fully outside
1835 // the right-edge frustum plane.
1836 let cam = oracle_sprite_front_camera();
1837 // 30 units forward, 200 units right — well outside the 90°
1838 // FOV's right edge.
1839 let s = Sprite::axis_aligned(cube_kv6(), [1050.0, 1050.0 + 200.0, 175.0]);
1840 assert!(
1841 kv6_draw_prepare(&s, &cam).is_none(),
1842 "far-right sprite must be culled"
1843 );
1844 }
1845
1846 #[test]
1847 fn cull_keeps_sprite_at_camera_position() {
1848 // Sprite centred on the camera — bound cube straddles the
1849 // camera, so by definition it's not fully outside any
1850 // frustum plane and must NOT be culled.
1851 let cam = oracle_sprite_front_camera();
1852 let s = Sprite::axis_aligned(cube_kv6(), cam.pos);
1853 assert!(
1854 kv6_draw_prepare(&s, &cam).is_some(),
1855 "sprite at camera position must not be culled"
1856 );
1857 }
1858
1859 #[test]
1860 fn iterate_visits_each_voxel_exactly_once() {
1861 // Build a synthetic 3×3×3 kv6 with one voxel per (x, y)
1862 // column at z = x + y mod 3. Then iterate and check
1863 // (a) total callback fires == 27 = numvoxs, and (b) every
1864 // voxel index 0..27 was visited exactly once.
1865 let xsiz: u32 = 3;
1866 let ysiz: u32 = 3;
1867 let zsiz: u32 = 3;
1868 let mut voxels = Vec::new();
1869 let mut xlen = vec![0u32; xsiz as usize];
1870 let mut ylen = vec![vec![0u16; ysiz as usize]; xsiz as usize];
1871 for x in 0..xsiz {
1872 for y in 0..ysiz {
1873 let z = ((x + y) % 3) as u16;
1874 voxels.push(Voxel {
1875 col: 0x0080_0000,
1876 z,
1877 vis: 63,
1878 dir: 0,
1879 });
1880 xlen[x as usize] += 1;
1881 ylen[x as usize][y as usize] = 1;
1882 }
1883 }
1884 let kv = Kv6 {
1885 xsiz,
1886 ysiz,
1887 zsiz,
1888 xpiv: 1.5,
1889 ypiv: 1.5,
1890 zpiv: 1.5,
1891 voxels,
1892 xlen,
1893 ylen,
1894 palette: None,
1895 };
1896 let setup = Kv6DrawSetup {
1897 kv: &kv,
1898 ts: [1.0, 0.0, 0.0],
1899 th: [0.0, 1.0, 0.0],
1900 tf: [0.0, 0.0, 1.0],
1901 mip: 0,
1902 };
1903 let cam = oracle_sprite_front_camera();
1904 let synth_sprite = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
1905 let state = compute_state_for_test(&setup, &synth_sprite, &cam);
1906
1907 // Every voxel index must fire exactly once. We use a
1908 // by-pointer identity check via .as_ptr() offsets.
1909 let voxels_ptr = kv.voxels.as_ptr();
1910 let mut visited = vec![0u32; kv.voxels.len()];
1911 let mut total: u32 = 0;
1912 kv6_iterate(&state, |v, _mask, _r0| {
1913 // SAFETY: callback receives a borrow of an entry of
1914 // `kv.voxels`; computing the offset is well-defined.
1915 let idx = unsafe { std::ptr::from_ref::<Voxel>(v).offset_from(voxels_ptr) } as usize;
1916 visited[idx] += 1;
1917 total += 1;
1918 });
1919 assert_eq!(total as usize, kv.voxels.len(), "total callback fires");
1920 for (i, &n) in visited.iter().enumerate() {
1921 assert_eq!(n, 1, "voxel {i} visited {n} times (want 1)");
1922 }
1923 }
1924
1925 #[test]
1926 fn iterate_meltsphere_oracle_visits_each_voxel_once() {
1927 // Load the dumped voxlap-C meltsphere fixture (R6.0e) and
1928 // run the iteration against the oracle's sprite_front
1929 // camera + sprite pose. Expected: every voxel hit exactly
1930 // once, total fires == kv.voxels.len() (= 401).
1931 let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
1932 assert_eq!(kv.voxels.len(), 401, "fixture voxel count");
1933
1934 let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
1935 let cam = oracle_sprite_front_camera();
1936 let setup = kv6_draw_prepare(&sprite, &cam).expect("oracle sprite must pass cull");
1937 let state = compute_state_for_test(&setup, &sprite, &cam);
1938
1939 let voxels_ptr = sprite.kv6.voxels.as_ptr();
1940 let mut visited = vec![0u32; sprite.kv6.voxels.len()];
1941 let mut total: u32 = 0;
1942 kv6_iterate(&state, |v, _mask, _r0| {
1943 let idx = unsafe { std::ptr::from_ref::<Voxel>(v).offset_from(voxels_ptr) } as usize;
1944 visited[idx] += 1;
1945 total += 1;
1946 });
1947 assert_eq!(total, 401);
1948 let max = visited.iter().copied().max().unwrap();
1949 let min = visited.iter().copied().min().unwrap();
1950 assert_eq!(max, 1, "no voxel may be visited twice");
1951 assert_eq!(min, 1, "no voxel may be skipped");
1952 }
1953
1954 #[test]
1955 fn full_state_basic_invariants() {
1956 // For the oracle sprite_front pose, sanity-check the setup
1957 // values: ztab4_per_z[0] is zero, ztab4_per_z[k] - ztab4_per_z[k-1]
1958 // equals cadd4[2], cadd4[3] = cadd4[1] + cadd4[2], cadd4[7] is
1959 // the 7-bit-OR sum, and r1_initial = (npos*gihz with z2=npos.z)
1960 // - cadd4[4].
1961 let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
1962 let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
1963 let cam = oracle_sprite_front_camera();
1964 let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
1965 let state = compute_state_for_test(&setup, &sprite, &cam);
1966
1967 // ztab4_per_z[0] = [0; 4].
1968 assert_eq!(bits4(state.ztab4_per_z[0]), bits4([0.0; 4]));
1969
1970 // For each subsequent z, ztab4_per_z[z] = ztab4_per_z[z-1] + cadd4[2].
1971 for z in 1..state.ztab4_per_z.len() {
1972 let want = vec4_add(state.ztab4_per_z[z - 1], state.cadd4[2]);
1973 assert_eq!(bits4(state.ztab4_per_z[z]), bits4(want), "ztab4_per_z[{z}]");
1974 }
1975
1976 // cadd4[3] = cadd4[1] + cadd4[2]; cadd4[5] = cadd4[1] + cadd4[4];
1977 // cadd4[6] = cadd4[2] + cadd4[4]; cadd4[7] = cadd4[3] + cadd4[4].
1978 assert_eq!(
1979 bits4(state.cadd4[3]),
1980 bits4(vec4_add(state.cadd4[1], state.cadd4[2]))
1981 );
1982 assert_eq!(
1983 bits4(state.cadd4[5]),
1984 bits4(vec4_add(state.cadd4[1], state.cadd4[4]))
1985 );
1986 assert_eq!(
1987 bits4(state.cadd4[6]),
1988 bits4(vec4_add(state.cadd4[2], state.cadd4[4]))
1989 );
1990 assert_eq!(
1991 bits4(state.cadd4[7]),
1992 bits4(vec4_add(state.cadd4[3], state.cadd4[4]))
1993 );
1994 assert_eq!(bits4(state.cadd4[0]), bits4([0.0; 4]));
1995
1996 // r2 = -ysiz * cadd4[4].
1997 let want_r2 = vec4_scale(state.cadd4[4], -(state.iter.kv.ysiz as f32));
1998 assert_eq!(bits4(state.r2), bits4(want_r2));
1999 }
2000
2001 #[test]
2002 fn drawboundcubesse_culls_invisible_face_mask() {
2003 // Synthetic voxel with vis=0 must short-circuit the
2004 // early-out and not consume the scissor branch.
2005 let v = Voxel {
2006 col: 0,
2007 z: 0,
2008 vis: 0,
2009 dir: 0,
2010 };
2011 let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
2012 let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
2013 let cam = oracle_sprite_front_camera();
2014 let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
2015 let state = compute_state_for_test(&setup, &sprite, &cam);
2016 let (mut fb, mut zb) = alloc_target();
2017 let mut target = make_target(&mut fb, &mut zb);
2018 let mut tail = 0u32;
2019 assert_eq!(
2020 drawboundcubesse(
2021 &v,
2022 0xff,
2023 &state,
2024 [0.0, 0.0, 100.0, 100.0],
2025 &mut tail,
2026 &mut target,
2027 ),
2028 0
2029 );
2030 }
2031
2032 #[test]
2033 fn drawboundcubesse_culls_voxel_behind_near_plane() {
2034 // Force scisdist > 0 by passing an r0 with very small
2035 // origin.z. Only triggers if scisdist > origin.z; for the
2036 // oracle sprite_front pose `scisdist` is some small
2037 // positive number (sum of any negative post-swap basis-z
2038 // components), so a r0 with z = -1 will cull.
2039 let v = Voxel {
2040 col: 0xff,
2041 z: 0,
2042 vis: 0xff,
2043 dir: 0,
2044 };
2045 let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
2046 let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
2047 let cam = oracle_sprite_front_camera();
2048 let setup = kv6_draw_prepare(&sprite, &cam).expect("cull pass");
2049 let state = compute_state_for_test(&setup, &sprite, &cam);
2050 // r0.z = -1000 makes origin.z = -1000 + ztab4_per_z[0].z = -1000.
2051 // scisdist >= 0; -1000 < scisdist → cull.
2052 let r0 = [0.0, 0.0, -1000.0, -1000.0];
2053 let (mut fb, mut zb) = alloc_target();
2054 let mut target = make_target(&mut fb, &mut zb);
2055 let mut tail = 0u32;
2056 assert_eq!(
2057 drawboundcubesse(&v, 0xff, &state, r0, &mut tail, &mut target),
2058 0
2059 );
2060 }
2061
2062 #[test]
2063 fn iterate_no_voxels_when_culled() {
2064 // Sprite far behind camera → cull. draw_sprite never
2065 // reaches kv6_iterate, so no callback fires.
2066 let cam = oracle_sprite_front_camera();
2067 let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
2068 // Cull catches it before iteration.
2069 assert!(kv6_draw_prepare(&s, &cam).is_none());
2070 }
2071
2072 #[test]
2073 fn draw_sprite_writes_pixels_for_oracle_meltsphere() {
2074 // R6.4 end-to-end: load the meltsphere fixture, run
2075 // draw_sprite at the sprite_front pose. Expect a non-zero
2076 // pixel count and at least one non-zero framebuffer entry.
2077 let kv = roxlap_formats::kv6::parse(SPRITE_MELTSPHERE_KV6).expect("parse fixture");
2078 let sprite = Sprite::axis_aligned(kv, [1050.0, 1050.0, 175.0]);
2079 let cam = oracle_sprite_front_camera();
2080 let (mut fb, mut zb) = alloc_target();
2081 let mut target = make_target(&mut fb, &mut zb);
2082 let lighting = SpriteLighting::default_oracle();
2083 let written = draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &sprite);
2084 assert!(written > 0, "expected some pixels to be written");
2085 assert!(
2086 fb.iter().any(|&p| p != 0),
2087 "expected at least one non-zero framebuffer entry"
2088 );
2089 // Z-buffer must have shrunk somewhere from f32::INFINITY.
2090 assert!(
2091 zb.iter().any(|&z| z.is_finite()),
2092 "expected at least one finite zbuffer entry"
2093 );
2094 }
2095
2096 #[test]
2097 fn draw_sprite_returns_zero_for_culled_sprite() {
2098 let cam = oracle_sprite_front_camera();
2099 let s = Sprite::axis_aligned(cube_kv6(), [1020.0 - 500.0, 1050.0, 175.0]);
2100 let (mut fb, mut zb) = alloc_target();
2101 let mut target = make_target(&mut fb, &mut zb);
2102 let lighting = SpriteLighting::default_oracle();
2103 assert_eq!(
2104 draw_sprite(&mut target, &cam, &oracle_settings(), &lighting, &s),
2105 0
2106 );
2107 assert!(fb.iter().all(|&p| p == 0));
2108 }
2109
2110 /// `update_reflects` for the oracle sprite_front pose hits the
2111 /// nolighta path (R==G==B kv6col, no fog, lightmode<2). All
2112 /// kv6colmul[k] entries must repeat one u16 modulation factor
2113 /// across all 4 lanes.
2114 #[test]
2115 fn update_reflects_nolighta_lanes_match() {
2116 let s = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
2117 let lighting = SpriteLighting::default_oracle();
2118 let (cm, ca) = update_reflects(&s, &lighting);
2119 assert_eq!(ca, 0, "kv6coladd must be zero (no fog)");
2120 for (k, e) in cm.iter().enumerate() {
2121 let l0 = (e & 0xffff) as u16;
2122 let l1 = ((e >> 16) & 0xffff) as u16;
2123 let l2 = ((e >> 32) & 0xffff) as u16;
2124 let l3 = ((e >> 48) & 0xffff) as u16;
2125 assert_eq!(l0, l1, "kv6colmul[{k}] lane0 != lane1");
2126 assert_eq!(l0, l2, "kv6colmul[{k}] lane0 != lane2");
2127 assert_eq!(l0, l3, "kv6colmul[{k}] lane0 != lane3");
2128 }
2129 }
2130
2131 /// Non-grey kv6col forces the nolightb path. Lanes 0..3 of each
2132 /// `kv6colmul[k]` come from per-channel modulators built from
2133 /// the kv6col bytes — they should NOT all match unless the
2134 /// channels themselves match.
2135 #[test]
2136 fn update_reflects_nolightb_lanes_diverge_for_tinted_kv6col() {
2137 let s = Sprite::axis_aligned(empty_kv6(), [1050.0, 1050.0, 175.0]);
2138 let lighting = SpriteLighting {
2139 kv6col: 0x0040_8040, // R != G != B
2140 lightmode: 0,
2141 lights: &[],
2142 };
2143 let (cm, _) = update_reflects(&s, &lighting);
2144 // Find any direction where the dot is non-zero (most are
2145 // non-zero); that direction's lanes must vary by channel.
2146 let mut saw_divergence = false;
2147 for e in cm.iter() {
2148 let l0 = (e & 0xffff) as u16;
2149 let l1 = ((e >> 16) & 0xffff) as u16;
2150 let l2 = ((e >> 32) & 0xffff) as u16;
2151 if l0 != l1 || l0 != l2 {
2152 saw_divergence = true;
2153 break;
2154 }
2155 }
2156 assert!(
2157 saw_divergence,
2158 "non-grey kv6col must produce per-channel divergence in some kv6colmul slot"
2159 );
2160 }
2161
2162 /// Lightmode-2 with one point light + grey kv6col still
2163 /// produces R==G==B lanes (because the per-channel modulators
2164 /// are all 0x80<<8 = 0x8000). It must produce a non-uniform
2165 /// kv6colmul (some directions face the light, others away),
2166 /// which differs from lightmode<2 where every direction has the
2167 /// same dot magnitude regardless of position.
2168 #[test]
2169 fn update_reflects_lightmode2_produces_directional_shading() {
2170 let s = Sprite::axis_aligned(empty_kv6(), [100.0, 100.0, 100.0]);
2171 let lights = [LightSrc {
2172 pos: [110.0, 100.0, 100.0],
2173 r2: 100.0,
2174 sc: 16.0,
2175 }];
2176 let lighting = SpriteLighting {
2177 kv6col: DEFAULT_KV6COL,
2178 lightmode: 2,
2179 lights: &lights,
2180 };
2181 let (cm, _) = update_reflects(&s, &lighting);
2182 // Some directions must darken (shadow side) while others
2183 // brighten (light side) — the spread between min and max
2184 // tells us shading is happening.
2185 let mut min_w = u16::MAX;
2186 let mut max_w = 0u16;
2187 for e in cm.iter() {
2188 let l0 = (e & 0xffff) as u16;
2189 min_w = min_w.min(l0);
2190 max_w = max_w.max(l0);
2191 }
2192 assert!(
2193 max_w > min_w + 16,
2194 "lightmode-2 should produce directional shading: min={min_w} max={max_w}"
2195 );
2196 }
2197
2198 /// Lightmode-2 with no lights → ambient-only. Should still
2199 /// produce some non-zero kv6colmul (the synthetic ambient slot
2200 /// is non-trivial).
2201 #[test]
2202 fn update_reflects_lightmode2_no_lights_falls_back_to_ambient() {
2203 let s = Sprite::axis_aligned(empty_kv6(), [100.0, 100.0, 100.0]);
2204 let lighting = SpriteLighting {
2205 kv6col: DEFAULT_KV6COL,
2206 lightmode: 2,
2207 lights: &[],
2208 };
2209 let (cm, _) = update_reflects(&s, &lighting);
2210 let any_nonzero = cm.iter().any(|&e| e != 0);
2211 assert!(
2212 any_nonzero,
2213 "lightmode-2 with no lights should still emit ambient shading"
2214 );
2215 }
2216}