Skip to main content

roxlap_core/
world_lighting.rs

1//! Voxlap's world-voxel lighting bake (`updatelighting`,
2//! voxlap5.c:10539).
3//!
4//! Walks every visible voxel inside a 3D bounding box and rewrites
5//! its alpha byte (the per-voxel "brightness" channel that the
6//! rendering path mulhi'es against `kv6colmul`-style modulators)
7//! based on the engine's current `LightSrc` set + lightmode.
8//!
9//! Two modes:
10//! - `lightmode == 1`: cheap directional bake — every voxel gets
11//!   shading from a single hardcoded sun direction
12//!   `(tp.y * 0.5 + tp.z) * 64 + 103.5` clamped to `[0, 255]`.
13//! - `lightmode == 2`: per-light Lambertian bake — for each light
14//!   in range, subtract `g * h * sc` where `g = 1/(d·d²) -
15//!   1/(r·r²)` (cube-falloff with hard cutoff at radius `r`),
16//!   `h = surface_normal · light_delta` (negative ⇒ face front-
17//!   lit, contributes; positive ⇒ self-shadowed, skipped). Result
18//!   subtracts from a base `(tp.y * 0.5 + tp.z) * 16 + 47.5`.
19//!
20//! The surface normal `tp` for each voxel comes from `estnorm` —
21//! a 5×5×5 voxel-solid neighbourhood vote (`ESTNORMRAD == 2` in
22//! voxlap, the production path).
23
24#![allow(
25    clippy::cast_possible_truncation,
26    clippy::cast_possible_wrap,
27    clippy::cast_sign_loss,
28    clippy::cast_precision_loss,
29    clippy::similar_names,
30    clippy::too_many_arguments,
31    clippy::too_many_lines,
32    clippy::doc_markdown,
33    clippy::many_single_char_names,
34    clippy::must_use_candidate,
35    clippy::unnecessary_cast,
36    clippy::cast_lossless,
37    clippy::needless_bool_assign,
38    clippy::needless_range_loop,
39    clippy::no_effect,
40    clippy::identity_op,
41    clippy::if_not_else
42)]
43
44use rayon::prelude::*;
45
46use crate::engine::LightSrc;
47
48/// Voxlap's `MAXZDIM` (`voxlap5.c`). World z runs `0..MAXZDIM`.
49pub(crate) const MAXZDIM: i32 = 256;
50
51/// Voxlap's `ESTNORMRAD == 2` cache window radius. The estnorm
52/// neighbourhood is `(2*RAD+1)³ = 5×5×5` voxels.
53pub(crate) const ESTNORMRAD: i32 = 2;
54
55/// Per-byte popcount table. Voxlap's `bitnum[32]` (voxlap5.c:1477)
56/// — number of set bits in the low 5 bits of each index. Used by
57/// estnorm's neighbourhood-vote reduction.
58pub(crate) const BITNUM: [i8; 32] = [
59    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
60];
61
62/// Per-byte signed-symmetric popcount. Voxlap's `bitsnum[32]`
63/// (voxlap5.c:1487) — packs `popcount` into the low i16 lane and
64/// `popcount - 2·popcount_negative_axis` into the high i16 lane.
65/// The exact derivation is in voxlap's comment block; values
66/// reproduced verbatim.
67#[rustfmt::skip]
68pub(crate) const BITSNUM: [i32; 32] = [
69    0,           1 - (2 << 16), 1 - (1 << 16), 2 - (3 << 16),
70    1,           2 - (2 << 16), 2 - (1 << 16), 3 - (3 << 16),
71    1 + (1 << 16), 2 - (1 << 16), 2,           3 - (2 << 16),
72    2 + (1 << 16), 3 - (1 << 16), 3,           4 - (2 << 16),
73    1 + (2 << 16), 2,           2 + (1 << 16), 3 - (1 << 16),
74    2 + (2 << 16), 3,           3 + (1 << 16), 4 - (1 << 16),
75    2 + (3 << 16), 3 + (1 << 16), 3 + (2 << 16), 4,
76    3 + (3 << 16), 4 + (1 << 16), 4 + (2 << 16), 5,
77];
78
79/// `xbsflor[k] = -1i32 << k` — bits `k..31` set, low `k` bits
80/// clear. Used by `expandbit256` to splat air→solid transitions
81/// onto a partial 32-bit word.
82pub(crate) const fn xbsflor(k: usize) -> u32 {
83    if k >= 32 {
84        0
85    } else {
86        (-1i32 << k) as u32
87    }
88}
89
90/// `xbsceil[k] = ~xbsflor[k]` — low `k` bits set. Solid→air
91/// transitions.
92pub(crate) const fn xbsceil(k: usize) -> u32 {
93    !xbsflor(k)
94}
95
96/// `expandbit256` — slab structure → 256-bit "voxel solid" bit
97/// array (low-bit-first, low-z-first). Mirror of voxlap5.c:1059.
98///
99/// The output `bits` is a `[u32; 8]` (= 256 bits = `MAXZDIM` z
100/// levels). Bit `z` is set iff voxel at column `(x, y)`, depth `z`
101/// is solid (= part of any slab body, including hidden interiors
102/// between slabs).
103///
104/// Walks the slab linked list, alternating between `v[1]`
105/// (air→solid transition at top of slab) and `v[3]` (solid→air
106/// transition at bottom of next slab). Each transition flushes
107/// pending whole-words (full air `0` or full solid `-1`) until
108/// it lands inside the partial word containing the transition,
109/// then OR/ANDs the partial mask via `xbsflor` / `xbsceil`.
110pub(crate) fn expandbit256(column: &[u8], bits: &mut [u32; 8]) {
111    let mut src_idx: usize = 0;
112    let mut dst_idx: usize = 0;
113    let mut bitpos: i32 = 32;
114    let mut word: u32 = 0;
115    let nbits: i32 = (bits.len() as i32) * 32;
116
117    // First iteration: jump straight to the v[1] transition (no
118    // preceding slab whose v[3] we'd need to flush).
119    let mut next_len: i32;
120    let mut delta: i32;
121    let mut go_to_v3 = false;
122
123    'outer: loop {
124        if go_to_v3 {
125            // v[3] : solid → air transition.
126            if src_idx + 3 >= column.len() {
127                break;
128            }
129            delta = i32::from(column[src_idx + 3]) - bitpos;
130            while delta >= 0 {
131                if dst_idx >= bits.len() {
132                    break 'outer;
133                }
134                bits[dst_idx] = word;
135                dst_idx += 1;
136                word = u32::MAX;
137                bitpos += 32;
138                delta -= 32;
139            }
140            word &= xbsceil((delta + 32) as usize);
141        }
142        go_to_v3 = true;
143
144        // v[1] : air → solid transition.
145        if src_idx + 1 >= column.len() {
146            break;
147        }
148        delta = i32::from(column[src_idx + 1]) - bitpos;
149        while delta >= 0 {
150            if dst_idx >= bits.len() {
151                break 'outer;
152            }
153            bits[dst_idx] = word;
154            dst_idx += 1;
155            word = 0;
156            bitpos += 32;
157            delta -= 32;
158        }
159        word |= xbsflor((delta + 32) as usize);
160
161        next_len = i32::from(column[src_idx]);
162        if next_len == 0 {
163            break;
164        }
165        src_idx += (next_len as usize) * 4;
166    }
167
168    // Pad the rest of the buffer with `word`'s tail value (in C the
169    // post-loop word is whatever the last `v[1]` partial-set
170    // produced; remaining whole-words flush as solid `-1`).
171    if bitpos <= nbits {
172        while dst_idx < bits.len() {
173            bits[dst_idx] = word;
174            dst_idx += 1;
175            word = u32::MAX;
176        }
177    }
178}
179
180/// Pre-built `expandbit256` grid covering a 2D bounding region —
181/// `(x1 - x0 + 2*RAD) × (y1 - y0 + 2*RAD)` columns. Trades 32
182/// bytes per column of memory for O(1) bit-window lookups during
183/// the estnorm 5×5 neighbourhood vote.
184///
185/// This is the conceptual equivalent of voxlap's `xbsbuf` cache —
186/// just batch-pre-built rather than rotated row-by-row through
187/// the bake. Memory cost stays manageable: a 448×448 bake (the
188/// `diag_down_lit` oracle scope, which extends to 452×452 with
189/// padding) needs about 6.4 MB.
190#[allow(dead_code)] // vsid field/method preserved for voxlap-parity inspection
191pub(crate) struct EstNormCache {
192    /// Per-column bit arrays. `bits[(yidx) * width + (xidx)]` is
193    /// the slab bit-mask of column `(origin_x + xidx, origin_y +
194    /// yidx)`. `xidx ∈ 0..width`, mapping abs-x into
195    /// `[origin_x - RAD, origin_x + (x1 - x0) - 1 + RAD]`.
196    bits: Vec<[u32; 8]>,
197    /// Top-left of the cache window in world coords (= original
198    /// `x0 - RAD`).
199    origin_x: i32,
200    origin_y: i32,
201    /// Cached-region width (= `x1 - x0 + 2 * RAD`).
202    width: usize,
203    /// Reserved for symmetric debugging — kept so the cache layout
204    /// can be inspected without recomputing from `bits.len()`.
205    #[allow(dead_code)]
206    height: usize,
207    /// Inverse-square-root LUT — `fsqrecip[k] = 1 / sqrt(k)` for
208    /// `k ∈ 0..=5859`. Voxlap's `fsqrecip` table; same precision
209    /// as the C build (no Newton refinement for k > 22).
210    fsqrecip: Vec<f32>,
211    /// Voxel-grid limit (= `vsid`) used for out-of-bounds clamps.
212    vsid: i32,
213}
214
215/// Voxlap's `fsqrecip[5860]` table init (voxlap5.c:12240-12256).
216/// Mirror of the C calculation including the asymmetric Newton-
217/// refinement schedule for indices ≤ 22.
218fn build_fsqrecip() -> Vec<f32> {
219    const N: usize = 5860;
220    let mut t = vec![0.0_f32; N];
221    t[0] = 0.0;
222    t[1] = 1.0;
223    t[2] = (1.0_f32 / 2.0_f32.sqrt()) as f32;
224    t[3] = 1.0 / 3.0_f32.sqrt();
225    let mut i = 3usize;
226    let mut z = 4usize;
227    while z < N {
228        if z + 5 >= N {
229            // Safety stop — cycle increment by 6 may overshoot.
230            break;
231        }
232        t[z] = t[z >> 1] * t[2];
233        t[z + 2] = t[(z + 2) >> 1] * t[2];
234        t[z + 4] = t[(z + 4) >> 1] * t[2];
235        t[z + 5] = t[i] * t[3];
236        i += 2;
237
238        let mut f = (t[z] + t[z + 2]) * 0.5_f32;
239        if z <= 22 {
240            f = (1.5 - 0.5 * ((z + 1) as f32) * f * f) * f;
241        }
242        t[z + 1] = (1.5 - 0.5 * ((z + 1) as f32) * f * f) * f;
243
244        let mut f = (t[z + 2] + t[z + 4]) * 0.5_f32;
245        if z <= 22 {
246            f = (1.5 - 0.5 * ((z + 3) as f32) * f * f) * f;
247        }
248        t[z + 3] = (1.5 - 0.5 * ((z + 3) as f32) * f * f) * f;
249
250        z += 6;
251    }
252    t
253}
254
255impl EstNormCache {
256    /// Build the bit-grid cache covering the bounding region
257    /// `[x0..x1) × [y0..y1)` extended by `ESTNORMRAD` padding on
258    /// each side. Calling [`Self::estnorm`] for any `(x, y)` inside
259    /// the original `[x0..x1) × [y0..y1)` box is then a pure read.
260    #[must_use]
261    pub fn build(
262        world_data: &[u8],
263        column_offsets: &[u32],
264        vsid: u32,
265        x0: i32,
266        y0: i32,
267        x1: i32,
268        y1: i32,
269    ) -> Self {
270        let rad = ESTNORMRAD;
271        let pad_x0 = x0 - rad;
272        let pad_y0 = y0 - rad;
273        let pad_x1 = x1 + rad;
274        let pad_y1 = y1 + rad;
275        let width = (pad_x1 - pad_x0) as usize;
276        let height = (pad_y1 - pad_y0) as usize;
277
278        let mut bits = vec![[0u32; 8]; width * height];
279        let vsid_i = vsid as i32;
280        for yi in 0..height {
281            let y = pad_y0 + yi as i32;
282            for xi in 0..width {
283                let x = pad_x0 + xi as i32;
284                if (x | y) < 0 || x >= vsid_i || y >= vsid_i {
285                    // Out-of-bounds: voxlap's `expandbitstack`
286                    // zeros the bind buffer (= treat as full air).
287                    continue;
288                }
289                let col_idx = (y as u32) * vsid + (x as u32);
290                let off_start = column_offsets[col_idx as usize] as usize;
291                // Slice to end-of-buffer; the slab walker self-
292                // terminates via nextptr. Same fix as world_query +
293                // opticast post-edit-scatter — column_offsets[idx+1]
294                // is the next table entry, NOT the next-byte-offset
295                // after edits.
296                let column = &world_data[off_start..];
297                expandbit256(column, &mut bits[yi * width + xi]);
298            }
299        }
300
301        Self {
302            bits,
303            origin_x: pad_x0,
304            origin_y: pad_y0,
305            width,
306            height,
307            fsqrecip: build_fsqrecip(),
308            vsid: vsid_i,
309        }
310    }
311
312    /// Read 5 consecutive bits starting at z-position `z` from the
313    /// column at `(xi, yi)` cache index. Returns `0..=31`.
314    /// Out-of-range positions:
315    /// - `z < -2`: returns 0 (air above world — though voxlap's
316    ///   convention is "above is sky", same effect).
317    /// - `z >= MAXZDIM`: returns `0x1f` (solid below world).
318    #[inline]
319    fn extract_bits5(&self, xi: usize, yi: usize, z: i32) -> u32 {
320        let col = &self.bits[yi * self.width + xi];
321        if z >= MAXZDIM {
322            return 0x1f;
323        }
324        if z + 5 <= 0 {
325            return 0;
326        }
327        // Combine adjacent words to handle the case where the 5-bit
328        // window straddles a word boundary.
329        let z_bit = z;
330        let word_idx = z_bit.div_euclid(32);
331        let bit_off = z_bit.rem_euclid(32) as u32;
332        let lo = if (0..8).contains(&word_idx) {
333            col[word_idx as usize]
334        } else if word_idx < 0 {
335            0 // air above world
336        } else {
337            u32::MAX // solid below world
338        };
339        let hi = if word_idx + 1 < 8 && word_idx >= -1 {
340            col[(word_idx + 1) as usize]
341        } else if word_idx + 1 < 0 {
342            0
343        } else {
344            u32::MAX
345        };
346        let combined = u64::from(lo) | (u64::from(hi) << 32);
347        ((combined >> bit_off) & 0x1f) as u32
348    }
349
350    /// Estimate the surface normal at `(x, y, z)` from a 5×5×5
351    /// voxel-solid neighbourhood vote. Mirror of voxlap5.c:1501
352    /// (`estnorm`, `ESTNORMRAD == 2` branch).
353    ///
354    /// `(x, y)` must lie inside the cache's `[x0..x1) × [y0..y1)`
355    /// region (panics otherwise — caller guarantees this via the
356    /// bounding-box iteration). `z` is unconstrained (handled via
357    /// air/solid clamping).
358    #[must_use]
359    pub fn estnorm(&self, x: i32, y: i32, z: i32) -> [f32; 3] {
360        let center_xi = (x - self.origin_x) as usize;
361        let center_yi = (y - self.origin_y) as usize;
362
363        let mut nx: i32 = 0;
364        let mut ny: i32 = 0;
365        let mut nz: i32 = 0;
366        let z_window = z - ESTNORMRAD; // top of the 5-bit z window
367
368        for yy in -ESTNORMRAD..=ESTNORMRAD {
369            let yi = (center_yi as i32 + yy) as usize;
370            // Read 5 columns at this yy row (xx = -2..=+2).
371            let b0 = self.extract_bits5(center_xi - 2, yi, z_window) as usize;
372            let b1 = self.extract_bits5(center_xi - 1, yi, z_window) as usize;
373            let b2 = self.extract_bits5(center_xi, yi, z_window) as usize;
374            let b3 = self.extract_bits5(center_xi + 1, yi, z_window) as usize;
375            let b4 = self.extract_bits5(center_xi + 2, yi, z_window) as usize;
376
377            // Per-column popcount differences give x-axis normal
378            // contributions. Voxlap weights:
379            //   2*(N(xx=+2) - N(xx=-2)) + N(xx=+1) - N(xx=-1)
380            // = `n.x` from this row (full normal sum is over yy).
381            nx += ((i32::from(BITNUM[b4]) - i32::from(BITNUM[b0])) << 1) + i32::from(BITNUM[b3])
382                - i32::from(BITNUM[b1]);
383
384            // Sum bitsnum across all 5 columns: `j` is the total
385            // signed-i16-packed contribution. Low 16 bits = number
386            // of solid voxels in this row across all 5 columns and
387            // 5 z levels. High 16 bits = z-axis contribution
388            // (positive bits from upper z, negative from lower).
389            let j = BITSNUM[b0]
390                .wrapping_add(BITSNUM[b1])
391                .wrapping_add(BITSNUM[b2])
392                .wrapping_add(BITSNUM[b3])
393                .wrapping_add(BITSNUM[b4]);
394            nz = nz.wrapping_add(j);
395            // n.y picks only the LOW i16 of `j` (= total solid
396            // count), scaled by yy. The high i16 (z contribution)
397            // doesn't enter n.y.
398            let j_lo16 = (j as i16) as i32;
399            ny = ny.wrapping_add(j_lo16 * yy);
400        }
401        nz >>= 16;
402
403        // Normalise via fsqrecip[len_sq]. Voxlap's table peaks at
404        // 5*5*5 box max = 75² + 15² + 3² = 5859 — within
405        // `fsqrecip`'s 5860-entry range. Out-of-range len_sq values
406        // (e.g. all-zero neighbourhood) get `fsqrecip[0] = 0` ⇒
407        // returns `(0, 0, 0)` which downstream lighting math
408        // tolerates.
409        let len_sq = (nx * nx + ny * ny + nz * nz) as usize;
410        let f = if len_sq < self.fsqrecip.len() {
411            self.fsqrecip[len_sq]
412        } else {
413            0.0
414        };
415        [(nx as f32) * f, (ny as f32) * f, (nz as f32) * f]
416    }
417
418    /// Voxel-grid limit; used by callers to bound their iteration.
419    #[must_use]
420    #[allow(dead_code)] // preserved for voxlap-parity inspection
421    pub(crate) fn vsid(&self) -> i32 {
422        self.vsid
423    }
424}
425
426/// Bake per-voxel lighting into the world's brightness bytes.
427/// Mirror of voxlap's `updatelighting` (`voxlap5.c:10539`).
428///
429/// Walks every visible voxel inside `[x0..x1) × [y0..y1) ×
430/// [z0..z1)` and rewrites its alpha byte (the brightness channel
431/// the rasterizer mulhi'es against `kv6colmul` modulators) under
432/// the current `lightmode` + `lights` state.
433///
434/// - `lightmode == 0`: no-op (fast return).
435/// - `lightmode == 1`: directional sun-style bake — every visible
436///   voxel gets `(tp.y * 0.5 + tp.z) * 64 + 103.5` clamped to
437///   `[0, 255]` from its surface normal `tp`.
438/// - `lightmode >= 2`: per-light Lambertian bake — base
439///   `(tp.y * 0.5 + tp.z) * 16 + 47.5` minus, for each light in
440///   range with surface normal facing it, `g * h * sc` where
441///   `g = 1/(d·d²) - 1/(r·r²)` (cube falloff with hard radius
442///   cutoff) and `h = tp · light_delta`.
443///
444/// Voxlap pads the bbox by `ESTNORMRAD` on each side internally
445/// to give estnorm enough neighbourhood; that's done here too.
446/// `lights` should match the engine's full `vx5.lightsrc[]` —
447/// the function does its own per-tile range filtering.
448///
449/// Mutates `world_data` in place. Caller is responsible for any
450/// `column_offsets` / `vsid` invariants.
451pub fn update_lighting(
452    world_data: &mut [u8],
453    column_offsets: &[u32],
454    vsid: u32,
455    x0: i32,
456    y0: i32,
457    z0: i32,
458    x1: i32,
459    y1: i32,
460    z1: i32,
461    lightmode: u32,
462    lights: &[LightSrc],
463) {
464    if lightmode == 0 {
465        return;
466    }
467    let vsid_i = vsid as i32;
468    let x0p = (x0 - ESTNORMRAD).max(0);
469    let y0p = (y0 - ESTNORMRAD).max(0);
470    let z0p = (z0 - ESTNORMRAD).max(0);
471    let x1p = (x1 + ESTNORMRAD).min(vsid_i);
472    let y1p = (y1 + ESTNORMRAD).min(vsid_i);
473    let z1p = (z1 + ESTNORMRAD).min(MAXZDIM);
474    if x0p >= x1p || y0p >= y1p || z0p >= z1p {
475        return;
476    }
477
478    // Build the cache once for the whole padded bake region.
479    // Voxlap tiles the bake into 64×64 chunks with a per-tile
480    // `lightlst` filter; for our (one-shot bake) use case the
481    // full-region filter computed inside the per-voxel loop is
482    // simpler and not measurably slower at oracle bake sizes.
483    let cache = EstNormCache::build(world_data, column_offsets, vsid, x0p, y0p, x1p, y1p);
484
485    // Per-light precomputed `lightsub[i] = 1 / (sqrt(r2) * r2)` —
486    // the radius-cutoff bias that makes the light contribution go
487    // to exactly zero at distance == sqrt(r2).
488    let lightsub: Vec<f32> = lights.iter().map(|l| 1.0 / (l.r2.sqrt() * l.r2)).collect();
489
490    // R12.4.1: parallelise the per-row bake via rayon. Each `(x, y)`
491    // pair maps to a unique column slice in `world_data`
492    // (`column_offsets[col_idx]..[col_idx + 1]` ranges are pairwise
493    // disjoint — the voxalloc allocator's invariant). Rows split
494    // cleanly across worker threads; per-row x-loops stay serial to
495    // amortise rayon's per-task overhead. Speedup follows
496    // `RAYON_NUM_THREADS` (set `=1` to disable).
497    //
498    // Lighting bakes are typically rare (one-shot at scene load) but
499    // dynamic-lighting / per-edit relighting use cases call
500    // `update_lighting` per frame — at which point the parallel
501    // path matters for interactive responsiveness.
502    // Per-column byte extents `(start, end)`. After voxalloc-driven
503    // edits (e.g. cave-gen's heavy `set_spans` carve, or runtime
504    // bullet-impact carves), columns are scattered in the slab
505    // pool, so `column_offsets[i+1]` is NOT column `i`'s end byte
506    // — voxlap walks each column's slab chain via `slng()` to
507    // recover length. We pre-compute extents here serially before
508    // moving `world_data` into the parallel mutable view; the
509    // slng walk is O(slab_count) per column, typically 1-3 slabs
510    // → tens of microseconds per row at vsid = 1024. Negligible
511    // next to the bake itself.
512    let n_cols = (vsid as usize) * (vsid as usize);
513    let column_extents: Vec<(usize, usize)> = (0..n_cols)
514        .map(|col_idx| {
515            let start = column_offsets[col_idx] as usize;
516            let end = start + roxlap_formats::vxl::slng(&world_data[start..]);
517            (start, end)
518        })
519        .collect();
520
521    let world_view = WorldDataMutView::new(world_data);
522    let row_body = |y: i32| {
523        for x in x0p..x1p {
524            let col_idx = (y as u32) * vsid + (x as u32);
525            let (off_start, off_end) = column_extents[col_idx as usize];
526            // SAFETY: each (x, y) maps to a unique col_idx; column
527            // byte ranges `[off_start, off_end)` are pairwise
528            // disjoint across distinct `col_idx` (voxalloc's
529            // free-list invariant), so no two threads write to
530            // the same byte.
531            let column = unsafe { world_view.column_slice(off_start, off_end) };
532            shade_column(column, x, y, z0p, z1p, lightmode, lights, &lightsub, &cache);
533        }
534    };
535
536    (y0p..y1p).into_par_iter().for_each(row_body);
537}
538
539/// Raw-pointer view of `world_data` so the parallel
540/// [`update_lighting`] body can hand out per-column `&mut [u8]`
541/// slices to multiple threads without each thread needing
542/// `&mut Vec<u8>` (which is exclusive). Constructed from a single
543/// `&mut [u8]` borrow at the start of the parallel section; the
544/// borrow's lifetime gates `WorldDataMutView`'s usable lifetime.
545///
546/// # Safety contract
547/// Callers that hand out concurrent `column_slice` references MUST
548/// guarantee the requested ranges are pairwise non-overlapping
549/// across threads. [`update_lighting`]'s call site relies on
550/// voxalloc's per-column-disjoint-byte-range invariant.
551struct WorldDataMutView<'a> {
552    ptr: *mut u8,
553    len: usize,
554    _marker: std::marker::PhantomData<&'a mut [u8]>,
555}
556
557// SAFETY: `WorldDataMutView` is morally a `&mut [u8]` re-exposed as
558// raw pointers. The disjoint-write invariant is enforced by the
559// caller; concurrent reads of `ptr` / `len` fields are race-free
560// (immutable scalar fields).
561unsafe impl Send for WorldDataMutView<'_> {}
562unsafe impl Sync for WorldDataMutView<'_> {}
563
564impl<'a> WorldDataMutView<'a> {
565    fn new(buf: &'a mut [u8]) -> Self {
566        Self {
567            ptr: buf.as_mut_ptr(),
568            len: buf.len(),
569            _marker: std::marker::PhantomData,
570        }
571    }
572
573    /// Carve out a sub-slice. Caller upholds the disjoint-write
574    /// invariant (see struct doc).
575    ///
576    /// # Safety
577    /// `off_start <= off_end <= self.len`, and the requested range
578    /// must not overlap with ranges concurrently held by other
579    /// threads.
580    unsafe fn column_slice(&self, off_start: usize, off_end: usize) -> &'a mut [u8] {
581        debug_assert!(off_start <= off_end, "column slice: start > end");
582        debug_assert!(off_end <= self.len, "column slice: end past buffer");
583        // SAFETY: caller asserts in-bounds + disjoint-from-other-threads.
584        unsafe { std::slice::from_raw_parts_mut(self.ptr.add(off_start), off_end - off_start) }
585    }
586}
587
588/// Walk one column's slab chain and shade every visible voxel
589/// inside `[z_lo, z_hi)`. Mirror of the inner loop in
590/// voxlap5.c:10588-10650.
591#[allow(clippy::cast_lossless)]
592fn shade_column(
593    column: &mut [u8],
594    x: i32,
595    y: i32,
596    z_lo: i32,
597    z_hi: i32,
598    lightmode: u32,
599    lights: &[LightSrc],
600    lightsub: &[f32],
601    cache: &EstNormCache,
602) {
603    let mut v_off: usize = 0;
604    // cstat = false ⇒ top-of-slab phase (floor colours); true ⇒
605    // ceiling-of-next-slab phase (bottom of current slab's solid
606    // mass, visible from the air pocket below).
607    let mut cstat = false;
608    loop {
609        let (sz0, sz1, voxel_byte_offset_signed): (i32, i32, isize);
610        if !cstat {
611            // Floor colours of the current slab. Voxel z=v[1]..=v[2].
612            // Alpha byte at offset (z - v[1]) * 4 + 7 from header
613            // (header is 4 bytes, voxel record is 4 bytes BGRA, +3
614            // for alpha). The voxlap formula encodes this as
615            // `(z << 2) + offs` with `offs = 7 - (v[1] << 2)`.
616            if v_off + 2 >= column.len() {
617                break;
618            }
619            let v1 = i32::from(column[v_off + 1]);
620            let v2 = i32::from(column[v_off + 2]);
621            sz0 = v1;
622            sz1 = v2 + 1;
623            voxel_byte_offset_signed = (v_off as isize) + 7 - ((sz0 as isize) << 2);
624            cstat = true;
625        } else {
626            // Ceiling colours of the next slab — must read v[0]
627            // BEFORE advancing v_off.
628            if v_off + 2 >= column.len() {
629                break;
630            }
631            let v0 = i32::from(column[v_off]);
632            let v1 = i32::from(column[v_off + 1]);
633            let v2 = i32::from(column[v_off + 2]);
634            let prev_offset = v2 - v1 - v0 + 2; // ceilnum from getcube convention
635            if v0 == 0 {
636                break;
637            }
638            v_off += (v0 as usize) * 4;
639            if v_off + 3 >= column.len() {
640                break;
641            }
642            let v3 = i32::from(column[v_off + 3]);
643            sz1 = v3;
644            sz0 = prev_offset + sz1;
645            voxel_byte_offset_signed = (v_off as isize) + 3 - ((sz1 as isize) << 2);
646            cstat = false;
647        }
648
649        let lo = sz0.max(z_lo);
650        let hi = sz1.min(z_hi);
651        for z in lo..hi {
652            let normal = cache.estnorm(x, y, z);
653            let brightness = compute_brightness(x, y, z, normal, lightmode, lights, lightsub);
654            let byte_off = voxel_byte_offset_signed + ((z as isize) << 2);
655            if byte_off >= 0 && (byte_off as usize) < column.len() {
656                column[byte_off as usize] = brightness;
657            }
658        }
659    }
660}
661
662/// Voxlap's per-voxel brightness math. Computes the `[0, 255]`
663/// alpha byte for one voxel from its surface normal `tp` + the
664/// light list. Mirror of voxlap5.c:10605-10646.
665fn compute_brightness(
666    x: i32,
667    y: i32,
668    z: i32,
669    tp: [f32; 3],
670    lightmode: u32,
671    lights: &[LightSrc],
672    lightsub: &[f32],
673) -> u8 {
674    if lightmode < 2 {
675        // Directional path (voxlap5.c:10607-10612): single sun
676        // direction baked into a hardcoded coefficient pair.
677        // i = (tp.y * 0.5 + tp.z) * 64 + 103.5, clamped to [0, 255].
678        let f = (tp[1] * 0.5 + tp[2]) * 64.0 + 103.5;
679        clamp_to_byte(f)
680    } else {
681        // Point-light path (voxlap5.c:10614-10645). Base brightness
682        // 47.5..63.5 + per-light front-face contribution.
683        let mut f = (tp[1] * 0.5 + tp[2]) * 16.0 + 47.5;
684        let xf = x as f32;
685        let yf = y as f32;
686        let zf = z as f32;
687        for (i, light) in lights.iter().enumerate() {
688            let fx = light.pos[0] - xf;
689            let fy = light.pos[1] - yf;
690            let fz = light.pos[2] - zf;
691            // tp · light_delta: positive ⇒ surface faces away from
692            // light (back-lit, no contribution); negative ⇒ surface
693            // faces light (front-lit, lambertian contribution).
694            let h = tp[0] * fx + tp[1] * fy + tp[2] * fz;
695            if h >= 0.0 {
696                continue;
697            }
698            let g_sq = fx * fx + fy * fy + fz * fz;
699            if g_sq >= light.r2 {
700                continue;
701            }
702            // Voxlap's SSE rcpss/rsqrtss sequence:
703            //   g = (1/g_sq) * rsqrt(g_sq) - lightsub[i]
704            //     = 1/(g_sq * sqrt(g_sq)) - 1/(r2 * sqrt(r2))
705            //     = 1/d³ - 1/r³
706            // The `_mm_rcp_ss` / `_mm_rsqrt_ss` are 12-bit
707            // approximations; the exact `f32::sqrt`-based form
708            // here is more precise but may drift from voxlap C.
709            // Bit-exactness will require switching to the
710            // intrinsic versions on x86_64; deferred until
711            // diag_down_lit oracle convergence demands it.
712            let g = 1.0 / (g_sq * g_sq.sqrt()) - lightsub[i];
713            f -= g * h * light.sc;
714        }
715        clamp_to_byte(f)
716    }
717}
718
719#[inline]
720fn clamp_to_byte(f: f32) -> u8 {
721    // Voxlap's `if (*(int32_t *)&f > 0x437f0000) f = 255` is the
722    // bit-trick form of `if (f > 255.0) f = 255.0`. Negatives wrap
723    // through `ftol` / cast; we clamp explicitly for safety.
724    if f >= 255.0 {
725        255
726    } else if f <= 0.0 {
727        0
728    } else {
729        f as u8
730    }
731}
732
733#[cfg(test)]
734mod tests {
735    use super::*;
736
737    /// xbsflor(0) = -1 (all bits set), xbsflor(32) clamped to 0,
738    /// xbsflor(5) = ~31 = 0xffff_ffe0.
739    #[test]
740    fn xbsflor_xbsceil_known_values() {
741        assert_eq!(xbsflor(0), 0xffff_ffff);
742        assert_eq!(xbsflor(1), 0xffff_fffe);
743        assert_eq!(xbsflor(5), 0xffff_ffe0);
744        assert_eq!(xbsflor(31), 0x8000_0000);
745        assert_eq!(xbsflor(32), 0);
746        assert_eq!(xbsceil(0), 0);
747        assert_eq!(xbsceil(5), 0x1f);
748        assert_eq!(xbsceil(31), 0x7fff_ffff);
749        assert_eq!(xbsceil(32), 0xffff_ffff);
750    }
751
752    /// Single-slab column [next=0, sz0=10, sz1=14, then 5 voxel
753    /// records]. Voxels exist at z = 10..15 (sz0..=sz1). After
754    /// expandbit256, bits 10..15 should be set, all others
755    /// (0..10 and 15..256) should reflect: air above (0..10) and
756    /// solid below (15..256), since voxlap treats z > sz1 of last
757    /// slab as solid.
758    #[test]
759    fn single_slab_z10_to_14_sets_correct_bits() {
760        // Column layout: [next=0, sz0=10, sz1=14, top_color, then 5x
761        // voxel records of 4 bytes each]. We don't use the voxel
762        // record contents; expandbit256 only reads v[0]..v[3].
763        let mut col = vec![0u8, 10, 14, 0]; // header
764        col.extend(vec![0u8; 5 * 4]); // 5 voxel records (z=10..14)
765
766        let mut bits = [0u32; 8];
767        expandbit256(&col, &mut bits);
768
769        // Word 0 covers bits 0..32. Air for z=0..10, solid 10..15,
770        // solid for z=15..32 (since this is the only slab → below
771        // is fully solid).
772        // bits 10..15 from the slab body: 0x7c00 (bits 10,11,12,13,14)
773        // bits 15..32 from "solid below last slab": 0xffff_8000
774        // Combined: 0xffff_fc00.
775        assert_eq!(
776            bits[0], 0xffff_fc00,
777            "word 0 want 0xffff_fc00 got 0x{:08x}",
778            bits[0]
779        );
780        // Words 1..7 should all be 0xffff_ffff (fully solid).
781        for (i, w) in bits.iter().enumerate().skip(1) {
782            assert_eq!(*w, 0xffff_ffff, "word {i} want -1 got 0x{:08x}", *w);
783        }
784    }
785
786    /// fsqrecip[N] should match `1/sqrt(N)` to a reasonable
787    /// tolerance for the values estnorm actually produces.
788    #[test]
789    fn fsqrecip_matches_1_over_sqrt() {
790        let t = build_fsqrecip();
791        for k in 1..=100 {
792            let want = 1.0_f32 / (k as f32).sqrt();
793            let got = t[k];
794            let err = (got - want).abs();
795            assert!(err < 1e-3, "fsqrecip[{k}] = {got}, want {want}, err {err}");
796        }
797        // Spot-check higher values (less precise but still close).
798        for k in [500, 1000, 2000, 5000] {
799            let want = 1.0_f32 / (k as f32).sqrt();
800            let got = t[k];
801            let rel = (got / want - 1.0).abs();
802            assert!(
803                rel < 0.01,
804                "fsqrecip[{k}] = {got}, want {want}, rel-err {rel}"
805            );
806        }
807    }
808
809    /// Build a 4×4 synthetic world with a flat floor at z=20..=24,
810    /// run lightmode-1 update_lighting over the centre 2×2, and
811    /// verify (a) brightness bytes were rewritten, (b) the result
812    /// is in `[0, 255]` for every shaded voxel, (c) the brightness
813    /// is uniform within each (x, y) column at the same z (since
814    /// lightmode-1 depends only on the surface normal).
815    #[test]
816    fn lightmode1_bakes_brightness_into_visible_voxels() {
817        // 4×4 world, single slab at z=20..=24, sentinel column ends.
818        let vsid: u32 = 4;
819        let mut col = vec![0u8, 20, 24, 0]; // header: nextptr=0, z1=20, z2=24
820        for _ in 20..=24 {
821            // 5 voxel records, alpha pre-set to 0xab so we can verify
822            // they got rewritten.
823            col.extend([0x10, 0x20, 0x30, 0xab]);
824        }
825        let col_len = col.len() as u32;
826        let mut data = Vec::new();
827        let mut offsets = vec![0u32; (vsid * vsid + 1) as usize];
828        for i in 0..(vsid * vsid) {
829            offsets[i as usize] = data.len() as u32;
830            data.extend_from_slice(&col);
831        }
832        offsets[(vsid * vsid) as usize] = data.len() as u32;
833        assert_eq!(col_len as usize * (vsid * vsid) as usize, data.len());
834
835        update_lighting(
836            &mut data,
837            &offsets,
838            vsid,
839            1,
840            1,
841            0,
842            3,
843            3,
844            30, // bbox 1..=2 in xy, z 0..30
845            1,  // lightmode 1
846            &[],
847        );
848
849        // Pull every voxel record's alpha byte from the centre
850        // (1, 1) column. Should all be in [0, 255] and ≠ 0xab.
851        let off1 = offsets[(1 * vsid + 1) as usize] as usize;
852        let alphas: Vec<u8> = (0..5).map(|i| data[off1 + 4 + i * 4 + 3]).collect();
853        for (i, &a) in alphas.iter().enumerate() {
854            assert_ne!(a, 0xab, "alpha[{i}] not rewritten");
855        }
856        // The shading should be mostly bright — flat-floor voxels
857        // have ~vertical normals so `(tp.y*0.5 + tp.z)*64 + 103.5`
858        // ≈ 1.0*64 + 103.5 = 167.5.
859        for (i, &a) in alphas.iter().enumerate() {
860            assert!(
861                a > 100,
862                "alpha[{i}]={a} should be on the bright side for top-of-floor voxels"
863            );
864        }
865    }
866
867    /// lightmode-2 with one nearby light should darken voxels on
868    /// the away side relative to the toward side. Use a 5×5 world
869    /// with a flat floor and place a light such that it's on the
870    /// +x side of the centre column — the +x face voxel's neighbour
871    /// columns should end up brighter than the -x.
872    #[test]
873    fn lightmode2_with_light_produces_per_column_variation() {
874        let vsid: u32 = 5;
875        let mut col = vec![0u8, 20, 24, 0];
876        for _ in 20..=24 {
877            col.extend([0x10, 0x20, 0x30, 0]);
878        }
879        let mut data = Vec::new();
880        let mut offsets = vec![0u32; (vsid * vsid + 1) as usize];
881        for i in 0..(vsid * vsid) {
882            offsets[i as usize] = data.len() as u32;
883            data.extend_from_slice(&col);
884        }
885        offsets[(vsid * vsid) as usize] = data.len() as u32;
886
887        let lights = [LightSrc {
888            // World coords: light right next to (4, 2, 20).
889            pos: [4.0, 2.0, 20.0],
890            r2: 50.0 * 50.0,
891            sc: 64.0,
892        }];
893        update_lighting(&mut data, &offsets, vsid, 0, 0, 0, 5, 5, 30, 2, &lights);
894
895        // Sample the alpha at the top-floor voxel of each column
896        // along y=2. Closer-to-light columns should be brighter.
897        let alpha_at = |x: u32, z_idx: usize| {
898            let off = offsets[(2 * vsid + x) as usize] as usize;
899            data[off + 4 + z_idx * 4 + 3]
900        };
901        let close = alpha_at(4, 0); // closest column to light
902        let far = alpha_at(0, 0); // farthest
903        assert!(
904            close >= far,
905            "column nearer the light should be ≥ as bright as the far one (close={close} far={far})"
906        );
907    }
908
909    /// Empty column ([0, 0, 0, ...]) — no slabs. After
910    /// expandbit256, all 256 bits = 0 (full air).
911    #[test]
912    fn empty_column_all_air() {
913        let col = vec![0u8, 0, 0, 0]; // single-slab header at z=0..0, no body
914        let mut bits = [0u32; 8];
915        expandbit256(&col, &mut bits);
916        // bit 0 from "air→solid transition at z=0", but only bit 0
917        // is set within the slab range [0, 0+1). Then "solid below"
918        // fills bits 1..256.
919        // Actually for sz0=sz1=0: voxel record is z=0..0 inclusive
920        // (0 voxels). The bit pattern is 1 set bit at z=0 then
921        // solid below.
922        // word 0: bit 0 set, bits 1..32 set ⇒ 0xffff_ffff.
923        assert_eq!(
924            bits[0], 0xffff_ffff,
925            "empty column word 0 want all-1 got 0x{:08x}",
926            bits[0]
927        );
928    }
929}