roxlap-gpu 0.10.0

//! GPU.10 — KV6 sprite as a DDA-marchable voxel model.
//!
//! Unlike the GPU.9 splatter (one thread per voxel, screen-space
//! squares, overdraw + atomic contention), a sprite model is a small
//! voxel volume the precise ray-DDA marches one ray per pixel —
//! crisp, correct occlusion, no overdraw. This is the GPU.10.0 single
//! sprite; instancing + tiling + LOD come in later sub-substages.
//!
//! The volume reuses the chunk occupancy/colour scheme but sized to
//! the KV6 bbox: per-column occupancy bitmask (`occ_words_per_col`
//! u32s, `CHUNK_Z`-style 32-bits-per-word), a flat colour array in
//! ascending-z order per column, and a `color_offsets` prefix table.
//! The shader finds a voxel's colour by `offset[col] + popcount(bits
//! below z)`, so colours MUST be ascending-z (we sort per column).

#![allow(
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_sign_loss,
    clippy::many_single_char_names,
    clippy::similar_names
)]

use bytemuck::{Pod, Zeroable};
use roxlap_formats::kv6::Kv6;
use roxlap_formats::sprite::Sprite;

/// CPU-built voxel volume for one KV6 model.
///
/// Columns are addressed as `col = x + y * mx`. A voxel at height `z` in
/// column `col` is solid when bit `z & 31` of
/// `occupancy[col * occ_words_per_col + (z >> 5)]` is set; its colour and
/// normal live at `color_offsets[col] + rank`, where `rank` is the number
/// of solid voxels strictly below `z` in that column.
#[derive(Debug, Clone)]
pub struct SpriteModel {
    /// Voxel extent `(mx, my, mz)`.
    pub dims: [u32; 3],
    /// `max(ceil(mz / 32), 1)` — u32 words of occupancy per (x, y)
    /// column (never zero, even for `mz == 0`).
    pub occ_words_per_col: u32,
    /// KV6 pivot in model-local voxel space.
    pub pivot: [f32; 3],
    /// Per-column occupancy bitmask, `mx * my * occ_words_per_col`
    /// words, 32 z-levels per word (lowest z in the lowest bit).
    pub occupancy: Vec<u32>,
    /// Voxel colours, ascending z within each column, columns in
    /// ascending `col` order.
    pub colors: Vec<u32>,
    /// Per-voxel surface-normal index (`Kv6::Voxel::dir`, 0..256),
    /// parallel to [`colors`](Self::colors). The GPU sprite shader uses
    /// it to index the per-instance `kv6colmul` lighting table, matching
    /// the CPU rasteriser's normal-based shading.
    pub dirs: Vec<u32>,
    /// Prefix sums: `color_offsets[col]` is the first colour index of
    /// column `col`; length `mx * my + 1` (the last element is the
    /// total voxel count).
    pub color_offsets: Vec<u32>,
    /// World-space size of one voxel of this model (GPU.10.4 LOD): 1.0
    /// at mip-0, doubling each [`SpriteModel::downsample`]. The shader
    /// divides the local ray by this so a coarse voxel spans the right
    /// world extent and the march `t` stays in world units.
    pub voxel_world_size: f32,
}

/// Convert a KV6 into the column-packed volume the model-DDA shader
/// marches.
///
/// Columns are laid out at `col = x + y*mx`. Inside a column the voxels
/// are ordered by ascending z, because the shader locates a voxel's
/// colour by counting the occupancy bits below it.
///
/// # Panics
/// If the KV6's `ylen` counters ask for more voxels than `voxels`
/// holds (a malformed model).
#[must_use]
pub fn build_sprite_model(kv6: &Kv6) -> SpriteModel {
    let dims = [kv6.xsiz, kv6.ysiz, kv6.zsiz];
    let [size_x, size_y, size_z] = dims;
    let words_per_col = size_z.div_ceil(32).max(1);
    let stride = words_per_col as usize;
    let column_count = (size_x * size_y) as usize;

    // Stage 1: KV6 stores voxels x-outer / y-inner, which is not column
    // index order, so first distribute them into one bucket per column
    // as `(z, colour, normal-dir)` triples.
    let mut columns: Vec<Vec<(u16, u32, u8)>> = vec![Vec::new(); column_count];
    let mut remaining = kv6.voxels.iter();
    for x in 0..size_x {
        for y in 0..size_y {
            let bucket = &mut columns[(x + y * size_x) as usize];
            for _ in 0..kv6.ylen[x as usize][y as usize] {
                let voxel = remaining.next().expect("KV6 ylen / voxels.len mismatch");
                bucket.push((voxel.z, voxel.col, voxel.dir));
            }
        }
    }

    // Stage 2: walk the buckets in column-index order so the offsets
    // table comes out as a monotonic prefix sum, z-sorting each column
    // on the way (stable sort, so equal-z entries keep storage order).
    let mut occupancy = vec![0u32; column_count * stride];
    let mut color_offsets = Vec::with_capacity(column_count + 1);
    let mut colors = Vec::with_capacity(kv6.voxels.len());
    let mut dirs = Vec::with_capacity(kv6.voxels.len());
    for (col, bucket) in columns.iter_mut().enumerate() {
        color_offsets.push(colors.len() as u32);
        bucket.sort_by_key(|entry| entry.0);
        for &(z, rgba, dir) in bucket.iter() {
            let z = u32::from(z);
            occupancy[col * stride + (z >> 5) as usize] |= 1u32 << (z & 31);
            colors.push(rgba);
            dirs.push(u32::from(dir));
        }
    }
    // Sentinel: total voxel count closes the last column's range.
    color_offsets.push(colors.len() as u32);

    SpriteModel {
        dims,
        occ_words_per_col: words_per_col,
        pivot: [kv6.xpiv, kv6.ypiv, kv6.zpiv],
        occupancy,
        color_offsets,
        colors,
        dirs,
        voxel_world_size: 1.0,
    }
}

/// Per-instance transform consumed by the model-DDA shader: the
/// inverse model→world rotation (so a world ray can be brought into
/// model-local space) plus the instance's world position. Stored as
/// three padded columns for std140/std430 (`mat3x3` 16-byte columns).
///
/// `repr(C)` with sixteen `f32`s in total (3 × 4 + 3 + 1), i.e. 64
/// bytes with no implicit padding.
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable, Debug)]
pub struct SpriteInstanceTransform {
    /// Inverse of `[s | h | f]`, column-major, each column padded to
    /// `vec4`. `inv_rot * v = c0*v.x + c1*v.y + c2*v.z`.
    pub inv_rot: [[f32; 4]; 3],
    /// Instance world position (the KV6 pivot maps here).
    pub pos: [f32; 3],
    /// Explicit tail padding after `pos`; `from_sprite` writes 0.0.
    _pad: f32,
}

impl SpriteInstanceTransform {
    /// Derive the shader-side transform from a sprite pose.
    ///
    /// The sprite's `s`/`h`/`f` vectors are the model→world basis
    /// columns. The shader needs the opposite direction (world→local),
    /// so the basis is inverted and each resulting column is widened
    /// to a `vec4` with a zero fourth lane.
    #[must_use]
    pub fn from_sprite(sprite: &Sprite) -> Self {
        // Pad one 3-component column out to the 16-byte GPU column.
        let widen = |[x, y, z]: [f32; 3]| [x, y, z, 0.0];
        let [c0, c1, c2] = mat3_inverse([sprite.s, sprite.h, sprite.f]);
        Self {
            inv_rot: [widen(c0), widen(c1), widen(c2)],
            pos: sprite.p,
            _pad: 0.0,
        }
    }
}

/// A registry of sprite models. Instances reference a model by
/// `model_id`, which is a **LOD chain** id: each chain holds one or
/// more concrete mip levels (finest first; GPU.10.4), and the renderer
/// picks the level per instance by distance. Identical KV6s are added
/// once and shared by many instances. **Copy-on-modify**:
/// [`Self::fork`] deep-copies a chain so edits to the fork leave the
/// parent (and its instances) intact.
///
/// Entries and chains are only ever appended, so both entry ids and
/// `model_id`s are plain indices into the vectors below.
#[derive(Debug, Clone, Default)]
pub struct SpriteModelRegistry {
    /// Concrete mip-level volumes (the GPU buffers concatenate these).
    /// Indexed by entry id.
    entries: Vec<SpriteModel>,
    /// `chains[model_id]` = entry ids, finest (mip-0) first. Every
    /// chain holds at least one entry.
    chains: Vec<Vec<u32>>,
}

impl SpriteModelRegistry {
    /// Create an empty registry (no entries, no chains).
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Append one concrete volume and return its entry id.
    fn push_entry(&mut self, model: SpriteModel) -> u32 {
        let id = self.entries.len() as u32;
        self.entries.push(model);
        id
    }

    /// Append one LOD chain and return its `model_id`.
    fn push_chain(&mut self, levels: Vec<u32>) -> u32 {
        let id = self.chains.len() as u32;
        self.chains.push(levels);
        id
    }

    /// Register a single-level (no-LOD) model; returns its `model_id`.
    pub fn add(&mut self, model: SpriteModel) -> u32 {
        let entry = self.push_entry(model);
        self.push_chain(vec![entry])
    }

    /// Register a model with up to `max_levels` LOD mips (each a 2×
    /// [`SpriteModel::downsample`] of the previous; stops early once a
    /// level collapses to 1³). `max_levels == 0` is treated as 1, so
    /// the chain always contains at least mip-0. Returns its
    /// `model_id`.
    pub fn add_lod(&mut self, model: SpriteModel, max_levels: u32) -> u32 {
        let mut last = self.push_entry(model);
        let mut levels = vec![last];
        for _ in 1..max_levels.max(1) {
            // Downsample straight from the stored entry — no need to
            // keep (and deep-clone) a separate working copy per level.
            let finer = &self.entries[last as usize];
            if finer.dims == [1, 1, 1] {
                break;
            }
            let coarser = finer.downsample();
            last = self.push_entry(coarser);
            levels.push(last);
        }
        self.push_chain(levels)
    }

    /// Copy-on-modify: deep-copy every level of chain `parent` into new
    /// entries + a new chain, and return its `model_id`. The fork owns
    /// independent voxel data, so mutating it does not affect the
    /// parent or any instance still pointing at it.
    ///
    /// # Panics
    /// If `parent` is not a registered `model_id`.
    pub fn fork(&mut self, parent: u32) -> u32 {
        // `chains` and `entries` are distinct fields, so the chain can
        // be read while new entries are appended.
        let source = &self.chains[parent as usize];
        let mut levels = Vec::with_capacity(source.len());
        for &entry in source {
            let copy = self.entries[entry as usize].clone();
            levels.push(self.entries.len() as u32);
            self.entries.push(copy);
        }
        self.push_chain(levels)
    }

    /// The finest (mip-0) model of chain `id`.
    ///
    /// # Panics
    /// If `id` is not a registered `model_id`.
    #[must_use]
    pub fn model(&self, id: u32) -> &SpriteModel {
        &self.entries[self.chains[id as usize][0] as usize]
    }

    /// Mutable access to the finest (mip-0) model for editing — the
    /// copy-on-modify entry point (typically on a [`Self::fork`]).
    /// After a *structural* edit (occupancy/dims), call
    /// [`Self::rebuild_lod`] so the coarser mips match; a pure recolour
    /// can use [`Self::recolor_chain`] instead.
    ///
    /// # Panics
    /// If `id` is not a registered `model_id`.
    pub fn model_mut(&mut self, id: u32) -> &mut SpriteModel {
        let entry = self.chains[id as usize][0] as usize;
        &mut self.entries[entry]
    }

    /// Recolour every LOD level of chain `id` (so a forked tint shows
    /// at all distances). `f` maps an old RGBA value to the new one.
    ///
    /// # Panics
    /// If `id` is not a registered `model_id`.
    pub fn recolor_chain(&mut self, id: u32, f: impl Fn(u32) -> u32 + Copy) {
        for &entry in &self.chains[id as usize] {
            self.entries[entry as usize].recolor(f);
        }
    }

    /// Regenerate chain `id`'s coarser mip levels from its (possibly
    /// just-edited) mip-0. Run after a structural edit via
    /// [`Self::model_mut`] so the LOD ladder stays consistent. No-op
    /// for a single-level (no-LOD) chain. The number of levels in the
    /// chain is left unchanged.
    ///
    /// # Panics
    /// If `id` is not a registered `model_id`.
    pub fn rebuild_lod(&mut self, id: u32) {
        // Each level is rebuilt from the (already refreshed) level
        // before it; `windows(2)` yields nothing for a 1-level chain.
        for pair in self.chains[id as usize].windows(2) {
            let coarser = self.entries[pair[0] as usize].downsample();
            self.entries[pair[1] as usize] = coarser;
        }
    }

    /// Number of LOD chains (distinct `model_id`s).
    #[must_use]
    pub fn len(&self) -> usize {
        self.chains.len()
    }

    /// `true` when no chain has been registered yet.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.chains.is_empty()
    }
}

impl SpriteModel {
    /// Recolour every voxel via `f(old_rgba) -> new_rgba`. Structure
    /// (occupancy / offsets) is untouched, so this is a cheap in-place
    /// edit — handy on a [`SpriteModelRegistry::fork`] to make a tinted
    /// variant. For structural edits, mutate the public occupancy /
    /// colours / dims directly (via `model_mut`) then rebuild the LOD.
    pub fn recolor(&mut self, f: impl Fn(u32) -> u32) {
        for c in &mut self.colors {
            *c = f(*c);
        }
    }

    /// GPU.12 — structural edit of a single voxel within the model's
    /// existing bounds. `Some(rgba)` sets/replaces the voxel at
    /// `(x, y, z)`; `None` clears it. Maintains the ascending-z colour
    /// invariant by inserting/removing at the voxel's popcount rank and
    /// shifting the following columns' `color_offsets`.
    ///
    /// Returns `true` only if the model actually changed: a voxel was
    /// added, removed, or given a different colour. Overwriting a voxel
    /// with the colour it already has, clearing an empty cell, and
    /// out-of-bounds coordinates all return `false` — growing `dims` is
    /// a separate concern.
    ///
    /// A newly inserted voxel gets normal index 0; a replaced voxel
    /// keeps its existing normal.
    ///
    /// After editing, call [`SpriteModelRegistry::rebuild_lod`] to
    /// refresh coarser mips, then re-upload via `set_sprite_instances`.
    pub fn set_voxel(&mut self, x: u32, y: u32, z: u32, color: Option<u32>) -> bool {
        let [mx, my, mz] = self.dims;
        if x >= mx || y >= my || z >= mz {
            return false;
        }
        let col = (x + y * mx) as usize;
        let base = col * self.occ_words_per_col as usize;
        let word = base + (z >> 5) as usize;
        let bit = 1u32 << (z & 31);

        // Rank = solid voxels strictly below z in this column: all full
        // words under z's word, plus the bits under z inside that word
        // (`bit - 1` is the mask of lower bits; 0 when z is bit 0).
        let rank = self.occupancy[base..word]
            .iter()
            .map(|w| w.count_ones() as usize)
            .sum::<usize>()
            + (self.occupancy[word] & (bit - 1)).count_ones() as usize;
        let idx = self.color_offsets[col] as usize + rank;
        let was_set = (self.occupancy[word] & bit) != 0;

        match (color, was_set) {
            (Some(rgba), true) => {
                // Replace in place (keeps dir). Only a different colour
                // counts as a change, so callers can skip the LOD
                // rebuild + re-upload for a no-op write.
                let slot = &mut self.colors[idx];
                let changed = *slot != rgba;
                *slot = rgba;
                changed
            }
            (Some(rgba), false) => {
                self.occupancy[word] |= bit;
                self.colors.insert(idx, rgba);
                // No normal supplied by this API — default to dir 0 (the
                // sole caller, the carve hotkey, only ever clears).
                self.dirs.insert(idx, 0);
                // Every later column (and the end sentinel) starts one
                // colour further along.
                for off in &mut self.color_offsets[col + 1..] {
                    *off += 1;
                }
                true
            }
            (None, true) => {
                self.occupancy[word] &= !bit;
                self.colors.remove(idx);
                self.dirs.remove(idx);
                for off in &mut self.color_offsets[col + 1..] {
                    *off -= 1;
                }
                true
            }
            (None, false) => false,
        }
    }

    /// Radius of a bounding sphere centred at the instance position
    /// (the pivot maps there): the farthest bbox corner from the
    /// pivot. Used for frustum culling. Assumes a unit basis; scaled
    /// instances would multiply this by their max basis length.
    ///
    /// The result is in this model's own voxel units — `dims` and
    /// `pivot` are used as-is and `voxel_world_size` is not applied.
    #[must_use]
    pub fn bound_radius(&self) -> f32 {
        let mut r2 = 0.0_f32;
        // Visit the 8 corners of the `[0, dims]` box.
        for &cx in &[0.0, self.dims[0] as f32] {
            for &cy in &[0.0, self.dims[1] as f32] {
                for &cz in &[0.0, self.dims[2] as f32] {
                    let d = [cx - self.pivot[0], cy - self.pivot[1], cz - self.pivot[2]];
                    r2 = r2.max(d[0] * d[0] + d[1] * d[1] + d[2] * d[2]);
                }
            }
        }
        r2.sqrt()
    }

    /// GPU.10.4 — 2× voxel downsample for the next LOD level. A coarse
    /// voxel is solid if any of its 2×2×2 fine voxels is, coloured by
    /// their per-channel average (all four 8-bit lanes, integer
    /// division). Its normal is the `dir` of the first solid child in
    /// dz-outer / dy / dx-inner order. Dims (rounded up, at least 1)
    /// and pivot halve and `voxel_world_size` doubles, so the coarse
    /// model occupies the same world box at half the resolution
    /// (origin-corner aligned).
    #[must_use]
    #[allow(clippy::manual_checked_ops)] // `n > 0` guards 4 divisions, not one checked_div
    pub fn downsample(&self) -> SpriteModel {
        let [fx, fy, fz] = self.dims;
        let fidx = |x: u32, y: u32, z: u32| (x + y * fx + z * fx * fy) as usize;

        // Expand the packed columns into a dense grid: `Some((colour,
        // dir))` for a solid fine voxel, `None` for air. Colours are
        // stored ascending-z per column, so a running cursor suffices.
        let mut fine: Vec<Option<(u32, u32)>> = vec![None; (fx * fy * fz) as usize];
        for y in 0..fy {
            for x in 0..fx {
                let col = (x + y * fx) as usize;
                let base = col * self.occ_words_per_col as usize;
                let mut cursor = self.color_offsets[col] as usize;
                for z in 0..fz {
                    let word = self.occupancy[base + (z >> 5) as usize];
                    if (word >> (z & 31)) & 1 == 1 {
                        fine[fidx(x, y, z)] = Some((self.colors[cursor], self.dirs[cursor]));
                        cursor += 1;
                    }
                }
            }
        }

        let nx = fx.div_ceil(2).max(1);
        let ny = fy.div_ceil(2).max(1);
        let nz = fz.div_ceil(2).max(1);
        let owpc = nz.div_ceil(32).max(1);
        let cols = (nx * ny) as usize;
        let mut occupancy = vec![0u32; cols * owpc as usize];
        let mut color_offsets = vec![0u32; cols + 1];
        let mut colors: Vec<u32> = Vec::new();
        let mut dirs: Vec<u32> = Vec::new();

        // Emit in column-index order (`ccol = cx + cy*nx`), cy outer,
        // so `color_offsets` is a monotonic prefix sum like build's.
        for cy in 0..ny {
            for cx in 0..nx {
                let ccol = (cx + cy * nx) as usize;
                color_offsets[ccol] = colors.len() as u32;
                for cz in 0..nz {
                    // Per-channel sums in [a, r, g, b] order and the
                    // number of solid children contributing to them.
                    let mut sums = [0u32; 4];
                    let mut n = 0u32;
                    // Normals don't average meaningfully — keep the first
                    // solid child's `dir` as the coarse voxel's normal.
                    let mut rep_dir = 0u32;
                    for dz in 0..2 {
                        for dy in 0..2 {
                            for dx in 0..2 {
                                let (x, y, z) = (2 * cx + dx, 2 * cy + dy, 2 * cz + dz);
                                // Odd fine dims leave the last coarse
                                // cell partly outside the fine grid.
                                if x >= fx || y >= fy || z >= fz {
                                    continue;
                                }
                                let Some((rgba, dir)) = fine[fidx(x, y, z)] else {
                                    continue;
                                };
                                if n == 0 {
                                    rep_dir = dir;
                                }
                                sums[0] += (rgba >> 24) & 0xff;
                                sums[1] += (rgba >> 16) & 0xff;
                                sums[2] += (rgba >> 8) & 0xff;
                                sums[3] += rgba & 0xff;
                                n += 1;
                            }
                        }
                    }
                    if n > 0 {
                        let [a, r, g, b] = sums.map(|s| s / n);
                        let avg = (a << 24) | (r << 16) | (g << 8) | b;
                        let base = ccol * owpc as usize + (cz >> 5) as usize;
                        occupancy[base] |= 1u32 << (cz & 31);
                        colors.push(avg);
                        dirs.push(rep_dir);
                    }
                }
            }
        }
        color_offsets[cols] = colors.len() as u32;

        SpriteModel {
            dims: [nx, ny, nz],
            occ_words_per_col: owpc,
            pivot: [
                self.pivot[0] * 0.5,
                self.pivot[1] * 0.5,
                self.pivot[2] * 0.5,
            ],
            occupancy,
            colors,
            dirs,
            color_offsets,
            voxel_world_size: self.voxel_world_size * 2.0,
        }
    }
}

/// View frustum for CPU instance culling, in world space. Built each
/// frame from the world camera. `half_w`/`half_h` are the tangents of
/// the half-FOV (so the side planes are `|x| <= half_w * z` etc. in
/// camera space).
#[derive(Clone, Copy, Debug)]
pub struct ViewFrustum {
    /// Camera position in world space.
    pub pos: [f32; 3],
    /// Camera-space +x axis expressed in world space.
    /// NOTE(review): presumably unit length — confirm with the builder.
    pub right: [f32; 3],
    /// Camera-space +y axis (pointing down the screen) in world space.
    /// NOTE(review): presumably unit length — confirm with the builder.
    pub down: [f32; 3],
    /// Camera-space +z (view direction) axis in world space.
    /// NOTE(review): presumably unit length — confirm with the builder.
    pub forward: [f32; 3],
    /// Tangent of the horizontal half-FOV.
    pub half_w: f32,
    /// Tangent of the vertical half-FOV.
    pub half_h: f32,
    /// Far limit of the frustum.
    /// NOTE(review): presumably a distance along `forward` in world
    /// units — confirm against the cull code.
    pub far: f32,
}

/// CPU cull record: the GPU instance + its world bounding sphere.
/// Not `Copy` — carries a boxed 256-entry `kv6colmul` table.
#[derive(Clone)]
struct CullInstance {
    /// Instance transform + a placeholder `model_id`; the cull
    /// overwrites `model_id` with the distance-chosen LOD entry.
    gpu: SpriteInstanceGpu,
    /// LOD chain this instance draws (the user-facing `model_id`).
    chain_id: u32,
    /// Centre of the instance's world bounding sphere.
    center: [f32; 3],
    /// Radius of the instance's world bounding sphere.
    radius: f32,
    /// voxlap `kv6colmul[256]` — per-surface-normal colour modulation
    /// for this instance's pose + lighting. Defaults to identity
    /// (`0x0100` in every channel lane → unshaded) until the facade sets
    /// it via [`SpriteRegistryResident::set_instance_colmul`]. Packed
    /// into the `colmul` GPU buffer (in visible order) each frame.
    colmul: Box<[u64; 256]>,
}

/// Build the neutral `kv6colmul` table: all 256 entries carry `0x0100`
/// in each of their four 16-bit lanes, so the shader's
/// `(rgb[c] << 8) * 0x0100 >> 16 == rgb[c]` leaves colours unshaded.
fn identity_colmul() -> Box<[u64; 256]> {
    // 1.0 in the table's 8.8 fixed-point lane format.
    const UNIT_LANE: u64 = 0x0100;
    // Replicate the lane into bit positions 0, 16, 32 and 48.
    const IDENTITY_WORD: u64 = UNIT_LANE * 0x0001_0001_0001_0001;
    Box::new([IDENTITY_WORD; 256])
}

/// Dot product of two 3-component vectors.
fn dot3(a: [f32; 3], b: [f32; 3]) -> f32 {
    let [ax, ay, az] = a;
    let [bx, by, bz] = b;
    ax * bx + ay * by + az * bz
}

/// One sprite instance: a model reference + world pose.
#[derive(Debug, Clone, Copy)]
pub struct SpriteInstance {
    /// Which registered model to draw — a `SpriteModelRegistry`
    /// `model_id` (i.e. a LOD chain id, not a concrete entry id).
    pub model_id: u32,
    /// World pose: inverse rotation basis + world position.
    pub transform: SpriteInstanceTransform,
}

/// GPU per-model metadata: where this model's data starts in the
/// shared registry buffers + its dims/pivot. Mirrors `ModelMeta` in
/// the shader (std430, 48 bytes).
///
/// Layout is three 16-byte rows: four offsets/counts, `dims` + pad,
/// `pivot` + `voxel_world_size`. Field order is part of the GPU
/// contract — do not reorder.
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable, Debug)]
struct SpriteModelMeta {
    /// Index (in u32 words) of this entry's first occupancy word in
    /// the shared occupancy buffer.
    occupancy_offset: u32,
    /// Index of this entry's first colour in the shared colours
    /// buffer (the suballocator slot's offset).
    colors_offset: u32,
    /// Index of this entry's first element in the shared
    /// `color_offsets` buffer.
    color_offsets_offset: u32,
    /// Occupancy words per (x, y) column, copied from the model.
    occ_words_per_col: u32,
    /// Voxel extent `(mx, my, mz)`, copied from the model.
    dims: [u32; 3],
    /// Pads `dims` out to 16 bytes; always 0.
    _pad0: u32,
    /// Model pivot in model-local voxel space, copied from the model.
    pivot: [f32; 3],
    /// GPU.10.4 — world size of one voxel of this (mip) entry.
    voxel_world_size: f32,
}

/// GPU per-instance record. Mirrors `Instance` in the shader (std430,
/// 64 bytes): inverse rotation columns + position + model id.
///
/// Field order is part of the GPU contract — do not reorder.
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable, Debug)]
struct SpriteInstanceGpu {
    /// First column of the inverse rotation, padded to `vec4`.
    inv_rot0: [f32; 4],
    /// Second column of the inverse rotation, padded to `vec4`.
    inv_rot1: [f32; 4],
    /// Third column of the inverse rotation, padded to `vec4`.
    inv_rot2: [f32; 4],
    /// Instance world position.
    pos: [f32; 3],
    /// Concrete entry id the shader draws. Stored as a placeholder in
    /// the CPU cull record and overwritten with the distance-chosen
    /// LOD entry during culling. Occupies the fourth lane after `pos`.
    model_id: u32,
}

/// Invert a 3×3 matrix supplied as its basis columns `[c0, c1, c2]`
/// and hand the inverse back in the same column form. An orthonormal
/// basis simply comes back transposed; the general formula also
/// handles rotation combined with non-unit scale.
///
/// A (near-)singular input — `|det| < 1e-12` — yields the all-zero
/// matrix rather than infinities.
#[must_use]
fn mat3_inverse(cols: [[f32; 3]; 3]) -> [[f32; 3]; 3] {
    fn cross(u: [f32; 3], v: [f32; 3]) -> [f32; 3] {
        [
            u[1] * v[2] - u[2] * v[1],
            u[2] * v[0] - u[0] * v[2],
            u[0] * v[1] - u[1] * v[0],
        ]
    }

    let [a, b, c] = cols;
    // Unscaled rows of the inverse (the adjugate): b×c, c×a, a×b.
    let rows = [cross(b, c), cross(c, a), cross(a, b)];
    // Determinant as the scalar triple product a · (b × c).
    let det = a[0] * rows[0][0] + a[1] * rows[0][1] + a[2] * rows[0][2];
    let scale = if det.abs() < 1e-12 { 0.0 } else { 1.0 / det };
    // Column j of the inverse gathers element j from each row.
    [0usize, 1, 2].map(|j| [rows[0][j] * scale, rows[1][j] * scale, rows[2][j] * scale])
}

/// GPU-resident registry + instances: every model's occupancy /
/// colours / offsets concatenated into shared storage buffers, a
/// per-model metadata table, and a capacity-sized instance buffer
/// rewritten each frame with the frustum-visible subset (GPU.10.2).
/// One bind group serves all models (same approach as the multi-grid
/// scene).
pub struct SpriteRegistryResident {
    /// Every entry's occupancy words, tightly concatenated; each
    /// entry's start is its metadata `occupancy_offset`.
    pub occupancy: wgpu::Buffer,
    /// Every entry's voxel colours, placed by the colours suballocator
    /// (with per-slot slack); each entry's start is its metadata
    /// `colors_offset`.
    pub colors: wgpu::Buffer,
    /// Per-voxel surface-normal index, concatenated across models in the
    /// same layout as [`colors`](Self::colors). The shader indexes the
    /// per-instance `kv6colmul` table by it.
    pub dirs: wgpu::Buffer,
    /// Every entry's per-column colour prefix table, tightly
    /// concatenated; each entry's start is its metadata
    /// `color_offsets_offset`.
    pub color_offsets: wgpu::Buffer,
    /// Metadata table, one record per concrete (mip-level) entry.
    pub model_meta: wgpu::Buffer,
    /// Holds up to `instance_capacity` instances; the visible subset
    /// is packed into `[0, count)` each frame by [`Self::cull_bin_upload`].
    pub instances: wgpu::Buffer,
    /// Maximum number of instances the `instances` buffer can hold.
    pub instance_capacity: u32,
    /// Per-visible-instance `kv6colmul[256]` tables, packed in the same
    /// order as the `instances` buffer each frame (two u32 per u64
    /// entry: lanes 0|1 then 2|3). Sized `instance_capacity * 256 * 2`
    /// u32; rewritten by [`Self::cull_bin_upload`].
    pub colmul: wgpu::Buffer,
    /// NOTE(review): presumably the current capacity of `colmul` —
    /// confirm the unit (instances vs u32 words) at the growth site.
    colmul_cap: u32,
    /// GPU.10.3 — per-tile `(offset, count)` into `tile_instances`,
    /// flat `2 * tiles_x * tiles_y` u32s. Grown to fit the screen.
    pub tile_ranges: wgpu::Buffer,
    /// NOTE(review): presumably the current capacity of `tile_ranges`
    /// — confirm the unit at the growth site.
    tile_ranges_cap: u32,
    /// GPU.10.3 — flat list of visible-instance indices grouped by
    /// tile. Grown to fit the per-frame total.
    pub tile_instances: wgpu::Buffer,
    /// NOTE(review): presumably the current capacity of
    /// `tile_instances` — confirm the unit at the growth site.
    tile_instances_cap: u32,
    /// CPU cull records (full set), with precomputed bounding spheres.
    cull: Vec<CullInstance>,
    /// GPU.10.4 — LOD chains: `chains[chain_id]` = entry ids, finest
    /// first. The cull picks a level by distance and writes its entry
    /// id into the packed instance's `model_id`.
    chains: Vec<Vec<u32>>,
    /// GPU.12 incremental — CPU mirror of the GPU `model_meta` table, one
    /// per concrete entry. [`Self::update_model`] reads the fixed
    /// occupancy/color_offsets bases from here and rewrites the changed
    /// `colors_offset` on a relocation.
    meta: Vec<SpriteModelMeta>,
    /// GPU.12 incremental — per-entry placement of `colors`/`dirs` in the
    /// shared buffers (drives both; same offsets/ranks). Lets an edit
    /// re-upload one model's data without touching the others.
    colors_alloc: ColorsAllocator,
    /// Per-entry word length of the dims-fixed `occupancy` and
    /// `color_offsets` arrays, kept so [`Self::update_model`] can assert a
    /// carve never changed dims (which would invalidate the in-place
    /// writes — growing dims is out of scope, handled by a full re-upload).
    occ_lens: Vec<u32>,
    /// Per-entry word length of the `color_offsets` array; companion of
    /// `occ_lens`, used for the same dims-unchanged check.
    coloff_lens: Vec<u32>,
}

impl SpriteRegistryResident {
    /// Build the GPU-resident registry from `registry` and `instances`.
    ///
    /// Every concrete entry's arrays are packed into shared buffers and a
    /// `model_meta` record remembers where each entry's data starts.
    /// Model-relative indices stay as built; the shader adds each model's
    /// base offset from the metadata table. `instances` become CPU cull
    /// records which [`Self::cull_bin_upload`] packs to the GPU each frame.
    #[must_use]
    pub fn upload(
        device: &wgpu::Device,
        registry: &SpriteModelRegistry,
        instances: &[SpriteInstance],
    ) -> Self {
        let entry_count = registry.entries.len();

        // `colors` + `dirs` change length on a carve, so the suballocator
        // places them with per-slot slack. `occupancy` + `color_offsets`
        // are dims-fixed and simply concatenated back to back.
        let color_lens: Vec<u32> = registry
            .entries
            .iter()
            .map(|m| m.colors.len() as u32)
            .collect();
        let colors_alloc = ColorsAllocator::new(&color_lens);
        let cap_total = colors_alloc.cap_total();

        let mut all_occ = Vec::<u32>::new();
        let mut all_offsets = Vec::<u32>::new();
        let mut all_colors = vec![0u32; cap_total as usize];
        let mut all_dirs = vec![0u32; cap_total as usize];
        let mut meta = Vec::<SpriteModelMeta>::with_capacity(entry_count);
        let mut occ_lens = Vec::<u32>::with_capacity(entry_count);
        let mut coloff_lens = Vec::<u32>::with_capacity(entry_count);

        // One meta record + placed data per concrete (mip-level) entry.
        for (entry, model) in registry.entries.iter().enumerate() {
            let colors_base = colors_alloc.slot(entry).off;

            // The dims-fixed bases are the running lengths *before* this
            // entry's words are appended below.
            let record = SpriteModelMeta {
                occupancy_offset: all_occ.len() as u32,
                colors_offset: colors_base,
                color_offsets_offset: all_offsets.len() as u32,
                occ_words_per_col: model.occ_words_per_col,
                dims: model.dims,
                _pad0: 0,
                pivot: model.pivot,
                voxel_world_size: model.voxel_world_size,
            };
            meta.push(record);

            occ_lens.push(model.occupancy.len() as u32);
            coloff_lens.push(model.color_offsets.len() as u32);
            all_occ.extend_from_slice(&model.occupancy);
            all_offsets.extend_from_slice(&model.color_offsets);

            let base = colors_base as usize;
            all_colors[base..base + model.colors.len()].copy_from_slice(&model.colors);
            all_dirs[base..base + model.dirs.len()].copy_from_slice(&model.dirs);
        }

        // Per-instance cull records: bounding sphere centred on the
        // instance position with the radius reported by the registry for
        // the instance's `model_id`. `colmul` starts as the identity table
        // until `set_instance_colmul` provides real lighting. `seed`
        // mirrors the GPU half of each record for the initial upload.
        let mut cull: Vec<CullInstance> = Vec::with_capacity(instances.len());
        let mut seed: Vec<SpriteInstanceGpu> = Vec::with_capacity(instances.len());
        for inst in instances {
            let gpu = SpriteInstanceGpu {
                inv_rot0: inst.transform.inv_rot[0],
                inv_rot1: inst.transform.inv_rot[1],
                inv_rot2: inst.transform.inv_rot[2],
                pos: inst.transform.pos,
                // Placeholder: the cull pass rewrites it with the LOD entry.
                model_id: inst.model_id,
            };
            seed.push(gpu);
            cull.push(CullInstance {
                gpu,
                chain_id: inst.model_id,
                center: inst.transform.pos,
                radius: registry.model(inst.model_id).bound_radius(),
                colmul: identity_colmul(),
            });
        }

        // Instance buffer sized for the full set and seeded with it so the
        // contents are valid before the first cull; COPY_DST lets the cull
        // rewrite it every frame. An empty set still gets one zeroed record.
        let instances_buf = {
            use wgpu::util::DeviceExt;
            let placeholder = [SpriteInstanceGpu::zeroed()];
            let initial: &[SpriteInstanceGpu] = if seed.is_empty() {
                &placeholder
            } else {
                &seed
            };
            device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
                label: Some("roxlap-gpu sprite_reg.instances"),
                contents: bytemuck::cast_slice(initial),
                usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            })
        };

        // Tile buffers start at one word and grow on demand in the cull.
        let tile_ranges = storage_dst_u32(device, "roxlap-gpu sprite_reg.tile_ranges", 1);
        let tile_instances = storage_dst_u32(device, "roxlap-gpu sprite_reg.tile_instances", 1);

        // colmul: 256 entries × 2 u32 per instance, sized for the case
        // where every instance is visible.
        let instance_capacity = cull.len() as u32;
        let colmul_cap = instance_capacity.max(1) * 256 * 2;
        let colmul = storage_dst_u32(device, "roxlap-gpu sprite_reg.colmul", colmul_cap);

        let occupancy = storage_dst_u32_cap(
            device,
            "roxlap-gpu sprite_reg.occupancy",
            &all_occ,
            all_occ.len() as u32,
        );
        let colors = storage_dst_u32_cap(
            device,
            "roxlap-gpu sprite_reg.colors",
            &all_colors,
            cap_total,
        );
        let dirs = storage_dst_u32_cap(device, "roxlap-gpu sprite_reg.dirs", &all_dirs, cap_total);
        let color_offsets = storage_dst_u32_cap(
            device,
            "roxlap-gpu sprite_reg.color_offsets",
            &all_offsets,
            all_offsets.len() as u32,
        );
        let model_meta = storage_dst_pod(device, "roxlap-gpu sprite_reg.model_meta", &meta);

        Self {
            occupancy,
            colors,
            dirs,
            color_offsets,
            model_meta,
            instances: instances_buf,
            instance_capacity,
            colmul,
            colmul_cap,
            tile_ranges,
            tile_ranges_cap: 1,
            tile_instances,
            tile_instances_cap: 1,
            cull,
            chains: registry.chains.clone(),
            meta,
            colors_alloc,
            occ_lens,
            coloff_lens,
        }
    }

    /// Store per-instance `kv6colmul[256]` lighting tables (voxlap's
    /// `update_reflects` output) on the CPU cull records. `tables[i]`
    /// belongs to the `i`-th instance given to [`Self::upload`]. Nothing
    /// is sent to the GPU here; the next [`Self::cull_bin_upload`] packs
    /// the tables of the visible subset. When `tables` is shorter than the
    /// instance set, the remaining instances keep their previous tables.
    pub fn set_instance_colmul(&mut self, tables: &[[u64; 256]]) {
        // `zip` stops at the shorter side, which gives the "extra
        // instances are left untouched" behaviour for free.
        self.cull
            .iter_mut()
            .zip(tables.iter())
            .for_each(|(record, table)| record.colmul.copy_from_slice(table));
    }

    /// Re-pose instances in place from `instances` — meant for animated
    /// sprites (e.g. KFA limbs moved every frame) — **without** touching
    /// any model volume data. `instances` has to line up with the set
    /// given to [`Self::upload`] (same length, same order); only the
    /// transform and the cull-sphere centre are taken from it, so each
    /// record keeps its LOD chain and radius. This only edits the CPU
    /// cull records: the next [`Self::cull_bin_upload`] uploads the packed
    /// visible subset as it does every frame.
    pub fn update_transforms(&mut self, instances: &[SpriteInstance]) {
        debug_assert_eq!(
            instances.len(),
            self.cull.len(),
            "update_transforms instance count must match upload"
        );
        for (record, src) in self.cull.iter_mut().zip(instances.iter()) {
            let pose = &src.transform;
            let gpu = &mut record.gpu;
            gpu.inv_rot0 = pose.inv_rot[0];
            gpu.inv_rot1 = pose.inv_rot[1];
            gpu.inv_rot2 = pose.inv_rot[2];
            gpu.pos = pose.pos;
            // The bounding sphere tracks the instance position; its radius
            // and chain id are deliberately left as uploaded.
            record.center = pose.pos;
        }
    }

    /// GPU.12 incremental — re-upload only the entries of LOD chain
    /// `chain_id` after an in-place edit (carve / recolour) of its model,
    /// **without** rebuilding the whole registry. `registry` must be the
    /// same registry uploaded (same entry ids), with chain `chain_id`'s
    /// entries already edited (`model_mut` + `rebuild_lod`).
    ///
    /// For each entry: occupancy + color_offsets are dims-fixed, so they
    /// are written in place; colors + dirs (variable, parallel) go through
    /// the suballocator — written in place when they fit the slack,
    /// relocated (with a `model_meta` rewrite) when they outgrow it, and
    /// only when the buffer tail overflows are colors/dirs grown + the
    /// whole registry repacked. Instances / cull / colmul are untouched
    /// (a carve never moves an instance or grows its bounds) — that is the
    /// win over [`Self::upload`].
    ///
    /// # Panics
    /// If `chain_id` is not a chain of the uploaded registry, or one of
    /// its entry ids is out of range for `registry`.
    ///
    /// # Panics (debug)
    /// If an entry's dims changed (occupancy / color_offsets length), which
    /// the in-place path can't absorb — growing dims needs a full
    /// re-upload via [`Self::upload`].
    pub fn update_model(
        &mut self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        registry: &SpriteModelRegistry,
        chain_id: u32,
    ) {
        // Borrow the chain instead of cloning it: the loop below only
        // mutates `colors_alloc` and `meta`, which are disjoint fields.
        let entries = &self.chains[chain_id as usize];
        let mut grew = false;
        for &e in entries {
            let e = e as usize;
            let m = &registry.entries[e];

            // Dims-fixed arrays: assert unchanged, then write in place.
            debug_assert_eq!(
                m.occupancy.len() as u32,
                self.occ_lens[e],
                "update_model: entry {e} occupancy length changed (dims grew?)"
            );
            debug_assert_eq!(
                m.color_offsets.len() as u32,
                self.coloff_lens[e],
                "update_model: entry {e} color_offsets length changed (dims grew?)"
            );
            queue.write_buffer(
                &self.occupancy,
                u64::from(self.meta[e].occupancy_offset) * 4,
                bytemuck::cast_slice(&m.occupancy),
            );
            queue.write_buffer(
                &self.color_offsets,
                u64::from(self.meta[e].color_offsets_offset) * 4,
                bytemuck::cast_slice(&m.color_offsets),
            );

            // Variable colors/dirs via the suballocator. `None` means the
            // shared buffer is full; remember it and keep going so the
            // dims-fixed arrays of the remaining entries still get written.
            let new_len = m.colors.len() as u32;
            let Some(off) = self.colors_alloc.place(e, new_len) else {
                grew = true;
                continue;
            };

            // Offsets are in u32 words; `write_buffer` takes bytes.
            let byte_off = u64::from(off) * 4;
            queue.write_buffer(&self.colors, byte_off, bytemuck::cast_slice(&m.colors));
            queue.write_buffer(&self.dirs, byte_off, bytemuck::cast_slice(&m.dirs));

            if self.meta[e].colors_offset != off {
                // Relocated — rewrite this entry's meta record.
                self.meta[e].colors_offset = off;
                queue.write_buffer(
                    &self.model_meta,
                    (e * std::mem::size_of::<SpriteModelMeta>()) as u64,
                    bytemuck::bytes_of(&self.meta[e]),
                );
            }
        }

        // Buffer overflow on at least one entry → grow colors/dirs and
        // repack the WHOLE registry (rare; offsets for every entry move).
        if grew {
            self.grow_and_repack(device, queue, registry);
        }
    }

    /// Slow path of [`Self::update_model`]: the shared `colors`/`dirs`
    /// buffers ran out of tail room, so every entry is laid out afresh
    /// (compact, with new slack) for the lengths currently in `registry`.
    /// Both GPU buffers are replaced by newly created ones filled with the
    /// repacked data, and the complete `model_meta` table is rewritten
    /// because each entry's `colors_offset` may have moved. Work is
    /// proportional to the whole registry; a line is printed to stderr so
    /// repeated growth is noticeable.
    fn grow_and_repack(
        &mut self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        registry: &SpriteModelRegistry,
    ) {
        let lens: Vec<u32> = registry
            .entries
            .iter()
            .map(|m| m.colors.len() as u32)
            .collect();
        self.colors_alloc.repack(&lens);
        let cap_total = self.colors_alloc.cap_total();

        let words = cap_total as usize;
        let mut packed_colors = vec![0u32; words];
        let mut packed_dirs = vec![0u32; words];
        for (entry, model) in registry.entries.iter().enumerate() {
            let slot = self.colors_alloc.slot(entry);
            let start = slot.off as usize;
            packed_colors[start..start + model.colors.len()].copy_from_slice(&model.colors);
            packed_dirs[start..start + model.dirs.len()].copy_from_slice(&model.dirs);
            self.meta[entry].colors_offset = slot.off;
        }

        // Replace the buffer handles; users of the old handles must rebind.
        self.colors = storage_dst_u32_cap(
            device,
            "roxlap-gpu sprite_reg.colors",
            &packed_colors,
            cap_total,
        );
        self.dirs = storage_dst_u32_cap(
            device,
            "roxlap-gpu sprite_reg.dirs",
            &packed_dirs,
            cap_total,
        );

        // All offsets may differ now, so push the full meta table again.
        queue.write_buffer(&self.model_meta, 0, bytemuck::cast_slice(&self.meta));
        eprintln!("roxlap-gpu: sprite registry colors/dirs grew + repacked to {cap_total} words");
    }

    /// GPU.10.3 — frustum-cull, pack the visible subset into the
    /// instance buffer, then bin those instances into screen tiles:
    /// project each visible bounding sphere to a screen AABB and append
    /// its (visible) index to every overlapped tile. Uploads the
    /// instance buffer + `tile_ranges` (per-tile offset/count) +
    /// `tile_instances` (flat grouped indices) + the visible `colmul`
    /// tables, growing the tile/colmul buffers as needed. Returns
    /// `(visible_count, tiles_x, tiles_y)`.
    ///
    /// The screen AABB is the exact perspective bound of the sphere
    /// (tangent lines from the eye), not `(centre ± r) / z`: sphere points
    /// nearer than the centre project further out, so the naive box can
    /// miss tiles the sprite really covers. A sphere that reaches the
    /// camera plane (`z <= r`) is binned into every tile.
    ///
    /// When nothing is visible the function returns `(0, tiles_x,
    /// tiles_y)` early and leaves every GPU buffer as it was.
    ///
    /// # Panics
    /// If `tile_size` is 0 (the tile-count division panics).
    #[allow(clippy::too_many_arguments)]
    pub fn cull_bin_upload(
        &mut self,
        device: &wgpu::Device,
        queue: &wgpu::Queue,
        f: &ViewFrustum,
        screen_w: u32,
        screen_h: u32,
        tile_size: u32,
        lod_px: f32,
    ) -> (u32, u32, u32) {
        // Inclusive tile span, along one screen axis, of a sphere whose
        // centre has view-space lateral offset `c` and depth `z`.
        // Requires `z > r >= 0`. The two bounds are the slopes `t` of the
        // lines `lateral = t * depth` through the eye that touch the
        // circle (c, z, r):
        //   t = (c*z ± r*sqrt(c² + z² − r²)) / (z² − r²)
        fn tile_span(
            c: f32,
            z: f32,
            r: f32,
            centre_px: f32,
            px_per_world: f32,
            ts: f32,
            t_max: i32,
        ) -> (i32, i32) {
            let denom = z * z - r * r;
            let spread = r * (c * c + denom).sqrt();
            let lo = centre_px + (c * z - spread) / denom * px_per_world;
            let hi = centre_px + (c * z + spread) / denom * px_per_world;
            (
                ((lo / ts).floor() as i32).clamp(0, t_max),
                ((hi / ts).floor() as i32).clamp(0, t_max),
            )
        }

        let tiles_x = screen_w.div_ceil(tile_size).max(1);
        let tiles_y = screen_h.div_ceil(tile_size).max(1);
        let n_tiles = (tiles_x * tiles_y) as usize;

        // Lengths of the (unnormalised) side-plane normals, so the plane
        // tests below can compare against `r` scaled instead of dividing.
        let nw = (1.0 + f.half_w * f.half_w).sqrt();
        let nh = (1.0 + f.half_h * f.half_h).sqrt();
        let cx = screen_w as f32 * 0.5;
        let cy = screen_h as f32 * 0.5;
        let px_per_world = cx / f.half_w; // isotropic: == cy/half_h
        let ts = tile_size as f32;
        let tx_max = tiles_x as i32 - 1;
        let ty_max = tiles_y as i32 - 1;

        let mut visible: Vec<SpriteInstanceGpu> = Vec::with_capacity(self.cull.len());
        // Per-visible tile AABB (tx0, tx1, ty0, ty1) for the bin pass.
        let mut boxes: Vec<[i32; 4]> = Vec::with_capacity(self.cull.len());
        // Per-visible kv6colmul tables, flattened to two u32 per u64
        // entry (low half, then high half), packed in visible order so the
        // shader indexes `colmul[inst_idx*512 + dir*2 + {0,1}]`.
        let mut visible_colmul: Vec<u32> = Vec::with_capacity(self.cull.len() * 512);
        let mut counts = vec![0u32; n_tiles];

        for ci in &self.cull {
            let rel = [
                ci.center[0] - f.pos[0],
                ci.center[1] - f.pos[1],
                ci.center[2] - f.pos[2],
            ];
            let z = dot3(rel, f.forward);
            let r = ci.radius;
            if z + r < 0.0 || z - r > f.far {
                continue; // behind / beyond far
            }
            let x = dot3(rel, f.right);
            if (x - f.half_w * z) > r * nw || (-x - f.half_w * z) > r * nw {
                continue; // right / left
            }
            let y = dot3(rel, f.down);
            if (y - f.half_h * z) > r * nh || (-y - f.half_h * z) > r * nh {
                continue; // bottom / top
            }

            // Visible: project the sphere to a screen AABB → tile range.
            let (tx0, tx1, ty0, ty1) = if z > r + 1e-3 {
                let (tx0, tx1) = tile_span(x, z, r, cx, px_per_world, ts, tx_max);
                let (ty0, ty1) = tile_span(y, z, r, cy, px_per_world, ts, ty_max);
                (tx0, tx1, ty0, ty1)
            } else {
                // Sphere reaches the camera plane — its projection is
                // unbounded, so cover all tiles.
                (0, tx_max, 0, ty_max)
            };
            // GPU.10.4 — pick the LOD level by projected voxel size:
            // choose the coarsest level whose voxel still covers at
            // least `lod_px` screen pixels, i.e. step up once a mip-0
            // voxel would be smaller than that. `lod_px = 1` is the
            // natural "don't go sub-pixel" threshold; larger values
            // force LOD in closer (tuning/inspection).
            let chain = &self.chains[ci.chain_id as usize];
            let level = if z > 1e-3 && chain.len() > 1 {
                let voxel_px = px_per_world / z; // mip-0 voxel screen size
                ((lod_px / voxel_px).log2().ceil().max(0.0) as usize).min(chain.len() - 1)
            } else {
                0
            };
            let mut g = ci.gpu;
            g.model_id = chain[level];
            visible.push(g);
            boxes.push([tx0, tx1, ty0, ty1]);
            for &w in ci.colmul.iter() {
                visible_colmul.push((w & 0xffff_ffff) as u32);
                visible_colmul.push((w >> 32) as u32);
            }
            for ty in ty0..=ty1 {
                for tx in tx0..=tx1 {
                    counts[(ty * tiles_x as i32 + tx) as usize] += 1;
                }
            }
        }

        if visible.is_empty() {
            return (0, tiles_x, tiles_y);
        }

        // Prefix-sum counts → per-tile offsets; build the flat grouped
        // index list.
        let mut tile_ranges = vec![0u32; n_tiles * 2];
        let mut running = 0u32;
        for t in 0..n_tiles {
            tile_ranges[2 * t] = running; // offset
            tile_ranges[2 * t + 1] = counts[t]; // count
            running += counts[t];
        }
        let total = running as usize;
        let mut tile_instances = vec![0u32; total.max(1)];
        // Next free write position inside each tile's range.
        let mut cursor: Vec<u32> = (0..n_tiles).map(|t| tile_ranges[2 * t]).collect();
        for (vis_idx, b) in boxes.iter().enumerate() {
            for ty in b[2]..=b[3] {
                for tx in b[0]..=b[1] {
                    let t = (ty * tiles_x as i32 + tx) as usize;
                    tile_instances[cursor[t] as usize] = vis_idx as u32;
                    cursor[t] += 1;
                }
            }
        }

        // Upload: instances + (grown) tile buffers. Grow a tile buffer
        // only when this frame needs more than its capacity (wgpu has
        // no Clone on Buffer, so we replace the field in place).
        queue.write_buffer(&self.instances, 0, bytemuck::cast_slice(&visible));
        let need_ranges = tile_ranges.len() as u32;
        if need_ranges > self.tile_ranges_cap {
            self.tile_ranges_cap = need_ranges.next_power_of_two();
            self.tile_ranges = storage_dst_u32(
                device,
                "roxlap-gpu sprite_reg.tile_ranges",
                self.tile_ranges_cap,
            );
        }
        let need_inst = tile_instances.len() as u32;
        if need_inst > self.tile_instances_cap {
            self.tile_instances_cap = need_inst.next_power_of_two();
            self.tile_instances = storage_dst_u32(
                device,
                "roxlap-gpu sprite_reg.tile_instances",
                self.tile_instances_cap,
            );
        }
        queue.write_buffer(&self.tile_ranges, 0, bytemuck::cast_slice(&tile_ranges));
        queue.write_buffer(
            &self.tile_instances,
            0,
            bytemuck::cast_slice(&tile_instances),
        );
        let need_colmul = visible_colmul.len() as u32;
        if need_colmul > self.colmul_cap {
            self.colmul_cap = need_colmul.next_power_of_two();
            self.colmul = storage_dst_u32(device, "roxlap-gpu sprite_reg.colmul", self.colmul_cap);
        }
        queue.write_buffer(&self.colmul, 0, bytemuck::cast_slice(&visible_colmul));

        (visible.len() as u32, tiles_x, tiles_y)
    }
}

/// GPU.12 incremental — per-entry placement of one model's `colors`
/// (and the parallel `dirs`) within the shared registry buffers: a
/// `[off, off+cap)` word window holding `len` live words. `cap >= len`
/// gives slack so a carve that *grows* the surface-voxel count can be
/// rewritten in place without relocating.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct ColorSlot {
    /// First word of the window.
    off: u32,
    /// Window size in words (live data plus slack).
    cap: u32,
    /// Live words currently stored in the window.
    len: u32,
}

/// First-fit suballocator over the parallel `colors`/`dirs` buffers
/// (same offsets/ranks → one allocator drives both). Each registry
/// entry owns a [`ColorSlot`]; growth past a slot's `cap` relocates it
/// (freeing the old block) via the free list or a bump tail, and only
/// when the tail would exceed `cap_total` does the caller grow + repack
/// the whole buffer. Pure (no GPU) so it unit-tests on its own.
///
/// Freed blocks are reused whole and never merged; a repack discards
/// the free list and compacts everything.
#[derive(Debug, Default)]
struct ColorsAllocator {
    /// Per-entry slot, indexed by entry id.
    slots: Vec<ColorSlot>,
    /// Freed `(off, cap)` blocks available for first-fit reuse.
    free: Vec<(u32, u32)>,
    /// Next bump-allocation position (words).
    tail: u32,
    /// Total buffer capacity in words.
    cap_total: u32,
}

/// Slack-padded capacity for a `len`-word array: +25% + 16 words, so a
/// few extra surface voxels from a carve fit without relocating.
/// Saturates at `u32::MAX` instead of overflowing, so the result is
/// always `>= len`.
fn slot_cap(len: u32) -> u32 {
    len.saturating_add(len / 4).saturating_add(16)
}

impl ColorsAllocator {
    /// Lay every entry out contiguously (with per-slot slack) and add a
    /// global tail headroom so early growth bump-allocates rather than
    /// repacks.
    fn new(entry_lens: &[u32]) -> Self {
        let mut a = Self::default();
        a.repack(entry_lens);
        a
    }

    /// Current slot of `entry`. Panics if `entry` is out of range.
    fn slot(&self, entry: usize) -> ColorSlot {
        self.slots[entry]
    }

    /// Total buffer capacity in words the current layout assumes.
    fn cap_total(&self) -> u32 {
        self.cap_total
    }

    /// Repack ALL entries compactly to fit `new_lens`, resetting the
    /// free list + tail and choosing a fresh `cap_total` with headroom.
    /// Used at initial build and on a buffer grow.
    ///
    /// # Panics
    /// If the padded slots do not fit in `u32` word addressing. A
    /// wrapped offset would make slots overlap, so this fails loudly in
    /// every build profile rather than only in debug.
    fn repack(&mut self, new_lens: &[u32]) {
        self.free.clear();
        let mut off = 0u32;
        let mut slots = Vec::with_capacity(new_lens.len());
        for &len in new_lens {
            let cap = slot_cap(len);
            slots.push(ColorSlot { off, cap, len });
            off = off
                .checked_add(cap)
                .expect("sprite colors layout exceeds u32 word addressing");
        }
        self.slots = slots;
        self.tail = off;
        // Global headroom: +50% + 256 words. Headroom is optional, so it
        // saturates; `cap_total >= tail` always holds.
        self.cap_total = off.saturating_add(off / 2).saturating_add(256);
    }

    /// Place `new_len` words for `entry`. Returns `Some(off)` with the
    /// (possibly relocated) slot offset, or `None` if the buffer must
    /// grow + repack (the slot is left untouched in that case). On
    /// relocation the old block is pushed to the free list; an in-place
    /// fit returns the unchanged offset.
    fn place(&mut self, entry: usize, new_len: u32) -> Option<u32> {
        let cur = self.slots[entry];
        if new_len <= cur.cap {
            self.slots[entry] = ColorSlot {
                len: new_len,
                ..cur
            };
            return Some(cur.off);
        }
        let old = (cur.off, cur.cap);
        // First-fit a freed block big enough for the live data.
        if let Some(i) = self.free.iter().position(|&(_, c)| c >= new_len) {
            let (off, cap) = self.free.remove(i);
            self.free.push(old);
            self.slots[entry] = ColorSlot {
                off,
                cap,
                len: new_len,
            };
            return Some(off);
        }
        // Bump the tail if there's room. `checked_add` keeps an oversized
        // request from wrapping around and looking like it fits.
        let want = slot_cap(new_len);
        let end = self.tail.checked_add(want)?;
        if end > self.cap_total {
            return None;
        }
        let off = self.tail;
        self.tail = end;
        self.free.push(old);
        self.slots[entry] = ColorSlot {
            off,
            cap: want,
            len: new_len,
        };
        Some(off)
    }
}

/// Build a read-only (`STORAGE`-usage) buffer initialised with `data`.
/// Empty input is replaced by a single zero word, because wgpu rejects
/// zero-sized storage bindings.
#[allow(dead_code)]
fn storage_u32(device: &wgpu::Device, label: &str, data: &[u32]) -> wgpu::Buffer {
    use wgpu::util::DeviceExt;
    let pad = [0u32];
    let words: &[u32] = if data.is_empty() { &pad } else { data };
    let descriptor = wgpu::util::BufferInitDescriptor {
        label: Some(label),
        contents: bytemuck::cast_slice(words),
        usage: wgpu::BufferUsages::STORAGE,
    };
    device.create_buffer_init(&descriptor)
}

/// Allocate a `STORAGE | COPY_DST` buffer with room for `cap` `u32`
/// words (at least one) without supplying initial data; the contents
/// are expected to arrive later through `queue.write_buffer`.
fn storage_dst_u32(device: &wgpu::Device, label: &str, cap: u32) -> wgpu::Buffer {
    let words = cap.max(1);
    let descriptor = wgpu::BufferDescriptor {
        label: Some(label),
        size: u64::from(words) * 4,
        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: false,
    };
    device.create_buffer(&descriptor)
}

/// Allocate a `STORAGE | COPY_DST` buffer of `u32` words whose size is
/// the largest of `cap`, `data.len()` and 1, and copy `data` into its
/// start; words past `data` are not written here. In contrast to
/// [`storage_u32`] (STORAGE-only, exact-size) the buffer can carry spare
/// capacity and accepts `write_buffer`, which is what the incremental
/// [`SpriteRegistryResident::update_model`] needs to rewrite a growing
/// `colors`/`dirs` array in place. The initial copy goes through
/// `mapped_at_creation`, so no queue is required.
fn storage_dst_u32_cap(device: &wgpu::Device, label: &str, data: &[u32], cap: u32) -> wgpu::Buffer {
    let words = cap.max(data.len() as u32).max(1);
    let descriptor = wgpu::BufferDescriptor {
        label: Some(label),
        size: u64::from(words) * 4,
        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: true,
    };
    let buffer = device.create_buffer(&descriptor);

    let payload: &[u8] = bytemuck::cast_slice(data);
    if !payload.is_empty() {
        // The mapped view is scoped so it is released before `unmap`.
        let mut view = buffer
            .slice(..(payload.len() as u64))
            .get_mapped_range_mut();
        view.copy_from_slice(payload);
    }
    buffer.unmap();
    buffer
}

/// Allocate an exact-size `STORAGE | COPY_DST` buffer holding the Pod
/// records in `data` (a single zeroed record when `data` is empty).
/// Being `COPY_DST`, single records can later be overwritten in place,
/// as [`SpriteRegistryResident::update_model`] does after a relocation.
/// No spare capacity is reserved: an incremental edit never adds or
/// removes a model, so the record count stays the same.
fn storage_dst_pod<T: Pod + Zeroable>(
    device: &wgpu::Device,
    label: &str,
    data: &[T],
) -> wgpu::Buffer {
    let placeholder = [T::zeroed()];
    let records: &[T] = if data.is_empty() { &placeholder } else { data };
    let payload: &[u8] = bytemuck::cast_slice(records);

    let buffer = device.create_buffer(&wgpu::BufferDescriptor {
        label: Some(label),
        size: std::mem::size_of_val(records) as u64,
        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: true,
    });
    {
        // Scoped so the mapped view is released before `unmap`.
        let mut view = buffer.slice(..).get_mapped_range_mut();
        view.copy_from_slice(payload);
    }
    buffer.unmap();
    buffer
}

/// Build a read-only (`STORAGE`-usage) buffer initialised with the Pod
/// records in `data`; empty input is replaced by one zeroed `T`.
#[allow(dead_code)]
fn storage_pod<T: Pod + Zeroable>(device: &wgpu::Device, label: &str, data: &[T]) -> wgpu::Buffer {
    use wgpu::util::DeviceExt;
    let placeholder = [T::zeroed()];
    let records: &[T] = if data.is_empty() { &placeholder } else { data };
    let descriptor = wgpu::util::BufferInitDescriptor {
        label: Some(label),
        contents: bytemuck::cast_slice(records),
        usage: wgpu::BufferUsages::STORAGE,
    };
    device.create_buffer_init(&descriptor)
}

#[cfg(test)]
mod tests {
    use super::*;
    use roxlap_formats::kv6::{Kv6, Voxel};

    /// 2×1 kv6: column (0,0) has voxels at z=5 (red) and z=1 (green)
    /// stored OUT of z-order; column (1,0) has one voxel at z=3.
    fn kv6_unsorted() -> Kv6 {
        let mk = |z, col| Voxel {
            col,
            z,
            vis: 0,
            dir: 0,
        };
        Kv6 {
            xsiz: 2,
            ysiz: 1,
            zsiz: 8,
            xpiv: 0.0,
            ypiv: 0.0,
            zpiv: 0.0,
            voxels: vec![mk(5, 0xAA), mk(1, 0xBB), mk(3, 0xCC)],
            xlen: vec![2, 1],
            ylen: vec![vec![2], vec![1]],
            palette: None,
        }
    }

    /// Each voxel sets the occupancy bit matching its z in its column's
    /// word.
    #[test]
    fn occupancy_bits_set_at_voxel_z() {
        let model = build_sprite_model(&kv6_unsorted());
        assert_eq!(model.dims, [2, 1, 8]);
        // zsiz = 8 fits in one 32-bit word: ceil(8 / 32) == 1.
        assert_eq!(model.occ_words_per_col, 1);

        // Column 0 has voxels at z=1 and z=5; column 1 has one at z=3.
        let col0_bits: u32 = (1 << 1) | (1 << 5);
        let col1_bits: u32 = 1 << 3;
        assert_eq!(model.occupancy[0], col0_bits);
        assert_eq!(model.occupancy[1], col1_bits);
    }

    /// Colours must come out ascending in z per column, whatever order
    /// the kv6 stored them in.
    #[test]
    fn colors_are_ascending_z_for_rank_lookup() {
        let model = build_sprite_model(&kv6_unsorted());

        // Two voxels in column 0, one in column 1.
        assert_eq!(model.color_offsets, [0, 2, 3]);

        // Column 0 is re-sorted: z=1 (0xBB) now precedes z=5 (0xAA).
        let expected_colors: [u32; 3] = [0xBB, 0xAA, 0xCC];
        assert_eq!(model.colors, expected_colors);
    }

    /// Inverting the identity basis must give back the identity exactly.
    #[test]
    fn identity_basis_inverts_to_identity() {
        let basis = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]];
        let expected = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]];
        let inverse = mat3_inverse(basis);
        assert_eq!(inverse, expected);
    }

    /// A forked model gets its own id and its own colour storage:
    /// recolouring the fork leaves the parent untouched.
    #[test]
    fn fork_is_independent_of_parent() {
        let mut registry = SpriteModelRegistry::new();
        let parent = registry.add(build_sprite_model(&kv6_unsorted()));
        let child = registry.fork(parent);
        assert_ne!(parent, child);

        // Overwrite every colour of the fork only.
        registry.model_mut(child).recolor(|_| 0x11);

        let parent_expected: [u32; 3] = [0xBB, 0xAA, 0xCC];
        let child_expected: [u32; 3] = [0x11, 0x11, 0x11];
        assert_eq!(registry.model(parent).colors, parent_expected);
        assert_eq!(registry.model(child).colors, child_expected);
    }

    /// Pins the byte sizes of the GPU-facing record structs.
    #[test]
    fn registry_gpu_structs_have_expected_sizes() {
        use std::mem::size_of;

        let meta_bytes = size_of::<SpriteModelMeta>();
        let instance_bytes = size_of::<SpriteInstanceGpu>();
        assert_eq!(meta_bytes, 48);
        assert_eq!(instance_bytes, 64);
    }

    /// `add_lod` keeps the full-resolution model as level 0 with a unit
    /// voxel size.
    #[test]
    fn add_lod_builds_halving_mip_chain() {
        // An 8×8×8 solid model would exercise the ladder better; the
        // 2×1×8 fixture is intended to halve as
        // 2×1×8 → 1×1×4 → 1×1×2 → 1×1×1.
        let base = build_sprite_model(&kv6_unsorted());
        let mut registry = SpriteModelRegistry::new();
        let id = registry.add_lod(base, 4);

        // NOTE(review): only level 0 is asserted below; the coarser
        // levels named above are not checked by this test.
        let mip0 = registry.model(id);
        assert_eq!(mip0.dims, [2, 1, 8]);
        let size_error = (mip0.voxel_world_size - 1.0).abs();
        assert!(size_error < 1e-6);
    }

    /// Build a kv6 from explicit `(x, y, z, colour)` voxels.
    ///
    /// Voxels are emitted x-major / y-inner (matching
    /// `build_sprite_model`'s column walk) and ascending in z within each
    /// column; `ylen` / `xlen` are derived from the per-column counts.
    /// Pivot is the origin and there is no palette.
    fn kv6_from(xsiz: u32, ysiz: u32, zsiz: u32, voxels: &[(u32, u32, u16, u32)]) -> Kv6 {
        let mut flat = Vec::new();
        let mut ylen: Vec<Vec<u16>> = Vec::with_capacity(xsiz as usize);

        for x in 0..xsiz {
            let mut row = Vec::with_capacity(ysiz as usize);
            for y in 0..ysiz {
                // Pick out this column's voxels, then order them by z
                // (stable, so equal-z entries keep their input order).
                let mut column: Vec<&(u32, u32, u16, u32)> =
                    voxels.iter().filter(|v| v.0 == x && v.1 == y).collect();
                column.sort_by_key(|v| v.2);

                row.push(column.len() as u16);
                flat.extend(column.into_iter().map(|&(_, _, z, col)| Voxel {
                    col,
                    z,
                    vis: 0,
                    dir: 0,
                }));
            }
            ylen.push(row);
        }

        // Voxels per x-slab = sum of that slab's column counts.
        let xlen = ylen
            .iter()
            .map(|row| row.iter().copied().map(u32::from).sum())
            .collect();

        Kv6 {
            xsiz,
            ysiz,
            zsiz,
            xpiv: 0.0,
            ypiv: 0.0,
            zpiv: 0.0,
            voxels: flat,
            xlen,
            ylen,
            palette: None,
        }
    }

    /// Check that a model's `color_offsets` prefix table agrees with its
    /// colour array and occupancy bits.
    ///
    /// Returns `true` only when all of the following hold:
    /// - the table has one entry per column plus a terminator;
    /// - the terminator equals `colors.len()`;
    /// - offsets never decrease;
    /// - each column's span equals the number of set bits in that
    ///   column's occupancy words (its solid-voxel count).
    ///
    /// Never panics on a malformed model: a short occupancy array simply
    /// yields `false`.
    fn offsets_consistent(m: &SpriteModel) -> bool {
        let cols = (m.dims[0] * m.dims[1]) as usize;
        let words_per_col = m.occ_words_per_col as usize;

        if m.color_offsets.len() != cols + 1 {
            return false;
        }
        if m.color_offsets[cols] as usize != m.colors.len() {
            return false;
        }

        (0..cols).all(|col| {
            let start = m.color_offsets[col];
            let end = m.color_offsets[col + 1];
            // Monotonic non-decreasing (also guards the subtraction).
            if end < start {
                return false;
            }
            // Assumes a column's occupancy words are contiguous at
            // `col * occ_words_per_col` — TODO confirm against
            // `build_sprite_model` for models taller than 32 voxels.
            let first_word = col * words_per_col;
            match m.occupancy.get(first_word..first_word + words_per_col) {
                Some(words) => {
                    let solid: usize = words.iter().map(|w| w.count_ones() as usize).sum();
                    (end - start) as usize == solid
                }
                None => false,
            }
        })
    }

    /// Clearing the two lowest z-layers voxel by voxel must leave the
    /// offset table consistent after each layer, and the carved model
    /// must still downsample without panicking.
    #[test]
    fn carve_two_layers_keeps_offsets_consistent() {
        // Mirrors the demo's carve: columns with voxels at varied z,
        // some of them occupying z=0 / z=1 and some not.
        let seed: [(u32, u32, u16, u32); 6] = [
            (0, 0, 0, 0xA0),
            (0, 0, 1, 0xA1),
            (0, 0, 5, 0xA5),
            (1, 0, 1, 0xB1),
            (2, 1, 0, 0xC0),
            (2, 1, 3, 0xC3),
        ];
        let mut model = build_sprite_model(&kv6_from(3, 2, 8, &seed));
        assert!(offsets_consistent(&model));

        for z in 0..2u32 {
            let [mx, my, _] = model.dims;
            for y in 0..my {
                for x in 0..mx {
                    model.set_voxel(x, y, z, None);
                }
            }
            assert!(
                offsets_consistent(&model),
                "inconsistent after carving z={z}"
            );
            // downsample must not panic on the carved model.
            let _ = model.downsample();
        }
    }

    /// Walks `set_voxel` through insert, in-place replace, clear and the
    /// two no-op cases, checking occupancy, offsets and colours each time.
    #[test]
    fn set_voxel_inserts_replaces_and_clears() {
        // Start: column 0 = z1 (0xBB), z5 (0xAA); column 1 = z3 (0xCC).
        let mut model = build_sprite_model(&kv6_unsorted());

        // Insert: z=3 lands between z=1 and z=5 in column 0 (rank 1).
        let inserted = model.set_voxel(0, 0, 3, Some(0x55));
        assert!(inserted);
        assert_eq!(model.occupancy[0], (1 << 1) | (1 << 3) | (1 << 5));
        assert_eq!(model.color_offsets, [0, 3, 4]);
        let after_insert: [u32; 4] = [0xBB, 0x55, 0xAA, 0xCC];
        assert_eq!(model.colors, after_insert);

        // Replace: same voxel, new colour; offsets must not move.
        let replaced = model.set_voxel(0, 0, 3, Some(0x66));
        assert!(replaced);
        let after_replace: [u32; 4] = [0xBB, 0x66, 0xAA, 0xCC];
        assert_eq!(model.colors, after_replace);
        assert_eq!(model.color_offsets, [0, 3, 4]);

        // Clear: drop z=1 (rank 0) from column 0.
        let cleared = model.set_voxel(0, 0, 1, None);
        assert!(cleared);
        assert_eq!(model.occupancy[0], (1 << 3) | (1 << 5));
        assert_eq!(model.color_offsets, [0, 2, 3]);
        let after_clear: [u32; 3] = [0x66, 0xAA, 0xCC];
        assert_eq!(model.colors, after_clear);

        // No-ops report `false`: clearing an already-empty voxel, and
        // editing outside the model bounds.
        assert!(!model.set_voxel(0, 0, 2, None));
        assert!(!model.set_voxel(9, 0, 0, Some(1)));
    }

    /// After recolouring mip-0 and calling `rebuild_lod`, level 1 must
    /// reflect the new colour.
    #[test]
    fn rebuild_lod_refreshes_coarse_levels_from_mip0() {
        let mut registry = SpriteModelRegistry::new();
        let id = registry.add_lod(build_sprite_model(&kv6_unsorted()), 3);

        // `model_mut` touches mip-0 only; the ladder is stale until rebuilt.
        registry.model_mut(id).recolor(|_| 0x0000_2000);
        registry.rebuild_lod(id);

        // Averaging voxels that are all 0x2000 must still give 0x2000.
        // NOTE(review): `all` is vacuously true if level 1 has no
        // colours, so this does not prove level 1 is non-empty.
        let coarse_entry = registry.chains[id as usize][1] as usize;
        let coarse_colors = &registry.entries[coarse_entry].colors;
        assert!(coarse_colors.iter().all(|&c| c == 0x0000_2000));
    }

    // ---- GPU.12 incremental: colors/dirs suballocator -----------------

    /// Assert the layout invariants of a freshly (re)packed allocator.
    ///
    /// For each entry, in entry order: the recorded length matches
    /// `lens`, the capacity is at least the length, the slot starts at or
    /// after the end of the previous slot, and the slot ends within
    /// `cap_total`. Finally `cap_total` must reach at least the end of
    /// the last slot.
    fn alloc_invariants(a: &ColorsAllocator, lens: &[u32]) {
        let mut previous_end = 0u32;
        for (entry, &expected_len) in lens.iter().enumerate() {
            let slot = a.slot(entry);
            assert_eq!(slot.len, expected_len, "slot {entry} len");
            assert!(slot.cap >= slot.len, "slot {entry} cap >= len");
            // A fresh layout places slots in entry order, so comparing
            // against the running end is enough to rule out overlap.
            assert!(slot.off >= previous_end, "slot {entry} overlaps previous");
            let slot_end = slot.off + slot.cap;
            assert!(slot_end <= a.cap_total(), "slot {entry} past cap_total");
            previous_end = slot_end;
        }
        assert!(a.cap_total() >= previous_end, "tail headroom");
    }

    /// A new allocator gives slots spare capacity and leaves room past
    /// the last slot.
    #[test]
    fn allocator_new_lays_out_with_slack_and_headroom() {
        let lens = [10u32, 0, 64, 7];
        let alloc = ColorsAllocator::new(&lens);
        alloc_invariants(&alloc, &lens);

        // Slack: the 64-word entry gets more than 64 words, so a small
        // carve-grow fits without moving it.
        let wide = alloc.slot(2);
        assert!(wide.cap > 64);

        // Headroom: the buffer extends past the end of the last slot.
        let last = alloc.slot(3);
        assert!(alloc.cap_total() > last.off + last.cap);
    }

    /// `place` keeps an entry where it is as long as the requested
    /// length fits the slot's capacity.
    #[test]
    fn allocator_place_in_place_when_within_cap() {
        let mut alloc = ColorsAllocator::new(&[10, 20]);
        let (home, room) = (alloc.slot(0).off, alloc.slot(0).cap);

        // Shrinking stays in the same slot and keeps its capacity.
        assert_eq!(alloc.place(0, 5), Some(home));
        assert_eq!(alloc.slot(0).len, 5);
        assert_eq!(alloc.slot(0).cap, room);

        // Growing right up to the capacity still does not relocate.
        assert_eq!(alloc.place(0, room), Some(home));
        assert_eq!(alloc.slot(0).off, home);
        assert!(alloc.free.is_empty(), "no relocation should free anything");
    }

    /// Outgrowing a slot moves the entry to the bump tail and puts the
    /// vacated block on the free list.
    #[test]
    fn allocator_place_relocates_to_tail_and_frees_old() {
        let mut alloc = ColorsAllocator::new(&[10, 20]);
        let vacated = {
            let slot = alloc.slot(0);
            (slot.off, slot.cap)
        };
        let old_tail = alloc.tail;

        // Five words past the slot's capacity cannot fit in place.
        let grown = alloc.slot(0).cap + 5;
        let landed = alloc.place(0, grown).expect("fits in headroom");

        assert_eq!(landed, old_tail, "relocated to old tail");
        assert_eq!(alloc.slot(0).off, landed);
        assert_eq!(alloc.slot(0).len, grown);
        assert!(alloc.free.contains(&vacated), "old slot freed");
    }

    /// A relocation prefers a previously freed block over bumping the
    /// tail.
    #[test]
    fn allocator_reuses_freed_block_first_fit() {
        // Entry 0 starts with a large slot and entry 1 with a tiny one,
        // so growing entry 1 cannot happen in place.
        let mut alloc = ColorsAllocator::new(&[10, 2]);
        let vacated = (alloc.slot(0).off, alloc.slot(0).cap);
        let (freed_off, freed_cap) = vacated;

        // Push entry 0 out to the tail; its original block becomes free.
        let overgrown = alloc.slot(0).cap + 5;
        let _ = alloc.place(0, overgrown).unwrap();
        assert!(alloc.free.contains(&vacated));

        // One word past entry 1's capacity, yet small enough for the
        // freed block: first-fit must hand that block out.
        let wanted = alloc.slot(1).cap + 1;
        assert!(wanted <= freed_cap, "freed block big enough");
        let landed = alloc.place(1, wanted).expect("reuses freed block");
        assert_eq!(landed, freed_off, "first-fit reused the freed slot offset");
        assert!(!alloc.free.contains(&vacated), "freed block consumed");
    }

    /// A request larger than the whole buffer makes `place` return
    /// `None`; `repack` with the new lengths then grows the layout so
    /// the same request fits without moving the entry.
    #[test]
    fn allocator_signals_grow_then_repack_restores() {
        let mut a = ColorsAllocator::new(&[8, 8]);
        // Force overflow: ask for far more than cap_total.
        let huge = a.cap_total() + 100;
        assert_eq!(a.place(0, huge), None, "overflow must signal grow");

        // Repack with the new lengths compacts + grows the buffer.
        let grown_lens = [huge, 8];
        a.repack(&grown_lens);
        alloc_invariants(&a, &grown_lens);
        assert!(a.cap_total() > huge);

        // After repack the entry must fit *in place*. The offset is
        // captured before `place` runs: reading `slot(0).off` afterwards
        // would compare the result with itself and miss a relocation.
        let home = a.slot(0).off;
        assert_eq!(a.place(0, huge), Some(home), "entry moved after repack");
        assert_eq!(a.slot(0).off, home);
    }

    /// Drive the allocator like a real carve loop (mirroring
    /// `update_model`): one model's colour count drifts up and down
    /// across many edits while two neighbours stay put. Growth is
    /// absorbed in place / via the free list / by the bump tail, and on
    /// the rare overflow we repack (as `update_model` does). After every
    /// edit the live `[off, off+len)` windows must stay disjoint, and on
    /// edits that did not repack the neighbours must not have moved.
    #[test]
    fn allocator_carve_loop_keeps_live_windows_disjoint() {
        let mut a = ColorsAllocator::new(&[40, 12, 40]);
        let mut lens = [40u32, 12, 40];
        // A deterministic up/down walk of entry 1's length, incl. a jump
        // that forces at least one grow+repack.
        let walk = [13u32, 30, 60, 18, 9, 80, 80, 25, 200, 7];
        let mut grew = false;
        for &len in &walk {
            lens[1] = len;
            // Entry 1 re-placed; on overflow, repack the whole set.
            if a.place(1, len).is_none() {
                grew = true;
                a.repack(&lens);
            } else {
                // Neighbours fit in place every time. Each offset is
                // captured BEFORE `place` so a relocation is detected;
                // reading it afterwards would compare the result with
                // itself.
                let home0 = a.slot(0).off;
                assert_eq!(a.place(0, 40), Some(home0), "entry 0 relocated");
                let home2 = a.slot(2).off;
                assert_eq!(a.place(2, 40), Some(home2), "entry 2 relocated");
            }
            assert_eq!(a.slot(1).len, len);

            // No two entries' live windows overlap.
            let mut wins: Vec<(u32, u32)> =
                (0..3).map(|e| (a.slot(e).off, a.slot(e).len)).collect();
            wins.sort_by_key(|w| w.0);
            for pair in wins.windows(2) {
                let (o0, l0) = pair[0];
                let (o1, _) = pair[1];
                assert!(o0 + l0 <= o1, "live windows overlap: {pair:?}");
            }
        }
        assert!(grew, "the 200-word jump should have forced a repack");
    }
}