Skip to main content

roxlap_gpu/
scene.rs

1//! GPU.5 — multi-grid scene upload + shared storage layout.
2//!
3//! Concatenates every chunk of every grid into one set of storage
4//! buffers + a per-grid offsets table. Each grid keeps its own
5//! `vsid`, `chunks_dims`, `origin_chunk`, and runtime transform;
6//! the shader iterates grids 0..grid_count, transforms the world
7//! camera into each grid's local frame, runs that grid's outer-DDA
8//! over chunks, and tracks the closest hit across all grids.
9//!
10//! Why concatenate rather than one bind group per grid? wgpu's
11//! `MAX_BIND_GROUPS` default is 4; demos with 10+ grids
12//! (`roxlap-scene-demo` has ground + ship + 10 marker pillars =
13//! 12) need a single bind-group layout that scales.
14
15#![allow(
16    clippy::cast_sign_loss,
17    clippy::cast_lossless,
18    clippy::cast_possible_truncation,
19    clippy::cast_possible_wrap,
20    clippy::doc_markdown,
21    clippy::missing_panics_doc,
22    clippy::needless_range_loop,
23    clippy::pub_underscore_fields
24)]
25
26use bytemuck::Zeroable;
27use wgpu::util::DeviceExt;
28
29use crate::decompress::{gpu_mip_count, occ_words_per_column_for_mip, ChunkUpload};
30use crate::grid::GridUpload;
31
/// GPU.11 — upper bound on the number of mip levels a slot's layout
/// reserves room for. Sizes the relative-offset tables carried by
/// [`GridStaticMeta`]. Kept equal to
/// [`crate::decompress::GPU_MAX_MIPS`]; the shader's matching
/// `array<u32, N>` declarations must use the same `N`.
pub const MAX_GPU_MIPS: usize = 6;
37
38/// GPU.11 — per-slot occupancy/color-offset strides + per-mip
39/// within-slot relative offsets for a grid of side `vsid`. All
40/// chunks of a grid share these (uniform mip count by
41/// [`gpu_mip_count`]). `colors` keep their fixed
42/// [`COLORS_PER_CHUNK_WORDS`] stride; each mip's colours are
43/// concatenated within that block and indexed by the chunk's own
44/// (absolute) `color_offsets`.
45#[derive(Debug, Clone, Copy)]
46pub struct MipLayout {
47    pub mip_count: u32,
48    pub occ_words_per_slot: u32,
49    pub offsets_words_per_slot: u32,
50    /// Within-slot u32 offset where mip `m`'s occupancy starts.
51    pub mip_occ_rel: [u32; MAX_GPU_MIPS],
52    /// Within-slot u32 offset where mip `m`'s color_offsets start.
53    pub mip_coff_rel: [u32; MAX_GPU_MIPS],
54}
55
56impl MipLayout {
57    #[must_use]
58    pub fn for_vsid(vsid: u32) -> Self {
59        let mip_count = gpu_mip_count(vsid);
60        let mut mip_occ_rel = [0u32; MAX_GPU_MIPS];
61        let mut mip_coff_rel = [0u32; MAX_GPU_MIPS];
62        let mut occ_acc = 0u32;
63        let mut coff_acc = 0u32;
64        for m in 0..mip_count {
65            mip_occ_rel[m as usize] = occ_acc;
66            mip_coff_rel[m as usize] = coff_acc;
67            let vsid_m = vsid >> m;
68            let cols = vsid_m * vsid_m;
69            // Each mip stores TWO bitmaps back-to-back: the textured
70            // occupancy then the solid occupancy (cliff-face fix). The
71            // shader reads solid at `tex_base + cols*occ_words_per_col`.
72            occ_acc += 2 * cols * occ_words_per_column_for_mip(m);
73            coff_acc += cols + 1;
74        }
75        Self {
76            mip_count,
77            occ_words_per_slot: occ_acc,
78            offsets_words_per_slot: coff_acc,
79            mip_occ_rel,
80            mip_coff_rel,
81        }
82    }
83}
84
/// Capacity of the shader's per-grid camera uniform array, i.e. the
/// most grids a scene may contain. The scene-demo uses 12 (1 ground +
/// 1 ship + 10 markers), so 16 leaves room for four more before the
/// shader has to be re-cooked. Scenes that exceed this are rejected
/// by a runtime check.
pub const MAX_SCENE_GRIDS: u32 = 16;
90
/// Stride of one chunk's colour slot, in u32 words (65536 words =
/// 256 KiB). A chunk's colours start at
/// `meta_idx * COLORS_PER_CHUNK_WORDS` inside its grid's colours
/// range.
///
/// Because the stride is fixed, every slot has the same capacity
/// whether or not a chunk occupied it at upload time, which is what
/// lets [`GpuSceneResident::refresh_chunk`] write fresh colour data
/// into any slot when a chunk streams in or is re-baked.
///
/// Sizing: scene-demo's densest ground-hills chunks reach ~36 k
/// colour entries (~144 KiB) — slopes and cliffs put several textured
/// voxels in one column — so 256 KiB is ~1.8× headroom. On the demo's
/// 32×32×1 static grid that costs 1024 slots × 256 KiB = 256 MiB of
/// colours (~830 MiB resident scene total). A chunk that exceeds the
/// cap is truncated with a warning on stderr; GPU.7 sliding-window
/// storage removes the cap entirely.
pub const COLORS_PER_CHUNK_WORDS: u32 = 65_536;
107
/// How many storage bindings ("pages") the concatenated occupancy
/// buffer is spread over.
///
/// One storage binding cannot be larger than the device's
/// `max_storage_buffer_binding_size`. Strict drivers cap that at a
/// hard 128 MiB (lavapipe), a size the streaming demo's occupancy
/// already reaches. Paging keeps each binding below the limit while
/// the shader still addresses occupancy with one global word index;
/// every page holds a whole number of chunk slots, so a slot never
/// straddles two pages.
///
/// With multi-GiB binding limits (NVK, native Vulkan) everything
/// lands in page 0, the remaining bindings receive a 1-word dummy,
/// and the shader's page select is a single perfectly-predicted
/// uniform branch → zero hot-loop cost. Four pages cover 512 MiB of
/// occupancy even at 128 MiB per binding.
pub const MAX_OCC_PAGES: usize = 4;
123
/// Runtime transform of one grid, voxlap-style: it maps world space
/// into the grid's local frame. `world_to_grid_rotation` is
/// column-major and holds the inverse rotation that is applied to the
/// world camera basis before it is handed to that grid's marcher. The
/// ground uses the identity; the rotating ship does not.
#[derive(Debug, Clone, Copy)]
pub struct GridRuntimeTransform {
    /// Where the world origin sits in grid-local coordinates, i.e.
    /// `-rotation⁻¹ · grid.position` for a
    /// `GridTransform { position, rotation }`. Computed by the host
    /// once per frame.
    pub grid_origin_world: [f64; 3],
    /// Inverse rotation as a column-major 3×3 matrix.
    pub world_to_grid_rotation: [[f32; 3]; 3],
}
138
139impl Default for GridRuntimeTransform {
140    fn default() -> Self {
141        Self {
142            grid_origin_world: [0.0, 0.0, 0.0],
143            world_to_grid_rotation: [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]],
144        }
145    }
146}
147
148/// CPU-side aggregation of every grid in a scene. Built once at
149/// startup; per-grid transforms are recomputed each frame and
150/// passed to `render_scene` separately.
151pub struct SceneUpload {
152    pub grids: Vec<GridUpload>,
153}
154
155impl SceneUpload {
156    #[must_use]
157    pub fn grid_count(&self) -> u32 {
158        u32::try_from(self.grids.len()).unwrap_or(u32::MAX)
159    }
160}
161
162/// Per-grid static metadata: offsets into the concatenated storage
163/// buffers + the grid's slot-pool dimensions. Uploaded once.
164///
165/// GPU.7 changes: `chunks_dims` and `origin_chunk` were dropped.
166/// The shader uses modular slot indexing
167/// (`chunk_idx & (pool_dims - 1)`) and verifies slot identity via
168/// `slot_chunk_idx[slot]`, so the upload-time bbox is no longer
169/// relevant to the shader.
170#[repr(C)]
171#[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable, Debug)]
172pub struct GridStaticMeta {
173    /// `occupancy` u32-word offset where this grid's data starts.
174    pub occupancy_offset: u32,
175    pub color_offsets_offset: u32,
176    pub colors_offset: u32,
177    pub chunk_colors_base_offset: u32,
178    pub chunk_occupancy_offset: u32,
179    /// New in GPU.7: u32-word offset where this grid's
180    /// `slot_chunk_idx` array starts (one `vec3<i32>` per slot,
181    /// i.e. 3 u32 words each, plus 1 padding word for std430).
182    pub slot_chunk_idx_offset: u32,
183    pub vsid: u32,
184    pub total_slots: u32,
185    pub pool_dims: [u32; 3],
186    pub _pad0: u32,
187    /// GPU.11 — per-slot occupancy stride (sum over all mips).
188    /// `meta_id`'s occupancy slab starts at
189    /// `occupancy_offset + meta_id * occ_words_per_slot`.
190    pub occ_words_per_slot: u32,
191    /// GPU.11 — per-slot color_offsets stride (sum over all mips).
192    pub offsets_words_per_slot: u32,
193    /// GPU.11 — number of mip levels stored per slot.
194    pub mip_count: u32,
195    pub _pad1: u32,
196    /// GPU.11 — within-slot u32 offset where mip `m`'s occupancy
197    /// starts. `mip_occ_rel[0] == 0` so mip-0 reads are unchanged.
198    pub mip_occ_rel: [u32; MAX_GPU_MIPS],
199    /// GPU.11 — within-slot u32 offset where mip `m`'s color_offsets
200    /// start. `mip_coff_rel[0] == 0`.
201    pub mip_coff_rel: [u32; MAX_GPU_MIPS],
202    /// GPU.13.0 — occupied chunk-AABB (inclusive) in chunk-index space.
203    /// The outer DDA stops once `p_chunk` passes this box along the
204    /// ray's travel direction (no resident chunk can lie ahead). An
205    /// empty grid uses the inverted sentinel (`aabb_min = i32::MAX`,
206    /// `aabb_max = i32::MIN`) so every ray early-outs immediately.
207    /// Maintained live: [`GpuSceneResident::refresh_chunk`] /
208    /// [`GpuSceneResident::evict_chunk`] recompute + re-upload it.
209    pub aabb_min: [i32; 3],
210    pub _pad2: i32,
211    pub aabb_max: [i32; 3],
212    pub _pad3: i32,
213}
214
/// Value stored in `slot_chunk_idx` for a slot that holds no chunk.
/// No real chunk index uses `i32::MIN`, so one equality check in the
/// shader is enough to tell an empty slot apart from a collision.
pub const SLOT_EMPTY_SENTINEL: [i32; 3] = [i32::MIN; 3];
220
221/// GPU-resident storage for an entire scene's grids.
222pub struct GpuSceneResident {
223    pub grid_count: u32,
224    /// Concatenated per-slot occupancy, split into up to
225    /// [`MAX_OCC_PAGES`] storage bindings so no single binding
226    /// exceeds the device's `max_storage_buffer_binding_size`. The
227    /// vec is always exactly `MAX_OCC_PAGES` long — pages past
228    /// `occupancy_num_pages` are 1-word dummies kept only so the
229    /// bind group has a buffer for every layout entry. Page p holds
230    /// the global word range `[p*occupancy_page_words,
231    /// (p+1)*occupancy_page_words)`; `occupancy_page_words` is a
232    /// whole number of chunk slots so no slot straddles a boundary.
233    pub occupancy_pages: Vec<wgpu::Buffer>,
234    /// Words per occupancy page (a multiple of `occ_words_per_slot`).
235    pub occupancy_page_words: u32,
236    /// Number of real (non-dummy) pages in `occupancy_pages`.
237    pub occupancy_num_pages: u32,
238    pub all_color_offsets: wgpu::Buffer,
239    pub all_colors: wgpu::Buffer,
240    pub all_chunk_colors_base: wgpu::Buffer,
241    pub all_chunk_occupancy: wgpu::Buffer,
242    /// GPU.7 — per-slot chunk_idx for identity verification in the
243    /// shader. Stored as `vec3<i32>` with std430 16-byte stride
244    /// (each entry is `[i32; 4]` on the host: x, y, z, _pad).
245    pub all_slot_chunk_idx: wgpu::Buffer,
246    pub grid_static_meta: wgpu::Buffer,
247    pub total_bytes: u64,
248    /// Cached static metadata for the host's frame-loop work.
249    pub static_meta: Vec<GridStaticMeta>,
250    /// CPU shadow of the per-grid chunk-occupancy bitmap. Each entry
251    /// is the u32 word at `chunk_occupancy_offset + (mi >> 5)`.
252    /// `refresh_chunk` / `evict_chunk` flip the right bit + write
253    /// the affected word back to the GPU.
254    pub(crate) chunk_occupancy_shadow: Vec<Vec<u32>>,
255    /// CPU shadow of `slot_chunk_idx`. Indexed `[scene_idx][slot]`
256    /// → `[i32; 4]` (vec3 + pad). Host uses this to detect "slot is
257    /// holding a different chunk than expected" + as the eviction
258    /// origin.
259    pub(crate) slot_chunk_idx_shadow: Vec<Vec<[i32; 4]>>,
260}
261
262impl GpuSceneResident {
263    /// Pack + upload `info`. Each grid is uploaded as a contiguous
264    /// slab inside the shared storage buffers; per-grid offsets
265    /// live in `grid_static_meta`.
266    ///
267    /// # Panics
268    /// If `info.grids.len() > MAX_SCENE_GRIDS`.
269    pub fn upload(device: &wgpu::Device, info: &SceneUpload) -> Self {
270        let grid_count = info.grid_count();
271        assert!(
272            grid_count <= MAX_SCENE_GRIDS,
273            "GpuSceneResident: scene has {grid_count} grids, shader supports {MAX_SCENE_GRIDS}",
274        );
275
276        let mut all_occupancy: Vec<u32> = Vec::new();
277        let mut all_color_offsets: Vec<u32> = Vec::new();
278        let mut all_colors: Vec<u32> = Vec::new();
279        let mut all_chunk_colors_base: Vec<u32> = Vec::new();
280        let mut all_chunk_occupancy: Vec<u32> = Vec::new();
281        let mut all_slot_chunk_idx: Vec<i32> = Vec::new();
282        let mut static_meta: Vec<GridStaticMeta> = Vec::with_capacity(info.grids.len());
283        let mut chunk_occupancy_shadow: Vec<Vec<u32>> = Vec::with_capacity(info.grids.len());
284        let mut slot_chunk_idx_shadow: Vec<Vec<[i32; 4]>> = Vec::with_capacity(info.grids.len());
285
286        for grid in &info.grids {
287            let vsid = grid.vsid;
288            // GPU.11 — per-slot strides span the whole mip ladder.
289            let layout = MipLayout::for_vsid(vsid);
290            let occ_words_per_slot = layout.occ_words_per_slot as usize;
291            let offsets_words_per_slot = layout.offsets_words_per_slot as usize;
292            let colors_stride = COLORS_PER_CHUNK_WORDS as usize;
293
294            // Validate pool_dims are powers of 2 — required for the
295            // shader's `chunk_idx & (pool_dims - 1)` modular slot
296            // indexing.
297            assert!(
298                grid.pool_dims[0].is_power_of_two()
299                    && grid.pool_dims[1].is_power_of_two()
300                    && grid.pool_dims[2].is_power_of_two(),
301                "scene grid: pool_dims {:?} must all be powers of 2",
302                grid.pool_dims,
303            );
304            let pool_x = grid.pool_dims[0] as usize;
305            let pool_y = grid.pool_dims[1] as usize;
306            let pool_z = grid.pool_dims[2] as usize;
307            let total_slots = pool_x * pool_y * pool_z;
308
309            let mut grid_occupancy = vec![0u32; total_slots * occ_words_per_slot];
310            let mut grid_color_offsets = vec![0u32; total_slots * offsets_words_per_slot];
311            let mut grid_colors = vec![0u32; total_slots * colors_stride];
312            let mut grid_chunk_colors_base = vec![0u32; total_slots];
313            for i in 0..total_slots {
314                grid_chunk_colors_base[i] = (i * colors_stride) as u32;
315            }
316            let mut grid_chunk_occupancy = vec![0u32; total_slots.div_ceil(32)];
317            // slot_chunk_idx: vec3<i32> per slot, std430 stride = 16
318            // bytes (4 u32 words: x, y, z, _pad). Initialise every
319            // slot to the empty sentinel; populated slots overwrite
320            // with the actual chunk_idx below.
321            let mut grid_slot_chunk_idx: Vec<[i32; 4]> = Vec::with_capacity(total_slots);
322            for _ in 0..total_slots {
323                grid_slot_chunk_idx.push([
324                    SLOT_EMPTY_SENTINEL[0],
325                    SLOT_EMPTY_SENTINEL[1],
326                    SLOT_EMPTY_SENTINEL[2],
327                    0,
328                ]);
329            }
330
331            let mask_x = (grid.pool_dims[0] - 1) as i32;
332            let mask_y = (grid.pool_dims[1] - 1) as i32;
333            let mask_z = (grid.pool_dims[2] - 1) as i32;
334            let chunks_per_layer = pool_x * pool_y;
335
336            for (chunk_idx, chunk) in &grid.chunks {
337                assert_eq!(chunk.vsid, vsid, "scene grid: chunk vsid mismatch");
338                let sx = (chunk_idx[0] & mask_x) as usize;
339                let sy = (chunk_idx[1] & mask_y) as usize;
340                let sz = (chunk_idx[2] & mask_z) as usize;
341                let slot_idx = sx + sy * pool_x + sz * chunks_per_layer;
342
343                // GPU.11 — write each mip at its within-slot offset.
344                // occupancy + color_offsets land in per-mip sub-blocks
345                // (mip-0 first, so its data is byte-identical to the
346                // pre-mip layout); colours of every mip concatenate
347                // into the slot's fixed COLORS_PER_CHUNK_WORDS block in
348                // level order, indexed by each chunk's own absolute
349                // `color_offsets`.
350                let occ_start = slot_idx * occ_words_per_slot;
351                let off_start = slot_idx * offsets_words_per_slot;
352                let col_start = slot_idx * colors_stride;
353                let mut color_cursor = 0usize;
354                for (m, mip) in chunk.mips.iter().enumerate() {
355                    let occ_dst = occ_start + layout.mip_occ_rel[m] as usize;
356                    grid_occupancy[occ_dst..occ_dst + mip.occupancy.len()]
357                        .copy_from_slice(&mip.occupancy);
358                    // Solid bitmap immediately follows the textured one.
359                    let solid_dst = occ_dst + mip.occupancy.len();
360                    grid_occupancy[solid_dst..solid_dst + mip.solid_occupancy.len()]
361                        .copy_from_slice(&mip.solid_occupancy);
362                    let coff_dst = off_start + layout.mip_coff_rel[m] as usize;
363                    grid_color_offsets[coff_dst..coff_dst + mip.color_offsets.len()]
364                        .copy_from_slice(&mip.color_offsets);
365
366                    let remaining = colors_stride.saturating_sub(color_cursor);
367                    let n = mip.colors.len().min(remaining);
368                    if n < mip.colors.len() {
369                        eprintln!(
370                            "roxlap-gpu SceneUpload: scene grid chunk {chunk_idx:?} mip {m} \
371                             colours overflow COLORS_PER_CHUNK_WORDS ({colors_stride}); \
372                             truncating",
373                        );
374                    }
375                    grid_colors[col_start + color_cursor..col_start + color_cursor + n]
376                        .copy_from_slice(&mip.colors[..n]);
377                    color_cursor += n;
378                }
379
380                if !chunk.mips[0].colors.is_empty() {
381                    grid_chunk_occupancy[slot_idx >> 5] |= 1u32 << (slot_idx & 31);
382                }
383                grid_slot_chunk_idx[slot_idx] = [chunk_idx[0], chunk_idx[1], chunk_idx[2], 0];
384            }
385
386            // Slot_chunk_idx storage offset: each entry is 4 u32
387            // words (vec3 padded to 16 bytes in std430).
388            let slot_chunk_idx_offset = u32::try_from(all_slot_chunk_idx.len()).expect("fits");
389            // GPU.13.0 — occupied chunk-AABB for the outer-DDA early-out.
390            let (aabb_min, aabb_max) = aabb_of_slots(&grid_slot_chunk_idx);
391            let meta = GridStaticMeta {
392                occupancy_offset: u32::try_from(all_occupancy.len()).expect("fits"),
393                color_offsets_offset: u32::try_from(all_color_offsets.len()).expect("fits"),
394                colors_offset: u32::try_from(all_colors.len()).expect("fits"),
395                chunk_colors_base_offset: u32::try_from(all_chunk_colors_base.len()).expect("fits"),
396                chunk_occupancy_offset: u32::try_from(all_chunk_occupancy.len()).expect("fits"),
397                slot_chunk_idx_offset,
398                vsid,
399                total_slots: total_slots as u32,
400                pool_dims: grid.pool_dims,
401                _pad0: 0,
402                occ_words_per_slot: layout.occ_words_per_slot,
403                offsets_words_per_slot: layout.offsets_words_per_slot,
404                mip_count: layout.mip_count,
405                _pad1: 0,
406                mip_occ_rel: layout.mip_occ_rel,
407                mip_coff_rel: layout.mip_coff_rel,
408                aabb_min,
409                _pad2: 0,
410                aabb_max,
411                _pad3: 0,
412            };
413
414            chunk_occupancy_shadow.push(grid_chunk_occupancy.clone());
415            slot_chunk_idx_shadow.push(grid_slot_chunk_idx.clone());
416
417            all_occupancy.extend_from_slice(&grid_occupancy);
418            all_color_offsets.extend_from_slice(&grid_color_offsets);
419            all_colors.extend_from_slice(&grid_colors);
420            all_chunk_colors_base.extend_from_slice(&grid_chunk_colors_base);
421            all_chunk_occupancy.extend_from_slice(&grid_chunk_occupancy);
422            for entry in &grid_slot_chunk_idx {
423                all_slot_chunk_idx.extend_from_slice(entry);
424            }
425            static_meta.push(meta);
426        }
427
428        // Pad an empty scene's storage buffers — wgpu rejects
429        // zero-size storage bindings.
430        if all_occupancy.is_empty() {
431            all_occupancy.push(0);
432        }
433        if all_color_offsets.is_empty() {
434            all_color_offsets.push(0);
435        }
436        if all_colors.is_empty() {
437            all_colors.push(0);
438        }
439        if all_chunk_colors_base.is_empty() {
440            all_chunk_colors_base.push(0);
441        }
442        if all_chunk_occupancy.is_empty() {
443            all_chunk_occupancy.push(0);
444        }
445        if all_slot_chunk_idx.is_empty() {
446            // 4 zeros = single padded vec3<i32>. wgpu rejects
447            // zero-sized storage bindings.
448            all_slot_chunk_idx.extend_from_slice(&[0; 4]);
449        }
450        if static_meta.is_empty() {
451            static_meta.push(GridStaticMeta::zeroed());
452        }
453
454        let occupancy_bytes = (all_occupancy.len() * 4) as u64;
455        let color_offsets_bytes = (all_color_offsets.len() * 4) as u64;
456        let colors_bytes = (all_colors.len() * 4) as u64;
457        let chunk_colors_base_bytes = (all_chunk_colors_base.len() * 4) as u64;
458        let chunk_occupancy_bytes = (all_chunk_occupancy.len() * 4) as u64;
459        let slot_chunk_idx_bytes = (all_slot_chunk_idx.len() * 4) as u64;
460        let static_meta_bytes = (static_meta.len() * std::mem::size_of::<GridStaticMeta>()) as u64;
461        let total_bytes = occupancy_bytes
462            + color_offsets_bytes
463            + colors_bytes
464            + chunk_colors_base_bytes
465            + chunk_occupancy_bytes
466            + slot_chunk_idx_bytes
467            + static_meta_bytes;
468
469        // Split the concatenated occupancy across storage pages so no
470        // single binding exceeds the device limit. Page size is a
471        // whole number of chunk slots (slot-aligned) so no per-slot
472        // refresh write ever straddles two pages.
473        // GPU.11 — page alignment is now the whole-ladder per-slot
474        // occupancy stride so a slot (all its mips) never straddles a
475        // page boundary.
476        let slot_align_words = info
477            .grids
478            .iter()
479            .map(|g| u64::from(MipLayout::for_vsid(g.vsid).occ_words_per_slot))
480            .max()
481            .unwrap_or(1)
482            .max(1);
483        let (occupancy_pages, occupancy_page_words, occupancy_num_pages) =
484            split_occupancy_pages(device, &all_occupancy, slot_align_words);
485        let all_color_offsets =
486            create_storage(device, "roxlap-gpu scene.color_offsets", &all_color_offsets);
487        let all_colors = create_storage(device, "roxlap-gpu scene.colors", &all_colors);
488        let all_chunk_colors_base = create_storage(
489            device,
490            "roxlap-gpu scene.chunk_colors_base",
491            &all_chunk_colors_base,
492        );
493        let all_chunk_occupancy = create_storage(
494            device,
495            "roxlap-gpu scene.chunk_occupancy",
496            &all_chunk_occupancy,
497        );
498        // GPU.7 slot identity verification buffer. i32 storage.
499        let all_slot_chunk_idx_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
500            label: Some("roxlap-gpu scene.slot_chunk_idx"),
501            contents: bytemuck::cast_slice(&all_slot_chunk_idx),
502            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
503        });
504        let grid_static_meta = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
505            label: Some("roxlap-gpu scene.grid_static_meta"),
506            contents: bytemuck::cast_slice(&static_meta),
507            // GPU.13.0 — COPY_DST so the live chunk-AABB can be patched
508            // into a grid's meta on refresh_chunk / evict_chunk.
509            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
510        });
511
512        Self {
513            grid_count,
514            occupancy_pages,
515            occupancy_page_words,
516            occupancy_num_pages,
517            all_color_offsets,
518            all_colors,
519            all_chunk_colors_base,
520            all_chunk_occupancy,
521            all_slot_chunk_idx: all_slot_chunk_idx_buf,
522            grid_static_meta,
523            total_bytes,
524            static_meta,
525            chunk_occupancy_shadow,
526            slot_chunk_idx_shadow,
527        }
528    }
529
530    pub fn resident_bytes(&self) -> u64 {
531        self.total_bytes
532    }
533
534    /// Install or refresh a chunk in its modular pool slot. GPU.7
535    /// generalises GPU.6's in-place refresh: any chunk_idx maps to
536    /// a slot via `chunk_idx & (pool_dims - 1)`. The previous
537    /// occupant (if a different chunk) is silently replaced — the
538    /// host is responsible for guaranteeing that the pool is sized
539    /// large enough that two simultaneously-resident chunks never
540    /// collide on the same slot.
541    pub fn refresh_chunk(
542        &mut self,
543        queue: &wgpu::Queue,
544        scene_idx: usize,
545        chunk_idx: [i32; 3],
546        chunk: &ChunkUpload,
547    ) -> RefreshOutcome {
548        let Some(meta) = self.static_meta.get(scene_idx).copied() else {
549            return RefreshOutcome::SceneIdxOob;
550        };
551        let slot_idx = modular_slot_idx(chunk_idx, meta.pool_dims);
552
553        // GPU.11 — the per-slot strides span the full mip ladder; the
554        // resident's layout was built from the same `MipLayout`.
555        let layout = MipLayout::for_vsid(meta.vsid);
556        let occ_words_per_slot = layout.occ_words_per_slot as usize;
557        let offsets_words_per_slot = layout.offsets_words_per_slot as usize;
558        let colors_stride = COLORS_PER_CHUNK_WORDS as usize;
559
560        assert_eq!(
561            chunk.mips.len() as u32,
562            layout.mip_count,
563            "refresh_chunk: mip count mismatch (chunk {} vs grid {})",
564            chunk.mips.len(),
565            layout.mip_count,
566        );
567
568        // ---- occupancy ----
569        // Route each mip's write to its page. Page size is slot-
570        // aligned (see `split_occupancy_pages`) so the whole slot's
571        // occupancy ladder lands in a single page.
572        let slot_occ_base = meta.occupancy_offset as usize + slot_idx * occ_words_per_slot;
573        let page_words = self.occupancy_page_words as usize;
574        let page = slot_occ_base / page_words;
575        let slot_local_word = slot_occ_base % page_words;
576        debug_assert!(
577            slot_local_word + occ_words_per_slot <= page_words,
578            "occupancy slot straddles a page boundary — page size not slot-aligned",
579        );
580        let off_slot_base = meta.color_offsets_offset as usize + slot_idx * offsets_words_per_slot;
581        let col_slot_base = meta.colors_offset as usize + slot_idx * colors_stride;
582
583        let mut outcome = RefreshOutcome::Ok;
584        let mut color_cursor = 0usize;
585        for (m, mip) in chunk.mips.iter().enumerate() {
586            // occupancy (textured) then solid, back-to-back.
587            let local = slot_local_word + layout.mip_occ_rel[m] as usize;
588            queue.write_buffer(
589                &self.occupancy_pages[page],
590                (local * 4) as u64,
591                bytemuck::cast_slice(&mip.occupancy),
592            );
593            queue.write_buffer(
594                &self.occupancy_pages[page],
595                ((local + mip.occupancy.len()) * 4) as u64,
596                bytemuck::cast_slice(&mip.solid_occupancy),
597            );
598            // color_offsets
599            let coff = off_slot_base + layout.mip_coff_rel[m] as usize;
600            queue.write_buffer(
601                &self.all_color_offsets,
602                (coff * 4) as u64,
603                bytemuck::cast_slice(&mip.color_offsets),
604            );
605            // colours (concatenated per slot, truncate to stride)
606            let remaining = colors_stride.saturating_sub(color_cursor);
607            let n = mip.colors.len().min(remaining);
608            if n < mip.colors.len() {
609                eprintln!(
610                    "roxlap-gpu refresh_chunk: scene_idx={scene_idx} chunk_idx={chunk_idx:?} \
611                     mip {m} colours overflow stride {colors_stride}; truncating",
612                );
613                outcome = RefreshOutcome::ColorsTruncated;
614            }
615            if n > 0 {
616                queue.write_buffer(
617                    &self.all_colors,
618                    ((col_slot_base + color_cursor) * 4) as u64,
619                    bytemuck::cast_slice(&mip.colors[..n]),
620                );
621            }
622            color_cursor += n;
623        }
624
625        // ---- chunk_occupancy bit ----
626        self.set_chunk_occupancy_bit(
627            queue,
628            scene_idx,
629            &meta,
630            slot_idx,
631            !chunk.mips[0].colors.is_empty(),
632        );
633
634        // ---- slot_chunk_idx (identity for the shader) ----
635        self.set_slot_chunk_idx(queue, scene_idx, &meta, slot_idx, chunk_idx);
636
637        // ---- GPU.13.0 grid-AABB early-out box ----
638        self.sync_aabb(queue, scene_idx);
639
640        outcome
641    }
642
643    /// Evict a chunk's slot — clear its `chunk_occupancy` bit and
644    /// reset `slot_chunk_idx` to the empty sentinel. Used by the
645    /// host when a chunk disappears from the CPU-side `Grid::chunks`
646    /// (e.g. streaming eviction past `r_evict`).
647    ///
648    /// Returns `false` if `scene_idx` is past `grid_count` (no-op);
649    /// `true` otherwise.
650    pub fn evict_chunk(
651        &mut self,
652        queue: &wgpu::Queue,
653        scene_idx: usize,
654        chunk_idx: [i32; 3],
655    ) -> bool {
656        let Some(meta) = self.static_meta.get(scene_idx).copied() else {
657            return false;
658        };
659        let slot_idx = modular_slot_idx(chunk_idx, meta.pool_dims);
660        // Only evict if this slot still claims to hold `chunk_idx`.
661        // Otherwise we'd be wiping out a different (newer) chunk
662        // that happens to share the slot.
663        let shadow_entry = self.slot_chunk_idx_shadow[scene_idx][slot_idx];
664        if shadow_entry[0] != chunk_idx[0]
665            || shadow_entry[1] != chunk_idx[1]
666            || shadow_entry[2] != chunk_idx[2]
667        {
668            return true;
669        }
670        self.set_chunk_occupancy_bit(queue, scene_idx, &meta, slot_idx, false);
671        self.set_slot_chunk_idx(queue, scene_idx, &meta, slot_idx, SLOT_EMPTY_SENTINEL);
672        // GPU.13.0 — eviction may shrink the occupied box; recompute.
673        self.sync_aabb(queue, scene_idx);
674        true
675    }
676
677    fn set_chunk_occupancy_bit(
678        &mut self,
679        queue: &wgpu::Queue,
680        scene_idx: usize,
681        meta: &GridStaticMeta,
682        slot_idx: usize,
683        new_bit: bool,
684    ) {
685        let word_idx = slot_idx >> 5;
686        let bit = slot_idx & 31;
687        let shadow = &mut self.chunk_occupancy_shadow[scene_idx][word_idx];
688        let was_bit = (*shadow >> bit) & 1 == 1;
689        if new_bit == was_bit {
690            return;
691        }
692        if new_bit {
693            *shadow |= 1u32 << bit;
694        } else {
695            *shadow &= !(1u32 << bit);
696        }
697        let global_word_idx = meta.chunk_occupancy_offset as usize + word_idx;
698        queue.write_buffer(
699            &self.all_chunk_occupancy,
700            (global_word_idx * 4) as u64,
701            bytemuck::bytes_of(shadow),
702        );
703    }
704
705    fn set_slot_chunk_idx(
706        &mut self,
707        queue: &wgpu::Queue,
708        scene_idx: usize,
709        meta: &GridStaticMeta,
710        slot_idx: usize,
711        chunk_idx: [i32; 3],
712    ) {
713        let entry = [chunk_idx[0], chunk_idx[1], chunk_idx[2], 0];
714        self.slot_chunk_idx_shadow[scene_idx][slot_idx] = entry;
715        let global_word_idx = meta.slot_chunk_idx_offset as usize + slot_idx * 4;
716        queue.write_buffer(
717            &self.all_slot_chunk_idx,
718            (global_word_idx * 4) as u64,
719            bytemuck::cast_slice(&entry),
720        );
721    }
722
723    /// GPU.13.0 — recompute the grid's occupied chunk-AABB from its
724    /// `slot_chunk_idx` shadow and, if it changed, patch the grid's
725    /// [`GridStaticMeta`] on the GPU. Cheap: scans `total_slots`
726    /// entries and writes 144 bytes only when the box actually moves
727    /// (steady-state re-bakes leave it unchanged → no GPU write).
728    /// Called after every install/eviction so streaming grids keep a
729    /// tight, always-conservative early-out box.
730    fn sync_aabb(&mut self, queue: &wgpu::Queue, scene_idx: usize) {
731        let (aabb_min, aabb_max) = aabb_of_slots(&self.slot_chunk_idx_shadow[scene_idx]);
732        let meta = &mut self.static_meta[scene_idx];
733        if meta.aabb_min == aabb_min && meta.aabb_max == aabb_max {
734            return;
735        }
736        meta.aabb_min = aabb_min;
737        meta.aabb_max = aabb_max;
738        let off = (scene_idx * std::mem::size_of::<GridStaticMeta>()) as u64;
739        queue.write_buffer(&self.grid_static_meta, off, bytemuck::bytes_of(meta));
740    }
741}
742
743/// GPU.13.0 — inclusive chunk-AABB over a grid's `slot_chunk_idx`
744/// shadow, skipping the [`SLOT_EMPTY_SENTINEL`] entries. Returns the
745/// inverted sentinel box (`min = i32::MAX`, `max = i32::MIN`) when no
746/// slot is occupied, which makes the shader's `aabb_passed` early-out
747/// fire for every ray (an empty grid renders nothing).
748fn aabb_of_slots(slots: &[[i32; 4]]) -> ([i32; 3], [i32; 3]) {
749    let mut min = [i32::MAX; 3];
750    let mut max = [i32::MIN; 3];
751    for e in slots {
752        if e[0] == SLOT_EMPTY_SENTINEL[0]
753            && e[1] == SLOT_EMPTY_SENTINEL[1]
754            && e[2] == SLOT_EMPTY_SENTINEL[2]
755        {
756            continue;
757        }
758        for k in 0..3 {
759            if e[k] < min[k] {
760                min[k] = e[k];
761            }
762            if e[k] > max[k] {
763                max[k] = e[k];
764            }
765        }
766    }
767    (min, max)
768}
769
/// Map `chunk_idx` to its linear slot in a pool of power-of-2
/// `pool_dims`. Each axis is wrapped with a two's-complement
/// bitwise AND (so negative components wrap too), matching the
/// shader's `chunk_idx & (pool_dims - 1)`. The wrapped coordinates
/// are then linearised x-fastest: `x + y * dim_x + z * dim_x * dim_y`.
#[must_use]
pub fn modular_slot_idx(chunk_idx: [i32; 3], pool_dims: [u32; 3]) -> usize {
    // Wrap one axis into `0..pool_dims[axis]`.
    let wrap = |axis: usize| -> usize {
        let mask = (pool_dims[axis] - 1) as i32;
        (chunk_idx[axis] & mask) as usize
    };
    let dim_x = pool_dims[0] as usize;
    let dim_y = pool_dims[1] as usize;
    wrap(0) + wrap(1) * dim_x + wrap(2) * dim_x * dim_y
}
784
/// Outcome of `GpuSceneResident::refresh_chunk`. Most callers
/// can ignore the result; `ColorsTruncated` indicates the chunk's
/// colour data overflowed the per-slot stride and was clipped.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RefreshOutcome {
    /// No truncation or index problem to report (presumably the
    /// common case — confirm against `refresh_chunk`).
    Ok,
    /// The chunk's colour count exceeded `COLORS_PER_CHUNK_WORDS`;
    /// the GPU sees the first `stride` colours. Bump
    /// `COLORS_PER_CHUNK_WORDS` for content that hits this.
    ColorsTruncated,
    /// Retained for ABI compatibility; the GPU.7 modular pool no
    /// longer rejects chunks by bbox.
    ChunkOutOfBbox,
    /// `scene_idx` is past `grid_count`. Programming error.
    SceneIdxOob,
}
801
802fn create_storage(device: &wgpu::Device, label: &str, data: &[u32]) -> wgpu::Buffer {
803    // GPU.6: include COPY_DST so `refresh_chunk` can `queue.write_buffer`
804    // into existing slots without rebuilding the resident.
805    device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
806        label: Some(label),
807        contents: bytemuck::cast_slice(data),
808        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
809    })
810}
811
812/// Split the concatenated occupancy words into up to
813/// [`MAX_OCC_PAGES`] storage buffers, each no larger than the
814/// device's `max_storage_buffer_binding_size`, then pad the page
815/// list with 1-word dummy buffers so the returned vec is always
816/// exactly `MAX_OCC_PAGES` long (one buffer per bind-group entry).
817///
818/// `slot_align_words` is the per-slot occupancy stride: page size is
819/// rounded down to a multiple of it so no chunk slot — and therefore
820/// no per-slot `refresh_chunk` write — straddles a page boundary.
821/// Returns `(pages, page_words, num_pages)`.
822fn split_occupancy_pages(
823    device: &wgpu::Device,
824    words: &[u32],
825    slot_align_words: u64,
826) -> (Vec<wgpu::Buffer>, u32, u32) {
827    let total_words = words.len() as u64;
828    let limit_words = u64::from(device.limits().max_storage_buffer_binding_size) / 4;
829    // Largest slot-aligned page that fits one binding (≥ 1 slot).
830    let page_slots = (limit_words / slot_align_words).max(1);
831    let mut page_words = page_slots.saturating_mul(slot_align_words);
832    // A tiny scene (or the empty-scene 1-word pad) isn't slot-aligned;
833    // cap the page at the data length so we don't allocate emptiness.
834    page_words = page_words.min(total_words.max(1));
835    let num_pages = total_words.div_ceil(page_words);
836    assert!(
837        num_pages as usize <= MAX_OCC_PAGES,
838        "occupancy needs {num_pages} pages (>{MAX_OCC_PAGES}) at this device's \
839         {limit_words}-word binding limit; shrink the streaming pool or raise MAX_OCC_PAGES",
840    );
841
842    let mut pages: Vec<wgpu::Buffer> = Vec::with_capacity(MAX_OCC_PAGES);
843    let page_words_usize = page_words as usize;
844    for p in 0..num_pages as usize {
845        let start = p * page_words_usize;
846        let end = ((p + 1) * page_words_usize).min(words.len());
847        pages.push(create_storage(
848            device,
849            &format!("roxlap-gpu scene.occupancy.page{p}"),
850            &words[start..end],
851        ));
852    }
853    // Dummy 1-word buffers for the unused bindings.
854    while pages.len() < MAX_OCC_PAGES {
855        pages.push(create_storage(
856            device,
857            "roxlap-gpu scene.occupancy.page_dummy",
858            &[0u32],
859        ));
860    }
861    (
862        pages,
863        u32::try_from(page_words).expect("page_words fits u32"),
864        num_pages as u32,
865    )
866}
867
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn grid_static_meta_matches_wgsl_std430_size() {
        use std::mem::{align_of, size_of};

        // scene_dda.wgsl reads GridStaticMeta through
        // array<GridStaticMeta>; the std430 array stride has to match
        // the Rust size_of or wgpu rejects the binding.
        // Breakdown: 8 u32 (32) + vec3+pad (16) + 4 u32 (16) +
        // 2*[u32;6] (48) = 112; GPU.13.0 then adds two vec3<i32>+pad
        // (aabb_min, aabb_max) = 32, for 144 bytes in total.
        assert_eq!(size_of::<GridStaticMeta>(), 144);
        assert_eq!(align_of::<GridStaticMeta>(), 4);
    }

    #[test]
    fn mip_layout_offsets_accumulate() {
        const SIDE: u32 = 128;
        const MIPS: u32 = 6;

        // vsid=128 → 6 mips. Relative offsets are cumulative, and
        // mip-0 sits at 0 so mip-0 reads are byte-identical to pre-mip.
        let layout = MipLayout::for_vsid(128);
        assert_eq!(layout.mip_count, MIPS);
        assert_eq!(layout.mip_occ_rel[0], 0);
        assert_eq!(layout.mip_coff_rel[0], 0);

        // Independently rebuild the running offsets and compare. Every
        // mip holds TWO occupancy bitmaps (textured + solid) back-to-back.
        let mut expected_occ = 0u32;
        let mut expected_coff = 0u32;
        for m in 0..MIPS {
            let slot = m as usize;
            assert_eq!(layout.mip_occ_rel[slot], expected_occ, "occ rel mip {m}");
            assert_eq!(layout.mip_coff_rel[slot], expected_coff, "coff rel mip {m}");
            let mip_side = SIDE >> m;
            let columns = mip_side * mip_side;
            expected_occ += 2 * columns * occ_words_per_column_for_mip(m);
            expected_coff += columns + 1;
        }
        assert_eq!(layout.occ_words_per_slot, expected_occ);
        assert_eq!(layout.offsets_words_per_slot, expected_coff);

        // mip-0 occupancy stride is 2 × the historical vsid²·8 (tex +
        // solid bitmaps).
        let mip0_occ_words = 2 * SIDE * SIDE * 8;
        assert_eq!(layout.mip_occ_rel[1], mip0_occ_words);
        // The full ladder is only ~1/7 larger than mip-0 alone
        // (geometric 1 + 1/8 + 1/64 + …) — here on the doubled base.
        assert!(layout.occ_words_per_slot < mip0_occ_words * 5 / 4);
    }
}