roxlap_gpu/scene.rs
1//! GPU.5 — multi-grid scene upload + shared storage layout.
2//!
3//! Concatenates every chunk of every grid into one set of storage
4//! buffers + a per-grid offsets table. Each grid keeps its own
5//! `vsid`, `chunks_dims`, `origin_chunk`, and runtime transform;
6//! the shader iterates grids 0..grid_count, transforms the world
7//! camera into each grid's local frame, runs that grid's outer-DDA
8//! over chunks, and tracks the closest hit across all grids.
9//!
10//! Why concatenate rather than one bind group per grid? wgpu's
11//! `MAX_BIND_GROUPS` default is 4; demos with 10+ grids
12//! (`roxlap-scene-demo` has ground + ship + 10 marker pillars =
13//! 12) need a single bind-group layout that scales.
14
15#![allow(
16 clippy::cast_sign_loss,
17 clippy::cast_lossless,
18 clippy::cast_possible_truncation,
19 clippy::cast_possible_wrap,
20 clippy::doc_markdown,
21 clippy::missing_panics_doc,
22 clippy::needless_range_loop,
23 clippy::pub_underscore_fields
24)]
25
26use bytemuck::Zeroable;
27use wgpu::util::DeviceExt;
28
29use crate::decompress::{gpu_mip_count, occ_words_per_column_for_mip, ChunkUpload};
30use crate::grid::GridUpload;
31
/// GPU.11 — max mip levels the per-slot layout reserves room for in
/// [`GridStaticMeta`]'s relative-offset tables. Matches
/// [`crate::decompress::GPU_MAX_MIPS`]; the shader's `array<u32, N>`
/// must use the same N.
///
/// It is also the length of [`MipLayout`]'s `mip_occ_rel` /
/// `mip_coff_rel` arrays, which [`MipLayout::for_vsid`] indexes by mip
/// level — a mip count above this value would index out of bounds.
pub const MAX_GPU_MIPS: usize = 6;
37
/// GPU.11 — per-slot occupancy/color-offset strides + per-mip
/// within-slot relative offsets for a grid of side `vsid`. All
/// chunks of a grid share these (uniform mip count by
/// [`gpu_mip_count`]). `colors` keep a fixed per-grid stride (at
/// least [`COLORS_PER_CHUNK_WORDS`]); each mip's colours are
/// concatenated within that block and indexed by the chunk's own
/// (absolute) `color_offsets`.
#[derive(Debug, Clone, Copy)]
pub struct MipLayout {
    /// Number of mip levels stored per slot (`gpu_mip_count(vsid)`).
    pub mip_count: u32,
    /// Per-slot occupancy stride in u32 words, summed over all mips
    /// (each mip contributes a textured and a solid bitmap).
    pub occ_words_per_slot: u32,
    /// Per-slot color_offsets stride in u32 words, summed over all
    /// mips (`cols + 1` entries per mip).
    pub offsets_words_per_slot: u32,
    /// Within-slot u32 offset where mip `m`'s occupancy starts.
    pub mip_occ_rel: [u32; MAX_GPU_MIPS],
    /// Within-slot u32 offset where mip `m`'s color_offsets start.
    pub mip_coff_rel: [u32; MAX_GPU_MIPS],
}
55
56impl MipLayout {
57 #[must_use]
58 pub fn for_vsid(vsid: u32) -> Self {
59 let mip_count = gpu_mip_count(vsid);
60 let mut mip_occ_rel = [0u32; MAX_GPU_MIPS];
61 let mut mip_coff_rel = [0u32; MAX_GPU_MIPS];
62 let mut occ_acc = 0u32;
63 let mut coff_acc = 0u32;
64 for m in 0..mip_count {
65 mip_occ_rel[m as usize] = occ_acc;
66 mip_coff_rel[m as usize] = coff_acc;
67 let vsid_m = vsid >> m;
68 let cols = vsid_m * vsid_m;
69 // Each mip stores TWO bitmaps back-to-back: the textured
70 // occupancy then the solid occupancy (cliff-face fix). The
71 // shader reads solid at `tex_base + cols*occ_words_per_col`.
72 occ_acc += 2 * cols * occ_words_per_column_for_mip(m);
73 coff_acc += cols + 1;
74 }
75 Self {
76 mip_count,
77 occ_words_per_slot: occ_acc,
78 offsets_words_per_slot: coff_acc,
79 mip_occ_rel,
80 mip_coff_rel,
81 }
82 }
83}
84
/// Default (minimum) per-chunk colour-slot stride, in u32 words
/// (256 KiB). Each chunk's colour data lives at
/// `meta_idx * stride` within its grid's colours range, where
/// `stride` is this constant or — when the grid's densest chunk at
/// upload time carries more colours — that larger count (see
/// [`GpuSceneResident::upload`]). Fixed-stride layout means
/// every slot — present or absent at upload time — has the same
/// capacity, so [`GpuSceneResident::refresh_chunk`] can always
/// write new colour data into the slot when a chunk arrives via
/// streaming or is re-baked.
///
/// 65536 u32s = 256 KiB. Scene-demo's densest ground-hills chunks
/// run ~36 k colour entries (~144 KiB) — multiple textured voxels
/// per column at slopes/cliffs; 256 KiB gives ~1.8× headroom.
/// Memory cost on the demo's 32×32×1 static grid: 1024 slots ×
/// 256 KiB = 256 MiB colours (~830 MiB resident scene total).
/// Chunks past the grid's stride truncate with a stderr warn; GPU.7
/// sliding-window storage removes the cap entirely.
pub const COLORS_PER_CHUNK_WORDS: u32 = 65536;
101
/// Number of separate storage bindings the concatenated occupancy
/// buffer is split ("paged") across. A single storage binding may
/// not exceed the device's `max_storage_buffer_binding_size` — on
/// strict drivers that's a hard 128 MiB (lavapipe), which the
/// streaming demo's occupancy already reaches. Splitting into pages
/// keeps every binding under the limit while preserving a single
/// global word index in the shader (each page is a whole number of
/// chunk slots, so no slot ever straddles a page boundary).
///
/// On GPUs with multi-GiB binding limits (NVK, native Vulkan) the
/// whole buffer fits in page 0, the other bindings get a 1-word
/// dummy, and the shader's page select is a single perfectly-
/// predicted uniform branch → zero hot-loop cost. 4 pages covers
/// 512 MiB of occupancy even on a 128 MiB-per-binding device.
///
/// [`GpuSceneResident::occupancy_pages`] always holds exactly this
/// many buffers (real pages first, then dummies).
pub const MAX_OCC_PAGES: usize = 4;
117
/// Per-grid runtime transform — voxlap-style (world → grid-local).
/// `rotation` is column-major and encodes the inverse rotation
/// applied to the world camera basis before passing it to that
/// grid's marcher. Identity for the ground; non-trivial for the
/// rotating ship.
///
/// The [`Default`] value is the identity transform (zero origin,
/// identity rotation).
#[derive(Debug, Clone, Copy)]
pub struct GridRuntimeTransform {
    /// Grid-local position of the world origin = `-rotation⁻¹ ·
    /// grid.position` for a `GridTransform { position, rotation }`.
    /// The host computes this once per frame.
    pub grid_origin_world: [f64; 3],
    /// 3×3 inverse rotation (column-major).
    pub world_to_grid_rotation: [[f32; 3]; 3],
}
132
133impl Default for GridRuntimeTransform {
134 fn default() -> Self {
135 Self {
136 grid_origin_world: [0.0, 0.0, 0.0],
137 world_to_grid_rotation: [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]],
138 }
139 }
140}
141
/// CPU-side aggregation of every grid in a scene. Built once at
/// startup; per-grid transforms are recomputed each frame and
/// passed to `render_scene` separately.
pub struct SceneUpload {
    /// One entry per grid. A grid's index in this vector is the
    /// `scene_idx` used by [`GpuSceneResident::refresh_chunk`] /
    /// [`GpuSceneResident::evict_chunk`].
    pub grids: Vec<GridUpload>,
}
148
149impl SceneUpload {
150 #[must_use]
151 pub fn grid_count(&self) -> u32 {
152 u32::try_from(self.grids.len()).unwrap_or(u32::MAX)
153 }
154}
155
/// Per-grid static metadata: offsets into the concatenated storage
/// buffers + the grid's slot-pool dimensions. Uploaded once.
///
/// GPU.7 changes: `chunks_dims` and `origin_chunk` were dropped.
/// The shader uses modular slot indexing
/// (`chunk_idx & (pool_dims - 1)`) and verifies slot identity via
/// `slot_chunk_idx[slot]`, so the upload-time bbox is no longer
/// relevant to the shader.
///
/// `#[repr(C)]` + `Pod`: the struct is uploaded byte-for-byte, so
/// field order and the explicit `_pad*` words are part of the layout
/// and must stay in sync with the shader-side declaration.
#[repr(C)]
#[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable, Debug)]
pub struct GridStaticMeta {
    /// `occupancy` u32-word offset where this grid's data starts.
    pub occupancy_offset: u32,
    /// `color_offsets` u32-word offset where this grid's data starts.
    pub color_offsets_offset: u32,
    /// `colors` u32-word offset where this grid's data starts.
    pub colors_offset: u32,
    /// `chunk_colors_base` u32-word offset where this grid's per-slot
    /// colour bases start (one u32 per slot).
    pub chunk_colors_base_offset: u32,
    /// `chunk_occupancy` u32-word offset where this grid's per-slot
    /// occupancy bitmap starts (one bit per slot, 32 slots per word).
    pub chunk_occupancy_offset: u32,
    /// New in GPU.7: u32-word offset where this grid's
    /// `slot_chunk_idx` array starts (one `vec3<i32>` per slot,
    /// i.e. 3 u32 words each, plus 1 padding word for std430).
    pub slot_chunk_idx_offset: u32,
    /// The grid's `vsid`; the per-slot mip layout is derived from it
    /// via [`MipLayout::for_vsid`].
    pub vsid: u32,
    /// Number of slots in the pool (product of `pool_dims`).
    pub total_slots: u32,
    /// Slot-pool dimensions; each is a power of two (asserted at
    /// upload) so modular slot indexing can use a bit mask.
    pub pool_dims: [u32; 3],
    /// Explicit padding word; written as 0.
    pub _pad0: u32,
    /// GPU.11 — per-slot occupancy stride (sum over all mips).
    /// `meta_id`'s occupancy slab starts at
    /// `occupancy_offset + meta_id * occ_words_per_slot`.
    pub occ_words_per_slot: u32,
    /// GPU.11 — per-slot color_offsets stride (sum over all mips).
    pub offsets_words_per_slot: u32,
    /// GPU.11 — number of mip levels stored per slot.
    pub mip_count: u32,
    /// Explicit padding word; written as 0.
    pub _pad1: u32,
    /// GPU.11 — within-slot u32 offset where mip `m`'s occupancy
    /// starts. `mip_occ_rel[0] == 0` so mip-0 reads are unchanged.
    pub mip_occ_rel: [u32; MAX_GPU_MIPS],
    /// GPU.11 — within-slot u32 offset where mip `m`'s color_offsets
    /// start. `mip_coff_rel[0] == 0`.
    pub mip_coff_rel: [u32; MAX_GPU_MIPS],
    /// GPU.13.0 — occupied chunk-AABB (inclusive) in chunk-index space.
    /// The outer DDA stops once `p_chunk` passes this box along the
    /// ray's travel direction (no resident chunk can lie ahead). An
    /// empty grid uses the inverted sentinel (`aabb_min = i32::MAX`,
    /// `aabb_max = i32::MIN`) so every ray early-outs immediately.
    /// Maintained live: [`GpuSceneResident::refresh_chunk`] /
    /// [`GpuSceneResident::evict_chunk`] recompute + re-upload it.
    pub aabb_min: [i32; 3],
    /// Explicit padding word after `aabb_min`; written as 0.
    pub _pad2: i32,
    /// Inclusive upper corner of the occupied chunk-AABB (see
    /// `aabb_min`).
    pub aabb_max: [i32; 3],
    /// Explicit padding word after `aabb_max`; written as 0.
    pub _pad3: i32,
}
208
/// Sentinel chunk_idx written into empty slot_chunk_idx entries.
/// Real chunk indices never use `i32::MIN`, so the shader can
/// distinguish empty slots from collisions via a single equality
/// check.
///
/// On the host an entry counts as empty only when all three
/// components equal the sentinel (see `aabb_of_slots`).
pub const SLOT_EMPTY_SENTINEL: [i32; 3] = [i32::MIN, i32::MIN, i32::MIN];
214
/// GPU-resident storage for an entire scene's grids.
///
/// Built by [`GpuSceneResident::upload`]; individual slots are then
/// patched in place by `refresh_chunk` / `evict_chunk`.
pub struct GpuSceneResident {
    /// Number of grids in the uploaded scene
    /// ([`SceneUpload::grid_count`] at upload time).
    pub grid_count: u32,
    /// Concatenated per-slot occupancy, split into up to
    /// [`MAX_OCC_PAGES`] storage bindings so no single binding
    /// exceeds the device's `max_storage_buffer_binding_size`. The
    /// vec is always exactly `MAX_OCC_PAGES` long — pages past
    /// `occupancy_num_pages` are 1-word dummies kept only so the
    /// bind group has a buffer for every layout entry. Page p holds
    /// the global word range `[p*occupancy_page_words,
    /// (p+1)*occupancy_page_words)`; `occupancy_page_words` is a
    /// whole number of chunk slots so no slot straddles a boundary.
    pub occupancy_pages: Vec<wgpu::Buffer>,
    /// Words per occupancy page (a multiple of `occ_words_per_slot`).
    pub occupancy_page_words: u32,
    /// Number of real (non-dummy) pages in `occupancy_pages`.
    pub occupancy_num_pages: u32,
    /// Concatenated per-slot color_offsets of every grid.
    pub all_color_offsets: wgpu::Buffer,
    /// Concatenated per-slot colour blocks of every grid.
    pub all_colors: wgpu::Buffer,
    /// Concatenated per-slot colour base offsets (one u32 per slot,
    /// relative to the grid's `colors_offset`).
    pub all_chunk_colors_base: wgpu::Buffer,
    /// Concatenated per-grid slot-occupancy bitmaps (one bit per slot).
    pub all_chunk_occupancy: wgpu::Buffer,
    /// GPU.7 — per-slot chunk_idx for identity verification in the
    /// shader. Stored as `vec3<i32>` with std430 16-byte stride
    /// (each entry is `[i32; 4]` on the host: x, y, z, _pad).
    pub all_slot_chunk_idx: wgpu::Buffer,
    /// One [`GridStaticMeta`] per grid.
    pub grid_static_meta: wgpu::Buffer,
    /// Sum of the byte sizes of the CPU-side arrays packed at upload.
    pub total_bytes: u64,
    /// Cached static metadata for the host's frame-loop work.
    /// NOTE: an empty scene still holds one zeroed dummy entry here
    /// (padding for the storage binding), so `static_meta.len()` can
    /// exceed the number of real grids.
    pub static_meta: Vec<GridStaticMeta>,
    /// CPU shadow of the per-grid chunk-occupancy bitmap. Each entry
    /// is the u32 word at `chunk_occupancy_offset + (mi >> 5)`.
    /// `refresh_chunk` / `evict_chunk` flip the right bit + write
    /// the affected word back to the GPU.
    pub(crate) chunk_occupancy_shadow: Vec<Vec<u32>>,
    /// CPU shadow of `slot_chunk_idx`. Indexed `[scene_idx][slot]`
    /// → `[i32; 4]` (vec3 + pad). Host uses this to detect "slot is
    /// holding a different chunk than expected" + as the eviction
    /// origin.
    pub(crate) slot_chunk_idx_shadow: Vec<Vec<[i32; 4]>>,
    /// Per-grid colour stride in u32 words (the adaptive
    /// [`COLORS_PER_CHUNK_WORDS`]-or-larger value chosen at upload to
    /// fit the grid's densest chunk). `refresh_chunk` reads it so a
    /// streamed re-upload addresses colours with the same stride the
    /// initial upload used.
    pub(crate) colors_stride_shadow: Vec<u32>,
}
261
262impl GpuSceneResident {
263 /// Pack + upload `info`. Each grid is uploaded as a contiguous
264 /// slab inside the shared storage buffers; per-grid offsets
265 /// live in `grid_static_meta`. The grid count is bounded only by
266 /// the device's storage-buffer limits (per-grid cameras + metadata
267 /// are runtime-sized storage arrays, not a fixed shader array).
268 pub fn upload(device: &wgpu::Device, info: &SceneUpload) -> Self {
269 let grid_count = info.grid_count();
270
271 let mut all_occupancy: Vec<u32> = Vec::new();
272 let mut all_color_offsets: Vec<u32> = Vec::new();
273 let mut all_colors: Vec<u32> = Vec::new();
274 let mut all_chunk_colors_base: Vec<u32> = Vec::new();
275 let mut all_chunk_occupancy: Vec<u32> = Vec::new();
276 let mut all_slot_chunk_idx: Vec<i32> = Vec::new();
277 let mut static_meta: Vec<GridStaticMeta> = Vec::with_capacity(info.grids.len());
278 let mut chunk_occupancy_shadow: Vec<Vec<u32>> = Vec::with_capacity(info.grids.len());
279 let mut slot_chunk_idx_shadow: Vec<Vec<[i32; 4]>> = Vec::with_capacity(info.grids.len());
280 // Per-grid colour stride (words/slot) — adaptive to the grid's
281 // densest chunk (see the in-loop derivation). `refresh_chunk`
282 // reads it back so streamed re-uploads use the same stride.
283 let mut grid_colors_strides: Vec<u32> = Vec::with_capacity(info.grids.len());
284
285 for grid in &info.grids {
286 let vsid = grid.vsid;
287 // GPU.11 — per-slot strides span the whole mip ladder.
288 let layout = MipLayout::for_vsid(vsid);
289 let occ_words_per_slot = layout.occ_words_per_slot as usize;
290 let offsets_words_per_slot = layout.offsets_words_per_slot as usize;
291 // Per-slot colour stride. The fixed-stride layout gives every
292 // slot — present or not — the same capacity, so streaming /
293 // re-bake can write a fresh chunk's colours into any slot.
294 // [`COLORS_PER_CHUNK_WORDS`] is sized for sparse terrain
295 // chunks (~36 k colours); a *fully dense* chunk (the cave
296 // demo's single 128×128×256 chunk carries ~207 k colours
297 // across its mip ladder) needs more, or its colours truncate
298 // and the chunk's high-`y` columns render black. Grow the
299 // stride to the grid's densest chunk, floored at the default
300 // so denser chunks that stream in later still fit the common
301 // case. Per-grid: a sparse grid keeps the small stride; only
302 // a grid that actually holds dense chunks pays for the
303 // bigger one.
304 let max_chunk_colors = grid
305 .chunks
306 .iter()
307 .map(|(_, c)| c.mips.iter().map(|m| m.colors.len()).sum::<usize>())
308 .max()
309 .unwrap_or(0);
310 let colors_stride = (COLORS_PER_CHUNK_WORDS as usize).max(max_chunk_colors);
311 grid_colors_strides.push(colors_stride as u32);
312
313 // Validate pool_dims are powers of 2 — required for the
314 // shader's `chunk_idx & (pool_dims - 1)` modular slot
315 // indexing.
316 assert!(
317 grid.pool_dims[0].is_power_of_two()
318 && grid.pool_dims[1].is_power_of_two()
319 && grid.pool_dims[2].is_power_of_two(),
320 "scene grid: pool_dims {:?} must all be powers of 2",
321 grid.pool_dims,
322 );
323 let pool_x = grid.pool_dims[0] as usize;
324 let pool_y = grid.pool_dims[1] as usize;
325 let pool_z = grid.pool_dims[2] as usize;
326 let total_slots = pool_x * pool_y * pool_z;
327
328 let mut grid_occupancy = vec![0u32; total_slots * occ_words_per_slot];
329 let mut grid_color_offsets = vec![0u32; total_slots * offsets_words_per_slot];
330 let mut grid_colors = vec![0u32; total_slots * colors_stride];
331 let mut grid_chunk_colors_base = vec![0u32; total_slots];
332 for i in 0..total_slots {
333 grid_chunk_colors_base[i] = (i * colors_stride) as u32;
334 }
335 let mut grid_chunk_occupancy = vec![0u32; total_slots.div_ceil(32)];
336 // slot_chunk_idx: vec3<i32> per slot, std430 stride = 16
337 // bytes (4 u32 words: x, y, z, _pad). Initialise every
338 // slot to the empty sentinel; populated slots overwrite
339 // with the actual chunk_idx below.
340 let mut grid_slot_chunk_idx: Vec<[i32; 4]> = Vec::with_capacity(total_slots);
341 for _ in 0..total_slots {
342 grid_slot_chunk_idx.push([
343 SLOT_EMPTY_SENTINEL[0],
344 SLOT_EMPTY_SENTINEL[1],
345 SLOT_EMPTY_SENTINEL[2],
346 0,
347 ]);
348 }
349
350 let mask_x = (grid.pool_dims[0] - 1) as i32;
351 let mask_y = (grid.pool_dims[1] - 1) as i32;
352 let mask_z = (grid.pool_dims[2] - 1) as i32;
353 let chunks_per_layer = pool_x * pool_y;
354
355 for (chunk_idx, chunk) in &grid.chunks {
356 assert_eq!(chunk.vsid, vsid, "scene grid: chunk vsid mismatch");
357 let sx = (chunk_idx[0] & mask_x) as usize;
358 let sy = (chunk_idx[1] & mask_y) as usize;
359 let sz = (chunk_idx[2] & mask_z) as usize;
360 let slot_idx = sx + sy * pool_x + sz * chunks_per_layer;
361
362 // GPU.11 — write each mip at its within-slot offset.
363 // occupancy + color_offsets land in per-mip sub-blocks
364 // (mip-0 first, so its data is byte-identical to the
365 // pre-mip layout); colours of every mip concatenate
366 // into the slot's fixed COLORS_PER_CHUNK_WORDS block in
367 // level order, indexed by each chunk's own absolute
368 // `color_offsets`.
369 let occ_start = slot_idx * occ_words_per_slot;
370 let off_start = slot_idx * offsets_words_per_slot;
371 let col_start = slot_idx * colors_stride;
372 let mut color_cursor = 0usize;
373 for (m, mip) in chunk.mips.iter().enumerate() {
374 let occ_dst = occ_start + layout.mip_occ_rel[m] as usize;
375 grid_occupancy[occ_dst..occ_dst + mip.occupancy.len()]
376 .copy_from_slice(&mip.occupancy);
377 // Solid bitmap immediately follows the textured one.
378 let solid_dst = occ_dst + mip.occupancy.len();
379 grid_occupancy[solid_dst..solid_dst + mip.solid_occupancy.len()]
380 .copy_from_slice(&mip.solid_occupancy);
381 let coff_dst = off_start + layout.mip_coff_rel[m] as usize;
382 grid_color_offsets[coff_dst..coff_dst + mip.color_offsets.len()]
383 .copy_from_slice(&mip.color_offsets);
384
385 let remaining = colors_stride.saturating_sub(color_cursor);
386 let n = mip.colors.len().min(remaining);
387 if n < mip.colors.len() {
388 eprintln!(
389 "roxlap-gpu SceneUpload: scene grid chunk {chunk_idx:?} mip {m} \
390 colours overflow COLORS_PER_CHUNK_WORDS ({colors_stride}); \
391 truncating",
392 );
393 }
394 grid_colors[col_start + color_cursor..col_start + color_cursor + n]
395 .copy_from_slice(&mip.colors[..n]);
396 color_cursor += n;
397 }
398
399 if !chunk.mips[0].colors.is_empty() {
400 grid_chunk_occupancy[slot_idx >> 5] |= 1u32 << (slot_idx & 31);
401 }
402 grid_slot_chunk_idx[slot_idx] = [chunk_idx[0], chunk_idx[1], chunk_idx[2], 0];
403 }
404
405 // Slot_chunk_idx storage offset: each entry is 4 u32
406 // words (vec3 padded to 16 bytes in std430).
407 let slot_chunk_idx_offset = u32::try_from(all_slot_chunk_idx.len()).expect("fits");
408 // GPU.13.0 — occupied chunk-AABB for the outer-DDA early-out.
409 let (aabb_min, aabb_max) = aabb_of_slots(&grid_slot_chunk_idx);
410 let meta = GridStaticMeta {
411 occupancy_offset: u32::try_from(all_occupancy.len()).expect("fits"),
412 color_offsets_offset: u32::try_from(all_color_offsets.len()).expect("fits"),
413 colors_offset: u32::try_from(all_colors.len()).expect("fits"),
414 chunk_colors_base_offset: u32::try_from(all_chunk_colors_base.len()).expect("fits"),
415 chunk_occupancy_offset: u32::try_from(all_chunk_occupancy.len()).expect("fits"),
416 slot_chunk_idx_offset,
417 vsid,
418 total_slots: total_slots as u32,
419 pool_dims: grid.pool_dims,
420 _pad0: 0,
421 occ_words_per_slot: layout.occ_words_per_slot,
422 offsets_words_per_slot: layout.offsets_words_per_slot,
423 mip_count: layout.mip_count,
424 _pad1: 0,
425 mip_occ_rel: layout.mip_occ_rel,
426 mip_coff_rel: layout.mip_coff_rel,
427 aabb_min,
428 _pad2: 0,
429 aabb_max,
430 _pad3: 0,
431 };
432
433 chunk_occupancy_shadow.push(grid_chunk_occupancy.clone());
434 slot_chunk_idx_shadow.push(grid_slot_chunk_idx.clone());
435
436 all_occupancy.extend_from_slice(&grid_occupancy);
437 all_color_offsets.extend_from_slice(&grid_color_offsets);
438 all_colors.extend_from_slice(&grid_colors);
439 all_chunk_colors_base.extend_from_slice(&grid_chunk_colors_base);
440 all_chunk_occupancy.extend_from_slice(&grid_chunk_occupancy);
441 for entry in &grid_slot_chunk_idx {
442 all_slot_chunk_idx.extend_from_slice(entry);
443 }
444 static_meta.push(meta);
445 }
446
447 // Pad an empty scene's storage buffers — wgpu rejects
448 // zero-size storage bindings.
449 if all_occupancy.is_empty() {
450 all_occupancy.push(0);
451 }
452 if all_color_offsets.is_empty() {
453 all_color_offsets.push(0);
454 }
455 if all_colors.is_empty() {
456 all_colors.push(0);
457 }
458 if all_chunk_colors_base.is_empty() {
459 all_chunk_colors_base.push(0);
460 }
461 if all_chunk_occupancy.is_empty() {
462 all_chunk_occupancy.push(0);
463 }
464 if all_slot_chunk_idx.is_empty() {
465 // 4 zeros = single padded vec3<i32>. wgpu rejects
466 // zero-sized storage bindings.
467 all_slot_chunk_idx.extend_from_slice(&[0; 4]);
468 }
469 if static_meta.is_empty() {
470 static_meta.push(GridStaticMeta::zeroed());
471 }
472
473 let occupancy_bytes = (all_occupancy.len() * 4) as u64;
474 let color_offsets_bytes = (all_color_offsets.len() * 4) as u64;
475 let colors_bytes = (all_colors.len() * 4) as u64;
476 let chunk_colors_base_bytes = (all_chunk_colors_base.len() * 4) as u64;
477 let chunk_occupancy_bytes = (all_chunk_occupancy.len() * 4) as u64;
478 let slot_chunk_idx_bytes = (all_slot_chunk_idx.len() * 4) as u64;
479 let static_meta_bytes = (static_meta.len() * std::mem::size_of::<GridStaticMeta>()) as u64;
480 let total_bytes = occupancy_bytes
481 + color_offsets_bytes
482 + colors_bytes
483 + chunk_colors_base_bytes
484 + chunk_occupancy_bytes
485 + slot_chunk_idx_bytes
486 + static_meta_bytes;
487
488 // Split the concatenated occupancy across storage pages so no
489 // single binding exceeds the device limit. Page size is a
490 // whole number of chunk slots (slot-aligned) so no per-slot
491 // refresh write ever straddles two pages.
492 // GPU.11 — page alignment is now the whole-ladder per-slot
493 // occupancy stride so a slot (all its mips) never straddles a
494 // page boundary.
495 let slot_align_words = info
496 .grids
497 .iter()
498 .map(|g| u64::from(MipLayout::for_vsid(g.vsid).occ_words_per_slot))
499 .max()
500 .unwrap_or(1)
501 .max(1);
502 let (occupancy_pages, occupancy_page_words, occupancy_num_pages) =
503 split_occupancy_pages(device, &all_occupancy, slot_align_words);
504 let all_color_offsets =
505 create_storage(device, "roxlap-gpu scene.color_offsets", &all_color_offsets);
506 let all_colors = create_storage(device, "roxlap-gpu scene.colors", &all_colors);
507 let all_chunk_colors_base = create_storage(
508 device,
509 "roxlap-gpu scene.chunk_colors_base",
510 &all_chunk_colors_base,
511 );
512 let all_chunk_occupancy = create_storage(
513 device,
514 "roxlap-gpu scene.chunk_occupancy",
515 &all_chunk_occupancy,
516 );
517 // GPU.7 slot identity verification buffer. i32 storage.
518 let all_slot_chunk_idx_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
519 label: Some("roxlap-gpu scene.slot_chunk_idx"),
520 contents: bytemuck::cast_slice(&all_slot_chunk_idx),
521 usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
522 });
523 let grid_static_meta = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
524 label: Some("roxlap-gpu scene.grid_static_meta"),
525 contents: bytemuck::cast_slice(&static_meta),
526 // GPU.13.0 — COPY_DST so the live chunk-AABB can be patched
527 // into a grid's meta on refresh_chunk / evict_chunk.
528 usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
529 });
530
531 Self {
532 grid_count,
533 occupancy_pages,
534 occupancy_page_words,
535 occupancy_num_pages,
536 all_color_offsets,
537 all_colors,
538 all_chunk_colors_base,
539 all_chunk_occupancy,
540 all_slot_chunk_idx: all_slot_chunk_idx_buf,
541 grid_static_meta,
542 total_bytes,
543 static_meta,
544 chunk_occupancy_shadow,
545 slot_chunk_idx_shadow,
546 colors_stride_shadow: grid_colors_strides,
547 }
548 }
549
550 pub fn resident_bytes(&self) -> u64 {
551 self.total_bytes
552 }
553
554 /// Install or refresh a chunk in its modular pool slot. GPU.7
555 /// generalises GPU.6's in-place refresh: any chunk_idx maps to
556 /// a slot via `chunk_idx & (pool_dims - 1)`. The previous
557 /// occupant (if a different chunk) is silently replaced — the
558 /// host is responsible for guaranteeing that the pool is sized
559 /// large enough that two simultaneously-resident chunks never
560 /// collide on the same slot.
561 pub fn refresh_chunk(
562 &mut self,
563 queue: &wgpu::Queue,
564 scene_idx: usize,
565 chunk_idx: [i32; 3],
566 chunk: &ChunkUpload,
567 ) -> RefreshOutcome {
568 let Some(meta) = self.static_meta.get(scene_idx).copied() else {
569 return RefreshOutcome::SceneIdxOob;
570 };
571 let slot_idx = modular_slot_idx(chunk_idx, meta.pool_dims);
572
573 // GPU.11 — the per-slot strides span the full mip ladder; the
574 // resident's layout was built from the same `MipLayout`.
575 let layout = MipLayout::for_vsid(meta.vsid);
576 let occ_words_per_slot = layout.occ_words_per_slot as usize;
577 let offsets_words_per_slot = layout.offsets_words_per_slot as usize;
578 // Same adaptive stride the initial upload chose for this grid.
579 let colors_stride = self
580 .colors_stride_shadow
581 .get(scene_idx)
582 .map_or(COLORS_PER_CHUNK_WORDS as usize, |&s| s as usize);
583
584 assert_eq!(
585 chunk.mips.len() as u32,
586 layout.mip_count,
587 "refresh_chunk: mip count mismatch (chunk {} vs grid {})",
588 chunk.mips.len(),
589 layout.mip_count,
590 );
591
592 // ---- occupancy ----
593 // Route each mip's write to its page. Page size is slot-
594 // aligned (see `split_occupancy_pages`) so the whole slot's
595 // occupancy ladder lands in a single page.
596 let slot_occ_base = meta.occupancy_offset as usize + slot_idx * occ_words_per_slot;
597 let page_words = self.occupancy_page_words as usize;
598 let page = slot_occ_base / page_words;
599 let slot_local_word = slot_occ_base % page_words;
600 debug_assert!(
601 slot_local_word + occ_words_per_slot <= page_words,
602 "occupancy slot straddles a page boundary — page size not slot-aligned",
603 );
604 let off_slot_base = meta.color_offsets_offset as usize + slot_idx * offsets_words_per_slot;
605 let col_slot_base = meta.colors_offset as usize + slot_idx * colors_stride;
606
607 let mut outcome = RefreshOutcome::Ok;
608 let mut color_cursor = 0usize;
609 for (m, mip) in chunk.mips.iter().enumerate() {
610 // occupancy (textured) then solid, back-to-back.
611 let local = slot_local_word + layout.mip_occ_rel[m] as usize;
612 queue.write_buffer(
613 &self.occupancy_pages[page],
614 (local * 4) as u64,
615 bytemuck::cast_slice(&mip.occupancy),
616 );
617 queue.write_buffer(
618 &self.occupancy_pages[page],
619 ((local + mip.occupancy.len()) * 4) as u64,
620 bytemuck::cast_slice(&mip.solid_occupancy),
621 );
622 // color_offsets
623 let coff = off_slot_base + layout.mip_coff_rel[m] as usize;
624 queue.write_buffer(
625 &self.all_color_offsets,
626 (coff * 4) as u64,
627 bytemuck::cast_slice(&mip.color_offsets),
628 );
629 // colours (concatenated per slot, truncate to stride)
630 let remaining = colors_stride.saturating_sub(color_cursor);
631 let n = mip.colors.len().min(remaining);
632 if n < mip.colors.len() {
633 eprintln!(
634 "roxlap-gpu refresh_chunk: scene_idx={scene_idx} chunk_idx={chunk_idx:?} \
635 mip {m} colours overflow stride {colors_stride}; truncating",
636 );
637 outcome = RefreshOutcome::ColorsTruncated;
638 }
639 if n > 0 {
640 queue.write_buffer(
641 &self.all_colors,
642 ((col_slot_base + color_cursor) * 4) as u64,
643 bytemuck::cast_slice(&mip.colors[..n]),
644 );
645 }
646 color_cursor += n;
647 }
648
649 // ---- chunk_occupancy bit ----
650 self.set_chunk_occupancy_bit(
651 queue,
652 scene_idx,
653 &meta,
654 slot_idx,
655 !chunk.mips[0].colors.is_empty(),
656 );
657
658 // ---- slot_chunk_idx (identity for the shader) ----
659 self.set_slot_chunk_idx(queue, scene_idx, &meta, slot_idx, chunk_idx);
660
661 // ---- GPU.13.0 grid-AABB early-out box ----
662 self.sync_aabb(queue, scene_idx);
663
664 outcome
665 }
666
667 /// Evict a chunk's slot — clear its `chunk_occupancy` bit and
668 /// reset `slot_chunk_idx` to the empty sentinel. Used by the
669 /// host when a chunk disappears from the CPU-side `Grid::chunks`
670 /// (e.g. streaming eviction past `r_evict`).
671 ///
672 /// Returns `false` if `scene_idx` is past `grid_count` (no-op);
673 /// `true` otherwise.
674 pub fn evict_chunk(
675 &mut self,
676 queue: &wgpu::Queue,
677 scene_idx: usize,
678 chunk_idx: [i32; 3],
679 ) -> bool {
680 let Some(meta) = self.static_meta.get(scene_idx).copied() else {
681 return false;
682 };
683 let slot_idx = modular_slot_idx(chunk_idx, meta.pool_dims);
684 // Only evict if this slot still claims to hold `chunk_idx`.
685 // Otherwise we'd be wiping out a different (newer) chunk
686 // that happens to share the slot.
687 let shadow_entry = self.slot_chunk_idx_shadow[scene_idx][slot_idx];
688 if shadow_entry[0] != chunk_idx[0]
689 || shadow_entry[1] != chunk_idx[1]
690 || shadow_entry[2] != chunk_idx[2]
691 {
692 return true;
693 }
694 self.set_chunk_occupancy_bit(queue, scene_idx, &meta, slot_idx, false);
695 self.set_slot_chunk_idx(queue, scene_idx, &meta, slot_idx, SLOT_EMPTY_SENTINEL);
696 // GPU.13.0 — eviction may shrink the occupied box; recompute.
697 self.sync_aabb(queue, scene_idx);
698 true
699 }
700
701 fn set_chunk_occupancy_bit(
702 &mut self,
703 queue: &wgpu::Queue,
704 scene_idx: usize,
705 meta: &GridStaticMeta,
706 slot_idx: usize,
707 new_bit: bool,
708 ) {
709 let word_idx = slot_idx >> 5;
710 let bit = slot_idx & 31;
711 let shadow = &mut self.chunk_occupancy_shadow[scene_idx][word_idx];
712 let was_bit = (*shadow >> bit) & 1 == 1;
713 if new_bit == was_bit {
714 return;
715 }
716 if new_bit {
717 *shadow |= 1u32 << bit;
718 } else {
719 *shadow &= !(1u32 << bit);
720 }
721 let global_word_idx = meta.chunk_occupancy_offset as usize + word_idx;
722 queue.write_buffer(
723 &self.all_chunk_occupancy,
724 (global_word_idx * 4) as u64,
725 bytemuck::bytes_of(shadow),
726 );
727 }
728
729 fn set_slot_chunk_idx(
730 &mut self,
731 queue: &wgpu::Queue,
732 scene_idx: usize,
733 meta: &GridStaticMeta,
734 slot_idx: usize,
735 chunk_idx: [i32; 3],
736 ) {
737 let entry = [chunk_idx[0], chunk_idx[1], chunk_idx[2], 0];
738 self.slot_chunk_idx_shadow[scene_idx][slot_idx] = entry;
739 let global_word_idx = meta.slot_chunk_idx_offset as usize + slot_idx * 4;
740 queue.write_buffer(
741 &self.all_slot_chunk_idx,
742 (global_word_idx * 4) as u64,
743 bytemuck::cast_slice(&entry),
744 );
745 }
746
747 /// GPU.13.0 — recompute the grid's occupied chunk-AABB from its
748 /// `slot_chunk_idx` shadow and, if it changed, patch the grid's
749 /// [`GridStaticMeta`] on the GPU. Cheap: scans `total_slots`
750 /// entries and writes 144 bytes only when the box actually moves
751 /// (steady-state re-bakes leave it unchanged → no GPU write).
752 /// Called after every install/eviction so streaming grids keep a
753 /// tight, always-conservative early-out box.
754 fn sync_aabb(&mut self, queue: &wgpu::Queue, scene_idx: usize) {
755 let (aabb_min, aabb_max) = aabb_of_slots(&self.slot_chunk_idx_shadow[scene_idx]);
756 let meta = &mut self.static_meta[scene_idx];
757 if meta.aabb_min == aabb_min && meta.aabb_max == aabb_max {
758 return;
759 }
760 meta.aabb_min = aabb_min;
761 meta.aabb_max = aabb_max;
762 let off = (scene_idx * std::mem::size_of::<GridStaticMeta>()) as u64;
763 queue.write_buffer(&self.grid_static_meta, off, bytemuck::bytes_of(meta));
764 }
765}
766
767/// GPU.13.0 — inclusive chunk-AABB over a grid's `slot_chunk_idx`
768/// shadow, skipping the [`SLOT_EMPTY_SENTINEL`] entries. Returns the
769/// inverted sentinel box (`min = i32::MAX`, `max = i32::MIN`) when no
770/// slot is occupied, which makes the shader's `aabb_passed` early-out
771/// fire for every ray (an empty grid renders nothing).
772fn aabb_of_slots(slots: &[[i32; 4]]) -> ([i32; 3], [i32; 3]) {
773 let mut min = [i32::MAX; 3];
774 let mut max = [i32::MIN; 3];
775 for e in slots {
776 if e[0] == SLOT_EMPTY_SENTINEL[0]
777 && e[1] == SLOT_EMPTY_SENTINEL[1]
778 && e[2] == SLOT_EMPTY_SENTINEL[2]
779 {
780 continue;
781 }
782 for k in 0..3 {
783 if e[k] < min[k] {
784 min[k] = e[k];
785 }
786 if e[k] > max[k] {
787 max[k] = e[k];
788 }
789 }
790 }
791 (min, max)
792}
793
/// Maps `chunk_idx` to its slot in a pool of power-of-2 `pool_dims`.
///
/// Each component is wrapped with `chunk_idx & (pool_dims - 1)` —
/// the same two's-complement AND the shader uses, so negative chunk
/// indices wrap into range — and the wrapped coordinates are then
/// flattened x-fastest: `x + y * dims.x + z * dims.x * dims.y`.
///
/// Every dimension must be non-zero (a zero dimension underflows the
/// mask computation).
#[must_use]
pub fn modular_slot_idx(chunk_idx: [i32; 3], pool_dims: [u32; 3]) -> usize {
    let wrap = |axis: usize| -> usize {
        let mask = (pool_dims[axis] - 1) as i32;
        (chunk_idx[axis] & mask) as usize
    };
    let (slot_x, slot_y, slot_z) = (wrap(0), wrap(1), wrap(2));
    let dim_x = pool_dims[0] as usize;
    let dim_y = pool_dims[1] as usize;
    slot_x + slot_y * dim_x + slot_z * dim_x * dim_y
}
808
/// Result reported by `GpuSceneResident::refresh_chunk`.
///
/// Most callers can ignore it. The one variant worth surfacing is
/// [`RefreshOutcome::ColorsTruncated`], which means the chunk's
/// colour data overflowed the per-slot stride and was clipped.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RefreshOutcome {
    /// The refresh completed without any of the conditions below.
    Ok,
    /// The chunk carried more colours than `COLORS_PER_CHUNK_WORDS`
    /// allows, so the GPU sees only the first `stride` colours.
    /// Raise `COLORS_PER_CHUNK_WORDS` for content that hits this.
    ColorsTruncated,
    /// Kept only for ABI compatibility: since the GPU.7 modular
    /// pool, chunks are no longer rejected by bbox.
    ChunkOutOfBbox,
    /// `scene_idx` is past `grid_count`. Programming error.
    SceneIdxOob,
}
825
826fn create_storage(device: &wgpu::Device, label: &str, data: &[u32]) -> wgpu::Buffer {
827 // GPU.6: include COPY_DST so `refresh_chunk` can `queue.write_buffer`
828 // into existing slots without rebuilding the resident.
829 device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
830 label: Some(label),
831 contents: bytemuck::cast_slice(data),
832 usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
833 })
834}
835
836/// Split the concatenated occupancy words into up to
837/// [`MAX_OCC_PAGES`] storage buffers, each no larger than the
838/// device's `max_storage_buffer_binding_size`, then pad the page
839/// list with 1-word dummy buffers so the returned vec is always
840/// exactly `MAX_OCC_PAGES` long (one buffer per bind-group entry).
841///
842/// `slot_align_words` is the per-slot occupancy stride: page size is
843/// rounded down to a multiple of it so no chunk slot — and therefore
844/// no per-slot `refresh_chunk` write — straddles a page boundary.
845/// Returns `(pages, page_words, num_pages)`.
846fn split_occupancy_pages(
847 device: &wgpu::Device,
848 words: &[u32],
849 slot_align_words: u64,
850) -> (Vec<wgpu::Buffer>, u32, u32) {
851 let total_words = words.len() as u64;
852 // wgpu 29 widened `max_storage_buffer_binding_size` to `u64`.
853 let limit_words = device.limits().max_storage_buffer_binding_size / 4;
854 // Largest slot-aligned page that fits one binding (≥ 1 slot).
855 let page_slots = (limit_words / slot_align_words).max(1);
856 let mut page_words = page_slots.saturating_mul(slot_align_words);
857 // A tiny scene (or the empty-scene 1-word pad) isn't slot-aligned;
858 // cap the page at the data length so we don't allocate emptiness.
859 page_words = page_words.min(total_words.max(1));
860 let num_pages = total_words.div_ceil(page_words);
861 assert!(
862 num_pages as usize <= MAX_OCC_PAGES,
863 "occupancy needs {num_pages} pages (>{MAX_OCC_PAGES}) at this device's \
864 {limit_words}-word binding limit; shrink the streaming pool or raise MAX_OCC_PAGES",
865 );
866
867 let mut pages: Vec<wgpu::Buffer> = Vec::with_capacity(MAX_OCC_PAGES);
868 let page_words_usize = page_words as usize;
869 for p in 0..num_pages as usize {
870 let start = p * page_words_usize;
871 let end = ((p + 1) * page_words_usize).min(words.len());
872 pages.push(create_storage(
873 device,
874 &format!("roxlap-gpu scene.occupancy.page{p}"),
875 &words[start..end],
876 ));
877 }
878 // Dummy 1-word buffers for the unused bindings.
879 while pages.len() < MAX_OCC_PAGES {
880 pages.push(create_storage(
881 device,
882 "roxlap-gpu scene.occupancy.page_dummy",
883 &[0u32],
884 ));
885 }
886 (
887 pages,
888 u32::try_from(page_words).expect("page_words fits u32"),
889 num_pages as u32,
890 )
891}
892
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the host-side size/alignment of `GridStaticMeta` to the
    /// layout the shader expects.
    #[test]
    fn grid_static_meta_matches_wgsl_std430_size() {
        // scene_dda.wgsl reads GridStaticMeta through
        // array<GridStaticMeta>; the std430 array stride has to equal
        // the Rust size_of or wgpu rejects the binding.
        // Breakdown: 8 u32 (32) + vec3+pad (16) + 4 u32 (16) +
        // 2*[u32;6] (48) = 112, and GPU.13.0 appended two
        // vec3<i32>+pad (aabb_min, aabb_max) = 32 → 144 bytes.
        let size = std::mem::size_of::<GridStaticMeta>();
        let align = std::mem::align_of::<GridStaticMeta>();
        assert_eq!(size, 144);
        assert_eq!(align, 4);
    }

    /// Checks the per-mip relative offsets and the per-slot totals
    /// of `MipLayout` against an independent running sum.
    #[test]
    fn mip_layout_offsets_accumulate() {
        // vsid=128 → 6 mips. Relative offsets are cumulative, and
        // mip-0 sits at 0 so mip-0 reads are byte-identical to pre-mip.
        let l = MipLayout::for_vsid(128);
        assert_eq!(l.mip_count, 6);
        assert_eq!(l.mip_occ_rel[0], 0);
        assert_eq!(l.mip_coff_rel[0], 0);

        // Rebuild the strides from scratch and compare level by
        // level. Every mip stores TWO occupancy bitmaps (textured +
        // solid) back-to-back.
        let mut expected_occ = 0u32;
        let mut expected_coff = 0u32;
        for m in 0..6u32 {
            let level = m as usize;
            assert_eq!(l.mip_occ_rel[level], expected_occ, "occ rel mip {m}");
            assert_eq!(l.mip_coff_rel[level], expected_coff, "coff rel mip {m}");
            let side = 128u32 >> m;
            let columns = side * side;
            expected_occ += 2 * columns * occ_words_per_column_for_mip(m);
            expected_coff += columns + 1;
        }
        assert_eq!(l.occ_words_per_slot, expected_occ);
        assert_eq!(l.offsets_words_per_slot, expected_coff);

        // The mip-0 occupancy stride is 2 × the historical vsid²·8
        // (textured + solid bitmaps).
        let mip0_occ_words = 2 * 128 * 128 * 8;
        assert_eq!(l.mip_occ_rel[1], mip0_occ_words);
        // The full ladder is only ~1/7 larger than mip-0 alone
        // (geometric 1 + 1/8 + 1/64 + …) — measured on the doubled
        // base, so it must stay under 5/4 of it.
        assert!(l.occ_words_per_slot < mip0_occ_words * 5 / 4);
    }
}