awsm-renderer 0.4.2

awsm-renderer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
//! Cluster-LOD cut compute pass (Phase B, B.2).
//!
//! Built eagerly (like `light_culling` / `material_prep`) and gated by
//! `virtual_geometry`. Holds the cut pipeline + bind-group layout; the per-mesh
//! [`ClusterLodBuffers`] and the bind-group instance are created/recreated when a
//! cluster mesh loads. Inert (no dispatch) until a cluster mesh is present.

use awsm_renderer_core::command::compute_pass::ComputePassDescriptor;
use awsm_renderer_core::renderer::AwsmRendererWebGpu;
use glam::{Mat4, Vec3};

use crate::bind_group_layout::BindGroupLayouts;
use crate::cluster_lod::{select_cut_per_cluster, ClusterPage};
use crate::error::Result;
use crate::meshes::MeshKey;
use crate::render::RenderContext;
use crate::render_passes::cluster_lod::{
    bind_group::{ClusterCompactionBindGroups, ClusterCutBindGroups},
    buffers::ClusterLodBuffers,
    pipeline::ClusterLodPipelines,
};
use crate::render_passes::RenderPassInitContext;

/// The cluster-LOD compute pass: shared compute pipelines, the bind-group
/// prototypes stamped per mesh, and the per-mesh resident state that
/// [`Self::dispatch_all`] iterates over each frame.
pub struct ClusterLodRenderPass {
    /// Compute pipeline keys for the pass; [`Self::dispatch_all`] looks up
    /// `pipelines.cut` and `pipelines.compaction` from here.
    pub pipelines: ClusterLodPipelines,
    /// Bind-group prototypes (layout key + paging flag, NO bound group) cloned per
    /// mesh state in [`Self::upload_pages`]. The cut/compaction layouts are identical
    /// for every cluster mesh; only the bound buffers differ — so we capture the
    /// layout once (it needs the init ctx) and stamp a fresh bind group per mesh.
    proto_cut_bg: ClusterCutBindGroups,
    /// Compaction counterpart of `proto_cut_bg`; cloned alongside it in
    /// [`Self::upload_pages`].
    proto_compaction_bg: ClusterCompactionBindGroups,
    /// Resident cluster meshes — one per nanite asset, keyed by `render_mesh`.
    /// SEVERAL render simultaneously: each owns its page-pool buffers, cut +
    /// compaction bind groups, page count, and (under `cluster_paging`) its own
    /// per-frame paging manager. Empty until a cluster mesh loads.
    pub states: Vec<ClusterMeshState>,
}

/// One resident cluster mesh's GPU state (see [`ClusterLodRenderPass::states`]).
///
/// Created on first [`ClusterLodRenderPass::upload_pages`] for a render mesh and
/// dropped by [`ClusterLodRenderPass::remove_mesh`].
pub struct ClusterMeshState {
    /// The render mesh `M` (`add_raw_mesh(cm.positions, cm.indices)`) whose exploded
    /// vertex buffer this cut's compacted indirect stream draws into (M's own draw is
    /// suppressed). The state's identity key.
    pub render_mesh: MeshKey,
    /// Page count (the cut dispatch bound). Set to `pages.len()` by
    /// `upload_pages`; states where this is `0` are skipped by `dispatch_all`.
    pub cluster_count: u32,
    /// This mesh's GPU buffers. Allocated in `upload_pages`
    /// (`ClusterLodBuffers::with_capacity`, then `ensure_capacity` on every upload)
    /// and written per slot by `stream_paging`.
    pub buffers: ClusterLodBuffers,
    /// Bind group bound at group 0 of the "Cluster Cut" compute pass. Recreated
    /// against `buffers` by both `upload_pages` and `upload_resident`.
    pub bind_groups: ClusterCutBindGroups,
    /// Bind group bound at group 0 of the "Cluster Compaction" compute pass.
    /// Recreated against `buffers` by `upload_pages`.
    pub compaction_bind_groups: ClusterCompactionBindGroups,
    /// Gap-B dynamic paging (CPU-driven). `Some` only under `cluster_paging`; holds
    /// the FULL un-clamped DAG + drives per-frame residency for THIS mesh.
    /// Installed by `init_paging`; `stream_paging` skips states where it is `None`.
    pub paging: Option<ClusterPaging>,
}

/// Gap-B dynamic-paging manager (CPU-driven design — see NORTHSTAR-GAPS step 3).
///
/// At our cluster counts (≤~80k for a 5–10M-tri asset) the CPU runs the LOD cut
/// itself each frame against the FULL un-clamped DAG and diffs the desired set
/// against current residency — no GPU feedback/readback. This struct holds that
/// persistent state plus pooled scratch that is reused across frames.
///
/// Driven by [`ClusterLodRenderPass::stream_paging`], which each frame computes
/// the desired cut into `desired`, streams desired-but-absent clusters into free
/// page-pool slots, and — only once every desired cluster is resident — evicts
/// the slots whose cluster is no longer desired.
pub struct ClusterPaging {
    /// The full DAG's un-clamped cluster pages (`lod_error`/`parent_error` are the
    /// bake's real interval — NOT the resident frontier's clamped `0`/`MAX`). The
    /// CPU cut's input. Each page's `first_index`/`index_count` index into
    /// [`Self::indices`] (the original `cm.indices`).
    pub pages: Vec<ClusterPage>,
    /// Reused scratch for the per-frame desired cut (cluster ids). Refilled by
    /// `select_cut_per_cluster` each frame, so the allocation is reused.
    desired: Vec<u32>,
    /// Frames the paging update has run (diagnostics + LRU timestamps).
    /// Incremented once per `stream_paging` call for this mesh.
    frame: u64,
    /// Last desired-count we logged — log only on change, so the on-device console
    /// shows the cut tracking the camera without per-frame spam. Starts at
    /// `usize::MAX` so the first frame always logs.
    last_desired_logged: usize,

    // ── CPU geometry, used to build a streamed slot's exploded bytes ──
    /// Original unique-vertex positions (`cm.positions`); a slot's exploded verts
    /// gather these by `indices[page.first_index + k]`.
    #[allow(dead_code)] // NOTE(review): now read by `stream_paging`; this allow looks removable — confirm
    positions: Vec<[f32; 3]>,
    /// Original unique-vertex normals (`cm.normals`); empty ⇒ the streamer defaults.
    #[allow(dead_code)] // NOTE(review): now read by `stream_paging`; this allow looks removable — confirm
    normals: Vec<[f32; 3]>,
    /// Original triangle index buffer (`cm.indices`) the pages' spans address.
    #[allow(dead_code)] // NOTE(review): now read by `stream_paging`; this allow looks removable — confirm
    indices: Vec<u32>,

    // ── residency bookkeeping in FULL-DAG cluster space (the page-pool state) ──
    /// `resident[cluster_id]` = its page-pool slot, or `-1` (absent). Length =
    /// `pages.len()`.
    #[allow(dead_code)] // NOTE(review): now read + written by `stream_paging`; allow looks removable — confirm
    resident: Vec<i32>,
    /// `slot_cluster[slot]` = the full-DAG cluster currently in that slot, or `-1`
    /// (free). Length = `pool_slots`.
    #[allow(dead_code)] // NOTE(review): now read + written by `stream_paging`; allow looks removable — confirm
    slot_cluster: Vec<i32>,
    /// `slot_last_used[slot]` = the `frame` the slot was last in the desired cut
    /// (LRU eviction key). Length = `pool_slots`. Currently only written (by
    /// `stream_paging`); nothing in this file reads it back yet.
    #[allow(dead_code)] // written by `stream_paging`; no reader in this file yet
    slot_last_used: Vec<u64>,
    /// Fixed page-pool capacity (slots) — the VRAM bound. Equals the length of the
    /// seed `slot_cluster` passed at construction.
    #[allow(dead_code)] // NOTE(review): now read by `stream_paging`; this allow looks removable — confirm
    pool_slots: usize,

    // ── pooled per-frame scratch (reused across streams) ──
    /// One slot's exploded visibility bytes (`PAGE_VERTS*56`), filled by
    /// `pack_visibility_slot_bytes` and then written to the GPU data buffer.
    #[allow(dead_code)] // NOTE(review): now used by `stream_paging`; this allow looks removable — confirm
    slot_bytes_scratch: Vec<u8>,
    /// One slot's triangle-order corner indices (`PAGE_VERTS`), padded to a full
    /// slot; reused per stream.
    corner_scratch: Vec<u32>,
    /// One slot's slot-relative source indices (`slot * page_verts + k`); reused
    /// per stream.
    src_idx_scratch: Vec<u32>,
    /// Pooled byte staging for the per-slot GPU page-entry write, reused every
    /// stream so the buffer-write helper doesn't allocate.
    page_bytes_scratch: Vec<u8>,
    /// Pooled byte staging for the per-slot source-indices span write, reused
    /// every stream so the buffer-write helper doesn't allocate.
    src_bytes_scratch: Vec<u8>,
    /// `desired_flag[cluster_id]` = is this cluster in the current frame's desired
    /// cut. Pooled membership test for the eviction sweep (length = full DAG). Set
    /// from `desired` at the top of `stream_paging`, cleared at the end — so it is
    /// intended to be `false` everywhere between frames (no per-frame alloc, no
    /// per-frame clear of the whole vector).
    desired_flag: Vec<bool>,
    /// Exploded verts per page-pool slot (= `CLUSTER_PAGE_VERTS`); the slot byte math.
    page_verts: usize,
}

/// Geometry + initial residency seed for the paging manager (step 20b-iii). Keeps
/// [`ClusterLodRenderPass::init_paging`] from growing an unwieldy argument list.
///
/// Consumed by value: every field is moved into the resulting [`ClusterPaging`].
pub struct ClusterPagingInit {
    /// Full-DAG un-clamped pages (`first_index`/`index_count` into `indices`).
    pub pages: Vec<ClusterPage>,
    /// Original unique-vertex positions; the streamer gathers a slot's exploded
    /// vertices from these via `indices`.
    pub positions: Vec<[f32; 3]>,
    /// Original unique-vertex normals, handed to the streamer together with
    /// `positions`.
    pub normals: Vec<[f32; 3]>,
    /// Original triangle index buffer that the pages' index spans address.
    pub indices: Vec<u32>,
    /// `slot_cluster[slot]` = the full-DAG cluster id initially uploaded into that
    /// slot (the load-time frontier, in slot order). Its length is the pool size.
    /// Negative entries mean the slot starts free.
    pub slot_cluster: Vec<i32>,
    /// Exploded verts per slot (`CLUSTER_PAGE_VERTS` from the loader).
    pub page_verts: usize,
}

impl ClusterPaging {
    fn new(init: ClusterPagingInit) -> Self {
        let ClusterPagingInit {
            pages,
            positions,
            normals,
            indices,
            slot_cluster,
            page_verts,
        } = init;
        let pool_slots = slot_cluster.len();
        let pages_len = pages.len();
        // Invert slot_cluster → resident (full-DAG cluster space).
        let mut resident = vec![-1i32; pages_len];
        for (slot, &cid) in slot_cluster.iter().enumerate() {
            if cid >= 0 && (cid as usize) < resident.len() {
                resident[cid as usize] = slot as i32;
            }
        }
        Self {
            pages,
            desired: Vec::new(),
            frame: 0,
            last_desired_logged: usize::MAX,
            positions,
            normals,
            indices,
            resident,
            slot_cluster,
            slot_last_used: vec![0u64; pool_slots],
            pool_slots,
            slot_bytes_scratch: Vec::new(),
            corner_scratch: Vec::new(),
            src_idx_scratch: Vec::new(),
            page_bytes_scratch: Vec::new(),
            src_bytes_scratch: Vec::new(),
            desired_flag: vec![false; pages_len],
            page_verts,
        }
    }
}

impl ClusterLodRenderPass {
    /// Creates the pass: the cut + compaction bind-group prototypes first, then
    /// the compute pipelines built against them. No cluster mesh is resident yet
    /// (`states` starts empty).
    ///
    /// **Creating the pipeline validates `cluster_cut.wgsl` on-device** (the GPU
    /// driver compiles it here) — the first on-GPU checkpoint for the per-cluster
    /// cut.
    ///
    /// # Errors
    /// Propagates any failure from building the bind-group prototypes or the
    /// pipelines.
    pub async fn new(ctx: &mut RenderPassInitContext<'_>) -> Result<Self> {
        let cut_proto = ClusterCutBindGroups::new(ctx)?;
        let compaction_proto = ClusterCompactionBindGroups::new(ctx)?;
        let pipelines = ClusterLodPipelines::new(ctx, &cut_proto, &compaction_proto).await?;

        let pass = Self {
            pipelines,
            proto_cut_bg: cut_proto,
            proto_compaction_bg: compaction_proto,
            states: Vec::new(),
        };
        Ok(pass)
    }

    /// Looks up the resident cluster state keyed by `render_mesh`; `None` when
    /// that mesh has no cluster state loaded.
    pub fn state(&self, render_mesh: MeshKey) -> Option<&ClusterMeshState> {
        let at = self
            .states
            .iter()
            .position(|entry| entry.render_mesh == render_mesh)?;
        self.states.get(at)
    }

    /// Mutable counterpart of [`Self::state`]: the first state whose
    /// `render_mesh` matches, or `None`.
    fn state_mut(&mut self, render_mesh: MeshKey) -> Option<&mut ClusterMeshState> {
        let at = self
            .states
            .iter()
            .position(|entry| entry.render_mesh == render_mesh)?;
        self.states.get_mut(at)
    }

    /// Forgets every state keyed by `render_mesh` (e.g. when its node is
    /// removed), dropping the buffers, bind groups and paging manager it owns.
    /// Does nothing if the mesh has no cluster state.
    pub fn remove_mesh(&mut self, render_mesh: MeshKey) {
        self.states
            .retain(|resident| resident.render_mesh != render_mesh);
    }

    /// Arms Gap-B paging for `render_mesh` (called at mesh load, only under
    /// `cluster_paging`): builds a [`ClusterPaging`] from the full DAG, the CPU
    /// geometry and the initial residency seed in `init`, and stores it on the
    /// mesh's state, replacing any previous manager (so it is idempotent per mesh).
    ///
    /// The drawn set is still whatever [`Self::upload_pages`] uploaded; this only
    /// seeds the CPU-side bookkeeping that [`Self::stream_paging`] consumes.
    ///
    /// If the mesh has no cluster state yet, `init` is dropped and a warning is
    /// logged — call [`Self::upload_pages`] first.
    pub fn init_paging(&mut self, render_mesh: MeshKey, init: ClusterPagingInit) {
        let Some(state) = self.state_mut(render_mesh) else {
            tracing::warn!(
                "init_paging: no cluster state for the render mesh (call upload_pages first)"
            );
            return;
        };
        state.paging = Some(ClusterPaging::new(init));
    }

    /// Per-frame Gap-B dynamic paging (CPU-driven; step 20b-iv-b-2b). No-op unless
    /// paging is armed + a cluster render mesh + buffers exist.
    ///
    /// Computes the camera-adaptive complete antichain (`select_cut_per_cluster` over
    /// the full DAG), then STREAMS the desired clusters that aren't resident yet into
    /// FREE page-pool slots — writing the slot's exploded visibility verts, its GPU
    /// page (clamped always-draw), its slot-aligned source indices, and its residency
    /// entry. Free-slots-only ⇒ it only ADDS coverage, never removes it ⇒ crack-free
    /// (the coarser ancestor stays resident+drawn until its region is fully refined;
    /// at most a transient z-fight overlap, never a hole). Bounded: refinement caps at
    /// `pool_slots` (coarser where it doesn't fit). LRU eviction (to recycle slots that
    /// leave the antichain) is the next layer (20b-iv-b-2c). All scratch is pooled.
    pub fn stream_paging(
        &mut self,
        gpu: &AwsmRendererWebGpu,
        meshes: &crate::meshes::Meshes,
        cam_pos: Vec3,
        tan_half_fov_y: f32,
        viewport_h: f32,
        pixel_budget: f32,
    ) -> Result<()> {
        for state in &mut self.states {
            let render_mesh = state.render_mesh;
            let buffers = &state.buffers;
            let Some(p) = state.paging.as_mut() else {
                continue;
            };
            p.frame += 1;
            select_cut_per_cluster(
                &p.pages,
                &Mat4::IDENTITY,
                cam_pos,
                tan_half_fov_y,
                viewport_h,
                pixel_budget,
                &mut p.desired,
            );

            let data_buf = meshes.visibility_geometry_data_gpu_buffer();
            let data_off = meshes.visibility_geometry_data_buffer_offset(render_mesh)?;
            let pv = p.page_verts;
            const STRIDE: usize = 56; // visibility vertex bytes
            const MAX_LOADS: usize = 96; // cap streams/frame so a camera jump doesn't hitch

            // Mark this frame's desired cut for the membership test the eviction sweep
            // (below) needs. Cleared again at the end of the frame so `desired_flag` is
            // all-false between frames — no per-frame alloc, no full-vector clear.
            for &c in &p.desired {
                p.desired_flag[c as usize] = true;
            }

            let mut next_free = 0usize; // free-slot scan cursor (monotone within a frame)
            let mut streamed = 0usize;
            // True once every desired cluster is resident — only then is it crack-free to
            // evict the resident-but-no-longer-desired slots (resident becomes EXACTLY the
            // antichain `desired`). While loads are still pending we keep the coarser
            // ancestors resident (transient overlap = z-fight, never holes).
            let mut all_desired_resident = true;
            let mut i = 0usize;
            while i < p.desired.len() {
                let cluster = p.desired[i] as usize;
                i += 1;
                if p.resident[cluster] >= 0 {
                    p.slot_last_used[p.resident[cluster] as usize] = p.frame; // keep warm
                    continue;
                }
                if streamed >= MAX_LOADS {
                    all_desired_resident = false; // capped this frame ⇒ more to stream next
                    continue;
                }
                // Find a FREE slot (stream-into-free-before-evict ⇒ crack-free).
                while next_free < p.pool_slots && p.slot_cluster[next_free] >= 0 {
                    next_free += 1;
                }
                if next_free >= p.pool_slots {
                    all_desired_resident = false; // pool full — bounded partial refinement
                    break;
                }
                let slot = next_free;
                next_free += 1;

                let page = p.pages[cluster];
                let ic = (page.index_count as usize).min(pv);
                let f = page.first_index as usize;
                // Slot corner indices (triangle order), padded to a full slot.
                p.corner_scratch.clear();
                for k in 0..pv {
                    let v = if k < ic {
                        p.indices[f + k]
                    } else if ic > 0 {
                        p.indices[f]
                    } else {
                        0
                    };
                    p.corner_scratch.push(v);
                }
                crate::mesh_pack::pack_visibility_slot_bytes(
                    &p.positions,
                    &p.normals,
                    &p.corner_scratch,
                    slot,
                    pv,
                    awsm_renderer_core::pipeline::primitive::FrontFace::Ccw,
                    &mut p.slot_bytes_scratch,
                );
                gpu.write_buffer(
                    data_buf,
                    Some(crate::renderer::cluster_slot_data_offset(
                        data_off,
                        slot,
                        pv * STRIDE,
                    )),
                    p.slot_bytes_scratch.as_slice(),
                    None,
                    None,
                )?;
                // The slot's GPU page: clamp always-draw, slot-aligned source span.
                let mut gp = page;
                gp.lod_error = 0.0;
                gp.parent_error = f32::MAX;
                gp.first_index = (slot * pv) as u32;
                gp.index_count = ic as u32;
                buffers.write_page_entry(gpu, slot, &gp, &mut p.page_bytes_scratch)?;
                p.src_idx_scratch.clear();
                for k in 0..ic {
                    p.src_idx_scratch.push((slot * pv + k) as u32);
                }
                buffers.write_source_indices_span(
                    gpu,
                    (slot * pv) as u32,
                    &p.src_idx_scratch,
                    &mut p.src_bytes_scratch,
                )?;
                // GPU resident is SLOT-indexed: mark this slot drawable (value = slot).
                buffers.write_resident_entry(gpu, slot, slot as i32)?;
                p.resident[cluster] = slot as i32;
                p.slot_cluster[slot] = cluster as i32;
                p.slot_last_used[slot] = p.frame;
                streamed += 1;
            }

            // Eviction sweep — only when the whole desired cut is resident, so dropping
            // the no-longer-desired slots leaves EXACTLY the crack-free antichain. This is
            // what makes the draw FALL on zoom-out and recycles slots within the bounded
            // pool. Capped per frame; pages stay always-draw so a free slot simply isn't
            // selected by the cut (resident < 0).
            let mut evicted = 0usize;
            if all_desired_resident {
                for slot in 0..p.pool_slots {
                    if evicted >= MAX_LOADS {
                        break;
                    }
                    let c = p.slot_cluster[slot];
                    if c >= 0 && !p.desired_flag[c as usize] {
                        buffers.write_resident_entry(gpu, slot, -1)?;
                        p.resident[c as usize] = -1;
                        p.slot_cluster[slot] = -1;
                        evicted += 1;
                    }
                }
            }

            // Clear this frame's desired marks (keep `desired_flag` all-false between
            // frames without a full-vector reset).
            for &c in &p.desired {
                p.desired_flag[c as usize] = false;
            }

            let desired = p.desired.len();
            if streamed > 0 || evicted > 0 || desired != p.last_desired_logged {
                p.last_desired_logged = desired;
                tracing::info!(
                    "cluster paging (Gap B, frame {}): desired={desired} (full DAG={}, pool={}), \
                 streamed {streamed}, evicted {evicted} [20b-iv-b-2b]",
                    p.frame,
                    p.pages.len(),
                    p.pool_slots,
                );
            }
        }
        Ok(())
    }

    /// Uploads a cluster mesh's pages and source indices (once, at mesh load).
    ///
    /// Finds the state keyed by `render_mesh`, creating it (with cloned bind-group
    /// prototypes and freshly allocated buffers) if it does not exist yet. Then
    /// grows the buffers to fit, writes `pages` and `indices`, records the page
    /// count, and rebuilds both the cut and the compaction bind groups against
    /// the (possibly reallocated) buffers. Idempotent per mesh.
    ///
    /// # Errors
    /// Propagates buffer allocation/write and bind-group creation failures.
    pub fn upload_pages(
        &mut self,
        render_mesh: MeshKey,
        gpu: &AwsmRendererWebGpu,
        layouts: &BindGroupLayouts,
        pages: &[ClusterPage],
        indices: &[u32],
    ) -> Result<()> {
        let page_total = pages.len() as u32;
        let index_total = indices.len() as u32;

        // Find-or-create by index, so the state can be re-borrowed mutably
        // afterwards without a second search.
        let existing = self
            .states
            .iter()
            .position(|entry| entry.render_mesh == render_mesh);
        let at = match existing {
            Some(found) => found,
            None => {
                let bind_groups = self.proto_cut_bg.clone();
                let compaction_bind_groups = self.proto_compaction_bg.clone();
                // Allocate at least one page / one triangle so the buffers are
                // never zero-sized.
                let buffers = ClusterLodBuffers::with_capacity(
                    gpu,
                    page_total.max(1),
                    index_total.max(3),
                )?;
                self.states.push(ClusterMeshState {
                    render_mesh,
                    cluster_count: 0,
                    buffers,
                    bind_groups,
                    compaction_bind_groups,
                    paging: None,
                });
                self.states.len() - 1
            }
        };

        let state = &mut self.states[at];
        state.buffers.ensure_capacity(gpu, page_total, index_total)?;
        state.buffers.write_pages(gpu, pages)?;
        state.buffers.write_source_indices(gpu, indices)?;
        state.cluster_count = page_total;
        state.bind_groups.recreate(gpu, layouts, &state.buffers)?;
        state
            .compaction_bind_groups
            .recreate(gpu, layouts, &state.buffers)?;
        Ok(())
    }

    /// Writes the Gap-B residency table (`cluster_id → slot`) for `render_mesh`
    /// and rebuilds the cut bind group, whose `resident` entry could only be
    /// bound once the table existed.
    ///
    /// Call after [`Self::upload_pages`] (the buffers must exist). Returns
    /// `Ok(())` without doing anything when the mesh has no cluster state. Only
    /// the `cluster_paging` path calls this.
    ///
    /// # Errors
    /// Propagates the buffer write or bind-group creation failure.
    pub fn upload_resident(
        &mut self,
        render_mesh: MeshKey,
        gpu: &AwsmRendererWebGpu,
        layouts: &BindGroupLayouts,
        resident: &[i32],
    ) -> Result<()> {
        if let Some(state) = self.state_mut(render_mesh) {
            state.buffers.write_resident(gpu, resident)?;
            // The table now exists, so (re)build the bind group that references it.
            state.bind_groups.recreate(gpu, layouts, &state.buffers)?;
        }
        Ok(())
    }

    /// Dispatch the per-cluster cut + compaction for EVERY resident cluster mesh
    /// (states with `cluster_count == 0` are skipped).
    ///
    /// For each state: write the per-frame params, run the `cut` compute over
    /// `ClusterLodBuffers::dispatch_groups(cluster_count)` workgroups, then reset
    /// the indirect draw args and run the compaction over the same workgroup
    /// count. After this each state's `draw_args` drives the geometry pass's
    /// cluster draw override, so this MUST run before the geometry pass (it reads
    /// the results this frame). (Instance world is identity for now — the
    /// per-instance world is the follow-up; the camera + viewport are live.)
    ///
    /// `first_instance` is per state: the render mesh M's meta slot, so the
    /// indirect draw's vertex shader resolves M's material meta
    /// (`geometry_mesh_metas[instance_index]`). If the meta offset lookup fails it
    /// silently falls back to slot `0`.
    ///
    /// Returns the readback kick `(readback_buffer, cluster_count)` for the FIRST
    /// state with clusters when the cadence (frame 5, then every 30) fires and no
    /// readback is in flight — the caller copies `draw_args` → readback inside the
    /// encoder and maps it after submit (diagnostics; logs the drawn cut on
    /// change). Returns `Ok(None)` otherwise, including when no cluster mesh is
    /// loaded.
    ///
    /// A poisoned `readback` mutex is recovered rather than panicking: it only
    /// guards diagnostic counters, so a panic elsewhere must not take the render
    /// loop down with it.
    ///
    /// # Errors
    /// Propagates pipeline/bind-group lookups, buffer writes and the readback
    /// copy.
    pub fn dispatch_all(
        &self,
        ctx: &RenderContext,
        readback: &std::sync::Mutex<crate::renderer::ClusterCutReadback>,
        cam_pos: Vec3,
        tan_half_fov_y: f32,
        viewport_h: f32,
        pixel_budget: f32,
    ) -> Result<Option<(web_sys::GpuBuffer, u32)>> {
        for state in self.states.iter().filter(|s| s.cluster_count > 0) {
            // first_instance = the render mesh M's meta slot.
            let first_instance = ctx
                .meshes
                .meta
                .geometry_buffer_offset(state.render_mesh)
                .ok()
                .map(|off| {
                    off as u32
                        / crate::meshes::meta::geometry_meta::GEOMETRY_MESH_META_BYTE_ALIGNMENT
                            as u32
                })
                .unwrap_or(0);

            // Both passes cover the same clusters, so compute the group count once.
            let workgroups = ClusterLodBuffers::dispatch_groups(state.cluster_count);

            state.buffers.write_params(
                ctx.gpu,
                &Mat4::IDENTITY,
                cam_pos,
                tan_half_fov_y,
                viewport_h,
                pixel_budget,
                1.0,
                state.cluster_count,
            )?;
            {
                let cp = ctx.command_encoder.begin_compute_pass(Some(
                    &ComputePassDescriptor::new(Some("Cluster Cut")).into(),
                ));
                cp.set_pipeline(ctx.pipelines.compute.get(self.pipelines.cut)?);
                cp.set_bind_group(0, state.bind_groups.get_bind_group()?, None)?;
                cp.dispatch_workgroups(workgroups, Some(1), Some(1));
                cp.end();
            }
            // queue.writeBuffer is ordered before the submitted compute pass.
            state.buffers.init_draw_args(ctx.gpu, first_instance)?;
            {
                let cp = ctx.command_encoder.begin_compute_pass(Some(
                    &ComputePassDescriptor::new(Some("Cluster Compaction")).into(),
                ));
                cp.set_pipeline(ctx.pipelines.compute.get(self.pipelines.compaction)?);
                cp.set_bind_group(0, state.compaction_bind_groups.get_bind_group()?, None)?;
                cp.dispatch_workgroups(workgroups, Some(1), Some(1));
                cp.end();
            }
        }

        // Readback verification of draw_args.index_count for the first resident
        // mesh. Re-fires on a cadence (frame 5, then every 30) so the drawn cut is
        // observable as the camera/scene change; the async handler logs on change.
        let Some(state) = self.states.iter().find(|s| s.cluster_count > 0) else {
            return Ok(None);
        };
        let want = {
            // Recover the guard if another holder panicked; the counters stay usable.
            let mut st = readback
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            st.frames += 1;
            !st.inflight && (st.frames == 5 || st.frames % 30 == 0)
        };
        if want {
            // Copy only the first 4 bytes of draw_args (the index count).
            ctx.command_encoder.copy_buffer_to_buffer(
                &state.buffers.draw_args_buffer,
                0,
                &state.buffers.readback_buffer,
                0,
                4,
            )?;
            return Ok(Some((
                state.buffers.readback_buffer.clone(),
                state.cluster_count,
            )));
        }
        Ok(None)
    }
}