// GPU instance compaction.
//
// One thread per occlusion instance. For each instance, if the
// cull's `visible_this_frame[i]` is 1, atomicAdd 1 to the matching
// per-mesh `IndirectDrawArgs.instance_count`. The per-mesh slot
// index comes from `instances[i].mesh_meta_offset / META_SIZE` —
// matching MaterialMeshMeta's per-mesh stride.
struct OcclusionInstance {
world_aabb_min: vec3<f32>,
_pad0: u32,
world_aabb_max: vec3<f32>,
_pad1: u32,
mesh_meta_offset: u32,
instance_attr_base: u32,
// See cull.wgsl — repurposed slot, written into the
// IndirectDrawArgs by this shader so the CPU is no longer
// a writer of `args_buffer`.
index_count: u32,
_pad2: u32,
};
// IndirectDrawArgs slot (32 B): the leading 5 u32s are the WebGPU
// `drawIndexedIndirect` layout — `(index_count, instance_count,
// first_index, base_vertex, first_instance)`. The trailing 3 u32s
// are padding for nice alignment.
struct IndirectDrawArgs {
index_count: u32,
instance_count: atomic<u32>,
first_index: u32,
base_vertex: u32,
first_instance: u32,
_pad0: u32,
_pad1: u32,
_pad2: u32,
};
struct OcclusionParams {
active_count: u32,
_pad0: u32,
_pad1: u32,
_pad2: u32,
};
@group(0) @binding(0) var<storage, read> instances: array<OcclusionInstance>;
@group(0) @binding(1) var<storage, read> visible_this_frame: array<u32>;
@group(0) @binding(2) var<storage, read_write> indirect_args: array<IndirectDrawArgs>;
@group(0) @binding(3) var<uniform> params: OcclusionParams;
// Must match `MATERIAL_MESH_META_BYTE_ALIGNMENT` (256 B). The cull
// stages a mesh_meta_offset in bytes; we divide to get the per-mesh
// slot index.
const MESH_META_STRIDE_BYTES: u32 = 256u;
@compute @workgroup_size(64)
fn cs_main(@builtin(global_invocation_id) gid: vec3<u32>) {
let i = gid.x;
// Bound by the active instance count, not `arrayLength` (which
// returns capacity). Tail threads in the workgroup-rounded
// dispatch would otherwise read `visible_this_frame[i]` from
// slots that the cull's matching `if (i >= count) return` left
// untouched — i.e. last frame's value — and double-count phantom
// mesh instances. See cull.wgsl for the matched comment.
let count = params.active_count;
if (i >= count) {
return;
}
let visible = visible_this_frame[i];
if (visible == 0u) {
return;
}
let mesh_slot = instances[i].mesh_meta_offset / MESH_META_STRIDE_BYTES;
let args_capacity = arrayLength(&indirect_args);
if (mesh_slot >= args_capacity) {
return;
}
// Write the static drawIndirect fields here rather than from the
// CPU. `queue.writeBuffer` would have overwritten them BEFORE the
// already-recorded geometry pass executed (queue order ≠ command
// order), so a CPU-side prep zeroed `instance_count` ahead of the
// earlier-recorded `draw_indexed_indirect` consumer. Doing the
// writes here keeps args_buffer GPU-owned: every update happens
// in command order strictly after the geometry pass's read.
//
// first_index / base_vertex stay at zero — the args_buffer was
// cleared by `command_encoder.clear_buffer` between geometry and
// cull (see render.rs), so we don't need to re-emit them. For
// non-instanced meshes (the only path through drawIndirect),
// each mesh_slot is touched by at most one thread, so the
// non-atomic write of `index_count` and `first_instance` has no
// races; instance_count is still atomicAdded since the cull may
// mark multiple instances of one mesh visible under future
// instancing extensions.
indirect_args[mesh_slot].index_count = instances[i].index_count;
{% if write_first_instance %}
// Per-mesh slot index goes into `first_instance` so the vertex
// shader's `geometry_mesh_metas[instance_index]` storage-array
// read resolves to this mesh's meta. Requires the
// `indirect-first-instance` WebGPU feature on the device — the
// CPU only constructs the shader with this template branch when
// the feature is on. The portable path (toggle off) instead leaves
// first_instance at 0 (kept from `clear_buffer`) and threads the
// slot identity through a uniform-with-dynamic-offset bind group
// set per draw.
indirect_args[mesh_slot].first_instance = mesh_slot;
{% endif %}
atomicAdd(&indirect_args[mesh_slot].instance_count, 1u);
}