// Material classify compute pass.
//
// Per 8×8 tile, scan the visibility buffer and discover which opaque
// `shader_id`s its pixels belong to. Aggregate via a workgroup-shared
// bitmask (one bit per registered bucket, including dynamic
// materials), then thread 0 atomically appends the tile's coords to
// each bucket bit is set in. The total atomic traffic is ~1 per
// workgroup-bit, regardless of the 64 threads inside.
//
// Skybox pixels (`triangle_index == U32_MAX`) are routed to the dedicated
// SKYBOX bucket (index 0). Its opaque pipeline is the `skybox_primary.wgsl`
// writer, dispatched over that bucket's tiles; the material kernels
// (compute.wgsl) shade real geometry only. See
// material_opaque/.../skybox_primary.wgsl.
//
// The bit constants + the shader_id → bit dispatch chain + the
// per-bucket extract block are all walked from the same
// `bucket_entries` list the templated `ClassifyOutput` struct uses.
{{ shader_id_consts|safe }}
{% include "shared_wgsl/math.wgsl" %}
{% if emit_edge_data && multisampled_geometry %}
// Linearize a raw depth-buffer value into view-space Z. Ported from
// main's helpers/msaa.wgsl::viewSpaceDepth — same math, same name
// adjusted to snake_case. Required so a relative depth threshold
// (EDGE_DEPTH_THRESHOLD = 0.02 = 2%) works consistently across the
// scene; raw NDC depth is non-linear so the same threshold near vs
// far would produce wildly different real-world distances.
fn view_space_depth(camera: Camera, depth: f32, pixel_coords: vec2<f32>, screen_dims: vec2<f32>) -> f32 {
let ndc_xy = vec2<f32>(
(pixel_coords.x / screen_dims.x) * 2.0 - 1.0,
1.0 - (pixel_coords.y / screen_dims.y) * 2.0,
);
let clip_pos = vec4<f32>(ndc_xy, depth, 1.0);
let view_pos_h = camera.inv_proj * clip_pos;
let view_pos = view_pos_h.xyz / view_pos_h.w;
return view_pos.z;
}
{% endif %}
// Bits in the workgroup-shared mask. One per registered bucket; the
// SKYBOX bucket is at index 0 (its id is 0, sorting ahead of every real
// material) so the skybox routing (which assigns `BUCKET_BIT_SKYBOX` for
// fully-uncovered pixels) maps cleanly to word 0, bit 0.
// Each bucket lives in word `index / 32` at bit `index % 32`;
// `tile_mask` is `array<atomic<u32>, n_words>` so the bucket budget can
// exceed 32 by raising `MAX_BUCKET_WORDS` (n_words). At n_words == 1
// this is equivalent to the original single-word `1u << index`.
{% for entry in bucket_entries %}
const {{ entry.bucket_bit_const() }}: u32 = (1u << {{ loop.index0 % 32 }}u);
{% endfor %}
var<workgroup> tile_mask: array<atomic<u32>, {{ n_words }}u>;
@compute @workgroup_size(8, 8, 1)
fn cs_main(
@builtin(workgroup_id) wg: vec3<u32>,
@builtin(local_invocation_id) lid: vec3<u32>,
@builtin(local_invocation_index) lii: u32,
) {
// Zero the workgroup mask once per dispatch. The barrier below
// makes the zero visible to every thread before the per-pixel
// OR's land.
if lii == 0u {
{% for w in words_iter %}
atomicStore(&tile_mask[{{ w }}u], 0u);
{% endfor %}
}
workgroupBarrier();
let screen_dims = textureDimensions(visibility_data_tex);
let coords = vec2<i32>(wg.xy * 8u + lid.xy);
let in_bounds = coords.x < i32(screen_dims.x) && coords.y < i32(screen_dims.y);
// `local_word` selects which `tile_mask[]` word this pixel's bucket
// bit lives in (bucket index / 32); `local_bit` is the bit within
// that word. At n_words == 1 `local_word` is always 0.
var local_word: u32 = 0u;
var local_bit: u32 = 0u;
if in_bounds {
let vis = textureLoad(visibility_data_tex, coords, 0);
let tri = join32(vis.x, vis.y);
if tri == U32_MAX {
// Fully-uncovered ("sky") pixel → the dedicated SKYBOX bucket
// (index 0 → word 0), whose opaque pipeline is the
// `skybox_primary` writer. See material_opaque/.../skybox_primary.wgsl.
local_word = 0u;
local_bit = BUCKET_BIT_SKYBOX;
} else {
let meta_offset = join32(vis.z, vis.w);
let mesh_meta = material_mesh_metas[meta_offset / 256u];
if mesh_meta.is_hud == 0u {
// shader_id is stored as the first u32 of each
// material payload; `material_offset` is in bytes.
let shader_id = materials_data[mesh_meta.material_offset / 4u];
{% for entry in bucket_entries %}
{% if loop.first %}if{% else %}else if{% endif %} shader_id == {{ entry.shader_id_const() }} {
local_word = {{ loop.index0 / 32 }}u;
local_bit = {{ entry.bucket_bit_const() }};
}
{% endfor %}
}
// HUD pixels are redrawn by the transparency pass — skip
// them in classify so the opaque pipelines don't process
// tiles that contain only HUD geometry.
}
}
if local_bit != 0u {
atomicOr(&tile_mask[local_word], local_bit);
}
workgroupBarrier();
if lii == 0u {
// Snapshot every mask word once (avoids a per-bucket atomic
// load). `mask_word_<w>` is read by each bucket's extract block
// via its word index `index / 32`.
{% for w in words_iter %}
let mask_word_{{ w }} = atomicLoad(&tile_mask[{{ w }}u]);
{% endfor %}
let tile = vec2<u32>(wg.xy);
// One extract block per registered bucket. The atomic returns
// the previous count, which also doubles as the next free
// index into the tile array at
// `classify_output.<name>_offset + index`.
{% for entry in bucket_entries %}
if (mask_word_{{ loop.index0 / 32 }} & {{ entry.bucket_bit_const() }}) != 0u {
let idx_{{ loop.index0 }} = atomicAdd(&classify_output.{{ entry.args_field() }}.workgroup_count_x, 1u);
let slot_{{ loop.index0 }} = classify_output.{{ entry.offset_field() }} + idx_{{ loop.index0 }};
if slot_{{ loop.index0 }} < classify_output.{{ entry.offset_field() }} + classify_output.bucket_capacity {
classify_output.tiles[slot_{{ loop.index0 }}] = tile;
}
}
{% endfor %}
}
{% if emit_edge_data && multisampled_geometry %}
// ─────────────────────────────────────────────────────────────
// MSAA edge emission (Priority 3).
//
// For each pixel: scan all 4 samples; collect the distinct
// shader_ids; if there are at least 2 distinct shader_ids
// (counting "skybox" as its own shader_id), the pixel is an edge
// pixel. Allocate a compact edge_pixel_id via the global atomic,
// write its (x, y) into edge_to_xy, build the slot_map (4 bytes
// packed: up to 4 distinct shader_ids), and append a per-shader
// (edge_pixel_id, sample_mask) entry to each contributing bucket's
// sample list.
//
// Saturation: if edge_count saturates at MAX_EDGE_BUDGET, the
// `edge_buffers.edge_overflow_count` atomic increments (see else
// branch below) and we fall through without writing. The dropped
// pixels render with the primary-sample shading written by the
// material's primary pass — visually a missing-MSAA-resolve, not a
// black hole. Stage 3.8 MVP (see edge_buffers.rs::note_edge_overflow_observed)
// surfaces this via a one-shot tracing::warn when overflow is
// observed CPU-side; the full atomic-add fallback (a hash-bucketed
// overflow accumulator region) is parked as Block C.2 future work.
if (in_bounds) {
// Scan 4 samples — FULLY-STATIC version, no dynamic indexing
// anywhere. Naga/Tint silently no-op'd dynamic-index writes
// into both `vec4<u32>` and (turns out per the user's repro)
// also `array<u32, 4>` in some configurations. Working around
// entirely by unrolling and using individual `let`s.
let v0 = textureLoad(visibility_data_tex, coords, 0);
let v1 = textureLoad(visibility_data_tex, coords, 1);
let v2 = textureLoad(visibility_data_tex, coords, 2);
let v3 = textureLoad(visibility_data_tex, coords, 3);
let tri_0 = join32(v0.x, v0.y);
let tri_1 = join32(v1.x, v1.y);
let tri_2 = join32(v2.x, v2.y);
let tri_3 = join32(v3.x, v3.y);
let mat_off_0 = join32(v0.z, v0.w);
let mat_off_1 = join32(v1.z, v1.w);
let mat_off_2 = join32(v2.z, v2.w);
let mat_off_3 = join32(v3.z, v3.w);
// Helper bucket-id derivation, inlined per sample.
// `sample_sid` semantics: 0xFEu = skybox/uncovered/HUD,
// [0, bucket_count) = real bucket, 0xFFu = sentinel-unmapped
// (shouldn't happen in practice; treated like skybox below).
var sid_0: u32 = 0xFFu;
if (tri_0 == U32_MAX) {
sid_0 = 0xFEu;
} else {
let mm = material_mesh_metas[mat_off_0 / 256u];
if (mm.is_hud == 1u) { sid_0 = 0xFEu; }
else {
let raw_sid = materials_data[mm.material_offset / 4u];
{% for entry in bucket_entries %}
if (raw_sid == {{ entry.shader_id_const() }}) { sid_0 = {{ loop.index0 }}u; }
{% endfor %}
}
}
var sid_1: u32 = 0xFFu;
if (tri_1 == U32_MAX) {
sid_1 = 0xFEu;
} else {
let mm = material_mesh_metas[mat_off_1 / 256u];
if (mm.is_hud == 1u) { sid_1 = 0xFEu; }
else {
let raw_sid = materials_data[mm.material_offset / 4u];
{% for entry in bucket_entries %}
if (raw_sid == {{ entry.shader_id_const() }}) { sid_1 = {{ loop.index0 }}u; }
{% endfor %}
}
}
var sid_2: u32 = 0xFFu;
if (tri_2 == U32_MAX) {
sid_2 = 0xFEu;
} else {
let mm = material_mesh_metas[mat_off_2 / 256u];
if (mm.is_hud == 1u) { sid_2 = 0xFEu; }
else {
let raw_sid = materials_data[mm.material_offset / 4u];
{% for entry in bucket_entries %}
if (raw_sid == {{ entry.shader_id_const() }}) { sid_2 = {{ loop.index0 }}u; }
{% endfor %}
}
}
var sid_3: u32 = 0xFFu;
if (tri_3 == U32_MAX) {
sid_3 = 0xFEu;
} else {
let mm = material_mesh_metas[mat_off_3 / 256u];
if (mm.is_hud == 1u) { sid_3 = 0xFEu; }
else {
let raw_sid = materials_data[mm.material_offset / 4u];
{% for entry in bucket_entries %}
if (raw_sid == {{ entry.shader_id_const() }}) { sid_3 = {{ loop.index0 }}u; }
{% endfor %}
}
}
// Build the distinct-shader-id list (`seen[0..seen_count)`) by
// explicit static comparisons. Static `seen_*` vars avoid the
// dynamic-write-into-array problem.
var seen_0: u32 = sid_0;
var seen_1: u32 = 0xFFu;
var seen_2: u32 = 0xFFu;
var seen_3: u32 = 0xFFu;
var seen_count: u32 = 1u;
if (sid_1 != seen_0) {
seen_1 = sid_1;
seen_count = 2u;
}
let sid_2_new = (sid_2 != seen_0) && (sid_2 != seen_1);
if (sid_2_new) {
if (seen_count == 1u) { seen_1 = sid_2; }
else if (seen_count == 2u) { seen_2 = sid_2; }
seen_count = seen_count + 1u;
}
let sid_3_new = (sid_3 != seen_0) && (sid_3 != seen_1) && (sid_3 != seen_2);
if (sid_3_new) {
if (seen_count == 1u) { seen_1 = sid_3; }
else if (seen_count == 2u) { seen_2 = sid_3; }
else if (seen_count == 3u) { seen_3 = sid_3; }
seen_count = seen_count + 1u;
}
var slot_map: u32 = 0xFFFFFFFFu;
// Edge pixel: 2+ distinct shader_ids (counts skybox as one)
// OR samples cover different meshes (different mat_off).
// We deliberately do NOT include "different tri_id within the
// same mesh" — that fires at intra-mesh triangle seams of
// tessellated curved surfaces, and the resulting per-sample
// re-shading produces wireframe-like artifacts (samples on
// adjacent triangles can have wildly different bary derivs /
// depth, so the average isn't a smooth blend).
//
// SILHOUETTE detection: a pixel is a silhouette edge iff its
// 4 samples are MIXED between "covered by geometry" and "not".
// Diagnosis showed that tri_id (v.x / v.y channels) IS distinct
// per sample at silhouettes (and intra-mesh tri seams), while
// mat_meta_offset (v.z / v.w) gets broadcast across samples by
// the fragment shader output path on this Tint compile. Using
// mat_meta diff alone never caught silhouettes; using tri_id
// diff caught silhouettes AND intra-mesh tri seams — but the
// latter produced wireframe artifacts when edge_resolve
// shaded per-sample. The clean discriminator: was sample N
// *covered* by any triangle? `tri_id == U32_MAX` means
// uncovered (clear value). Mixed coverage across the 4
// samples = silhouette against skybox/uncovered region. All
// covered with differing tri_ids = intra-mesh seam (NOT a
// silhouette; skip).
let cov_0 = tri_0 != U32_MAX;
let cov_1 = tri_1 != U32_MAX;
let cov_2 = tri_2 != U32_MAX;
let cov_3 = tri_3 != U32_MAX;
// Mesh-vs-skybox silhouette (coverage mismatch) OR mesh-vs-mesh
// silhouette in the same pixel (different mat_meta_offset
// across samples). Earlier hypothesis that mat_meta gets
// broadcast by Tint may have been wrong — empirical retest with
// the depth-tex / view-space pipeline showed the per-sample data
// IS distinct after all when MULTIPLE fragments cover the same
// pixel (e.g. capsule fragment over samples 0,1 + platform
// fragment over samples 2,3 each write their own mat_meta).
let any_cov_differs = (cov_0 != cov_1)
|| (cov_0 != cov_2)
|| (cov_0 != cov_3);
let any_mat_differs = (mat_off_0 != mat_off_1)
|| (mat_off_0 != mat_off_2)
|| (mat_off_0 != mat_off_3);
let any_mesh_differs = any_cov_differs || any_mat_differs;
// PORT OF MAIN'S MSAA EDGE DETECTION (edge_mask_depth_msaa +
// edge_mask_neighbors).
//
// Two checks combined:
// (1) Per-sample VIEW-SPACE depth variance in this pixel —
// catches in-pixel mesh-vs-mesh silhouettes where ≥2
// samples cover different surfaces. Uses view-space depth
// (linearized via camera.inv_proj) so the same relative
// threshold works near AND far from the camera. Raw
// depth-buffer comparison fired too aggressively at far
// depths and not at all near the camera.
// (2) 4-neighbor view-space depth/coverage check — catches
// silhouettes where the silhouette runs *between* pixels
// (current pixel fully covered, neighbor fully uncovered
// or at very different depth). Matches main's
// edge_mask_neighbors which returns true for uncovered
// neighbors AND for covered neighbors with depth delta >
// EDGE_DEPTH_THRESHOLD.
//
// Thresholds taken verbatim from main:
// EDGE_DEPTH_THRESHOLD = 0.02 (2% relative view-space depth)
// EDGE_MSAA_DEPTH_THRESHOLD = 0.02 (same scale, same threshold)
let camera = camera_from_raw(camera_raw);
let screen_dims_f32 = vec2<f32>(f32(screen_dims.x), f32(screen_dims.y));
let pixel_center = vec2<f32>(f32(coords.x) + 0.5, f32(coords.y) + 0.5);
// ── Check 1: in-pixel depth variance (view-space) ──────────
var sample_count: u32 = 0u;
var vdmin: f32 = 1.0e9;
var vdmax: f32 = -1.0e9;
if (cov_0) {
let d = textureLoad(depth_tex, coords, 0);
let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
vdmin = min(vdmin, vd);
vdmax = max(vdmax, vd);
sample_count = sample_count + 1u;
}
if (cov_1) {
let d = textureLoad(depth_tex, coords, 1);
let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
vdmin = min(vdmin, vd);
vdmax = max(vdmax, vd);
sample_count = sample_count + 1u;
}
if (cov_2) {
let d = textureLoad(depth_tex, coords, 2);
let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
vdmin = min(vdmin, vd);
vdmax = max(vdmax, vd);
sample_count = sample_count + 1u;
}
if (cov_3) {
let d = textureLoad(depth_tex, coords, 3);
let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
vdmin = min(vdmin, vd);
vdmax = max(vdmax, vd);
sample_count = sample_count + 1u;
}
var depth_edge: bool = false;
if (sample_count >= 2u) {
let depth_range = abs(vdmax - vdmin);
let avg_depth = abs((vdmax + vdmin) * 0.5);
depth_edge = depth_range > (0.02 * avg_depth);
}
// ── Check 2: 4-neighbor coverage + normal + depth check ─────
//
// Order matches main's `edge_mask_neighbors`:
// 1. neighbor uncovered (tri_id == U32_MAX) → edge.
// 2. normal discontinuity (dot < 0.95 ≈ 18°) → edge.
// 3. depth discontinuity (relative 2% in view-space) → edge.
//
// The normal check is critical at tile-facet boundaries on the
// platform top, where adjacent same-mesh tiles share depth +
// mat_meta but rotate their surface normal by a small angle.
// Depth-only neighbour detection misses these and leaves a
// diagonal stripe of aliasing along the platform's top-front
// edge.
let EDGE_NORMAL_THRESHOLD: f32 = 0.95;
var neighbor_edge: bool = false;
if (cov_0) {
let center_d = textureLoad(depth_tex, coords, 0);
let center_vd = view_space_depth(camera, center_d, pixel_center, screen_dims_f32);
let depth_threshold = 0.02 * abs(center_vd);
// Center normal: sample-0 of the multisampled normal_tangent
// texture, unpacked via the shared TBN helper. Same convention
// the primary opaque + edge_resolve paths use.
let center_nt = textureLoad(normal_tangent_tex, coords, 0);
let center_tbn = unpack_normal_tangent(center_nt);
let center_normal = center_tbn.N;
let neighbor_offsets = array<vec2<i32>, 4>(
vec2<i32>(1, 0),
vec2<i32>(-1, 0),
vec2<i32>(0, 1),
vec2<i32>(0, -1),
);
let pixel_offsets = array<vec2<f32>, 4>(
vec2<f32>(1.0, 0.0),
vec2<f32>(-1.0, 0.0),
vec2<f32>(0.0, 1.0),
vec2<f32>(0.0, -1.0),
);
for (var ni = 0; ni < 4; ni++) {
let n_coords = coords + neighbor_offsets[ni];
if (n_coords.x < 0 || n_coords.y < 0
|| n_coords.x >= i32(screen_dims.x)
|| n_coords.y >= i32(screen_dims.y)) {
continue;
}
let nv = textureLoad(visibility_data_tex, n_coords, 0);
let n_tri = join32(nv.x, nv.y);
if (n_tri == U32_MAX) {
neighbor_edge = true;
break;
}
// Normal discontinuity (sample 0 of neighbour pixel).
let n_nt = textureLoad(normal_tangent_tex, n_coords, 0);
let n_tbn = unpack_normal_tangent(n_nt);
if (dot(center_normal, n_tbn.N) < EDGE_NORMAL_THRESHOLD) {
neighbor_edge = true;
break;
}
// Depth discontinuity (view-space relative).
let n_depth = textureLoad(depth_tex, n_coords, 0);
let n_pixel_center = pixel_center + pixel_offsets[ni];
let n_vd = view_space_depth(camera, n_depth, n_pixel_center, screen_dims_f32);
if (abs(center_vd - n_vd) > depth_threshold) {
neighbor_edge = true;
break;
}
}
}
// Edge gate. Four signals, all needed for parity with main's
// `msaa_resolve_samples` edge detection:
// * `seen_count >= 2u` — multi-shader-id silhouette
// * `any_mesh_differs` — mesh-vs-mesh (different mat_meta)
// * `depth_edge` — per-sample depth variance
// (same-mesh in-pixel, e.g. the
// platform's top/front-face seam)
// * `neighbor_edge` — 4-neighbour depth/coverage check
// (silhouette runs between pixels)
//
// depth_edge + neighbor_edge were previously disabled because
// they produced "black sides on capsules" — that was an artefact
// of the texture-pool-arrays-len template mismatch in
// `texture_pool_sample_grad` (every same-mesh edge fell into the
// `default → vec4(0)` branch in edge_resolve's compiled shader).
// With the recompile wired into `finalize_gpu_textures`
// (textures.rs), per-sample shading at same-mesh edges produces
// the right value and the depth signals can be re-enabled.
if (seen_count >= 2u || any_mesh_differs || depth_edge || neighbor_edge) {
// Allocate compact edge_pixel_id. The atomic counter lives
// in args_buffer / `edge_buffers` (drives indirect dispatch
// for final_blend); we also mirror it into edge_data's
// header so the resolve shaders can read it without binding
// args_buffer (saves a storage-buffer slot vs the 10-cap).
let edge_id = atomicAdd(&edge_buffers.edge_count, 1u);
atomicAdd(&edge_data[edge_layout.edge_count_index], 1u);
if (edge_id < edge_layout.max_edge_budget) {
// Write edge_to_xy via atomicStore (edge_data is
// declared as array<atomic<u32>> so even plain stores
// go through the atomic interface).
let packed_xy = (u32(coords.x) & 0xFFFFu) | ((u32(coords.y) & 0xFFFFu) << 16u);
atomicStore(&edge_data[edge_layout.edge_to_xy_base + edge_id], packed_xy);
// Pack slot_map: 4 bytes, each byte is a bucket index
// (or 0xFE for skybox, 0xFF for empty slot).
slot_map = (seen_0 & 0xFFu)
| ((seen_1 & 0xFFu) << 8u)
| ((seen_2 & 0xFFu) << 16u)
| ((seen_3 & 0xFFu) << 24u);
atomicStore(&edge_data[edge_layout.edge_slot_map_base + edge_id], slot_map);
// Clear this edge pixel's 4 accumulator slots (4 vec4<f32>
// = 16 u32 words) so a bucket whose per-shader edge_resolve
// pipeline isn't resident this frame leaves count==0 — which
// final_blend skips — instead of reading a stale value left
// at the same slot index by a previous frame's edge pixel.
// This is what makes the resolve per-bucket-independent
// (render_pass.rs dropped the all-or-nothing gate). Bounded
// by the live edge count: only freshly-allocated edge pixels
// are cleared, never the full MAX_EDGE_BUDGET accumulator.
let accum_clear_base = edge_layout.accumulator_base + edge_id * 16u;
for (var ci: u32 = 0u; ci < 16u; ci = ci + 1u) {
atomicStore(&edge_data[accum_clear_base + ci], 0u);
}
// For each per-shader sample mask: append (edge_id,
// sample_mask) to that bucket's sample list. Skybox
// samples route to the skybox sample list (separate
// reserved region). Unrolled per-sample to avoid any
// dynamic indexing into per-sample arrays.
var skybox_mask: u32 = 0u;
{% for entry in bucket_entries %}
var mask_{{ loop.index0 }}: u32 = 0u;
{% endfor %}
// Sample 0
if (sid_0 == 0xFEu) { skybox_mask |= 1u; }
{% for entry in bucket_entries %}
else if (sid_0 == {{ loop.index0 }}u) { mask_{{ loop.index0 }} |= 1u; }
{% endfor %}
// Sample 1
if (sid_1 == 0xFEu) { skybox_mask |= 2u; }
{% for entry in bucket_entries %}
else if (sid_1 == {{ loop.index0 }}u) { mask_{{ loop.index0 }} |= 2u; }
{% endfor %}
// Sample 2
if (sid_2 == 0xFEu) { skybox_mask |= 4u; }
{% for entry in bucket_entries %}
else if (sid_2 == {{ loop.index0 }}u) { mask_{{ loop.index0 }} |= 4u; }
{% endfor %}
// Sample 3
if (sid_3 == 0xFEu) { skybox_mask |= 8u; }
{% for entry in bucket_entries %}
else if (sid_3 == {{ loop.index0 }}u) { mask_{{ loop.index0 }} |= 8u; }
{% endfor %}
// Append per-bucket entries. The slot is allocated via
// the data-side per-bucket counter (which doubles as
// edge_resolve's `entry_count` thread-bound). The
// args-side `workgroup_count_x` only needs to grow
// every 64th slot, matching edge_resolve's
// `@workgroup_size(64)` — without that gate we'd
// dispatch 64× more workgroups than needed (each would
// early-exit via the `thread_index >= entry_count`
// check, but the wasted dispatch overhead adds up at
// high edge density).
{% for entry in bucket_entries %}
if (mask_{{ loop.index0 }} != 0u) {
let slot_idx_{{ loop.index0 }} = atomicAdd(&edge_data[edge_layout.per_shader_count_base + {{ loop.index0 }}u], 1u);
if ((slot_idx_{{ loop.index0 }} & 63u) == 0u) {
atomicAdd(&edge_buffers.{{ entry.args_field() }}_edge.workgroup_count_x, 1u);
}
if (slot_idx_{{ loop.index0 }} < edge_layout.sample_entries_per_bucket) {
let entry_packed_{{ loop.index0 }} = (edge_id & 0x00FFFFFFu) | ((mask_{{ loop.index0 }} & 0xFFu) << 24u);
atomicStore(&edge_data[edge_layout.{{ entry.args_field() }}_sample_list_base + slot_idx_{{ loop.index0 }}], entry_packed_{{ loop.index0 }});
}
}
{% endfor %}
// Skybox sample list — same allocator pattern as the
// per-bucket case above. Slot via the data-side counter
// (read by skybox_edge_resolve as `entry_count`); the
// workgroup_count_x in args_buffer grows every 64th
// slot to match skybox_edge_resolve's @workgroup_size(64).
if (skybox_mask != 0u) {
let sky_slot_idx = atomicAdd(&edge_data[edge_layout.skybox_count_index], 1u);
if ((sky_slot_idx & 63u) == 0u) {
atomicAdd(&edge_buffers.skybox_edge_args.workgroup_count_x, 1u);
}
if (sky_slot_idx < edge_layout.sample_entries_per_bucket) {
let sky_entry_packed = (edge_id & 0x00FFFFFFu) | ((skybox_mask & 0xFFu) << 24u);
atomicStore(&edge_data[edge_layout.skybox_sample_list_base + sky_slot_idx], sky_entry_packed);
}
}
// Final blend args: one workgroup per edge pixel
// (workgroup_size = 64, so divide by 64).
if ((edge_id & 63u) == 0u) {
atomicAdd(&edge_buffers.final_blend_args.workgroup_count_x, 1u);
}
} else {
atomicAdd(&edge_buffers.edge_overflow_count, 1u);
}
}
}
{% endif %}
}