awsm-renderer 0.3.1

// Material classify compute pass.
//
// Per 8×8 tile, scan the visibility buffer and discover which opaque
// `shader_id`s its pixels belong to. Aggregate via a workgroup-shared
// bitmask (one bit per registered bucket, including dynamic
// materials), then thread 0 atomically appends the tile's coords to
// each bucket bit is set in. The total atomic traffic is ~1 per
// workgroup-bit, regardless of the 64 threads inside.
//
// Skybox pixels (`triangle_index == U32_MAX`) are routed to the dedicated
// SKYBOX bucket (index 0). Its opaque pipeline is the `skybox_primary.wgsl`
// writer, dispatched over that bucket's tiles; the material kernels
// (compute.wgsl) shade real geometry only. See
// material_opaque/.../skybox_primary.wgsl.
//
// The bit constants + the shader_id → bit dispatch chain + the
// per-bucket extract block are all walked from the same
// `bucket_entries` list the templated `ClassifyOutput` struct uses.
//
// §4a/§4c made the per-pixel + per-sample maps data-driven (`bucket_lut`)
// and the fan-out + append index-driven, so the old per-bucket
// `SHADER_ID_<NAME>` / `BUCKET_BIT_<NAME>` constants are no longer
// referenced — the classify shader text now depends only on counts/widths
// (bucket_count, n_words, edge_slot_bits), never on bucket identities.

{% include "shared_wgsl/math.wgsl" %}

{% if emit_edge_data && multisampled_geometry %}
// Linearize a raw depth-buffer value into view-space Z. Ported from
// main's helpers/msaa.wgsl::viewSpaceDepth — same math, same name
// adjusted to snake_case. Required so a relative depth threshold
// (EDGE_DEPTH_THRESHOLD = 0.02 = 2%) works consistently across the
// scene; raw NDC depth is non-linear so the same threshold near vs
// far would produce wildly different real-world distances.
fn view_space_depth(camera: Camera, depth: f32, pixel_coords: vec2<f32>, screen_dims: vec2<f32>) -> f32 {
    let ndc_xy = vec2<f32>(
        (pixel_coords.x / screen_dims.x) * 2.0 - 1.0,
        1.0 - (pixel_coords.y / screen_dims.y) * 2.0,
    );
    let clip_pos = vec4<f32>(ndc_xy, depth, 1.0);
    let view_pos_h = camera.inv_proj * clip_pos;
    let view_pos = view_pos_h.xyz / view_pos_h.w;
    return view_pos.z;
}

// Full-u32 edge-sample sentinels (§5), width-INDEPENDENT: a real bucket
// index (0..65533) can never equal these, so `sid`/`seen`/append logic needs
// no per-width branching. The slot_map PACK truncates them to the
// width-correct packed sentinel (`& 0xFF` → 0xFE/0xFF for 8-bit; `& 0xFFFF`
// → 0xFFFE/0xFFFF for 16-bit), so the 8-bit packed output is byte-identical
// to before.
const SID_SKYBOX: u32 = 0xFFFFFFFEu;
const SID_EMPTY: u32 = 0xFFFFFFFFu;

// (Unified-edge U2b-3) The per-bucket + skybox edge-SAMPLE-LIST machinery
// (`append_edge_sample`) was removed: those lists fed only the legacy
// cs_edge / skybox_edge_resolve pipelines, which are gone. The unified
// `cs_shade` kernel drives edge shading from the per-pixel edge-id texture +
// the packed slot map instead, so classify no longer appends sample-list
// entries (and `data_buffer` no longer allocates the lists). SID_SKYBOX /
// SID_EMPTY below are still used by the slot_map pack.
{% endif %}

// Workgroup-shared bucket mask. One bit per registered bucket; the SKYBOX
// bucket is at index 0 (id 0 sorts first) → word 0, bit 0. Each bucket lives
// in word `index / 32` at bit `index % 32`; `tile_mask` is
// `array<atomic<u32>, n_words>` (n_words = ceil(live_count / 32)) so the
// bucket budget grows past 32 as the live count crosses each word boundary.
var<workgroup> tile_mask: array<atomic<u32>, {{ n_words }}u>;

@compute @workgroup_size(8, 8, 1)
fn cs_main(
    @builtin(workgroup_id) wg: vec3<u32>,
    @builtin(local_invocation_id) lid: vec3<u32>,
    @builtin(local_invocation_index) lii: u32,
) {
    // Zero the workgroup mask once per dispatch. The barrier below
    // makes the zero visible to every thread before the per-pixel
    // OR's land.
    if lii == 0u {
        {% for w in words_iter %}
        atomicStore(&tile_mask[{{ w }}u], 0u);
        {% endfor %}
    }
    workgroupBarrier();

    let screen_dims = textureDimensions(visibility_data_tex);
    let coords = vec2<i32>(wg.xy * 8u + lid.xy);
    let in_bounds = coords.x < i32(screen_dims.x) && coords.y < i32(screen_dims.y);

    // Per-pixel map → O(1) LUT load (§4a). `bucket_lut[raw_sid]` replaces
    // the old O(buckets) `shader_id == SHADER_ID_*` if/else chain: one
    // dependent load, warp-coherent (neighbouring pixels share a material →
    // same slot). `bucket_index == 0xFFFFFFFF` (NOT_FOUND) reproduces the
    // old "no arm matched" fall-through — the pixel contributes no bucket
    // bit. The bucket index's word is `bucket_index / 32`, its bit
    // `bucket_index % 32`; at n_words == 1 the word is always 0.
    var bucket_index: u32 = 0xFFFFFFFFu;
    if in_bounds {
        let vis = textureLoad(visibility_data_tex, coords, 0);
        let tri = join32(vis.x, vis.y);
        if tri == U32_MAX {
            // Fully-uncovered ("sky") pixel → the dedicated SKYBOX bucket
            // (index 0, since id 0 sorts first), whose opaque pipeline is the
            // `skybox_primary` writer. Routed directly — sky pixels carry no
            // material payload to look up. See skybox_primary.wgsl.
            bucket_index = 0u;
        } else {
            let meta_offset = join32(vis.z, vis.w);
            let mesh_meta = material_mesh_metas[meta_offset / 256u];
            if mesh_meta.is_hud == 0u {
                // shader_id is stored as the first u32 of each
                // material payload; `material_offset` is in bytes.
                let shader_id = materials_data[mesh_meta.material_offset / 4u];
                bucket_index = bucket_lut[shader_id];
            }
            // HUD pixels are redrawn by the transparency pass — skip
            // them in classify so the opaque pipelines don't process
            // tiles that contain only HUD geometry.
        }
    }

    if bucket_index != 0xFFFFFFFFu {
        atomicOr(&tile_mask[bucket_index / 32u], 1u << (bucket_index % 32u));
    }
    {% if multisampled_geometry %}
    // Unified-edge ANY-sample tile_mask (U0). The sample-0 `bucket_index`
    // OR above runs UNCHANGED (so the `unified_edge=false` WGSL is byte
    // -identical); this block ADDS samples 1..3 so a bucket's tile list
    // covers tiles where it appears at ANY sample (atomicOr is idempotent,
    // so re-OR'ing sample 0 here is unnecessary — only 1..3 are added).
    // This makes an edge-only material's tiles reachable by its own
    // (future) unified dispatch. Inert in U0: the extra tiles only add a
    // few check-and-skip lanes to the existing per-pixel shader_id guard in
    // cs_opaque → same output. Per-sample bucket id derived the same way the
    // sample-0 path does (sky → bucket 0; HUD → skip; else bucket_lut).
    // Unrolled (no dynamic indexing into per-sample locals).
    if in_bounds {
        let uv1 = textureLoad(visibility_data_tex, coords, 1);
        let uv2 = textureLoad(visibility_data_tex, coords, 2);
        let uv3 = textureLoad(visibility_data_tex, coords, 3);
        let utri1 = join32(uv1.x, uv1.y);
        let utri2 = join32(uv2.x, uv2.y);
        let utri3 = join32(uv3.x, uv3.y);
        var ubucket1: u32 = 0xFFFFFFFFu;
        if utri1 == U32_MAX {
            ubucket1 = 0u;
        } else {
            let mm = material_mesh_metas[join32(uv1.z, uv1.w) / 256u];
            if mm.is_hud == 0u { ubucket1 = bucket_lut[materials_data[mm.material_offset / 4u]]; }
        }
        var ubucket2: u32 = 0xFFFFFFFFu;
        if utri2 == U32_MAX {
            ubucket2 = 0u;
        } else {
            let mm = material_mesh_metas[join32(uv2.z, uv2.w) / 256u];
            if mm.is_hud == 0u { ubucket2 = bucket_lut[materials_data[mm.material_offset / 4u]]; }
        }
        var ubucket3: u32 = 0xFFFFFFFFu;
        if utri3 == U32_MAX {
            ubucket3 = 0u;
        } else {
            let mm = material_mesh_metas[join32(uv3.z, uv3.w) / 256u];
            if mm.is_hud == 0u { ubucket3 = bucket_lut[materials_data[mm.material_offset / 4u]]; }
        }
        if ubucket1 != 0xFFFFFFFFu {
            atomicOr(&tile_mask[ubucket1 / 32u], 1u << (ubucket1 % 32u));
        }
        if ubucket2 != 0xFFFFFFFFu {
            atomicOr(&tile_mask[ubucket2 / 32u], 1u << (ubucket2 % 32u));
        }
        if ubucket3 != 0xFFFFFFFFu {
            atomicOr(&tile_mask[ubucket3 / 32u], 1u << (ubucket3 % 32u));
        }
    }
    {% endif %}
    workgroupBarrier();

    if lii == 0u {
        let tile = vec2<u32>(wg.xy);

        // Data-driven fan-out (§4b): iterate only the bucket bits
        // actually set in this tile's mask — `O(active buckets in tile)`,
        // never `O(total buckets)`. A typical tile touches 1–3 materials,
        // so this is ~1–3 iterations whether the registry holds 16 or
        // 1024 buckets. `firstTrailingBit` walks the set bits of each
        // mask word; `bucket_index = w*32 + bit` indexes the `args` /
        // `offsets` arrays directly. The atomicAdd returns the previous
        // count, which doubles as the next free slot into the bucket's
        // `tiles` sub-range at `offsets[bucket_index] + idx`.
        for (var w = 0u; w < {{ n_words }}u; w = w + 1u) {
            var bits = atomicLoad(&tile_mask[w]);
            while (bits != 0u) {
                let b = firstTrailingBit(bits);
                bits = bits & (bits - 1u);
                let bucket_index = w * 32u + b;
                let idx = atomicAdd(&classify_output.args[bucket_index].workgroup_count_x, 1u);
                let base = classify_output.offsets[bucket_index];
                let slot = base + idx;
                if slot < base + classify_output.bucket_capacity {
                    classify_output.tiles[slot] = tile;
                }
            }
        }
    }

    {% if emit_edge_data && multisampled_geometry %}
    // ─────────────────────────────────────────────────────────────
    // MSAA edge emission (Priority 3).
    //
    // For each pixel: scan all 4 samples; collect the distinct
    // shader_ids; if there are at least 2 distinct shader_ids
    // (counting "skybox" as its own shader_id), the pixel is an edge
    // pixel. Allocate a compact edge_pixel_id via the global atomic,
    // write its (x, y) into edge_to_xy, build the slot_map (4 bytes
    // packed: up to 4 distinct shader_ids), and append a per-shader
    // (edge_pixel_id, sample_mask) entry to each contributing bucket's
    // sample list.
    //
    // Saturation: if edge_count saturates at MAX_EDGE_BUDGET, the
    // `edge_buffers.edge_overflow_count` atomic increments (see else
    // branch below) and we fall through without writing. The dropped
    // pixels render with the primary-sample shading written by the
    // material's primary pass — visually a missing-MSAA-resolve, not a
    // black hole. Stage 3.8 MVP (see edge_buffers.rs::note_edge_overflow_observed)
    // surfaces this via a one-shot tracing::warn when overflow is
    // observed CPU-side; the full atomic-add fallback (a hash-bucketed
    // overflow accumulator region) is parked as Block C.2 future work.
    if (in_bounds) {
        // Unified-edge (U0): initialize this pixel's edge-id to the
        // U32_MAX sentinel ("not an edge pixel"). Edge pixels overwrite
        // it with their compact edge_pixel_id below. Every in-bounds pixel
        // is written exactly once here, so non-edge pixels reliably read
        // the sentinel without a separate per-frame clear. Inert in U0
        // (edge_id_tex is unread).
        textureStore(edge_id_tex, coords, vec4<u32>(0xFFFFFFFFu, 0u, 0u, 0u));
        // Scan 4 samples — FULLY-STATIC version, no dynamic indexing
        // anywhere. Naga/Tint silently no-op'd dynamic-index writes
        // into both `vec4<u32>` and (turns out per the user's repro)
        // also `array<u32, 4>` in some configurations. Working around
        // entirely by unrolling and using individual `let`s.

        let v0 = textureLoad(visibility_data_tex, coords, 0);
        let v1 = textureLoad(visibility_data_tex, coords, 1);
        let v2 = textureLoad(visibility_data_tex, coords, 2);
        let v3 = textureLoad(visibility_data_tex, coords, 3);

        let tri_0 = join32(v0.x, v0.y);
        let tri_1 = join32(v1.x, v1.y);
        let tri_2 = join32(v2.x, v2.y);
        let tri_3 = join32(v3.x, v3.y);

        let mat_off_0 = join32(v0.z, v0.w);
        let mat_off_1 = join32(v1.z, v1.w);
        let mat_off_2 = join32(v2.z, v2.w);
        let mat_off_3 = join32(v3.z, v3.w);

        // Helper bucket-id derivation, inlined per sample.
        // Per-sample map → O(1) LUT load (§4a), one per sample. `sid_s`
        // is SID_SKYBOX (skybox/uncovered/HUD), a real bucket index
        // [0, bucket_count), or SID_EMPTY (unmapped NOT_FOUND fall-through —
        // the data-driven form of the old "no arm matched"). Width-independent
        // full-u32 sentinels (§5) so the same code serves 8- and 16-bit slot
        // widths; they truncate at the slot_map pack. Each `sid_s` stays a
        // static local — no dynamic indexing into per-sample locals (Naga/Tint
        // scar, :163); only the `bucket_lut` storage load is dynamically
        // indexed, which is safe.
        var sid_0: u32 = SID_EMPTY;
        if (tri_0 == U32_MAX) {
            sid_0 = SID_SKYBOX;
        } else {
            let mm = material_mesh_metas[mat_off_0 / 256u];
            if (mm.is_hud == 1u) { sid_0 = SID_SKYBOX; }
            else {
                let bi = bucket_lut[materials_data[mm.material_offset / 4u]];
                if (bi != 0xFFFFFFFFu) { sid_0 = bi; }
            }
        }
        var sid_1: u32 = SID_EMPTY;
        if (tri_1 == U32_MAX) {
            sid_1 = SID_SKYBOX;
        } else {
            let mm = material_mesh_metas[mat_off_1 / 256u];
            if (mm.is_hud == 1u) { sid_1 = SID_SKYBOX; }
            else {
                let bi = bucket_lut[materials_data[mm.material_offset / 4u]];
                if (bi != 0xFFFFFFFFu) { sid_1 = bi; }
            }
        }
        var sid_2: u32 = SID_EMPTY;
        if (tri_2 == U32_MAX) {
            sid_2 = SID_SKYBOX;
        } else {
            let mm = material_mesh_metas[mat_off_2 / 256u];
            if (mm.is_hud == 1u) { sid_2 = SID_SKYBOX; }
            else {
                let bi = bucket_lut[materials_data[mm.material_offset / 4u]];
                if (bi != 0xFFFFFFFFu) { sid_2 = bi; }
            }
        }
        var sid_3: u32 = SID_EMPTY;
        if (tri_3 == U32_MAX) {
            sid_3 = SID_SKYBOX;
        } else {
            let mm = material_mesh_metas[mat_off_3 / 256u];
            if (mm.is_hud == 1u) { sid_3 = SID_SKYBOX; }
            else {
                let bi = bucket_lut[materials_data[mm.material_offset / 4u]];
                if (bi != 0xFFFFFFFFu) { sid_3 = bi; }
            }
        }

        // Build the distinct-shader-id list (`seen[0..seen_count)`) by
        // explicit static comparisons. Static `seen_*` vars avoid the
        // dynamic-write-into-array problem. Unused slots = SID_EMPTY.
        var seen_0: u32 = sid_0;
        var seen_1: u32 = SID_EMPTY;
        var seen_2: u32 = SID_EMPTY;
        var seen_3: u32 = SID_EMPTY;
        var seen_count: u32 = 1u;

        if (sid_1 != seen_0) {
            seen_1 = sid_1;
            seen_count = 2u;
        }
        let sid_2_new = (sid_2 != seen_0) && (sid_2 != seen_1);
        if (sid_2_new) {
            if (seen_count == 1u) { seen_1 = sid_2; }
            else if (seen_count == 2u) { seen_2 = sid_2; }
            seen_count = seen_count + 1u;
        }
        let sid_3_new = (sid_3 != seen_0) && (sid_3 != seen_1) && (sid_3 != seen_2);
        if (sid_3_new) {
            if (seen_count == 1u) { seen_1 = sid_3; }
            else if (seen_count == 2u) { seen_2 = sid_3; }
            else if (seen_count == 3u) { seen_3 = sid_3; }
            seen_count = seen_count + 1u;
        }

        // Edge pixel: 2+ distinct shader_ids (counts skybox as one)
        // OR samples cover different meshes (different mat_off).
        // We deliberately do NOT include "different tri_id within the
        // same mesh" — that fires at intra-mesh triangle seams of
        // tessellated curved surfaces, and the resulting per-sample
        // re-shading produces wireframe-like artifacts (samples on
        // adjacent triangles can have wildly different bary derivs /
        // depth, so the average isn't a smooth blend).
        //
        // SILHOUETTE detection: a pixel is a silhouette edge iff its
        // 4 samples are MIXED between "covered by geometry" and "not".
        // Diagnosis showed that tri_id (v.x / v.y channels) IS distinct
        // per sample at silhouettes (and intra-mesh tri seams), while
        // mat_meta_offset (v.z / v.w) gets broadcast across samples by
        // the fragment shader output path on this Tint compile. Using
        // mat_meta diff alone never caught silhouettes; using tri_id
        // diff caught silhouettes AND intra-mesh tri seams — but the
        // latter produced wireframe artifacts when edge_resolve
        // shaded per-sample. The clean discriminator: was sample N
        // *covered* by any triangle? `tri_id == U32_MAX` means
        // uncovered (clear value). Mixed coverage across the 4
        // samples = silhouette against skybox/uncovered region. All
        // covered with differing tri_ids = intra-mesh seam (NOT a
        // silhouette; skip).
        let cov_0 = tri_0 != U32_MAX;
        let cov_1 = tri_1 != U32_MAX;
        let cov_2 = tri_2 != U32_MAX;
        let cov_3 = tri_3 != U32_MAX;
        // Mesh-vs-skybox silhouette (coverage mismatch) OR mesh-vs-mesh
        // silhouette in the same pixel (different mat_meta_offset
        // across samples). Earlier hypothesis that mat_meta gets
        // broadcast by Tint may have been wrong — empirical retest with
        // the depth-tex / view-space pipeline showed the per-sample data
        // IS distinct after all when MULTIPLE fragments cover the same
        // pixel (e.g. capsule fragment over samples 0,1 + platform
        // fragment over samples 2,3 each write their own mat_meta).
        let any_cov_differs = (cov_0 != cov_1)
            || (cov_0 != cov_2)
            || (cov_0 != cov_3);
        let any_mat_differs = (mat_off_0 != mat_off_1)
            || (mat_off_0 != mat_off_2)
            || (mat_off_0 != mat_off_3);
        let any_mesh_differs = any_cov_differs || any_mat_differs;

        // PORT OF MAIN'S MSAA EDGE DETECTION (edge_mask_depth_msaa +
        // edge_mask_neighbors).
        //
        // Two checks combined:
        // (1) Per-sample VIEW-SPACE depth variance in this pixel —
        //     catches in-pixel mesh-vs-mesh silhouettes where ≥2
        //     samples cover different surfaces. Uses view-space depth
        //     (linearized via camera.inv_proj) so the same relative
        //     threshold works near AND far from the camera. Raw
        //     depth-buffer comparison fired too aggressively at far
        //     depths and not at all near the camera.
        // (2) 4-neighbor view-space depth/coverage check — catches
        //     silhouettes where the silhouette runs *between* pixels
        //     (current pixel fully covered, neighbor fully uncovered
        //     or at very different depth). Matches main's
        //     edge_mask_neighbors which returns true for uncovered
        //     neighbors AND for covered neighbors with depth delta >
        //     EDGE_DEPTH_THRESHOLD.
        //
        // Thresholds taken verbatim from main:
        //   EDGE_DEPTH_THRESHOLD = 0.02  (2% relative view-space depth)
        //   EDGE_MSAA_DEPTH_THRESHOLD = 0.02  (same scale, same threshold)
        let camera = camera_from_raw(camera_raw);
        let screen_dims_f32 = vec2<f32>(f32(screen_dims.x), f32(screen_dims.y));
        let pixel_center = vec2<f32>(f32(coords.x) + 0.5, f32(coords.y) + 0.5);

        // ── Check 1: in-pixel depth variance (view-space) ──────────
        var sample_count: u32 = 0u;
        var vdmin: f32 = 1.0e9;
        var vdmax: f32 = -1.0e9;
        if (cov_0) {
            let d = textureLoad(depth_tex, coords, 0);
            let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
            vdmin = min(vdmin, vd);
            vdmax = max(vdmax, vd);
            sample_count = sample_count + 1u;
        }
        if (cov_1) {
            let d = textureLoad(depth_tex, coords, 1);
            let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
            vdmin = min(vdmin, vd);
            vdmax = max(vdmax, vd);
            sample_count = sample_count + 1u;
        }
        if (cov_2) {
            let d = textureLoad(depth_tex, coords, 2);
            let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
            vdmin = min(vdmin, vd);
            vdmax = max(vdmax, vd);
            sample_count = sample_count + 1u;
        }
        if (cov_3) {
            let d = textureLoad(depth_tex, coords, 3);
            let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
            vdmin = min(vdmin, vd);
            vdmax = max(vdmax, vd);
            sample_count = sample_count + 1u;
        }
        var depth_edge: bool = false;
        if (sample_count >= 2u) {
            let depth_range = abs(vdmax - vdmin);
            let avg_depth = abs((vdmax + vdmin) * 0.5);
            depth_edge = depth_range > (0.02 * avg_depth);
        }

        // ── Check 2: 4-neighbor coverage + normal + depth check ─────
        //
        // Order matches main's `edge_mask_neighbors`:
        //   1. neighbor uncovered (tri_id == U32_MAX) → edge.
        //   2. normal discontinuity (dot < 0.95 ≈ 18°) → edge.
        //   3. depth discontinuity (relative 2% in view-space) → edge.
        //
        // The normal check is critical at tile-facet boundaries on the
        // platform top, where adjacent same-mesh tiles share depth +
        // mat_meta but rotate their surface normal by a small angle.
        // Depth-only neighbour detection misses these and leaves a
        // diagonal stripe of aliasing along the platform's top-front
        // edge.
        let EDGE_NORMAL_THRESHOLD: f32 = 0.95;
        var neighbor_edge: bool = false;
        if (cov_0) {
            let center_d = textureLoad(depth_tex, coords, 0);
            let center_vd = view_space_depth(camera, center_d, pixel_center, screen_dims_f32);
            let depth_threshold = 0.02 * abs(center_vd);
            // Center normal: sample-0 of the multisampled normal_tangent
            // texture, unpacked via the shared TBN helper. Same convention
            // the primary opaque + edge_resolve paths use.
            let center_nt = textureLoad(normal_tangent_tex, coords, 0);
            let center_tbn = unpack_normal_tangent(center_nt);
            let center_normal = center_tbn.N;
            let neighbor_offsets = array<vec2<i32>, 4>(
                vec2<i32>(1, 0),
                vec2<i32>(-1, 0),
                vec2<i32>(0, 1),
                vec2<i32>(0, -1),
            );
            let pixel_offsets = array<vec2<f32>, 4>(
                vec2<f32>(1.0, 0.0),
                vec2<f32>(-1.0, 0.0),
                vec2<f32>(0.0, 1.0),
                vec2<f32>(0.0, -1.0),
            );
            for (var ni = 0; ni < 4; ni++) {
                let n_coords = coords + neighbor_offsets[ni];
                if (n_coords.x < 0 || n_coords.y < 0
                    || n_coords.x >= i32(screen_dims.x)
                    || n_coords.y >= i32(screen_dims.y)) {
                    continue;
                }
                let nv = textureLoad(visibility_data_tex, n_coords, 0);
                let n_tri = join32(nv.x, nv.y);
                if (n_tri == U32_MAX) {
                    neighbor_edge = true;
                    break;
                }
                // Normal discontinuity (sample 0 of neighbour pixel).
                let n_nt = textureLoad(normal_tangent_tex, n_coords, 0);
                let n_tbn = unpack_normal_tangent(n_nt);
                if (dot(center_normal, n_tbn.N) < EDGE_NORMAL_THRESHOLD) {
                    neighbor_edge = true;
                    break;
                }
                // Depth discontinuity (view-space relative).
                let n_depth = textureLoad(depth_tex, n_coords, 0);
                let n_pixel_center = pixel_center + pixel_offsets[ni];
                let n_vd = view_space_depth(camera, n_depth, n_pixel_center, screen_dims_f32);
                if (abs(center_vd - n_vd) > depth_threshold) {
                    neighbor_edge = true;
                    break;
                }
            }
        }

        // Edge gate. Four signals, all needed for parity with main's
        // `msaa_resolve_samples` edge detection:
        //   * `seen_count >= 2u`        — multi-shader-id silhouette
        //   * `any_mesh_differs`        — mesh-vs-mesh (different mat_meta)
        //   * `depth_edge`              — per-sample depth variance
        //                                 (same-mesh in-pixel, e.g. the
        //                                 platform's top/front-face seam)
        //   * `neighbor_edge`           — 4-neighbour depth/coverage check
        //                                 (silhouette runs between pixels)
        //
        // depth_edge + neighbor_edge were previously disabled because
        // they produced "black sides on capsules" — that was an artefact
        // of the texture-pool-arrays-len template mismatch in
        // `texture_pool_sample_grad` (every same-mesh edge fell into the
        // `default → vec4(0)` branch in edge_resolve's compiled shader).
        // With the recompile wired into `finalize_gpu_textures`
        // (textures.rs), per-sample shading at same-mesh edges produces
        // the right value and the depth signals can be re-enabled.
        if (seen_count >= 2u || any_mesh_differs || depth_edge || neighbor_edge) {
            // Allocate compact edge_pixel_id. The atomic counter lives
            // in args_buffer / `edge_buffers` (drives indirect dispatch
            // for final_blend); we also mirror it into edge_data's
            // header so the resolve shaders can read it without binding
            // args_buffer (saves a storage-buffer slot vs the 10-cap).
            let edge_id = atomicAdd(&edge_buffers.edge_count, 1u);
            atomicAdd(&edge_data[edge_layout.edge_count_index], 1u);
            if (edge_id < edge_layout.max_edge_budget) {
                // Write edge_to_xy via atomicStore (edge_data is
                // declared as array<atomic<u32>> so even plain stores
                // go through the atomic interface).
                let packed_xy = (u32(coords.x) & 0xFFFFu) | ((u32(coords.y) & 0xFFFFu) << 16u);
                atomicStore(&edge_data[edge_layout.edge_to_xy_base + edge_id], packed_xy);

                // Unified-edge (U0): mirror the compact edge_pixel_id into
                // the per-pixel edge-id texture. Overwrites the U32_MAX
                // sentinel written above for this pixel. WRITTEN-only in U0
                // (the future unified kernel reads it to branch
                // interior-vs-edge + index the per-sample accumulator). The
                // existing edge-sample-list machinery (append_edge_sample
                // below) stays INTACT alongside it.
                textureStore(edge_id_tex, coords, vec4<u32>(edge_id, 0u, 0u, 0u));

                // Pack the slot_map (§5): the 4 per-sample bucket ids, each
                // truncated to the slot width. 8-bit (≤254 buckets): one u32
                // (4×8) — byte-identical to before, sentinels 0xFE/0xFF.
                // 16-bit (>254): two u32 (4×16), sentinels 0xFFFE/0xFFFF.
                {% if edge_slot_bits == 16 %}
                let slot_base = edge_layout.edge_slot_map_base + edge_id * 2u;
                atomicStore(&edge_data[slot_base],
                    (seen_0 & 0xFFFFu) | ((seen_1 & 0xFFFFu) << 16u));
                atomicStore(&edge_data[slot_base + 1u],
                    (seen_2 & 0xFFFFu) | ((seen_3 & 0xFFFFu) << 16u));
                {% else %}
                let slot_map = (seen_0 & 0xFFu)
                    | ((seen_1 & 0xFFu) << 8u)
                    | ((seen_2 & 0xFFu) << 16u)
                    | ((seen_3 & 0xFFu) << 24u);
                atomicStore(&edge_data[edge_layout.edge_slot_map_base + edge_id], slot_map);
                {% endif %}

                // Clear this edge pixel's 4 accumulator slots (4 vec4<f32>
                // = 16 u32 words) so a bucket whose per-shader edge_resolve
                // pipeline isn't resident this frame leaves count==0 — which
                // final_blend skips — instead of reading a stale value left
                // at the same slot index by a previous frame's edge pixel.
                // This is what makes the resolve per-bucket-independent
                // (render_pass.rs dropped the all-or-nothing gate). Bounded
                // by the live edge count: only freshly-allocated edge pixels
                // are cleared, never the full MAX_EDGE_BUDGET accumulator.
                let accum_clear_base = edge_layout.accumulator_base + edge_id * 16u;
                for (var ci: u32 = 0u; ci < 16u; ci = ci + 1u) {
                    atomicStore(&edge_data[accum_clear_base + ci], 0u);
                }

                // (Unified-edge U2b-3) Sample-list append removed — see the
                // note where `append_edge_sample` used to be defined. cs_shade
                // reads the packed slot map (above) + the per-pixel edge-id
                // texture to drive per-sample edge shading, so no per-bucket
                // lists are built.
                // Final blend args: one workgroup per edge pixel
                // (workgroup_size = 64, so divide by 64).
                if ((edge_id & 63u) == 0u) {
                    atomicAdd(&edge_buffers.final_blend_args.workgroup_count_x, 1u);
                }
            } else {
                atomicAdd(&edge_buffers.edge_overflow_count, 1u);
            }
        }
    }
    {% endif %}
}