awsm-renderer 0.3.3

awsm-renderer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
// Material classify compute pass.
//
// Per 8×8 tile, scan the visibility buffer and discover which opaque
// `shader_id`s its pixels belong to. Aggregate via a workgroup-shared
// bitmask (one bit per registered bucket, including dynamic
// materials), then thread 0 atomically appends the tile's coords to
// each bucket bit is set in. The total atomic traffic is ~1 per
// workgroup-bit, regardless of the 64 threads inside.
//
// Skybox pixels (`triangle_index == U32_MAX`) are routed to the dedicated
// SKYBOX bucket (index 0). Its opaque pipeline is the `skybox_primary.wgsl`
// writer, dispatched over that bucket's tiles; the material kernels
// (compute.wgsl) shade real geometry only. See
// material_opaque/.../skybox_primary.wgsl.
//
// The bit constants + the shader_id → bit dispatch chain + the
// per-bucket extract block are all walked from the same
// `bucket_entries` list the templated `ClassifyOutput` struct uses.
//
// §4a/§4c made the per-pixel + per-sample maps data-driven (`bucket_lut`)
// and the fan-out + append index-driven, so the old per-bucket
// `SHADER_ID_<NAME>` / `BUCKET_BIT_<NAME>` constants are no longer
// referenced — the classify shader text now depends only on counts/widths
// (bucket_count, n_words, edge_slot_bits), never on bucket identities.

{% include "shared_wgsl/math.wgsl" %}

{% if emit_edge_data && multisampled_geometry %}
// Linearize a raw depth-buffer value into view-space Z. Ported from
// main's helpers/msaa.wgsl::viewSpaceDepth — same math, same name
// adjusted to snake_case. Required so a relative depth threshold
// (EDGE_DEPTH_THRESHOLD = 0.02 = 2%) works consistently across the
// scene; raw NDC depth is non-linear so the same threshold near vs
// far would produce wildly different real-world distances.
fn view_space_depth(camera: Camera, depth: f32, pixel_coords: vec2<f32>, screen_dims: vec2<f32>) -> f32 {
    let ndc_xy = vec2<f32>(
        (pixel_coords.x / screen_dims.x) * 2.0 - 1.0,
        1.0 - (pixel_coords.y / screen_dims.y) * 2.0,
    );
    let clip_pos = vec4<f32>(ndc_xy, depth, 1.0);
    let view_pos_h = camera.inv_proj * clip_pos;
    let view_pos = view_pos_h.xyz / view_pos_h.w;
    return view_pos.z;
}

// Full-u32 edge-sample sentinels (§5), width-INDEPENDENT: a real bucket
// index (0..65533) can never equal these, so `sid`/`seen`/append logic needs
// no per-width branching. The slot_map PACK truncates them to the
// width-correct packed sentinel (`& 0xFF` → 0xFE/0xFF for 8-bit; `& 0xFFFF`
// → 0xFFFE/0xFFFF for 16-bit), so the 8-bit packed output is byte-identical
// to before.
const SID_SKYBOX: u32 = 0xFFFFFFFEu;
const SID_EMPTY: u32 = 0xFFFFFFFFu;

// (Unified-edge U2b-3) The per-bucket + skybox edge-SAMPLE-LIST machinery
// (`append_edge_sample`) was removed: those lists fed only the legacy
// cs_edge / skybox_edge_resolve pipelines, which are gone. The unified
// `cs_shade` kernel drives edge shading from the per-pixel edge-id texture +
// the packed slot map instead, so classify no longer appends sample-list
// entries (and `data_buffer` no longer allocates the lists). SID_SKYBOX /
// SID_EMPTY below are still used by the slot_map pack.
{% endif %}

// Workgroup-shared bucket mask. One bit per registered bucket; the SKYBOX
// bucket is at index 0 (id 0 sorts first) → word 0, bit 0. Each bucket lives
// in word `index / 32` at bit `index % 32`; `tile_mask` is
// `array<atomic<u32>, n_words>` (n_words = ceil(live_count / 32)) so the
// bucket budget grows past 32 as the live count crosses each word boundary.
var<workgroup> tile_mask: array<atomic<u32>, {{ n_words }}u>;

@compute @workgroup_size(8, 8, 1)
fn cs_main(
    @builtin(workgroup_id) wg: vec3<u32>,
    @builtin(local_invocation_id) lid: vec3<u32>,
    @builtin(local_invocation_index) lii: u32,
) {
    // Zero the workgroup mask once per dispatch. The barrier below
    // makes the zero visible to every thread before the per-pixel
    // OR's land.
    if lii == 0u {
        {% for w in words_iter %}
        atomicStore(&tile_mask[{{ w }}u], 0u);
        {% endfor %}
    }
    workgroupBarrier();

    let screen_dims = textureDimensions(visibility_data_tex);
    let coords = vec2<i32>(wg.xy * 8u + lid.xy);
    let in_bounds = coords.x < i32(screen_dims.x) && coords.y < i32(screen_dims.y);

    // Per-pixel map → O(1) LUT load (§4a). `bucket_lut[raw_sid]` replaces
    // the old O(buckets) `shader_id == SHADER_ID_*` if/else chain: one
    // dependent load, warp-coherent (neighbouring pixels share a material →
    // same slot). `bucket_index == 0xFFFFFFFF` (NOT_FOUND) reproduces the
    // old "no arm matched" fall-through — the pixel contributes no bucket
    // bit. The bucket index's word is `bucket_index / 32`, its bit
    // `bucket_index % 32`; at n_words == 1 the word is always 0.
    var bucket_index: u32 = 0xFFFFFFFFu;
    if in_bounds {
        let vis = textureLoad(visibility_data_tex, coords, 0);
        let tri = join32(vis.x, vis.y);
        if tri == U32_MAX {
            // Fully-uncovered ("sky") pixel → the dedicated SKYBOX bucket
            // (index 0, since id 0 sorts first), whose opaque pipeline is the
            // `skybox_primary` writer. Routed directly — sky pixels carry no
            // material payload to look up. See skybox_primary.wgsl.
            bucket_index = 0u;
        } else {
            let meta_offset = join32(vis.z, vis.w);
            let mesh_meta = material_mesh_metas[meta_offset / 256u];
            if mesh_meta.is_hud == 0u {
                // shader_id is stored as the first u32 of each
                // material payload; `material_offset` is in bytes.
                let shader_id = materials_data[mesh_meta.material_offset / 4u];
                bucket_index = bucket_lut[shader_id];
            }
            // HUD pixels are redrawn by the transparency pass — skip
            // them in classify so the opaque pipelines don't process
            // tiles that contain only HUD geometry.
        }
    }

    if bucket_index != 0xFFFFFFFFu {
        atomicOr(&tile_mask[bucket_index / 32u], 1u << (bucket_index % 32u));
    }
    {% if multisampled_geometry %}
    // Unified-edge ANY-sample tile_mask (U0). The sample-0 `bucket_index`
    // OR above runs UNCHANGED (so the `unified_edge=false` WGSL is byte
    // -identical); this block ADDS samples 1..3 so a bucket's tile list
    // covers tiles where it appears at ANY sample (atomicOr is idempotent,
    // so re-OR'ing sample 0 here is unnecessary — only 1..3 are added).
    // This makes an edge-only material's tiles reachable by its own
    // (future) unified dispatch. Inert in U0: the extra tiles only add a
    // few check-and-skip lanes to the existing per-pixel shader_id guard in
    // cs_opaque → same output. Per-sample bucket id derived the same way the
    // sample-0 path does (sky → bucket 0; HUD → skip; else bucket_lut).
    // Unrolled (no dynamic indexing into per-sample locals).
    if in_bounds {
        let uv1 = textureLoad(visibility_data_tex, coords, 1);
        let uv2 = textureLoad(visibility_data_tex, coords, 2);
        let uv3 = textureLoad(visibility_data_tex, coords, 3);
        let utri1 = join32(uv1.x, uv1.y);
        let utri2 = join32(uv2.x, uv2.y);
        let utri3 = join32(uv3.x, uv3.y);
        var ubucket1: u32 = 0xFFFFFFFFu;
        if utri1 == U32_MAX {
            ubucket1 = 0u;
        } else {
            let mm = material_mesh_metas[join32(uv1.z, uv1.w) / 256u];
            if mm.is_hud == 0u { ubucket1 = bucket_lut[materials_data[mm.material_offset / 4u]]; }
        }
        var ubucket2: u32 = 0xFFFFFFFFu;
        if utri2 == U32_MAX {
            ubucket2 = 0u;
        } else {
            let mm = material_mesh_metas[join32(uv2.z, uv2.w) / 256u];
            if mm.is_hud == 0u { ubucket2 = bucket_lut[materials_data[mm.material_offset / 4u]]; }
        }
        var ubucket3: u32 = 0xFFFFFFFFu;
        if utri3 == U32_MAX {
            ubucket3 = 0u;
        } else {
            let mm = material_mesh_metas[join32(uv3.z, uv3.w) / 256u];
            if mm.is_hud == 0u { ubucket3 = bucket_lut[materials_data[mm.material_offset / 4u]]; }
        }
        if ubucket1 != 0xFFFFFFFFu {
            atomicOr(&tile_mask[ubucket1 / 32u], 1u << (ubucket1 % 32u));
        }
        if ubucket2 != 0xFFFFFFFFu {
            atomicOr(&tile_mask[ubucket2 / 32u], 1u << (ubucket2 % 32u));
        }
        if ubucket3 != 0xFFFFFFFFu {
            atomicOr(&tile_mask[ubucket3 / 32u], 1u << (ubucket3 % 32u));
        }
    }
    {% endif %}
    workgroupBarrier();

    if lii == 0u {
        let tile = vec2<u32>(wg.xy);

        // Data-driven fan-out (§4b): iterate only the bucket bits
        // actually set in this tile's mask — `O(active buckets in tile)`,
        // never `O(total buckets)`. A typical tile touches 1–3 materials,
        // so this is ~1–3 iterations whether the registry holds 16 or
        // 1024 buckets. `firstTrailingBit` walks the set bits of each
        // mask word; `bucket_index = w*32 + bit` indexes the `args` /
        // `offsets` arrays directly. The atomicAdd returns the previous
        // count, which doubles as the next free slot into the bucket's
        // `tiles` sub-range at `offsets[bucket_index] + idx`.
        for (var w = 0u; w < {{ n_words }}u; w = w + 1u) {
            var bits = atomicLoad(&tile_mask[w]);
            while (bits != 0u) {
                let b = firstTrailingBit(bits);
                bits = bits & (bits - 1u);
                let bucket_index = w * 32u + b;
                let idx = atomicAdd(&classify_output.args[bucket_index].workgroup_count_x, 1u);
                let base = classify_output.offsets[bucket_index];
                let slot = base + idx;
                if slot < base + classify_output.bucket_capacity {
                    classify_output.tiles[slot] = tile;
                }
            }
        }
    }

    {% if emit_edge_data && multisampled_geometry %}
    // ─────────────────────────────────────────────────────────────
    // MSAA edge emission (Priority 3).
    //
    // For each pixel: scan all 4 samples; collect the distinct
    // shader_ids; if there are at least 2 distinct shader_ids
    // (counting "skybox" as its own shader_id), the pixel is an edge
    // pixel. Allocate a compact edge_pixel_id via the global atomic,
    // write its (x, y) into edge_to_xy, build the slot_map (4 bytes
    // packed: up to 4 distinct shader_ids), and append a per-shader
    // (edge_pixel_id, sample_mask) entry to each contributing bucket's
    // sample list.
    //
    // Saturation: if edge_count saturates at MAX_EDGE_BUDGET, the
    // `edge_buffers.edge_overflow_count` atomic increments (see else
    // branch below) and we fall through without writing. The dropped
    // pixels render with the primary-sample shading written by the
    // material's primary pass — visually a missing-MSAA-resolve, not a
    // black hole. Stage 3.8 MVP (see edge_buffers.rs::note_edge_overflow_observed)
    // surfaces this via a one-shot tracing::warn when overflow is
    // observed CPU-side; the full atomic-add fallback (a hash-bucketed
    // overflow accumulator region) is parked as Block C.2 future work.
    if (in_bounds) {
        // Unified-edge (U0): initialize this pixel's edge-id to the
        // U32_MAX sentinel ("not an edge pixel"). Edge pixels overwrite
        // it with their compact edge_pixel_id below. Every in-bounds pixel
        // is written exactly once here, so non-edge pixels reliably read
        // the sentinel without a separate per-frame clear. Inert in U0
        // (edge_id_tex is unread).
        textureStore(edge_id_tex, coords, vec4<u32>(0xFFFFFFFFu, 0u, 0u, 0u));
        // Scan 4 samples — FULLY-STATIC version, no dynamic indexing
        // anywhere. Naga/Tint silently no-op'd dynamic-index writes
        // into both `vec4<u32>` and (turns out per the user's repro)
        // also `array<u32, 4>` in some configurations. Working around
        // entirely by unrolling and using individual `let`s.

        let v0 = textureLoad(visibility_data_tex, coords, 0);
        let v1 = textureLoad(visibility_data_tex, coords, 1);
        let v2 = textureLoad(visibility_data_tex, coords, 2);
        let v3 = textureLoad(visibility_data_tex, coords, 3);

        let tri_0 = join32(v0.x, v0.y);
        let tri_1 = join32(v1.x, v1.y);
        let tri_2 = join32(v2.x, v2.y);
        let tri_3 = join32(v3.x, v3.y);

        let mat_off_0 = join32(v0.z, v0.w);
        let mat_off_1 = join32(v1.z, v1.w);
        let mat_off_2 = join32(v2.z, v2.w);
        let mat_off_3 = join32(v3.z, v3.w);

        // Helper bucket-id derivation, inlined per sample.
        // Per-sample map → O(1) LUT load (§4a), one per sample. `sid_s`
        // is SID_SKYBOX (skybox/uncovered/HUD), a real bucket index
        // [0, bucket_count), or SID_EMPTY (unmapped NOT_FOUND fall-through —
        // the data-driven form of the old "no arm matched"). Width-independent
        // full-u32 sentinels (§5) so the same code serves 8- and 16-bit slot
        // widths; they truncate at the slot_map pack. Each `sid_s` stays a
        // static local — no dynamic indexing into per-sample locals (Naga/Tint
        // scar, :163); only the `bucket_lut` storage load is dynamically
        // indexed, which is safe.
        var sid_0: u32 = SID_EMPTY;
        if (tri_0 == U32_MAX) {
            sid_0 = SID_SKYBOX;
        } else {
            let mm = material_mesh_metas[mat_off_0 / 256u];
            if (mm.is_hud == 1u) { sid_0 = SID_SKYBOX; }
            else {
                let bi = bucket_lut[materials_data[mm.material_offset / 4u]];
                if (bi != 0xFFFFFFFFu) { sid_0 = bi; }
            }
        }
        var sid_1: u32 = SID_EMPTY;
        if (tri_1 == U32_MAX) {
            sid_1 = SID_SKYBOX;
        } else {
            let mm = material_mesh_metas[mat_off_1 / 256u];
            if (mm.is_hud == 1u) { sid_1 = SID_SKYBOX; }
            else {
                let bi = bucket_lut[materials_data[mm.material_offset / 4u]];
                if (bi != 0xFFFFFFFFu) { sid_1 = bi; }
            }
        }
        var sid_2: u32 = SID_EMPTY;
        if (tri_2 == U32_MAX) {
            sid_2 = SID_SKYBOX;
        } else {
            let mm = material_mesh_metas[mat_off_2 / 256u];
            if (mm.is_hud == 1u) { sid_2 = SID_SKYBOX; }
            else {
                let bi = bucket_lut[materials_data[mm.material_offset / 4u]];
                if (bi != 0xFFFFFFFFu) { sid_2 = bi; }
            }
        }
        var sid_3: u32 = SID_EMPTY;
        if (tri_3 == U32_MAX) {
            sid_3 = SID_SKYBOX;
        } else {
            let mm = material_mesh_metas[mat_off_3 / 256u];
            if (mm.is_hud == 1u) { sid_3 = SID_SKYBOX; }
            else {
                let bi = bucket_lut[materials_data[mm.material_offset / 4u]];
                if (bi != 0xFFFFFFFFu) { sid_3 = bi; }
            }
        }

        // Build the distinct-shader-id list (`seen[0..seen_count)`) by
        // explicit static comparisons. Static `seen_*` vars avoid the
        // dynamic-write-into-array problem. Unused slots = SID_EMPTY.
        var seen_0: u32 = sid_0;
        var seen_1: u32 = SID_EMPTY;
        var seen_2: u32 = SID_EMPTY;
        var seen_3: u32 = SID_EMPTY;
        var seen_count: u32 = 1u;

        if (sid_1 != seen_0) {
            seen_1 = sid_1;
            seen_count = 2u;
        }
        let sid_2_new = (sid_2 != seen_0) && (sid_2 != seen_1);
        if (sid_2_new) {
            if (seen_count == 1u) { seen_1 = sid_2; }
            else if (seen_count == 2u) { seen_2 = sid_2; }
            seen_count = seen_count + 1u;
        }
        let sid_3_new = (sid_3 != seen_0) && (sid_3 != seen_1) && (sid_3 != seen_2);
        if (sid_3_new) {
            if (seen_count == 1u) { seen_1 = sid_3; }
            else if (seen_count == 2u) { seen_2 = sid_3; }
            else if (seen_count == 3u) { seen_3 = sid_3; }
            seen_count = seen_count + 1u;
        }

        // Edge pixel: 2+ distinct shader_ids (counts skybox as one)
        // OR samples cover different meshes (different mat_off).
        // We deliberately do NOT include "different tri_id within the
        // same mesh" — that fires at intra-mesh triangle seams of
        // tessellated curved surfaces, and the resulting per-sample
        // re-shading produces wireframe-like artifacts (samples on
        // adjacent triangles can have wildly different bary derivs /
        // depth, so the average isn't a smooth blend).
        //
        // SILHOUETTE detection: a pixel is a silhouette edge iff its
        // 4 samples are MIXED between "covered by geometry" and "not".
        // Diagnosis showed that tri_id (v.x / v.y channels) IS distinct
        // per sample at silhouettes (and intra-mesh tri seams), while
        // mat_meta_offset (v.z / v.w) gets broadcast across samples by
        // the fragment shader output path on this Tint compile. Using
        // mat_meta diff alone never caught silhouettes; using tri_id
        // diff caught silhouettes AND intra-mesh tri seams — but the
        // latter produced wireframe artifacts when edge_resolve
        // shaded per-sample. The clean discriminator: was sample N
        // *covered* by any triangle? `tri_id == U32_MAX` means
        // uncovered (clear value). Mixed coverage across the 4
        // samples = silhouette against skybox/uncovered region. All
        // covered with differing tri_ids = intra-mesh seam (NOT a
        // silhouette; skip).
        let cov_0 = tri_0 != U32_MAX;
        let cov_1 = tri_1 != U32_MAX;
        let cov_2 = tri_2 != U32_MAX;
        let cov_3 = tri_3 != U32_MAX;
        // Mesh-vs-skybox silhouette (coverage mismatch) OR mesh-vs-mesh
        // silhouette in the same pixel (different mat_meta_offset
        // across samples). Earlier hypothesis that mat_meta gets
        // broadcast by Tint may have been wrong — empirical retest with
        // the depth-tex / view-space pipeline showed the per-sample data
        // IS distinct after all when MULTIPLE fragments cover the same
        // pixel (e.g. capsule fragment over samples 0,1 + platform
        // fragment over samples 2,3 each write their own mat_meta).
        let any_cov_differs = (cov_0 != cov_1)
            || (cov_0 != cov_2)
            || (cov_0 != cov_3);
        let any_mat_differs = (mat_off_0 != mat_off_1)
            || (mat_off_0 != mat_off_2)
            || (mat_off_0 != mat_off_3);
        let any_mesh_differs = any_cov_differs || any_mat_differs;

        // PORT OF MAIN'S MSAA EDGE DETECTION (edge_mask_depth_msaa +
        // edge_mask_neighbors).
        //
        // Two checks combined:
        // (1) Per-sample VIEW-SPACE depth variance in this pixel —
        //     catches in-pixel mesh-vs-mesh silhouettes where ≥2
        //     samples cover different surfaces. Uses view-space depth
        //     (linearized via camera.inv_proj) so the same relative
        //     threshold works near AND far from the camera. Raw
        //     depth-buffer comparison fired too aggressively at far
        //     depths and not at all near the camera.
        // (2) 4-neighbor view-space depth/coverage check — catches
        //     silhouettes where the silhouette runs *between* pixels
        //     (current pixel fully covered, neighbor fully uncovered
        //     or at very different depth). Matches main's
        //     edge_mask_neighbors which returns true for uncovered
        //     neighbors AND for covered neighbors with depth delta >
        //     EDGE_DEPTH_THRESHOLD.
        //
        // Thresholds taken verbatim from main:
        //   EDGE_DEPTH_THRESHOLD = 0.02  (2% relative view-space depth)
        //   EDGE_MSAA_DEPTH_THRESHOLD = 0.02  (same scale, same threshold)
        let camera = camera_from_raw(camera_raw);
        let screen_dims_f32 = vec2<f32>(f32(screen_dims.x), f32(screen_dims.y));
        let pixel_center = vec2<f32>(f32(coords.x) + 0.5, f32(coords.y) + 0.5);

        // ── Check 1: in-pixel depth variance (view-space) ──────────
        var sample_count: u32 = 0u;
        var vdmin: f32 = 1.0e9;
        var vdmax: f32 = -1.0e9;
        if (cov_0) {
            let d = textureLoad(depth_tex, coords, 0);
            let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
            vdmin = min(vdmin, vd);
            vdmax = max(vdmax, vd);
            sample_count = sample_count + 1u;
        }
        if (cov_1) {
            let d = textureLoad(depth_tex, coords, 1);
            let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
            vdmin = min(vdmin, vd);
            vdmax = max(vdmax, vd);
            sample_count = sample_count + 1u;
        }
        if (cov_2) {
            let d = textureLoad(depth_tex, coords, 2);
            let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
            vdmin = min(vdmin, vd);
            vdmax = max(vdmax, vd);
            sample_count = sample_count + 1u;
        }
        if (cov_3) {
            let d = textureLoad(depth_tex, coords, 3);
            let vd = view_space_depth(camera, d, pixel_center, screen_dims_f32);
            vdmin = min(vdmin, vd);
            vdmax = max(vdmax, vd);
            sample_count = sample_count + 1u;
        }
        var depth_edge: bool = false;
        if (sample_count >= 2u) {
            let depth_range = abs(vdmax - vdmin);
            let avg_depth = abs((vdmax + vdmin) * 0.5);
            depth_edge = depth_range > (0.02 * avg_depth);
        }

        // ── Check 2: 4-neighbor coverage + normal + depth check ─────
        //
        // Order matches main's `edge_mask_neighbors`:
        //   1. neighbor uncovered (tri_id == U32_MAX) → edge.
        //   2. normal discontinuity (dot < 0.95 ≈ 18°) → edge.
        //   3. depth discontinuity (relative 2% in view-space) → edge.
        //
        // The normal check is critical at tile-facet boundaries on the
        // platform top, where adjacent same-mesh tiles share depth +
        // mat_meta but rotate their surface normal by a small angle.
        // Depth-only neighbour detection misses these and leaves a
        // diagonal stripe of aliasing along the platform's top-front
        // edge.
        let EDGE_NORMAL_THRESHOLD: f32 = 0.95;
        var neighbor_edge: bool = false;
        if (cov_0) {
            let center_d = textureLoad(depth_tex, coords, 0);
            let center_vd = view_space_depth(camera, center_d, pixel_center, screen_dims_f32);
            let depth_threshold = 0.02 * abs(center_vd);
            // Center normal: sample-0 of the multisampled normal_tangent
            // texture, unpacked via the shared TBN helper. Same convention
            // the primary opaque + edge_resolve paths use.
            let center_nt = textureLoad(normal_tangent_tex, coords, 0);
            let center_tbn = unpack_normal_tangent(center_nt);
            let center_normal = center_tbn.N;
            let neighbor_offsets = array<vec2<i32>, 4>(
                vec2<i32>(1, 0),
                vec2<i32>(-1, 0),
                vec2<i32>(0, 1),
                vec2<i32>(0, -1),
            );
            let pixel_offsets = array<vec2<f32>, 4>(
                vec2<f32>(1.0, 0.0),
                vec2<f32>(-1.0, 0.0),
                vec2<f32>(0.0, 1.0),
                vec2<f32>(0.0, -1.0),
            );
            for (var ni = 0; ni < 4; ni++) {
                let n_coords = coords + neighbor_offsets[ni];
                if (n_coords.x < 0 || n_coords.y < 0
                    || n_coords.x >= i32(screen_dims.x)
                    || n_coords.y >= i32(screen_dims.y)) {
                    continue;
                }
                let nv = textureLoad(visibility_data_tex, n_coords, 0);
                let n_tri = join32(nv.x, nv.y);
                if (n_tri == U32_MAX) {
                    neighbor_edge = true;
                    break;
                }
                // Normal discontinuity (sample 0 of neighbour pixel).
                let n_nt = textureLoad(normal_tangent_tex, n_coords, 0);
                let n_tbn = unpack_normal_tangent(n_nt);
                if (dot(center_normal, n_tbn.N) < EDGE_NORMAL_THRESHOLD) {
                    neighbor_edge = true;
                    break;
                }
                // Depth discontinuity (view-space relative).
                let n_depth = textureLoad(depth_tex, n_coords, 0);
                let n_pixel_center = pixel_center + pixel_offsets[ni];
                let n_vd = view_space_depth(camera, n_depth, n_pixel_center, screen_dims_f32);
                if (abs(center_vd - n_vd) > depth_threshold) {
                    neighbor_edge = true;
                    break;
                }
            }
        }

        // Edge gate. Four signals, all needed for parity with main's
        // `msaa_resolve_samples` edge detection:
        //   * `seen_count >= 2u`        — multi-shader-id silhouette
        //   * `any_mesh_differs`        — mesh-vs-mesh (different mat_meta)
        //   * `depth_edge`              — per-sample depth variance
        //                                 (same-mesh in-pixel, e.g. the
        //                                 platform's top/front-face seam)
        //   * `neighbor_edge`           — 4-neighbour depth/coverage check
        //                                 (silhouette runs between pixels)
        //
        // depth_edge + neighbor_edge were previously disabled because
        // they produced "black sides on capsules" — that was an artefact
        // of the texture-pool-arrays-len template mismatch in
        // `texture_pool_sample_grad` (every same-mesh edge fell into the
        // `default → vec4(0)` branch in edge_resolve's compiled shader).
        // With the recompile wired into `finalize_gpu_textures`
        // (textures.rs), per-sample shading at same-mesh edges produces
        // the right value and the depth signals can be re-enabled.
        if (seen_count >= 2u || any_mesh_differs || depth_edge || neighbor_edge) {
            // Allocate compact edge_pixel_id. The atomic counter lives
            // in args_buffer / `edge_buffers` (drives indirect dispatch
            // for final_blend); we also mirror it into edge_data's
            // header so the resolve shaders can read it without binding
            // args_buffer (saves a storage-buffer slot vs the 10-cap).
            let edge_id = atomicAdd(&edge_buffers.edge_count, 1u);
            atomicAdd(&edge_data[edge_layout.edge_count_index], 1u);
            if (edge_id < edge_layout.max_edge_budget) {
                // Write edge_to_xy via atomicStore (edge_data is
                // declared as array<atomic<u32>> so even plain stores
                // go through the atomic interface).
                let packed_xy = (u32(coords.x) & 0xFFFFu) | ((u32(coords.y) & 0xFFFFu) << 16u);
                atomicStore(&edge_data[edge_layout.edge_to_xy_base + edge_id], packed_xy);

                // Unified-edge (U0): mirror the compact edge_pixel_id into
                // the per-pixel edge-id texture. Overwrites the U32_MAX
                // sentinel written above for this pixel. WRITTEN-only in U0
                // (the future unified kernel reads it to branch
                // interior-vs-edge + index the per-sample accumulator). The
                // existing edge-sample-list machinery (append_edge_sample
                // below) stays INTACT alongside it.
                textureStore(edge_id_tex, coords, vec4<u32>(edge_id, 0u, 0u, 0u));

                // Pack the slot_map (§5): the 4 per-sample bucket ids, each
                // truncated to the slot width. 8-bit (≤254 buckets): one u32
                // (4×8) — byte-identical to before, sentinels 0xFE/0xFF.
                // 16-bit (>254): two u32 (4×16), sentinels 0xFFFE/0xFFFF.
                {% if edge_slot_bits == 16 %}
                let slot_base = edge_layout.edge_slot_map_base + edge_id * 2u;
                atomicStore(&edge_data[slot_base],
                    (seen_0 & 0xFFFFu) | ((seen_1 & 0xFFFFu) << 16u));
                atomicStore(&edge_data[slot_base + 1u],
                    (seen_2 & 0xFFFFu) | ((seen_3 & 0xFFFFu) << 16u));
                {% else %}
                let slot_map = (seen_0 & 0xFFu)
                    | ((seen_1 & 0xFFu) << 8u)
                    | ((seen_2 & 0xFFu) << 16u)
                    | ((seen_3 & 0xFFu) << 24u);
                atomicStore(&edge_data[edge_layout.edge_slot_map_base + edge_id], slot_map);
                {% endif %}

                // Clear this edge pixel's 4 accumulator slots (4 vec4<f32>
                // = 16 u32 words) so a bucket whose per-shader edge_resolve
                // pipeline isn't resident this frame leaves count==0 — which
                // final_blend skips — instead of reading a stale value left
                // at the same slot index by a previous frame's edge pixel.
                // This is what makes the resolve per-bucket-independent
                // (render_pass.rs dropped the all-or-nothing gate). Bounded
                // by the live edge count: only freshly-allocated edge pixels
                // are cleared, never the full MAX_EDGE_BUDGET accumulator.
                let accum_clear_base = edge_layout.accumulator_base + edge_id * 16u;
                for (var ci: u32 = 0u; ci < 16u; ci = ci + 1u) {
                    atomicStore(&edge_data[accum_clear_base + ci], 0u);
                }

                // (Unified-edge U2b-3) Sample-list append removed — see the
                // note where `append_edge_sample` used to be defined. cs_shade
                // reads the packed slot map (above) + the per-pixel edge-id
                // texture to drive per-sample edge shading, so no per-bucket
                // lists are built.
                // Final blend args: one workgroup per edge pixel
                // (workgroup_size = 64, so divide by 64).
                if ((edge_id & 63u) == 0u) {
                    atomicAdd(&edge_buffers.final_blend_args.workgroup_count_x, 1u);
                }
            } else {
                atomicAdd(&edge_buffers.edge_overflow_count, 1u);
            }
        }
    }
    {% endif %}
}