awsm-renderer 0.4.0

awsm-renderer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
// Shadow bind-group declarations. The bind-group slot is supplied by
// the containing template via `shadow_group_index` — opaque uses slot
// 3. The transparent pass currently doesn't bind these (the adapter's
// `maxBindGroups=4` budget is fully consumed by transparent's existing
// groups).
//
// Every binding declared below (0..=9) must match the shadow bind-group
// layout; see `shared::material::bind_group::shadow_bind_group_layout_entries`.

// Fixed element count of the `ShadowDescriptorArray.items` uniform array.
const MAX_SHADOW_DESCRIPTORS: u32 = 32u;

// Per-shadow parameters consumed by the `sample_shadow_*` functions.
// Some fields are overloaded depending on which sampling function reads
// the descriptor — see the per-field notes.
struct ShadowDescriptor {
    // Light-space view-projection used at sample time.
    view_projection: mat4x4<f32>,
    // 2D atlas / EVSM: (atlas.x, atlas.y, atlas.w, atlas.h) in normalised
    // UV space.
    // `sample_shadow_cube`: xyz = light position, w = light range.
    // `sample_shadow_cascade_array`: x = layer index (as f32),
    // zw = used sub-rect width/height in normalised UV.
    atlas_rect: vec4<f32>,
    // (depth_bias, normal_bias, hardness, pcss_penumbra_scale)
    // `hardness` selects the filter in the samplers: `< 0.5` is a single
    // hardware compare, `< 1.5` is the Soft path; `sample_shadow_cube`
    // treats anything else as PCSS.
    bias_params: vec4<f32>,
    // (split_far_view_z, cascade_index, cascade_count_in_light, evsm_flag)
    // `sample_shadow_cube` reads `.y` as the cube-array slot.
    // `evsm_flag` is 1.0 when this cascade should sample EVSM moments
    // from `evsm_atlas` instead of the PCF depth atlas. The flag +
    // sample-site dispatch are wired; the moment-write compute pass
    // and Gaussian blur landed alongside it. If a future tweak leaves
    // EVSM disabled the cascade falls back to PCF on `shadow_atlas`.
    cascade_info: vec4<f32>,
};

// Settings shared by every shadow descriptor; bound as the uniform at
// binding 6 (`shadow_globals`).
struct ShadowGlobals {
    // (atlas.w, atlas.h, evsm.w, evsm.h)
    // `.zw` feeds the half-texel inset in `sample_shadow_evsm`.
    atlas_sizes: vec4<f32>,
    // (evsm_exponent, evsm_blur_radius, sscs_step_count, sscs_enabled)
    // `apply_sscs` treats `.w < 0.5` as disabled and clamps `.z` to at
    // least one step.
    evsm_sscs: vec4<f32>,
    // (debug_cascade_colors, max_point_shadows, pad, pad)
    flags: vec4<u32>,
    // (cascade_array.w, cascade_array.h, max_layers, _) — per-layer
    // dimensions of the directional cascade texture array.
    cascade_array: vec4<f32>,
};

// Uniform-buffer wrapper around the fixed-size descriptor table; bound at
// binding 7 (`shadow_descriptors`).
struct ShadowDescriptorArray {
    // Always `MAX_SHADOW_DESCRIPTORS` entries long.
    items: array<ShadowDescriptor, MAX_SHADOW_DESCRIPTORS>,
};

// 0/1: 2D depth atlas used by the PCF path, plus its comparison sampler.
// The same sampler is also used with `shadow_cascade_array` (binding 8).
@group({{ shadow_group_index }}) @binding(0) var shadow_atlas: texture_depth_2d;
@group({{ shadow_group_index }}) @binding(1) var shadow_atlas_sampler: sampler_comparison;
// 2/3: point-light depth cube array and its comparison sampler
// (`sample_shadow_cube`).
@group({{ shadow_group_index }}) @binding(2) var shadow_cube_array: texture_depth_cube_array;
@group({{ shadow_group_index }}) @binding(3) var shadow_cube_sampler: sampler_comparison;
// 4/5: EVSM moment texture and the sampler `sample_shadow_evsm` fetches
// it with.
@group({{ shadow_group_index }}) @binding(4) var evsm_atlas: texture_2d<f32>;
@group({{ shadow_group_index }}) @binding(5) var evsm_atlas_sampler: sampler;
// 6/7: global shadow settings and the per-shadow descriptor table.
@group({{ shadow_group_index }}) @binding(6) var<uniform> shadow_globals: ShadowGlobals;
@group({{ shadow_group_index }}) @binding(7) var<uniform> shadow_descriptors: ShadowDescriptorArray;
// 8: directional cascade depth layers (`sample_shadow_cascade_array`).
@group({{ shadow_group_index }}) @binding(8) var shadow_cascade_array: texture_depth_2d_array;
// 9: 2D-array texture that the PCSS blocker search in `sample_shadow_cube`
// reads raw depths from via `textureLoad`, addressed as
// `layer = slot * 6 + face`.
@group({{ shadow_group_index }}) @binding(9) var shadow_cube_2d_array: texture_depth_2d_array;

// Sentinel for "no shadow" (all 32 bits set) — packed into
// `LightPacked.row4.z`. Kept outside the `needs_shadow_sampling` gate:
// `apply_lighting` compares against it even before any shadow sample, and an
// unused const is free.
const SHADOW_INDEX_NONE: u32 = 0xFFFFFFFFu;

// ── Shadow SAMPLING (PCSS / PCF / EVSM / cube + SSCS) ───────────────────────
// Called ONLY from `apply_lighting`, so the whole block is gated on
// `needs_shadow_sampling` (= inc.apply_lighting). Materials that don't run
// first-party lighting (every custom material + unlit/toon/flipbook + the empty
// kernel) drop this ~50 KB of WGSL entirely. The bind group + structs above stay
// (ABI — the pipeline layout always has the shadow group).
{% if needs_shadow_sampling %}

// 16 Poisson-distributed samples in `[-1, 1]^2`. Shared by the PCSS
// blocker search and by the Soft and variable-kernel PCF loops; reusing
// one table for all of them keeps the WGSL small. Callers rotate each
// offset with `pcss_rotate` by a per-fragment angle to break up the
// regular pattern, then scale it by their kernel radius.
const POISSON_DISK_16: array<vec2<f32>, 16> = array<vec2<f32>, 16>(
    vec2<f32>(-0.94201624, -0.39906216),
    vec2<f32>( 0.94558609, -0.76890725),
    vec2<f32>(-0.09418410, -0.92938870),
    vec2<f32>( 0.34495938,  0.29387760),
    vec2<f32>(-0.91588581,  0.45771432),
    vec2<f32>(-0.81544232, -0.87912464),
    vec2<f32>(-0.38277543,  0.27676845),
    vec2<f32>( 0.97484398,  0.75648379),
    vec2<f32>( 0.44323325, -0.97511554),
    vec2<f32>( 0.53742981, -0.47373420),
    vec2<f32>(-0.26496911, -0.41893023),
    vec2<f32>( 0.79197514,  0.19090188),
    vec2<f32>(-0.24188840,  0.99706507),
    vec2<f32>(-0.81409955,  0.91437590),
    vec2<f32>( 0.19984126,  0.78641367),
    vec2<f32>( 0.14383161, -0.14100790),
);

// Interleaved Gradient Noise (Jorge Jimenez's hash) scaled to an angle in
// radians. `fract` yields `[0, 1)`, so the result is nominally in
// `[0, 2π)`. Callers use it to rotate the Poisson disk so adjacent
// fragments don't sample identical patterns.
//
// `coords` is the hash seed — any 2D value that should decorrelate
// neighbouring fragments.
fn pcss_disk_angle(coords: vec2<f32>) -> f32 {
    let TWO_PI: f32 = 6.2831853;
    let weights = vec2<f32>(0.06711056, 0.00583715);
    let inner = fract(dot(coords, weights));
    let unit_noise = fract(52.9829189 * inner);
    return unit_noise * TWO_PI;
}

// Rotates `v` counter-clockwise by the angle whose sine and cosine are
// `sin_a` / `cos_a`. Taking both precomputed lets callers evaluate
// `sin`/`cos` once and reuse them for every tap.
fn pcss_rotate(v: vec2<f32>, sin_a: f32, cos_a: f32) -> vec2<f32> {
    let rotated_x = v.x * cos_a - v.y * sin_a;
    let rotated_y = v.x * sin_a + v.y * cos_a;
    return vec2<f32>(rotated_x, rotated_y);
}

// Screen-space contact shadows (SSCS).
//
// Short ray-march from `world_pos` toward `light_dir` (the surface→light
// direction) against the already-bound depth buffer (`depth_tex`). This
// is a *contact-shadow refinement* multiplied into the main shadow term:
// it darkens the narrow band where caster geometry meets receiver
// geometry (gaps under feet, hair, the "Peter Pan" gap left by the
// shadow map's texel resolution). It is NOT a substitute for the main
// shadow.
//
// Parameters:
//   * `world_pos`  — receiver position in world space.
//   * `light_dir`  — surface→light direction in world space. Steps are
//     `SSCS_STEP_WORLD * light_dir`, so the metre-based tunables assume
//     it is unit length.
//
// Returns visibility in `[1 - MAX_DARKENING, 1]`; `1.0` means no contact
// occluder was found.
//
// `shadow_globals.evsm_sscs.w` is the master enable; `.z` is the step
// count. Uses single-sample depth reads even when the geometry pass
// was rendered with MSAA (we read sample 0).
//
// The transparent pass doesn't bind a `depth_tex` (sampling the
// in-progress depth target on the same pass would be a feedback loop),
// so its shader template sets `sscs_available = false` and this
// function short-circuits to "fully lit" before any depth fetch.
fn apply_sscs(world_pos: vec3<f32>, light_dir: vec3<f32>) -> f32 {
{% if sscs_available %}
    if shadow_globals.evsm_sscs.w < 0.5 {
        return 1.0;
    }
    // Clamped to at least one step, so `steps` is never zero and the
    // final `hits / f32(steps)` cannot divide by zero. (An earlier
    // `steps == 0u` early-out was unreachable because of this clamp.)
    let steps = u32(max(shadow_globals.evsm_sscs.z, 1.0));

    // The comparison is done in **linear view-space Z** (metres),
    // not NDC.z. This matters: NDC.z under perspective compresses
    // wildly with distance — a `0.001` NDC.z window covers ~1 mm at
    // the near plane but ~5 m at view-z = -50 m, so any NDC.z-based
    // thickness window misclassifies far receivers' rays against
    // unrelated background geometry. Earlier revisions had exactly
    // this failure mode (visible trails at zoom-out).
    //
    // Math:
    //   * receiver view-Z is `(camera.view · world_pos).z` (linear).
    //   * walking the ray `t_world` metres along `light_dir` changes
    //     view-Z by `(camera.view · light_dir).z · t_world` — also
    //     linear, so each march step's view-Z is exact.
    //   * the sampled depth-buffer texel is converted back to
    //     view-Z via `inv_proj`, which handles both perspective and
    //     ortho cameras correctly.
    //   * a scene texel "in front of the ray" satisfies
    //     `scene_view_z - ray_view_z > 0` (closer to camera = less
    //     negative). The thickness window is in metres and
    //     consistent across all depths.

    // Tunables — all are physical (metres or per-frame budget).
    // World-space step length is fixed so the same surface point
    // samples the same world positions every frame; only the depth
    // buffer read at each step's screen projection varies. This
    // matches the original Drobot 2017 formulation and avoids the
    // temporal jitter that a pixel-driven march produces (the
    // pixel-per-world ratio changes as the camera zooms, so a
    // fixed-pixel march samples different world positions every
    // frame even for the same surface).
    let SSCS_STEP_WORLD: f32 = 0.04;          // 4 cm per step → 64 cm reach @ 16 steps
    let SSCS_THICKNESS: f32 = 0.05;           // 5 cm slab counts as occluder
    let SSCS_SELF_OCCLUSION_EPS: f32 = 0.002; // 2 mm self-occlusion guard
    let MAX_DARKENING: f32 = 0.35;            // SSCS is refinement, not shadow

    let viewport_size = camera_raw.viewport.zw;
    let depth_dim = vec2<i32>(viewport_size);

    // Receiver view-Z, and the view-Z slope per world-space metre along
    // the ray; `light_dir` is a direction vector (w = 0).
    let recv_view_z = (camera_raw.view * vec4<f32>(world_pos, 1.0)).z;
    let view_z_per_world = (camera_raw.view * vec4<f32>(light_dir, 0.0)).z;

    // World-space-stable per-fragment jitter on the start offset to
    // dither step quantisation between neighbouring receivers without
    // introducing per-frame noise. Hashing on the pixel coordinate
    // would change every camera move (same surface → different
    // pixel) which manifests as visible flicker; world-space
    // hashing is camera-invariant.
    let jitter_seed = world_pos.xz * 137.0
        + vec2<f32>(world_pos.y * 31.0, world_pos.y * 17.0);
    // `pcss_disk_angle` returns an angle; dividing by 2π maps it back
    // to a unit-range jitter factor.
    let jitter = pcss_disk_angle(jitter_seed) * (1.0 / 6.2831853);
    let t_start_world = (1.0 + jitter) * SSCS_STEP_WORLD;

    var hits: f32 = 0.0;
    for (var i: u32 = 0u; i < steps; i = i + 1u) {
        let t_world = t_start_world + SSCS_STEP_WORLD * f32(i);

        // Same world point every frame — project it now to find the
        // depth-buffer texel to sample.
        let ray_world = world_pos + light_dir * t_world;
        let clip = camera_raw.view_proj * vec4<f32>(ray_world, 1.0);
        if clip.w <= 0.0 {
            // Behind the camera — nothing to sample.
            continue;
        }
        let ndc = clip.xyz / clip.w;
        if ndc.x < -1.0 || ndc.x > 1.0 || ndc.y < -1.0 || ndc.y > 1.0 {
            // Off-screen — the depth buffer has no data here.
            continue;
        }
        // NDC → texel coordinates (Y flipped: NDC +Y is up, texel +Y is down).
        let px_uv = ndc.xy * vec2<f32>(0.5, -0.5) + vec2<f32>(0.5, 0.5);
        let px_f = px_uv * viewport_size;
        let px = vec2<i32>(px_f);
        if px.x < 0 || px.y < 0 || px.x >= depth_dim.x || px.y >= depth_dim.y {
            continue;
        }
        let scene_ndc_z = textureLoad(depth_tex, px, 0);
        if scene_ndc_z >= 1.0 {
            // Background — no occluder to find here.
            continue;
        }

        // Ray view-Z is linear in `t_world` — exact, no projection
        // round-trip needed.
        let ray_view_z = recv_view_z + view_z_per_world * t_world;
        // Linearise the sampled depth via the camera's inv_proj.
        // For perspective this is non-affine; for ortho it's a
        // simple scale. Either way the .z / .w form is correct.
        let scene_view_h = camera_raw.inv_proj
            * vec4<f32>(ndc.xy, scene_ndc_z, 1.0);
        let scene_view_z = scene_view_h.z / scene_view_h.w;

        // Both view-Z values are linear and negative for points in
        // front of the camera. A scene texel closer to the camera
        // than the ray has `scene_view_z > ray_view_z` (less
        // negative). The thickness slab keeps far-background
        // geometry from counting as an occluder.
        let dz = scene_view_z - ray_view_z;
        if dz > SSCS_SELF_OCCLUSION_EPS && dz < SSCS_THICKNESS {
            hits = hits + 1.0;
        }
    }

    // Fraction of steps that found an occluder, scaled so that even a
    // fully occluded march only darkens by `MAX_DARKENING`.
    let occluded = hits / f32(steps);
    return 1.0 - occluded * MAX_DARKENING;
{% else %}
    return 1.0;
{% endif %}
}

// Cube near plane — MUST match the value used in `Mat4::perspective_rh`
// for cube face generation in `Shadows::write_gpu`. `sample_shadow_cube`
// uses it to rebuild the NDC.z the rasterizer wrote, so a mismatch makes
// every depth comparison wrong.
const POINT_SHADOW_NEAR: f32 = 0.05;

// Point-light cube shadow sample.
//
// Each cube face stores perspective NDC.z written by the rasterizer
// (90° FOV, `near = POINT_SHADOW_NEAR`, `far = light_range`). The
// projection is post-multiplied by a Y-flip on the writer side so the
// rasterized image lines up with WebGPU's D3D-style cube sampling
// convention (texel `t=0` → world +Y on the +X face, etc.) — see
// `Shadows::write_gpu`. That flip doesn't change NDC.z, so the depth
// formula below stays the same on both sides.
//
// The receiver recreates that NDC.z by projecting `length(light, P)`
// onto the *dominant* cube axis of the light-to-surface direction:
//
//     view_depth = distance(light, P) · |dir.major|
//     ndc_z      = (far / (far - near)) · (1 - near / view_depth)
//
// Same formula generates both the rasterized atlas value and the
// receiver reference, so they compare directly — no linear-depth FS
// override, no per-tap face recompute, no seam math.
//
// Parameters:
//   * `desc` — cube-flavoured descriptor: `atlas_rect.xyz` = light
//     position, `atlas_rect.w` = light range, `cascade_info.y` = cube
//     array slot, `bias_params` = (depth_bias, normal_bias, hardness,
//     pcss_penumbra_scale).
//   * `world_pos` / `world_normal` — receiver position and normal in
//     world space.
//
// Returns visibility in `[0, 1]`: `1.0` is unshadowed (also returned for
// receivers at or beyond the light range), `0.0` is fully shadowed.
//
// `hardness` selects the filter: `< 0.5` → one hardware compare,
// `< 1.5` → Soft (fixed 16-tap disc), otherwise PCSS (blocker search +
// variable 16-tap disc).
fn sample_shadow_cube(desc: ShadowDescriptor, world_pos: vec3<f32>, world_normal: vec3<f32>) -> f32 {
    let light_pos = desc.atlas_rect.xyz;
    // NOTE(review): the 0.01 floor is below `POINT_SHADOW_NEAR` (0.05), so
    // `range - near` in the NDC.z formula is <= 0 for lights whose range
    // is <= 0.05 — confirm such ranges are rejected upstream.
    let range = max(desc.atlas_rect.w, 0.01);
    let slot = i32(desc.cascade_info.y);

    // Normal-offset bias: push the receiver off the surface before
    // measuring its distance to the light.
    let biased_pos = world_pos + world_normal * desc.bias_params.y;
    let light_to_surface = biased_pos - light_pos;
    let dist = length(light_to_surface);
    if dist >= range {
        return 1.0;
    }
    let dir = light_to_surface / max(dist, 1e-4);

    // Major-axis (cube-face) projected depth.
    let abs_d = abs(dir);
    let major = max(abs_d.x, max(abs_d.y, abs_d.z));
    let view_depth = dist * max(major, 1e-4);

    // Same perspective NDC.z formula as the rasterizer.
    let near = POINT_SHADOW_NEAR;
    let ndc_z = (range / (range - near)) * (1.0 - near / max(view_depth, near));

    // Slope-aware constant bias. `n_dot_dir` floor at 0.05 keeps
    // grazing surfaces from running away to huge bias values
    // (`bias → ∞` as `n_dot_dir → 0`); the user-authored
    // `desc.bias_params.x` (the per-light `depth_bias`) is trusted
    // as-is. An earlier floor of `max(..., 0.001)` here silently
    // overrode any inspector value smaller than 0.001 — that was
    // ~10× the NDC gap between a receiver and a box's back face at
    // a typical 4 m point-light distance, so contacts could never
    // close even after lowering `depth_bias`. If you DO want a
    // global floor for some project, gate it on
    // `ShadowsConfig::min_point_depth_bias` (not present today).
    let n_dot_dir = abs(dot(dir, world_normal));
    let bias = desc.bias_params.x / max(n_dot_dir, 0.05);
    let ref_depth = clamp(ndc_z, 0.0, 1.0) - bias;
    let hardness = desc.bias_params.z;

    // Hard — a single hardware depth compare along the central direction.
    if hardness < 0.5 {
        return textureSampleCompareLevel(
            shadow_cube_array,
            shadow_cube_sampler,
            dir,
            slot,
            ref_depth,
        );
    }

    // Soft and PCSS share the same disc-on-tangent-plane tap layout:
    // each tap recomputes its own direction-from-light + NDC.z + bias
    // (rather than rotating the central `dir`) so a flat receiver
    // doesn't self-shadow into a kernel-shaped patch. The PCSS path
    // additionally does a blocker-search pre-pass using
    // `shadow_cube_2d_array` (raw depth reads) to scale the kernel.
    //
    // Tangent frame: pick a hint axis that isn't (nearly) parallel to the
    // normal so the cross product stays well-conditioned.
    let abs_n = abs(world_normal);
    let up_hint = select(
        vec3<f32>(0.0, 1.0, 0.0),
        vec3<f32>(1.0, 0.0, 0.0),
        abs_n.y > 0.99,
    );
    let tangent = normalize(cross(up_hint, world_normal));
    let bitangent = cross(world_normal, tangent);

    // Per-fragment disc rotation, hashed from the world-space position.
    let angle = pcss_disk_angle(
        biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
    );
    let sin_a = sin(angle);
    let cos_a = cos(angle);

    if hardness < 1.5 {
        // Soft — fixed 16-tap rotated Poisson, ~15 cm world disc.
        // Distance tapering applies ONLY to the PCSS branch below
        // (where the variable-kernel PCF can absorb the noise floor
        // a smaller sample count introduces). The Soft path is the
        // user's "I want a clean smooth shadow, no contact hardening"
        // setting; dropping its tap count introduces visible Poisson-
        // rotation banding on large smooth receivers (the floor in
        // the canonical "directional light + character on a plane"
        // test). 16 fixed = visually clean; the tap-count knob exists
        // for the PCSS branch's wide-kernel pass.
        // World-space disc radius. Base 0.15 m at `pcss_penumbra_scale == 1`;
        // the per-light knob (bias_params.w) is the user's softness control,
        // shared with PCSS so one slider governs both modes for point lights too.
        let SOFT_WORLD_RADIUS: f32 = 0.15 * max(desc.bias_params.w, 0.0);
        var sum = 0.0;
        for (var i = 0u; i < 16u; i = i + 1u) {
            // Tap position on the receiver's tangent plane, then the same
            // direction / NDC.z / bias derivation as the central sample.
            let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * SOFT_WORLD_RADIUS;
            let tap_pos = biased_pos + tangent * off.x + bitangent * off.y;
            let tap_to_light = tap_pos - light_pos;
            let tap_dist = length(tap_to_light);
            let tap_dir = tap_to_light / max(tap_dist, 1e-4);
            let tap_abs = abs(tap_dir);
            let tap_major = max(tap_abs.x, max(tap_abs.y, tap_abs.z));
            let tap_view_depth = tap_dist * max(tap_major, 1e-4);
            let tap_ndc_z =
                (range / (range - near)) * (1.0 - near / max(tap_view_depth, near));
            let tap_n_dot_dir = abs(dot(tap_dir, world_normal));
            let tap_bias = desc.bias_params.x / max(tap_n_dot_dir, 0.05);
            let tap_ref = clamp(tap_ndc_z, 0.0, 1.0) - tap_bias;
            sum += textureSampleCompareLevel(
                shadow_cube_array,
                shadow_cube_sampler,
                tap_dir,
                slot,
                tap_ref,
            );
        }
        return sum / 16.0;
    }

    // PCSS — real blocker search + variable kernel.
    //
    // Stage 1 (blocker search): sample a fixed 16-tap "search" disc
    // sized by `pcss_penumbra_scale` (a virtual light disc radius in
    // metres). At each tap, project the tap's light direction onto
    // the right cube face, fetch raw depth via the 2D-array view,
    // and average the depths of taps that lie in front of the
    // receiver.
    //
    // Stage 2 (variable PCF): derive a penumbra radius from the
    // standard PCSS formula `(d_recv - d_avg) * light_size / d_avg`
    // and re-sample with `textureSampleCompareLevel`, this time
    // through the cube sampler so we get hardware bilinear PCF.
    //
    // The cube faces share a single NDC.z formula with the writer:
    //   ndc_z = (range / (range - near)) * (1 - near / view_depth)
    // so `textureLoad`-ed depths are directly comparable to the
    // per-tap `ref_depth` we compute here.
    let pcss_scale = max(desc.bias_params.w, 0.01);
    // Blocker-search disc: fixed 30 cm world radius scaled by
    // `pcss_penumbra_scale`. Bigger = fatter blocker estimate.
    let pcss_search_world_radius = 0.30 * pcss_scale;
    // Cube face dimension (px) for face-UV → texel conversion. All
    // faces share the same square resolution.
    let cube_dims = textureDimensions(shadow_cube_2d_array, 0);
    let cube_face_size = vec2<f32>(f32(cube_dims.x), f32(cube_dims.y));

    // Fixed 16-tap blocker search. We previously tapered this by
    // `dist / range` to save fragment cost on distant receivers,
    // but the variable-kernel PCF below needs all 16 samples to
    // resolve smoothly — undersampled wide penumbras showed
    // visible Poisson-rotation banding (cube version less obvious
    // than directional, but present). The unused helper
    // `pcss_tap_count` is kept above for future re-introduction
    // once a quality-preserving tap budget is worked out.
    // NOTE(review): `pcss_tap_count` is not defined in the part of the
    // template visible here — confirm the helper still exists.
    var blocker_sum = 0.0;
    var blocker_count = 0u;
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * pcss_search_world_radius;
        let tap_pos = biased_pos + tangent * off.x + bitangent * off.y;
        let tap_to_light = tap_pos - light_pos;
        let tap_dist = length(tap_to_light);
        let tap_dir = tap_to_light / max(tap_dist, 1e-4);
        let tap_abs = abs(tap_dir);
        let tap_major = max(tap_abs.x, max(tap_abs.y, tap_abs.z));
        let tap_view_depth = tap_dist * max(tap_major, 1e-4);
        let tap_ndc_z = clamp(
            (range / (range - near)) * (1.0 - near / max(tap_view_depth, near)),
            0.0,
            1.0,
        );
        // Inline cube-direction → (face, uv) projection. Standard
        // D3D cube convention; the writer's post-projection Y-flip is
        // already baked into the texel layout.
        // Faces are numbered 0..5 = +X, -X, +Y, -Y, +Z, -Z; `tap_ma` is the
        // magnitude of the dominant axis used to normalise (uc, vc).
        var tap_face: u32 = 0u;
        var tap_uc: f32 = 0.0;
        var tap_vc: f32 = 0.0;
        var tap_ma: f32 = 1e-4;
        if tap_abs.x >= tap_abs.y && tap_abs.x >= tap_abs.z {
            if tap_dir.x > 0.0 {
                tap_face = 0u; tap_uc = -tap_dir.z; tap_vc = -tap_dir.y; tap_ma = tap_abs.x;
            } else {
                tap_face = 1u; tap_uc =  tap_dir.z; tap_vc = -tap_dir.y; tap_ma = tap_abs.x;
            }
        } else if tap_abs.y >= tap_abs.z {
            if tap_dir.y > 0.0 {
                tap_face = 2u; tap_uc =  tap_dir.x; tap_vc =  tap_dir.z; tap_ma = tap_abs.y;
            } else {
                tap_face = 3u; tap_uc =  tap_dir.x; tap_vc = -tap_dir.z; tap_ma = tap_abs.y;
            }
        } else {
            if tap_dir.z > 0.0 {
                tap_face = 4u; tap_uc =  tap_dir.x; tap_vc = -tap_dir.y; tap_ma = tap_abs.z;
            } else {
                tap_face = 5u; tap_uc = -tap_dir.x; tap_vc = -tap_dir.y; tap_ma = tap_abs.z;
            }
        }
        // Map (uc, vc) / ma from [-1, 1] to face UV in [0, 1].
        let tap_inv = 0.5 / max(tap_ma, 1e-4);
        let face_uv = vec2<f32>(tap_uc * tap_inv + 0.5, tap_vc * tap_inv + 0.5);
        // Six consecutive layers per cube slot, one per face.
        let layer = i32(slot) * 6 + i32(tap_face);
        // Clamp so a UV of exactly 1.0 still addresses the last texel.
        let tex_xy = clamp(
            vec2<i32>(face_uv * cube_face_size),
            vec2<i32>(0, 0),
            vec2<i32>(cube_dims.xy) - vec2<i32>(1, 1),
        );
        let d = textureLoad(shadow_cube_2d_array, tex_xy, layer, 0);
        // Bias-free blocker test — we want a clean estimate of how
        // many genuine occluders sit in front of the receiver. The
        // 0.0005 epsilon matches the directional PCSS path.
        if d < tap_ndc_z - 0.0005 {
            blocker_sum = blocker_sum + d;
            blocker_count = blocker_count + 1u;
        }
    }
    // No blockers → fully lit; every tap blocked → fully shadowed. Both
    // cases skip the second 16-tap pass.
    if blocker_count == 0u {
        return 1.0;
    }
    if blocker_count == 16u {
        return 0.0;
    }
    let avg_blocker = blocker_sum / f32(blocker_count);
    // PCSS penumbra in NDC.z space: `(z_recv - z_blocker) * light /
    // z_blocker`. Map back to a world-space disc radius on the
    // receiver tangent plane by treating the receiver-to-light
    // distance as the projection distance — light_size in world
    // metres = `pcss_penumbra_scale × 1m × penumbra_ratio`.
    let recv_ndc_z = clamp(ndc_z, 0.0, 1.0);
    let penumbra_ratio = clamp(
        (recv_ndc_z - avg_blocker) / max(avg_blocker, 1e-4),
        0.0,
        4.0,
    );
    // Clamp to keep the kernel between "more than Soft" (10 cm) and
    // "still affordable" (1 m world disc — already huge at typical
    // point-light scales).
    let penumbra_world_radius = clamp(
        pcss_search_world_radius * penumbra_ratio,
        0.10,
        1.00,
    );

    // Stage 2: same per-tap derivation as the Soft branch, but with the
    // blocker-derived disc radius.
    var sum = 0.0;
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * penumbra_world_radius;
        let tap_pos = biased_pos + tangent * off.x + bitangent * off.y;
        let tap_to_light = tap_pos - light_pos;
        let tap_dist = length(tap_to_light);
        let tap_dir = tap_to_light / max(tap_dist, 1e-4);
        let tap_abs = abs(tap_dir);
        let tap_major = max(tap_abs.x, max(tap_abs.y, tap_abs.z));
        let tap_view_depth = tap_dist * max(tap_major, 1e-4);
        let tap_ndc_z =
            (range / (range - near)) * (1.0 - near / max(tap_view_depth, near));
        let tap_n_dot_dir = abs(dot(tap_dir, world_normal));
        let tap_bias = desc.bias_params.x / max(tap_n_dot_dir, 0.05);
        let tap_ref = clamp(tap_ndc_z, 0.0, 1.0) - tap_bias;
        sum += textureSampleCompareLevel(
            shadow_cube_array,
            shadow_cube_sampler,
            tap_dir,
            slot,
            tap_ref,
        );
    }
    return sum / 16.0;
}


// EVSM sample. Reads the four exponential moments from `evsm_atlas`
// (written + blurred by the compute passes in `shadows::evsm`),
// reconstructs positive and negative one-tailed Chebyshev visibility,
// and returns `min(pos, neg)`. The pre-write blur is the source of
// softness — at sample time we do a single bilinear fetch.
//
// The exponent used at write time is `shadow_globals.evsm_sscs.x`
// (config.evsm_exponent). Receiver and writer must agree, else the
// curve mismatches and shadows go solid / clear.
// One-tailed Chebyshev upper bound on visibility for one EVSM moment pair.
//
//   * `moments_2` — (E[x], E[x²]) of the warped depth.
//   * `t`         — the receiver's warped depth.
//
// Returns `1.0` when the receiver is at or in front of the mean, otherwise
// the Chebyshev bound remapped to reduce light bleeding.
fn chebyshev_upper(moments_2: vec2<f32>, t: f32) -> f32 {
    let first_moment = moments_2.x;
    let delta = t - first_moment;
    if delta <= 0.0 {
        return 1.0;
    }
    // variance = E[x²] − (E[x])², floored so a perfectly flat receiver
    // can't produce a zero denominator.
    let sigma_sq = max(moments_2.y - first_moment * first_moment, 1e-5);
    let bound = sigma_sq / (sigma_sq + delta * delta);
    // Linstep light-bleed reduction: bounds below 0.2 collapse to fully
    // shadowed so partial occluders don't lift the shadow into halftone.
    return clamp((bound - 0.2) / 0.8, 0.0, 1.0);
}

// Samples the EVSM moment atlas for one descriptor.
//
//   * `desc`         — descriptor whose `atlas_rect` is the tile in
//     `evsm_atlas` (xy = origin, zw = size, normalised UV) and whose
//     `bias_params.y` is the normal-offset bias.
//   * `world_pos` / `world_normal` — receiver position and normal.
//
// Returns visibility in `[0, 1]`; receivers that project behind the light
// or outside its frustum are treated as fully lit (`1.0`).
fn sample_shadow_evsm(
    desc: ShadowDescriptor,
    world_pos: vec3<f32>,
    world_normal: vec3<f32>,
) -> f32 {
    // Offset the receiver along its normal, then project into light space.
    let receiver = world_pos + world_normal * desc.bias_params.y;
    let light_clip = desc.view_projection * vec4<f32>(receiver, 1.0);
    if light_clip.w <= 0.0 {
        return 1.0;
    }
    let light_ndc = light_clip.xyz / light_clip.w;
    let outside_x = light_ndc.x < -1.0 || light_ndc.x > 1.0;
    let outside_y = light_ndc.y < -1.0 || light_ndc.y > 1.0;
    let outside_z = light_ndc.z < 0.0 || light_ndc.z > 1.0;
    if outside_x || outside_y || outside_z {
        return 1.0;
    }

    // NDC → tile-local UV (Y flipped) → atlas UV.
    let tile_origin = desc.atlas_rect.xy;
    let tile_extent = desc.atlas_rect.zw;
    let tile_uv = vec2<f32>(light_ndc.x * 0.5 + 0.5, -light_ndc.y * 0.5 + 0.5);
    let unclamped_uv = tile_origin + tile_uv * tile_extent;

    // Keep the bilinear footprint inside this tile: inset the valid range
    // by half an EVSM texel on every side. Otherwise the 2×2 fetch at the
    // tile edge would blend in a neighbouring tile's moments (or
    // uninitialised memory) and draw a hard rectangular cliff along the
    // cascade outline.
    let evsm_texel = vec2<f32>(
        1.0 / shadow_globals.atlas_sizes.z,
        1.0 / shadow_globals.atlas_sizes.w,
    );
    let half_texel = 0.5 * evsm_texel;
    let uv_lo = tile_origin + half_texel;
    let uv_hi = tile_origin + tile_extent - half_texel;
    let sample_uv = clamp(unclamped_uv, uv_lo, uv_hi);

    let moments = textureSampleLevel(evsm_atlas, evsm_atlas_sampler, sample_uv, 0.0);

    // Warp the receiver depth exactly like the writer did: remap [0, 1]
    // to [-1, 1] (see `shadows::evsm::MOMENT_WRITE_WGSL`), then apply the
    // positive and negative exponential warps with the shared exponent.
    let warp_exponent = shadow_globals.evsm_sscs.x;
    let signed_depth = 2.0 * light_ndc.z - 1.0;
    let warped_pos = exp(warp_exponent * signed_depth);
    let warped_neg = -exp(-warp_exponent * signed_depth);

    // Take the more conservative of the two one-tailed bounds.
    let visibility_pos = chebyshev_upper(moments.xy, warped_pos);
    let visibility_neg = chebyshev_upper(moments.zw, warped_neg);
    return min(visibility_pos, visibility_neg);
}

// Sample a directional-cascade descriptor (kind = 3) backed by the
// `shadow_cascade_array` texture. Layout in atlas_rect:
//   .x = layer index (as f32)
//   .y = 0 (cascade starts at layer origin)
//   .zw = used sub-rect width/height in normalised UV
//
// Hardness branches mirror `sample_shadow_descriptor`'s 2D path; the
// only difference is the bound texture and an explicit layer argument
// on every compare/load.
//
// Returns 1.0 when the receiver projects behind the cascade
// (`clip.w <= 0`) or outside its NDC box; otherwise the depth-compare
// result for the selected hardness (a single tap, or the mean of 16
// taps for Soft / PCSS).
fn sample_shadow_cascade_array(
    desc: ShadowDescriptor,
    world_pos: vec3<f32>,
    world_normal: vec3<f32>,
) -> f32 {
    let layer = i32(desc.atlas_rect.x);
    // Normal-offset bias: move the receiver along its normal by
    // `bias_params.y` before projecting into the cascade.
    let biased_pos = world_pos + world_normal * desc.bias_params.y;
    let clip = desc.view_projection * vec4<f32>(biased_pos, 1.0);
    if clip.w <= 0.0 {
        return 1.0;
    }
    let ndc = clip.xyz / clip.w;
    if ndc.x < -1.0 || ndc.x > 1.0 || ndc.y < -1.0 || ndc.y > 1.0 || ndc.z < 0.0 || ndc.z > 1.0 {
        return 1.0;
    }
    // NDC → [0,1] UV with Y flipped.
    let uv_local = vec2<f32>(ndc.x * 0.5 + 0.5, -ndc.y * 0.5 + 0.5);
    // Cascades always start at the layer origin; multiply by the
    // sub-rect size in normalised UV so smaller cascades don't read
    // outside their valid region.
    let atlas_uv = uv_local * desc.atlas_rect.zw;
    // Constant depth bias (`bias_params.x`) applied to the compare
    // reference; `bias_params.z` selects the filter mode.
    let ref_depth = ndc.z - desc.bias_params.x;
    let hardness = desc.bias_params.z;

    let inv_atlas = vec2<f32>(
        1.0 / shadow_globals.cascade_array.x,
        1.0 / shadow_globals.cascade_array.y,
    );
    // Half-texel inset to keep the bilinear / PCF taps inside the
    // valid sub-rect of the layer when `used_res < layer_size`.
    let tile_min = 0.5 * inv_atlas;
    let tile_max = desc.atlas_rect.zw - 0.5 * inv_atlas;

    // Hard: a single hardware compare tap.
    if hardness < 0.5 {
        return textureSampleCompareLevel(
            shadow_cascade_array,
            shadow_atlas_sampler,
            clamp(atlas_uv, tile_min, tile_max),
            layer,
            ref_depth,
        );
    }
    if hardness < 1.5 {
        // Soft — fixed 16-tap rotated Poisson. See the matching
        // comment in `sample_shadow_cube`'s Soft branch — tapering
        // here introduced visible banding on large smooth receivers
        // (e.g. a floor plane under a directional light). The PCSS
        // branch below is likewise fixed at 16 taps.
        let world_per_texel = max(desc.cascade_info.y, 1e-4);
        // World-unit penumbra → texel kernel below; scale-invariant. Base 0.12 m
        // at `pcss_penumbra_scale == 1`; the per-light knob (bias_params.w) is
        // the user's softness control, shared with PCSS. Fixed-width (no blocker
        // search), so keep the base modest — it does not narrow toward contact.
        let soft_world_radius = 0.12 * max(desc.bias_params.w, 0.0);
        // Kernel radius is kept between 2 and 10 texels.
        let radius_texels = clamp(soft_world_radius / world_per_texel, 2.0, 10.0);
        // Per-fragment disk rotation, hashed from the biased world
        // position.
        let angle = pcss_disk_angle(
            biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
        );
        let sin_a = sin(angle);
        let cos_a = cos(angle);
        var sum = 0.0;
        for (var i = 0u; i < 16u; i = i + 1u) {
            let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * radius_texels;
            sum += textureSampleCompareLevel(
                shadow_cascade_array, shadow_atlas_sampler,
                clamp(atlas_uv + off * inv_atlas, tile_min, tile_max),
                layer,
                ref_depth,
            );
        }
        return sum / 16.0;
    }
    // PCSS — same recipe as the 2D path, with the cascade-array
    // texture and explicit `layer` arg.
    let pcss_scale = max(desc.bias_params.w, 0.01);
    let world_per_texel_pcss = max(desc.cascade_info.y, 1e-4);
    let pcss_light_world_radius = 1.0 * pcss_scale;
    let atlas_uv_to_texels = vec2<f32>(
        shadow_globals.cascade_array.x,
        shadow_globals.cascade_array.y,
    );
    let angle = pcss_disk_angle(
        biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
    );
    let sin_a = sin(angle);
    let cos_a = cos(angle);
    // Blocker-search radius in texels, kept between 4 and 64.
    let search_radius_texels = clamp(
        pcss_light_world_radius / world_per_texel_pcss,
        4.0,
        64.0,
    );
    // Fixed 16-tap blocker + PCF. The earlier tapered version
    // (`pcss_tap_count(ndc.z)`) showed clear ribbon/striping
    // artifacts on the canonical "robot on a floor under a
    // directional light" test — `ndc.z` is uncorrelated with
    // PCSS penumbra width, so fragments at `ndc.z ≈ 1` ended up
    // with 4 samples on a wide kernel, undersampling enough to
    // expose the rotated-Poisson disc as banding. Tapering is
    // parked here (and on the cube + 2D paths) until a quality-
    // preserving budget is worked out.
    var blocker_sum = 0.0;
    var blocker_count = 0u;
    // Integer texel bounds of the valid sub-rect, used to clamp the
    // `textureLoad` coordinates of the blocker search.
    let tile_min_px = vec2<i32>(tile_min * atlas_uv_to_texels);
    let tile_max_px = vec2<i32>(tile_max * atlas_uv_to_texels);
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * search_radius_texels;
        let sample_uv = atlas_uv + off * inv_atlas;
        let coord = vec2<i32>(sample_uv * atlas_uv_to_texels);
        let c = clamp(coord, tile_min_px, tile_max_px);
        let d = textureLoad(shadow_cascade_array, c, layer, 0);
        // A stored depth counts as a blocker only when it is closer
        // than the receiver by a small extra margin.
        if d < ref_depth - 0.0005 {
            blocker_sum = blocker_sum + d;
            blocker_count = blocker_count + 1u;
        }
    }
    // No blockers found: fully lit, skip the PCF pass.
    if blocker_count == 0u {
        return 1.0;
    }
    // Every tap was a blocker: treat as fully shadowed, skip the PCF
    // pass.
    if blocker_count == 16u {
        return 0.0;
    }
    let avg_blocker = blocker_sum / f32(blocker_count);
    let light_size_texels = pcss_light_world_radius / world_per_texel_pcss;
    // Penumbra width from the receiver/blocker depth ratio, kept
    // between 2 and 24 texels.
    let penumbra_texels = clamp(
        (ref_depth - avg_blocker) * light_size_texels / max(avg_blocker, 1e-4),
        2.0,
        24.0,
    );
    // Wide PCSS kernels sample texels far from the fragment; on a sloped /
    // curved receiver the depth stored there differs by the surface slope and
    // self-shadows into acne. Scale the comparison bias with the kernel width so
    // wider penumbras get proportional slack — the softness hides the extra
    // peter-panning a near-contact (narrow-kernel) fragment would otherwise show.
    let pcss_ref = ref_depth - desc.bias_params.x * penumbra_texels * 0.5;
    var pcf_sum = 0.0;
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * penumbra_texels;
        pcf_sum = pcf_sum + textureSampleCompareLevel(
            shadow_cascade_array,
            shadow_atlas_sampler,
            clamp(atlas_uv + off * inv_atlas, tile_min, tile_max),
            layer,
            pcss_ref,
        );
    }
    return pcf_sum / 16.0;
}

// Sample a single shadow descriptor (cascade / spot / face). Returns
// `[0, 1]` visibility (1.0 = lit, 0.0 = fully shadowed). An
// out-of-range `descriptor_index`, a receiver behind the projection,
// or a receiver outside the descriptor's NDC box all return 1.0.
//
// Descriptors of kind 1 / 2 / 3 are forwarded to the EVSM / cube /
// cascade-array samplers; the body below handles kind 0 (2D PCF on
// `shadow_atlas`).
//
// Hardness branches:
//   0.0 = Hard, 1-tap.
//   1.0 = Soft, fixed 16-tap rotated Poisson-disk PCF.
//   2.0 = PCSS — blocker search + variable-kernel PCF.
fn sample_shadow_descriptor(
    descriptor_index: u32,
    world_pos: vec3<f32>,
    world_normal: vec3<f32>,
) -> f32 {
    if descriptor_index >= MAX_SHADOW_DESCRIPTORS {
        return 1.0;
    }
    let desc = shadow_descriptors.items[descriptor_index];
    // cascade_info.w encodes the descriptor kind:
    //   0.0 = 2D PCF on `shadow_atlas` (spot)
    //   1.0 = 2D EVSM cascade — read moments from `evsm_atlas`
    //   2.0 = cube (point light)
    //   3.0 = directional cascade on `shadow_cascade_array`
    let kind = desc.cascade_info.w;
    if kind > 2.5 {
        return sample_shadow_cascade_array(desc, world_pos, world_normal);
    }
    if kind > 1.5 {
        return sample_shadow_cube(desc, world_pos, world_normal);
    }
    if kind > 0.5 {
        return sample_shadow_evsm(desc, world_pos, world_normal);
    }

    // Offset the receiver along its surface normal by `normal_bias`
    // world-space units before projecting into shadow space. This
    // pushes the sample point *toward* the light, which is how we
    // dodge acne on slanted surfaces without relying solely on a
    // constant depth bias (cascade Z-ranges differ a lot, so a flat
    // depth bias is either too soft or too aggressive). The
    // pipeline's slope-scale bias and `bias_params.x` depth bias
    // handle the residual.
    let biased_pos = world_pos + world_normal * desc.bias_params.y;
    let clip = desc.view_projection * vec4<f32>(biased_pos, 1.0);
    if clip.w <= 0.0 {
        return 1.0;
    }
    let ndc = clip.xyz / clip.w;
    if ndc.x < -1.0 || ndc.x > 1.0 || ndc.y < -1.0 || ndc.y > 1.0 || ndc.z < 0.0 || ndc.z > 1.0 {
        return 1.0;
    }
    // NDC → tile-local UV (Y flipped) → atlas UV inside this
    // descriptor's rect (`atlas_rect.xy` = origin, `.zw` = size).
    let uv_local = vec2<f32>(ndc.x * 0.5 + 0.5, -ndc.y * 0.5 + 0.5);
    let atlas_uv = desc.atlas_rect.xy + uv_local * desc.atlas_rect.zw;
    let ref_depth = ndc.z - desc.bias_params.x;
    let hardness = desc.bias_params.z;

    // PCF / PCSS taps must stay inside this cascade's tile of the
    // atlas. The tile-pack allocator places cascades edge-to-edge,
    // so a kernel that crosses the boundary samples a totally
    // unrelated cascade's depth (or another light's spot tile) and
    // produces a fringe of bogus shadow at the tile seam. The inset
    // is half a texel so bilinear PCF taps don't read past the edge
    // either.
    let inv_atlas = vec2<f32>(
        1.0 / shadow_globals.atlas_sizes.x,
        1.0 / shadow_globals.atlas_sizes.y,
    );
    let tile_min = desc.atlas_rect.xy + 0.5 * inv_atlas;
    let tile_max = desc.atlas_rect.xy + desc.atlas_rect.zw - 0.5 * inv_atlas;

    // Hard: a single hardware compare tap.
    if hardness < 0.5 {
        return textureSampleCompareLevel(
            shadow_atlas,
            shadow_atlas_sampler,
            clamp(atlas_uv, tile_min, tile_max),
            ref_depth,
        );
    }
    if hardness < 1.5 {
        // Tap-rotated 16-sample Poisson disk PCF. The kernel is
        // sized in *world units* (`soft_world_radius` below) and the
        // per-cascade texel-radius is recovered by dividing by the
        // cascade's `world_per_texel` (stored in `cascade_info.y`).
        // That keeps the perceived soft-edge width identical in every
        // cascade — without this, the near cascade's 2048 texels
        // covering a tiny world span produces razor-sharp shadows
        // while the far cascade's same 2048 texels covering a much
        // larger span produces soft ones, and the boundary between
        // is visible as a step in penumbra width.
        let world_per_texel = max(desc.cascade_info.y, 1e-4);
        // Penumbra half-width in WORLD units (converted to a texel kernel by
        // the divide below), so the perceived soft edge is identical regardless
        // of scene scale or which cascade resolves it — nothing here assumes a
        // particular scene size. The 0.12 m base is the default at
        // `pcss_penumbra_scale == 1`; that per-light knob (bias_params.w) is the
        // user's softness control, shared with PCSS so one slider governs both
        // modes. Unlike PCSS this kernel is fixed-width (no blocker search), so
        // it does not narrow toward contact — keep the base modest.
        let soft_world_radius = 0.12 * max(desc.bias_params.w, 0.0);
        // Clamp at 2 texels min (a too-tight kernel collapses to a
        // single 2×2 bilinear compare and the cascade-boundary blend
        // shows a "soft → razor" step). 10 texels max so the near
        // cascade doesn't waste kernel area where world_per_texel is
        // sub-millimetre.
        let radius_texels = clamp(soft_world_radius / world_per_texel, 2.0, 10.0);

        // Per-fragment rotation hash. MUST be keyed on world position
        // (not `atlas_uv`) — atlas_uv shifts by exactly one texel
        // every time the stable-fit's texel-snap moves, and the snap
        // moves whenever the camera translates by enough to cross a
        // texel boundary in light view. A pixel-keyed hash would
        // therefore rotate the tap pattern for every receiver in
        // lockstep on every such snap, producing a frame of
        // shimmer at every snap step. World-space hashing is
        // invariant under the camera's discrete grid jumps.
        let angle = pcss_disk_angle(
            biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
        );
        let sin_a = sin(angle);
        let cos_a = cos(angle);
        // Fixed 16 taps on the Soft path — see `sample_shadow_cube`'s
        // Soft branch for the full rationale. Tapering here banded
        // large smooth receivers; the PCSS branch below is likewise
        // fixed at 16 taps per pass.
        var sum = 0.0;
        for (var i = 0u; i < 16u; i = i + 1u) {
            let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * radius_texels;
            sum += textureSampleCompareLevel(
                shadow_atlas, shadow_atlas_sampler,
                clamp(atlas_uv + off * inv_atlas, tile_min, tile_max),
                ref_depth,
            );
        }
        return sum / 16.0;
    }
    // PCSS — blocker-search + variable-kernel PCF.
    //
    // `pcss_penumbra_scale` (`bias_params.w`) is a multiplier on a
    // base 1 m "light disc" radius — i.e. how large the simulated
    // sun / area light appears at the receiver. With the default
    // scale = 1.0, the search & penumbra grow as if the light were a
    // 1 m disc; smaller values give sharper contact, larger values
    // give more dramatic falloff.
    //
    // Everything below is sized in *world units* (then converted to
    // texels via `world_per_texel` per cascade) so the cost / quality
    // of PCSS stays comparable across cascades — without that scaling
    // the search radius collapses to a few texels on the far cascade
    // and the algorithm degenerates into PCF.
    let pcss_scale = max(desc.bias_params.w, 0.01);
    let world_per_texel_pcss = max(desc.cascade_info.y, 1e-4);
    let pcss_light_world_radius = 1.0 * pcss_scale; // virtual light disc radius (m)
    let atlas_uv_to_texels = vec2<f32>(
        shadow_globals.atlas_sizes.x,
        shadow_globals.atlas_sizes.y,
    );
    // World-space rotation hash (see Soft PCF branch above — atlas
    // coordinates shift discretely with the stable-fit snap as the
    // camera moves, which would cause a frame of shimmer at every
    // texel jump; hashing on world position is invariant).
    let angle = pcss_disk_angle(
        biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
    );
    let sin_a = sin(angle);
    let cos_a = cos(angle);
    // Blocker-search radius: track the light disc directly so a wider
    // virtual light sees more potential blockers (correct PCSS
    // behaviour — small light = sharper shadow because fewer
    // occluders matter). Bounded so it never collapses to under
    // 4 texels (a narrower search misses isolated blockers on near
    // cascades) and never exceeds 64 texels (a hard cap, independent
    // of the tile size, that bounds how far each tap can stray).
    let search_radius_texels = clamp(
        pcss_light_world_radius / world_per_texel_pcss,
        4.0,
        64.0,
    );

    // Fixed 16-tap blocker + PCF. Same rationale as the cascade-
    // array PCSS path: tapering by `ndc.z` undersamples wide
    // penumbras and shows as visible disc-rotation banding.
    var blocker_sum = 0.0;
    var blocker_count = 0u;
    // Integer texel bounds of this tile, used to clamp the
    // `textureLoad` coordinates of the blocker search.
    let tile_min_px = vec2<i32>(tile_min * atlas_uv_to_texels);
    let tile_max_px = vec2<i32>(tile_max * atlas_uv_to_texels);
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * search_radius_texels;
        let sample_uv = atlas_uv + off * inv_atlas;
        let coord = vec2<i32>(sample_uv * atlas_uv_to_texels);
        // Clamp to the cascade's own tile so the blocker search
        // doesn't read from an adjacent cascade's depth values.
        let c = clamp(coord, tile_min_px, tile_max_px);
        let d = textureLoad(shadow_atlas, c, 0);
        // A stored depth counts as a blocker only when it is closer
        // than the receiver by a small extra margin.
        if d < ref_depth - 0.0005 {
            blocker_sum = blocker_sum + d;
            blocker_count = blocker_count + 1u;
        }
    }
    if blocker_count == 0u {
        return 1.0; // fully lit fast path
    }
    if blocker_count == 16u {
        // Every blocker-search sample was below the receiver's
        // biased depth — the receiver is deep inside the umbra
        // and the second 16-tap PCF would average to ≈ 0
        // anyway. Skip it.
        return 0.0;
    }
    let avg_blocker = blocker_sum / f32(blocker_count);
    // Classic PCSS penumbra: `(d_receiver − d_blocker) · light_size /
    // d_blocker`, but with light_size expressed in *world units* via
    // `world_per_texel`. The clamps keep the kernel between 2 texels
    // (the same floor as the `Soft` kernel) and 24 texels (still
    // affordable — the 16-tap loop amortises hardware bilinear so
    // this is fine).
    let light_size_texels = pcss_light_world_radius / world_per_texel_pcss;
    let penumbra_texels = clamp(
        (ref_depth - avg_blocker) * light_size_texels / max(avg_blocker, 1e-4),
        2.0,
        24.0,
    );
    // Wide PCSS kernels sample texels far from the fragment; on a sloped /
    // curved receiver the depth stored there differs by the surface slope and
    // self-shadows into acne. Scale the comparison bias with the kernel width so
    // wider penumbras get proportional slack — the softness hides the extra
    // peter-panning a near-contact (narrow-kernel) fragment would otherwise show.
    // The slack is the user's own `depth_bias` (bias_params.x) times the kernel
    // radius, so it inherits the per-light tuning instead of a fresh constant.
    let pcss_ref = ref_depth - desc.bias_params.x * penumbra_texels * 0.5;
    var pcf_sum = 0.0;
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * penumbra_texels;
        pcf_sum = pcf_sum + textureSampleCompareLevel(
            shadow_atlas,
            shadow_atlas_sampler,
            clamp(atlas_uv + off * inv_atlas, tile_min, tile_max),
            pcss_ref,
        );
    }
    return pcf_sum / 16.0;
}

// Fraction of each cascade's depth range that fades into the next
// cascade. Stretching this band wider spreads the (unavoidable)
// quality difference between cascades across a larger area, which
// the eye stops reading as a hard edge AND keeps receivers near
// cascade boundaries from flickering when the camera moves them
// across the boundary in discrete texel-snap jumps. 50% is the AAA
// default — the corresponding `BLEND_OVERLAP` in `fit_cascades`
// ensures the next cascade's frustum covers this whole band.
const CASCADE_BLEND: f32 = 0.5;

// Per-light cascade selection with smooth blending across split
// boundaries.
//
//   descriptor_base — index of the light's first shadow descriptor
//                     (`SHADOW_INDEX_NONE` = light casts no shadow).
//                     That descriptor's `cascade_info.z` gives the
//                     cascade count.
//   world_pos       — receiver position in world space.
//   world_normal    — receiver normal; scaled by each descriptor's
//                     `bias_params.y` for the normal-offset bias.
//   view_z          — receiver depth compared against each cascade's
//                     `cascade_info.x` (split_far).
//
// Cube descriptors (kind = 2) are single-descriptor and are forwarded
// straight to `sample_shadow_descriptor`. Otherwise we walk
// descriptor_base..base+count near→far and pick the first cascade
// whose split_far is not exceeded by `view_z` AND whose frustum
// contains the normal-biased receiver. To hide the abrupt softness
// jump that comes from each successive cascade halving its atlas
// resolution, the last `CASCADE_BLEND` fraction of every cascade's
// depth range linearly fades into the next cascade's sample (or to
// fully lit for the final cascade — receivers past the very end get
// no shadow).
//
// Returns `[0, 1]` visibility; 1.0 (no shadow) if no cascade contains
// the receiver.
fn sample_shadow_directional(
    descriptor_base: u32,
    world_pos: vec3<f32>,
    world_normal: vec3<f32>,
    view_z: f32,
) -> f32 {
    if descriptor_base == SHADOW_INDEX_NONE {
        return 1.0;
    }
    if descriptor_base >= MAX_SHADOW_DESCRIPTORS {
        return 1.0;
    }
    // Point/cube lights: single descriptor, no cascade walk. Their
    // `view_projection` is intentionally `Mat4::ZERO` (the cube path
    // uses `atlas_rect.xyz/.w` = (light_pos, range) + a world-space
    // direction instead of a projection), so the cascade picker's
    // `cand_clip.w <= 0.0` test below would reject them and silently
    // return "fully lit". Dispatch straight to `sample_shadow_descriptor`
    // which routes cube descriptors to `sample_shadow_cube`.
    //
    // Kind values: 0.0 = 2D PCF (spot), 1.0 = 2D EVSM (cascade),
    // 2.0 = cube, 3.0 = cascade-array PCF (directional). Only kind=2.0
    // is the single-descriptor short-circuit; the cascade-array case
    // still needs the cascade walk because directional lights pack
    // multiple cascades.
    let base_kind = shadow_descriptors.items[descriptor_base].cascade_info.w;
    if base_kind > 1.5 && base_kind < 2.5 {
        return sample_shadow_descriptor(descriptor_base, world_pos, world_normal);
    }
    let cascade_count = u32(shadow_descriptors.items[descriptor_base].cascade_info.z);
    // Cascade pick: walk descriptors near→far and stop at the first
    // one that contains the receiver in *both* depth (`view_z` inside
    // the cascade's split range) AND NDC (clip.xy ∈ [-1, 1],
    // clip.z ∈ [0, 1]).
    //
    // Picking purely by `view_z` and then projecting could land us on
    // a cascade whose frustum clips the receiver, and
    // `sample_shadow_descriptor` would short-circuit to "fully lit".
    // That produces a hard diagonal cliff at each cascade's lateral
    // edge whenever the outer cascade actually has coverage there.
    // Falling through to the next cascade outward keeps the shadow
    // continuous across lateral boundaries the same way the
    // depth-axis blend handles split boundaries.
    var picked: u32 = SHADOW_INDEX_NONE;
    var picked_local: u32 = 0u;
    for (var i = 0u; i < cascade_count; i = i + 1u) {
        let idx = descriptor_base + i;
        if idx >= MAX_SHADOW_DESCRIPTORS {
            break;
        }
        let split_far = shadow_descriptors.items[idx].cascade_info.x;
        if view_z > split_far {
            continue;
        }
        let cand = shadow_descriptors.items[idx];
        // Test the SAME point the samplers project: the receiver
        // offset along its normal by this cascade's `bias_params.y`.
        // Testing the unbiased `world_pos` here lets a receiver that
        // sits within one normal-bias of the frustum edge pass the
        // picker and then get rejected inside the sampler (which
        // returns 1.0), reintroducing a thin lit fringe along the
        // cascade outline instead of falling through to the next
        // cascade.
        let cand_pos = world_pos + world_normal * cand.bias_params.y;
        let cand_clip = cand.view_projection * vec4<f32>(cand_pos, 1.0);
        if cand_clip.w <= 0.0 {
            continue;
        }
        let cand_ndc = cand_clip.xyz / cand_clip.w;
        if cand_ndc.x < -1.0 || cand_ndc.x > 1.0
            || cand_ndc.y < -1.0 || cand_ndc.y > 1.0
            || cand_ndc.z < 0.0 || cand_ndc.z > 1.0
        {
            continue;
        }
        picked = idx;
        picked_local = i;
        break;
    }
    if picked == SHADOW_INDEX_NONE {
        return 1.0;
    }
    // Position of `view_z` inside the picked cascade's depth range,
    // normalised to [0, 1]. The range starts at the previous cascade's
    // split_far (0 for the first cascade). A receiver that fell
    // through to an outer cascade on the lateral test has
    // `view_z < split_near` and clamps to 0, i.e. no blending.
    let split_far = shadow_descriptors.items[picked].cascade_info.x;
    var split_near: f32 = 0.0;
    if picked_local > 0u {
        split_near = shadow_descriptors.items[picked - 1u].cascade_info.x;
    }
    let span = max(split_far - split_near, 1e-4);
    let normalized = clamp((view_z - split_near) / span, 0.0, 1.0);

    let primary = sample_shadow_descriptor(picked, world_pos, world_normal);
    // Outside the trailing blend band: the picked cascade alone.
    if normalized < 1.0 - CASCADE_BLEND {
        return primary;
    }
    // 0 at the start of the blend band, 1 at split_far.
    let blend_t = (normalized - (1.0 - CASCADE_BLEND)) / CASCADE_BLEND;
    let next_local = picked_local + 1u;
    if next_local >= cascade_count {
        // Final cascade fades to fully lit at the very edge of the
        // light's max_distance so receivers don't pop from shadowed
        // to lit.
        return mix(primary, 1.0, blend_t);
    }
    let next_idx = descriptor_base + next_local;
    if next_idx >= MAX_SHADOW_DESCRIPTORS {
        return primary;
    }
    let secondary = sample_shadow_descriptor(next_idx, world_pos, world_normal);
    return mix(primary, secondary, blend_t);
}

// DEBUG: index of the cascade the picker selects for this receiver,
// returned as a float, or 4.0 when no cascade qualifies (or
// `descriptor_base` is invalid).
//
// Uses the same near→far walk and the same two tests as
// `sample_shadow_directional` — the `view_z` split test and the NDC
// containment test — applied to the unbiased `world_pos`, so the
// colour overlay tracks what shadow sampling uses.
fn debug_picked_cascade(
    descriptor_base: u32,
    world_pos: vec3<f32>,
    view_z: f32,
) -> f32 {
    let none_picked: f32 = 4.0;
    if descriptor_base == SHADOW_INDEX_NONE {
        return none_picked;
    }
    if descriptor_base >= MAX_SHADOW_DESCRIPTORS {
        return none_picked;
    }
    let count = u32(shadow_descriptors.items[descriptor_base].cascade_info.z);
    for (var local = 0u; local < count; local += 1u) {
        let slot = descriptor_base + local;
        if slot >= MAX_SHADOW_DESCRIPTORS {
            break;
        }
        let cascade = shadow_descriptors.items[slot];
        // Depth test: receiver lies beyond this cascade's far split.
        if view_z > cascade.cascade_info.x {
            continue;
        }
        // Containment test: receiver must project in front of the
        // cascade and inside its NDC box.
        let projected = cascade.view_projection * vec4<f32>(world_pos, 1.0);
        if projected.w <= 0.0 {
            continue;
        }
        let p = projected.xyz / projected.w;
        let outside_xy = any(abs(p.xy) > vec2<f32>(1.0));
        let outside_z = p.z < 0.0 || p.z > 1.0;
        if outside_xy || outside_z {
            continue;
        }
        return f32(local);
    }
    return none_picked;
}

// Debug-overlay tint for cascade visualisation. Driven by
// `shadow_globals.flags.x` (`debug_cascade_colors`): when the flag is
// zero, or when `debug_picked_cascade` reports an index of 4 or more,
// `base_color` is returned unchanged. Otherwise `base_color` is mixed
// 35% toward a per-cascade palette colour.
//
// Two palettes keep the filtering technique visible at a glance. The
// picked descriptor's `cascade_info.w` decides which one applies: a
// value in (0.5, 1.5) — i.e. kind 1.0, EVSM — selects the warm
// palette; every other value (PCF flavours: 2D-atlas kind 0.0 and
// cascade-array kind 3.0) selects the cool palette.
fn debug_cascade_tint(
    base_color: vec3<f32>,
    descriptor_base: u32,
    world_pos: vec3<f32>,
    view_z: f32,
) -> vec3<f32> {
    let overlay_enabled = shadow_globals.flags.x != 0u;
    if !overlay_enabled {
        return base_color;
    }
    let cascade = u32(debug_picked_cascade(descriptor_base, world_pos, view_z));
    if cascade >= 4u {
        return base_color;
    }
    // Cool palette (PCF): red / green / blue / cyan.
    let cool = array<vec3<f32>, 4>(
        vec3<f32>(1.0, 0.3, 0.3),
        vec3<f32>(0.3, 1.0, 0.3),
        vec3<f32>(0.3, 0.5, 1.0),
        vec3<f32>(0.3, 0.9, 1.0),
    );
    // Warm palette (EVSM): scarlet / orange / yellow / gold.
    let warm = array<vec3<f32>, 4>(
        vec3<f32>(1.0, 0.4, 0.1),
        vec3<f32>(1.0, 0.6, 0.1),
        vec3<f32>(1.0, 0.85, 0.1),
        vec3<f32>(1.0, 1.0, 0.3),
    );
    let kind = shadow_descriptors.items[descriptor_base + cascade].cascade_info.w;
    // Start from the cool colour and override it for EVSM descriptors.
    var tint = cool[cascade];
    if kind > 0.5 && kind < 1.5 {
        tint = warm[cascade];
    }
    return mix(base_color, tint, 0.35);
}

{% endif %}{# end needs_shadow_sampling — shadow sampling functions #}