awsm-renderer 0.4.0

// Shadow bind-group declarations. The bind-group slot is supplied by
// the containing template via `shadow_group_index` — opaque uses slot
// 3. The transparent pass currently doesn't bind these (the adapter's
// `maxBindGroups=4` budget is fully consumed by transparent's existing
// groups).
//
// Bindings 0..=9 must stay in lockstep with
// `shared::material::bind_group::shadow_bind_group_layout_entries`.

// Capacity of `ShadowDescriptorArray.items`. `sample_shadow_descriptor`
// treats any index at or above this value as "no shadow" (fully lit).
const MAX_SHADOW_DESCRIPTORS: u32 = 32u;

// One shadow-casting view. Several lanes are overloaded per descriptor
// kind (the kind lives in `cascade_info.w`); the kind-specific readings
// listed here are the ones the sampling functions in this file consume.
struct ShadowDescriptor {
    // Light-space view-projection used at sample time. Read by the 2D,
    // EVSM and cascade-array paths; the cube path does not use it.
    view_projection: mat4x4<f32>,
    // 2D / EVSM:     (atlas.x, atlas.y, atlas.w, atlas.h) in normalised UV space.
    // Cube:          (light_pos.x, light_pos.y, light_pos.z, light range).
    // Cascade array: (layer index, 0, used width, used height) — sizes in
    //                normalised UV of the layer.
    atlas_rect: vec4<f32>,
    // (depth_bias, normal_bias, hardness, pcss_penumbra_scale)
    // `hardness` is thresholded: < 0.5 = Hard, < 1.5 = Soft, else PCSS.
    bias_params: vec4<f32>,
    // (split_far_view_z, cascade_index, cascade_count_in_light, kind)
    //   .y is kind-dependent in practice: `sample_shadow_cube` reads it as
    //      the cube-array slot and `sample_shadow_cascade_array` reads it
    //      as world units per texel.
    //   .w is the descriptor kind dispatched by `sample_shadow_descriptor`:
    //      0 = 2D PCF on `shadow_atlas`, 1 = EVSM moments from
    //      `evsm_atlas`, 2 = cube (point light), 3 = directional cascade
    //      on `shadow_cascade_array`.
    //   .x / .z are not read by the sampling code in this part of the
    //      file — NOTE(review): confirm against the host-side writer.
    // For EVSM (kind 1) the moment-write compute pass and Gaussian blur
    // landed alongside the sample-site dispatch. If a future tweak leaves
    // EVSM disabled the cascade falls back to PCF on `shadow_atlas`.
    cascade_info: vec4<f32>,
};

// Frame-wide shadow settings shared by every descriptor.
struct ShadowGlobals {
    // (atlas.w, atlas.h, evsm.w, evsm.h)
    // `.zw` is inverted by `sample_shadow_evsm` to get the EVSM atlas
    // texel size for its half-texel tile inset.
    atlas_sizes: vec4<f32>,
    // (evsm_exponent, evsm_blur_radius, sscs_step_count, sscs_enabled)
    // `.x` is the warp exponent applied at EVSM sample time; `.w >= 0.5`
    // enables `apply_sscs`, which marches `max(.z, 1)` steps.
    evsm_sscs: vec4<f32>,
    // (debug_cascade_colors, max_point_shadows, pad, pad)
    flags: vec4<u32>,
    // (cascade_array.w, cascade_array.h, max_layers, _) — per-layer
    // dimensions of the directional cascade texture array. `.xy` converts
    // between layer UV and texels in `sample_shadow_cascade_array`.
    cascade_array: vec4<f32>,
};

// Uniform-buffer wrapper around the fixed-size descriptor table that
// `sample_shadow_descriptor` indexes with its `descriptor_index`.
struct ShadowDescriptorArray {
    items: array<ShadowDescriptor, MAX_SHADOW_DESCRIPTORS>,
};

// 2D depth atlas and its comparison sampler. The same sampler is also
// used for the compare taps on `shadow_cascade_array`.
@group({{ shadow_group_index }}) @binding(0) var shadow_atlas: texture_depth_2d;
@group({{ shadow_group_index }}) @binding(1) var shadow_atlas_sampler: sampler_comparison;
// Point-light cube shadows, sampled by direction with a hardware compare.
@group({{ shadow_group_index }}) @binding(2) var shadow_cube_array: texture_depth_cube_array;
@group({{ shadow_group_index }}) @binding(3) var shadow_cube_sampler: sampler_comparison;
// EVSM moment atlas, read through a plain (non-comparison) sampler.
@group({{ shadow_group_index }}) @binding(4) var evsm_atlas: texture_2d<f32>;
@group({{ shadow_group_index }}) @binding(5) var evsm_atlas_sampler: sampler;
// Frame-wide settings and the per-view descriptor table.
@group({{ shadow_group_index }}) @binding(6) var<uniform> shadow_globals: ShadowGlobals;
@group({{ shadow_group_index }}) @binding(7) var<uniform> shadow_descriptors: ShadowDescriptorArray;
// Directional cascades; the layer is selected by the descriptor's
// `atlas_rect.x`.
@group({{ shadow_group_index }}) @binding(8) var shadow_cascade_array: texture_depth_2d_array;
// Raw-depth (`textureLoad`) access to the point-light cube faces for the
// PCSS blocker search, addressed as `layer = slot * 6 + face`.
@group({{ shadow_group_index }}) @binding(9) var shadow_cube_2d_array: texture_depth_2d_array;

// Sentinel for "no shadow" — packed into `LightPacked.row4.z`. Kept ungated:
// `apply_lighting` compares against it even before any shadow sample, and an
// unused const is free. Because it is the largest `u32`, it also trips the
// `descriptor_index >= MAX_SHADOW_DESCRIPTORS` guard in
// `sample_shadow_descriptor`, which then returns fully lit.
const SHADOW_INDEX_NONE: u32 = 0xFFFFFFFFu;

// ── Shadow SAMPLING (PCSS / PCF / EVSM / cube + SSCS) ───────────────────────
// Called ONLY from `apply_lighting`, so the whole block is gated on
// `needs_shadow_sampling` (= inc.apply_lighting). Materials that don't run
// first-party lighting (every custom material + unlit/toon/flipbook + the empty
// kernel) drop this ~50 KB of WGSL entirely. The bind group + structs above stay
// (ABI — the pipeline layout always has the shadow group).
{% if needs_shadow_sampling %}

// 16 Poisson-distributed sample offsets in `[-1, 1]^2`. A single table
// serves both the PCSS blocker search and the PCF loops, which keeps
// the WGSL small. Callers scale the offsets by their kernel radius and
// rotate them per fragment (`pcss_rotate` with an angle from
// `pcss_disk_angle`) to break up the regular pattern.
const POISSON_DISK_16: array<vec2<f32>, 16> = array<vec2<f32>, 16>(
    vec2<f32>(-0.94201624, -0.39906216),
    vec2<f32>( 0.94558609, -0.76890725),
    vec2<f32>(-0.09418410, -0.92938870),
    vec2<f32>( 0.34495938,  0.29387760),
    vec2<f32>(-0.91588581,  0.45771432),
    vec2<f32>(-0.81544232, -0.87912464),
    vec2<f32>(-0.38277543,  0.27676845),
    vec2<f32>( 0.97484398,  0.75648379),
    vec2<f32>( 0.44323325, -0.97511554),
    vec2<f32>( 0.53742981, -0.47373420),
    vec2<f32>(-0.26496911, -0.41893023),
    vec2<f32>( 0.79197514,  0.19090188),
    vec2<f32>(-0.24188840,  0.99706507),
    vec2<f32>(-0.81409955,  0.91437590),
    vec2<f32>( 0.19984126,  0.78641367),
    vec2<f32>( 0.14383161, -0.14100790),
);

// Interleaved Gradient Noise (Jorge Jimenez's hash). Maps `coords` to a
// pseudo-random angle in `[0, 2π)`; callers use it to rotate the Poisson
// disk so neighbouring fragments don't sample identical patterns.
fn pcss_disk_angle(coords: vec2<f32>) -> f32 {
    let weights = vec2<f32>(0.06711056, 0.00583715);
    // `fract` keeps the hash in [0, 1) before scaling to a full turn.
    let unit_noise = fract(52.9829189 * fract(dot(coords, weights)));
    return unit_noise * 6.2831853;
}

// Applies the 2D rotation matrix [[cos, -sin], [sin, cos]] to `v`, with
// the sine / cosine precomputed by the caller.
fn pcss_rotate(v: vec2<f32>, sin_a: f32, cos_a: f32) -> vec2<f32> {
    let rotated_x = v.x * cos_a - v.y * sin_a;
    let rotated_y = v.x * sin_a + v.y * cos_a;
    return vec2<f32>(rotated_x, rotated_y);
}

// Screen-space contact shadows (SSCS). Short ray-march in view space
// from `world_pos` toward `light_dir` (the surface→light direction),
// using the already-bound depth buffer (`depth_tex`). Returns `[0, 1]`
// visibility — multiplied into the main shadow term to darken micro-
// occluders that the shadow map misses (gaps under feet, hair, etc.).
// The result never drops below `1 - MAX_DARKENING`.
//
// `shadow_globals.evsm_sscs.w` is the master enable; `.z` is the step
// count (always at least one step is marched). Uses single-sample depth
// reads even when the geometry pass was rendered with MSAA (we read
// sample 0).
//
// The transparent pass doesn't bind a `depth_tex` (sampling the
// in-progress depth target on the same pass would be a feedback loop),
// so its shader template sets `sscs_available = false` and this
// function short-circuits to "fully lit" before any depth fetch.
fn apply_sscs(world_pos: vec3<f32>, light_dir: vec3<f32>) -> f32 {
{% if sscs_available %}
    if shadow_globals.evsm_sscs.w < 0.5 {
        return 1.0;
    }
    // The `max(…, 1.0)` floor guarantees `steps >= 1`, so no separate
    // zero-step guard is needed before the divide at the end.
    let steps = u32(max(shadow_globals.evsm_sscs.z, 1.0));

    // SSCS is purely a *contact-shadow refinement* on top of the
    // cascade map: it darkens the narrow band right where caster
    // geometry meets receiver geometry, where the cascade's texel
    // resolution leaves a "Peter Pan" gap. It is NOT a substitute for
    // the main shadow.
    //
    // The comparison is done in **linear view-space Z** (metres),
    // not NDC.z. NDC.z under perspective compresses wildly with
    // distance — a `0.001` NDC.z window covers ~1 mm at the near plane
    // but ~5 m at view-z = -50 m, so any NDC.z-based thickness window
    // misclassifies far receivers' rays against unrelated background
    // geometry. Earlier revisions had exactly this failure mode
    // (visible trails at zoom-out).
    //
    // Math:
    //   * receiver view-Z is `(camera.view · world_pos).z` (linear).
    //   * walking the ray `t_world` metres along `light_dir` changes
    //     view-Z by `(camera.view · light_dir).z · t_world` — also
    //     linear, so each march step's view-Z is exact.
    //   * the sampled depth-buffer texel is converted back to
    //     view-Z via `inv_proj`, which handles both perspective and
    //     ortho cameras correctly.
    //   * a scene texel "in front of the ray" satisfies
    //     `scene_view_z - ray_view_z > 0` (closer to camera = less
    //     negative). The thickness window is in metres and
    //     consistent across all depths.

    // Tunables — all are physical (metres or per-frame budget).
    // World-space step length is fixed so the same surface point
    // samples the same world positions every frame; only the depth
    // buffer read at each step's screen projection varies. This
    // matches the original Drobot 2017 formulation and avoids the
    // temporal jitter that a pixel-driven march produces (the
    // pixel-per-world ratio changes as the camera zooms, so a
    // fixed-pixel march samples different world positions every
    // frame even for the same surface).
    let SSCS_STEP_WORLD: f32 = 0.04;          // 4 cm per step → 64 cm reach @ 16 steps
    let SSCS_THICKNESS: f32 = 0.05;           // 5 cm slab counts as occluder
    let SSCS_SELF_OCCLUSION_EPS: f32 = 0.002; // 2 mm self-occlusion guard
    let MAX_DARKENING: f32 = 0.35;            // SSCS is refinement, not shadow

    let viewport_size = camera_raw.viewport.zw;
    let depth_dim = vec2<i32>(viewport_size);

    // Receiver view-Z, and the view-Z slope per world-space metre along
    // the ray (`light_dir` is a direction vector, so w = 0).
    let recv_view_z = (camera_raw.view * vec4<f32>(world_pos, 1.0)).z;
    let view_z_per_world = (camera_raw.view * vec4<f32>(light_dir, 0.0)).z;

    // World-space-stable per-fragment jitter on the start offset to
    // dither step quantisation between neighbouring receivers without
    // introducing per-frame noise. Hashing on the pixel coordinate
    // would change every camera move (same surface → different
    // pixel) which manifests as visible flicker; world-space
    // hashing is camera-invariant.
    let jitter_seed = world_pos.xz * 137.0
        + vec2<f32>(world_pos.y * 31.0, world_pos.y * 17.0);
    let jitter = pcss_disk_angle(jitter_seed) * (1.0 / 6.2831853);
    let t_start_world = (1.0 + jitter) * SSCS_STEP_WORLD;

    var hits: f32 = 0.0;
    for (var i: u32 = 0u; i < steps; i = i + 1u) {
        let t_world = t_start_world + SSCS_STEP_WORLD * f32(i);

        // Same world point every frame — project it now to find the
        // depth-buffer texel to sample. Steps that land behind the
        // camera or off-screen are skipped (they still count toward
        // the `steps` denominator below).
        let ray_world = world_pos + light_dir * t_world;
        let clip = camera_raw.view_proj * vec4<f32>(ray_world, 1.0);
        if clip.w <= 0.0 {
            continue;
        }
        let ndc = clip.xyz / clip.w;
        if ndc.x < -1.0 || ndc.x > 1.0 || ndc.y < -1.0 || ndc.y > 1.0 {
            continue;
        }
        let px_uv = ndc.xy * vec2<f32>(0.5, -0.5) + vec2<f32>(0.5, 0.5);
        let px_f = px_uv * viewport_size;
        let px = vec2<i32>(px_f);
        if px.x < 0 || px.y < 0 || px.x >= depth_dim.x || px.y >= depth_dim.y {
            continue;
        }
        let scene_ndc_z = textureLoad(depth_tex, px, 0);
        if scene_ndc_z >= 1.0 {
            // Background — no occluder to find here.
            continue;
        }

        // Ray view-Z is linear in `t_world` — exact, no projection
        // round-trip needed.
        let ray_view_z = recv_view_z + view_z_per_world * t_world;
        // Linearise the sampled depth via the camera's inv_proj.
        // For perspective this is non-affine; for ortho it's a
        // simple scale. Either way the .z / .w form is correct.
        let scene_view_h = camera_raw.inv_proj
            * vec4<f32>(ndc.xy, scene_ndc_z, 1.0);
        let scene_view_z = scene_view_h.z / scene_view_h.w;

        // Both view-Z values are linear and negative for points in
        // front of the camera. A scene texel closer to the camera
        // than the ray has `scene_view_z > ray_view_z` (less
        // negative). The thickness slab keeps far-background
        // geometry from counting as an occluder.
        let dz = scene_view_z - ray_view_z;
        if dz > SSCS_SELF_OCCLUSION_EPS && dz < SSCS_THICKNESS {
            hits = hits + 1.0;
        }
    }

    let occluded = hits / f32(steps);
    return 1.0 - occluded * MAX_DARKENING;
{% else %}
    return 1.0;
{% endif %}
}

// Cube near plane — MUST match the value used in `Mat4::perspective_rh`
// for cube face generation in `Shadows::write_gpu`. `sample_shadow_cube`
// uses it both in the NDC.z reconstruction and as the lower clamp on the
// projected view depth.
const POINT_SHADOW_NEAR: f32 = 0.05;

// Point-light cube shadow sample.
//
// Each cube face stores perspective NDC.z written by the rasterizer
// (90° FOV, `near = POINT_SHADOW_NEAR`, `far = light_range`). The
// projection is post-multiplied by a Y-flip on the writer side so the
// rasterized image lines up with WebGPU's D3D-style cube sampling
// convention (texel `t=0` → world +Y on the +X face, etc.) — see
// `Shadows::write_gpu`. That flip doesn't change NDC.z, so the depth
// formula below stays the same on both sides.
//
// The receiver recreates that NDC.z by projecting `length(light, P)`
// onto the *dominant* cube axis of the light-to-surface direction:
//
//     view_depth = distance(light, P) · |dir.major|
//     ndc_z      = (far / (far - near)) · (1 - near / view_depth)
//
// Same formula generates both the rasterized atlas value and the
// receiver reference, so they compare directly — no linear-depth FS
// override, no per-tap face recompute, no seam math.
//
// The three private helpers below hold the per-tap math that the Hard,
// Soft and PCSS branches of `sample_shadow_cube` share.

// Projects a light→point vector onto its dominant cube axis.
// `dist` is `length(light_to_point)`. Returns the unit direction in
// `.xyz` and the *unclamped* perspective NDC.z in `.w`.
// NOTE(review): `range == POINT_SHADOW_NEAR` makes `range - near` zero;
// presumably the host never emits such a light — confirm.
fn shadow_cube_project(light_to_point: vec3<f32>, dist: f32, range: f32) -> vec4<f32> {
    let dir = light_to_point / max(dist, 1e-4);
    let abs_d = abs(dir);
    // Major-axis (cube-face) projected depth.
    let major = max(abs_d.x, max(abs_d.y, abs_d.z));
    let view_depth = dist * max(major, 1e-4);
    // Same perspective NDC.z formula as the rasterizer.
    let near = POINT_SHADOW_NEAR;
    let ndc_z = (range / (range - near)) * (1.0 - near / max(view_depth, near));
    return vec4<f32>(dir, ndc_z);
}

// Reference depth for a compare tap: clamped NDC.z minus a slope-aware
// constant bias. The `n_dot_dir` floor at 0.05 keeps grazing surfaces
// from running away to huge bias values (`bias → ∞` as
// `n_dot_dir → 0`). The user-authored `depth_bias` is trusted as-is: an
// earlier `max(..., 0.001)` floor silently overrode any inspector value
// smaller than 0.001 — ~10× the NDC gap between a receiver and a box's
// back face at a typical 4 m point-light distance — so contacts could
// never close. If a project DOES want a global floor, gate it on
// `ShadowsConfig::min_point_depth_bias` (not present today).
fn shadow_cube_ref_depth(
    dir: vec3<f32>,
    ndc_z: f32,
    world_normal: vec3<f32>,
    depth_bias: f32,
) -> f32 {
    let n_dot_dir = abs(dot(dir, world_normal));
    let bias = depth_bias / max(n_dot_dir, 0.05);
    return clamp(ndc_z, 0.0, 1.0) - bias;
}

// Raw depth fetch for direction `dir` from cube slot `slot` through the
// 2D-array view. Inline cube-direction → (face, uv) projection using the
// standard D3D cube convention; the writer's post-projection Y-flip is
// already baked into the texel layout. `face_dims` is the per-face
// resolution in texels.
fn shadow_cube_load_depth(dir: vec3<f32>, slot: i32, face_dims: vec2<i32>) -> f32 {
    let abs_d = abs(dir);
    var face: i32 = 0;
    var uc: f32 = 0.0;
    var vc: f32 = 0.0;
    var ma: f32 = 1e-4;
    if abs_d.x >= abs_d.y && abs_d.x >= abs_d.z {
        ma = abs_d.x;
        vc = -dir.y;
        if dir.x > 0.0 {
            face = 0; uc = -dir.z;
        } else {
            face = 1; uc = dir.z;
        }
    } else if abs_d.y >= abs_d.z {
        ma = abs_d.y;
        uc = dir.x;
        if dir.y > 0.0 {
            face = 2; vc = dir.z;
        } else {
            face = 3; vc = -dir.z;
        }
    } else {
        ma = abs_d.z;
        vc = -dir.y;
        if dir.z > 0.0 {
            face = 4; uc = dir.x;
        } else {
            face = 5; uc = -dir.x;
        }
    }
    let inv = 0.5 / max(ma, 1e-4);
    let face_uv = vec2<f32>(uc * inv + 0.5, vc * inv + 0.5);
    let texel = clamp(
        vec2<i32>(face_uv * vec2<f32>(face_dims)),
        vec2<i32>(0, 0),
        face_dims - vec2<i32>(1, 1),
    );
    return textureLoad(shadow_cube_2d_array, texel, slot * 6 + face, 0);
}

// 16-tap rotated-Poisson PCF on a world-space disc of `world_radius`
// lying in the receiver's tangent plane around `center`. Each tap
// recomputes its own direction-from-light + NDC.z + bias (rather than
// rotating the central direction) so a flat receiver doesn't
// self-shadow into a kernel-shaped patch. Returns mean visibility.
fn shadow_cube_disc_pcf(
    center: vec3<f32>,
    tangent: vec3<f32>,
    bitangent: vec3<f32>,
    sin_a: f32,
    cos_a: f32,
    world_radius: f32,
    world_normal: vec3<f32>,
    light_pos: vec3<f32>,
    range: f32,
    depth_bias: f32,
    slot: i32,
) -> f32 {
    var lit = 0.0;
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * world_radius;
        let tap_pos = center + tangent * off.x + bitangent * off.y;
        let light_to_tap = tap_pos - light_pos;
        let tap = shadow_cube_project(light_to_tap, length(light_to_tap), range);
        lit = lit + textureSampleCompareLevel(
            shadow_cube_array,
            shadow_cube_sampler,
            tap.xyz,
            slot,
            shadow_cube_ref_depth(tap.xyz, tap.w, world_normal, depth_bias),
        );
    }
    return lit / 16.0;
}

// Visibility in `[0, 1]` (1 = lit) of `world_pos` for a cube (point
// light) descriptor. For this kind `desc.atlas_rect` carries
// (light_pos.xyz, range) and `desc.cascade_info.y` the cube-array slot.
// Receivers at or beyond `range` are reported fully lit.
// `bias_params.z` selects Hard (< 0.5), Soft (< 1.5) or PCSS.
fn sample_shadow_cube(desc: ShadowDescriptor, world_pos: vec3<f32>, world_normal: vec3<f32>) -> f32 {
    let light_pos = desc.atlas_rect.xyz;
    let range = max(desc.atlas_rect.w, 0.01);
    let slot = i32(desc.cascade_info.y);
    let depth_bias = desc.bias_params.x;
    let hardness = desc.bias_params.z;

    // Normal-offset the receiver before measuring against the light.
    let biased_pos = world_pos + world_normal * desc.bias_params.y;
    let light_to_surface = biased_pos - light_pos;
    let dist = length(light_to_surface);
    if dist >= range {
        return 1.0;
    }
    let center = shadow_cube_project(light_to_surface, dist, range);

    if hardness < 0.5 {
        // Hard — single hardware-compare tap along the central direction.
        return textureSampleCompareLevel(
            shadow_cube_array,
            shadow_cube_sampler,
            center.xyz,
            slot,
            shadow_cube_ref_depth(center.xyz, center.w, world_normal, depth_bias),
        );
    }

    // Soft and PCSS share the same disc-on-tangent-plane tap layout.
    // The PCSS path additionally does a blocker-search pre-pass using
    // `shadow_cube_2d_array` (raw depth reads) to scale the kernel.
    let abs_n = abs(world_normal);
    // Switch the up hint when the normal is nearly parallel to +Y so
    // the cross product below stays well-conditioned.
    let up_hint = select(
        vec3<f32>(0.0, 1.0, 0.0),
        vec3<f32>(1.0, 0.0, 0.0),
        abs_n.y > 0.99,
    );
    let tangent = normalize(cross(up_hint, world_normal));
    let bitangent = cross(world_normal, tangent);

    // World-space-seeded disk rotation (camera-invariant).
    let angle = pcss_disk_angle(
        biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
    );
    let sin_a = sin(angle);
    let cos_a = cos(angle);

    if hardness < 1.5 {
        // Soft — fixed 16-tap rotated Poisson, the user's "clean smooth
        // shadow, no contact hardening" setting. The tap count is
        // deliberately not tapered with distance: dropping taps exposed
        // visible Poisson-rotation banding on large smooth receivers.
        // World-space disc radius: base 0.15 m at
        // `pcss_penumbra_scale == 1`; the per-light knob
        // (`bias_params.w`) is the user's softness control, shared with
        // PCSS so one slider governs both modes for point lights too.
        let soft_world_radius = 0.15 * max(desc.bias_params.w, 0.0);
        return shadow_cube_disc_pcf(
            biased_pos, tangent, bitangent, sin_a, cos_a, soft_world_radius,
            world_normal, light_pos, range, depth_bias, slot,
        );
    }

    // PCSS — real blocker search + variable kernel.
    //
    // Stage 1 (blocker search): sample a fixed 16-tap "search" disc
    // sized by `pcss_penumbra_scale` (a virtual light disc radius in
    // metres). At each tap, project the tap's light direction onto
    // the right cube face, fetch raw depth via the 2D-array view,
    // and average the depths of taps that lie in front of the
    // receiver.
    //
    // Stage 2 (variable PCF): derive a penumbra radius from the
    // standard PCSS formula `(d_recv - d_avg) * light_size / d_avg`
    // and re-sample with `textureSampleCompareLevel`, this time
    // through the cube sampler so we get hardware bilinear PCF.
    //
    // The cube faces share a single NDC.z formula with the writer, so
    // `textureLoad`-ed depths are directly comparable to the per-tap
    // NDC.z computed here.
    let pcss_scale = max(desc.bias_params.w, 0.01);
    // Blocker-search disc: fixed 30 cm world radius scaled by
    // `pcss_penumbra_scale`. Bigger = fatter blocker estimate.
    let pcss_search_world_radius = 0.30 * pcss_scale;
    // Cube face dimension (px) for face-UV → texel conversion. All
    // faces share the same square resolution.
    let face_dims = vec2<i32>(textureDimensions(shadow_cube_2d_array, 0));

    // Fixed 16-tap blocker search. An earlier revision tapered the tap
    // count by `dist / range` to save fragment cost on distant
    // receivers, but the variable-kernel PCF below needs all 16 samples
    // to resolve smoothly — undersampled wide penumbras showed visible
    // Poisson-rotation banding (cube version less obvious than
    // directional, but present). Tapering stays off until a
    // quality-preserving tap budget is worked out.
    var blocker_sum = 0.0;
    var blocker_count = 0u;
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * pcss_search_world_radius;
        let tap_pos = biased_pos + tangent * off.x + bitangent * off.y;
        let light_to_tap = tap_pos - light_pos;
        let tap = shadow_cube_project(light_to_tap, length(light_to_tap), range);
        let tap_ndc_z = clamp(tap.w, 0.0, 1.0);
        let d = shadow_cube_load_depth(tap.xyz, slot, face_dims);
        // Bias-free blocker test — we want a clean estimate of how
        // many genuine occluders sit in front of the receiver. The
        // 0.0005 epsilon matches the directional PCSS path.
        if d < tap_ndc_z - 0.0005 {
            blocker_sum = blocker_sum + d;
            blocker_count = blocker_count + 1u;
        }
    }
    // No blockers → fully lit; every tap blocked → umbra. Both skip
    // the PCF pass.
    if blocker_count == 0u {
        return 1.0;
    }
    if blocker_count == 16u {
        return 0.0;
    }
    let avg_blocker = blocker_sum / f32(blocker_count);
    // PCSS penumbra in NDC.z space: `(z_recv - z_blocker) * light /
    // z_blocker`. Map back to a world-space disc radius on the
    // receiver tangent plane by treating the receiver-to-light
    // distance as the projection distance — light_size in world
    // metres = `pcss_penumbra_scale × 1m × penumbra_ratio`.
    let recv_ndc_z = clamp(center.w, 0.0, 1.0);
    let penumbra_ratio = clamp(
        (recv_ndc_z - avg_blocker) / max(avg_blocker, 1e-4),
        0.0,
        4.0,
    );
    // Clamp to keep the kernel between "more than Soft" (10 cm) and
    // "still affordable" (1 m world disc — already huge at typical
    // point-light scales).
    let penumbra_world_radius = clamp(
        pcss_search_world_radius * penumbra_ratio,
        0.10,
        1.00,
    );

    return shadow_cube_disc_pcf(
        biased_pos, tangent, bitangent, sin_a, cos_a, penumbra_world_radius,
        world_normal, light_pos, range, depth_bias, slot,
    );
}


// EVSM sample. Reads the four exponential moments from `evsm_atlas`
// (written + blurred by the compute passes in `shadows::evsm`),
// reconstructs positive and negative one-tailed Chebyshev visibility,
// and returns `min(pos, neg)`. The pre-write blur is the source of
// softness — at sample time we do a single bilinear fetch.
//
// The exponent used at write time is `shadow_globals.evsm_sscs.x`
// (config.evsm_exponent). Receiver and writer must agree, else the
// curve mismatches and shadows go solid / clear.
fn chebyshev_upper(moments_2: vec2<f32>, t: f32) -> f32 {
    // One-tailed Chebyshev visibility bound for a warped depth `t`.
    // moments_2.x = E[exp_z] (mean), moments_2.y = E[exp_z²].
    let mean = moments_2.x;
    let delta = t - mean;
    // At or in front of the mean occluder depth: fully lit.
    if delta <= 0.0 {
        return 1.0;
    }
    // variance = E[X²] − (E[X])², floored so a flat receiver doesn't
    // divide by zero.
    let sigma_sq = max(moments_2.y - mean * mean, 1e-5);
    let bound = sigma_sq / (sigma_sq + delta * delta);
    // Linstep light-bleed reduction — remap [0.2, 1] → [0, 1] so the
    // lower tail is cut and partial occluders don't lift the shadow
    // into halftone.
    return clamp((bound - 0.2) / 0.8, 0.0, 1.0);
}

fn sample_shadow_evsm(
    desc: ShadowDescriptor,
    world_pos: vec3<f32>,
    world_normal: vec3<f32>,
) -> f32 {
    // Normal-offset the receiver, then project into the light's clip
    // space. Anything behind the light or outside its frustum is
    // reported as fully lit.
    let receiver = world_pos + world_normal * desc.bias_params.y;
    let light_clip = desc.view_projection * vec4<f32>(receiver, 1.0);
    if light_clip.w <= 0.0 {
        return 1.0;
    }
    let light_ndc = light_clip.xyz / light_clip.w;
    let outside_xy = any(abs(light_ndc.xy) > vec2<f32>(1.0, 1.0));
    if outside_xy || light_ndc.z < 0.0 || light_ndc.z > 1.0 {
        return 1.0;
    }

    // NDC → tile-local UV (Y flipped) → atlas UV.
    let rect_origin = desc.atlas_rect.xy;
    let rect_size = desc.atlas_rect.zw;
    let tile_uv = vec2<f32>(0.5 + 0.5 * light_ndc.x, 0.5 - 0.5 * light_ndc.y);
    let unclamped_uv = rect_origin + tile_uv * rect_size;

    // Keep the bilinear footprint inside this cascade's own tile by
    // insetting half an EVSM-atlas texel on every side. Otherwise the
    // 2×2 fetch at the tile edge reads neighbouring rect moments (or
    // uninitialised RGBA16F memory), producing a hard rectangular cliff
    // exactly at the cascade outline.
    let texel = vec2<f32>(1.0, 1.0) / shadow_globals.atlas_sizes.zw;
    let half_texel = 0.5 * texel;
    let uv_lo = rect_origin + half_texel;
    let uv_hi = rect_origin + rect_size - half_texel;
    let sample_uv = clamp(unclamped_uv, uv_lo, uv_hi);

    let moments = textureSampleLevel(evsm_atlas, evsm_atlas_sampler, sample_uv, 0.0);

    // Warp receiver depth [0,1] → [-1,1] → exponential space, matching
    // the writer (see `shadows::evsm::MOMENT_WRITE_WGSL`). The exponent
    // must equal the one used at write time.
    let exponent = shadow_globals.evsm_sscs.x;
    let z = 2.0 * light_ndc.z - 1.0;
    let warped_pos = exp(exponent * z);
    let warped_neg = -exp(-exponent * z);

    // Positive moments in .xy, negative in .zw; take the tighter bound.
    let vis_pos = chebyshev_upper(moments.xy, warped_pos);
    let vis_neg = chebyshev_upper(moments.zw, warped_neg);
    return min(vis_pos, vis_neg);
}

// Shared 16-tap rotated-Poisson PCF on one layer of
// `shadow_cascade_array`. `radius_texels` scales the unit disk,
// `texel_size` converts texels to layer UV, and every tap is clamped to
// `[uv_min, uv_max]` so it stays inside the cascade's used sub-rect.
// Returns mean visibility in `[0, 1]`.
fn shadow_cascade_disc_pcf(
    layer: i32,
    center_uv: vec2<f32>,
    uv_min: vec2<f32>,
    uv_max: vec2<f32>,
    texel_size: vec2<f32>,
    radius_texels: f32,
    sin_a: f32,
    cos_a: f32,
    ref_depth: f32,
) -> f32 {
    var lit = 0.0;
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * radius_texels;
        lit = lit + textureSampleCompareLevel(
            shadow_cascade_array,
            shadow_atlas_sampler,
            clamp(center_uv + off * texel_size, uv_min, uv_max),
            layer,
            ref_depth,
        );
    }
    return lit / 16.0;
}

// Sample a directional-cascade descriptor (kind = 3) backed by the
// `shadow_cascade_array` texture. Layout in atlas_rect:
//   .x = layer index (as f32)
//   .y = 0 (cascade starts at layer origin)
//   .zw = used sub-rect width/height in normalised UV
// `cascade_info.y` is read as world units per shadow texel.
//
// Returns `[0, 1]` visibility (1.0 = lit); receivers behind the light
// or outside the cascade frustum are reported fully lit. Hardness
// branches mirror `sample_shadow_descriptor`'s 2D path; the only
// difference is the bound texture and an explicit layer argument on
// every compare/load.
fn sample_shadow_cascade_array(
    desc: ShadowDescriptor,
    world_pos: vec3<f32>,
    world_normal: vec3<f32>,
) -> f32 {
    let layer = i32(desc.atlas_rect.x);
    let biased_pos = world_pos + world_normal * desc.bias_params.y;
    let clip = desc.view_projection * vec4<f32>(biased_pos, 1.0);
    if clip.w <= 0.0 {
        return 1.0;
    }
    let ndc = clip.xyz / clip.w;
    if ndc.x < -1.0 || ndc.x > 1.0 || ndc.y < -1.0 || ndc.y > 1.0 || ndc.z < 0.0 || ndc.z > 1.0 {
        return 1.0;
    }
    let uv_local = vec2<f32>(ndc.x * 0.5 + 0.5, -ndc.y * 0.5 + 0.5);
    // Cascades always start at the layer origin; multiply by the
    // sub-rect size in normalised UV so smaller cascades don't read
    // outside their valid region.
    let atlas_uv = uv_local * desc.atlas_rect.zw;
    let ref_depth = ndc.z - desc.bias_params.x;
    let hardness = desc.bias_params.z;

    // Per-layer dimensions in texels and the matching texel size in UV.
    let layer_texels = vec2<f32>(
        shadow_globals.cascade_array.x,
        shadow_globals.cascade_array.y,
    );
    let inv_atlas = vec2<f32>(
        1.0 / shadow_globals.cascade_array.x,
        1.0 / shadow_globals.cascade_array.y,
    );
    // Half-texel inset to keep the bilinear / PCF taps inside the
    // valid sub-rect of the layer when `used_res < layer_size`.
    let tile_min = 0.5 * inv_atlas;
    let tile_max = desc.atlas_rect.zw - 0.5 * inv_atlas;

    if hardness < 0.5 {
        // Hard — one hardware-compare tap.
        return textureSampleCompareLevel(
            shadow_cascade_array,
            shadow_atlas_sampler,
            clamp(atlas_uv, tile_min, tile_max),
            layer,
            ref_depth,
        );
    }

    // Shared by Soft and PCSS: world→texel scale and the per-fragment,
    // world-space-seeded disk rotation.
    let world_per_texel = max(desc.cascade_info.y, 1e-4);
    let angle = pcss_disk_angle(
        biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
    );
    let sin_a = sin(angle);
    let cos_a = cos(angle);

    if hardness < 1.5 {
        // Soft — fixed 16-tap rotated Poisson. Tapering the tap count
        // here introduced visible banding on large smooth receivers
        // (e.g. a floor plane under a directional light), so the count
        // is fixed — as it is for PCSS below.
        // World-unit penumbra → texel kernel; scale-invariant. Base
        // 0.12 m at `pcss_penumbra_scale == 1`; the per-light knob
        // (`bias_params.w`) is the user's softness control, shared with
        // PCSS. Fixed-width (no blocker search), so keep the base
        // modest — it does not narrow toward contact.
        let soft_world_radius = 0.12 * max(desc.bias_params.w, 0.0);
        let radius_texels = clamp(soft_world_radius / world_per_texel, 2.0, 10.0);
        return shadow_cascade_disc_pcf(
            layer, atlas_uv, tile_min, tile_max, inv_atlas,
            radius_texels, sin_a, cos_a, ref_depth,
        );
    }

    // PCSS — same recipe as the 2D path, with the cascade-array
    // texture and explicit `layer` arg.
    let pcss_scale = max(desc.bias_params.w, 0.01);
    // Virtual light radius: 1 m at `pcss_penumbra_scale == 1`.
    let pcss_light_world_radius = 1.0 * pcss_scale;
    // Light size in texels; also the (pre-clamp) blocker-search radius.
    let light_size_texels = pcss_light_world_radius / world_per_texel;
    let search_radius_texels = clamp(light_size_texels, 4.0, 64.0);

    // Fixed 16-tap blocker + PCF. The earlier tapered version
    // (`pcss_tap_count(ndc.z)`) showed clear ribbon/striping
    // artifacts on the canonical "robot on a floor under a
    // directional light" test — `ndc.z` is uncorrelated with
    // PCSS penumbra width, so fragments at `ndc.z ≈ 1` ended up
    // with 4 samples on a wide kernel, undersampling enough to
    // expose the rotated-Poisson disc as banding. Tapering is
    // parked here (and on the cube + 2D paths) until a quality-
    // preserving budget is worked out.
    var blocker_sum = 0.0;
    var blocker_count = 0u;
    let tile_min_px = vec2<i32>(tile_min * layer_texels);
    let tile_max_px = vec2<i32>(tile_max * layer_texels);
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * search_radius_texels;
        let sample_uv = atlas_uv + off * inv_atlas;
        let coord = vec2<i32>(sample_uv * layer_texels);
        let c = clamp(coord, tile_min_px, tile_max_px);
        let d = textureLoad(shadow_cascade_array, c, layer, 0);
        // Small epsilon so the receiver's own depth doesn't register
        // as a blocker.
        if d < ref_depth - 0.0005 {
            blocker_sum = blocker_sum + d;
            blocker_count = blocker_count + 1u;
        }
    }
    // No blockers → fully lit; every tap blocked → umbra. Both skip
    // the PCF pass.
    if blocker_count == 0u {
        return 1.0;
    }
    if blocker_count == 16u {
        return 0.0;
    }
    let avg_blocker = blocker_sum / f32(blocker_count);
    let penumbra_texels = clamp(
        (ref_depth - avg_blocker) * light_size_texels / max(avg_blocker, 1e-4),
        2.0,
        24.0,
    );
    // Wide PCSS kernels sample texels far from the fragment; on a sloped /
    // curved receiver the depth stored there differs by the surface slope and
    // self-shadows into acne. Scale the comparison bias with the kernel width so
    // wider penumbras get proportional slack — the softness hides the extra
    // peter-panning a near-contact (narrow-kernel) fragment would otherwise show.
    let pcss_ref = ref_depth - desc.bias_params.x * penumbra_texels * 0.5;
    return shadow_cascade_disc_pcf(
        layer, atlas_uv, tile_min, tile_max, inv_atlas,
        penumbra_texels, sin_a, cos_a, pcss_ref,
    );
}

// Sample a single shadow descriptor (cascade / spot / face). Returns
// `[0, 1]` visibility (1.0 = lit, 0.0 = fully shadowed).
//
// Descriptors whose kind (`cascade_info.w`) is non-zero are forwarded
// to their dedicated samplers; the remainder of this function is the
// kind-0 path that samples the 2D `shadow_atlas`.
//
// Early-outs that return 1.0 (lit): `descriptor_index` out of range,
// `clip.w <= 0`, or the projected point outside the light's NDC box.
//
// Hardness branches (`bias_params.z`, thresholded at 0.5 / 1.5):
//   0.0 = Hard, 1 comparison tap.
//   1.0 = Soft, fixed 16-tap rotated Poisson-disk PCF.
//   2.0 = PCSS — 16-tap blocker search + 16-tap variable-kernel PCF.
fn sample_shadow_descriptor(
    descriptor_index: u32,
    world_pos: vec3<f32>,
    world_normal: vec3<f32>,
) -> f32 {
    // Out-of-range index: treat as "no shadow".
    if descriptor_index >= MAX_SHADOW_DESCRIPTORS {
        return 1.0;
    }
    let desc = shadow_descriptors.items[descriptor_index];
    // cascade_info.w encodes the descriptor kind:
    //   0.0 = 2D PCF on `shadow_atlas` (spot)
    //   1.0 = 2D EVSM cascade — read moments from `evsm_atlas`
    //   2.0 = cube (point light)
    //   3.0 = directional cascade on `shadow_cascade_array`
    // The float is thresholded at the half-way points, highest kind first.
    let kind = desc.cascade_info.w;
    if kind > 2.5 {
        return sample_shadow_cascade_array(desc, world_pos, world_normal);
    }
    if kind > 1.5 {
        return sample_shadow_cube(desc, world_pos, world_normal);
    }
    if kind > 0.5 {
        return sample_shadow_evsm(desc, world_pos, world_normal);
    }

    // Offset the receiver along its surface normal by `normal_bias`
    // world-space units before projecting into shadow space. This
    // pushes the sample point *toward* the light, which is how we
    // dodge acne on slanted surfaces without relying solely on a
    // constant depth bias (cascade Z-ranges differ a lot, so a flat
    // depth bias is either too soft or too aggressive). The
    // pipeline's slope-scale bias and `bias_params.x` depth bias
    // handle the residual.
    let biased_pos = world_pos + world_normal * desc.bias_params.y;
    let clip = desc.view_projection * vec4<f32>(biased_pos, 1.0);
    // Guard the perspective divide below.
    if clip.w <= 0.0 {
        return 1.0;
    }
    let ndc = clip.xyz / clip.w;
    // Outside x,y ∈ [-1, 1] or z ∈ [0, 1]: this descriptor has no data
    // for the receiver, so report it as lit.
    if ndc.x < -1.0 || ndc.x > 1.0 || ndc.y < -1.0 || ndc.y > 1.0 || ndc.z < 0.0 || ndc.z > 1.0 {
        return 1.0;
    }
    // NDC → [0, 1] UV with Y flipped, then remapped into this
    // descriptor's rectangle of the atlas.
    let uv_local = vec2<f32>(ndc.x * 0.5 + 0.5, -ndc.y * 0.5 + 0.5);
    let atlas_uv = desc.atlas_rect.xy + uv_local * desc.atlas_rect.zw;
    let ref_depth = ndc.z - desc.bias_params.x;
    let hardness = desc.bias_params.z;

    // PCF / PCSS taps must stay inside this cascade's tile of the
    // atlas. The tile-pack allocator places cascades edge-to-edge,
    // so a kernel that crosses the boundary samples a totally
    // unrelated cascade's depth (or another light's spot tile) and
    // produces a fringe of bogus shadow at the tile seam. The inset
    // is half a texel so bilinear PCF taps don't read past the edge
    // either.
    let inv_atlas = vec2<f32>(
        1.0 / shadow_globals.atlas_sizes.x,
        1.0 / shadow_globals.atlas_sizes.y,
    );
    let tile_min = desc.atlas_rect.xy + 0.5 * inv_atlas;
    let tile_max = desc.atlas_rect.xy + desc.atlas_rect.zw - 0.5 * inv_atlas;

    // Hard: a single comparison sample at the clamped UV.
    if hardness < 0.5 {
        return textureSampleCompareLevel(
            shadow_atlas,
            shadow_atlas_sampler,
            clamp(atlas_uv, tile_min, tile_max),
            ref_depth,
        );
    }
    if hardness < 1.5 {
        // Tap-rotated 16-sample Poisson disk PCF. The kernel is
        // sized in *world units* (`soft_world_radius`) and the
        // per-cascade texel-radius is recovered by dividing by the
        // cascade's `world_per_texel` (read from `cascade_info.y`).
        // That keeps the perceived soft-edge width identical in every
        // cascade — without this, the near cascade's 2048 texels
        // covering a tiny world span produces razor-sharp shadows
        // while the far cascade's same 2048 texels covering a much
        // larger span produces soft ones, and the boundary between
        // is visible as a step in penumbra width.
        //
        // NOTE(review): the `ShadowDescriptor` struct comment lists
        // `cascade_info.y` as `cascade_index`, but it is consumed here
        // as world-units-per-texel — confirm against the CPU-side
        // writer and update whichever comment is stale.
        let world_per_texel = max(desc.cascade_info.y, 1e-4);
        // Penumbra half-width in WORLD units (converted to a texel kernel by
        // the divide below), so the perceived soft edge is identical regardless
        // of scene scale or which cascade resolves it — nothing here assumes a
        // particular scene size. The 0.12 m base is the default at
        // `pcss_penumbra_scale == 1`; that per-light knob (bias_params.w) is the
        // user's softness control, shared with PCSS so one slider governs both
        // modes. Unlike PCSS this kernel is fixed-width (no blocker search), so
        // it does not narrow toward contact — keep the base modest.
        let soft_world_radius = 0.12 * max(desc.bias_params.w, 0.0);
        // Clamp at 2 texels min (a too-tight kernel collapses to a
        // single 2×2 bilinear compare and the cascade-boundary blend
        // shows a "soft → razor" step). 10 texels max so the near
        // cascade doesn't waste kernel area where world_per_texel is
        // sub-millimetre.
        let radius_texels = clamp(soft_world_radius / world_per_texel, 2.0, 10.0);

        // Per-fragment rotation hash. MUST be keyed on world position
        // (not `atlas_uv`) — atlas_uv shifts by exactly one texel
        // every time the stable-fit's texel-snap moves, and the snap
        // moves whenever the camera translates by enough to cross a
        // texel boundary in light view. An atlas-keyed hash would
        // therefore rotate the tap pattern for every receiver in
        // lockstep on every such snap, producing a frame of
        // shimmer at every snap step. World-space hashing is
        // invariant under the camera's discrete grid jumps.
        let angle = pcss_disk_angle(
            biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
        );
        let sin_a = sin(angle);
        let cos_a = cos(angle);
        // Fixed 16 taps on the Soft path — see `sample_shadow_cube`'s
        // Soft branch for the full rationale. Tapering here banded
        // large smooth receivers. The PCSS branch below likewise uses
        // a fixed 16 taps for both of its loops.
        var sum = 0.0;
        for (var i = 0u; i < 16u; i = i + 1u) {
            // Disk offset in texels → UV via `inv_atlas`, clamped to the tile.
            let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * radius_texels;
            sum += textureSampleCompareLevel(
                shadow_atlas, shadow_atlas_sampler,
                clamp(atlas_uv + off * inv_atlas, tile_min, tile_max),
                ref_depth,
            );
        }
        return sum / 16.0;
    }
    // PCSS — blocker-search + variable-kernel PCF.
    //
    // `pcss_penumbra_scale` (`bias_params.w`) is a multiplier on a
    // base 1 m "light disc" radius — i.e. how large the simulated
    // sun / area light appears at the receiver. With the default
    // scale = 1.0, the search & penumbra grow as if the light were a
    // 1 m disc; smaller values give sharper contact, larger values
    // give more dramatic falloff. The scale is floored at 0.01 here.
    //
    // Everything below is sized in *world units* (then converted to
    // texels via `world_per_texel` per cascade) so the cost / quality
    // of PCSS stays comparable across cascades — without that scaling
    // the search radius collapses to a few texels on the far cascade
    // and the algorithm degenerates into PCF.
    let pcss_scale = max(desc.bias_params.w, 0.01);
    let world_per_texel_pcss = max(desc.cascade_info.y, 1e-4);
    let pcss_light_world_radius = 1.0 * pcss_scale; // virtual light disc radius (m)
    // Multiplier that turns atlas UVs into texel coordinates for `textureLoad`.
    let atlas_uv_to_texels = vec2<f32>(
        shadow_globals.atlas_sizes.x,
        shadow_globals.atlas_sizes.y,
    );
    // World-space rotation hash (see Soft PCF branch above — atlas
    // coordinates shift discretely with the stable-fit snap as the
    // camera moves, which would cause a frame of shimmer at every
    // texel jump; hashing on world position is invariant).
    let angle = pcss_disk_angle(
        biased_pos.xz * 137.0 + vec2<f32>(biased_pos.y * 31.0, biased_pos.y * 17.0),
    );
    let sin_a = sin(angle);
    let cos_a = cos(angle);
    // Blocker-search radius: track the light disc directly so a wider
    // virtual light sees more potential blockers (correct PCSS
    // behaviour — small light = sharper shadow because fewer
    // occluders matter). Clamped to [4, 64] texels: a narrower search
    // misses isolated blockers on near cascades, and a wider one
    // reads a large share of the tile on every sample.
    // NOTE(review): the cap was previously described as "a quarter of
    // the tile", which only matches 64 texels for a 256-texel tile —
    // confirm the intended bound.
    let search_radius_texels = clamp(
        pcss_light_world_radius / world_per_texel_pcss,
        4.0,
        64.0,
    );

    // Fixed 16-tap blocker + PCF. Same rationale as the cascade-
    // array PCSS path: tapering by `ndc.z` undersamples wide
    // penumbras and shows as visible disc-rotation banding.
    var blocker_sum = 0.0;
    var blocker_count = 0u;
    // Tile bounds in integer texel coordinates for the `textureLoad` clamp.
    let tile_min_px = vec2<i32>(tile_min * atlas_uv_to_texels);
    let tile_max_px = vec2<i32>(tile_max * atlas_uv_to_texels);
    for (var i = 0u; i < 16u; i = i + 1u) {
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * search_radius_texels;
        let sample_uv = atlas_uv + off * inv_atlas;
        let coord = vec2<i32>(sample_uv * atlas_uv_to_texels);
        // Clamp to the cascade's own tile so the blocker search
        // doesn't read from an adjacent cascade's depth values.
        let c = clamp(coord, tile_min_px, tile_max_px);
        // Raw (unfiltered, uncompared) depth at mip 0.
        let d = textureLoad(shadow_atlas, c, 0);
        // A texel counts as a blocker only when it is nearer than the
        // biased receiver depth by more than a fixed 0.0005 margin.
        if d < ref_depth - 0.0005 {
            blocker_sum = blocker_sum + d;
            blocker_count = blocker_count + 1u;
        }
    }
    if blocker_count == 0u {
        return 1.0; // fully lit fast path
    }
    if blocker_count == 16u {
        // Every blocker-search sample was below the receiver's
        // biased depth — the receiver is deep inside the umbra
        // and the second 16-tap PCF would average to ≈ 0
        // anyway. Skip it.
        return 0.0;
    }
    let avg_blocker = blocker_sum / f32(blocker_count);
    // Classic PCSS penumbra: `(d_receiver − d_blocker) · light_size /
    // d_blocker`, but with light_size expressed in *world units* via
    // `world_per_texel`. The kernel radius is clamped to [2, 24]
    // texels: the floor keeps a minimum amount of softening, the cap
    // bounds how far the 16 taps spread across the tile.
    let light_size_texels = pcss_light_world_radius / world_per_texel_pcss;
    let penumbra_texels = clamp(
        (ref_depth - avg_blocker) * light_size_texels / max(avg_blocker, 1e-4),
        2.0,
        24.0,
    );
    // Wide PCSS kernels sample texels far from the fragment; on a sloped /
    // curved receiver the depth stored there differs by the surface slope and
    // self-shadows into acne. Scale the comparison bias with the kernel width so
    // wider penumbras get proportional slack — the softness hides the extra
    // peter-panning a near-contact (narrow-kernel) fragment would otherwise show.
    // The slack is the user's own `depth_bias` (bias_params.x) times half the
    // kernel radius, so it inherits the per-light tuning instead of a fresh
    // constant. Note `ref_depth` already has `bias_params.x` subtracted once.
    let pcss_ref = ref_depth - desc.bias_params.x * penumbra_texels * 0.5;
    var pcf_sum = 0.0;
    for (var i = 0u; i < 16u; i = i + 1u) {
        // Same rotated disk as the blocker search, rescaled to the penumbra radius.
        let off = pcss_rotate(POISSON_DISK_16[i], sin_a, cos_a) * penumbra_texels;
        pcf_sum = pcf_sum + textureSampleCompareLevel(
            shadow_atlas,
            shadow_atlas_sampler,
            clamp(atlas_uv + off * inv_atlas, tile_min, tile_max),
            pcss_ref,
        );
    }
    return pcf_sum / 16.0;
}

// Per-light cascade selection with smooth blending across split
// boundaries — overview of `sample_shadow_directional` below.
// `descriptor_base` points to the first cascade descriptor of a
// directional light; `cascade_info.z` gives the cascade count.
//
// We walk descriptors descriptor_base..base+count and pick the first
// whose `cascade_info.x` (split_far, compared against `view_z`) is at
// or beyond `view_z` AND whose light-space projection contains the
// receiver (clip.w > 0, NDC x/y in [-1, 1], z in [0, 1]). To hide the
// abrupt softness jump that comes from each successive cascade
// halving its atlas resolution, the last `CASCADE_BLEND` fraction of
// every cascade's depth range linearly fades into the next cascade's
// sample (or to fully lit for the final cascade — receivers past the
// very end get no shadow).
//
// Returns 1.0 (no shadow) if `view_z` is beyond the last cascade or
// no cascade contains the receiver.

// Fraction of each cascade's depth range that fades into the next
// cascade. Stretching this band wider spreads the (unavoidable)
// quality difference between cascades across a larger area, which
// the eye stops reading as a hard edge AND keeps receivers near
// cascade boundaries from flickering when the camera moves them
// across the boundary in discrete texel-snap jumps. 50% is the AAA
// default — the corresponding `BLEND_OVERLAP` in `fit_cascades`
// ensures the next cascade's frustum covers this whole band.
const CASCADE_BLEND: f32 = 0.5;

fn sample_shadow_directional(
    descriptor_base: u32,
    world_pos: vec3<f32>,
    world_normal: vec3<f32>,
    view_z: f32,
) -> f32 {
    // No shadow slot assigned, or a slot outside the descriptor table:
    // the light casts no shadow on this receiver.
    if descriptor_base == SHADOW_INDEX_NONE || descriptor_base >= MAX_SHADOW_DESCRIPTORS {
        return 1.0;
    }

    // The first descriptor carries both the kind (`.w`) and the number
    // of cascades belonging to this light (`.z`).
    let head_info = shadow_descriptors.items[descriptor_base].cascade_info;

    // Kind 2.0 (cube / point light) is a single descriptor whose
    // `view_projection` is not a usable projection, so the cascade walk
    // below would reject it on `clip.w <= 0.0` and report "lit". Hand
    // it straight to `sample_shadow_descriptor`, which routes cube
    // descriptors to the cube sampler. Every other kind (0.0 spot,
    // 1.0 EVSM, 3.0 cascade-array) goes through the walk.
    if head_info.w > 1.5 && head_info.w < 2.5 {
        return sample_shadow_descriptor(descriptor_base, world_pos, world_normal);
    }
    let cascade_total = u32(head_info.z);

    // Near→far search for the first cascade that holds the receiver
    // both in depth (`view_z` not past the cascade's far split) and
    // inside the cascade's light-space clip volume. Requiring the
    // clip-volume test here — rather than choosing by depth alone —
    // lets a receiver that falls off the side of a near cascade drop
    // through to the next one outward instead of being reported lit.
    var chosen_idx: u32 = SHADOW_INDEX_NONE;
    var chosen_slot: u32 = 0u;
    for (var slot = 0u; slot < cascade_total; slot += 1u) {
        let idx = descriptor_base + slot;
        if idx >= MAX_SHADOW_DESCRIPTORS {
            break;
        }
        let cand = shadow_descriptors.items[idx];
        // Receiver lies beyond this cascade's far split.
        if view_z > cand.cascade_info.x {
            continue;
        }
        let clip = cand.view_projection * vec4<f32>(world_pos, 1.0);
        if clip.w <= 0.0 {
            continue;
        }
        let ndc = clip.xyz / clip.w;
        // Reject when any component leaves x,y ∈ [-1, 1], z ∈ [0, 1].
        let below_min = any(ndc < vec3<f32>(-1.0, -1.0, 0.0));
        let above_max = any(ndc > vec3<f32>(1.0, 1.0, 1.0));
        if below_min || above_max {
            continue;
        }
        chosen_idx = idx;
        chosen_slot = slot;
        break;
    }
    if chosen_idx == SHADOW_INDEX_NONE {
        return 1.0;
    }

    // Where the receiver sits inside the chosen cascade's depth range,
    // as a 0..1 fraction. The first cascade's range starts at 0.
    let range_far = shadow_descriptors.items[chosen_idx].cascade_info.x;
    var range_near: f32 = 0.0;
    if chosen_slot > 0u {
        range_near = shadow_descriptors.items[chosen_idx - 1u].cascade_info.x;
    }
    let depth_span = max(range_far - range_near, 1e-4);
    let depth_t = clamp((view_z - range_near) / depth_span, 0.0, 1.0);

    let primary = sample_shadow_descriptor(chosen_idx, world_pos, world_normal);

    // Ahead of the blend band: the chosen cascade alone decides.
    let band_start = 1.0 - CASCADE_BLEND;
    if depth_t < band_start {
        return primary;
    }
    let blend_t = (depth_t - band_start) / CASCADE_BLEND;

    let follow_slot = chosen_slot + 1u;
    if follow_slot >= cascade_total {
        // Last cascade: fade out to fully lit towards its far edge so
        // receivers don't pop from shadowed to lit.
        return mix(primary, 1.0, blend_t);
    }
    let follow_idx = descriptor_base + follow_slot;
    if follow_idx >= MAX_SHADOW_DESCRIPTORS {
        return primary;
    }
    // Inside the band: cross-fade into the next cascade's result.
    let secondary = sample_shadow_descriptor(follow_idx, world_pos, world_normal);
    return mix(primary, secondary, blend_t);
}

// DEBUG: reports which cascade the directional picker would select,
// as a float: the cascade's position within the light's cascade list
// (0.0 for the first), or 4.0 when nothing is selected. Applies the
// same two tests as `sample_shadow_directional`'s walk — the `view_z`
// far-split test and the light-space clip-volume test — so the colour
// overlay agrees with what shadow sampling uses.
fn debug_picked_cascade(
    descriptor_base: u32,
    world_pos: vec3<f32>,
    view_z: f32,
) -> f32 {
    // Sentinel meaning "no cascade".
    let none_picked = 4.0;
    if descriptor_base == SHADOW_INDEX_NONE {
        return none_picked;
    }
    if descriptor_base >= MAX_SHADOW_DESCRIPTORS {
        return none_picked;
    }

    let cascade_total = u32(shadow_descriptors.items[descriptor_base].cascade_info.z);
    var result = none_picked;
    for (var slot = 0u; slot < cascade_total; slot += 1u) {
        let entry_idx = descriptor_base + slot;
        if entry_idx >= MAX_SHADOW_DESCRIPTORS {
            break;
        }
        let entry = shadow_descriptors.items[entry_idx];
        // Receiver lies beyond this cascade's far split.
        if view_z > entry.cascade_info.x {
            continue;
        }
        let projected = entry.view_projection * vec4<f32>(world_pos, 1.0);
        if projected.w <= 0.0 {
            continue;
        }
        let p = projected.xyz / projected.w;
        // Outside x,y ∈ [-1, 1] or z ∈ [0, 1]: try the next cascade.
        if any(p < vec3<f32>(-1.0, -1.0, 0.0)) || any(p > vec3<f32>(1.0, 1.0, 1.0)) {
            continue;
        }
        result = f32(slot);
        break;
    }
    return result;
}

// Debug overlay that colour-codes fragments by the cascade they
// resolve to. Active only when `shadow_globals.flags.x`
// (`debug_cascade_colors`) is non-zero; otherwise, or when
// `debug_picked_cascade` reports no cascade in the 0..3 range,
// `base_color` is returned untouched.
//
// Two palettes are used, selected by the picked descriptor's kind
// (`cascade_info.w`): kind 1.0 (EVSM) gets warm tones, every other
// kind (e.g. 0.0 2D-atlas PCF, 3.0 cascade-array PCF) gets cool
// tones. The tint is blended over `base_color` at 35%.
fn debug_cascade_tint(
    base_color: vec3<f32>,
    descriptor_base: u32,
    world_pos: vec3<f32>,
    view_z: f32,
) -> vec3<f32> {
    if shadow_globals.flags.x == 0u {
        return base_color;
    }

    let slot = u32(debug_picked_cascade(descriptor_base, world_pos, view_z));
    // 4 and above means "nothing picked" or beyond the palette size.
    if slot >= 4u {
        return base_color;
    }

    let kind = shadow_descriptors.items[descriptor_base + slot].cascade_info.w;

    var tint: vec3<f32>;
    if kind > 0.5 && kind < 1.5 {
        // EVSM (warm): scarlet / orange / yellow / gold. Uses the same
        // half-way thresholds as the receiver-side kind dispatch.
        let warm = array<vec3<f32>, 4>(
            vec3<f32>(1.0, 0.4, 0.1),
            vec3<f32>(1.0, 0.6, 0.1),
            vec3<f32>(1.0, 0.85, 0.1),
            vec3<f32>(1.0, 1.0, 0.3),
        );
        tint = warm[slot];
    } else {
        // PCF flavours (cool): red / green / blue / cyan.
        let cool = array<vec3<f32>, 4>(
            vec3<f32>(1.0, 0.3, 0.3),
            vec3<f32>(0.3, 1.0, 0.3),
            vec3<f32>(0.3, 0.5, 1.0),
            vec3<f32>(0.3, 0.9, 1.0),
        );
        tint = cool[slot];
    }
    return mix(base_color, tint, 0.35);
}

{% endif %}{# end needs_shadow_sampling — shadow sampling functions #}