awsm-renderer 0.3.3

// Light culling compute shader (two-level / clustered).
//
// Two @compute entry points run as consecutive dispatches in one compute
// pass:
//
//   cs_tile  (Stage A): one workgroup per 2D screen tile (tile_x, tile_y).
//     Tests each punctual light's bounding sphere against the tile
//     column's four side planes (which are Z-independent) and
//     atomic-appends the survivors into the tile's slice of `tile_lights`.
//     The expensive side-plane test runs once per (tile, light) here
//     instead of once per (froxel, light) — a ~SLICE_COUNT× reduction in
//     side-plane work.
//
//   cs_main  (Stage B): one workgroup per froxel (tile_x, tile_y, z_slice).
//     Reads only its tile's candidate list from `tile_lights` and applies
//     the cheap per-slice Z-test, atomic-appending survivors into the
//     froxel's slice of `lights_storage`. The output is identical to the
//     old single-pass cull (same per-froxel lists) because the side
//     planes don't depend on Z — so the tile candidate set is exactly the
//     union over the column's froxels. `overflow_counter` + the runtime
//     `max_per_froxel_capacity` auto-grow behave exactly as before.
//
// WebGPU inserts a memory barrier between the two dispatches (cs_main
// reads `tile_lights` that cs_tile wrote), so a single compute pass
// suffices.
//
// The per-froxel slice base in `lights_storage` is
// `cull_params.mesh_indices_capacity_u32 + froxel_idx * stride`, where
// `stride = cull_params.max_per_froxel_capacity + 1` (slot 0 = atomic
// count, slots 1.. = light indices). The head region
// `[0..mesh_indices_capacity_u32)` is the CPU-written per-mesh slice and
// is left untouched here.

const TILE_PIXEL_SIZE: u32 = 16u;
const SLICE_COUNT: u32 = {{ slice_count }}u;
const WORKGROUP_SIZE_LIGHTS: u32 = 64u;
// The per-2D-tile candidate budget is a **runtime** value
// (`cull_params.tile_light_capacity`, per-tile stride = budget + 1). The
// host grows it toward the live punctual-light count and sizes
// `tile_lights` to match — so a tile slice can't overflow and Stage B
// needs no fallback, while staying small for low-light scenes.

// Project an NDC corner (z = 0 → near plane) to a normalized view-space
// ray direction emanating from the camera origin. PERSPECTIVE ONLY — for
// orthographic cameras view rays are parallel (not origin-emanating), so
// this direction is meaningless; the ortho path uses `ndc_to_view_pos`
// + an axis-aligned box test instead (see `cs_tile`).
fn ndc_to_view_dir(ndc: vec2<f32>) -> vec3<f32> {
    let clip = vec4<f32>(ndc, 0.0, 1.0);
    let view = camera_raw.inv_proj * clip;
    return normalize(view.xyz / view.w);
}

// Project an NDC corner (z = 0 → near plane) to a view-space POSITION
// (not normalized). For an orthographic projection the x/y of this point
// are the view-space box bounds of the tile column (constant in depth),
// which is exactly what the ortho side cull needs.
fn ndc_to_view_pos(ndc: vec2<f32>) -> vec3<f32> {
    let clip = vec4<f32>(ndc, 0.0, 1.0);
    let view = camera_raw.inv_proj * clip;
    return view.xyz / view.w;
}

// The view-space x/y extent of a screen tile's column. Used by the
// orthographic side cull (the frustum column is an axis-aligned box, not
// a pyramid, so the perspective side-plane construction does not apply).
struct TileViewBox {
    min_x: f32,
    max_x: f32,
    min_y: f32,
    max_y: f32,
};

fn tile_view_box(tile_x: u32, tile_y: u32) -> TileViewBox {
    let viewport_f = vec2<f32>(f32(cull_params.viewport_w), f32(cull_params.viewport_h));
    let tile_pixel_min = vec2<f32>(f32(tile_x), f32(tile_y)) * f32(TILE_PIXEL_SIZE);
    let tile_pixel_max = min(tile_pixel_min + vec2<f32>(f32(TILE_PIXEL_SIZE)), viewport_f);
    let ndc_x_min = tile_pixel_min.x / viewport_f.x * 2.0 - 1.0;
    let ndc_x_max = tile_pixel_max.x / viewport_f.x * 2.0 - 1.0;
    let ndc_y_min = 1.0 - tile_pixel_max.y / viewport_f.y * 2.0;
    let ndc_y_max = 1.0 - tile_pixel_min.y / viewport_f.y * 2.0;
    let lo = ndc_to_view_pos(vec2<f32>(ndc_x_min, ndc_y_min));
    let hi = ndc_to_view_pos(vec2<f32>(ndc_x_max, ndc_y_max));
    var b: TileViewBox;
    b.min_x = min(lo.x, hi.x);
    b.max_x = max(lo.x, hi.x);
    b.min_y = min(lo.y, hi.y);
    b.max_y = max(lo.y, hi.y);
    return b;
}

// Perspective vs orthographic: a perspective projection has
// `proj[3].w == 0` (the w' = -z perspective divide); orthographic has
// `proj[3].w == 1`. Mirrors `CameraMatrices::is_orthographic` on the CPU.
fn camera_is_orthographic() -> bool {
    return abs(camera_raw.proj[3].w) > 0.5;
}

// Distance from sphere center to the plane `dot(normal, p) = 0` passing
// through the origin. Positive when the center is on the inward side.
fn signed_dist_through_origin(normal: vec3<f32>, p: vec3<f32>) -> f32 {
    return dot(normal, p);
}

// The four inward side-plane normals of a screen tile's view-space
// frustum column. Z-independent, so every froxel in the column shares
// them — which is exactly why Stage A can compute the side test once.
struct SidePlanes {
    left: vec3<f32>,
    right: vec3<f32>,
    top: vec3<f32>,
    bottom: vec3<f32>,
};

fn tile_side_planes(tile_x: u32, tile_y: u32) -> SidePlanes {
    let viewport_f = vec2<f32>(f32(cull_params.viewport_w), f32(cull_params.viewport_h));
    let tile_pixel_min = vec2<f32>(f32(tile_x), f32(tile_y)) * f32(TILE_PIXEL_SIZE);
    let tile_pixel_max = min(tile_pixel_min + vec2<f32>(f32(TILE_PIXEL_SIZE)), viewport_f);

    // WebGPU screen-space: top-left origin, Y down. NDC: +Y up.
    let ndc_x_min = tile_pixel_min.x / viewport_f.x * 2.0 - 1.0;
    let ndc_x_max = tile_pixel_max.x / viewport_f.x * 2.0 - 1.0;
    let ndc_y_min = 1.0 - tile_pixel_max.y / viewport_f.y * 2.0;
    let ndc_y_max = 1.0 - tile_pixel_min.y / viewport_f.y * 2.0;

    let bl = ndc_to_view_dir(vec2<f32>(ndc_x_min, ndc_y_min));
    let br = ndc_to_view_dir(vec2<f32>(ndc_x_max, ndc_y_min));
    let tl = ndc_to_view_dir(vec2<f32>(ndc_x_min, ndc_y_max));
    let tr = ndc_to_view_dir(vec2<f32>(ndc_x_max, ndc_y_max));

    // Right-handed cross products oriented so dot(normal, interior_ray) > 0.
    var planes: SidePlanes;
    planes.left = normalize(cross(tl, bl));
    planes.right = normalize(cross(br, tr));
    planes.top = normalize(cross(tr, tl));
    planes.bottom = normalize(cross(bl, br));
    return planes;
}

// ── Stage A: per-2D-tile side-plane cull ──────────────────────────────
@compute @workgroup_size(WORKGROUP_SIZE_LIGHTS)
fn cs_tile(
    @builtin(workgroup_id) wid: vec3<u32>,
    @builtin(local_invocation_id) lid: vec3<u32>,
) {
    let tile_x = wid.x;
    let tile_y = wid.y;
    if (tile_x >= cull_params.tiles_x || tile_y >= cull_params.tiles_y) {
        return;
    }

    let tile_cap = cull_params.tile_light_capacity;
    let tile_idx = tile_y * cull_params.tiles_x + tile_x;
    let tile_base = tile_idx * (tile_cap + 1u);

    // Thread 0 resets the per-tile candidate count.
    if (lid.x == 0u) {
        atomicStore(&tile_lights[tile_base], 0u);
    }

    // Tile column geometry. Perspective uses 4 side planes through the
    // camera origin; orthographic uses an axis-aligned view-space box
    // (parallel rays — the pyramid plane construction does not apply).
    let is_ortho = camera_is_orthographic();
    let planes = tile_side_planes(tile_x, tile_y);
    let vbox = tile_view_box(tile_x, tile_y);

    // Sync so all threads see the zeroed count before appending.
    workgroupBarrier();

    let total_lights = lights_info.data.x;  // n_lights
    var li = lid.x;
    loop {
        if (li >= total_lights) { break; }

        let p = lights[li];
        let kind = u32(p.kind_outer_pad.x);
        // Skip directional lights — infinite extent; they live in the
        // shading shaders' global-prefix loop.
        if (kind != 1u) {
            let pos_world = p.pos_range.xyz;
            let range = p.pos_range.w;
            let pos_view = (camera_raw.view * vec4<f32>(pos_world, 1.0)).xyz;

            // Side test only (no Z): a sphere straddling or inside the
            // tile column is a candidate for some froxel in it. Stage B
            // applies the Z-slice test.
            var side_ok: bool;
            if (is_ortho) {
                // Orthographic: view-space column is a box; the sphere
                // overlaps it iff its center is within `range` of the
                // box in x and y.
                side_ok = pos_view.x >= vbox.min_x - range
                    && pos_view.x <= vbox.max_x + range
                    && pos_view.y >= vbox.min_y - range
                    && pos_view.y <= vbox.max_y + range;
            } else {
                let l_ok = signed_dist_through_origin(planes.left, pos_view) >= -range;
                let r_ok = signed_dist_through_origin(planes.right, pos_view) >= -range;
                let t_ok = signed_dist_through_origin(planes.top, pos_view) >= -range;
                let b_ok = signed_dist_through_origin(planes.bottom, pos_view) >= -range;
                side_ok = l_ok && r_ok && t_ok && b_ok;
            }

            if (side_ok) {
                let slot = atomicAdd(&tile_lights[tile_base], 1u);
                // `tile_cap` tracks the live punctual-light count, which
                // bounds candidates-per-tile, so `slot` is always in
                // range; the guard is defense-in-depth.
                if (slot < tile_cap) {
                    atomicStore(&tile_lights[tile_base + 1u + slot], li);
                }
            }
        }

        li = li + WORKGROUP_SIZE_LIGHTS;
    }
}

// ── Stage B: per-froxel Z-slice refine ────────────────────────────────
@compute @workgroup_size(WORKGROUP_SIZE_LIGHTS)
fn cs_main(
    @builtin(workgroup_id) wid: vec3<u32>,
    @builtin(local_invocation_id) lid: vec3<u32>,
) {
    let tile_x = wid.x;
    let tile_y = wid.y;
    let z_slice = wid.z;

    if (tile_x >= cull_params.tiles_x || tile_y >= cull_params.tiles_y || z_slice >= SLICE_COUNT) {
        return;
    }

    let tiles_per_layer = cull_params.tiles_x * cull_params.tiles_y;
    let froxel_idx = z_slice * tiles_per_layer + tile_y * cull_params.tiles_x + tile_x;
    let froxel_stride = cull_params.max_per_froxel_capacity + 1u;
    let froxel_base = cull_params.mesh_indices_capacity_u32 + froxel_idx * froxel_stride;

    // Thread 0 resets the per-froxel count.
    if (lid.x == 0u) {
        atomicStore(&lights_storage[froxel_base], 0u);
    }

    // Z range for this slice (exponential mapping). Positive forward.
    let s_lo = f32(z_slice) / f32(SLICE_COUNT);
    let s_hi = f32(z_slice + 1u) / f32(SLICE_COUNT);
    let z_lo = cull_params.z_near * exp(cull_params.log_far_over_near * s_lo);
    let z_hi = cull_params.z_near * exp(cull_params.log_far_over_near * s_hi);

    // Candidates from this froxel's 2D tile (written by cs_tile). All are
    // non-directional and already passed the side planes.
    let tile_cap = cull_params.tile_light_capacity;
    let tile_idx = tile_y * cull_params.tiles_x + tile_x;
    let tile_base = tile_idx * (tile_cap + 1u);
    let cand_count = min(atomicLoad(&tile_lights[tile_base]), tile_cap);

    // Sync so all threads see the zeroed froxel count before appending.
    workgroupBarrier();

    var ci = lid.x;
    loop {
        if (ci >= cand_count) { break; }

        let light_index = atomicLoad(&tile_lights[tile_base + 1u + ci]);
        let p = lights[light_index];
        let pos_world = p.pos_range.xyz;
        let range = p.pos_range.w;
        let view_z = -(camera_raw.view * vec4<f32>(pos_world, 1.0)).z;

        // Z-plane test: sphere span [view_z - range, view_z + range] must
        // overlap [z_lo, z_hi]. The side planes already passed in Stage A.
        let z_ok = (view_z + range >= z_lo) && (view_z - range <= z_hi);

        if (z_ok) {
            let slot = atomicAdd(&lights_storage[froxel_base], 1u);
            if (slot < cull_params.max_per_froxel_capacity) {
                atomicStore(&lights_storage[froxel_base + 1u + slot], light_index);
            } else {
                // Past capacity: the index is dropped this frame (consumers
                // clamp `count` to `max_per_froxel_capacity`); the CPU's
                // overflow readback raises the budget next frame.
                atomicAdd(&overflow_counter, 1u);
            }
        }

        ci = ci + WORKGROUP_SIZE_LIGHTS;
    }
}