// roxlap-gpu 0.5.0

// GPU.4 — outer DDA over chunks + inner DDA over voxels.
//
// Each pixel: build a ray from the camera basis, run an
// Amanatides–Woo DDA over chunk indices. At each step, the
// chunk-occupancy bitmap says "any voxels here?". If yes, run an
// inner Amanatides–Woo DDA bounded to that chunk's voxel range; if
// no, advance to the next chunk in one outer step.
//
// Coordinate convention:
// * `camera_pos` is in grid-local voxel units (host translates from
//   world coords). For grid 0 at identity transform, world == grid.
// * Z is DOWN (voxlap). `camera_down` is the +z direction; pixel
//   (0, 0) at the top of the screen maps to -camera_down.
// * Chunks are vsid × vsid × CHUNK_Z voxels (typically 128 × 128 × 256).
//   `chunks_dims` is the count along each axis; the grid spans
//   chunk indices [origin_chunk, origin_chunk + chunks_dims).
// * Bedrock voxels are NOT in occupancy (bedrock-as-air refactor —
//   `decompress.rs` strips them). Rays through bedrock fall to sky.

// Chunk height in voxels along z.
const CHUNK_Z: u32 = 256u;
// Occupancy stores one bit per voxel, 32 bits per u32 word, so every
// (x, y) column of a chunk takes CHUNK_Z / 32 = 8 words.
const OCC_WORDS_PER_COLUMN: u32 = CHUNK_Z / 32u;
// Iteration cap for the inner (voxel) DDA inside one chunk. A ray needs
// at most about vsid + vsid + CHUNK_Z steps to cross a chunk (~512 when
// vsid is 128); the cap is intentionally looser than that so a
// near-axis-aligned ray crossing the chunk diagonal still terminates
// cleanly.
const MAX_INNER_STEPS: u32 = 768u;

// Per-frame parameters shared by every invocation.
//
// The explicit `_padN` members fill the 4 bytes left after each 12-byte
// vec3 (vec3 is 16-byte aligned in WGSL), so the byte offsets noted
// below follow directly from the declaration order. Total size is 112
// bytes. NOTE(review): the host-side struct is not visible here —
// confirm it uses the same offsets.
struct Uniforms {
    // Ray origin (offset 0). Grid-local voxel units.
    camera_pos: vec3<f32>,
    _pad0: f32,
    // Camera basis vector used for the horizontal screen axis (offset 16).
    camera_right: vec3<f32>,
    _pad1: f32,
    // Camera basis vector used for the vertical screen axis (offset 32);
    // the top of the screen maps to -camera_down.
    camera_down: vec3<f32>,
    _pad2: f32,
    // Camera view direction through the screen centre (offset 48).
    camera_forward: vec3<f32>,
    // Vertical field of view in radians (offset 60); `tan(fov_y_rad / 2)`
    // is the half-height of the image plane.
    fov_y_rad: f32,
    // Output size in pixels (offset 64); invocations outside it exit early.
    screen_size: vec2<u32>,
    // Chunk side length in voxels along x and y (offset 72).
    vsid: u32,
    // Iteration cap for the outer (chunk) DDA loop (offset 76).
    max_outer_steps: u32,
    // Number of chunks in the grid along each axis (offset 80).
    chunks_dims: vec3<u32>,
    _pad3: u32,
    // Chunk index of the grid's minimum corner (offset 96); the grid
    // covers [origin_chunk, origin_chunk + chunks_dims).
    origin_chunk: vec3<i32>,
    _pad4: u32,
};

// Per-frame camera and grid parameters.
@group(0) @binding(0) var<uniform> u: Uniforms;
// Per-voxel occupancy bits. Indexed as
// (meta_id * vsid * vsid + column) * OCC_WORDS_PER_COLUMN + (z >> 5),
// bit (z & 31); column = x + y * vsid.
@group(0) @binding(1) var<storage, read> occupancy: array<u32>;
// Per-column start offset into a chunk's colour run. Each chunk owns
// vsid * vsid + 1 consecutive entries, indexed by column.
@group(0) @binding(2) var<storage, read> color_offsets: array<u32>;
// Packed voxel colours: bits 24..31 brightness (128 = 1.0), 16..23 red,
// 8..15 green, 0..7 blue.
@group(0) @binding(3) var<storage, read> colors: array<u32>;
// Index into `colors` where each chunk's colour data begins, by meta_id.
@group(0) @binding(4) var<storage, read> chunk_colors_base: array<u32>;
// One bit per chunk (word meta_id >> 5, bit meta_id & 31): set when the
// chunk should be marched voxel by voxel.
@group(0) @binding(5) var<storage, read> chunk_occupancy: array<u32>;
// Render target; `render_grid` writes exactly one texel per on-screen pixel.
@group(0) @binding(6) var output: texture_storage_2d<rgba8unorm, write>;

// ---- helpers --------------------------------------------------------------

// Maps an absolute chunk index to its linear slot in the per-chunk
// tables (x fastest, then y, then z). Returns -1 for any chunk outside
// [origin_chunk, origin_chunk + chunks_dims).
fn meta_idx_of(chunk_idx: vec3<i32>) -> i32 {
    let rel = chunk_idx - u.origin_chunk;

    // Reject anything below the grid's minimum corner or at/after its
    // maximum corner on any axis.
    let below = any(rel < vec3<i32>(0));
    let above = any(vec3<u32>(rel) >= u.chunks_dims);
    if (below || above) {
        return -1;
    }

    let row_stride = i32(u.chunks_dims.x);
    let slab_stride = i32(u.chunks_dims.x * u.chunks_dims.y);
    return rel.z * slab_stride + rel.y * row_stride + rel.x;
}

// Reports whether the chunk-occupancy bitmap marks chunk `meta_id` as
// containing voxels. A negative `meta_id` (chunk outside the grid) is
// always empty.
fn chunk_has_content(meta_id: i32) -> bool {
    if (meta_id >= 0) {
        let slot = u32(meta_id);
        // 32 chunks per word: word = slot / 32, bit = slot % 32.
        let word = chunk_occupancy[slot >> 5u];
        return ((word >> (slot & 31u)) & 1u) != 0u;
    }
    return false;
}

// Tests the occupancy bit of the voxel at `p_in_chunk` inside chunk
// `meta_id`. The coordinates are converted to u32 without range checks,
// so callers are expected to pass in-chunk coordinates.
fn voxel_solid(meta_id: u32, p_in_chunk: vec3<i32>) -> bool {
    let cell = vec3<u32>(p_in_chunk);
    let column = cell.y * u.vsid + cell.x;

    // Each chunk owns vsid * vsid columns of OCC_WORDS_PER_COLUMN words.
    let words_per_chunk = u.vsid * u.vsid * OCC_WORDS_PER_COLUMN;
    let word_index = meta_id * words_per_chunk
        + column * OCC_WORDS_PER_COLUMN
        + (cell.z >> 5u);

    return ((occupancy[word_index] >> (cell.z & 31u)) & 1u) != 0u;
}

// Looks up the shaded colour of the voxel at `p_in_chunk` in chunk
// `meta_id`.
//
// A column's colours are stored densely, one entry per set occupancy
// bit, so the voxel's slot is the number of set bits below it in its
// column (its rank), added to the column's offset and the chunk's base.
// The packed colour's top byte is a brightness where 128 means 1.0; the
// result is rgb / 255 scaled by that brightness and is not clamped here.
fn voxel_color(meta_id: u32, p_in_chunk: vec3<i32>) -> vec3<f32> {
    let cell = vec3<u32>(p_in_chunk);
    let columns = u.vsid * u.vsid;
    let column = cell.x + cell.y * u.vsid;
    let first_word = (meta_id * columns + column) * OCC_WORDS_PER_COLUMN;
    let last_word = cell.z >> 5u;

    // Set bits strictly below z inside z's own word. The mask is
    // naturally zero when z sits on bit 0.
    let below_mask = (1u << (cell.z & 31u)) - 1u;
    var solid_before = countOneBits(occupancy[first_word + last_word] & below_mask);
    // Plus every set bit in the whole words before it.
    for (var i = 0u; i < last_word; i++) {
        solid_before += countOneBits(occupancy[first_word + i]);
    }

    let column_offset = color_offsets[meta_id * (columns + 1u) + column];
    let packed = colors[chunk_colors_base[meta_id] + column_offset + solid_before];

    // Unpack red / green / blue from bits 16..23 / 8..15 / 0..7.
    let rgb = vec3<f32>(
        (vec3<u32>(packed) >> vec3<u32>(16u, 8u, 0u)) & vec3<u32>(0xffu)
    );
    let brightness = f32(packed >> 24u) * (1.0 / 128.0);
    return rgb * (brightness / 255.0);
}

// Background colour for a ray that hits nothing: a linear blend driven
// by the ray's z component (z is down). dir.z = -1 (straight up) gives
// the zenith colour, dir.z = +1 (straight down) gives the horizon
// colour, and a level ray (dir.z = 0) lands halfway between them.
fn sky_color(dir: vec3<f32>) -> vec3<f32> {
    let zenith = vec3<f32>(0.18, 0.28, 0.55);
    let horizon = vec3<f32>(0.66, 0.74, 0.88);
    let blend = saturate(0.5 * dir.z + 0.5);
    return mix(zenith, horizon, blend);
}

// Replaces `t_max` with an effectively infinite value on every axis the
// ray is parallel to (direction component exactly zero), so the DDA
// never picks that axis. WGSL has no f32 infinity constant, so a very
// large finite value stands in for it. Other axes pass through unchanged.
fn shield_parallel(t_max: vec3<f32>, dir: vec3<f32>) -> vec3<f32> {
    let never = vec3<f32>(1.0e30);
    return select(t_max, never, dir == vec3<f32>(0.0));
}

// ---- main marcher ---------------------------------------------------------

// Compute entry point: traces one primary ray per pixel and writes the
// resulting colour to `output`.
//
// The ray is marched with a two-level Amanatides–Woo DDA. The outer
// level steps chunk by chunk; when the chunk-occupancy bitmap flags the
// current chunk, an inner DDA walks that chunk's voxels until it finds a
// solid voxel or leaves the chunk. A ray that hits nothing keeps the sky
// colour. All `t` values are distances along the normalized ray from
// `camera_pos`, in voxel units.
@compute @workgroup_size(8, 8)
fn render_grid(@builtin(global_invocation_id) gid: vec3<u32>) {
    // Invocations that fall outside the screen have no pixel to write.
    if (gid.x >= u.screen_size.x || gid.y >= u.screen_size.y) {
        return;
    }

    // Build the per-pixel ray through the pixel centre. `ndc_y_top_pos`
    // is +1 at the top of the screen, and the top maps to -camera_down.
    let aspect = f32(u.screen_size.x) / f32(u.screen_size.y);
    let half_h = tan(u.fov_y_rad * 0.5);
    let half_w = half_h * aspect;
    let ndc_x = (f32(gid.x) + 0.5) / f32(u.screen_size.x) * 2.0 - 1.0;
    let ndc_y_top_pos = 1.0 - (f32(gid.y) + 0.5) / f32(u.screen_size.y) * 2.0;
    let dir = normalize(
        u.camera_forward
        + ndc_x * half_w * u.camera_right
        - ndc_y_top_pos * half_h * u.camera_down
    );

    let chunk_dim = vec3<f32>(f32(u.vsid), f32(u.vsid), f32(CHUNK_Z));

    // Outer DDA setup in chunk-space (each chunk is a "cell").
    var p_chunk = vec3<i32>(floor(u.camera_pos / chunk_dim));
    let step_chunk = vec3<i32>(sign(dir));
    let t_delta_chunk = abs(chunk_dim / dir);
    let next_boundary_chunk = vec3<f32>(
        select(f32(p_chunk.x), f32(p_chunk.x + 1), step_chunk.x > 0) * chunk_dim.x,
        select(f32(p_chunk.y), f32(p_chunk.y + 1), step_chunk.y > 0) * chunk_dim.y,
        select(f32(p_chunk.z), f32(p_chunk.z + 1), step_chunk.z > 0) * chunk_dim.z,
    );
    var t_max_chunk = shield_parallel(
        (next_boundary_chunk - u.camera_pos) / dir,
        dir,
    );

    // Loop-invariant values hoisted out of the march.
    let grid_dims = vec3<i32>(u.chunks_dims);
    // Crossing one chunk takes at most vsid + vsid + CHUNK_Z voxel steps.
    // Never go below the fixed cap, but grow with `vsid` so larger chunks
    // are not abandoned half-way (which would skip solid voxels).
    let inner_step_cap = max(MAX_INNER_STEPS, 2u * u.vsid + CHUNK_Z);

    // `t_enter` = t at which the ray entered the current chunk. Used to
    // pick the inner DDA's voxel-entry point.
    var t_enter: f32 = 0.0;
    var hit_color = sky_color(dir);
    var done = false;

    for (var step: u32 = 0u; step < u.max_outer_steps; step = step + 1u) {
        // Early out. A chunk coordinate only ever moves in the direction
        // of `step_chunk`, so once the ray is outside the grid on some
        // axis and is not heading back on THAT axis, it can never be
        // inside the grid again and nothing further can be hit. A ray
        // that is outside but still approaching on that axis keeps
        // marching, so the rendered result is unaffected.
        let rel = p_chunk - u.origin_chunk;
        let gone_lo = (rel < vec3<i32>(0)) & (step_chunk <= vec3<i32>(0));
        let gone_hi = (rel >= grid_dims) & (step_chunk >= vec3<i32>(0));
        if (any(gone_lo | gone_hi)) {
            break;
        }

        let meta_id = meta_idx_of(p_chunk);
        if (chunk_has_content(meta_id)) {
            // Inner DDA bounded to this chunk.
            // Voxel coords inside the chunk. Compute from the ray's
            // entry position; clamp to the chunk's [0, vsid) ×
            // [0, vsid) × [0, CHUNK_Z) range to absorb float-rounding
            // error at chunk boundaries.
            let entry_world = u.camera_pos + t_enter * dir;
            let chunk_origin_world = vec3<f32>(p_chunk) * chunk_dim;
            let entry_in_chunk = entry_world - chunk_origin_world;
            var p_voxel = vec3<i32>(floor(entry_in_chunk));
            p_voxel = clamp(
                p_voxel,
                vec3<i32>(0),
                vec3<i32>(i32(u.vsid - 1u), i32(u.vsid - 1u), i32(CHUNK_Z - 1u)),
            );

            // Voxel-level DDA. `t_max_voxel` is measured from
            // `camera_pos`, the same origin as the chunk-level values.
            let next_voxel_world = vec3<f32>(
                select(f32(p_voxel.x), f32(p_voxel.x + 1), step_chunk.x > 0)
                    + chunk_origin_world.x,
                select(f32(p_voxel.y), f32(p_voxel.y + 1), step_chunk.y > 0)
                    + chunk_origin_world.y,
                select(f32(p_voxel.z), f32(p_voxel.z + 1), step_chunk.z > 0)
                    + chunk_origin_world.z,
            );
            var t_max_voxel = shield_parallel(
                (next_voxel_world - u.camera_pos) / dir,
                dir,
            );
            let t_delta_voxel = abs(1.0 / dir);

            for (var iv: u32 = 0u; iv < inner_step_cap; iv = iv + 1u) {
                if (voxel_solid(u32(meta_id), p_voxel)) {
                    hit_color = voxel_color(u32(meta_id), p_voxel);
                    done = true;
                    break;
                }
                // Step voxel; if we leave the chunk, return to the
                // outer DDA. Use the smallest-`t_max` axis like the
                // standard Amanatides–Woo.
                if (t_max_voxel.x < t_max_voxel.y && t_max_voxel.x < t_max_voxel.z) {
                    p_voxel.x = p_voxel.x + step_chunk.x;
                    t_max_voxel.x = t_max_voxel.x + t_delta_voxel.x;
                    if (p_voxel.x < 0 || u32(p_voxel.x) >= u.vsid) {
                        break;
                    }
                } else if (t_max_voxel.y < t_max_voxel.z) {
                    p_voxel.y = p_voxel.y + step_chunk.y;
                    t_max_voxel.y = t_max_voxel.y + t_delta_voxel.y;
                    if (p_voxel.y < 0 || u32(p_voxel.y) >= u.vsid) {
                        break;
                    }
                } else {
                    p_voxel.z = p_voxel.z + step_chunk.z;
                    t_max_voxel.z = t_max_voxel.z + t_delta_voxel.z;
                    if (p_voxel.z < 0 || u32(p_voxel.z) >= CHUNK_Z) {
                        break;
                    }
                }
            }
            if (done) {
                break;
            }
        }

        // Outer step: advance to the next chunk along the ray, crossing
        // whichever chunk boundary is nearest.
        if (t_max_chunk.x < t_max_chunk.y && t_max_chunk.x < t_max_chunk.z) {
            t_enter = t_max_chunk.x;
            p_chunk.x = p_chunk.x + step_chunk.x;
            t_max_chunk.x = t_max_chunk.x + t_delta_chunk.x;
        } else if (t_max_chunk.y < t_max_chunk.z) {
            t_enter = t_max_chunk.y;
            p_chunk.y = p_chunk.y + step_chunk.y;
            t_max_chunk.y = t_max_chunk.y + t_delta_chunk.y;
        } else {
            t_enter = t_max_chunk.z;
            p_chunk.z = p_chunk.z + step_chunk.z;
            t_max_chunk.z = t_max_chunk.z + t_delta_chunk.z;
        }
    }

    textureStore(output, vec2<i32>(gid.xy), vec4<f32>(hit_color, 1.0));
}