vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
// WGSL lowering for `workgroup.visitor` — a workgroup-scoped mark
// and emit visitor backed by a shared bitmap and a bounded emit
// buffer. The invariant is "every first-visit writes one emit
// slot; subsequent visits are no-ops". CAS-style atomic ORs over
// the visited bitmap guarantee at-most-once emission across lanes.
//
// Commands:
//   op = 1 → visit(node) → result.status = 0 (first) | 1 (already),
//                          result.node   = node
//   op = 2 → emitted_len() → result.len = current emit_buffer length
//   op = 3 → reset() → clears the bitmap and the emit buffer
//
// Status words:
//   0 = first_visit, 1 = already_visited, 2 = out_of_range.

const VISITOR_UNIVERSE:     u32 = 256u;
const VISITOR_EMIT_CAPACITY: u32 = 256u;
const VISITOR_OP_VISIT:   u32 = 1u;
const VISITOR_OP_LEN:     u32 = 2u;
const VISITOR_OP_RESET:   u32 = 3u;
const VISITOR_FIRST_VISIT:    u32 = 0u;
const VISITOR_ALREADY_VISITED: u32 = 1u;
const VISITOR_OUT_OF_RANGE:    u32 = 2u;

struct VisitorCommand {
    op: u32,
    node: u32,
    reserved: u32,
    lane: u32,
}

struct VisitorResult {
    status: u32,
    node: u32,
    emitted_len: u32,
    reserved: u32,
}

@group(0) @binding(0) var<storage, read>       visitor_commands: array<VisitorCommand>;
@group(0) @binding(1) var<storage, read_write> visitor_results:  array<VisitorResult>;
@group(0) @binding(2) var<storage, read_write> visitor_emit_out: array<u32>;

var<workgroup> visitor_bitmap:   array<atomic<u32>, 8>; // 256 bits.
var<workgroup> visitor_emit_len: atomic<u32>;

fn bit_mask(node: u32) -> u32 {
    return 1u << (node % 32u);
}

fn bit_word(node: u32) -> u32 {
    return node / 32u;
}

@compute @workgroup_size(64, 1, 1)
fn workgroup_visitor_visit(@builtin(local_invocation_id) local_id: vec3<u32>) {
    let lane = local_id.x;
    if (lane == 0u) {
        for (var i: u32 = 0u; i < 8u; i = i + 1u) {
            atomicStore(&visitor_bitmap[i], 0u);
        }
        atomicStore(&visitor_emit_len, 0u);
    }
    workgroupBarrier();

    let command_count = arrayLength(&visitor_commands);
    var command_index = 0u;
    loop {
        if (command_index >= command_count) {
            break;
        }
        if (visitor_commands[command_index].lane == lane) {
            let cmd = visitor_commands[command_index];
            var result: VisitorResult;
            result.status = 0u;
            result.node   = cmd.node;
            result.emitted_len = 0u;
            result.reserved = 0u;

            switch (cmd.op) {
                case 1u: { // visit
                    if (cmd.node >= VISITOR_UNIVERSE) {
                        result.status = VISITOR_OUT_OF_RANGE;
                    } else {
                        let mask = bit_mask(cmd.node);
                        let word = bit_word(cmd.node);
                        let prev = atomicOr(&visitor_bitmap[word], mask);
                        if ((prev & mask) == 0u) {
                            let slot = atomicAdd(&visitor_emit_len, 1u);
                            if (slot < VISITOR_EMIT_CAPACITY) {
                                visitor_emit_out[slot] = cmd.node;
                            }
                            result.status = VISITOR_FIRST_VISIT;
                        } else {
                            result.status = VISITOR_ALREADY_VISITED;
                        }
                    }
                }
                case 2u: { // emitted_len
                    result.emitted_len = atomicLoad(&visitor_emit_len);
                }
                case 3u: { // reset
                    // Clear the visited bitmap.
                    for (var i: u32 = 0u; i < 8u; i = i + 1u) {
                        atomicStore(&visitor_bitmap[i], 0u);
                    }
                    // Audit fix (workgroup-primitives C3): zero the
                    // emit buffer contents, not just the length
                    // counter. Prior behavior left prior-dispatch
                    // node ids visible to any host that maps the
                    // visitor_emit_out binding directly, leaking
                    // info across dispatch boundaries. CPU
                    // WorkgroupVisitor::reset() already clears the
                    // Vec; the shader must match for byte-identical
                    // parity.
                    for (var i: u32 = 0u; i < VISITOR_EMIT_CAPACITY; i = i + 1u) {
                        visitor_emit_out[i] = 0u;
                    }
                    atomicStore(&visitor_emit_len, 0u);
                }
                default: {}
            }
            visitor_results[command_index] = result;
        }
        workgroupBarrier();
        command_index = command_index + 1u;
    }
}