vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
// WGSL lowering for `workgroup.string_interner` — a bounded FNV-1a
// string interner resident in workgroup SRAM.
//
// The interner keeps two parallel tables in shared memory:
//
//   var<workgroup> interner_slots:  array<SlotEntry, 256>;
//   var<workgroup> interner_bytes:  array<atomic<u32>, 512>;
//
// Each slot stores `(hash, byte_offset, byte_len)`. A symbol is the
// slot index. Intern commands first FNV-1a-hash their payload, probe
// the slot table linearly (open-addressing) until they find either a
// matching slot (existing symbol) or an empty slot (new symbol).
//
// Commands:
//   op = 1 → intern(bytes_offset, bytes_len) → result.symbol = id, status
//   op = 2 → lookup(symbol)                  → result.bytes_offset/len, status
//   op = 3 → len()                           → result.len = slot count
//
// Status words:
//   0 = ok, 1 = out_of_slots, 2 = out_of_bytes, 3 = unknown_symbol.
//
// The FNV-1a prime and offset basis are hard-coded to keep CPU/GPU
// parity byte-identical.

const INTERNER_SLOT_CAPACITY: u32 = 256u;
const INTERNER_BYTE_CAPACITY: u32 = 2048u;
const INTERNER_BYTE_WORD_CAPACITY: u32 = 512u;
const INTERNER_WORKGROUP_SIZE: u32 = 64u;
const FNV_OFFSET_BASIS: u32 = 0x811C9DC5u;
const FNV_PRIME:        u32 = 0x01000193u;

const INTERNER_OP_INTERN: u32 = 1u;
const INTERNER_OP_LOOKUP: u32 = 2u;
const INTERNER_OP_LEN:    u32 = 3u;

const INTERNER_OK:             u32 = 0u;
const INTERNER_OUT_OF_SLOTS:   u32 = 1u;
const INTERNER_OUT_OF_BYTES:   u32 = 2u;
const INTERNER_UNKNOWN_SYMBOL: u32 = 3u;

struct InternerCommand {
    op: u32,
    bytes_offset: u32,
    bytes_len: u32,
    symbol: u32,
}

struct InternerResult {
    status: u32,
    symbol: u32,
    bytes_offset: u32,
    bytes_len: u32,
}

@group(0) @binding(0) var<storage, read>       interner_commands: array<InternerCommand>;
@group(0) @binding(1) var<storage, read>       interner_source_bytes: array<u32>;
@group(0) @binding(2) var<storage, read_write> interner_results: array<InternerResult>;

struct SlotEntry {
    hash: u32,
    byte_offset: u32,
    byte_len: u32,
    reserved: u32,
}

var<workgroup> interner_slots: array<SlotEntry, 256>;
var<workgroup> interner_bytes: array<atomic<u32>, 512>;
var<workgroup> interner_slot_len:  atomic<u32>;
var<workgroup> interner_byte_len:  atomic<u32>;

fn read_source_byte(offset: u32) -> u32 {
    let word_idx = offset / 4u;
    let byte_idx = offset % 4u;
    return (interner_source_bytes[word_idx] >> (byte_idx * 8u)) & 0xFFu;
}

fn fnv1a(offset: u32, len: u32) -> u32 {
    var hash: u32 = FNV_OFFSET_BASIS;
    for (var i: u32 = 0u; i < len; i = i + 1u) {
        hash = hash ^ read_source_byte(offset + i);
        hash = hash * FNV_PRIME;
    }
    return hash;
}

fn compare_existing(offset_a: u32, len_a: u32, offset_b: u32, len_b: u32) -> bool {
    if (len_a != len_b) {
        return false;
    }
    for (var i: u32 = 0u; i < len_a; i = i + 1u) {
        let lhs_word = atomicLoad(&interner_bytes[(offset_a + i) / 4u]);
        let lhs = (lhs_word >> (((offset_a + i) % 4u) * 8u)) & 0xFFu;
        let rhs = read_source_byte(offset_b + i);
        if (lhs != rhs) {
            return false;
        }
    }
    return true;
}

@compute @workgroup_size(64, 1, 1)
fn workgroup_string_interner_intern(@builtin(local_invocation_id) local_id: vec3<u32>) {
    let lane = local_id.x;
    if (lane == 0u) {
        atomicStore(&interner_slot_len, 0u);
        atomicStore(&interner_byte_len, 0u);
    }
    for (var word_idx = lane; word_idx < INTERNER_BYTE_WORD_CAPACITY; word_idx = word_idx + INTERNER_WORKGROUP_SIZE) {
        atomicStore(&interner_bytes[word_idx], 0u);
    }
    workgroupBarrier();

    let command_count = arrayLength(&interner_commands);
    var command_index = 0u;
    loop {
        if (command_index >= command_count) {
            break;
        }
        if (interner_commands[command_index].symbol == lane) {
            let cmd = interner_commands[command_index];
            var result: InternerResult;
            result.status = INTERNER_OK;
            result.symbol = 0u;
            result.bytes_offset = 0u;
            result.bytes_len = 0u;

            switch (cmd.op) {
                case 1u: { // intern
                    let hash = fnv1a(cmd.bytes_offset, cmd.bytes_len);
                    let slot_count = atomicLoad(&interner_slot_len);
                    var found: u32 = 0xFFFFFFFFu;
                    for (var i: u32 = 0u; i < slot_count; i = i + 1u) {
                        if (interner_slots[i].hash == hash
                            && compare_existing(
                                interner_slots[i].byte_offset,
                                interner_slots[i].byte_len,
                                cmd.bytes_offset,
                                cmd.bytes_len,
                            )) {
                            found = i;
                            break;
                        }
                    }
                    if (found != 0xFFFFFFFFu) {
                        result.symbol = found;
                    } else if (slot_count >= INTERNER_SLOT_CAPACITY) {
                        result.status = INTERNER_OUT_OF_SLOTS;
                    } else {
                        let byte_len = atomicLoad(&interner_byte_len);
                        if (byte_len + cmd.bytes_len > INTERNER_BYTE_CAPACITY) {
                            result.status = INTERNER_OUT_OF_BYTES;
                        } else {
                            for (var i: u32 = 0u; i < cmd.bytes_len; i = i + 1u) {
                                let dst = byte_len + i;
                                let src = read_source_byte(cmd.bytes_offset + i);
                                let word_idx = dst / 4u;
                                let shift = (dst % 4u) * 8u;
                                atomicOr(&interner_bytes[word_idx], src << shift);
                            }
                            interner_slots[slot_count].hash = hash;
                            interner_slots[slot_count].byte_offset = byte_len;
                            interner_slots[slot_count].byte_len = cmd.bytes_len;
                            atomicStore(&interner_byte_len, byte_len + cmd.bytes_len);
                            atomicStore(&interner_slot_len, slot_count + 1u);
                            result.symbol = slot_count;
                        }
                    }
                }
                case 2u: { // lookup
                    let slot_count = atomicLoad(&interner_slot_len);
                    if (cmd.symbol >= slot_count) {
                        result.status = INTERNER_UNKNOWN_SYMBOL;
                    } else {
                        result.symbol = cmd.symbol;
                        result.bytes_offset = interner_slots[cmd.symbol].byte_offset;
                        result.bytes_len = interner_slots[cmd.symbol].byte_len;
                    }
                }
                case 3u: { // len
                    result.symbol = atomicLoad(&interner_slot_len);
                }
                default: {}
            }
            interner_results[command_index] = result;
        }
        workgroupBarrier();
        command_index = command_index + 1u;
    }
}