vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
// Portable GPU kernel for string_matching.substring_find_all.
// Bindings: 0 params(len_a, len_b, param_c), 1 haystack, 2 needle, 3 atomic count, 4 offsets.
// Haystack bytes, packed four per u32 word. Read via packed_byte(), which
// takes byte 0 from the lowest 8 bits of each word.
@group(0) @binding(1) var<storage, read> haystack_words: array<u32>;
// Needle bytes, packed the same way as the haystack.
@group(0) @binding(2) var<storage, read> needle_words: array<u32>;
// Running match counter. emit() increments it once per match, including
// matches whose offset is not stored because the slot is >= params.param_c.
// NOTE(review): presumably zeroed by the host before dispatch — confirm.
@group(0) @binding(3) var<storage, read_write> match_count: atomic<u32>;
// Start offsets of matches. Each is written at the slot index returned by the
// atomic increment in emit(); only slots below params.param_c are written.
@group(0) @binding(4) var<storage, read_write> offsets: array<u32>;
// Extracts the byte at byte-index `index` from a buffer of u32 words holding
// four bytes each. Byte 0 of a word occupies its lowest 8 bits.
//
// words: read-only storage array of packed words.
// index: byte index into the packed buffer.
// Returns the selected byte as a u32 in the range 0..255.
//
// NOTE(review): a pointer parameter in the storage address space relies on
// WGSL's `unrestricted_pointer_parameters` language feature — confirm every
// shader compiler this kernel targets accepts it.
fn packed_byte(words: ptr<storage, array<u32>, read>, index: u32) -> u32 {
    let word_index = index / 4u;
    let bit_offset = (index % 4u) * 8u;
    let word = (*words)[word_index];
    return (word >> bit_offset) & 255u;
}
// Reports whether the needle occurs in the haystack beginning at byte offset
// `start`. Compares params.len_b bytes one at a time and stops at the first
// mismatch. Returns true when params.len_b is zero, since nothing is compared.
//
// The caller must ensure start + params.len_b does not run past the haystack;
// no bounds check is performed here.
fn matches_at(start: u32) -> bool {
    for (var k = 0u; k < params.len_b; k += 1u) {
        let hay_byte = packed_byte(&haystack_words, start + k);
        let needle_byte = packed_byte(&needle_words, k);
        if (hay_byte != needle_byte) {
            return false;
        }
    }
    return true;
}
// Records one match. The atomic counter is always incremented, so match_count
// reflects every match found; the offset itself is stored only when the
// claimed slot index is below params.param_c.
fn emit(offset: u32) {
    let claimed = atomicAdd(&match_count, 1u);
    if (claimed >= params.param_c) {
        return;
    }
    offsets[claimed] = offset;
}
// Entry point: each invocation tests one candidate start offset (its global
// x id) and emits it if the needle matches there.
//
// params.len_a is used as the haystack byte length and params.len_b as the
// needle byte length.
//   - Empty needle: every offset in 0..=len_a is emitted.
//   - Needle longer than haystack: nothing is emitted.
//   - Otherwise: offsets in 0..=(len_a - len_b) are compared byte by byte.
@compute @workgroup_size(256, 1, 1)
fn string_matching_substring_find_all(@builtin(global_invocation_id) id: vec3<u32>) {
    let candidate = id.x;
    let haystack_len = params.len_a;
    let needle_len = params.len_b;

    if (needle_len == 0u) {
        if (candidate <= haystack_len) {
            emit(candidate);
        }
        return;
    }

    // Checked before the subtraction below so it cannot wrap around.
    if (needle_len > haystack_len) {
        return;
    }

    let last_start = haystack_len - needle_len;
    if (candidate > last_start) {
        return;
    }
    if (matches_at(candidate)) {
        emit(candidate);
    }
}