vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
// Portable GPU kernel for string_similarity.ngram_extract.
//
// Bindings:
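// 0 params: len_a (presumably input length in bytes), len_b (n-gram length in bytes),
//           param_c (output stride in u32 words per n-gram), param_d (unused in this kernel)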
// 1 input_words: little-endian packed bytes
// 2 output_words: each n-gram copied into output_stride_words packed u32 words

// NOTE(review): binding 0 (`params`) is referenced by the entry point but is not
// declared in this view — presumably supplied elsewhere; confirm before compiling standalone.

// Binding 1: read-only source buffer; bytes are unpacked from its u32 words by byte_at.
@group(0) @binding(1) var<storage, read> input_words: array<u32>;
// Binding 2: read-write destination buffer; bytes are packed into its u32 words by store_byte.
@group(0) @binding(2) var<storage, read_write> output_words: array<u32>;

// Returns the byte stored at byte offset `index` of the packed input buffer.
// Byte 0 of each word occupies the lowest 8 bits (little-endian packing).
fn byte_at(index: u32) -> u32 {
    // Four bytes per word: the remainder picks the byte lane, the quotient the word.
    let bit_offset = (index % 4u) * 8u;
    let packed = input_words[index / 4u];
    return (packed >> bit_offset) & 255u;
}

// Writes the low 8 bits of `byte` into the output buffer at byte position
// `offset`, counted from the start of word `base_word`.
// The other three bytes of the touched word are preserved. The update is a
// plain (non-atomic) read-modify-write of one u32.
fn store_byte(base_word: u32, offset: u32, byte: u32) {
    let slot = base_word + offset / 4u;
    let bit_offset = (offset % 4u) * 8u;
    // Mask that keeps every byte lane except the one being replaced.
    let keep = ~(0xffu << bit_offset);
    let lane = (byte & 0xffu) << bit_offset;
    output_words[slot] = (output_words[slot] & keep) | lane;
}

// Entry point: invocation `id.x` copies the n-gram that starts at input byte
// `id.x` (length `params.len_b` bytes) into the output slot beginning at word
// `id.x * params.param_c`.
//
// The kernel writes nothing when:
//   - len_b is zero, len_b exceeds len_a, or the stride param_c is zero;
//   - an n-gram of len_b bytes does not fit inside param_c words;
//   - id.x is past the last n-gram start (len_a - len_b).
@compute @workgroup_size(256, 1, 1)
fn string_similarity_ngram_extract(@builtin(global_invocation_id) id: vec3<u32>) {
    if (params.len_b == 0u || params.len_b > params.len_a || params.param_c == 0u) {
        return;
    }
    // An n-gram occupies ((len_b - 1) >> 2) + 1 words. If that exceeds the
    // stride, its tail would land in the next n-gram's words, which another
    // invocation is updating with a non-atomic read-modify-write. Reject the
    // configuration instead of producing corrupted, racy output.
    // len_b != 0 was established above, so the subtraction cannot wrap.
    if (((params.len_b - 1u) >> 2u) >= params.param_c) {
        return;
    }

    let gram_index = id.x;
    // len_b <= len_a was established above, so this subtraction cannot wrap.
    // Valid start positions are 0 ..= len_a - len_b.
    if (gram_index > params.len_a - params.len_b) {
        return;
    }

    // First output word of this invocation's slot.
    let base = gram_index * params.param_c;
    for (var offset = 0u; offset < params.len_b; offset = offset + 1u) {
        store_byte(base, offset, byte_at(gram_index + offset));
    }
}