vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
struct ByteHistogramParams {
    byte_len: u32,
    reserved0: u32,
    reserved1: u32,
    reserved2: u32,
}

@group(0) @binding(0) var<uniform> params: ByteHistogramParams;
@group(0) @binding(1) var<storage, read> input_words: array<u32>;
@group(0) @binding(2) var<storage, read_write> histogram_counts: array<u32>;

var<workgroup> tile_bytes: array<u32, 256>;
var<workgroup> tile_histogram: array<atomic<u32>, 256>;

@compute @workgroup_size(256, 1, 1)
fn stats_byte_histogram(
    @builtin(global_invocation_id) id: vec3<u32>,
    @builtin(local_invocation_id) local_id: vec3<u32>,
) {
    let lane = local_id.x;
    atomicStore(&tile_histogram[lane], 0u);
    workgroupBarrier();

    var tile_start = 0u;
    loop {
        if (tile_start >= params.byte_len) {
            break;
        }

        let input_index = tile_start + lane;
        if (input_index < params.byte_len) {
            tile_bytes[lane] = vyre_packed_byte(&input_words, input_index);
        } else {
            tile_bytes[lane] = 0u;
        }
        workgroupBarrier();

        if (input_index < params.byte_len) {
            atomicAdd(&tile_histogram[tile_bytes[lane]], 1u);
        }
        workgroupBarrier();

        tile_start = tile_start + 256u;
    }

    histogram_counts[id.x] = atomicLoad(&tile_histogram[lane]);
}