vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
// Read-only storage buffer of 64-bit words; the entry point in this file reads
// every element (its length is taken with arrayLength).
@group(0) @binding(0) var<storage, read> input_words: array<u64>;
// Read-write storage buffer; the entry point in this file writes only element 0.
@group(0) @binding(1) var<storage, read_write> output_words: array<u64>;

// 64-bit multiplier/addend constants shared by the xxh64_* helpers and the
// entry point. The values match PRIME64_1 .. PRIME64_5 of the reference
// xxHash64 algorithm.
const XXH64_P1: u64 = 0x9e3779b185ebca87u;
const XXH64_P2: u64 = 0xc2b2ae3d27d4eb4fu;
const XXH64_P3: u64 = 0x165667b19e3779f9u;
const XXH64_P4: u64 = 0x85ebca77c2b2ae63u;
const XXH64_P5: u64 = 0x27d4eb2f165667c5u;

// Rotates the 64-bit value `x` left by `n` bit positions.
// The complementary right-shift amount is masked to 0..63, so n == 0 yields
// a right shift of 0 and the function returns `x` unchanged.
fn xxh64_rotl(x: u64, n: u32) -> u64 {
  let wrap_shift = (64u - n) & 63u;
  let upper = x << n;
  let lower = x >> wrap_shift;
  return upper | lower;
}
// One accumulator round: adds `lane * XXH64_P2` to `acc`, rotates the sum left
// by 31 bits, then multiplies by XXH64_P1.
fn xxh64_round(acc: u64, lane: u64) -> u64 {
  let mixed = acc + lane * XXH64_P2;
  let rotated = xxh64_rotl(mixed, 31u);
  return rotated * XXH64_P1;
}
// Folds one lane value into `acc`: the lane is first passed through a round
// with a zero accumulator, XORed into `acc`, and the result is scaled by
// XXH64_P1 and offset by XXH64_P4.
fn xxh64_merge(acc: u64, lane: u64) -> u64 {
  let folded = acc ^ xxh64_round(0u, lane);
  return folded * XXH64_P1 + XXH64_P4;
}
// Final bit-mixing step: three xor-with-right-shift stages (33, 29, 32 bits)
// with multiplications by XXH64_P2 and XXH64_P3 between them.
fn xxh64_avalanche(x_in: u64) -> u64 {
  let stage1 = (x_in ^ (x_in >> 33u)) * XXH64_P2;
  let stage2 = (stage1 ^ (stage1 >> 29u)) * XXH64_P3;
  return stage2 ^ (stage2 >> 32u);
}

// Entry point: hashes every 64-bit word of `input_words` with the XXH64
// construction (the accumulators start from the values XXH64 uses for a zero
// seed) and stores the 64-bit digest in `output_words[0]`.
//
// The computation is sequential, so only the invocation with id.x == 0 does
// any work; all other invocations return immediately.
//
// The length mixed into the hash is `len * 8`, i.e. the input is treated as a
// whole number of 8-byte lanes; there is no handling of trailing partial words.
// NOTE(review): the digest matches XXH64 of a byte stream only if each word
// holds 8 input bytes in little-endian order — confirm against the host-side
// packing.
@compute @workgroup_size(1, 1, 1)
fn hash_xxhash64(@builtin(global_invocation_id) id: vec3<u32>) {
  if (id.x != 0u) { return; }
  let len = arrayLength(&input_words);
  var index = 0u;
  var acc: u64;
  if (len >= 4u) {
    // Four lane accumulators. They are annotated explicitly so that the `0u`
    // initialisers cannot be inferred as 32-bit values.
    var v1: u64 = XXH64_P1 + XXH64_P2;
    var v2: u64 = XXH64_P2;
    var v3: u64 = 0u;
    var v4: u64 = 0u - XXH64_P1;
    // Consume full 4-word stripes. `index <= len` always holds here, so the
    // subtraction cannot underflow, and unlike `index + 4u <= len` it cannot
    // wrap around when `len` is close to the u32 maximum.
    while (len - index >= 4u) {
      v1 = xxh64_round(v1, input_words[index]);
      v2 = xxh64_round(v2, input_words[index + 1u]);
      v3 = xxh64_round(v3, input_words[index + 2u]);
      v4 = xxh64_round(v4, input_words[index + 3u]);
      index = index + 4u;
    }
    // Converge the four lanes into a single accumulator.
    acc = xxh64_rotl(v1, 1u) + xxh64_rotl(v2, 7u) + xxh64_rotl(v3, 12u) + xxh64_rotl(v4, 18u);
    acc = xxh64_merge(acc, v1);
    acc = xxh64_merge(acc, v2);
    acc = xxh64_merge(acc, v3);
    acc = xxh64_merge(acc, v4);
  } else {
    // Fewer than one full stripe: start from the short-input constant.
    acc = XXH64_P5;
  }
  // Mix in the input length in bytes (8 per word). Widen to u64 before scaling
  // so the byte count cannot wrap in 32 bits for len >= 2^29.
  acc = acc + (u64(len) << 3u);
  // Remaining 0..3 words that did not fill a stripe.
  while (index < len) {
    acc = acc ^ xxh64_round(0u, input_words[index]);
    acc = xxh64_rotl(acc, 27u) * XXH64_P1 + XXH64_P4;
    index = index + 1u;
  }
  output_words[0u] = xxh64_avalanche(acc);
}