vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
//! XXH64 and XXH3-64 CPU reference implementations with seed 0.

// accumulate.rs
/// Folds one 64-byte stripe into the eight accumulator lanes.
///
/// For every lane `i` the little-endian 64-bit word at `stripe[i * 8..]` is
/// added (wrapping) to the neighbouring lane `i ^ 1`. The same word is XORed
/// with the secret word at `secret_offset + i * 8`, and the wrapping product
/// of the low and high 32-bit halves of that keyed value is added to lane `i`.
///
/// # Panics
///
/// Panics if `stripe` holds fewer than 64 bytes or if `secret_offset + 64`
/// reaches past the end of `SECRET`.
pub fn accumulate(acc: &mut [u64; 8], stripe: &[u8], secret_offset: usize) {
    for lane in 0..acc.len() {
        let byte_pos = lane * 8;
        let word = read64(stripe, byte_pos);
        let keyed = word ^ read_secret64(secret_offset + byte_pos);
        let low_half = keyed & 0xffff_ffff;
        let high_half = keyed >> 32;

        // The raw input word goes to the partner lane, the keyed product to this one.
        let partner = lane ^ 1;
        acc[partner] = acc[partner].wrapping_add(word);
        acc[lane] = acc[lane].wrapping_add(low_half.wrapping_mul(high_half));
    }
}

// accumulate_block.rs
/// Accumulates every complete 64-byte stripe of `block` into `acc`.
///
/// Stripe number `n` is keyed with the secret starting at byte `n * 8`.
/// Trailing bytes that do not fill a whole stripe are ignored.
///
/// # Panics
///
/// Panics (inside `accumulate`) when a stripe's secret window would extend
/// past the end of `SECRET`.
pub fn accumulate_block(acc: &mut [u64; 8], block: &[u8]) {
    block
        .chunks_exact(64)
        .enumerate()
        .for_each(|(stripe_index, stripe)| accumulate(acc, stripe, stripe_index * 8));
}

// avalanche.rs
/// Final bit mixer used by the XXH3 paths in this file.
///
/// Applies `x ^= x >> 37`, a wrapping multiply by `PMX1`, then `x ^= x >> 32`
/// and returns the result.
pub fn avalanche(x: u64) -> u64 {
    let folded = x ^ (x >> 37);
    let scaled = folded.wrapping_mul(PMX1);
    scaled ^ (scaled >> 32)
}

// avalanche_xxh64.rs
/// Final bit mixer of the XXH64 path (also used for the shortest XXH3 inputs).
///
/// Sequence: xor-shift by 33, wrapping multiply by `P64_2`, xor-shift by 29,
/// wrapping multiply by `P64_3`, xor-shift by 32.
pub fn avalanche_xxh64(x: u64) -> u64 {
    let first_fold = x ^ (x >> 33);
    let first_mul = first_fold.wrapping_mul(P64_2);
    let second_fold = first_mul ^ (first_mul >> 29);
    let second_mul = second_fold.wrapping_mul(P64_3);
    second_mul ^ (second_mul >> 32)
}

// consume_tail.rs
/// Absorbs the bytes of `input` from `index` onward into `acc` and finalizes.
///
/// The remaining bytes are consumed in three stages: as many 8-byte
/// little-endian words as fit, then at most one 4-byte word, then single
/// bytes. The running value is finally passed through `avalanche_xxh64`.
///
/// When `index` is at or beyond `input.len()` nothing is absorbed and the
/// result is simply `avalanche_xxh64(acc)`.
pub fn consume_tail(input: &[u8], index: usize, acc: u64) -> u64 {
    let len = input.len();
    let mut cursor = index;
    let mut hash = acc;

    // Stage 1: whole 8-byte words.
    while len >= cursor + 8 {
        let word = read64(input, cursor);
        hash = (hash ^ round(0, word))
            .rotate_left(27)
            .wrapping_mul(P64_1)
            .wrapping_add(P64_4);
        cursor += 8;
    }

    // Stage 2: at most one 4-byte word.
    if len >= cursor + 4 {
        let word = u64::from(read32(input, cursor));
        hash = (hash ^ word.wrapping_mul(P64_1))
            .rotate_left(23)
            .wrapping_mul(P64_2)
            .wrapping_add(P64_3);
        cursor += 4;
    }

    // Stage 3: whatever single bytes are left.
    for &byte in input.iter().skip(cursor) {
        hash = (hash ^ u64::from(byte).wrapping_mul(P64_5))
            .rotate_left(11)
            .wrapping_mul(P64_1);
    }

    avalanche_xxh64(hash)
}

// final_merge.rs
/// Collapses the eight accumulator lanes into a single 64-bit hash.
///
/// Lanes are taken in adjacent pairs. Each lane of pair `p` is XORed with a
/// secret word (at `secret_offset + p * 16` and eight bytes later), the two
/// keyed lanes are multiplied as 128-bit values, and the low and high halves
/// of the product are XORed together. The four folded products are added
/// (wrapping) onto `init`, and the sum is passed through `avalanche`.
///
/// # Panics
///
/// Panics if `secret_offset + 64` reaches past the end of `SECRET`.
pub fn final_merge(acc: &[u64; 8], init: u64, secret_offset: usize) -> u64 {
    let total = acc
        .chunks_exact(2)
        .enumerate()
        .fold(init, |sum, (pair_index, pair)| {
            let key_pos = secret_offset + pair_index * 16;
            let lhs = (pair[0] ^ read_secret64(key_pos)) as u128;
            let rhs = (pair[1] ^ read_secret64(key_pos + 8)) as u128;
            let wide = lhs.wrapping_mul(rhs);
            let folded = (wide as u64) ^ ((wide >> 64) as u64);
            sum.wrapping_add(folded)
        });
    avalanche(total)
}

// merge_acc.rs
/// Merges one XXH64 lane into the converged accumulator.
///
/// `lane` is first passed through `round` with a zero accumulator, XORed into
/// `acc`, and the result is multiplied by `P64_1` and offset by `P64_4`
/// (both wrapping).
pub fn merge_acc(acc: u64, lane: u64) -> u64 {
    let mixed_lane = round(0, lane);
    let combined = acc ^ mixed_lane;
    combined.wrapping_mul(P64_1).wrapping_add(P64_4)
}

// mix16.rs
/// Mixes 16 input bytes with 16 secret bytes into one 64-bit value.
///
/// The two little-endian 64-bit words at `input[offset..]` are XORed with the
/// secret words at `secret_offset` and `secret_offset + 8`, multiplied as
/// 128-bit values, and the low and high halves of the product are XORed.
///
/// # Panics
///
/// Panics if `input` is shorter than `offset + 16` bytes or if
/// `secret_offset + 16` reaches past the end of `SECRET`.
pub fn mix16(input: &[u8], offset: usize, secret_offset: usize) -> u64 {
    let low_word = read64(input, offset);
    let low_key = read_secret64(secret_offset);
    let high_word = read64(input, offset + 8);
    let high_key = read_secret64(secret_offset + 8);

    let wide = ((low_word ^ low_key) as u128).wrapping_mul((high_word ^ high_key) as u128);
    let lower = wide as u64;
    let upper = (wide >> 64) as u64;
    lower ^ upper
}

// p32_1.rs
/// 32-bit multiplier stored in a `u64`.
///
/// Used as the per-lane multiplier in `scramble` and as the last lane of the
/// initial accumulator in `xxh3_large`.
// NOTE(review): value looks like upstream xxHash `XXH_PRIME32_1` — confirm against the reference.
pub const P32_1: u64 = 0x9e37_79b1;

// p32_2.rs
/// 32-bit constant stored in a `u64`; seeds lane 5 of the initial accumulator
/// in `xxh3_large`.
// NOTE(review): value looks like upstream xxHash `XXH_PRIME32_2` — confirm against the reference.
pub const P32_2: u64 = 0x85eb_ca77;

// p32_3.rs
/// 32-bit constant stored in a `u64`; seeds lane 0 of the initial accumulator
/// in `xxh3_large`.
// NOTE(review): value looks like upstream xxHash `XXH_PRIME32_3` — confirm against the reference.
pub const P32_3: u64 = 0xc2b2_ae3d;

// p64_1.rs
/// 64-bit multiplier used by `round`, `merge_acc` and `consume_tail`, as the
/// length multiplier that seeds the XXH3 mid/long paths, and in the lane
/// initialisation of `xxhash64`.
// NOTE(review): P64_1..P64_5 look like upstream xxHash `XXH_PRIME64_1..5` — confirm against the reference.
pub const P64_1: u64 = 0x9e37_79b1_85eb_ca87;

// p64_2.rs
/// 64-bit multiplier applied to the input lane in `round`; also used by
/// `avalanche_xxh64`, the 4-byte step of `consume_tail`, and the lane
/// initialisation of `xxhash64`.
pub const P64_2: u64 = 0xc2b2_ae3d_27d4_eb4f;

// p64_3.rs
/// 64-bit constant used as the second multiplier in `avalanche_xxh64` and as
/// the addend of the 4-byte step in `consume_tail`.
pub const P64_3: u64 = 0x1656_67b1_9e37_79f9;

// p64_4.rs
/// 64-bit addend used by `merge_acc` and by the 8-byte step of `consume_tail`.
pub const P64_4: u64 = 0x85eb_ca77_c2b2_ae63;

// p64_5.rs
/// 64-bit constant used as the per-byte multiplier in `consume_tail` and as
/// the starting accumulator of `xxhash64` for inputs shorter than 32 bytes.
pub const P64_5: u64 = 0x27d4_eb2f_1656_67c5;

// pmx1.rs
/// Multiplier of the XXH3 finalizer `avalanche`.
// NOTE(review): value looks like upstream xxHash `PRIME_MX1` — confirm against the reference.
pub const PMX1: u64 = 0x1656_6791_9e37_79f9;

// pmx2.rs
/// Multiplier applied twice by the 4-to-8-byte path `xxh3_len_4_to_8`.
// NOTE(review): value looks like upstream xxHash `PRIME_MX2` — confirm against the reference.
pub const PMX2: u64 = 0x9fb2_1c65_1e98_df25;

// read32.rs
/// Reads a little-endian `u32` from `bytes` starting at `offset`.
///
/// # Panics
///
/// Panics if fewer than four bytes are available at `offset`.
pub fn read32(bytes: &[u8], offset: usize) -> u32 {
    let mut word = [0u8; 4];
    word.copy_from_slice(&bytes[offset..offset + 4]);
    u32::from_le_bytes(word)
}

// read64.rs
/// Reads a little-endian `u64` from `bytes` starting at `offset`.
///
/// # Panics
///
/// Panics if fewer than eight bytes are available at `offset`.
pub fn read64(bytes: &[u8], offset: usize) -> u64 {
    let mut word = [0u8; 8];
    word.copy_from_slice(&bytes[offset..offset + 8]);
    u64::from_le_bytes(word)
}

// read_secret32.rs
/// Reads a little-endian `u32` from `SECRET` at byte position `offset`.
///
/// # Panics
///
/// Panics if `offset + 4` reaches past the end of `SECRET`.
pub fn read_secret32(offset: usize) -> u32 {
    let secret: &[u8] = &SECRET;
    read32(secret, offset)
}

// read_secret64.rs
/// Reads a little-endian `u64` from `SECRET` at byte position `offset`.
///
/// # Panics
///
/// Panics if `offset + 8` reaches past the end of `SECRET`.
pub fn read_secret64(offset: usize) -> u64 {
    let secret: &[u8] = &SECRET;
    read64(secret, offset)
}

// round.rs
/// One XXH64 lane round.
///
/// Adds `lane * P64_2` to `acc`, rotates the sum left by 31 bits and
/// multiplies by `P64_1`; all arithmetic wraps.
pub fn round(acc: u64, lane: u64) -> u64 {
    let scaled_lane = lane.wrapping_mul(P64_2);
    let summed = acc.wrapping_add(scaled_lane);
    summed.rotate_left(31).wrapping_mul(P64_1)
}

// scramble.rs
/// Scrambles all eight accumulator lanes in place.
///
/// Each lane is xor-shifted right by 47, XORed with the matching 64-bit word
/// from the last 64 bytes of `SECRET`, and multiplied (wrapping) by `P32_1`.
pub fn scramble(acc: &mut [u64; 8]) {
    let key_base = SECRET.len() - 64;
    for (lane, slot) in acc.iter_mut().enumerate() {
        let key = read_secret64(key_base + lane * 8);
        let folded = *slot ^ (*slot >> 47);
        *slot = (folded ^ key).wrapping_mul(P32_1);
    }
}

// secret.rs
/// The 192-byte default secret consumed by the XXH3 routines in this file.
///
/// It is read through `read_secret32` / `read_secret64`. `scramble` uses the
/// final 64 bytes, and `xxh3_large` derives its block size from
/// `SECRET.len() - 64`, so changing the length changes the hash layout.
// NOTE(review): bytes are expected to equal the upstream xxHash default secret (`kSecret`) — confirm against the reference before editing.
pub const SECRET: [u8; 192] = [
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
];

// xxh3_large.rs
/// XXH3-64 long-input path; `xxhash3_64` dispatches here for inputs longer
/// than 240 bytes.
///
/// The input is processed in blocks of `(SECRET.len() - 64) / 8` stripes of
/// 64 bytes each. Every block that is followed by at least one more input
/// byte is accumulated and then scrambled. The whole stripes of the remaining
/// data are accumulated next (never counting the very last byte), and the
/// final 64 bytes of the input are accumulated once more with a secret window
/// starting at `SECRET.len() - 71`. The lanes are merged by `final_merge`
/// seeded with `len * P64_1`.
///
/// # Panics
///
/// Panics if `input` is shorter than 64 bytes.
pub fn xxh3_large(input: &[u8]) -> u64 {
    const STRIPE_LEN: usize = 64;

    let len = input.len();
    let block_len = ((SECRET.len() - STRIPE_LEN) / 8) * STRIPE_LEN;
    let mut lanes = [P32_3, P64_1, P64_2, P64_3, P64_4, P32_2, P64_5, P32_1];

    // Full blocks: only those strictly followed by more input.
    let mut consumed = 0usize;
    while consumed + block_len < len {
        accumulate_block(&mut lanes, &input[consumed..consumed + block_len]);
        scramble(&mut lanes);
        consumed += block_len;
    }

    // Whole stripes of the last (partial) block, excluding the final byte.
    let tail = &input[consumed..];
    let whole_stripes = (tail.len() - 1) / STRIPE_LEN;
    for (stripe_index, stripe) in tail
        .chunks_exact(STRIPE_LEN)
        .take(whole_stripes)
        .enumerate()
    {
        accumulate(&mut lanes, stripe, stripe_index * 8);
    }

    // The last 64 bytes are always mixed in, possibly overlapping earlier stripes.
    accumulate(&mut lanes, &input[len - STRIPE_LEN..], SECRET.len() - 71);

    final_merge(&lanes, (len as u64).wrapping_mul(P64_1), 11)
}

// xxh3_len_129_to_240.rs
/// XXH3-64 path that `xxhash3_64` selects for inputs of 129 to 240 bytes.
///
/// Starting from `len * P64_1`, the first eight 16-byte chunks are mixed with
/// secret offsets `0, 16, …, 112` and the sum is avalanched. The remaining
/// whole chunks are mixed with secret offsets `3, 19, …`, and finally the last
/// 16 input bytes are mixed with secret offset 119 before a second avalanche.
///
/// # Panics
///
/// Panics if `input` is shorter than 128 bytes, or long enough that a secret
/// read would fall outside `SECRET`.
pub fn xxh3_len_129_to_240(input: &[u8]) -> u64 {
    let len = input.len();
    let chunk_count = len >> 4;
    let seeded = (len as u64).wrapping_mul(P64_1);

    let head = (0usize..8).fold(seeded, |sum, chunk| {
        sum.wrapping_add(mix16(input, chunk * 16, chunk * 16))
    });

    let body = (8usize..chunk_count).fold(avalanche(head), |sum, chunk| {
        sum.wrapping_add(mix16(input, chunk * 16, (chunk - 8) * 16 + 3))
    });

    let last = mix16(input, len - 16, 119);
    avalanche(body.wrapping_add(last))
}

// xxh3_len_17_to_128.rs
/// XXH3-64 path that `xxhash3_64` selects for inputs of 17 to 128 bytes.
///
/// The input is covered by `((len - 1) >> 5) + 1` pairs of 16-byte windows.
/// Pair `k` takes one window `k * 16` bytes from the front and one window
/// ending `k * 16` bytes before the end, keyed with secret offsets `k * 32`
/// and `k * 32 + 16`. All mixes are added (wrapping) onto `len * P64_1` and
/// the sum is avalanched.
///
/// # Panics
///
/// Panics if `input` is too short for a window read, or long enough that a
/// secret read would fall outside `SECRET`.
pub fn xxh3_len_17_to_128(input: &[u8]) -> u64 {
    let len = input.len();
    let mut sum = (len as u64).wrapping_mul(P64_1);

    // Walk the pairs from the innermost one outwards.
    let mut pair = ((len - 1) >> 5) + 1;
    while pair > 0 {
        pair -= 1;
        let front = mix16(input, pair * 16, pair * 32);
        let back = mix16(input, len - pair * 16 - 16, pair * 32 + 16);
        sum = sum.wrapping_add(front).wrapping_add(back);
    }

    avalanche(sum)
}

// xxh3_len_1_to_3.rs
/// XXH3-64 path that `xxhash3_64` selects for inputs of 1 to 3 bytes.
///
/// Packs the last byte (bits 0..8), the length (bits 8..), the first byte
/// (bits 16..) and the middle byte `input[len / 2]` (bits 24..) into one
/// 32-bit word, XORs it with a key built from the first two 32-bit secret
/// words, and finishes with `avalanche_xxh64`.
///
/// # Panics
///
/// Panics if `input` is empty.
pub fn xxh3_len_1_to_3(input: &[u8]) -> u64 {
    let len = input.len();
    let last = u32::from(input[len - 1]);
    let first = u32::from(input[0]);
    let middle = u32::from(input[len >> 1]);

    let packed = (middle << 24) | (first << 16) | ((len as u32) << 8) | last;
    let key = read_secret32(0) ^ read_secret32(4);

    avalanche_xxh64(u64::from(key) ^ u64::from(packed))
}

// xxh3_len_4_to_8.rs
/// XXH3-64 path that `xxhash3_64` selects for inputs of 4 to 8 bytes.
///
/// The first four bytes become the high half and the last four bytes the low
/// half of a 64-bit word (the two reads overlap for inputs shorter than
/// eight bytes). The word is XORed with a key built from the secret words at
/// offsets 8 and 16, then mixed with two rotations, two multiplications by
/// `PMX2` and two xor-shifts, with the input length folded in between.
///
/// # Panics
///
/// Panics if `input` is shorter than four bytes.
pub fn xxh3_len_4_to_8(input: &[u8]) -> u64 {
    let len = input.len();
    let head = u64::from(read32(input, 0));
    let tail = u64::from(read32(input, len - 4));
    let key = read_secret64(8) ^ read_secret64(16);

    let keyed = key ^ ((head << 32) | tail);
    let spread = keyed ^ (keyed.rotate_left(49) ^ keyed.rotate_left(24));
    let first_mul = spread.wrapping_mul(PMX2);
    let with_len = first_mul ^ ((first_mul >> 35).wrapping_add(len as u64));
    let second_mul = with_len.wrapping_mul(PMX2);
    second_mul ^ (second_mul >> 28)
}

// xxh3_len_9_to_16.rs
/// XXH3-64 path that `xxhash3_64` selects for inputs of 9 to 16 bytes.
///
/// The first and last eight input bytes (overlapping for inputs shorter than
/// sixteen bytes) are each XORed with a key built from two secret words. The
/// result is the avalanche of the wrapping sum of: the byte-swapped low word,
/// the high word, the folded 128-bit product of both words, and the length.
///
/// # Panics
///
/// Panics if `input` is shorter than eight bytes.
pub fn xxh3_len_9_to_16(input: &[u8]) -> u64 {
    let len = input.len();
    let low_key = read_secret64(24) ^ read_secret64(32);
    let low = low_key ^ read64(input, 0);
    let high_key = read_secret64(40) ^ read_secret64(48);
    let high = high_key ^ read64(input, len - 8);

    let wide = (low as u128).wrapping_mul(high as u128);
    let folded = (wide as u64) ^ ((wide >> 64) as u64);

    let sum = low
        .swap_bytes()
        .wrapping_add(high)
        .wrapping_add(folded)
        .wrapping_add(len as u64);
    avalanche(sum)
}

// xxhash3_64.rs
/// Compute XXH3-64 with seed 0 and the default secret.
///
/// Dispatches on the input length: empty input is hashed directly from two
/// secret words; 1..=3, 4..=8, 9..=16, 17..=128 and 129..=240 bytes each have
/// a dedicated routine; anything longer goes through `xxh3_large`.
#[must_use]
pub(crate) fn xxhash3_64(input: &[u8]) -> u64 {
    let len = input.len();
    if len == 0 {
        return avalanche_xxh64(read_secret64(56) ^ read_secret64(64));
    }
    if len <= 3 {
        return xxh3_len_1_to_3(input);
    }
    if len <= 8 {
        return xxh3_len_4_to_8(input);
    }
    if len <= 16 {
        return xxh3_len_9_to_16(input);
    }
    if len <= 128 {
        return xxh3_len_17_to_128(input);
    }
    if len <= 240 {
        return xxh3_len_129_to_240(input);
    }
    xxh3_large(input)
}

// xxhash64.rs
/// Compute XXH64 with seed 0.
#[must_use]
pub(crate) fn xxhash64(input: &[u8]) -> u64 {
    let mut index = 0usize;
    let mut acc = if input.len() >= 32 {
        let mut v1 = P64_1.wrapping_add(P64_2);
        let mut v2 = P64_2;
        let mut v3 = 0;
        let mut v4 = 0u64.wrapping_sub(P64_1);
        while index + 32 <= input.len() {
            v1 = round(v1, read64(input, index));
            v2 = round(v2, read64(input, index + 8));
            v3 = round(v3, read64(input, index + 16));
            v4 = round(v4, read64(input, index + 24));
            index += 32;
        }
        let mut merged = v1
            .rotate_left(1)
            .wrapping_add(v2.rotate_left(7))
            .wrapping_add(v3.rotate_left(12))
            .wrapping_add(v4.rotate_left(18));
        for v in [v1, v2, v3, v4] {
            merged = merge_acc(merged, v);
        }
        merged
    } else {
        P64_5
    };
    acc = acc.wrapping_add(input.len() as u64);
    consume_tail(input, index, acc)
}