vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
//! CPU reference implementations of BLAKE2s-256 and BLAKE2b (digests of 1 to 64 bytes, including BLAKE2b-512).

// blake2b_bytes.rs
/// Compute an unkeyed BLAKE2b digest of `input`, `out_len` bytes long.
///
/// The requested length is XORed into the first chaining word before any
/// input is absorbed. The input is fed to `compress64` in 128-byte blocks;
/// the last block (possibly empty, possibly full) is zero-padded and
/// compressed with the final flag set and the total input length as counter.
/// The resulting state is serialized as little-endian words and cut to
/// `out_len` bytes.
///
/// # Panics
///
/// Panics when `out_len` is not in `1..=64`.
#[must_use]
pub(crate) fn blake2b_bytes(input: &[u8], out_len: usize) -> Vec<u8> {
    const BLOCK: usize = 128;

    assert!(
        (1..=64).contains(&out_len),
        "BLAKE2b output length must be 1..=64"
    );

    let mut state = IV64;
    state[0] ^= 0x0101_0000 ^ out_len as u64;

    // Everything before `tail_start` is a whole, non-final block. The tail
    // holds 1..=128 bytes for non-empty input and nothing for empty input,
    // so a message that is an exact multiple of the block size still ends
    // with a full final block rather than an extra empty one.
    let tail_start = input.len().saturating_sub(1) / BLOCK * BLOCK;
    let (body, tail) = input.split_at(tail_start);

    let mut consumed = 0u128;
    for chunk in body.chunks_exact(BLOCK) {
        consumed += BLOCK as u128;
        compress64(&mut state, chunk, consumed, false);
    }

    let mut last = [0u8; BLOCK];
    last[..tail.len()].copy_from_slice(tail);
    compress64(&mut state, &last, input.len() as u128, true);

    state
        .iter()
        .flat_map(|word| word.to_le_bytes())
        .take(out_len)
        .collect()
}

// blake2b_words.rs
/// Compute unkeyed BLAKE2b-512 and return the digest as eight `u64` words.
///
/// Word `i` is the big-endian reading of digest bytes `8 * i .. 8 * i + 8`,
/// so formatting the words in order with `{:016x}` yields the digest's hex
/// string. The returned values are the same on every target.
///
/// The input is fed to `compress64` in 128-byte blocks; the last block
/// (possibly empty, possibly full) is zero-padded and compressed with the
/// final flag set and the total input length as counter.
#[must_use]
pub(crate) fn blake2b_words(input: &[u8]) -> [u64; 8] {
    const BLOCK: usize = 128;

    let mut h = IV64;
    h[0] ^= 0x0101_0040;

    // Everything before `tail_start` is a whole, non-final block. The tail
    // holds 1..=128 bytes for non-empty input and nothing for empty input.
    let tail_start = input.len().saturating_sub(1) / BLOCK * BLOCK;
    let (body, tail) = input.split_at(tail_start);

    let mut consumed = 0u128;
    for chunk in body.chunks_exact(BLOCK) {
        consumed += BLOCK as u128;
        compress64(&mut h, chunk, consumed, false);
    }

    let mut last = [0u8; BLOCK];
    last[..tail.len()].copy_from_slice(tail);
    compress64(&mut h, &last, input.len() as u128, true);

    // The digest bytes are the little-endian serialization of `h`, so the
    // big-endian reading of each 8-byte group is the byte-reversed word.
    // `u64::to_be` only reverses on little-endian hosts and would return
    // different numbers on a big-endian target; `swap_bytes` always reverses.
    h.map(u64::swap_bytes)
}

// blake2s_words.rs
/// Compute unkeyed BLAKE2s-256 and return the digest as eight `u32` words.
///
/// Word `i` is the big-endian reading of digest bytes `4 * i .. 4 * i + 4`,
/// so formatting the words in order with `{:08x}` yields the digest's hex
/// string. The returned values are the same on every target.
///
/// The input is fed to `compress32` in 64-byte blocks; the last block
/// (possibly empty, possibly full) is zero-padded and compressed with the
/// final flag set and the total input length as counter.
#[must_use]
pub(crate) fn blake2s_words(input: &[u8]) -> [u32; 8] {
    const BLOCK: usize = 64;

    let mut h = IV32;
    h[0] ^= 0x0101_0020;

    // Everything before `tail_start` is a whole, non-final block. The tail
    // holds 1..=64 bytes for non-empty input and nothing for empty input.
    let tail_start = input.len().saturating_sub(1) / BLOCK * BLOCK;
    let (body, tail) = input.split_at(tail_start);

    let mut consumed = 0u64;
    for chunk in body.chunks_exact(BLOCK) {
        consumed += BLOCK as u64;
        compress32(&mut h, chunk, consumed, false);
    }

    let mut last = [0u8; BLOCK];
    last[..tail.len()].copy_from_slice(tail);
    compress32(&mut h, &last, input.len() as u64, true);

    // The digest bytes are the little-endian serialization of `h`, so the
    // big-endian reading of each 4-byte group is the byte-reversed word.
    // `u32::to_be` only reverses on little-endian hosts and would return
    // different numbers on a big-endian target; `swap_bytes` always reverses.
    h.map(u32::swap_bytes)
}

// compress32.rs
/// Apply one 32-bit compression step to the chaining value `h`, in place.
///
/// * `block` is read as little-endian `u32` message words, four bytes at a
///   time. At most sixteen words are taken; words for which `block` supplies
///   no complete 4-byte chunk stay zero.
/// * `count` is the byte counter: its low and high 32 bits are XORed into
///   working words 12 and 13.
/// * `last` inverts working word 14 when set.
///
/// The working state starts as `h` followed by `IV32`, is mixed with `g32`
/// using the first ten rows of `SIGMA`, and its two halves are then XORed
/// back into `h`.
pub fn compress32(h: &mut [u32; 8], block: &[u8], count: u64, last: bool) {
    // Working-state indices for the eight mixing calls of a round:
    // four columns, then four diagonals.
    const LANES: [[usize; 4]; 8] = [
        [0, 4, 8, 12],
        [1, 5, 9, 13],
        [2, 6, 10, 14],
        [3, 7, 11, 15],
        [0, 5, 10, 15],
        [1, 6, 11, 12],
        [2, 7, 8, 13],
        [3, 4, 9, 14],
    ];

    let mut words = [0u32; 16];
    for (word, chunk) in words.iter_mut().zip(block.chunks_exact(4)) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(chunk);
        *word = u32::from_le_bytes(raw);
    }

    let mut state = [0u32; 16];
    {
        let (chain, init) = state.split_at_mut(8);
        chain.copy_from_slice(&h[..]);
        init.copy_from_slice(&IV32);
    }
    state[12] ^= count as u32;
    state[13] ^= (count >> 32) as u32;
    if last {
        state[14] ^= u32::MAX;
    }

    for schedule in SIGMA.iter().take(10) {
        // Each lane consumes the next two message-word indices of the row.
        for (&[a, b, c, d], pair) in LANES.iter().zip(schedule.chunks_exact(2)) {
            g32(&mut state, a, b, c, d, words[pair[0]], words[pair[1]]);
        }
    }

    let (low, high) = state.split_at(8);
    for ((out, lo), hi) in h.iter_mut().zip(low).zip(high) {
        *out ^= lo ^ hi;
    }
}

// compress64.rs
/// Apply one 64-bit compression step to the chaining value `h`, in place.
///
/// * `block` is read as little-endian `u64` message words, eight bytes at a
///   time. At most sixteen words are taken; words for which `block` supplies
///   no complete 8-byte chunk stay zero.
/// * `count` is the byte counter: its low and high 64 bits are XORed into
///   working words 12 and 13.
/// * `last` inverts working word 14 when set.
///
/// The working state starts as `h` followed by `IV64`, is mixed with `g64`
/// using every row of `SIGMA`, and its two halves are then XORed back into
/// `h`.
pub fn compress64(h: &mut [u64; 8], block: &[u8], count: u128, last: bool) {
    // Working-state indices for the eight mixing calls of a round:
    // four columns, then four diagonals.
    const LANES: [[usize; 4]; 8] = [
        [0, 4, 8, 12],
        [1, 5, 9, 13],
        [2, 6, 10, 14],
        [3, 7, 11, 15],
        [0, 5, 10, 15],
        [1, 6, 11, 12],
        [2, 7, 8, 13],
        [3, 4, 9, 14],
    ];

    let mut words = [0u64; 16];
    for (word, chunk) in words.iter_mut().zip(block.chunks_exact(8)) {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(chunk);
        *word = u64::from_le_bytes(raw);
    }

    let mut state = [0u64; 16];
    {
        let (chain, init) = state.split_at_mut(8);
        chain.copy_from_slice(&h[..]);
        init.copy_from_slice(&IV64);
    }
    state[12] ^= count as u64;
    state[13] ^= (count >> 64) as u64;
    if last {
        state[14] ^= u64::MAX;
    }

    for schedule in SIGMA.iter() {
        // Each lane consumes the next two message-word indices of the row.
        for (&[a, b, c, d], pair) in LANES.iter().zip(schedule.chunks_exact(2)) {
            g64(&mut state, a, b, c, d, words[pair[0]], words[pair[1]]);
        }
    }

    let (low, high) = state.split_at(8);
    for ((out, lo), hi) in h.iter_mut().zip(low).zip(high) {
        *out ^= lo ^ hi;
    }
}

// g32.rs
/// Mix the message words `x` and `y` into the four working-state words of
/// `v` selected by `a`, `b`, `c` and `d`.
///
/// The same add / xor-rotate sequence runs twice: first with `x` and right
/// rotations of 16 and 12 bits, then with `y` and right rotations of 8 and
/// 7 bits. All additions wrap.
///
/// # Panics
///
/// Panics if any of `a`, `b`, `c`, `d` is 16 or greater.
pub fn g32(v: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
    // (message word, rotation applied to v[d], rotation applied to v[b])
    for &(msg, rot_d, rot_b) in &[(x, 16u32, 12u32), (y, 8, 7)] {
        v[a] = v[a].wrapping_add(v[b]).wrapping_add(msg);
        v[d] = (v[d] ^ v[a]).rotate_right(rot_d);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] = (v[b] ^ v[c]).rotate_right(rot_b);
    }
}

// g64.rs
/// Mix the message words `x` and `y` into the four working-state words of
/// `v` selected by `a`, `b`, `c` and `d`.
///
/// The same add / xor-rotate sequence runs twice: first with `x` and right
/// rotations of 32 and 24 bits, then with `y` and right rotations of 16 and
/// 63 bits. All additions wrap.
///
/// # Panics
///
/// Panics if any of `a`, `b`, `c`, `d` is 16 or greater.
pub fn g64(v: &mut [u64; 16], a: usize, b: usize, c: usize, d: usize, x: u64, y: u64) {
    // (message word, rotation applied to v[d], rotation applied to v[b])
    for &(msg, rot_d, rot_b) in &[(x, 32u32, 24u32), (y, 16, 63)] {
        v[a] = v[a].wrapping_add(v[b]).wrapping_add(msg);
        v[d] = (v[d] ^ v[a]).rotate_right(rot_d);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] = (v[b] ^ v[c]).rotate_right(rot_b);
    }
}

// iv32.rs
/// Eight 32-bit initialization words.
///
/// `blake2s_words` starts its chaining value from this array, and
/// `compress32` copies it into the upper half of its working state.
pub const IV32: [u32; 8] = [
    0x6a09_e667,
    0xbb67_ae85,
    0x3c6e_f372,
    0xa54f_f53a,
    0x510e_527f,
    0x9b05_688c,
    0x1f83_d9ab,
    0x5be0_cd19,
];

// iv64.rs
/// Eight 64-bit initialization words.
///
/// `blake2b_bytes` and `blake2b_words` start their chaining value from this
/// array, and `compress64` copies it into the upper half of its working
/// state.
pub const IV64: [u64; 8] = [
    0x6a09_e667_f3bc_c908,
    0xbb67_ae85_84ca_a73b,
    0x3c6e_f372_fe94_f82b,
    0xa54f_f53a_5f1d_36f1,
    0x510e_527f_ade6_82d1,
    0x9b05_688c_2b3e_6c1f,
    0x1f83_d9ab_fb41_bd6b,
    0x5be0_cd19_137e_2179,
];

// sigma.rs
/// Message-word schedule shared by `compress32` and `compress64`.
///
/// Row `r` lists the indices of the message words consumed during round `r`,
/// two per mixing call, in call order. `compress32` uses the first ten rows
/// and `compress64` uses all twelve; rows 10 and 11 repeat rows 0 and 1.
pub const SIGMA: [[usize; 16]; 12] = {
    const ROW_0: [usize; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
    const ROW_1: [usize; 16] = [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3];
    [
        ROW_0,
        ROW_1,
        [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
        [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
        [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
        [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
        [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
        [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
        [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
        [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
        // Rounds 10 and 11 reuse the first two rows.
        ROW_0,
        ROW_1,
    ]
};