vyre 0.4.0

GPU compute intermediate representation with a standard operation library
Documentation
//! BLAKE3-256 CPU reference implementation.

/// Compute an unkeyed BLAKE3 output with caller-selected byte length.
///
/// The message is first reduced to its root output descriptor. That root
/// block is then compressed repeatedly with the `ROOT` flag set and an
/// output-block counter that starts at zero and increases by one per
/// compression. Every compression yields sixteen words; all sixteen are
/// serialized little-endian, so each counter value contributes up to 64
/// bytes of output, which is how the BLAKE3 extendable-output stream is
/// defined. The first 32 bytes are the ordinary 256-bit digest.
///
/// Returns exactly `out_len` bytes. `out_len == 0` yields an empty vector
/// (the root descriptor is still computed).
pub fn blake3_bytes(input: &[u8], out_len: usize) -> Vec<u8> {
    let root = root_output(input);
    let mut out = Vec::with_capacity(out_len);
    let mut counter = 0u64;
    while out.len() < out_len {
        let words = compress(
            &root.input_chaining_value,
            &root.block_words,
            counter,
            root.block_len,
            root.flags | ROOT,
        );
        // Use the full 64-byte output block (words 0..16), not only the
        // chaining-value half, and stop mid-word if the caller asked for a
        // length that is not a multiple of four.
        let remaining = out_len - out.len();
        out.extend(
            words
                .iter()
                .flat_map(|word| word.to_le_bytes())
                .take(remaining),
        );
        counter += 1;
    }
    out
}

// blake3_words.rs
/// Hash `input` with unkeyed BLAKE3 and return the digest as eight words.
///
/// The 32 digest bytes come from `blake3_bytes(input, 32)`. Each consecutive
/// group of four bytes is interpreted as one big-endian `u32`, so word 0
/// holds digest bytes 0..4 with byte 0 in the most significant position.
#[must_use]
pub(crate) fn blake3_words(input: &[u8]) -> [u32; 8] {
    let digest = blake3_bytes(input, 32);
    std::array::from_fn(|index| {
        // A word stays zero if its four bytes are not all present.
        digest
            .get(4 * index..4 * index + 4)
            .map_or(0, |quad| u32::from_be_bytes([quad[0], quad[1], quad[2], quad[3]]))
    })
}

/// Return the chaining value produced by one BLAKE3 output block.
pub fn chaining_value(output: Output) -> [u32; 8] {
    let words = compress(
        &output.input_chaining_value,
        &output.block_words,
        output.counter,
        output.block_len,
        output.flags,
    );
    let mut out = [0u32; 8];
    out.copy_from_slice(&words[..8]);
    out
}

/// BLAKE3 flag marking the last block in a chunk (bit 1 of the flags word).
pub const CHUNK_END: u32 = 1 << 1;

/// Compress one BLAKE3 chunk into an output descriptor.
///
/// The chunk is split into 64-byte blocks. Every block except the last is
/// compressed immediately (block length 64, counter `chunk_counter`),
/// threading the chaining value that starts at `IV`. The last block is not
/// compressed here: it is returned as an [`Output`] so the caller can decide
/// how to finalize it.
///
/// `CHUNK_START` is set on the first block and `CHUNK_END` on the last; a
/// chunk that fits in a single block carries both. An empty chunk is treated
/// as one empty block with `block_len == 0`.
pub fn chunk_output(chunk: &[u8], chunk_counter: u64) -> Output {
    /// Pack a block of at most 64 bytes into little-endian words, zero-padded.
    fn pack_words(block: &[u8]) -> [u32; 16] {
        let mut words = [0u32; 16];
        for (word, quad) in words.iter_mut().zip(block.chunks(4)) {
            let mut le = [0u8; 4];
            le[..quad.len()].copy_from_slice(quad);
            *word = u32::from_le_bytes(le);
        }
        words
    }

    // Index of the final block; an empty chunk still has block 0.
    let last = chunk.len().div_ceil(64).max(1) - 1;

    let mut chaining = IV;
    // Only the first block processed sees CHUNK_START.
    let mut flags = CHUNK_START;
    for full in chunk.chunks(64).take(last) {
        let state = compress(&chaining, &pack_words(full), chunk_counter, 64, flags);
        chaining.copy_from_slice(&state[..8]);
        flags = 0;
    }

    let tail = &chunk[last * 64..];
    Output {
        input_chaining_value: chaining,
        block_words: pack_words(tail),
        counter: chunk_counter,
        block_len: tail.len() as u32,
        flags: flags | CHUNK_END,
    }
}

/// BLAKE3 flag marking the first block in a chunk (bit 0 of the flags word).
pub const CHUNK_START: u32 = 1 << 0;

/// Run the BLAKE3 compression function for one block.
///
/// The sixteen-word working state is laid out as:
/// words 0..8 = `cv`, words 8..12 = the first four `IV` words,
/// word 12 = low 32 bits of `counter`, word 13 = high 32 bits of `counter`,
/// word 14 = `block_len`, word 15 = `flags`.
///
/// Seven rounds are applied, permuting the message words after each one.
/// The result is the feed-forward of the final state: the low half is
/// `state[i] ^ state[i + 8]` and the high half is `state[i + 8] ^ cv[i]`.
pub fn compress(
    cv: &[u32; 8],
    block: &[u32; 16],
    counter: u64,
    block_len: u32,
    flags: u32,
) -> [u32; 16] {
    let mut state = [0u32; 16];
    state[..8].copy_from_slice(cv);
    state[8..12].copy_from_slice(&IV[..4]);
    state[12] = counter as u32; // truncation keeps the low word
    state[13] = (counter >> 32) as u32;
    state[14] = block_len;
    state[15] = flags;

    let mut schedule = *block;
    for _ in 0..7 {
        round(&mut state, &schedule);
        schedule = permute(schedule);
    }

    let mut out = [0u32; 16];
    for i in 0..8 {
        out[i] = state[i] ^ state[i + 8];
        out[i + 8] = state[i + 8] ^ cv[i];
    }
    out
}

/// Pack two chaining values into the parent-node message words.
///
/// Words 0..8 of the result are `left`, words 8..16 are `right`.
pub fn cv_pair_words(left: [u32; 8], right: [u32; 8]) -> [u32; 16] {
    std::array::from_fn(|index| {
        if index < 8 {
            left[index]
        } else {
            right[index - 8]
        }
    })
}

/// Apply one BLAKE3 quarter-round to the working vector.
///
/// Mixes the four state words at indices `a`, `b`, `c`, `d` in place using
/// two message words. The same add / xor-rotate sequence runs twice: first
/// with `x` and right-rotations of 16 and 12 bits, then with `y` and
/// right-rotations of 8 and 7 bits. All additions wrap modulo 2^32.
pub fn g(v: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
    // (message word, rotation applied to v[d], rotation applied to v[b])
    for (message, rot_d, rot_b) in [(x, 16, 12), (y, 8, 7)] {
        v[a] = v[a].wrapping_add(v[b]).wrapping_add(message);
        v[d] = (v[d] ^ v[a]).rotate_right(rot_d);
        v[c] = v[c].wrapping_add(v[d]);
        v[b] = (v[b] ^ v[c]).rotate_right(rot_b);
    }
}

/// BLAKE3 initialization vector.
///
/// Used as the starting chaining value of every chunk and as the input
/// chaining value of parent nodes; its first four words also fill state
/// words 8..12 inside `compress`.
pub const IV: [u32; 8] = [
    0x6A09_E667,
    0xBB67_AE85,
    0x3C6E_F372,
    0xA54F_F53A,
    0x510E_527F,
    0x9B05_688C,
    0x1F83_D9AB,
    0x5BE0_CD19,
];

/// Message-word permutation applied between BLAKE3 rounds.
///
/// Entry `i` is the index of the input word that becomes output word `i`
/// (see `permute`).
pub const MSG_PERMUTATION: [usize; 16] = [
    2, 6, 3, 10, 7, 0, 4, 13, //
    1, 11, 12, 5, 9, 14, 15, 8,
];

/// Intermediate BLAKE3 output block before root finalization.
///
/// Bundles the five arguments of a single `compress` call so the call can be
/// deferred: `chaining_value` runs it with the stored flags as-is, while
/// `blake3_bytes` runs it with `ROOT` added and its own output counter.
#[derive(Clone, Copy)]
pub struct Output {
    /// Chaining value entering the compression call.
    pub(crate) input_chaining_value: [u32; 8],
    /// Sixteen message words for the compression call.
    pub(crate) block_words: [u32; 16],
    /// Chunk or output counter (chunk index for chunk blocks, 0 for parents).
    pub(crate) counter: u64,
    /// Number of valid bytes in `block_words`.
    pub(crate) block_len: u32,
    /// BLAKE3 flag bits for this block.
    pub(crate) flags: u32,
}

/// BLAKE3 flag marking a parent node (bit 2 of the flags word).
pub const PARENT: u32 = 1 << 2;

/// Return the parent chaining value for two child chaining values.
///
/// Builds the parent node descriptor for `left` and `right` and compresses
/// it without root finalization.
pub fn parent_cv(left: [u32; 8], right: [u32; 8]) -> [u32; 8] {
    let node = parent_output(left, right);
    chaining_value(node)
}

/// Build the output descriptor for a parent node.
///
/// The message block is `left` followed by `right`; the input chaining value
/// is `IV`, the counter is 0, the block length is 64 and only the `PARENT`
/// flag is set.
pub fn parent_output(left: [u32; 8], right: [u32; 8]) -> Output {
    Output {
        flags: PARENT,
        block_len: 64,
        counter: 0,
        block_words: cv_pair_words(left, right),
        input_chaining_value: IV,
    }
}

/// Apply the BLAKE3 message permutation.
///
/// Output word `i` is `input[MSG_PERMUTATION[i]]`.
pub fn permute(input: [u32; 16]) -> [u32; 16] {
    MSG_PERMUTATION.map(|source| input[source])
}

/// BLAKE3 flag marking root output generation (bit 3 of the flags word).
pub const ROOT: u32 = 1 << 3;

/// Reduce an input message to the BLAKE3 root output descriptor.
pub fn root_output(input: &[u8]) -> Output {
    if input.is_empty() {
        return chunk_output(&[], 0);
    }
    if input.len() <= 1024 {
        return chunk_output(input, 0);
    }

    let mut cvs: Vec<[u32; 8]> = input
        .chunks(1024)
        .enumerate()
        .map(|(counter, chunk)| chaining_value(chunk_output(chunk, counter as u64)))
        .collect();
    while cvs.len() > 2 {
        let mut next = Vec::with_capacity(cvs.len().div_ceil(2));
        let mut pairs = cvs.chunks_exact(2);
        for pair in &mut pairs {
            next.push(parent_cv(pair[0], pair[1]));
        }
        if let Some(carry) = pairs.remainder().first() {
            next.push(*carry);
        }
        cvs = next;
    }
    Output {
        input_chaining_value: IV,
        block_words: cv_pair_words(cvs[0], cvs[1]),
        counter: 0,
        block_len: 64,
        flags: PARENT,
    }
}

/// Apply one full BLAKE3 round to the working vector.
pub fn round(v: &mut [u32; 16], m: &[u32; 16]) {
    g(v, 0, 4, 8, 12, m[0], m[1]);
    g(v, 1, 5, 9, 13, m[2], m[3]);
    g(v, 2, 6, 10, 14, m[4], m[5]);
    g(v, 3, 7, 11, 15, m[6], m[7]);
    g(v, 0, 5, 10, 15, m[8], m[9]);
    g(v, 1, 6, 11, 12, m[10], m[11]);
    g(v, 2, 7, 8, 13, m[12], m[13]);
    g(v, 3, 4, 9, 14, m[14], m[15]);
}