haz-cache 0.2.0

Content-addressed cache for haz task outputs using BLAKE3.
Documentation
//! Hash-function dispatch.
//!
//! The cache-key derivation feeds canonicalised bytes into a single
//! hash function per workspace invocation (`CACHE-002`). The set of
//! supported functions is closed and append-only: BLAKE3 (default)
//! and SHA-256.
//!
//! [`Hasher`] is an enum because the registry is closed by design;
//! see also `docs/spec/09-caching.md` `CACHE-002`. Adding a new
//! variant requires bumping `hash_function_id` per the registry's
//! append-only rule.

use haz_domain::settings::cache::HashAlgo;
use sha2::Digest as _;

/// Incremental hasher that dispatches to the selected hash function.
///
/// Typical use: build one with [`Hasher::new`], push canonical bytes
/// through [`Hasher::update`] as often as needed, then call
/// [`Hasher::finalize`] to obtain the digest. Both functions the
/// specification recognises (`CACHE-002`) produce 32 bytes (BLAKE3
/// at its 256-bit output, SHA-256 at its natural width), so the
/// digest is always a fixed-width `[u8; 32]`. That digest is used as
/// the cache key without further transformation (`CACHE-001`,
/// `CACHE-009`).
///
/// [`Clone`] is deliberately not implemented. Cache-key derivation
/// threads one hasher through every component in order; being able
/// to fork the in-progress state would make it easy to end up with
/// two keys that share a prefix and then diverge.
///
/// The BLAKE3 state is kept behind a [`Box`] because it is large
/// (about 2 KiB of internal buffers) compared with SHA-256's
/// roughly 112 bytes; boxing keeps the enum's stack footprint
/// modest.
pub enum Hasher {
    /// BLAKE3 with a 256-bit digest; state lives on the heap.
    Blake3(Box<blake3::Hasher>),
    /// SHA-256; state is stored inline.
    Sha256(sha2::Sha256),
}

impl Hasher {
    /// Create an empty hasher backed by the function `algo` selects.
    ///
    /// The BLAKE3 state is moved onto the heap; the SHA-256 state is
    /// stored inline in the enum.
    #[must_use]
    pub fn new(algo: HashAlgo) -> Self {
        match algo {
            HashAlgo::Blake3 => {
                let state = blake3::Hasher::new();
                Self::Blake3(Box::new(state))
            }
            HashAlgo::Sha256 => Self::Sha256(<sha2::Sha256 as sha2::Digest>::new()),
        }
    }

    /// Absorb `bytes` into the running hash.
    ///
    /// Can be invoked repeatedly. The digest eventually produced by
    /// [`Hasher::finalize`] is determined by the concatenation of
    /// every slice passed here, in the order the calls were made.
    pub fn update(&mut self, bytes: &[u8]) {
        match self {
            Self::Blake3(state) => {
                // `blake3::Hasher::update` hands back `&mut Self` for
                // chaining; nothing to do with it here.
                let _ = state.update(bytes);
            }
            Self::Sha256(state) => <sha2::Sha256 as sha2::Digest>::update(state, bytes),
        }
    }

    /// Finish hashing, consuming `self`, and return the 32-byte digest.
    #[must_use]
    pub fn finalize(self) -> [u8; 32] {
        match self {
            Self::Blake3(state) => {
                let digest = state.finalize();
                *digest.as_bytes()
            }
            Self::Sha256(state) => <sha2::Sha256 as sha2::Digest>::finalize(state).into(),
        }
    }
}

#[cfg(test)]
mod tests {
    use haz_domain::settings::cache::HashAlgo;

    use crate::hasher::Hasher;

    /// One-shot helper: hash `input` with a fresh hasher for `algo`.
    fn digest_of(algo: HashAlgo, input: &[u8]) -> [u8; 32] {
        let mut hasher = Hasher::new(algo);
        hasher.update(input);
        hasher.finalize()
    }

    #[test]
    fn cache_002_blake3_empty_input_matches_reference_vector() {
        // Reference BLAKE3 digest of zero bytes of input.
        let reference = [
            0xaf, 0x13, 0x49, 0xb9, 0xf5, 0xf9, 0xa1, 0xa6, 0xa0, 0x40, 0x4d, 0xea, 0x36, 0xdc,
            0xc9, 0x49, 0x9b, 0xcb, 0x25, 0xc9, 0xad, 0xc1, 0x12, 0xb7, 0xcc, 0x9a, 0x93, 0xca,
            0xe4, 0x1f, 0x32, 0x62,
        ];
        let actual = digest_of(HashAlgo::Blake3, b"");
        assert_eq!(actual, reference);
    }

    #[test]
    fn cache_002_sha256_empty_input_matches_reference_vector() {
        // Reference SHA-256 digest of zero bytes of input.
        let reference = [
            0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f,
            0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b,
            0x78, 0x52, 0xb8, 0x55,
        ];
        let actual = digest_of(HashAlgo::Sha256, b"");
        assert_eq!(actual, reference);
    }

    #[test]
    fn cache_002_sha256_abc_matches_reference_vector() {
        // SHA-256 of "abc" as given in FIPS 180-2.
        let reference = [
            0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea, 0x41, 0x41, 0x40, 0xde, 0x5d, 0xae,
            0x22, 0x23, 0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c, 0xb4, 0x10, 0xff, 0x61,
            0xf2, 0x00, 0x15, 0xad,
        ];
        let actual = digest_of(HashAlgo::Sha256, b"abc");
        assert_eq!(actual, reference);
    }

    #[test]
    fn cache_002_blake3_and_sha256_diverge_on_same_input() {
        // Same bytes, different functions: the digests must differ.
        let via_blake3 = digest_of(HashAlgo::Blake3, b"identical");
        let via_sha256 = digest_of(HashAlgo::Sha256, b"identical");
        assert_ne!(
            via_blake3, via_sha256,
            "the two hash functions must never coincide on a single input"
        );
    }

    #[test]
    fn update_chunking_is_irrelevant_to_digest() {
        // For either algorithm, only the concatenated byte stream
        // matters; where the `update` boundaries fall does not.
        for algo in [HashAlgo::Blake3, HashAlgo::Sha256] {
            let mut piecewise = Hasher::new(algo);
            for piece in [&b"hello, "[..], &b"world"[..]] {
                piecewise.update(piece);
            }
            let piecewise = piecewise.finalize();

            let single_shot = digest_of(algo, b"hello, world");

            assert_eq!(piecewise, single_shot, "{algo:?} digest depends on chunking");
        }
    }

    #[test]
    fn fresh_hasher_each_call() {
        // Sanity check: separate hashers for the same algorithm do
        // not influence each other, so hashing identical bytes twice
        // gives identical digests.
        for algo in [HashAlgo::Blake3, HashAlgo::Sha256] {
            let first = digest_of(algo, b"twice");
            let second = digest_of(algo, b"twice");
            assert_eq!(first, second);
        }
    }
}