// haz-cache 0.2.0

// Content-addressed cache for haz task outputs using BLAKE3.
// Documentation
//! On-disk cache-entry layout per `CACHE-010`, `CACHE-012`, and
//! `CACHE-013`.
//!
//! Pure path-computation helpers. None of these functions touch
//! the filesystem; they translate a workspace-root path and a
//! [`CacheKey`] into the canonical paths the cache uses to read
//! and write entry data.
//!
//! Layout tree under `<workspace-root>/.haz/cache/`:
//!
//! ```text
//! .haz/cache/
//! `-- <shard>/                          (first two hex chars of key)
//!     `-- <hex-key>/                    (entry directory)
//!         |-- manifest.json             (CACHE-011)
//!         |-- stdout                    (CACHE-012)
//!         |-- stderr                    (CACHE-012)
//!         `-- outputs/
//!             `-- <hex-content-hash>    (CACHE-013, one per blob)
//! ```

use std::path::{Path, PathBuf};

use crate::hex;
use crate::key::CacheKey;

/// File name of the manifest within an entry directory
/// (`CACHE-011`).
///
/// [`manifest_path`] joins this name onto the entry directory.
pub const MANIFEST_FILE_NAME: &str = "manifest.json";

/// File name of the captured stdout stream within an entry
/// directory (`CACHE-012`).
///
/// [`stdout_path`] joins this name onto the entry directory.
pub const STDOUT_FILE_NAME: &str = "stdout";

/// File name of the captured stderr stream within an entry
/// directory (`CACHE-012`).
///
/// [`stderr_path`] joins this name onto the entry directory.
pub const STDERR_FILE_NAME: &str = "stderr";

/// Subdirectory holding output blobs within an entry directory
/// (`CACHE-013`).
///
/// [`outputs_dir`] joins this name onto the entry directory, and
/// [`output_blob_path`] places one file per content hash beneath
/// it.
pub const OUTPUTS_SUBDIR: &str = "outputs";

/// Locate the cache root belonging to `workspace_root`.
///
/// The result is `workspace_root` with the two components `.haz`
/// and `cache` appended, i.e. `<workspace_root>/.haz/cache`. The
/// computation is purely lexical; nothing is created or checked
/// on disk.
#[must_use]
pub fn cache_root(workspace_root: &Path) -> PathBuf {
    let mut root = workspace_root.to_path_buf();
    root.push(".haz");
    root.push("cache");
    root
}

/// Shard name for `key` per `CACHE-010`.
///
/// Renders the key as hex and returns an owned copy of its first
/// two characters (lowercase hexadecimal), which is the name of
/// the shard directory the key's entry lives in.
#[must_use]
pub fn shard(key: &CacheKey) -> String {
    let full_hex = key.to_hex();
    // Only the two leading hex characters select the shard.
    let prefix = &full_hex[..2];
    String::from(prefix)
}

/// Directory that groups every entry sharing `key`'s shard:
/// `<cache_root>/<shard>`.
///
/// The shard component comes from [`shard`].
#[must_use]
pub fn shard_dir(cache_root: &Path, key: &CacheKey) -> PathBuf {
    let mut dir = cache_root.to_path_buf();
    dir.push(shard(key));
    dir
}

/// Directory holding the cache entry for `key`:
/// `<cache_root>/<shard>/<hex-key>`.
///
/// Built by appending the key's full hex form to its
/// [`shard_dir`].
#[must_use]
pub fn entry_dir(cache_root: &Path, key: &CacheKey) -> PathBuf {
    let mut dir = shard_dir(cache_root, key);
    dir.push(key.to_hex());
    dir
}

/// Location of the manifest file inside `key`'s entry directory:
/// `<entry_dir>/manifest.json`.
#[must_use]
pub fn manifest_path(cache_root: &Path, key: &CacheKey) -> PathBuf {
    let mut path = entry_dir(cache_root, key);
    path.push(MANIFEST_FILE_NAME);
    path
}

/// Location of the captured stdout stream inside `key`'s entry
/// directory: `<entry_dir>/stdout`.
#[must_use]
pub fn stdout_path(cache_root: &Path, key: &CacheKey) -> PathBuf {
    let mut path = entry_dir(cache_root, key);
    path.push(STDOUT_FILE_NAME);
    path
}

/// Location of the captured stderr stream inside `key`'s entry
/// directory: `<entry_dir>/stderr`.
#[must_use]
pub fn stderr_path(cache_root: &Path, key: &CacheKey) -> PathBuf {
    let mut path = entry_dir(cache_root, key);
    path.push(STDERR_FILE_NAME);
    path
}

/// Directory inside `key`'s entry that holds its output blobs:
/// `<entry_dir>/outputs`.
#[must_use]
pub fn outputs_dir(cache_root: &Path, key: &CacheKey) -> PathBuf {
    let mut dir = entry_dir(cache_root, key);
    dir.push(OUTPUTS_SUBDIR);
    dir
}

/// Location of one output blob within `key`'s entry
/// (`CACHE-013`).
///
/// The blob's file name is the hex encoding of its 32-byte
/// `content_hash`, placed under [`outputs_dir`].
#[must_use]
pub fn output_blob_path(cache_root: &Path, key: &CacheKey, content_hash: &[u8; 32]) -> PathBuf {
    let mut path = outputs_dir(cache_root, key);
    path.push(hex::encode_32(content_hash));
    path
}

/// Temporary directory used by the two-phase store of `key`
/// (`CACHE-017`).
///
/// The directory is named `.tmp-<hex-key>-<random>` and sits in
/// the key's [`shard_dir`], i.e. next to the final entry
/// directory. `random_suffix` is supplied by the caller; this
/// helper only splices it into the canonical name so concurrent
/// stores of the same key on the same shard do not collide.
#[must_use]
pub fn tmp_entry_dir(cache_root: &Path, key: &CacheKey, random_suffix: &str) -> PathBuf {
    let dir_name = format!(
        ".tmp-{hex}-{suffix}",
        hex = key.to_hex(),
        suffix = random_suffix
    );
    let mut path = shard_dir(cache_root, key);
    path.push(dir_name);
    path
}

/// Scratch directory used while restoring `key`'s outputs
/// (`CACHE-019`, `CACHE-020`).
///
/// Blob bytes are written here first and then renamed onto their
/// workspace-absolute targets, so a partial publish is detectable
/// and contained.
///
/// The directory sits directly under `cache_root` rather than on
/// a shard: it is transient publishing scratch space, not an
/// entry in the content-addressed sense. Its name follows the
/// pattern `.restore-<hex-key>-<random>`, which deliberately
/// differs from the store-time `.tmp-<hex-key>-<random>` produced
/// by [`tmp_entry_dir`] so future invalidation logic can tell the
/// two apart.
#[must_use]
pub fn restore_staging_dir(cache_root: &Path, key: &CacheKey, random_suffix: &str) -> PathBuf {
    let dir_name = format!(
        ".restore-{hex}-{suffix}",
        hex = key.to_hex(),
        suffix = random_suffix
    );
    let mut path = cache_root.to_path_buf();
    path.push(dir_name);
    path
}

#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::{
        cache_root, entry_dir, manifest_path, output_blob_path, outputs_dir, restore_staging_dir,
        shard, shard_dir, stderr_path, stdout_path, tmp_entry_dir,
    };
    use crate::CacheKey;

    /// Build a [`CacheKey`] whose first byte is `first` (so the
    /// shard prefix is predictable) and whose remaining bytes
    /// equal their own index (so the tail is recognisable).
    fn key_with_first_byte(first: u8) -> CacheKey {
        let mut raw = [0u8; 32];
        for (slot, index) in raw.iter_mut().zip(0u8..) {
            *slot = index;
        }
        raw[0] = first;
        CacheKey::from_bytes(raw)
    }

    /// Cache root and key (first byte `0xAB`, shard `ab`) shared
    /// by most of the path-shape tests.
    fn fixture() -> (&'static Path, CacheKey) {
        (Path::new("/ws/.haz/cache"), key_with_first_byte(0xAB))
    }

    #[test]
    fn cache_010_cache_root_is_workspace_dot_haz_cache() {
        let workspace = Path::new("/ws");
        assert_eq!(cache_root(workspace), Path::new("/ws/.haz/cache"));
    }

    #[test]
    fn cache_010_shard_is_first_two_hex_chars_of_key() {
        let cases = [(0xAB_u8, "ab"), (0x00, "00"), (0xFF, "ff")];
        for &(first, expected) in &cases {
            assert_eq!(shard(&key_with_first_byte(first)), expected);
        }
    }

    #[test]
    fn cache_010_shard_dir_joins_cache_root_and_shard() {
        let (root, key) = fixture();
        assert_eq!(shard_dir(root, &key), Path::new("/ws/.haz/cache/ab"));
    }

    #[test]
    fn cache_010_entry_dir_is_shard_dir_joined_with_full_hex_key() {
        let (root, key) = fixture();
        let want = format!("/ws/.haz/cache/ab/{}", key.to_hex());
        assert_eq!(entry_dir(root, &key), Path::new(&want));
    }

    #[test]
    fn cache_011_manifest_path_lives_inside_entry_dir() {
        let (root, key) = fixture();
        let want = format!("/ws/.haz/cache/ab/{}/manifest.json", key.to_hex());
        assert_eq!(manifest_path(root, &key), Path::new(&want));
    }

    #[test]
    fn cache_012_stdout_and_stderr_paths_use_canonical_names() {
        let (root, key) = fixture();
        let out = stdout_path(root, &key);
        assert!(out.ends_with("stdout"));
        let err = stderr_path(root, &key);
        assert!(err.ends_with("stderr"));
    }

    #[test]
    fn cache_013_outputs_dir_is_outputs_under_entry_dir() {
        let (root, key) = fixture();
        let want = format!("/ws/.haz/cache/ab/{}/outputs", key.to_hex());
        assert_eq!(outputs_dir(root, &key), Path::new(&want));
    }

    #[test]
    fn cache_013_output_blob_path_is_keyed_by_content_hash() {
        let (root, key) = fixture();
        let digest = [0xCDu8; 32];
        let want = format!(
            "/ws/.haz/cache/ab/{}/outputs/{}",
            key.to_hex(),
            "cd".repeat(32)
        );
        assert_eq!(output_blob_path(root, &key, &digest), Path::new(&want));
    }

    #[test]
    fn cache_017_tmp_entry_dir_uses_dot_tmp_prefix_with_random_suffix() {
        let (root, key) = fixture();
        let want = format!("/ws/.haz/cache/ab/.tmp-{}-r4nd0m", key.to_hex());
        assert_eq!(tmp_entry_dir(root, &key, "r4nd0m"), Path::new(&want));
    }

    #[test]
    fn cache_017_tmp_entry_dir_and_entry_dir_have_same_parent() {
        // `CACHE-017` atomicity relies on renaming the tmp dir
        // into its final entry path within a single directory,
        // so both helpers must agree on the parent.
        let (root, key) = fixture();
        let staged = tmp_entry_dir(root, &key, "rnd");
        let published = entry_dir(root, &key);
        assert_eq!(staged.parent().unwrap(), published.parent().unwrap());
    }

    #[test]
    fn restore_staging_dir_lives_directly_under_cache_root() {
        let (root, key) = fixture();
        let want = format!("/ws/.haz/cache/.restore-{}-r4nd0m", key.to_hex());
        assert_eq!(restore_staging_dir(root, &key, "r4nd0m"), Path::new(&want));
    }

    #[test]
    fn restore_staging_dir_name_is_distinct_from_tmp_entry_dir() {
        // Store-time tmp directories (incomplete entries) and
        // restore-time staging directories (transient publishing
        // state) must stay distinguishable by name so future
        // invalidation logic can discriminate between them.
        let (root, key) = fixture();
        let store_tmp = tmp_entry_dir(root, &key, "rnd");
        let restore_staging = restore_staging_dir(root, &key, "rnd");
        assert_ne!(store_tmp, restore_staging);

        let leaf = |path: &Path| path.file_name().unwrap().to_string_lossy().into_owned();
        assert!(leaf(&store_tmp).starts_with(".tmp-"));
        assert!(leaf(&restore_staging).starts_with(".restore-"));
    }
}