haz-cache 0.2.0

Content-addressed cache for haz task outputs using BLAKE3.
Documentation
//! Canonical serialisation of cache-key components
//! (`CACHE-005..009`).
//!
//! Each function here writes a single component into the running
//! [`Hasher`] in the canonical byte sequence the spec mandates.
//! Multi-item components (input files, predecessors, env entries)
//! enforce their ordering rule inside the function: callers MAY
//! pass items in any order; the function sorts them before
//! contribution.

use std::collections::BTreeMap;

use haz_domain::action::{ShellType, TaskAction};
use haz_domain::env::EnvVarName;
use haz_domain::name::{ProjectName, TaskName};

use crate::hasher::Hasher;

/// Leading tag byte of a `TaskAction::Command` contribution
/// (`CACHE-005`). Differs from `TAG_SHELL`, so a command action
/// and a shell action never begin with the same byte.
const TAG_COMMAND: u8 = 0x01;
/// Leading tag byte of a `TaskAction::Shell` contribution
/// (`CACHE-005`). Differs from `TAG_COMMAND`.
const TAG_SHELL: u8 = 0x02;

/// Single byte written in place of a value when a `from_host`
/// variable is absent (`None`) in CACHE-008's contribution.
/// Present values are written length-prefixed instead.
const ENV_ABSENT_MARKER: u8 = 0x00;

/// One file matched by an `inputs` pattern, as it contributes to
/// the cache key (`CACHE-006`).
///
/// `workspace_absolute_path` is the file's canonical workspace-
/// absolute path (`PATH-*`); `content_hash` is the digest of the
/// file's bytes under the cache's configured hash function. This
/// layer does NOT hash file contents: callers resolve globs to
/// files, hash each file through the same [`Hasher`] algorithm,
/// and pass the resulting digests in.
///
/// The type is a plain borrowed view (a string slice plus a fixed
/// 32-byte array), so it is cheap to copy and can be compared and
/// debug-printed directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct InputFile<'a> {
    /// Canonical workspace-absolute path of the file.
    pub workspace_absolute_path: &'a str,
    /// Content hash of the file's bytes under the configured hash
    /// function.
    pub content_hash: [u8; 32],
}

/// One hard-edge predecessor's captured output streams, as they
/// contribute to the cache key (`CACHE-007`).
///
/// The predecessor is identified by its (`project`, `task`) pair.
/// `stdout_hash` and `stderr_hash` are digests of the predecessor's
/// stdout and stderr byte streams under the cache's configured
/// hash function. The two digests are separate fields, so output
/// moving from one stream to the other (stdout -> stderr) changes
/// the predecessor's contribution.
pub struct PredecessorStreams<'a> {
    /// Project component of the predecessor's identity.
    pub project: &'a ProjectName,
    /// Task component of the predecessor's identity.
    pub task: &'a TaskName,
    /// Hash of the predecessor's stdout bytes.
    pub stdout_hash: [u8; 32],
    /// Hash of the predecessor's stderr bytes.
    pub stderr_hash: [u8; 32],
}

/// Resolved environment contribution (`CACHE-008`).
///
/// `from_host` maps each allow-listed variable name to the value it
/// had in the host process at key-derivation time, or to `None`
/// when the host did not set that name. The cache does NOT consult
/// `std::env` itself: callers resolve the values (typically via
/// `std::env::var(name).ok()`) and pass the finished map in.
///
/// `overrides` is the task-level `env.override` map. When a name
/// appears in both maps the `overrides` entry wins and the
/// `from_host` entry for that name does NOT contribute to the key.
/// [`contribute_env`] applies that rule itself, so callers MAY pass
/// `from_host` without removing shadowed names first.
pub struct EnvContribution<'a> {
    /// Allow-listed host variable names paired with their values
    /// at key-derivation time (`None` for absent).
    pub from_host: &'a BTreeMap<EnvVarName, Option<String>>,
    /// Task-level hardcoded name/value overrides.
    pub overrides: &'a BTreeMap<EnvVarName, String>,
}

/// Feed `action` into `hasher` in its `CACHE-005` canonical form.
///
/// Byte layout per variant:
///
/// - `Command`: the `TAG_COMMAND` byte, the argument count as a
///   4-byte big-endian integer, then every argument length-prefixed
///   in argv order.
/// - `Shell`: the `TAG_SHELL` byte, the length-prefixed shell
///   identifier, then the length-prefixed script.
///
/// # Panics
///
/// Panics if the argv length, or the byte length of any argument,
/// of the shell identifier, or of the script, exceeds
/// [`u32::MAX`]; these bounds are structurally impossible for real
/// workspaces (a 4 GiB single item or 4 billion arguments).
pub fn contribute_action(hasher: &mut Hasher, action: &TaskAction) {
    match action {
        TaskAction::Shell { script, shell } => {
            let shell_name = shell_identifier(shell);
            hasher.update(&[TAG_SHELL]);
            write_length_prefixed(hasher, shell_name.as_bytes());
            write_length_prefixed(hasher, script.as_bytes());
        }
        TaskAction::Command(argv) => {
            hasher.update(&[TAG_COMMAND]);
            // The count prefix keeps argv boundaries unambiguous.
            let argc = u32::try_from(argv.len())
                .expect("argv length within u32::MAX is structural");
            hasher.update(&argc.to_be_bytes());
            for argument in argv {
                write_length_prefixed(hasher, argument.as_bytes());
            }
        }
    }
}

/// Feed `files` into `hasher` per `CACHE-006`, applying the file-
/// ordering clause of `CACHE-009`.
///
/// Callers may pass `files` in any order: entries are sorted by the
/// raw bytes of their workspace-absolute path before anything is
/// hashed. The sort is stable, so entries with identical paths keep
/// their caller-supplied relative order.
///
/// The stream is the file count as a 4-byte big-endian integer,
/// followed, for each file, by its 32-byte content hash and then
/// its length-prefixed path.
///
/// # Panics
///
/// Panics if the file count or any path's byte length exceeds
/// [`u32::MAX`]; both bounds are structurally impossible for real
/// workspaces.
pub fn contribute_input_files(hasher: &mut Hasher, files: &[InputFile<'_>]) {
    // Sort borrowed entries so the caller's slice is left untouched.
    let mut ordered = files.iter().collect::<Vec<&InputFile<'_>>>();
    ordered.sort_by_key(|file| file.workspace_absolute_path.as_bytes());

    let file_count = u32::try_from(ordered.len())
        .expect("input-file count within u32::MAX is structural");
    hasher.update(&file_count.to_be_bytes());

    for file in &ordered {
        hasher.update(&file.content_hash);
        write_length_prefixed(hasher, file.workspace_absolute_path.as_bytes());
    }
}

/// Feed `predecessors` into `hasher` per `CACHE-007`, applying the
/// predecessor-ordering clause of `CACHE-009`.
///
/// Callers may pass `predecessors` in any order: entries are sorted
/// by the bytes of the project name, with the bytes of the task
/// name breaking ties. The sort is stable, so entries with the same
/// (project, task) pair keep their caller-supplied relative order.
///
/// The stream is the predecessor count as a 4-byte big-endian
/// integer, followed, for each predecessor, by the length-prefixed
/// project name, the length-prefixed task name, the 32-byte stdout
/// hash and the 32-byte stderr hash.
///
/// # Panics
///
/// Panics if the predecessor count or any name's byte length
/// exceeds [`u32::MAX`]; structurally impossible for real
/// workspaces.
pub fn contribute_predecessors(hasher: &mut Hasher, predecessors: &[PredecessorStreams<'_>]) {
    let mut ordered: Vec<&PredecessorStreams<'_>> = predecessors.iter().collect();
    ordered.sort_by(|left, right| {
        // Project name decides first; the task name is consulted
        // only when the project names are byte-identical.
        let by_project = AsRef::<str>::as_ref(left.project.as_ref())
            .as_bytes()
            .cmp(AsRef::<str>::as_ref(right.project.as_ref()).as_bytes());
        by_project.then_with(|| {
            AsRef::<str>::as_ref(left.task.as_ref())
                .as_bytes()
                .cmp(AsRef::<str>::as_ref(right.task.as_ref()).as_bytes())
        })
    });

    let predecessor_count = u32::try_from(ordered.len())
        .expect("predecessor count within u32::MAX is structural");
    hasher.update(&predecessor_count.to_be_bytes());

    for pred in &ordered {
        let project_name: &str = AsRef::<str>::as_ref(pred.project.as_ref());
        let task_name: &str = AsRef::<str>::as_ref(pred.task.as_ref());
        write_length_prefixed(hasher, project_name.as_bytes());
        write_length_prefixed(hasher, task_name.as_bytes());
        hasher.update(&pred.stdout_hash);
        hasher.update(&pred.stderr_hash);
    }
}

/// Feed the environment contribution into `hasher` per `CACHE-008`,
/// applying the env-ordering clause of `CACHE-009`.
///
/// Two sub-components are written, `from_host` first and
/// `overrides` second. Each starts with its entry count as a 4-byte
/// big-endian integer, followed by the entries in the maps' own
/// (sorted) iteration order:
///
/// - `from_host` entry: the length-prefixed name, then either the
///   length-prefixed value or, for an absent variable, the single
///   `ENV_ABSENT_MARKER` byte. Names that also appear in
///   `overrides` are skipped and not counted.
/// - `overrides` entry: the length-prefixed name, then the
///   length-prefixed value.
///
/// # Panics
///
/// Panics if either map's entry count, or any name or value byte
/// length, exceeds [`u32::MAX`]; structurally impossible for real
/// workspaces.
pub fn contribute_env(hasher: &mut Hasher, env: &EnvContribution<'_>) {
    let overrides = env.overrides;

    // Gather the host entries that survive shadowing up front: the
    // count prefix must be written before the first entry.
    // `BTreeMap` already iterates in sorted key order, so no extra
    // sort is needed for CACHE-009.
    let mut host_entries: Vec<(&EnvVarName, Option<&str>)> = Vec::new();
    for (name, value) in env.from_host {
        if overrides.contains_key(name) {
            continue;
        }
        host_entries.push((name, value.as_deref()));
    }

    let host_count = u32::try_from(host_entries.len())
        .expect("env from_host count within u32::MAX is structural");
    hasher.update(&host_count.to_be_bytes());
    for (name, value) in host_entries {
        write_length_prefixed(hasher, AsRef::<str>::as_ref(name.as_ref()).as_bytes());
        if let Some(present) = value {
            write_length_prefixed(hasher, present.as_bytes());
        } else {
            hasher.update(&[ENV_ABSENT_MARKER]);
        }
    }

    let overrides_len = u32::try_from(overrides.len())
        .expect("env override count within u32::MAX is structural");
    hasher.update(&overrides_len.to_be_bytes());
    for (name, value) in overrides {
        write_length_prefixed(hasher, AsRef::<str>::as_ref(name.as_ref()).as_bytes());
        write_length_prefixed(hasher, value.as_bytes());
    }
}

/// Feed `bytes` into `hasher` preceded by their length as a 4-byte
/// big-endian unsigned integer. This is the per-item encoding the
/// component writers in this module share.
///
/// # Panics
///
/// Panics if `bytes.len()` does not fit in a `u32`.
fn write_length_prefixed(hasher: &mut Hasher, bytes: &[u8]) {
    let prefix = u32::try_from(bytes.len())
        .expect("item length within u32::MAX is structural")
        .to_be_bytes();
    hasher.update(&prefix);
    hasher.update(bytes);
}

/// Return the string under which `shell` enters the cache key per
/// `CACHE-005`. The built-in variants map to their literal names
/// (`sh`, `bash`); a custom shell yields its validated
/// [`haz_domain::action::NonEmptyAsciiName`] unchanged.
fn shell_identifier(shell: &ShellType) -> &str {
    match shell {
        ShellType::Other(custom) => {
            let validated: &str = AsRef::<str>::as_ref(custom.as_ref());
            validated
        }
        ShellType::Bash => "bash",
        ShellType::Sh => "sh",
    }
}