koala-artifact 1.0.4

Reviewer artifact format and sampling verifier.
Documentation
//! Hash computation for artifacts (ADR-0003 / ADR-0005).
//!
//! Normalization strips fields that legitimately drift across reruns —
//! timestamps in command output, the absolute path of the working
//! checkout — and sorts output lines so that commands whose output order
//! is determined by the filesystem (e.g. `grep -rn`) hash stably.

use regex::Regex;
use sha2::{Digest, Sha256};
use std::path::Path;
use std::sync::OnceLock;

/// Process-wide slot holding the compiled timestamp pattern; filled on first use.
static TIMESTAMP_RE: OnceLock<Regex> = OnceLock::new();

/// Returns the shared timestamp matcher, compiling it on the first call.
///
/// The pattern accepts `YYYY-MM-DD`, then `T` or a single space, then
/// `HH:MM:SS`, optionally followed by fractional seconds and optionally by a
/// zone suffix (`Z`, or a signed offset written `HH:MM` or `HHMM`).
///
/// # Panics
///
/// Panics if the hard-coded pattern fails to compile.
fn timestamp_re() -> &'static Regex {
    // Covers ISO 8601 as well as the space-separated `YYYY-MM-DD HH:MM:SS`
    // form that cargo and many other CLIs print.
    const PATTERN: &str =
        r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?";

    TIMESTAMP_RE.get_or_init(|| Regex::new(PATTERN).expect("timestamp regex compiles"))
}

/// Replacement text substituted for every timestamp match.
const TIMESTAMP_PLACEHOLDER: &str = "<TS>";
/// Replacement text substituted for every occurrence of the repository root path.
const ROOT_PLACEHOLDER: &str = "<ROOT>";

/// Replaces run-specific substrings of `s` with fixed placeholders.
///
/// Two substitutions are applied, in this order:
///
/// 1. Every occurrence of `repo_root` (as UTF-8 text, with trailing path
///    separators ignored) becomes `<ROOT>`.
/// 2. Every timestamp matched by `timestamp_re` becomes `<TS>`.
///
/// The root is substituted first, so a checkout directory whose name happens
/// to contain a timestamp is still matched verbatim.
///
/// Trailing separators are trimmed from the root before matching because
/// `Path` treats `/repo/` and `/repo` as equal; without trimming, the two
/// spellings would normalize the same output differently. Trimming also means
/// a root of `/` or `./` is handled like an empty or `.` root instead of
/// rewriting every separator in the text.
///
/// The root substitution is skipped entirely when the path is not valid
/// UTF-8, or when the trimmed root is empty or `.`.
fn strip_volatile(s: &str, repo_root: &Path) -> String {
    let root = repo_root
        .to_str()
        .map(|raw| raw.trim_end_matches(std::path::is_separator))
        .filter(|root| !root.is_empty() && *root != ".");

    let without_root = match root {
        Some(root) => s.replace(root, ROOT_PLACEHOLDER),
        None => s.to_string(),
    };

    timestamp_re()
        .replace_all(&without_root, TIMESTAMP_PLACEHOLDER)
        .into_owned()
}

/// Produce the canonical text that is fed to the artifact hash.
///
/// The layout is an internal detail; the only promise is that equal inputs
/// (after normalization) yield equal text.
///
/// * `command` — argv of the command; each argument is passed through
///   `strip_volatile` and the results are joined with U+001F.
/// * `exit_code` — written verbatim in decimal.
/// * `output` — passed through `strip_volatile`, split into lines, sorted
///   and re-joined with `\n`, so the original line order does not matter.
/// * `repo_root` — checkout path handed to `strip_volatile` for both the
///   arguments and the output.
pub fn normalized_input(
    command: &[String],
    exit_code: i32,
    output: &str,
    repo_root: &Path,
) -> String {
    // U+001F (unit separator) is assumed not to occur inside real arguments.
    const ARG_SEPARATOR: &str = "\u{1f}";

    let args = command
        .iter()
        .map(|arg| strip_volatile(arg, repo_root))
        .collect::<Vec<_>>();

    let cleaned_output = strip_volatile(output, repo_root);
    let mut sorted_lines = cleaned_output.lines().collect::<Vec<_>>();
    sorted_lines.sort_unstable();

    format!(
        "command:{}\nexit:{}\noutput:\n{}\n",
        args.join(ARG_SEPARATOR),
        exit_code,
        sorted_lines.join("\n"),
    )
}

/// Returns the SHA-256 digest of the UTF-8 bytes of `s` as lowercase hex.
pub fn sha256_hex(s: &str) -> String {
    use std::fmt::Write;

    let mut hasher = Sha256::new();
    hasher.update(s.as_bytes());
    let digest = hasher.finalize();

    // Two hex characters per digest byte: 32 bytes -> 64 characters.
    digest
        .iter()
        .fold(String::with_capacity(64), |mut hex, byte| {
            write!(hex, "{byte:02x}").expect("writing to String");
            hex
        })
}

/// Public artifact hash, rendered as `sha256:<hex>` like the wiki examples.
///
/// The inputs are first canonicalized by `normalized_input`; the resulting
/// text is hashed with `sha256_hex` and prefixed with `sha256:`.
pub fn compute_hash(command: &[String], exit_code: i32, output: &str, repo_root: &Path) -> String {
    let canonical = normalized_input(command, exit_code, output, repo_root);
    let hex = sha256_hex(&canonical);
    format!("sha256:{hex}")
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Builds an owned argv from string literals.
    fn argv(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|part| (*part).to_string()).collect()
    }

    /// Outputs that differ only in their timestamps hash identically.
    #[test]
    fn timestamps_stripped_so_hash_stable() {
        let command = argv(&["echo", "hi"]);
        let checkout = PathBuf::from("/tmp/checkout-a");

        let first = compute_hash(&command, 0, "started 2026-05-07T14:32:01Z done", &checkout);
        let second = compute_hash(
            &command,
            0,
            "started 2026-05-08T09:00:00.123Z done",
            &checkout,
        );

        assert_eq!(first, second);
    }

    /// The same relative paths under two different checkouts hash identically.
    #[test]
    fn absolute_repo_paths_collapsed() {
        let command = argv(&["grep", "-r", "x"]);
        let hash_under = |root: &str| {
            let output = format!("{root}/src/foo.rs:1:x\n{root}/src/bar.rs:2:x");
            compute_hash(&command, 0, &output, Path::new(root))
        };

        assert_eq!(hash_under("/home/alice/repo"), hash_under("/srv/build/repo"));
    }

    /// Permuting the output lines leaves the hash unchanged.
    #[test]
    fn line_order_does_not_affect_hash() {
        let command = argv(&["grep"]);
        let root = Path::new("/tmp/r");

        let ordered = compute_hash(&command, 0, "alpha\nbeta\ngamma", root);
        let shuffled = compute_hash(&command, 0, "gamma\nalpha\nbeta", root);

        assert_eq!(ordered, shuffled);
    }

    /// Different exit codes produce different hashes.
    #[test]
    fn exit_code_is_part_of_hash() {
        let command = argv(&["true"]);
        let root = Path::new("/tmp/r");

        let success = compute_hash(&command, 0, "out", root);
        let failure = compute_hash(&command, 1, "out", root);

        assert_ne!(success, failure);
    }

    /// Different arguments produce different hashes.
    #[test]
    fn command_args_are_part_of_hash() {
        let root = Path::new("/tmp/r");

        let with_foo = compute_hash(&argv(&["grep", "foo"]), 0, "out", root);
        let with_bar = compute_hash(&argv(&["grep", "bar"]), 0, "out", root);

        assert_ne!(with_foo, with_bar);
    }
}