koala-artifact 1.0.4

Reviewer artifact format and sampling verifier.
Documentation
//! `verify` re-runs a sample of recorded commands and compares hashes.
//!
//! Sampling is deterministic given a seed (defaults to a constant so
//! reruns under tests are reproducible). On hash mismatch, verify
//! synthesises a small per-line diff against the stored output for the
//! reviewer to triage.

use crate::normalize::{compute_hash, sha256_hex};
use crate::path::ArtifactPath;
use crate::record::ArtifactRecord;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use std::process::Command;
use walkdir::WalkDir;

/// Settings for a single [`verify`] run.
#[derive(Debug, Clone)]
pub struct VerifyOptions {
    /// Repository root. `.review/` is looked up beneath it and recorded
    /// commands are re-run with it as their working directory.
    pub repo_root: PathBuf,
    /// Sample ratio in percent, `1..=100`. Values outside that range are
    /// not clamped: [`verify`] rejects them with
    /// [`VerifyError::BadOptions`]. Default 10 (matches ADR-0005).
    pub sample_ratio_percent: u32,
    /// Seed for deterministic sampling. Reviewer can't predict CI's seed
    /// when it's tied to PR + run id (see ADR-0005). Tests pin an explicit
    /// value.
    pub seed: String,
    /// Upper bound on the number of diff lines emitted per mismatch.
    pub diff_lines: usize,
}

impl Default for VerifyOptions {
    /// Current directory as the repo root, a 10% sample, a fixed seed and
    /// at most five diff lines per mismatch.
    fn default() -> Self {
        let repo_root = PathBuf::from(".");
        let seed = "koala-artifact-default".to_owned();
        Self {
            repo_root,
            sample_ratio_percent: 10,
            seed,
            diff_lines: 5,
        }
    }
}

/// One entry of the diff hint attached to a hash mismatch.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiffLine {
    /// Whether the line was only in the recorded output or only in the rerun.
    pub side: DiffSide,
    /// The line's content. When produced by `line_diff` this comes from
    /// `str::lines`, so it carries no line terminator.
    pub text: String,
}

/// Which side of the recorded-vs-rerun comparison a [`DiffLine`] belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiffSide {
    /// Present in the recorded artifact, absent from the rerun output.
    Removed,
    /// Present in the rerun output, absent from the recorded artifact.
    Added,
}

impl DiffSide {
    /// Single-character prefix for this side: `-` for [`DiffSide::Removed`],
    /// `+` for [`DiffSide::Added`].
    pub fn marker(&self) -> char {
        match *self {
            DiffSide::Added => '+',
            DiffSide::Removed => '-',
        }
    }
}

/// Verification result for one sampled artifact.
#[derive(Debug, Clone)]
pub struct VerifyOutcome {
    /// Path relative to the repo root.
    pub artifact: PathBuf,
    /// What happened when the artifact's recorded command was re-run.
    pub status: VerifyStatus,
}

/// Outcome category for a single re-run artifact.
#[derive(Debug, Clone)]
pub enum VerifyStatus {
    /// Re-ran successfully and the hash matched the artifact. `hash` is the
    /// freshly computed value (equal to the recorded one).
    Match { hash: String },
    /// Hash disagreed — artifact was tampered or the command is non-deterministic.
    Mismatch {
        /// Hash stored in the artifact.
        expected: String,
        /// Hash computed from the rerun.
        actual: String,
        /// Capped per-line hint of how the recorded and rerun outputs differ.
        diff: Vec<DiffLine>,
    },
    /// Could not parse the file or run the command. Reported but distinct
    /// from a real tamper finding. The payload is a human-readable message
    /// (`read failed: …`, `parse failed: …` or `rerun failed: …`).
    Error(String),
}

/// Summary of one `verify` run.
#[derive(Debug, Clone)]
pub struct VerifyReport {
    /// All artifacts found under `.review/round-*/`.
    pub total: usize,
    /// Subset that we actually re-ran.
    pub sampled: usize,
    /// One outcome per sampled artifact.
    pub results: Vec<VerifyOutcome>,
}

impl VerifyReport {
    pub fn pass_count(&self) -> usize {
        self.results
            .iter()
            .filter(|r| matches!(r.status, VerifyStatus::Match { .. }))
            .count()
    }

    pub fn mismatch_count(&self) -> usize {
        self.results
            .iter()
            .filter(|r| matches!(r.status, VerifyStatus::Mismatch { .. }))
            .count()
    }

    pub fn error_count(&self) -> usize {
        self.results
            .iter()
            .filter(|r| matches!(r.status, VerifyStatus::Error(_)))
            .count()
    }

    /// Mismatches are gating; parse / spawn errors are also surfaced as
    /// non-zero exit because they indicate a malformed artifact.
    pub fn is_clean(&self) -> bool {
        self.mismatch_count() == 0 && self.error_count() == 0
    }
}

/// Failures that make `verify` return `Err` instead of a report.
/// Per-artifact problems are reported through `VerifyStatus::Error` instead.
#[derive(Debug)]
pub enum VerifyError {
    /// I/O failure while walking the `.review` tree.
    Walk(io::Error),
    /// The supplied options were rejected; the payload is the full message.
    BadOptions(String),
}

impl std::fmt::Display for VerifyError {
    /// Renders a one-line, user-facing description of the failure.
    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            VerifyError::BadOptions(message) => out.write_str(message),
            VerifyError::Walk(cause) => write!(out, "walking .review failed: {cause}"),
        }
    }
}

impl std::error::Error for VerifyError {}

pub fn verify(opts: &VerifyOptions) -> Result<VerifyReport, VerifyError> {
    if opts.sample_ratio_percent == 0 || opts.sample_ratio_percent > 100 {
        return Err(VerifyError::BadOptions(format!(
            "sample ratio must be 1..=100, got {}",
            opts.sample_ratio_percent
        )));
    }

    let mut artifacts = collect_artifacts(&opts.repo_root);
    artifacts.sort();
    let total = artifacts.len();

    let sample = select_sample(&artifacts, opts.sample_ratio_percent, &opts.seed);
    let sampled = sample.len();

    let mut results = Vec::with_capacity(sampled);
    for rel in sample {
        results.push(verify_one(&opts.repo_root, &rel, opts.diff_lines));
    }

    Ok(VerifyReport {
        total,
        sampled,
        results,
    })
}

/// All `.review/round-*/<file>.md` paths, returned relative to `repo_root`.
/// Files that don't match the canonical layout are silently skipped — they
/// might be `_assets/` directories or stray notes.
fn collect_artifacts(repo_root: &Path) -> Vec<PathBuf> {
    let dir = repo_root.join(".review");
    if !dir.is_dir() {
        return Vec::new();
    }
    WalkDir::new(&dir)
        .into_iter()
        .filter_map(Result::ok)
        .filter(|e| e.file_type().is_file())
        .filter_map(|e| e.path().strip_prefix(repo_root).ok().map(Path::to_path_buf))
        .filter(|rel| ArtifactPath::parse_relative(rel).is_ok())
        .collect()
}

/// Picks the artifacts to re-run.
///
/// Every path is ranked by the SHA-256 of `seed`, a unit-separator
/// character and the path's display form. The lowest-ranked
/// `ceil(N * ratio_percent / 100)` paths are returned, never fewer than one
/// when `items` is non-empty. The same seed and input always give the same
/// selection.
fn select_sample(items: &[PathBuf], ratio_percent: u32, seed: &str) -> Vec<PathBuf> {
    if items.is_empty() {
        return Vec::new();
    }

    let quota = (items.len() * ratio_percent as usize)
        .div_ceil(100)
        .max(1);

    let mut ranked: Vec<(String, &PathBuf)> = Vec::with_capacity(items.len());
    for path in items {
        let key = format!("{seed}\u{1f}{}", path.display());
        ranked.push((sha256_hex(&key), path));
    }
    // Stable sort: equal digests keep the caller's ordering.
    ranked.sort_by(|(lhs, _), (rhs, _)| lhs.cmp(rhs));
    ranked.truncate(quota);

    ranked.into_iter().map(|(_, path)| path.clone()).collect()
}

fn verify_one(repo_root: &Path, rel: &Path, diff_cap: usize) -> VerifyOutcome {
    let abs = repo_root.join(rel);
    let text = match fs::read_to_string(&abs) {
        Ok(s) => s,
        Err(e) => {
            return VerifyOutcome {
                artifact: rel.to_path_buf(),
                status: VerifyStatus::Error(format!("read failed: {e}")),
            };
        }
    };
    let record = match ArtifactRecord::parse(&text) {
        Ok(r) => r,
        Err(e) => {
            return VerifyOutcome {
                artifact: rel.to_path_buf(),
                status: VerifyStatus::Error(format!("parse failed: {e}")),
            };
        }
    };

    let actual = match rerun(repo_root, &record.command) {
        Ok(out) => out,
        Err(e) => {
            return VerifyOutcome {
                artifact: rel.to_path_buf(),
                status: VerifyStatus::Error(format!("rerun failed: {e}")),
            };
        }
    };
    let actual_hash = compute_hash(&record.command, actual.exit_code, &actual.output, repo_root);
    if actual_hash == record.hash {
        VerifyOutcome {
            artifact: rel.to_path_buf(),
            status: VerifyStatus::Match { hash: actual_hash },
        }
    } else {
        let diff = line_diff(&record.output, &actual.output, diff_cap);
        VerifyOutcome {
            artifact: rel.to_path_buf(),
            status: VerifyStatus::Mismatch {
                expected: record.hash.clone(),
                actual: actual_hash,
                diff,
            },
        }
    }
}

/// What a re-executed command produced.
struct RerunOutput {
    /// Process exit code, or `-1` when the OS reported none.
    exit_code: i32,
    /// Stdout followed by stderr, decoded lossily as UTF-8.
    output: String,
}

/// Executes `command` (program followed by its arguments) with `repo_root`
/// as the working directory and captures its exit code and output.
///
/// Stderr is appended after stdout; a newline is inserted between them only
/// when stdout is non-empty and does not already end with one.
///
/// # Errors
///
/// `InvalidInput` when `command` is empty, otherwise whatever spawning or
/// waiting for the process reports.
fn rerun(repo_root: &Path, command: &[String]) -> Result<RerunOutput, io::Error> {
    let Some((program, args)) = command.split_first() else {
        return Err(io::Error::new(io::ErrorKind::InvalidInput, "empty command"));
    };

    let finished = Command::new(program)
        .args(args)
        .current_dir(repo_root)
        .output()?;

    // Reuse the stdout buffer as the start of the merged stream.
    let mut merged = finished.stdout;
    if !finished.stderr.is_empty() {
        let needs_separator = matches!(merged.last(), Some(&byte) if byte != b'\n');
        if needs_separator {
            merged.push(b'\n');
        }
        merged.extend_from_slice(&finished.stderr);
    }

    Ok(RerunOutput {
        exit_code: finished.status.code().unwrap_or(-1),
        output: String::from_utf8_lossy(&merged).into_owned(),
    })
}

/// Cheap set-based diff hint between two outputs.
///
/// Distinct lines found only in `expected` are reported as `Removed`,
/// distinct lines found only in `actual` as `Added`. Removed lines come
/// first and each group is in ascending order. At most `cap` entries are
/// returned in total — the verify printout only needs a hint. Duplicates
/// and original line order are not preserved.
fn line_diff(expected: &str, actual: &str, cap: usize) -> Vec<DiffLine> {
    use std::collections::BTreeSet;

    // Ordered sets yield each difference already sorted.
    let recorded: BTreeSet<&str> = expected.lines().collect();
    let fresh: BTreeSet<&str> = actual.lines().collect();

    let removed = recorded
        .difference(&fresh)
        .map(|line| (DiffSide::Removed, *line));
    let added = fresh
        .difference(&recorded)
        .map(|line| (DiffSide::Added, *line));

    removed
        .chain(added)
        .take(cap)
        .map(|(side, line)| DiffLine {
            side,
            text: line.to_string(),
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `count` synthetic artifact paths: `a-0.md`, `a-1.md`, ….
    fn numbered_paths(count: usize) -> Vec<PathBuf> {
        (0..count)
            .map(|i| PathBuf::from(format!("a-{i}.md")))
            .collect()
    }

    #[test]
    fn select_sample_is_deterministic_per_seed() {
        let items = numbered_paths(20);
        let first = select_sample(&items, 25, "seed-X");
        let second = select_sample(&items, 25, "seed-X");
        assert_eq!(first, second);

        let other_seed = select_sample(&items, 25, "seed-Y");
        assert_ne!(
            first, other_seed,
            "different seed should pick a different subset"
        );
    }

    #[test]
    fn select_sample_respects_ratio() {
        let items = numbered_paths(10);
        for (ratio, expected_len) in [(100, 10), (50, 5), (10, 1)] {
            assert_eq!(select_sample(&items, ratio, "s").len(), expected_len);
        }
        // Ceiling: 10% of 1 → at least 1.
        let single = [PathBuf::from("x.md")];
        assert_eq!(select_sample(&single, 10, "s").len(), 1);
    }

    #[test]
    fn empty_repo_returns_empty_report() {
        let scratch = tempfile::tempdir().unwrap();
        let opts = VerifyOptions {
            repo_root: scratch.path().to_path_buf(),
            ..VerifyOptions::default()
        };
        let report = verify(&opts).unwrap();
        assert_eq!(report.total, 0);
        assert_eq!(report.sampled, 0);
    }
}