use anodizer_core::DeterminismReport;
use anyhow::{Context, Result};
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
/// Hash, size, and sampled bytes recorded for a single artifact.
///
/// Instances are built by `hash_artifacts`, which stores them in a map keyed
/// by a forward-slash artifact name.
#[derive(Debug, Clone)]
pub(super) struct ArtifactInfo {
/// SHA-256 of the full file contents, formatted as `sha256:<lowercase hex>`.
pub(super) hash: String,
/// Length of the file contents in bytes.
pub(super) size_bytes: u64,
/// Artifact path with the worktree prefix stripped (the full path when the
/// artifact is not under the worktree), converted lossily to a string.
/// Path separators are left as the platform produced them.
pub(super) relative_path: String,
/// Stage label derived from `relative_path` by `infer_stage_from_path`.
pub(super) stage: String,
/// The first `min(size, HEAD_SAMPLE_BYTES)` bytes of the file.
pub(super) head_sample: Vec<u8>,
/// Trailing bytes of the file that lie beyond the head sample, capped at
/// `TAIL_SAMPLE_BYTES`; empty when the file fits entirely in `head_sample`.
pub(super) tail_sample: Vec<u8>,
}
/// Maximum number of leading bytes of an artifact kept in
/// `ArtifactInfo::head_sample` (16 KiB).
pub(super) const HEAD_SAMPLE_BYTES: usize = 16 * 1024;
/// Maximum number of trailing bytes of an artifact kept in
/// `ArtifactInfo::tail_sample` (16 KiB).
pub(super) const TAIL_SAMPLE_BYTES: usize = 16 * 1024;
/// Collects every artifact path under a worktree, sorted.
///
/// Two locations are scanned, each only if it exists:
/// * `<worktree>/dist` — every regular file, recursively;
/// * `<worktree>/.det-tmp/target` — raw release binaries, as selected by
///   `collect_raw_binaries`.
///
/// # Errors
/// Propagates any I/O error raised while walking either location.
pub(super) fn discover_artifacts(worktree_path: &Path) -> Result<Vec<PathBuf>> {
    let mut found: Vec<PathBuf> = Vec::new();

    let dist_dir = worktree_path.join("dist");
    if dist_dir.exists() {
        visit_dir(&dist_dir, &mut found)?;
    }

    let mut cargo_target = worktree_path.join(".det-tmp");
    cargo_target.push("target");
    if cargo_target.exists() {
        collect_raw_binaries(&cargo_target, &mut found)?;
    }

    // Directory iteration order is platform-dependent; sort so callers see a
    // stable listing.
    found.sort();
    Ok(found)
}
/// Recursively appends every regular file below `dir` to `out`.
///
/// Entries that are neither a directory nor a regular file according to
/// `DirEntry::file_type` (which does not follow symlinks) are skipped.
///
/// # Errors
/// Fails if `dir` or any nested directory cannot be listed, or if an entry's
/// type cannot be determined.
fn visit_dir(dir: &Path, out: &mut Vec<PathBuf>) -> Result<()> {
    let listing =
        std::fs::read_dir(dir).with_context(|| format!("reading directory {}", dir.display()))?;
    for item in listing {
        let item = item?;
        let kind = item.file_type()?;
        let child = item.path();
        if kind.is_dir() {
            visit_dir(&child, out)?;
        } else if kind.is_file() {
            out.push(child);
        }
    }
    Ok(())
}
/// Appends raw release binaries found under a cargo target directory to `out`.
///
/// Only the immediate sub-directories of `target_root` are considered:
/// * `release` is scanned directly;
/// * `debug`, `.rustc_info.json`, `CACHEDIR.TAG` and any dot-prefixed name are
///   ignored;
/// * any other directory is scanned via its own `release` child, when that
///   child is a directory.
///
/// A missing `target_root` is not an error and yields nothing.
///
/// # Errors
/// Fails on any other I/O error while listing `target_root` or a release
/// directory.
fn collect_raw_binaries(target_root: &Path, out: &mut Vec<PathBuf>) -> Result<()> {
    let listing = match std::fs::read_dir(target_root) {
        Ok(listing) => listing,
        Err(err) => {
            if err.kind() == std::io::ErrorKind::NotFound {
                return Ok(());
            }
            return Err(err).with_context(|| format!("reading {}", target_root.display()));
        }
    };

    for child in listing {
        let child = child?;
        if !child.file_type()?.is_dir() {
            continue;
        }

        let dir_name = child.file_name();
        let label = dir_name.to_string_lossy();
        // Resolve which release directory (if any) this entry contributes.
        let release_dir = match &*label {
            "release" => child.path(),
            "debug" | ".rustc_info.json" | "CACHEDIR.TAG" => continue,
            hidden if hidden.starts_with('.') => continue,
            _ => {
                let nested = child.path().join("release");
                if !nested.is_dir() {
                    continue;
                }
                nested
            }
        };
        push_release_dir_files(&release_dir, out)?;
    }
    Ok(())
}
/// Appends the binaries sitting directly inside `release_dir` to `out`.
///
/// A regular file is kept when its extension is `exe`, or when no extension
/// can be read as UTF-8 (which includes files with no extension at all).
/// Sub-directories are not descended into.
///
/// # Errors
/// Fails if `release_dir` cannot be listed or an entry's type cannot be read.
fn push_release_dir_files(release_dir: &Path, out: &mut Vec<PathBuf>) -> Result<()> {
    let listing = std::fs::read_dir(release_dir)
        .with_context(|| format!("reading {}", release_dir.display()))?;
    for item in listing {
        let item = item?;
        if !item.file_type()?.is_file() {
            continue;
        }
        let candidate = item.path();
        let ext = candidate.extension().and_then(|e| e.to_str());
        if matches!(ext, None | Some("exe")) {
            out.push(candidate);
        }
    }
    Ok(())
}
pub(super) fn hash_artifacts(
worktree_path: &Path,
paths: &[PathBuf],
) -> Result<BTreeMap<String, ArtifactInfo>> {
use sha2::{Digest, Sha256};
let mut out = BTreeMap::new();
let target_root = worktree_path.join(".det-tmp").join("target");
for p in paths {
let bytes =
std::fs::read(p).with_context(|| format!("reading artifact {}", p.display()))?;
let mut hasher = Sha256::new();
hasher.update(&bytes);
let digest = format!("sha256:{:x}", hasher.finalize());
let relative = p
.strip_prefix(worktree_path)
.unwrap_or(p)
.to_string_lossy()
.into_owned();
let name = if let Ok(under_target) = p.strip_prefix(&target_root) {
let suffix = under_target.to_string_lossy().replace('\\', "/");
format!("target/{}", suffix)
} else {
let dist_root = worktree_path.join("dist");
if let Ok(under_dist) = p.strip_prefix(&dist_root) {
under_dist.to_string_lossy().replace('\\', "/")
} else {
p.strip_prefix(worktree_path)
.unwrap_or(p)
.to_string_lossy()
.replace('\\', "/")
}
};
let stage = infer_stage_from_path(&relative);
let head_len = bytes.len().min(HEAD_SAMPLE_BYTES);
let head_sample = bytes[..head_len].to_vec();
let tail_sample = if bytes.len() <= HEAD_SAMPLE_BYTES {
Vec::new()
} else {
let tail_start = bytes
.len()
.saturating_sub(TAIL_SAMPLE_BYTES)
.max(HEAD_SAMPLE_BYTES);
bytes[tail_start..].to_vec()
};
out.insert(
name,
ArtifactInfo {
hash: digest,
size_bytes: bytes.len() as u64,
relative_path: relative,
stage,
head_sample,
tail_sample,
},
);
}
Ok(out)
}
/// Copies each artifact in `paths` into `dump_root`, mirroring its location.
///
/// Destination layout:
/// * under `<worktree>/.det-tmp/target` → `<dump_root>/target/<path below target>`;
/// * elsewhere under the worktree → `<dump_root>/<worktree-relative path>`;
/// * outside the worktree → `<dump_root>/<file name>`.
///
/// A failed copy is reported on stderr and does not abort the run.
///
/// # Errors
/// Fails only if a destination's parent directory cannot be created.
pub(super) fn copy_artifacts_to_dump(
    worktree_path: &Path,
    paths: &[PathBuf],
    dump_root: &Path,
) -> Result<()> {
    let target_root = worktree_path.join(".det-tmp").join("target");

    for src in paths {
        let dest_rel = match (
            src.strip_prefix(&target_root),
            src.strip_prefix(worktree_path),
        ) {
            (Ok(under_target), _) => PathBuf::from("target").join(under_target),
            (Err(_), Ok(under_worktree)) => under_worktree.to_path_buf(),
            (Err(_), Err(_)) => PathBuf::from(src.file_name().unwrap_or_default()),
        };
        let dest = dump_root.join(dest_rel);

        if let Some(parent) = dest.parent() {
            std::fs::create_dir_all(parent)
                .with_context(|| format!("creating dump parent {}", parent.display()))?;
        }

        // Best-effort: a dump is a debugging aid, so a failed copy only warns.
        if let Some(e) = std::fs::copy(src, &dest).err() {
            eprintln!(
                "warn: drift-bin dump failed for {} -> {}: {}",
                src.display(),
                dest.display(),
                e
            );
        }
    }
    Ok(())
}
/// Deletes from every per-run directory under `dump_root` the files that are
/// not listed as drifted in `report`.
///
/// Each immediate sub-directory of `dump_root` is treated as one run and
/// pruned independently. The function is best-effort: a missing or unreadable
/// `dump_root` is silently ignored.
pub(super) fn prune_dump_to_drifted(dump_root: &Path, report: &DeterminismReport) {
    if !dump_root.exists() {
        return;
    }

    let mut drift_names: std::collections::HashSet<&str> = std::collections::HashSet::new();
    for row in report.drift.iter() {
        drift_names.insert(row.artifact.as_str());
    }

    let Ok(run_dirs) = std::fs::read_dir(dump_root) else {
        return;
    };
    run_dirs
        .flatten()
        .map(|run| run.path())
        .filter(|run_path| run_path.is_dir())
        .for_each(|run_path| prune_dump_subtree(&run_path, &run_path, &drift_names));
}
/// Recursively removes every file under `dir` that is not a drifted artifact,
/// then removes directories left empty.
///
/// `root` is the run directory that `dir` lives in; file paths are compared
/// relative to it, with `/` separators. A file is kept when `drift_names`
/// contains any of:
/// * its path relative to `root` (e.g. `target/<triple>/release/bin`);
/// * that path with a leading `dist/` removed — artifact names for files under
///   `dist` are recorded relative to `dist`, while the dump stores them below
///   a `dist/` directory, so nested artifacts such as
///   `makeself/default/linux_amd64/anodizer` only match in this form;
/// * its bare file name.
///
/// All filesystem errors are ignored: pruning is best-effort.
fn prune_dump_subtree(root: &Path, dir: &Path, drift_names: &std::collections::HashSet<&str>) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        return;
    };
    for entry in entries.flatten() {
        let path = entry.path();
        if path.is_dir() {
            prune_dump_subtree(root, &path, drift_names);
            // Only drop the directory once pruning has emptied it.
            let now_empty = std::fs::read_dir(&path)
                .map(|mut it| it.next().is_none())
                .unwrap_or(false);
            if now_empty {
                let _ = std::fs::remove_dir(&path);
            }
        } else if path.is_file() {
            let rel = path
                .strip_prefix(root)
                .map(|r| r.to_string_lossy().replace('\\', "/"))
                .unwrap_or_default();
            let basename = path
                .file_name()
                .and_then(|s| s.to_str())
                .unwrap_or_default();
            let keep = drift_names.contains(rel.as_str())
                || drift_names.contains(basename)
                || matches!(rel.strip_prefix("dist/"), Some(name) if drift_names.contains(name));
            if !keep {
                let _ = std::fs::remove_file(&path);
            }
        }
    }
}
/// Guesses which pipeline stage produced an artifact from its path.
///
/// Matching is case-insensitive and treats `\` as `/`. The first rule that
/// applies wins:
/// 1. anything inside `.det-tmp/target/` → `build`
/// 2. `.sig`, `.pem`, `.cert` → `sign`
/// 3. path containing `checksums`, or ending in `sha256sum`, `sha256sums`,
///    `.sha256` → `checksum`
/// 4. `.sbom.json`, `.cdx.json`, `.spdx.json` → `sbom`
/// 5. `.tar.gz`, `.tar.xz`, `.tar.zst`, `.zip`, `.tar` → `archive`
/// 6. `.crate` → `cargo-package`
/// 7. otherwise → `unknown`
pub(super) fn infer_stage_from_path(rel: &str) -> String {
    let normalized = rel.replace('\\', "/").to_lowercase();
    let ends_with_any =
        |suffixes: &[&str]| suffixes.iter().any(|suffix| normalized.ends_with(*suffix));

    let stage = if normalized.starts_with(".det-tmp/target/")
        || normalized.contains("/.det-tmp/target/")
    {
        "build"
    } else if ends_with_any(&[".sig", ".pem", ".cert"]) {
        "sign"
    } else if normalized.contains("checksums")
        || ends_with_any(&["sha256sum", "sha256sums", ".sha256"])
    {
        "checksum"
    } else if ends_with_any(&[".sbom.json", ".cdx.json", ".spdx.json"]) {
        "sbom"
    } else if ends_with_any(&[".tar.gz", ".tar.xz", ".tar.zst", ".zip", ".tar"]) {
        "archive"
    } else if ends_with_any(&[".crate"]) {
        "cargo-package"
    } else {
        "unknown"
    };
    stage.to_owned()
}
#[cfg(test)]
mod tests {
use super::*;
// Checks one representative path per stage label returned by
// `infer_stage_from_path`, plus backslash-separated inputs.
#[test]
fn stage_inference_matches_known_extensions() {
assert_eq!(infer_stage_from_path("dist/foo.tar.gz"), "archive");
assert_eq!(infer_stage_from_path("dist/foo.zip"), "archive");
assert_eq!(infer_stage_from_path("dist/foo.crate"), "cargo-package");
assert_eq!(infer_stage_from_path("dist/foo.sbom.json"), "sbom");
// The `.sig` rule is checked before the archive rule, so a signed archive
// is attributed to `sign`.
assert_eq!(infer_stage_from_path("dist/foo.tar.gz.sig"), "sign");
assert_eq!(infer_stage_from_path("dist/checksums.txt"), "checksum");
// Matching is case-insensitive.
assert_eq!(infer_stage_from_path("dist/SHA256SUMS"), "checksum");
assert_eq!(infer_stage_from_path("dist/mystery.bin"), "unknown");
// Backslash separators are normalised before matching.
assert_eq!(
infer_stage_from_path(".det-tmp\\target\\x86_64-pc-windows-msvc\\release\\anodize.exe"),
"build"
);
assert_eq!(infer_stage_from_path("dist\\foo.tar.gz"), "archive");
}
// End-to-end check that raw cargo release binaries are discovered alongside
// `dist` artifacts, that cargo scratch files are excluded, and that
// `hash_artifacts` keys and stages them as expected.
#[test]
fn discover_artifacts_includes_raw_cargo_binaries() {
let tmp = tempfile::tempdir().unwrap();
let wt = tmp.path();
let dist = wt.join("dist");
std::fs::create_dir_all(&dist).unwrap();
std::fs::write(dist.join("anodize_0.3.0_linux_amd64.tar.gz"), b"archive").unwrap();
// Cross-compiled layout: target/<triple>/release.
let triple_release = wt
.join(".det-tmp")
.join("target")
.join("x86_64-unknown-linux-gnu")
.join("release");
std::fs::create_dir_all(&triple_release).unwrap();
std::fs::write(triple_release.join("anodize"), b"raw-bin-linux").unwrap();
// `.d` has an extension other than `exe`, and `deps/` is a sub-directory;
// neither should be picked up.
std::fs::write(triple_release.join("anodize.d"), b"depfile").unwrap();
std::fs::create_dir_all(triple_release.join("deps")).unwrap();
std::fs::write(triple_release.join("deps").join("libfoo.rlib"), b"rlib").unwrap();
let win_release = wt
.join(".det-tmp")
.join("target")
.join("x86_64-pc-windows-msvc")
.join("release");
std::fs::create_dir_all(&win_release).unwrap();
std::fs::write(win_release.join("anodize.exe"), b"raw-bin-windows").unwrap();
std::fs::write(win_release.join("anodize.pdb"), b"pdb").unwrap();
// Host layout: target/release directly.
let host_release = wt.join(".det-tmp").join("target").join("release");
std::fs::create_dir_all(&host_release).unwrap();
std::fs::write(host_release.join("anodize"), b"raw-bin-host").unwrap();
let artifacts = discover_artifacts(wt).expect("discover");
let names: Vec<String> = artifacts
.iter()
.map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
.collect();
assert!(
names
.iter()
.any(|n| n == "anodize_0.3.0_linux_amd64.tar.gz"),
"dist artifact missing: {names:?}"
);
assert_eq!(
names.iter().filter(|n| n.as_str() == "anodize").count(),
2,
"expected 2 `anodize` raw binaries (linux + host), got: {names:?}"
);
assert!(
names.iter().any(|n| n == "anodize.exe"),
"windows raw binary missing: {names:?}"
);
for forbidden in ["anodize.d", "anodize.pdb", "libfoo.rlib"] {
assert!(
!names.iter().any(|n| n == forbidden),
"cargo scratch `{forbidden}` leaked into discovery: {names:?}"
);
}
// Raw binaries must be keyed as `target/...` with forward slashes.
let map = hash_artifacts(wt, &artifacts).expect("hash");
let target_keys: Vec<&String> = map.keys().filter(|k| k.starts_with("target/")).collect();
assert_eq!(
target_keys.len(),
3,
"expected 3 `target/...`-prefixed map keys, got: {:?}",
map.keys().collect::<Vec<_>>()
);
for k in &target_keys {
assert!(
!k.contains('\\'),
"raw-binary map key contains backslash: {k}"
);
}
assert!(
target_keys
.iter()
.any(|k| { k.as_str() == "target/x86_64-unknown-linux-gnu/release/anodize" }),
"expected `target/x86_64-unknown-linux-gnu/release/anodize` key, got: {target_keys:?}"
);
// Everything under `.det-tmp/target/` is attributed to the `build` stage.
for k in &target_keys {
assert_eq!(
map.get(k.as_str()).map(|i| i.stage.as_str()),
Some("build"),
"raw binary `{k}` must be attributed to `build` stage"
);
}
}
// Builds two run directories under a dump root, marks two artifacts as
// drifted, and checks that pruning keeps exactly those in every run.
#[test]
fn prune_dump_to_drifted_keeps_files_matched_by_basename() {
use anodizer_core::{
AllowList, ArtifactRow, CURRENT_SCHEMA_VERSION, DeterminismReport, DriftRow,
};
let tmp = tempfile::tempdir().unwrap();
let dump_root = tmp.path();
for run_idx in 0..2 {
let run = dump_root.join(format!("run-{run_idx}"));
std::fs::create_dir_all(run.join("dist")).unwrap();
std::fs::write(run.join("dist/artifacts.json"), b"{}").unwrap();
std::fs::write(run.join("dist/keep-me-not.tar.gz"), b"green").unwrap();
let raw = run
.join("target")
.join("x86_64-unknown-linux-gnu")
.join("release");
std::fs::create_dir_all(&raw).unwrap();
std::fs::write(raw.join("anodize"), b"binary").unwrap();
}
let report = DeterminismReport {
schema_version: CURRENT_SCHEMA_VERSION,
anodize_version: "0.3.0".into(),
commit: "abc".into(),
commit_timestamp: 0,
runs: 2,
stages_under_test: vec!["archive".into()],
allowlist: AllowList::default(),
artifacts: vec![],
drift: vec![
// Named without the `dist/` directory it sits in inside the dump, so it
// can only be matched by file name.
DriftRow {
artifact: "artifacts.json".into(),
hashes: vec!["sha256:a".into(), "sha256:b".into()],
differing_bytes_summary: None,
},
// Named by its full dump-relative path.
DriftRow {
artifact: "target/x86_64-unknown-linux-gnu/release/anodize".into(),
hashes: vec!["sha256:c".into(), "sha256:d".into()],
differing_bytes_summary: None,
},
],
drift_count: 2,
};
// Constructed and immediately discarded; nothing below reads it.
// NOTE(review): presumably only here to keep the `ArtifactRow` import in
// use — confirm, or drop both.
let _ = ArtifactRow {
name: "noop".into(),
path: "noop".into(),
size_bytes: 0,
stage: "unknown".into(),
deterministic: true,
nondeterministic_reason: None,
hash: None,
hashes: vec![],
};
prune_dump_to_drifted(dump_root, &report);
for run_idx in 0..2 {
let run = dump_root.join(format!("run-{run_idx}"));
assert!(
run.join("dist/artifacts.json").is_file(),
"drifted dist artifact must survive prune (basename match)"
);
assert!(
run.join("target/x86_64-unknown-linux-gnu/release/anodize")
.is_file(),
"drifted raw binary must survive prune (rel-path match)"
);
assert!(
!run.join("dist/keep-me-not.tar.gz").exists(),
"non-drifted artifact must be pruned"
);
}
}
// For a file larger than the head sample but smaller than head + tail, the
// tail sample must start exactly where the head sample ends.
#[test]
fn hash_artifacts_samples_tail_for_mid_size_files() {
let tmp = tempfile::tempdir().unwrap();
let wt = tmp.path();
let dist = wt.join("dist");
std::fs::create_dir_all(&dist).unwrap();
// 24 KiB: 8 KiB more than HEAD_SAMPLE_BYTES. The repeating 0..=255 pattern
// makes a misplaced slice visible in the comparison below.
let mut bytes = vec![0u8; 24 * 1024];
for (i, b) in bytes.iter_mut().enumerate() {
*b = (i & 0xff) as u8;
}
std::fs::write(dist.join("artifacts.json"), &bytes).unwrap();
let paths = discover_artifacts(wt).unwrap();
let map = hash_artifacts(wt, &paths).unwrap();
// Files under `dist` are keyed by their path relative to `dist`.
let info = map.get("artifacts.json").expect("artifacts.json must hash");
assert_eq!(info.head_sample.len(), HEAD_SAMPLE_BYTES);
assert_eq!(
info.tail_sample.len(),
bytes.len() - HEAD_SAMPLE_BYTES,
"tail must cover bytes [HEAD..size] to close the gap"
);
let mut reconstructed = info.head_sample.clone();
reconstructed.extend_from_slice(&info.tail_sample);
assert_eq!(
reconstructed, bytes,
"head + tail must concatenate back to the original artifact"
);
}
// A worktree with `dist` but no `.det-tmp/target` must still be discoverable.
#[test]
fn discover_artifacts_tolerates_missing_target_dir() {
let tmp = tempfile::tempdir().unwrap();
let wt = tmp.path();
let dist = wt.join("dist");
std::fs::create_dir_all(&dist).unwrap();
std::fs::write(dist.join("foo.tar.gz"), b"x").unwrap();
let out = discover_artifacts(wt).expect("must not error on missing target dir");
// Only the single `dist` file is reported.
assert_eq!(out.len(), 1);
}
// Two files with the same file name in different `dist` sub-directories must
// get separate map entries instead of overwriting each other.
#[test]
fn hash_artifacts_distinguishes_same_basename_across_arch_dirs() {
let tmp = tempfile::tempdir().unwrap();
let wt = tmp.path();
let amd64_dir = wt
.join("dist")
.join("makeself")
.join("default")
.join("linux_amd64");
let arm64_dir = wt
.join("dist")
.join("makeself")
.join("default")
.join("linux_arm64");
std::fs::create_dir_all(&amd64_dir).unwrap();
std::fs::create_dir_all(&arm64_dir).unwrap();
std::fs::write(amd64_dir.join("anodizer"), b"amd64-bytes").unwrap();
std::fs::write(arm64_dir.join("anodizer"), b"arm64-bytes").unwrap();
let paths = discover_artifacts(wt).unwrap();
let map = hash_artifacts(wt, &paths).unwrap();
// Keys are the paths relative to `dist`, with forward slashes.
let amd64_key = "makeself/default/linux_amd64/anodizer";
let arm64_key = "makeself/default/linux_arm64/anodizer";
assert!(
map.contains_key(amd64_key),
"amd64 entry missing; map keys: {:?}",
map.keys().collect::<Vec<_>>()
);
assert!(
map.contains_key(arm64_key),
"arm64 entry missing; map keys: {:?}",
map.keys().collect::<Vec<_>>()
);
let amd64_hash = &map[amd64_key].hash;
let arm64_hash = &map[arm64_key].hash;
assert_ne!(
amd64_hash, arm64_hash,
"distinct arch files must produce distinct hashes"
);
}
}