mod types;
use anyhow::{Context, anyhow};
use clap::Args;
use kdam::BarExt;
use kdam::term;
use owo_colors::OwoColorize;
use rayon::prelude::*;
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fs;
use std::fs::File;
use std::io::IsTerminal;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
use std::sync::mpsc;
use std::thread;
use std::time::Duration;
use wait_timeout::ChildExt;
use walkdir::WalkDir;
use crate::utils::progress_bar::ProgressBar;
use crate::validate::types::LeaderboardResults;
use regex::Regex;
// Command-line arguments consumed by `run`.
//
// NOTE: plain `//` comments are used on purpose; clap's derive macros read
// `///` doc comments as help text, and every field already sets `help`.
#[derive(Args, Debug)]
pub struct ValidateArgs {
// Root directory that `run` walks looking for `run_*` sub-directories.
#[arg(long, help = "The path to where the agentic conversions are stored")]
pub conversions_dir: PathBuf,
// Root of the validation harness tree; `<layer>/<app>/<framework>` is joined
// onto it to locate the files copied into each run and the smoke-test metadata.
#[arg(long, help = "The path where the benchmark directory is stored")]
pub validations_dir: PathBuf,
// Per-run limit for `make test`, in minutes (converted to seconds before use).
#[arg(
long,
default_value_t = 10,
help = "How much time before we hit evaluation timeout (minutes)."
)]
pub timeout: u64,
// When true, `run` finishes by writing a leaderboard JSON file.
#[arg(
long,
help = "If set, write a leaderboard JSON file into the conversions dir."
)]
pub leaderboard_out: bool,
// When true, `make test` is not executed; the existing
// `<run>/validation/run.log` path is used as-is.
#[arg(
long,
help = "If set, skip running `make test` and only re-parse existing run.log files."
)]
pub dont_rerun: bool,
}
/// Messages sent from the worker threads to the progress-bar thread in `run`.
enum UiMessage {
/// Advance the progress bar by the given amount.
Tick(usize),
/// Write a pre-formatted log line through the progress bar.
Log(String),
}
/// Validates every conversion found under `args.conversions_dir`.
///
/// 1. Walks the conversions directory and collects the parent of every
///    `run_*` directory (de-duplicated).
/// 2. In parallel, for each such parent, visits its `run_*` children and
///    either runs the validation harness (`make test`) or, with
///    `--dont-rerun`, just points at the existing `validation/run.log`.
///    Progress and per-directory results are reported through a dedicated
///    progress-bar thread.
/// 3. Parses every produced run log and updates the run's `metadata.json`.
/// 4. Optionally writes the leaderboard file.
///
/// # Errors
///
/// Fails if the progress-bar thread panics or returns an error, if parsing
/// any run log fails, or if leaderboard generation fails. Failures while
/// validating an individual conversion directory are logged and skipped.
///
/// Returns `Ok(0)` on success.
pub fn run(args: ValidateArgs) -> anyhow::Result<i32> {
    // Parents of all `run_*` directories; the HashSet removes duplicates that
    // arise when one parent holds several runs.
    let dirs: Vec<PathBuf> = WalkDir::new(&args.conversions_dir)
        .min_depth(1)
        .follow_links(false)
        .into_iter()
        .filter_map(Result::ok)
        .filter(|entry| {
            entry.file_type().is_dir()
                && entry.file_name().to_string_lossy().starts_with("run_")
        })
        .filter_map(|entry| Some(entry.path().parent()?.to_path_buf()))
        .collect::<HashSet<_>>()
        .into_iter()
        .collect();
    log::debug!("Found {} conversions", dirs.len());

    let total = dirs.len();
    let (tx, rx) = mpsc::channel::<UiMessage>();

    // The progress bar lives on its own thread so that workers never contend
    // for the terminal; they only send messages.
    let tui = thread::spawn(move || -> anyhow::Result<()> {
        term::init(std::io::stderr().is_terminal());
        let mut pb = total.progress("Evaluating Conversions", " Eval");
        // Ends once every sender has been dropped.
        while let Ok(msg) = rx.recv() {
            match msg {
                UiMessage::Tick(n) => {
                    pb.update(n)?;
                },
                UiMessage::Log(l) => {
                    pb.write(l)?;
                },
            }
        }
        // NOTE(review): prints an empty string; presumably meant to finish
        // the progress-bar line — confirm.
        eprint!("");
        Ok(())
    });

    let collected_log_paths: Vec<anyhow::Result<Vec<PathBuf>>> = dirs
        .par_iter()
        .map_with(tx.clone(), |tx, dir| {
            let res: anyhow::Result<Vec<PathBuf>> = fs::read_dir(dir)
                .with_context(|| {
                    format!(
                        "Failed to read sub directories of {}",
                        dir.display()
                    )
                })
                .and_then(|entries| {
                    entries
                        .filter_map(Result::ok)
                        .map(|e| e.path())
                        .filter(|p| {
                            // A path without a final component is simply not
                            // a run directory; never panic inside a worker.
                            p.is_dir()
                                && p.file_name().is_some_and(|name| {
                                    name.to_string_lossy().starts_with("run_")
                                })
                        })
                        .try_fold(Vec::new(), |mut log_paths, subdir| {
                            let log_path = if args.dont_rerun {
                                subdir.join("validation").join("run.log")
                            } else {
                                copy_validation_harness_and_run_make_test(
                                    &args.validations_dir,
                                    &subdir,
                                    args.timeout,
                                )?
                            };
                            log_paths.push(log_path);
                            Ok(log_paths)
                        })
                });

            // Send failures are ignored: they only happen when the UI thread
            // has already gone away, which is reported when it is joined.
            match &res {
                Ok(_) => {
                    let _ = tx.send(UiMessage::Log(format!(
                        "{}\t{}",
                        "[INFO]".to_string().bold().bright_cyan(),
                        format!(
                            "Successfully validated {}",
                            dir.to_string_lossy()
                        )
                        .bold()
                        .bright_white()
                    )));
                },
                Err(e) => {
                    // `{:#}` prints the whole anyhow context chain, so the
                    // root cause (e.g. the OS error) is visible, not just the
                    // outermost context message.
                    let _ = tx.send(UiMessage::Log(format!(
                        "{}\t{}",
                        "[ERROR]".to_string().bold().bright_magenta(),
                        format!("{e:#}").bold().bright_magenta()
                    )));
                },
            }
            let _ = tx.send(UiMessage::Tick(1));
            res
        })
        .collect();

    // Drop the last sender so the UI thread's receive loop terminates.
    drop(tx);
    tui.join().map_err(|e| anyhow!("TUI panicked: {:?}", e))??;

    // Directories that failed above were already reported; only parse the
    // logs of the ones that succeeded.
    for log_path in
        collected_log_paths.into_iter().filter_map(Result::ok).flatten()
    {
        parse_run_log_and_update_metadata(&log_path, &args.validations_dir)?;
    }

    if args.leaderboard_out {
        generate_leaderboard(&dirs, &args.conversions_dir)?;
    }
    Ok(0)
}
fn generate_leaderboard(
dirs: &[PathBuf],
conversions_dir: &Path,
) -> anyhow::Result<()> {
let metadatas: Vec<types::Metadata> = dirs
.iter()
.flat_map(|d| fs::read_dir(d).ok().into_iter().flatten())
.filter_map(Result::ok)
.map(|f| f.path())
.filter(|p| p.file_name().and_then(|f| f.to_str()).is_some_and(|f| f.starts_with("run_")))
.filter_map(|subdir| {
let path = subdir.join("metadata.json");
let file = File::open(&path).ok()?;
serde_json::from_reader(file).ok()
}).collect();
let mut groups: BTreeMap<
(String, String, String, String),
Vec<types::Metadata>,
> = BTreeMap::new();
metadatas.into_iter().for_each(|metadata| {
let key = (
metadata.source_framework.to_string(),
metadata.target_framework.to_string(),
metadata.layer.clone(),
metadata.app.clone(),
);
groups.entry(key).or_default().push(metadata);
});
let sample =
groups.values().next().and_then(|v| v.first()).ok_or_else(|| {
anyhow!("no metadata.json files found under conversions dir")
})?;
let solution = types::LeaderboardSolution {
agent: sample
.solution_name
.clone()
.unwrap_or_else(|| sample.agent.clone()),
model: sample.model.clone().unwrap_or_default(),
variant: sample.variant.clone(),
date: chrono::Utc::now().format("%Y-%m-%d").to_string(),
};
let results: Vec<LeaderboardResults> = groups
.into_iter()
.map(|((from, to, layer, app), mut repeats)| {
repeats.sort_by_key(|m| m.repeat);
types::LeaderboardResults {
from,
to,
layer,
app,
repeats: repeats
.into_iter()
.map(|m| types::Repeat {
compile: matches!(
m.compile_ok,
types::ValidationOutcome::True
),
run: matches!(
m.deploy_ok,
types::ValidationOutcome::True
),
tests_passed: m.tests_passed.unwrap_or(0),
tests_total: m.num_smoke_tests.unwrap_or(0),
})
.collect(),
}
})
.collect();
let leaderboard = types::Leaderboard { solution, results };
let agent_slug = leaderboard.solution.agent.replace(' ', "-");
let model_slug = leaderboard.solution.model.replace(' ', "-");
let filename = match &leaderboard.solution.variant {
Some(v) if !v.is_empty() => {
format!("{}__{}__{}.json", agent_slug, model_slug, v.replace(' ', "-"))
}
_ => format!("{}__{}.json", agent_slug, model_slug),
};
let out_path = conversions_dir.join(filename);
let mut f = File::create(&out_path).with_context(|| {
format!("failed to create leaderboard file {}", out_path.display())
})?;
f.write_all(serde_json::to_string_pretty(&leaderboard)?.as_bytes())?;
log::info!("Wrote leaderboard to {}", out_path.display());
Ok(())
}
fn copy_validation_harness_and_run_make_test(
validations_dir: &PathBuf,
conversions_dir: &PathBuf,
timeout_in_minutes: u64,
) -> anyhow::Result<PathBuf> {
let (layer, app, framework) = read_metadata_json(conversions_dir)?;
let src = validations_dir.join(layer).join(app).join(framework);
let dst = conversions_dir.join("output");
fs::read_dir(&src)?
.filter_map(Result::ok)
.map(|e| e.path())
.filter(|p| {
p.file_name().and_then(|n| n.to_str()).is_some_and(|n| {
matches!(
n,
"Makefile"
| "makefile"
| "Dockerfile"
| "smoke.py"
| "metadata.json"
) || (n == "smoke" && p.is_dir())
})
})
.for_each(|src_path| {
let dst_path = dst.join(src_path.file_name().unwrap()); log::info!(
"Copying {} -> {}",
src_path.display(),
dst_path.display()
);
let copy_result = if src_path.is_dir() {
copy_dir_recursive(&src_path, &dst_path)
.expect("TODO: panic message");
fs::copy(
src_path
.parent()
.expect(
"Unable to access the parent dir of current smoke",
)
.join("metadata.json"),
&dst_path,
)
.map(|_| ())
.map_err(anyhow::Error::from)
} else {
fs::copy(&src_path, &dst_path)
.map(|_| ())
.map_err(anyhow::Error::from)
};
if let Err(e) = copy_result {
log::warn!(
"Failed to copy {} -> {}: {e}",
src_path.display(),
dst_path.display()
);
}
});
let timeout = Duration::from_secs(timeout_in_minutes * 60);
let log_dir = conversions_dir.join("validation");
fs::create_dir_all(&log_dir)
.with_context(|| format!("Failed to create log dir {:?}", log_dir))?;
let log_path = log_dir.join("run.log");
let log_file = File::create(&log_path)
.with_context(|| format!("Failed to create log file {:?}", log_path))?;
let log_file_err = log_file
.try_clone()
.with_context(|| "Failed to clone log file handle")?;
let mut child = Command::new("make")
.current_dir(&dst)
.args(["test"])
.stdin(Stdio::null())
.stderr(Stdio::from(log_file_err))
.stdout(Stdio::from(log_file))
.spawn()
.with_context(|| {
format!("Failed to run make tests on {:?}", conversions_dir)
})?;
match child
.wait_timeout(timeout)
.context("couldn't spawn child with wait_timeout")?
{
Some(_) => {},
None => {
child.kill()?;
child.wait()?; },
};
Ok(log_path)
}
fn parse_run_log_and_update_metadata(log_path: &Path, validations_dir: &Path) -> anyhow::Result<()> {
let log = fs::read_to_string(log_path).with_context(|| {
format!("failed to read run log at {}", log_path.display())
})?;
let metadata_path = log_path
.parent()
.unwrap()
.parent()
.context("log path has no parent directory")?
.join("metadata.json");
let mut metadata: types::Metadata = serde_json::from_reader(
File::open(&metadata_path).with_context(|| {
format!(
"failed to open metadata JSON at {}",
metadata_path.display()
)
})?,
)
.with_context(|| {
format!("failed to parse metadata JSON at {}", metadata_path.display())
})?;
let smoke_test_metadata_path = validations_dir
.join(&metadata.layer)
.join(&metadata.app)
.join(metadata.target_framework.to_string())
.join("metadata.json");
match File::open(&smoke_test_metadata_path) {
Ok(f) => {
let smoke: types::SmokeTestMetadata = serde_json::from_reader(f)
.with_context(|| {
format!(
"failed to parse smoke test metadata at {}",
smoke_test_metadata_path.display()
)
})?;
metadata.num_smoke_tests = Some(smoke.num_smoke_tests);
},
Err(_) => log::warn!(
"No smoke test metadata at {} — leaving num_smoke_tests unset",
smoke_test_metadata_path.display()
),
}
let (compile_outcome, compile_reason, compile_category) =
analyze_compile(&log);
let (deploy_outcome, deploy_reason, deploy_category) =
analyze_deploy(&log, &compile_outcome);
let (test_outcome, test_reason, test_category, inconclusive) =
analyze_tests(&log, &deploy_outcome);
metadata.compile_ok = compile_outcome;
metadata.deploy_ok = deploy_outcome;
metadata.tests_passed = test_outcome;
metadata.inconclusive = inconclusive;
let mut reasons = Vec::new();
if let Some(r) = compile_reason {
reasons.push(r);
}
if let Some(r) = deploy_reason {
reasons.push(r);
}
if let Some(r) = test_reason {
reasons.push(r);
}
if !reasons.is_empty() {
metadata.failure_reason = Some(reasons.join("; "));
}
metadata.failure_category =
test_category.or(deploy_category).or(compile_category);
let mut metadata_file =
File::create(&metadata_path).with_context(|| {
format!(
"failed to open metadata JSON for writing at {}",
metadata_path.display()
)
})?;
metadata_file
.write_all(serde_json::to_string_pretty(&metadata)?.as_bytes())?;
Ok(())
}
/// Classifies the build stage of a run log.
///
/// Returns `(outcome, reason, category)`: `True` with no reason when the
/// log shows a successful build, `False` with a reason and category when a
/// known failure marker is present, and `Unk` when nothing conclusive is
/// found. Markers are checked in a fixed order; the first hit decides.
fn analyze_compile(
    log: &str,
) -> (types::ValidationOutcome, Option<String>, Option<types::FailureCategory>)
{
    let has = |needle: &str| log.contains(needle);
    let failed = |reason: &str, category: types::FailureCategory| {
        (
            types::ValidationOutcome::False,
            Some(reason.to_string()),
            Some(category),
        )
    };

    // A Maven failure banner takes precedence over everything else; the
    // remaining markers only refine the reason.
    if has("BUILD FAILURE") {
        let (reason, category) = if has("COMPILATION ERROR")
            || has("cannot find symbol")
            || (has("package") && has("does not exist"))
        {
            (
                "Compilation errors detected in Maven build",
                types::FailureCategory::CompileError,
            )
        } else if has("NoPluginFoundForPrefixException")
            || has("No plugin found for prefix")
        {
            (
                "Maven plugin not found - wrong framework build tool used",
                types::FailureCategory::BuildConfigError,
            )
        } else if has("Could not resolve dependencies")
            || has("Failed to collect dependencies")
        {
            (
                "Maven dependency resolution failed",
                types::FailureCategory::CompileDependency,
            )
        } else {
            ("Maven build failed", types::FailureCategory::BuildFailure)
        };
        return failed(reason, category);
    }

    // Either a tagged docker image or a Maven success banner counts as built.
    if has("naming to docker.io") || has("BUILD SUCCESS") {
        return (types::ValidationOutcome::True, None, None);
    }

    // From here on the log is known not to contain "naming to docker.io".
    if has("docker build") && (has("ERROR") || has("failed to")) {
        return failed(
            "Docker build failed",
            types::FailureCategory::DockerBuildError,
        );
    }
    if has("pull access denied") {
        return failed(
            "Docker image not found (build likely failed in a shared build step)",
            types::FailureCategory::DockerImageMissing,
        );
    }
    if has("make: *** [build]") {
        return failed("Build step failed", types::FailureCategory::BuildFailure);
    }

    (
        types::ValidationOutcome::Unk,
        Some("No clear compile outcome found in logs".to_string()),
        Some(types::FailureCategory::Unknown),
    )
}
fn analyze_deploy(
log: &str,
compile_ok: &types::ValidationOutcome,
) -> (types::ValidationOutcome, Option<String>, Option<types::FailureCategory>)
{
if matches!(compile_ok, types::ValidationOutcome::False) {
return (
types::ValidationOutcome::False,
Some("Cannot deploy - compilation failed".to_string()),
Some(types::FailureCategory::CompileDependency),
);
}
if log.contains("pplication started and ready.") {
return (types::ValidationOutcome::True, None, None);
}
let test_summary_pattern =
Regex::new(r"=+ .*(?:passed|failed|error).*=+").unwrap();
if test_summary_pattern.is_match(log) {
return (types::ValidationOutcome::True, None, None);
}
if log.contains("short test summary info")
|| log.contains("PASSED")
|| log.contains("FAILED smoke.py")
{
return (types::ValidationOutcome::True, None, None);
}
if log.contains("docker run -d")
&& (log.contains("Connection refused")
|| log.contains("container exited"))
{
let reason =
"Container started but application failed to start".to_string();
return (
types::ValidationOutcome::False,
Some(reason),
Some(types::FailureCategory::AppStartupFailure),
);
}
if log.contains("pull access denied") {
let reason = "Docker image not found".to_string();
return (
types::ValidationOutcome::False,
Some(reason),
Some(types::FailureCategory::DockerImageMissing),
);
}
if log.contains("container name")
&& log.contains("already in use")
&& !log.contains("naming to docker.io")
{
let reason = "Container name conflict from previous run".to_string();
return (
types::ValidationOutcome::False,
Some(reason),
Some(types::FailureCategory::ContainerConflict),
);
}
if log.contains("make: *** [up]") {
if matches!(compile_ok, types::ValidationOutcome::True) {
let reason = "Deployment failed after successful build".to_string();
return (
types::ValidationOutcome::False,
Some(reason),
Some(types::FailureCategory::DeployFailure),
);
}
let reason = "make up failed".to_string();
return (
types::ValidationOutcome::False,
Some(reason),
Some(types::FailureCategory::DeployFailure),
);
}
if log.contains("Terminated: 15") {
let reason = "Process was terminated (SIGTERM)".to_string();
return (
types::ValidationOutcome::False,
Some(reason),
Some(types::FailureCategory::ProcessTerminated),
);
}
if log.contains("waiting for app to start...") {
let after_waiting =
log.split("waiting for app to start...").last().unwrap_or("");
if !after_waiting.contains("pplication started and ready")
&& !after_waiting.contains("PASSED")
&& !after_waiting.contains("FAILED")
&& !after_waiting.contains("===")
{
let reason = "Validation process cut short - log ends at health check wait (rerun needed)".to_string();
return (
types::ValidationOutcome::Unk,
Some(reason),
Some(types::FailureCategory::ValidationTruncated),
);
}
}
if matches!(compile_ok, types::ValidationOutcome::True) {
return (
types::ValidationOutcome::Unk,
Some(
"Compiled successfully but no deploy outcome found".to_string(),
),
Some(types::FailureCategory::Unknown),
);
}
(
types::ValidationOutcome::Unk,
Some("No clear deploy outcome found".to_string()),
Some(types::FailureCategory::Unknown),
)
}
/// Classifies the test stage of a run log.
///
/// `deploy_ok` is the outcome of the deployment stage: a failed deployment
/// short-circuits to zero passed tests. Otherwise the last summary line of
/// the form `=== ... passed/failed/error ... ===` is parsed for its counts.
/// If no usable summary exists, `make` error markers and finally the
/// deployment outcome decide the result.
///
/// Returns `(tests_passed, reason, category, inconclusive)`.
fn analyze_tests(
    log: &str,
    deploy_ok: &types::ValidationOutcome,
) -> (Option<u32>, Option<String>, Option<types::FailureCategory>, bool) {
    if let types::ValidationOutcome::False = deploy_ok {
        return (
            Some(0),
            Some("Cannot test - deployment failed".to_string()),
            Some(types::FailureCategory::DeployDependency),
            false,
        );
    }

    let summary_re =
        Regex::new(r"=+ (.*?(?:passed|failed|error).*?) =+").unwrap();
    // Only the final summary line in the log is considered.
    if let Some(caps) = summary_re.captures_iter(log).last() {
        let summary = &caps[1];
        // Extracts the number in front of a keyword; 0 when absent.
        let count_of = |pattern: &str| -> u32 {
            let re = Regex::new(pattern).unwrap();
            let count = re
                .captures(summary)
                .and_then(|c| c[1].parse::<u32>().ok())
                .unwrap_or(0);
            count
        };
        let passed = count_of(r"(\d+) passed");
        let failed = count_of(r"(\d+) failed");
        let errors = count_of(r"(\d+) error");
        let total = passed + failed + errors;

        // A summary without any counts is ignored; the checks below decide.
        if total > 0 {
            if failed == 0 && errors == 0 {
                return (Some(passed), None, None, false);
            }
            let reason = format!(
                "{failed} failed, {errors} errors, {passed} passed out of {total} tests"
            );
            return (
                Some(passed),
                Some(reason),
                Some(types::FailureCategory::TestFailures),
                false,
            );
        }
    }

    // Must be checked before "Error 1", which is a prefix of "Error 137".
    if log.contains("make: *** [test] Error 137") {
        return (
            None,
            Some(
                "Test process killed (Error 137 - likely OOM/timeout)"
                    .to_string(),
            ),
            Some(types::FailureCategory::TestTimeoutOom),
            true,
        );
    }
    if log.contains("make: *** [test] Error 1") {
        return (
            Some(0),
            Some("Test step failed with Error 1".to_string()),
            Some(types::FailureCategory::TestFailure),
            true,
        );
    }

    match deploy_ok {
        types::ValidationOutcome::True => (
            None,
            Some("App deployed but no test results found in log".to_string()),
            Some(types::FailureCategory::NoTestOutput),
            true,
        ),
        _ => (None, None, None, false),
    }
}
/// Loads `<path>/metadata.json` and returns its
/// `(layer, app, target framework)` triple, the framework rendered as a
/// string.
///
/// # Errors
///
/// Fails if the file cannot be opened or does not parse as run metadata.
fn read_metadata_json(path: &Path) -> anyhow::Result<(String, String, String)> {
    let reader = File::open(path.join("metadata.json")).with_context(|| {
        format!("failed to open metadata.json in {}", path.display())
    })?;
    let parsed: anyhow::Result<types::Metadata> = serde_json::from_reader(reader)
        .with_context(|| {
            format!("failed to parse metadata JSON from {}", path.display())
        });
    let metadata = parsed?;
    let framework = metadata.target_framework.to_string();
    Ok((metadata.layer, metadata.app, framework))
}
fn copy_dir_recursive(from: &Path, to: &Path) -> anyhow::Result<()> {
fs::create_dir_all(to)?;
for entry in fs::read_dir(from)? {
let entry = entry?;
let src_path = entry.path();
let dst_path = to.join(entry.file_name());
if src_path.is_dir() {
copy_dir_recursive(&src_path, &dst_path)?;
} else if src_path.is_file() {
fs::copy(&src_path, &dst_path)?;
}
}
Ok(())
}