use std::path::PathBuf;
use anyhow::{Context, Result};
use async_trait::async_trait;
use tokio::process::Command;
use crate::evolution::trial::BenchmarkAdapter;
use crate::evolution::types::{Feedback, Task, Trajectory};
/// Benchmark adapter whose tasks come from a file on disk and whose scoring
/// can optionally be delegated to an external command.
///
/// Construct with [`FileBenchmark::new`] and, if desired, attach a scorer with
/// [`FileBenchmark::with_score_cmd`].
#[derive(Debug, Clone)]
pub struct FileBenchmark {
    /// Path of the task file; each non-blank line is parsed as one JSON task.
    tasks_path: PathBuf,
    /// Optional scoring command line (program followed by whitespace-separated
    /// arguments). When `None`, a built-in output-length heuristic is used.
    score_cmd: Option<String>,
}
impl FileBenchmark {
    /// Creates a benchmark that reads its tasks from `tasks_path`.
    ///
    /// No score command is configured initially; attach one with
    /// [`FileBenchmark::with_score_cmd`].
    pub fn new(tasks_path: PathBuf) -> Self {
        let score_cmd = None;
        Self {
            tasks_path,
            score_cmd,
        }
    }

    /// Builder-style setter: returns the benchmark with `cmd` stored as the
    /// score command, replacing any previously configured one.
    pub fn with_score_cmd(self, cmd: impl Into<String>) -> Self {
        Self {
            score_cmd: Some(cmd.into()),
            ..self
        }
    }
}
#[async_trait]
impl BenchmarkAdapter for FileBenchmark {
    /// Loads tasks from the file at `tasks_path`, one JSON-encoded task per line.
    ///
    /// Lines are trimmed and blank lines are skipped. `_split` is ignored.
    /// A `limit` of `0` means "no limit"; otherwise reading stops as soon as
    /// `limit` tasks have been parsed, and later lines are never parsed.
    ///
    /// # Errors
    /// Fails if the file cannot be read, or if a non-blank line within the
    /// limit is not a valid task; the error names the 1-based line number.
    async fn get_tasks(&self, _split: &str, limit: usize) -> Result<Vec<Task>> {
        let content = std::fs::read_to_string(&self.tasks_path)
            .with_context(|| format!("Failed to read task file: {}", self.tasks_path.display()))?;

        // `0` is the "unlimited" sentinel used by callers.
        let max_tasks = if limit == 0 { usize::MAX } else { limit };

        content
            .lines()
            .enumerate()
            .map(|(i, line)| (i, line.trim()))
            .filter(|(_, line)| !line.is_empty())
            // Apply the cap before parsing so lines past the limit cannot raise errors.
            .take(max_tasks)
            .map(|(i, line)| {
                serde_json::from_str::<Task>(line)
                    .with_context(|| format!("Failed to parse task at line {}", i + 1))
            })
            .collect::<Result<Vec<Task>>>()
    }

    /// Scores `trajectory` for `task`.
    ///
    /// With a score command configured, the trajectory output is written to a
    /// temporary file and the command is run as
    /// `<program> <args...> <task id> <temp file>`. The command line is split on
    /// whitespace only (no shell quoting). The trimmed stdout is parsed as a
    /// float; unparsable or NaN output counts as `0.0`, and the value is clamped
    /// to `[0.0, 1.0]`. A score of at least `0.5` is a success. The exit status
    /// is reported in `detail` but does not otherwise affect the score.
    ///
    /// Without a score command, a length heuristic on the trimmed output is
    /// used: empty -> `0.0`, shorter than 20 bytes -> `0.2`, otherwise `0.5`.
    ///
    /// # Errors
    /// Fails if the temporary file cannot be written or the command cannot be
    /// started. The temporary file is removed in either case.
    async fn evaluate(&self, task: &Task, trajectory: &Trajectory) -> Result<Feedback> {
        if let Some(cmd) = &self.score_cmd {
            // Process-wide counter: together with the pid it keeps concurrent
            // evaluations (even of the same task) from sharing a temp file.
            static NEXT_TMP_ID: std::sync::atomic::AtomicUsize =
                std::sync::atomic::AtomicUsize::new(0);
            let unique = NEXT_TMP_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

            // Keep only characters that are safe in a file name on every platform.
            let safe_id: String = task
                .id
                .chars()
                .map(|c| {
                    if c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | '.') {
                        c
                    } else {
                        '_'
                    }
                })
                .collect();
            let tmp_path = std::env::temp_dir().join(format!(
                "collet_score_{}_{}_{}.txt",
                safe_id,
                std::process::id(),
                unique
            ));
            std::fs::write(&tmp_path, &trajectory.output).with_context(|| {
                format!("Failed to write score input file: {}", tmp_path.display())
            })?;

            let mut parts = cmd.split_whitespace();
            // An empty command line falls back to `sh`, as before.
            let program = parts.next().unwrap_or("sh");
            let run = Command::new(program)
                .args(parts)
                .arg(&task.id)
                .arg(&tmp_path)
                .output()
                .await;

            // Best-effort cleanup happens before the error is propagated, so a
            // command that fails to start no longer leaks the temp file.
            let _ = std::fs::remove_file(&tmp_path);
            let out = run.with_context(|| format!("Failed to run score command: {cmd}"))?;

            let stdout = String::from_utf8_lossy(&out.stdout);
            let raw_score: f64 = stdout.trim().parse().unwrap_or(0.0);
            // `f64::clamp` passes NaN through, so map it to 0.0 explicitly.
            let score = if raw_score.is_nan() {
                0.0
            } else {
                raw_score.clamp(0.0, 1.0)
            };
            let success = score >= 0.5;
            let detail = format!(
                "score_cmd exit={}, score={score:.3}",
                out.status.code().unwrap_or(-1)
            );
            return Ok(Feedback {
                success,
                score,
                detail,
                raw: Default::default(),
            });
        }

        // Fallback heuristic; note that `len()` counts bytes, not characters.
        let trimmed = trajectory.output.trim();
        let (success, score, detail) = if trimmed.is_empty() {
            (false, 0.0, "Empty output".to_string())
        } else if trimmed.len() < 20 {
            (false, 0.2, "Output too short".to_string())
        } else {
            (true, 0.5, format!("Output length: {} chars", trimmed.len()))
        };
        Ok(Feedback {
            success,
            score,
            detail,
            raw: Default::default(),
        })
    }
}