use anyhow::Result;
use async_trait::async_trait;
use crate::evolution::trial::BenchmarkAdapter;
use crate::evolution::types::{Feedback, Task, Trajectory};
/// Benchmark adapter backed by a single, fixed task.
pub struct SingleTaskBenchmark {
/// The one task this benchmark serves; it is cloned out on every `get_tasks` call.
task: Task,
}
impl SingleTaskBenchmark {
    /// Wraps a piece of free-form text as a one-task benchmark.
    ///
    /// The resulting task always carries the fixed id `"inline-0"`, uses the
    /// converted `text` as its input, and has default (empty-constructed)
    /// metadata.
    pub fn from_text(text: impl Into<String>) -> Self {
        let input: String = text.into();
        let task = Task {
            id: String::from("inline-0"),
            input,
            metadata: Default::default(),
        };
        Self { task }
    }
}
#[async_trait]
impl BenchmarkAdapter for SingleTaskBenchmark {
    /// Returns the single wrapped task.
    ///
    /// Both the split name and the limit are ignored: the result is always a
    /// one-element vector containing a clone of the stored task.
    async fn get_tasks(&self, _split: &str, _limit: usize) -> Result<Vec<Task>> {
        Ok(vec![self.task.clone()])
    }

    /// Scores a trajectory solely on whether it produced any output.
    ///
    /// The task itself is not consulted. After trimming surrounding
    /// whitespace from `trajectory.output`:
    ///
    /// * empty output yields `success = false`, `score = 0.0`;
    /// * anything else yields `success = true`, `score = 0.7`, with a detail
    ///   message reporting the trimmed length in characters.
    ///
    /// `raw` is always left at its default value. This implementation never
    /// returns `Err`.
    async fn evaluate(&self, _task: &Task, trajectory: &Trajectory) -> Result<Feedback> {
        let trimmed = trajectory.output.trim();

        let (success, score, detail) = if trimmed.is_empty() {
            (false, 0.0, "Agent produced no output".to_string())
        } else {
            // The message advertises "chars", so count Unicode scalar values;
            // `str::len` would report UTF-8 bytes and over-count non-ASCII text.
            let char_count = trimmed.chars().count();
            (
                true,
                0.7,
                format!("Task completed ({} chars)", char_count),
            )
        };

        Ok(Feedback {
            success,
            score,
            detail,
            raw: Default::default(),
        })
    }
}