// collet 0.1.0 - Docs.rs

//! Benchmark adapters for the evolution system.
//!
//! Each adapter implements [`BenchmarkAdapter`] and provides:
//! - `get_tasks()` — return a batch of evaluation tasks
//! - `evaluate()` — score an agent's trajectory
//!
//! # Available adapters
//!
//! | Adapter | Quality | Setup |
//! |---------|---------|-------|
//! | [`NullBenchmark`] | — | none (placeholder) |
//! | [`FileBenchmark`] | medium | local JSONL file |
//! | [`SweBenchAdapter`] | high | `pip install swebench` |

pub mod file;
pub mod swebench;

pub use file::FileBenchmark;
pub use swebench::SweBenchAdapter;

// ---------------------------------------------------------------------------
// NullBenchmark — moved here from evolve.rs so it lives with its siblings
// ---------------------------------------------------------------------------

use anyhow::Result;
use async_trait::async_trait;

use crate::evolution::trial::BenchmarkAdapter;
use crate::evolution::types::{Feedback, Task, Trajectory};

pub use self::single::SingleTaskBenchmark;
mod single;

/// No-op benchmark: supplies an empty task list and scores every evaluation
/// as an unsuccessful `0.0`.
///
/// Useful for smoke-testing the evolution loop without a real dataset.
///
/// The type is a stateless unit struct, so it derives the cheap standard
/// traits (`Debug`, `Clone`, `Copy`, `Default`) to stay usable in logs,
/// generic code, and `Default`-built configurations.
#[derive(Debug, Clone, Copy, Default)]
pub struct NullBenchmark;

#[async_trait]
impl BenchmarkAdapter for NullBenchmark {
    /// Returns an empty task list; `_split` and `_limit` are ignored.
    async fn get_tasks(&self, _split: &str, _limit: usize) -> Result<Vec<Task>> {
        let no_tasks: Vec<Task> = Vec::new();
        Ok(no_tasks)
    }

    /// Returns an unsuccessful, zero-score [`Feedback`] whose detail message
    /// names the task id. The trajectory is not inspected.
    async fn evaluate(&self, task: &Task, _trajectory: &Trajectory) -> Result<Feedback> {
        let detail = format!("No benchmark configured for task '{}'", task.id);
        let placeholder = Feedback {
            success: false,
            score: 0.0,
            detail,
            raw: Default::default(),
        };
        Ok(placeholder)
    }
}