// collet 0.1.0 - Docs.rs

//! SingleTaskBenchmark — in-memory benchmark from natural-language input.
//!
//! Used when the user passes free text to `/evolve` or `collet evolve`:
//!
//! ```text
//! /evolve fix the login bug --cycles 3
//! collet evolve "add input validation" --cycles 5
//! ```
//!
//! Evaluation is heuristic: any output that is non-empty after trimming
//! whitespace scores 0.7 (task attempted); empty or whitespace-only output
//! scores 0.0.  There is no ground truth — the engine uses the trajectory to
//! decide what to improve in the workspace.

use anyhow::Result;
use async_trait::async_trait;

use crate::evolution::trial::BenchmarkAdapter;
use crate::evolution::types::{Feedback, Task, Trajectory};

/// In-memory benchmark holding a single task created from user-provided text.
///
/// The task is built once by [`SingleTaskBenchmark::from_text`]; every
/// `get_tasks` call hands out a clone of it.
pub struct SingleTaskBenchmark {
    /// The one task this benchmark serves; its `input` is the user's text.
    task: Task,
}

impl SingleTaskBenchmark {
    /// Wraps free-form `text` in a benchmark that serves exactly one task.
    ///
    /// The task always carries the fixed id `"inline-0"`, uses `text` as its
    /// input, and has default metadata.
    pub fn from_text(text: impl Into<String>) -> Self {
        let id = String::from("inline-0");
        let input: String = text.into();
        let task = Task {
            id,
            input,
            metadata: Default::default(),
        };
        Self { task }
    }
}

#[async_trait]
impl BenchmarkAdapter for SingleTaskBenchmark {
    /// Returns a one-element list containing a clone of the stored task.
    ///
    /// `_split` and `_limit` are ignored: the same single task is returned
    /// regardless of their values.
    async fn get_tasks(&self, _split: &str, _limit: usize) -> Result<Vec<Task>> {
        Ok(vec![self.task.clone()])
    }

    /// Heuristically scores a trajectory by whether it produced any output.
    ///
    /// The output is trimmed of surrounding whitespace first. If nothing
    /// remains, the feedback is a failure with score `0.0`; otherwise it is a
    /// success with score `0.7` and a detail message reporting the trimmed
    /// output's length in characters. `_task` is not consulted.
    ///
    /// # Errors
    ///
    /// This implementation never returns `Err`; the `Result` is required by
    /// the trait signature.
    async fn evaluate(&self, _task: &Task, trajectory: &Trajectory) -> Result<Feedback> {
        let trimmed = trajectory.output.trim();
        let (success, score, detail) = if trimmed.is_empty() {
            (false, 0.0, "Agent produced no output".to_string())
        } else {
            // `str::len` is a UTF-8 byte count, which over-reports non-ASCII
            // output; count Unicode scalar values so the "chars" label is true.
            let char_count = trimmed.chars().count();
            (true, 0.7, format!("Task completed ({char_count} chars)"))
        };
        Ok(Feedback {
            success,
            score,
            detail,
            raw: Default::default(),
        })
    }
}