collet 0.1.0

Relentless agentic coding orchestrator with zero-drop agent loops
Documentation
//! File-based benchmark adapter — loads tasks from a JSONL file.
//!
//! Task format (one JSON object per line):
//! ```json
//! {"id": "task-001", "input": "Fix the off-by-one error in search()", "metadata": {}}
//! ```
//!
//! Evaluation:
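//! - If `score_cmd` is set: runs `<cmd> <task_id> <output_path>` and reads a float from stdout
//!   (clamped to `[0.0, 1.0]`; unparseable output counts as 0.0). A score >= 0.5 is a success.
//! - Otherwise: simple heuristic on the trimmed output — empty scores 0.0, shorter than
//!   20 bytes scores 0.2, and anything longer scores 0.5.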

use std::path::PathBuf;

use anyhow::{Context, Result};
use async_trait::async_trait;
use tokio::process::Command;

use crate::evolution::trial::BenchmarkAdapter;
use crate::evolution::types::{Feedback, Task, Trajectory};

/// Loads tasks from a local JSONL file and optionally evaluates via a shell command.
///
/// Construct with [`FileBenchmark::new`] and, if an external scorer is available,
/// chain [`FileBenchmark::with_score_cmd`].
#[derive(Debug, Clone)]
pub struct FileBenchmark {
    /// Path to tasks JSONL (one `Task`-shaped JSON object per line).
    tasks_path: PathBuf,
    /// Optional external scorer: `<cmd> <task_id> <output_tmpfile>` → float on stdout.
    ///
    /// `None` selects the built-in output-length heuristic.
    score_cmd: Option<String>,
}

impl FileBenchmark {
    pub fn new(tasks_path: PathBuf) -> Self {
        Self {
            tasks_path,
            score_cmd: None,
        }
    }

    pub fn with_score_cmd(mut self, cmd: impl Into<String>) -> Self {
        self.score_cmd = Some(cmd.into());
        self
    }
}

#[async_trait]
impl BenchmarkAdapter for FileBenchmark {
    /// Loads tasks from the JSONL file at `tasks_path`.
    ///
    /// Each non-blank line is deserialized as one `Task`. `_split` is ignored.
    /// A `limit` of `0` means "no limit"; otherwise at most `limit` tasks are returned
    /// and lines after the last returned task are never parsed.
    ///
    /// # Errors
    /// Fails if the file cannot be read, or if a line that is reached does not
    /// deserialize (the error names the 1-based line number).
    async fn get_tasks(&self, _split: &str, limit: usize) -> Result<Vec<Task>> {
        // NOTE(review): synchronous read inside an async fn — presumably fine for small
        // task files; confirm before pointing this at very large inputs.
        let content = std::fs::read_to_string(&self.tasks_path)
            .with_context(|| format!("Failed to read task file: {}", self.tasks_path.display()))?;

        // `0` is the "unlimited" sentinel. `take` is lazy, so parsing stops at the cap.
        let cap = if limit == 0 { usize::MAX } else { limit };

        content
            .lines()
            .enumerate()
            .map(|(idx, line)| (idx, line.trim()))
            .filter(|(_, line)| !line.is_empty())
            .map(|(idx, line)| {
                serde_json::from_str::<Task>(line)
                    .with_context(|| format!("Failed to parse task at line {}", idx + 1))
            })
            .take(cap)
            .collect::<Result<Vec<Task>>>()
    }

    /// Scores `trajectory.output` for `task`.
    ///
    /// With a `score_cmd` configured, the output is written to a temporary file and
    /// `<cmd> <task_id> <tmp_path>` is executed (the command string is split on
    /// whitespace; no shell is involved). The scorer's stdout is parsed as a float and
    /// clamped to `[0.0, 1.0]`; output that is not a number (including `NaN`) scores
    /// `0.0`. A score of at least `0.5` counts as success.
    ///
    /// Without a `score_cmd`, a length heuristic on the trimmed output is used:
    /// empty → `0.0`, shorter than 20 bytes → `0.2`, otherwise → `0.5` (success).
    ///
    /// # Errors
    /// Fails if `score_cmd` is empty, if the temporary file cannot be written, or if
    /// the scorer process cannot be run. A non-zero scorer exit code is not an error;
    /// it is reported in `Feedback::detail`.
    async fn evaluate(&self, task: &Task, trajectory: &Trajectory) -> Result<Feedback> {
        if let Some(cmd) = &self.score_cmd {
            // Validate the command before touching the filesystem so that a
            // misconfiguration neither leaves a file behind nor runs an unrelated program.
            let parts: Vec<&str> = cmd.split_whitespace().collect();
            let (program, extra_args) = parts
                .split_first()
                .context("score command is empty")?;

            // Write agent output to a uniquely named temp file and invoke the scorer.
            let tmp_path = std::env::temp_dir().join(scratch_file_name(&task.id));
            std::fs::write(&tmp_path, &trajectory.output).with_context(|| {
                format!("Failed to write scorer input file: {}", tmp_path.display())
            })?;

            let run = Command::new(program)
                .args(extra_args)
                .arg(&task.id)
                .arg(&tmp_path)
                .output()
                .await;

            // Best-effort cleanup happens before the result is inspected, so the
            // file is removed even when the scorer could not be started.
            let _ = std::fs::remove_file(&tmp_path);

            let out = run.with_context(|| format!("Failed to run score command: {cmd}"))?;

            let stdout = String::from_utf8_lossy(&out.stdout);
            let parsed = parse_score(&stdout);
            let score = parsed.unwrap_or(0.0).clamp(0.0, 1.0);
            let exit_code = out.status.code().unwrap_or(-1);
            let detail = match parsed {
                Some(_) => format!("score_cmd exit={exit_code}, score={score:.3}"),
                None => format!(
                    "score_cmd exit={exit_code}, score={score:.3} (scorer output was not a number)"
                ),
            };

            return Ok(Feedback {
                success: score >= 0.5,
                score,
                detail,
                raw: Default::default(),
            });
        }

        // Heuristic: reward non-empty, non-trivial output.
        // NOTE: `len()` counts bytes, although the detail message says "chars".
        const MIN_SUBSTANTIAL_LEN: usize = 20;
        let trimmed = trajectory.output.trim();
        let (success, score, detail) = if trimmed.is_empty() {
            (false, 0.0, "Empty output".to_string())
        } else if trimmed.len() < MIN_SUBSTANTIAL_LEN {
            (false, 0.2, "Output too short".to_string())
        } else {
            (true, 0.5, format!("Output length: {} chars", trimmed.len()))
        };

        Ok(Feedback {
            success,
            score,
            detail,
            raw: Default::default(),
        })
    }
}

/// Builds a temp-file name for one scorer invocation.
///
/// The task id is reduced to a filesystem-safe subset (and capped in length) so it
/// cannot introduce path separators. The process id plus a per-process counter keep
/// concurrent evaluations of the same task from overwriting each other's file.
fn scratch_file_name(task_id: &str) -> String {
    static NEXT_SEQ: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let safe_id: String = task_id
        .chars()
        .take(64)
        .map(|c| {
            if c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | '.') {
                c
            } else {
                '_'
            }
        })
        .collect();
    let seq = NEXT_SEQ.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

    format!("collet_score_{safe_id}_{}_{seq}.txt", std::process::id())
}

/// Parses the scorer's stdout as a float; returns `None` for non-numeric output or `NaN`.
///
/// `NaN` is rejected explicitly because `f64::clamp` would pass it through unchanged.
fn parse_score(stdout: &str) -> Option<f64> {
    stdout
        .trim()
        .parse::<f64>()
        .ok()
        .filter(|value| !value.is_nan())
}