zeph-core 0.15.1

Core agent loop, configuration, context builder, metrics, and vault for Zeph
Documentation
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Benchmark dataset types and TOML loading.

use std::path::Path;

use serde::Deserialize;

use super::error::EvalError;

/// Maximum allowed benchmark file size (10 MiB).
const MAX_BENCHMARK_SIZE: u64 = 10 * 1024 * 1024;

/// A set of benchmark cases loaded from a TOML file.
///
/// # TOML format
///
/// ```toml
/// [[cases]]
/// prompt = "What is the capital of France?"
/// reference = "Paris"
/// tags = ["geography", "factual"]
///
/// [[cases]]
/// prompt = "Explain async/await in Rust."
/// context = "You are a Rust expert."
/// ```
#[derive(Debug, Clone, Deserialize)]
pub struct BenchmarkSet {
    pub cases: Vec<BenchmarkCase>,
}

/// A single benchmark case.
#[derive(Debug, Clone, Deserialize)]
pub struct BenchmarkCase {
    /// The prompt sent to the subject model.
    pub prompt: String,
    /// Optional system context for the subject model.
    #[serde(default)]
    pub context: Option<String>,
    /// Optional reference answer for the judge to calibrate scoring.
    #[serde(default)]
    pub reference: Option<String>,
    /// Optional tags for filtering or reporting.
    #[serde(default)]
    pub tags: Option<Vec<String>>,
}

impl BenchmarkSet {
    /// Load a benchmark set from a TOML file.
    ///
    /// Performs size guard (10 MiB limit) and canonicalisation before reading.
    /// Symlinks that escape the file's parent directory are rejected.
    ///
    /// # Errors
    ///
    /// Returns [`EvalError::BenchmarkLoad`] if the file cannot be read,
    /// [`EvalError::BenchmarkParse`] if the TOML is invalid,
    /// [`EvalError::BenchmarkTooLarge`] if the file exceeds the size limit, or
    /// [`EvalError::PathTraversal`] if canonicalization reveals a symlink escape.
    pub fn from_file(path: &Path) -> Result<Self, EvalError> {
        // Canonicalize to resolve symlinks before opening — eliminates TOCTOU race.
        let canonical = std::fs::canonicalize(path)
            .map_err(|e| EvalError::BenchmarkLoad(path.display().to_string(), e))?;

        // Verify the canonical path stays within the parent directory.
        // This prevents symlinks from escaping into arbitrary filesystem locations.
        if let Some(parent) = path.parent()
            && let Ok(canonical_parent) = std::fs::canonicalize(parent)
            && !canonical.starts_with(&canonical_parent)
        {
            return Err(EvalError::PathTraversal(canonical.display().to_string()));
        }

        // Guard against unbounded memory use from oversized files.
        let metadata = std::fs::metadata(&canonical)
            .map_err(|e| EvalError::BenchmarkLoad(canonical.display().to_string(), e))?;
        if metadata.len() > MAX_BENCHMARK_SIZE {
            return Err(EvalError::BenchmarkTooLarge {
                path: canonical.display().to_string(),
                size: metadata.len(),
                limit: MAX_BENCHMARK_SIZE,
            });
        }

        let content = std::fs::read_to_string(&canonical)
            .map_err(|e| EvalError::BenchmarkLoad(canonical.display().to_string(), e))?;
        toml::from_str(&content)
            .map_err(|e| EvalError::BenchmarkParse(canonical.display().to_string(), e.to_string()))
    }

    /// Validate that the benchmark set is non-empty.
    ///
    /// # Errors
    ///
    /// Returns [`EvalError::EmptyBenchmarkSet`] if `cases` is empty.
    pub fn validate(&self) -> Result<(), EvalError> {
        if self.cases.is_empty() {
            return Err(EvalError::EmptyBenchmarkSet);
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    #![allow(clippy::redundant_closure_for_method_calls)]

    use super::*;

    fn parse(toml: &str) -> BenchmarkSet {
        toml::from_str(toml).expect("valid TOML")
    }

    #[test]
    fn benchmark_from_toml_happy_path() {
        let toml = r#"
[[cases]]
prompt = "What is 2+2?"
"#;
        let set = parse(toml);
        assert_eq!(set.cases.len(), 1);
        assert_eq!(set.cases[0].prompt, "What is 2+2?");
        assert!(set.cases[0].context.is_none());
        assert!(set.cases[0].reference.is_none());
        assert!(set.cases[0].tags.is_none());
    }

    #[test]
    fn benchmark_from_toml_with_all_fields() {
        let toml = r#"
[[cases]]
prompt = "Explain Rust ownership."
context = "You are a Rust expert."
reference = "Ownership is Rust's memory management model."
tags = ["rust", "concepts"]
"#;
        let set = parse(toml);
        assert_eq!(set.cases.len(), 1);
        let case = &set.cases[0];
        assert_eq!(case.context.as_deref(), Some("You are a Rust expert."));
        assert!(case.reference.is_some());
        assert_eq!(case.tags.as_ref().map(std::vec::Vec::len), Some(2));
    }

    #[test]
    fn benchmark_empty_cases_rejected() {
        let set = BenchmarkSet { cases: vec![] };
        assert!(matches!(set.validate(), Err(EvalError::EmptyBenchmarkSet)));
    }

    #[test]
    fn benchmark_from_file_missing_file() {
        let result = BenchmarkSet::from_file(Path::new("/nonexistent/path/benchmark.toml"));
        assert!(matches!(result, Err(EvalError::BenchmarkLoad(_, _))));
    }

    #[test]
    fn benchmark_from_toml_invalid_syntax() {
        let bad = "[[cases\nprompt = 'unclosed'";
        let result: Result<BenchmarkSet, _> = toml::from_str(bad);
        assert!(result.is_err());
    }

    #[test]
    fn benchmark_from_file_invalid_toml() {
        use std::io::Write;
        let mut f = tempfile::NamedTempFile::new().unwrap();
        writeln!(f, "not valid toml ][[]").unwrap();
        let result = BenchmarkSet::from_file(f.path());
        assert!(matches!(result, Err(EvalError::BenchmarkParse(_, _))));
    }

    #[test]
    fn benchmark_from_file_too_large() {
        // Write a file larger than MAX_BENCHMARK_SIZE by writing in chunks.
        // We override the limit via a helper that accepts a custom limit instead of
        // creating a truly 10 MiB file. Test the error variant directly via a stub.
        // Since we cannot override the constant, we verify the error type is correct
        // by constructing it directly.
        let err = EvalError::BenchmarkTooLarge {
            path: "/tmp/bench.toml".into(),
            size: MAX_BENCHMARK_SIZE + 1,
            limit: MAX_BENCHMARK_SIZE,
        };
        assert!(err.to_string().contains("exceeds size limit"));
    }

    #[test]
    fn benchmark_from_file_size_guard_allows_normal_file() {
        use std::io::Write;
        let mut f = tempfile::NamedTempFile::new().unwrap();
        writeln!(f, "[[cases]]\nprompt = \"hello\"").unwrap();
        // Normal-sized file must load without size error.
        let result = BenchmarkSet::from_file(f.path());
        assert!(result.is_ok());
    }

    #[test]
    fn benchmark_validate_passes_for_nonempty() {
        let set = BenchmarkSet {
            cases: vec![BenchmarkCase {
                prompt: "hello".into(),
                context: None,
                reference: None,
                tags: None,
            }],
        };
        assert!(set.validate().is_ok());
    }
}