Skip to main content

Crate serdes_ai_evals

Source
Expand description

§serdes-ai-evals

Evaluation framework for testing and benchmarking serdes-ai agents.

This crate provides tools for systematically evaluating agent performance across a set of test cases and for measuring accuracy, latency, and cost.

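§Core Concepts

The main building blocks are re-exported at the crate root (see the Re-exports and Modules sections):

- `EvalCase` (module `case`): evaluation case definitions.
- `EvalSuite` (module `suite`): evaluation suite definitions.
- `Evaluator` (module `evaluator`): evaluator traits and implementations.
- `EvalRunner` (module `runner`): the evaluation runner.
- `EvaluationReport` (module `report`): evaluation reports and summaries.
- `EvalMetrics` (module `metrics`): evaluation metrics and statistics.
- `Dataset` (module `dataset`): dataset management for evaluation cases.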

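§Built-in Evaluators

The `scorers` module provides common evaluator implementations, all re-exported at the crate root: `ExactMatchScorer`, `ContainsScorer`, `NotContainsScorer`, `RegexScorer`, `LengthScorer`, `FunctionScorer`, `LlmJudgeScorer`, `AlwaysPassScorer`, and `AlwaysFailScorer`.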

§Example

use serdes_ai_evals::{EvalRunner, EvalCase, EvalSuite, ExactMatchScorer, ContainsScorer};

let suite = EvalSuite::new("weather_agent_tests")
    .add_case(EvalCase::new()
        .input("What's the weather in NYC?")
        .expected_contains("New York")
        .expected_contains("temperature"))
    .add_case(EvalCase::new()
        .input("Weather in London")
        .expected_contains("London"));

let runner = EvalRunner::new()
    .evaluator(ContainsScorer::new("weather"));

// In real use, the runner would now be run against an actual agent.

§Quick Evaluation

use serdes_ai_evals::quick_eval;

let report = quick_eval(
    vec![
        ("What is 2+2?", Some("4")),
        ("What is 3+3?", Some("6")),
    ],
    |input| async move { calculate(input) },
).await?;

println!("Pass rate: {:.1}%", report.summary.pass_rate * 100.0);

Re-exports§

pub use case::Case;
pub use case::EvalCase;
pub use case::Expected;
pub use dataset::Dataset;
pub use dataset::DatasetBuilder;
pub use error::EvalError;
pub use error::EvalResult;
pub use evaluator::BoxedEvaluator;
pub use evaluator::EvaluationResult;
pub use evaluator::Evaluator;
pub use evaluator::EvaluatorContext;
pub use evaluator::EvaluatorSet;
pub use evaluator::NamedEvaluationResult;
pub use evaluator::TypedEvaluator;
pub use metrics::AggregateMetrics;
pub use metrics::EvalMetrics;
pub use metrics::TokenUsage;
pub use report::CaseResult;
pub use report::EvaluationReport;
pub use report::EvaluatorStats;
pub use report::ReportSummary;
pub use result::EvalResult as LegacyEvalResult;
pub use runner::quick_eval;
pub use runner::EvalOptions;
pub use runner::EvalRunner;
pub use scorers::AlwaysFailScorer;
pub use scorers::AlwaysPassScorer;
pub use scorers::ContainsScorer;
pub use scorers::ExactMatchScorer;
pub use scorers::FunctionScorer;
pub use scorers::LengthScorer;
pub use scorers::LlmJudgeScorer;
pub use scorers::NotContainsScorer;
pub use scorers::RegexScorer;
pub use scorers::Scorer;
pub use suite::EvalSuite;

Modules§

case
Evaluation case definitions.
dataset
Dataset management for evaluation cases.
error
Evaluation errors.
evaluator
Evaluator traits and implementations.
metrics
Evaluation metrics and statistics.
prelude
Prelude for common imports.
report
Evaluation reports and summaries.
result
Evaluation result types.
runner
Evaluation runner.
scorers
Common evaluator implementations (scorers).
suite
Evaluation suite definitions.