// serdes_ai_evals/lib.rs
//! # serdes-ai-evals
//!
//! Evaluation framework for testing and benchmarking serdes-ai agents.
//!
//! This crate provides tools for systematically evaluating agent performance
//! across test cases, measuring accuracy, latency, and cost.
//!
//! ## Core Concepts
//!
//! - **[`Case`] / [`EvalCase`]**: Individual test cases with inputs and expected outputs
//! - **[`Dataset`] / [`EvalSuite`]**: Collections of test cases
//! - **[`Evaluator`]**: Trait for implementing custom evaluators
//! - **[`EvalRunner`]**: Runs evaluations and collects results
//! - **[`EvaluationReport`]**: Detailed results with statistics
//!
//! ## Built-in Evaluators
//!
//! - **[`ExactMatchScorer`]**: Output must match expected exactly
//! - **[`ContainsScorer`]**: Output must contain expected substring
//! - **[`RegexScorer`]**: Output must match regex pattern
//! - **[`LengthScorer`]**: Output must meet length constraints
//! - **[`FunctionScorer`]**: Custom evaluation function
//!
//! ## Example
//!
//! ```ignore
//! use serdes_ai_evals::{EvalRunner, EvalCase, EvalSuite, ExactMatchScorer, ContainsScorer};
//!
//! let suite = EvalSuite::new("weather_agent_tests")
//!     .add_case(EvalCase::new()
//!         .input("What's the weather in NYC?")
//!         .expected_contains("New York")
//!         .expected_contains("temperature"))
//!     .add_case(EvalCase::new()
//!         .input("Weather in London")
//!         .expected_contains("London"));
//!
//! let runner = EvalRunner::new()
//!     .evaluator(ContainsScorer::new("weather"));
//!
//! // Would run with actual agent
//! ```
//!
//! ## Quick Evaluation
//!
//! ```ignore
//! use serdes_ai_evals::quick_eval;
//!
//! let report = quick_eval(
//!     vec![
//!         ("What is 2+2?", Some("4")),
//!         ("What is 3+3?", Some("6")),
//!     ],
//!     |input| async move { calculate(input) },
//! ).await?;
//!
//! println!("Pass rate: {:.1}%", report.summary.pass_rate * 100.0);
//! ```

// Crate-wide lints: every public item must be documented, and `unsafe`
// code is rejected at compile time.
#![warn(missing_docs)]
#![deny(unsafe_code)]

// One module per concern of the evaluation pipeline.
pub mod case;      // individual test cases (`Case`, `EvalCase`, `Expected`)
pub mod dataset;   // collections of cases (`Dataset`, `DatasetBuilder`)
pub mod error;     // error types (`EvalError`, `EvalResult`)
pub mod evaluator; // the `Evaluator` trait and its combinators
pub mod metrics;   // metrics types (`EvalMetrics`, `TokenUsage`, aggregates)
pub mod report;    // evaluation reports and summaries
pub mod result;    // legacy result type (re-exported as `LegacyEvalResult`)
pub mod runner;    // `EvalRunner`, `EvalOptions`, and `quick_eval`
pub mod scorers;   // built-in `Scorer` implementations
pub mod suite;     // `EvalSuite` collections of cases

// Re-exports: flatten the public API at the crate root so users can
// `use serdes_ai_evals::EvalRunner` without naming submodules.
pub use case::{Case, EvalCase, Expected};
pub use dataset::{Dataset, DatasetBuilder};
pub use error::{EvalError, EvalResult};
pub use evaluator::{
    BoxedEvaluator, EvaluationResult, Evaluator, EvaluatorContext, EvaluatorSet,
    NamedEvaluationResult, TypedEvaluator,
};
pub use metrics::{AggregateMetrics, EvalMetrics, TokenUsage};
pub use report::{CaseResult, EvaluationReport, EvaluatorStats, ReportSummary};
// Renamed on re-export to avoid colliding with `error::EvalResult` above.
pub use result::EvalResult as LegacyEvalResult;
pub use runner::{quick_eval, EvalOptions, EvalRunner};
pub use scorers::{
    AlwaysFailScorer, AlwaysPassScorer, ContainsScorer, ExactMatchScorer, FunctionScorer,
    LengthScorer, LlmJudgeScorer, NotContainsScorer, RegexScorer, Scorer,
};
pub use suite::EvalSuite;
91
92/// Prelude for common imports.
93pub mod prelude {
94 pub use crate::{
95 quick_eval, Case, ContainsScorer, Dataset, EvalCase, EvalOptions, EvalRunner, EvalSuite,
96 EvaluationReport, EvaluationResult, Evaluator, ExactMatchScorer,
97 };
98}
99
#[cfg(test)]
mod tests {
    use super::*;

    // The prelude alone should be enough to construct a `Case`.
    #[test]
    fn test_prelude_imports() {
        use crate::prelude::*;

        let case: Case<String> = Case::new("test".to_string());
        assert_eq!(case.inputs, "test");
    }

    // An echo task under exact-match scoring: one case matches its
    // expectation, the other does not, so exactly one should pass.
    #[tokio::test]
    async fn test_basic_evaluation() {
        let runner = EvalRunner::new().evaluator(ExactMatchScorer::new());

        let fixtures = vec![
            ("a".to_string(), Some("a".to_string())),
            ("b".to_string(), Some("c".to_string())),
        ];

        let report = runner
            .run_simple(&fixtures, |input| {
                let echoed = input.to_string();
                async move { echoed }
            })
            .await
            .unwrap();

        assert_eq!(report.summary.total_cases, 2);
        assert_eq!(report.summary.passed, 1);
    }
}
132}