adk_eval/lib.rs
//! # adk-eval
//!
//! Agent evaluation framework for ADK-Rust.
//!
//! This crate provides comprehensive tools for testing and validating agent behavior,
//! enabling developers to ensure their agents perform correctly and consistently.
//!
//! ## Features
//!
//! - **Test Definitions**: Structured format for defining test cases (`.test.json`)
//! - **Trajectory Evaluation**: Validate tool call sequences
//! - **Response Quality**: Assess final output quality with multiple metrics
//! - **Multiple Criteria**: Ground truth, rubric-based, and LLM-judged evaluation
//! - **Automation**: Run evaluations programmatically or via CLI
//!
//! ## Quick Start
//!
//! ```rust,ignore
//! use adk_eval::{Evaluator, EvaluationConfig, EvaluationCriteria};
//! use std::sync::Arc;
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     // Create your agent
//!     let agent = create_my_agent()?;
//!
//!     // Configure evaluator
//!     let config = EvaluationConfig {
//!         criteria: EvaluationCriteria {
//!             tool_trajectory_score: Some(1.0), // Exact tool match
//!             response_similarity: Some(0.8),   // 80% text similarity
//!             ..Default::default()
//!         },
//!         ..Default::default()
//!     };
//!
//!     let evaluator = Evaluator::new(config);
//!
//!     // Run evaluation
//!     let result = evaluator
//!         .evaluate_file(agent, "tests/my_agent.test.json")
//!         .await?;
//!
//!     assert!(result.passed, "Evaluation failed: {:?}", result.failures);
//!     Ok(())
//! }
//! ```
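//!
//! ## Inspecting Failures
//!
//! A minimal follow-up sketch, assuming `result.failures` from the example
//! above is an iterable collection of [`Failure`] values (only their `Debug`
//! output is relied on here):
//!
//! ```rust,ignore
//! if !result.passed {
//!     for failure in &result.failures {
//!         eprintln!("failed check: {failure:?}");
//!     }
//! }
//! ```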

pub mod criteria;
pub mod error;
pub mod evaluator;
pub mod llm_judge;
pub mod report;
pub mod schema;
pub mod scoring;

#[cfg(feature = "personas")]
pub mod personas;

pub mod optimizer;

// Re-exports
pub use criteria::{
    EvaluationCriteria, ResponseMatchConfig, Rubric, RubricConfig, ToolTrajectoryConfig,
};
pub use error::{EvalError, Result};
pub use evaluator::{EvaluationConfig, Evaluator};
pub use llm_judge::{
    LlmJudge, LlmJudgeConfig, RubricEvaluationResult, RubricScore, SemanticMatchResult,
};
pub use report::{EvaluationReport, EvaluationResult, Failure, TestCaseResult};
pub use schema::{EvalCase, EvalSet, IntermediateData, SessionInput, TestFile, ToolUse, Turn};
pub use scoring::{ResponseScorer, ToolTrajectoryScorer};

// Optimizer re-exports
pub use optimizer::{OptimizationResult, OptimizerConfig, PromptOptimizer};

/// Prelude for convenient imports
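///
/// A minimal usage sketch, assuming only the re-exports listed in this module
/// (the config fields shown here mirror the Quick Start example above):
///
/// ```rust,ignore
/// use adk_eval::prelude::*;
///
/// let config = EvaluationConfig {
///     criteria: EvaluationCriteria::default(),
///     ..Default::default()
/// };
/// let evaluator = Evaluator::new(config);
/// ```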
pub mod prelude {
    pub use crate::criteria::{
        EvaluationCriteria, ResponseMatchConfig, Rubric, RubricConfig, ToolTrajectoryConfig,
    };
    pub use crate::error::{EvalError, Result};
    pub use crate::evaluator::{EvaluationConfig, Evaluator};
    pub use crate::llm_judge::{
        LlmJudge, LlmJudgeConfig, RubricEvaluationResult, SemanticMatchResult,
    };
    pub use crate::report::{EvaluationReport, EvaluationResult, Failure, TestCaseResult};
    pub use crate::schema::{EvalCase, EvalSet, TestFile, ToolUse, Turn};
}