Expand description
§adk-eval
Agent evaluation framework for ADK-Rust.
This crate provides comprehensive tools for testing and validating agent behavior, enabling developers to ensure their agents perform correctly and consistently.
§Features
- Test Definitions: Structured format for defining test cases (`.test.json`)
- Trajectory Evaluation: Validate tool call sequences
- Response Quality: Assess final output quality with multiple metrics
- Multiple Criteria: Ground truth, rubric-based, and LLM-judged evaluation
- Automation: Run evaluations programmatically or via CLI
§Quick Start
ⓘ
use adk_eval::{Evaluator, EvaluationConfig, EvaluationCriteria};
use std::sync::Arc;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Create your agent
let agent = create_my_agent()?;
// Configure evaluator
let config = EvaluationConfig {
criteria: EvaluationCriteria {
tool_trajectory_score: Some(1.0), // Exact tool match
response_similarity: Some(0.8), // 80% text similarity
..Default::default()
},
..Default::default()
};
let evaluator = Evaluator::new(config);
// Run evaluation
let result = evaluator
.evaluate_file(agent, "tests/my_agent.test.json")
.await?;
assert!(result.passed, "Evaluation failed: {:?}", result.failures);
Ok(())
}
Re-exports§
pub use criteria::EvaluationCriteria;
pub use criteria::ResponseMatchConfig;
pub use criteria::Rubric;
pub use criteria::RubricConfig;
pub use criteria::ToolTrajectoryConfig;
pub use error::EvalError;
pub use error::Result;
pub use evaluator::EvaluationConfig;
pub use evaluator::Evaluator;
pub use llm_judge::LlmJudge;
pub use llm_judge::LlmJudgeConfig;
pub use llm_judge::RubricEvaluationResult;
pub use llm_judge::RubricScore;
pub use llm_judge::SemanticMatchResult;
pub use report::EvaluationReport;
pub use report::EvaluationResult;
pub use report::Failure;
pub use report::TestCaseResult;
pub use schema::EvalCase;
pub use schema::EvalSet;
pub use schema::IntermediateData;
pub use schema::SessionInput;
pub use schema::TestFile;
pub use schema::ToolUse;
pub use schema::Turn;
pub use scoring::ResponseScorer;
pub use scoring::ToolTrajectoryScorer;
pub use optimizer::OptimizationResult;
pub use optimizer::OptimizerConfig;
pub use optimizer::PromptOptimizer;
pub use annotation::AnnotationRecord;
pub use annotation::AnnotationStore;
pub use annotation::HumanVerdict;
pub use baseline::Baseline;
pub use baseline::BaselineStore;
pub use baseline::Regression;
pub use conversation_scorer::ConversationMetrics;
pub use conversation_scorer::ConversationScorer;
pub use conversation_scorer::ConversationScorerConfig;
pub use cost_tracker::CostMetrics;
pub use cost_tracker::CostTracker;
pub use pricing::ModelPricing;
pub use structured_judge::JudgeRubric;
pub use structured_judge::ScalePoint;
pub use structured_judge::StructuredJudge;
pub use structured_judge::StructuredJudgeConfig;
pub use structured_judge::StructuredVerdict;
pub use structured_judge::Verdict;
pub use test_generator::EvalCaseMetadata;
pub use test_generator::GeneratorConfig;
pub use test_generator::TestGenerator;
pub use trace_analyzer::ToolCallRecord;
pub use trace_analyzer::TraceAnalysis;
pub use trace_analyzer::TraceAnalyzer;
pub use trace_analyzer::TraceDiagnostic;
pub use trace_analyzer::TracePattern;
Modules§
- annotation
- Human annotation workflow via JSONL export/import.
- baseline
- Baseline storage for regression detection.
- conversation_scorer
- Multi-turn conversation metrics.
- cost_tracker
- Cost and latency tracking for evaluation runs.
- criteria
- Evaluation criteria definitions
- error
- Error types for the evaluation framework
- evaluator
- Core evaluator implementation
- llm_judge
- LLM-based evaluation scoring
- optimizer
- Prompt optimization engine.
- prelude
- Prelude for convenient imports
- pricing
- Per-model pricing configuration for cost estimation.
- report
- Evaluation result reporting
- schema
- Test file schema definitions
- scoring
- Scoring implementations for evaluation criteria
- structured_judge
- Structured LLM judge producing typed verdicts.
- test_generator
- LLM-driven test case generation.
- trace_analyzer
- Execution trace analysis for detecting inefficiencies.