use serde::{Deserialize, Serialize};
/// Collection of optional evaluation criteria.
///
/// Each criterion is an optional `f64` value; some criteria also have an optional
/// configuration struct alongside. Every field carries `#[serde(default)]`, and the
/// struct derives `Default`, so a value with nothing set has all options `None`
/// and an empty `custom` list.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EvaluationCriteria {
    /// Threshold for the tool-trajectory criterion (set by `exact_tools` and
    /// `with_tool_trajectory`); `None` when the criterion is not set.
    #[serde(default)]
    pub tool_trajectory_score: Option<f64>,
    /// Optional settings for the tool-trajectory criterion.
    #[serde(default)]
    pub tool_trajectory_config: Option<ToolTrajectoryConfig>,
    /// Threshold for the response-similarity criterion; `None` when not set.
    #[serde(default)]
    pub response_similarity: Option<f64>,
    /// Optional settings for response matching (algorithm and text flags).
    #[serde(default)]
    pub response_match_config: Option<ResponseMatchConfig>,
    /// Threshold for the semantic-match criterion; `None` when not set.
    #[serde(default)]
    pub semantic_match_score: Option<f64>,
    /// Optional settings for semantic matching (judge model, custom prompt).
    #[serde(default)]
    pub semantic_match_config: Option<SemanticMatchConfig>,
    /// Threshold for the rubric-quality criterion; `with_rubrics` sets it together
    /// with `rubric_config`.
    #[serde(default)]
    pub rubric_quality_score: Option<f64>,
    /// Optional rubric list used by the rubric-quality criterion.
    #[serde(default)]
    pub rubric_config: Option<RubricConfig>,
    /// Optional safety score value.
    /// NOTE(review): presumably a threshold like the other `*_score` fields — confirm.
    #[serde(default)]
    pub safety_score: Option<f64>,
    /// Optional hallucination score value.
    /// NOTE(review): presumably a threshold like the other `*_score` fields — confirm.
    #[serde(default)]
    pub hallucination_score: Option<f64>,
    /// Additional caller-defined criteria; empty by default.
    #[serde(default)]
    pub custom: Vec<CustomCriterion>,
}
impl EvaluationCriteria {
    /// Creates criteria with a tool-trajectory threshold of `1.0` and a
    /// trajectory config in which both `strict_order` and `strict_args` are `true`.
    ///
    /// All other fields keep their `Default` values.
    #[must_use]
    pub fn exact_tools() -> Self {
        Self {
            tool_trajectory_score: Some(1.0),
            tool_trajectory_config: Some(ToolTrajectoryConfig {
                strict_order: true,
                strict_args: true,
            }),
            ..Default::default()
        }
    }

    /// Creates criteria with only the semantic-match threshold set to `threshold`.
    #[must_use]
    pub fn semantic_match(threshold: f64) -> Self {
        Self {
            semantic_match_score: Some(threshold),
            ..Default::default()
        }
    }

    /// Creates criteria with only the response-similarity threshold set to `threshold`.
    #[must_use]
    pub fn response_similarity(threshold: f64) -> Self {
        Self {
            response_similarity: Some(threshold),
            ..Default::default()
        }
    }

    /// Returns `self` with the tool-trajectory threshold replaced by `threshold`.
    ///
    /// `tool_trajectory_config` is left untouched.
    #[must_use]
    pub fn with_tool_trajectory(mut self, threshold: f64) -> Self {
        self.tool_trajectory_score = Some(threshold);
        self
    }

    /// Returns `self` with the response-similarity threshold replaced by `threshold`.
    #[must_use]
    pub fn with_response_similarity(mut self, threshold: f64) -> Self {
        self.response_similarity = Some(threshold);
        self
    }

    /// Returns `self` with the semantic-match threshold replaced by `threshold`.
    #[must_use]
    pub fn with_semantic_match(mut self, threshold: f64) -> Self {
        self.semantic_match_score = Some(threshold);
        self
    }

    /// Returns `self` with the rubric-quality threshold set to `threshold` and
    /// `rubric_config` replaced by a config holding `rubrics`.
    #[must_use]
    pub fn with_rubrics(mut self, threshold: f64, rubrics: Vec<Rubric>) -> Self {
        self.rubric_quality_score = Some(threshold);
        self.rubric_config = Some(RubricConfig { rubrics });
        self
    }

    /// Reports whether at least one criterion is set.
    ///
    /// Only the score fields and the `custom` list are inspected; a `*_config`
    /// field that is set without its score does not count.
    #[must_use]
    pub fn has_criteria(&self) -> bool {
        self.tool_trajectory_score.is_some()
            || self.response_similarity.is_some()
            || self.semantic_match_score.is_some()
            || self.rubric_quality_score.is_some()
            || self.safety_score.is_some()
            || self.hallucination_score.is_some()
            || !self.custom.is_empty()
    }
}
/// Settings for the tool-trajectory criterion.
///
/// When a field is absent from deserialized input, `strict_order` falls back to
/// `true` (via `default_true`) and `strict_args` to `false`, the same values the
/// `Default` impl produces.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolTrajectoryConfig {
    /// NOTE(review): presumably requires tool calls to occur in the expected
    /// order — confirm against the evaluator that reads this flag.
    #[serde(default = "default_true")]
    pub strict_order: bool,
    /// NOTE(review): presumably requires tool-call arguments to match exactly —
    /// confirm against the evaluator that reads this flag.
    #[serde(default)]
    pub strict_args: bool,
}
impl Default for ToolTrajectoryConfig {
    /// Builds the default config: `strict_order` enabled, `strict_args` disabled.
    fn default() -> Self {
        let strict_order = true;
        let strict_args = false;
        Self {
            strict_order,
            strict_args,
        }
    }
}
/// Settings for response matching: which similarity algorithm to use and which
/// text-handling flags are enabled.
///
/// Fields absent from deserialized input fall back to the same values the
/// `Default` impl produces.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResponseMatchConfig {
    /// Similarity algorithm; falls back to `SimilarityAlgorithm::default()`.
    #[serde(default)]
    pub algorithm: SimilarityAlgorithm,
    /// Normalization flag; falls back to `true`.
    /// NOTE(review): what "normalize" covers is decided by the consumer — confirm.
    #[serde(default = "default_true")]
    pub normalize: bool,
    /// Case-insensitivity flag; falls back to `true`.
    #[serde(default = "default_true")]
    pub ignore_case: bool,
    /// Punctuation-insensitivity flag; falls back to `false`.
    #[serde(default)]
    pub ignore_punctuation: bool,
}
impl Default for ResponseMatchConfig {
    /// Builds the default config: the default similarity algorithm, with
    /// `normalize` and `ignore_case` on and `ignore_punctuation` off.
    fn default() -> Self {
        let algorithm = SimilarityAlgorithm::default();
        Self {
            algorithm,
            normalize: true,
            ignore_case: true,
            ignore_punctuation: false,
        }
    }
}
/// Selector for the similarity algorithm named in `ResponseMatchConfig`.
///
/// Variant names are (de)serialized in `snake_case` (for example `Jaccard` as
/// `"jaccard"`). The comparison logic behind each variant is not defined in this
/// file.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SimilarityAlgorithm {
    Exact,
    Contains,
    Levenshtein,
    /// The variant returned by `SimilarityAlgorithm::default()`.
    #[default]
    Jaccard,
    Rouge1,
    Rouge2,
    RougeL,
}
/// Settings for the semantic-match criterion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticMatchConfig {
    /// Name of the judge model; when absent from deserialized input it falls back
    /// to the value returned by `default_judge_model`.
    #[serde(default = "default_judge_model")]
    pub judge_model: String,
    /// Optional custom prompt; `None` in the `Default` impl.
    /// NOTE(review): presumably overrides a built-in judge prompt — confirm.
    pub custom_prompt: Option<String>,
}
impl Default for SemanticMatchConfig {
    /// Builds the default config: the default judge model and no custom prompt.
    fn default() -> Self {
        let judge_model = default_judge_model();
        Self {
            judge_model,
            custom_prompt: None,
        }
    }
}
/// Returns the judge model name used when none is configured.
///
/// Referenced by the `#[serde(default = "...")]` attribute on
/// `SemanticMatchConfig::judge_model` and by that type's `Default` impl.
fn default_judge_model() -> String {
    String::from("gemini-2.5-flash")
}
/// Holds the rubrics attached to the rubric-quality criterion.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RubricConfig {
    /// The rubrics themselves. Unlike the other collection fields in this file,
    /// this one carries no `#[serde(default)]` attribute.
    pub rubrics: Vec<Rubric>,
}
/// A single named rubric with a weight and an optional list of scoring levels.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Rubric {
    /// Rubric name.
    pub name: String,
    /// Free-text description of the rubric.
    pub description: String,
    /// Weight of this rubric; falls back to `1.0` (via `default_weight`) when
    /// absent from deserialized input.
    /// NOTE(review): how weights are combined is decided by the consumer — confirm.
    #[serde(default = "default_weight")]
    pub weight: f64,
    /// Scoring levels; empty when absent from deserialized input.
    #[serde(default)]
    pub levels: Vec<RubricLevel>,
}
impl Rubric {
    /// Creates a rubric with the given `name` and `description`, a weight of
    /// `1.0`, and no levels.
    #[must_use]
    pub fn new(name: &str, description: &str) -> Self {
        Self {
            name: name.to_owned(),
            description: description.to_owned(),
            // Same value the serde default for `weight` yields.
            weight: 1.0,
            levels: Vec::new(),
        }
    }

    /// Returns `self` with its weight replaced by `weight`.
    ///
    /// The value is stored as given; no range check is applied.
    #[must_use]
    pub fn with_weight(mut self, weight: f64) -> Self {
        self.weight = weight;
        self
    }

    /// Returns `self` with its levels replaced (not extended) by `levels`.
    #[must_use]
    pub fn with_levels(mut self, levels: Vec<RubricLevel>) -> Self {
        self.levels = levels;
        self
    }
}
/// One level of a rubric: a numeric score paired with a description.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RubricLevel {
    /// Score associated with this level.
    pub score: f64,
    /// Text describing this level.
    pub description: String,
}
/// A caller-defined criterion stored in `EvaluationCriteria::custom`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CustomCriterion {
    /// Criterion name.
    pub name: String,
    /// Free-text description of the criterion.
    pub description: String,
    /// Threshold value for this criterion.
    /// NOTE(review): pass/fail semantics are decided by the consumer — confirm.
    pub threshold: f64,
    /// Arbitrary JSON configuration; filled with the `Default` value of
    /// `serde_json::Value` when absent from deserialized input.
    #[serde(default)]
    pub config: serde_json::Value,
}
/// Serde default helper for boolean fields that should be `true` when omitted.
///
/// Referenced by name from `#[serde(default = "default_true")]` attributes.
fn default_true() -> bool {
    true
}
/// Serde default helper that supplies a rubric weight of `1.0` when omitted.
///
/// Referenced by name from `#[serde(default = "default_weight")]` on
/// `Rubric::weight`.
fn default_weight() -> f64 {
    1.0_f64
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Chained builders keep the values set by earlier calls in the chain.
    #[test]
    fn test_criteria_builder() {
        let built = EvaluationCriteria::exact_tools()
            .with_response_similarity(0.8)
            .with_semantic_match(0.9);

        let EvaluationCriteria {
            tool_trajectory_score,
            response_similarity,
            semantic_match_score,
            ..
        } = &built;

        assert_eq!(*tool_trajectory_score, Some(1.0));
        assert_eq!(*response_similarity, Some(0.8));
        assert_eq!(*semantic_match_score, Some(0.9));
        assert!(built.has_criteria());
    }

    /// A rubric built through `new` + `with_*` carries the supplied values.
    #[test]
    fn test_rubric_creation() {
        let level = |score: f64, text: &str| RubricLevel {
            score,
            description: text.to_string(),
        };
        let levels = vec![
            level(1.0, "Completely accurate"),
            level(0.5, "Partially accurate"),
            level(0.0, "Inaccurate"),
        ];

        let accuracy = Rubric::new("Accuracy", "Response is factually correct")
            .with_weight(0.7)
            .with_levels(levels);

        assert_eq!(accuracy.name, "Accuracy");
        assert_eq!(accuracy.weight, 0.7);
        assert_eq!(accuracy.levels.len(), 3);
    }

    /// Default-constructed criteria report that nothing is set.
    #[test]
    fn test_default_criteria() {
        let empty = EvaluationCriteria::default();
        assert!(!empty.has_criteria());
    }
}