/// Names of the scorers defined in this module.
///
/// Each entry is the string returned by the matching scorer's
/// `EvalScorer::name` implementation.
pub const KNOWN_SCORERS: &[&str] = &[
    "trajectory",
    "keyword",
    "similarity",
    "cost",
    "latency",
    "tool_call_count",
    "safety",
];
40
41use std::sync::Arc;
42
43use serde::{Deserialize, Serialize};
44
45use crate::agent::events::AgentEvent;
46use crate::error::Error;
47use crate::llm::pricing::estimate_cost;
48
/// A single evaluation case: the input handed to the agent plus the optional
/// expectations that the scorers check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalCase {
    /// Name of the case; copied into `EvalResult::case_name`.
    pub name: String,
    /// Input passed to the agent when the case is executed.
    pub input: String,
    /// Expected tool calls. `None` means there are no trajectory
    /// expectations; an empty list means no tool call is expected at all.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected_tools: Option<Vec<ExpectedToolCall>>,
    /// Substrings the output must contain (matched case-insensitively).
    #[serde(default)]
    pub output_contains: Vec<String>,
    /// Substrings the output must not contain (matched case-insensitively).
    #[serde(default)]
    pub output_not_contains: Vec<String>,
    /// Reference text the output is compared against by `SimilarityScorer`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reference_output: Option<String>,
    /// Per-case cost budget in USD; overrides the `CostScorer` default.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_cost_usd: Option<f64>,
    /// Per-case latency budget in milliseconds; overrides the
    /// `LatencyScorer` default.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_latency_ms: Option<u64>,
    /// Per-case tool-call budget; overrides the `ToolCallCountScorer` default.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_tool_calls: Option<usize>,
}
82
/// One tool call the agent is expected to make.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExpectedToolCall {
    /// Name of the expected tool.
    pub name: String,
    /// Zero-based index in the tool-call sequence where the tool must appear;
    /// `None` means the tool may appear at any position.
    pub order: Option<usize>,
}
92
93impl EvalCase {
94 pub fn new(name: impl Into<String>, input: impl Into<String>) -> Self {
96 Self {
97 name: name.into(),
98 input: input.into(),
99 expected_tools: None,
100 output_contains: Vec::new(),
101 output_not_contains: Vec::new(),
102 reference_output: None,
103 max_cost_usd: None,
104 max_latency_ms: None,
105 max_tool_calls: None,
106 }
107 }
108
109 pub fn expect_tool(mut self, name: impl Into<String>) -> Self {
111 self.expected_tools
112 .get_or_insert_with(Vec::new)
113 .push(ExpectedToolCall {
114 name: name.into(),
115 order: None,
116 });
117 self
118 }
119
120 pub fn expect_tool_at(mut self, name: impl Into<String>, position: usize) -> Self {
122 self.expected_tools
123 .get_or_insert_with(Vec::new)
124 .push(ExpectedToolCall {
125 name: name.into(),
126 order: Some(position),
127 });
128 self
129 }
130
131 pub fn expect_no_tools(mut self) -> Self {
133 self.expected_tools = Some(Vec::new());
134 self
135 }
136
137 pub fn expect_output_contains(mut self, text: impl Into<String>) -> Self {
139 self.output_contains.push(text.into());
140 self
141 }
142
143 pub fn expect_output_not_contains(mut self, text: impl Into<String>) -> Self {
145 self.output_not_contains.push(text.into());
146 self
147 }
148
149 pub fn reference_output(mut self, text: impl Into<String>) -> Self {
151 self.reference_output = Some(text.into());
152 self
153 }
154
155 pub fn expect_max_cost_usd(mut self, max: f64) -> Self {
157 self.max_cost_usd = Some(max);
158 self
159 }
160
161 pub fn expect_max_latency_ms(mut self, max: u64) -> Self {
163 self.max_latency_ms = Some(max);
164 self
165 }
166
167 pub fn expect_max_tool_calls(mut self, max: usize) -> Self {
169 self.max_tool_calls = Some(max);
170 self
171 }
172}
173
/// Outcome of scoring one `EvalCase`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
    /// Name of the case this result belongs to.
    pub case_name: String,
    /// Whether the case passed overall.
    pub passed: bool,
    /// One entry per scorer that was applied.
    pub scores: Vec<ScorerResult>,
    /// Names of the tool calls that were scored.
    pub actual_tools: Vec<String>,
    /// Output text that was scored.
    pub actual_output: String,
    /// Error message, if one was recorded for the case.
    pub error: Option<String>,
}
190
/// Result produced by a single scorer for a single case.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScorerResult {
    /// Name of the scorer that produced this result.
    pub scorer: String,
    /// Numeric score returned by the scorer.
    pub score: f64,
    /// Whether the score reached the scorer's pass threshold.
    pub passed: bool,
    /// Human-readable lines explaining the score.
    pub details: Vec<String>,
}
203
/// Aggregate statistics over a set of `EvalResult`s.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSummary {
    /// Number of results summarized.
    pub total: usize,
    /// Number of results marked as passed.
    pub passed: usize,
    /// Number of results counted as failed (neither passed nor errored).
    pub failed: usize,
    /// Number of results that carry an error.
    pub errors: usize,
    /// Mean of every individual scorer score across all results
    /// (0.0 when there are no scores).
    pub avg_score: f64,
    /// Mean score per scorer as `(scorer name, mean)`, sorted by name.
    pub scorer_averages: Vec<(String, f64)>,
}
220
221impl EvalSummary {
222 pub fn from_results(results: &[EvalResult]) -> Self {
224 let total = results.len();
225 let passed = results.iter().filter(|r| r.passed).count();
226 let errors = results.iter().filter(|r| r.error.is_some()).count();
227 let failed = total - passed - errors;
228
229 let mut all_scores: Vec<f64> = Vec::new();
231 let mut scorer_totals: std::collections::HashMap<String, (f64, usize)> =
232 std::collections::HashMap::new();
233
234 for result in results {
235 for sr in &result.scores {
236 all_scores.push(sr.score);
237 let entry = scorer_totals.entry(sr.scorer.clone()).or_insert((0.0, 0));
238 entry.0 += sr.score;
239 entry.1 += 1;
240 }
241 }
242
243 let avg_score = if all_scores.is_empty() {
244 0.0
245 } else {
246 all_scores.iter().sum::<f64>() / all_scores.len() as f64
247 };
248
249 let mut scorer_averages: Vec<(String, f64)> = scorer_totals
250 .into_iter()
251 .map(|(name, (sum, count))| (name, sum / count as f64))
252 .collect();
253 scorer_averages.sort_by(|a, b| a.0.cmp(&b.0));
254
255 Self {
256 total,
257 passed,
258 failed,
259 errors,
260 avg_score,
261 scorer_averages,
262 }
263 }
264
265 pub fn pass_rate(&self) -> f64 {
267 if self.total == 0 {
268 return 0.0;
269 }
270 self.passed as f64 / self.total as f64
271 }
272}
273
274impl std::fmt::Display for EvalSummary {
275 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
276 writeln!(f, "Eval Summary: {}/{} passed", self.passed, self.total)?;
277 writeln!(f, " Pass rate: {:.1}%", self.pass_rate() * 100.0)?;
278 writeln!(f, " Avg score: {:.3}", self.avg_score)?;
279 if self.errors > 0 {
280 writeln!(f, " Errors: {}", self.errors)?;
281 }
282 for (name, avg) in &self.scorer_averages {
283 writeln!(f, " {name}: {avg:.3}")?;
284 }
285 Ok(())
286 }
287}
288
/// A scoring strategy applied to the output and tool calls of one case.
pub trait EvalScorer: Send + Sync {
    /// Name recorded in `ScorerResult::scorer`.
    fn name(&self) -> &str;

    /// Scores `output` and `tool_calls` against the expectations in `case`.
    ///
    /// Returns the numeric score together with human-readable detail lines.
    fn score(&self, case: &EvalCase, output: &str, tool_calls: &[String]) -> (f64, Vec<String>);

    /// Minimum score that counts as a pass (a score passes when it is
    /// greater than or equal to this value). Defaults to 1.0.
    fn pass_threshold(&self) -> f64 {
        1.0
    }
}
314
315pub struct TrajectoryScorer;
328
329impl EvalScorer for TrajectoryScorer {
330 fn name(&self) -> &str {
331 "trajectory"
332 }
333
334 fn score(&self, case: &EvalCase, _output: &str, tool_calls: &[String]) -> (f64, Vec<String>) {
335 let expected = match &case.expected_tools {
336 None => return (1.0, vec!["no trajectory expectations".into()]),
337 Some(e) => e,
338 };
339
340 if expected.is_empty() {
342 return if tool_calls.is_empty() {
343 (1.0, vec!["correctly made no tool calls".into()])
344 } else {
345 (
346 0.0,
347 vec![format!(
348 "expected no tools but got: [{}]",
349 tool_calls.join(", ")
350 )],
351 )
352 };
353 }
354
355 let mut matched = 0usize;
356 let mut details = Vec::new();
357
358 for exp in expected {
359 if let Some(pos) = exp.order {
360 if tool_calls.get(pos).map(|s| s.as_str()) == Some(&exp.name) {
362 matched += 1;
363 details.push(format!("OK: {} at position {pos}", exp.name));
364 } else {
365 let actual = tool_calls.get(pos).map(|s| s.as_str()).unwrap_or("<none>");
366 details.push(format!(
367 "FAIL: expected {} at position {pos}, got {actual}",
368 exp.name
369 ));
370 }
371 } else {
372 if tool_calls.iter().any(|t| t == &exp.name) {
374 matched += 1;
375 details.push(format!("OK: {} found in trajectory", exp.name));
376 } else {
377 details.push(format!(
378 "FAIL: {} not found in [{}]",
379 exp.name,
380 tool_calls.join(", ")
381 ));
382 }
383 }
384 }
385
386 let score = matched as f64 / expected.len() as f64;
387 (score, details)
388 }
389}
390
391pub struct KeywordScorer;
400
401impl EvalScorer for KeywordScorer {
402 fn name(&self) -> &str {
403 "keyword"
404 }
405
406 fn score(&self, case: &EvalCase, output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
407 let total = case.output_contains.len() + case.output_not_contains.len();
408 if total == 0 {
409 return (1.0, vec!["no keyword expectations".into()]);
410 }
411
412 let lower_output = output.to_lowercase();
413 let mut matched = 0usize;
414 let mut details = Vec::new();
415
416 for keyword in &case.output_contains {
417 if lower_output.contains(&keyword.to_lowercase()) {
418 matched += 1;
419 details.push(format!("OK: output contains \"{keyword}\""));
420 } else {
421 details.push(format!("FAIL: output missing \"{keyword}\""));
422 }
423 }
424
425 for keyword in &case.output_not_contains {
426 if !lower_output.contains(&keyword.to_lowercase()) {
427 matched += 1;
428 details.push(format!("OK: output does not contain \"{keyword}\""));
429 } else {
430 details.push(format!("FAIL: output contains unwanted \"{keyword}\""));
431 }
432 }
433
434 let score = matched as f64 / total as f64;
435 (score, details)
436 }
437}
438
439pub struct SimilarityScorer;
448
449impl EvalScorer for SimilarityScorer {
450 fn name(&self) -> &str {
451 "similarity"
452 }
453
454 fn score(&self, case: &EvalCase, output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
455 let reference = match &case.reference_output {
456 None => return (1.0, vec!["no reference output".into()]),
457 Some(r) => r,
458 };
459
460 let score = rouge1_f1(output, reference);
461 let details = vec![format!("Rouge-1 F1: {score:.3}")];
462 (score, details)
463 }
464
465 fn pass_threshold(&self) -> f64 {
466 0.3 }
468}
469
/// F1 score of the overlap between the sets of distinct, lowercased,
/// whitespace-separated tokens of `candidate` and `reference`.
///
/// Returns 0.0 when either text has no tokens or the two share none.
fn rouge1_f1(candidate: &str, reference: &str) -> f64 {
    use std::collections::HashSet;

    let unigrams = |text: &str| -> HashSet<String> {
        text.split_whitespace().map(str::to_lowercase).collect()
    };
    let cand = unigrams(candidate);
    let refs = unigrams(reference);

    if cand.is_empty() || refs.is_empty() {
        return 0.0;
    }

    let shared = cand.iter().filter(|token| refs.contains(*token)).count() as f64;
    let precision = shared / cand.len() as f64;
    let recall = shared / refs.len() as f64;

    if precision + recall == 0.0 {
        return 0.0;
    }
    2.0 * precision * recall / (precision + recall)
}
499
500fn collect_tool_calls(events: &[AgentEvent]) -> Vec<String> {
506 events
507 .iter()
508 .filter_map(|e| match e {
509 AgentEvent::ToolCallStarted { tool_name, .. } => Some(tool_name.clone()),
510 _ => None,
511 })
512 .collect()
513}
514
/// Runs evaluation cases against an agent and applies the configured scorers.
pub struct EvalRunner {
    /// Scorers applied to every case, in insertion order.
    scorers: Vec<Box<dyn EvalScorer>>,
    /// Optional shared event buffer; when set, `run` clears it before each
    /// case.
    event_collector: Option<EventCollector>,
}
526
527impl std::fmt::Debug for EvalRunner {
528 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
529 f.debug_struct("EvalRunner")
530 .field(
531 "scorers",
532 &self.scorers.iter().map(|s| s.name()).collect::<Vec<_>>(),
533 )
534 .field("event_collector", &self.event_collector.is_some())
535 .finish()
536 }
537}
538
539impl Default for EvalRunner {
540 fn default() -> Self {
541 Self::new()
542 }
543}
544
545impl EvalRunner {
546 pub fn new() -> Self {
564 Self {
565 scorers: Vec::new(),
566 event_collector: None,
567 }
568 }
569
570 pub fn scorer(mut self, scorer: impl EvalScorer + 'static) -> Self {
572 self.scorers.push(Box::new(scorer));
573 self
574 }
575
576 pub fn with_event_collector(mut self, collector: EventCollector) -> Self {
585 self.event_collector = Some(collector);
586 self
587 }
588
589 pub async fn run<P: crate::llm::LlmProvider>(
601 &self,
602 agent: &crate::agent::AgentRunner<P>,
603 cases: &[EvalCase],
604 ) -> Vec<EvalResult> {
605 let mut results = Vec::with_capacity(cases.len());
606 for case in cases {
607 if let Some(collector) = self.event_collector.as_ref() {
608 clear_events(collector);
609 }
610 results.push(self.run_case(agent, case).await);
611 }
612 results
613 }
614
615 async fn run_case<P: crate::llm::LlmProvider>(
623 &self,
624 agent: &crate::agent::AgentRunner<P>,
625 case: &EvalCase,
626 ) -> EvalResult {
627 match agent.execute(&case.input).await {
628 Ok(output) => {
629 self.score_result(case, &output.result, &[], None)
632 }
633 Err(e) => EvalResult {
634 case_name: case.name.clone(),
635 passed: false,
636 scores: Vec::new(),
637 actual_tools: Vec::new(),
638 actual_output: String::new(),
639 error: Some(e.to_string()),
640 },
641 }
642 }
643
644 pub fn score_result(
649 &self,
650 case: &EvalCase,
651 output: &str,
652 tool_calls: &[String],
653 error: Option<String>,
654 ) -> EvalResult {
655 let scores: Vec<ScorerResult> = self
656 .scorers
657 .iter()
658 .map(|scorer| {
659 let (score, details) = scorer.score(case, output, tool_calls);
660 let passed = score >= scorer.pass_threshold();
661 ScorerResult {
662 scorer: scorer.name().to_string(),
663 score,
664 passed,
665 details,
666 }
667 })
668 .collect();
669
670 let passed = error.is_none() && scores.iter().all(|s| s.passed);
671
672 EvalResult {
673 case_name: case.name.clone(),
674 passed,
675 scores,
676 actual_tools: tool_calls.to_vec(),
677 actual_output: output.to_string(),
678 error,
679 }
680 }
681
682 pub fn event_collector() -> EventCollector {
687 Arc::new(std::sync::Mutex::new(Vec::new()))
688 }
689
690 pub fn event_callback(collector: &EventCollector) -> Arc<dyn Fn(AgentEvent) + Send + Sync> {
692 let collector = Arc::clone(collector);
693 Arc::new(move |event| {
694 collector.lock().expect("eval collector lock").push(event);
695 })
696 }
697
698 pub fn collected_tool_calls(collector: &EventCollector) -> Vec<String> {
700 let events = collector.lock().expect("eval collector lock");
701 collect_tool_calls(&events)
702 }
703}
704
/// Shared, mutex-protected buffer of `AgentEvent`s.
pub type EventCollector = Arc<std::sync::Mutex<Vec<AgentEvent>>>;
707
708pub fn clear_events(collector: &EventCollector) {
715 collector.lock().expect("clear_events lock").clear();
716}
717
718pub fn build_eval_agent<P: crate::llm::LlmProvider>(
726 builder: crate::agent::AgentRunnerBuilder<P>,
727) -> Result<(crate::agent::AgentRunner<P>, EventCollector), Error> {
728 let collector = EvalRunner::event_collector();
729 let callback = EvalRunner::event_callback(&collector);
730 let agent = builder.on_event(callback).build()?;
731 Ok((agent, collector))
732}
733
734pub struct CostScorer {
750 collector: EventCollector,
751 max_cost_usd: f64,
752}
753
754impl CostScorer {
755 pub fn new(collector: EventCollector, max_cost_usd: f64) -> Self {
759 Self {
760 collector,
761 max_cost_usd,
762 }
763 }
764}
765
766impl EvalScorer for CostScorer {
767 fn name(&self) -> &str {
768 "cost"
769 }
770
771 fn score(&self, case: &EvalCase, _output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
772 let max = case.max_cost_usd.unwrap_or(self.max_cost_usd);
773 if max <= 0.0 {
774 return (0.0, vec!["max cost budget is zero".into()]);
775 }
776 let events = self.collector.lock().expect("cost collector lock");
777 let mut total_cost = 0.0f64;
778 let mut details = Vec::new();
779
780 for event in events.iter() {
781 if let AgentEvent::LlmResponse { usage, model, .. } = event {
782 let model_name = model.as_deref().unwrap_or("unknown");
783 match estimate_cost(model_name, usage) {
784 Some(cost) => total_cost += cost,
785 None => {
786 details.push(format!("unknown model \"{model_name}\": $0 contributed"));
787 }
788 }
789 }
790 }
791
792 details.insert(0, format!("total cost: ${total_cost:.6} (max: ${max:.6})"));
793 (budget_score(total_cost, max), details)
794 }
795
796 fn pass_threshold(&self) -> f64 {
797 0.01
798 }
799}
800
801pub struct LatencyScorer {
815 collector: EventCollector,
816 max_latency_ms: u64,
817}
818
819impl LatencyScorer {
820 pub fn new(collector: EventCollector, max_latency_ms: u64) -> Self {
824 Self {
825 collector,
826 max_latency_ms,
827 }
828 }
829}
830
831impl EvalScorer for LatencyScorer {
832 fn name(&self) -> &str {
833 "latency"
834 }
835
836 fn score(&self, case: &EvalCase, _output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
837 let max = case.max_latency_ms.unwrap_or(self.max_latency_ms);
838 if max == 0 {
839 return (0.0, vec!["max latency budget is zero".into()]);
840 }
841 let events = self.collector.lock().expect("latency collector lock");
842 let total_ms: u64 = events
843 .iter()
844 .filter_map(|e| match e {
845 AgentEvent::LlmResponse { latency_ms, .. } => Some(latency_ms),
846 _ => None,
847 })
848 .sum();
849
850 let details = vec![format!("total latency: {total_ms}ms (max: {max}ms)")];
851 (budget_score(total_ms as f64, max as f64), details)
852 }
853
854 fn pass_threshold(&self) -> f64 {
855 0.01
856 }
857}
858
859pub struct ToolCallCountScorer {
868 max_calls: usize,
869}
870
871impl ToolCallCountScorer {
872 pub fn new(max_calls: usize) -> Self {
876 Self { max_calls }
877 }
878}
879
880impl EvalScorer for ToolCallCountScorer {
881 fn name(&self) -> &str {
882 "tool_call_count"
883 }
884
885 fn score(&self, case: &EvalCase, _output: &str, tool_calls: &[String]) -> (f64, Vec<String>) {
886 let max = case.max_tool_calls.unwrap_or(self.max_calls);
887 if max == 0 {
888 return (0.0, vec!["max tool call budget is zero".into()]);
889 }
890 let count = tool_calls.len();
891 let details = vec![format!("tool calls: {count} (max: {max})")];
892 (budget_score(count as f64, max as f64), details)
893 }
894
895 fn pass_threshold(&self) -> f64 {
896 0.01
897 }
898}
899
900pub struct SafetyScorer {
916 collector: EventCollector,
917}
918
919impl SafetyScorer {
920 pub fn new(collector: EventCollector) -> Self {
922 Self { collector }
923 }
924}
925
926impl EvalScorer for SafetyScorer {
927 fn name(&self) -> &str {
928 "safety"
929 }
930
931 fn score(&self, _case: &EvalCase, _output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
932 let events = self.collector.lock().expect("safety collector lock");
933 let mut denials = Vec::new();
934
935 for event in events.iter() {
936 if let AgentEvent::GuardrailDenied {
937 hook,
938 reason,
939 tool_name,
940 ..
941 } = event
942 {
943 let tool_info = tool_name
944 .as_deref()
945 .map(|t| format!(" (tool: {t})"))
946 .unwrap_or_default();
947 denials.push(format!("denied at {hook}{tool_info}: {reason}"));
948 }
949 }
950
951 if denials.is_empty() {
952 (1.0, vec!["no guardrail denials".into()])
953 } else {
954 (0.0, denials)
955 }
956 }
957
958 fn pass_threshold(&self) -> f64 {
959 1.0
960 }
961}
962
/// Minimum absolute change in a case's average score for the case to count
/// as a regression or as a candidate win; smaller changes are ties.
const REGRESSION_TOLERANCE: f64 = 0.001;
969
/// Case-by-case comparison of a candidate run against a baseline run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalComparison {
    /// One entry per candidate case that also exists in the baseline.
    pub cases: Vec<CaseComparison>,
}
976
/// Comparison of one case between the baseline and the candidate run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CaseComparison {
    /// Name shared by the baseline and candidate results.
    pub case_name: String,
    /// Mean scorer score of the baseline result.
    pub baseline_avg_score: f64,
    /// Mean scorer score of the candidate result.
    pub candidate_avg_score: f64,
    /// `candidate_avg_score - baseline_avg_score`.
    pub delta: f64,
    /// Whether the candidate scored lower than the baseline by more than the
    /// regression tolerance.
    pub regressed: bool,
}
991
992impl EvalComparison {
993 pub fn compare(baseline: &[EvalResult], candidate: &[EvalResult]) -> Self {
997 let baseline_map: std::collections::HashMap<&str, &EvalResult> =
998 baseline.iter().map(|r| (r.case_name.as_str(), r)).collect();
999
1000 let cases: Vec<CaseComparison> = candidate
1001 .iter()
1002 .filter_map(|cand_result| {
1003 let base_result = baseline_map.get(cand_result.case_name.as_str())?;
1004 let base_avg = avg_score(&base_result.scores);
1005 let cand_avg = avg_score(&cand_result.scores);
1006 let delta = cand_avg - base_avg;
1007 Some(CaseComparison {
1008 case_name: cand_result.case_name.clone(),
1009 baseline_avg_score: base_avg,
1010 candidate_avg_score: cand_avg,
1011 delta,
1012 regressed: delta < -REGRESSION_TOLERANCE,
1013 })
1014 })
1015 .collect();
1016
1017 Self { cases }
1018 }
1019
1020 pub fn baseline_wins(&self) -> usize {
1022 self.cases.iter().filter(|c| c.regressed).count()
1023 }
1024
1025 pub fn candidate_wins(&self) -> usize {
1027 self.cases
1028 .iter()
1029 .filter(|c| c.delta > REGRESSION_TOLERANCE)
1030 .count()
1031 }
1032
1033 pub fn ties(&self) -> usize {
1035 self.cases.len() - self.baseline_wins() - self.candidate_wins()
1036 }
1037
1038 pub fn has_regressions(&self) -> bool {
1040 self.cases.iter().any(|c| c.regressed)
1041 }
1042
1043 pub fn regressions(&self) -> Vec<&str> {
1045 self.cases
1046 .iter()
1047 .filter(|c| c.regressed)
1048 .map(|c| c.case_name.as_str())
1049 .collect()
1050 }
1051}
1052
1053impl std::fmt::Display for EvalComparison {
1054 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1055 writeln!(
1056 f,
1057 "A/B Comparison: {} cases ({} baseline wins, {} candidate wins, {} ties)",
1058 self.cases.len(),
1059 self.baseline_wins(),
1060 self.candidate_wins(),
1061 self.ties()
1062 )?;
1063 for c in &self.cases {
1064 let marker = if c.regressed { "REGRESSED" } else { "ok" };
1065 writeln!(
1066 f,
1067 " {}: baseline={:.3} candidate={:.3} delta={:+.3} [{}]",
1068 c.case_name, c.baseline_avg_score, c.candidate_avg_score, c.delta, marker
1069 )?;
1070 }
1071 let regressions = self.regressions();
1072 if !regressions.is_empty() {
1073 writeln!(f, " Regressions: {}", regressions.join(", "))?;
1074 }
1075 Ok(())
1076 }
1077}
1078
/// Fraction of the budget `max` left after spending `actual`, floored at 0.0.
fn budget_score(actual: f64, max: f64) -> f64 {
    let remaining = 1.0 - actual / max;
    f64::max(remaining, 0.0)
}
1083
1084fn avg_score(scores: &[ScorerResult]) -> f64 {
1086 if scores.is_empty() {
1087 return 0.0;
1088 }
1089 scores.iter().map(|s| s.score).sum::<f64>() / scores.len() as f64
1090}
1091
1092#[cfg(test)]
1097mod tests {
1098 use super::*;
1099
1100 #[test]
1105 fn eval_case_new() {
1106 let case = EvalCase::new("test", "do something");
1107 assert_eq!(case.name, "test");
1108 assert_eq!(case.input, "do something");
1109 assert!(case.expected_tools.is_none());
1110 assert!(case.output_contains.is_empty());
1111 assert!(case.output_not_contains.is_empty());
1112 assert!(case.reference_output.is_none());
1113 }
1114
1115 #[test]
1116 fn eval_case_expect_tool() {
1117 let case = EvalCase::new("t", "i")
1118 .expect_tool("bash")
1119 .expect_tool("read_file");
1120 let tools = case.expected_tools.as_ref().unwrap();
1121 assert_eq!(tools.len(), 2);
1122 assert_eq!(tools[0].name, "bash");
1123 assert!(tools[0].order.is_none());
1124 assert_eq!(tools[1].name, "read_file");
1125 }
1126
1127 #[test]
1128 fn eval_case_expect_tool_at() {
1129 let case = EvalCase::new("t", "i")
1130 .expect_tool_at("bash", 0)
1131 .expect_tool_at("read_file", 1);
1132 let tools = case.expected_tools.as_ref().unwrap();
1133 assert_eq!(tools[0].order, Some(0));
1134 assert_eq!(tools[1].order, Some(1));
1135 }
1136
1137 #[test]
1138 fn eval_case_expect_no_tools() {
1139 let case = EvalCase::new("t", "i").expect_no_tools();
1140 let tools = case.expected_tools.as_ref().unwrap();
1141 assert!(tools.is_empty());
1142 }
1143
1144 #[test]
1145 fn eval_case_expect_output() {
1146 let case = EvalCase::new("t", "i")
1147 .expect_output_contains("hello")
1148 .expect_output_not_contains("error");
1149 assert_eq!(case.output_contains, vec!["hello"]);
1150 assert_eq!(case.output_not_contains, vec!["error"]);
1151 }
1152
1153 #[test]
1154 fn eval_case_reference_output() {
1155 let case = EvalCase::new("t", "i").reference_output("expected answer");
1156 assert_eq!(case.reference_output.as_deref(), Some("expected answer"));
1157 }
1158
1159 #[test]
1164 fn trajectory_no_expectations_passes() {
1165 let case = EvalCase::new("t", "i"); let (score, _) = TrajectoryScorer.score(&case, "", &["bash".into()]);
1167 assert_eq!(score, 1.0);
1168 }
1169
1170 #[test]
1171 fn trajectory_expect_no_tools_with_none() {
1172 let case = EvalCase::new("t", "i").expect_no_tools();
1173 let (score, _) = TrajectoryScorer.score(&case, "", &[]);
1174 assert_eq!(score, 1.0);
1175 }
1176
1177 #[test]
1178 fn trajectory_expect_no_tools_but_got_some() {
1179 let case = EvalCase::new("t", "i").expect_no_tools();
1180 let (score, details) = TrajectoryScorer.score(&case, "", &["bash".into()]);
1181 assert_eq!(score, 0.0);
1182 assert!(details[0].contains("expected no tools"));
1183 }
1184
1185 #[test]
1186 fn trajectory_unordered_match() {
1187 let case = EvalCase::new("t", "i")
1188 .expect_tool("read_file")
1189 .expect_tool("bash");
1190 let tools = vec!["bash".into(), "read_file".into()];
1191 let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1192 assert_eq!(score, 1.0);
1193 }
1194
1195 #[test]
1196 fn trajectory_unordered_partial_match() {
1197 let case = EvalCase::new("t", "i")
1198 .expect_tool("read_file")
1199 .expect_tool("bash");
1200 let tools = vec!["bash".into()];
1201 let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1202 assert_eq!(score, 0.5);
1203 }
1204
1205 #[test]
1206 fn trajectory_unordered_no_match() {
1207 let case = EvalCase::new("t", "i").expect_tool("bash");
1208 let tools: Vec<String> = vec!["read_file".into()];
1209 let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1210 assert_eq!(score, 0.0);
1211 }
1212
1213 #[test]
1214 fn trajectory_ordered_exact_match() {
1215 let case = EvalCase::new("t", "i")
1216 .expect_tool_at("read_file", 0)
1217 .expect_tool_at("bash", 1);
1218 let tools = vec!["read_file".into(), "bash".into()];
1219 let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1220 assert_eq!(score, 1.0);
1221 }
1222
1223 #[test]
1224 fn trajectory_ordered_wrong_position() {
1225 let case = EvalCase::new("t", "i")
1226 .expect_tool_at("bash", 0)
1227 .expect_tool_at("read_file", 1);
1228 let tools = vec!["read_file".into(), "bash".into()]; let (score, details) = TrajectoryScorer.score(&case, "", &tools);
1230 assert_eq!(score, 0.0);
1231 assert!(details[0].contains("FAIL"));
1232 }
1233
1234 #[test]
1235 fn trajectory_ordered_position_out_of_bounds() {
1236 let case = EvalCase::new("t", "i").expect_tool_at("bash", 5);
1237 let tools = vec!["bash".into()];
1238 let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1239 assert_eq!(score, 0.0);
1240 }
1241
1242 #[test]
1243 fn trajectory_mixed_ordered_unordered() {
1244 let case = EvalCase::new("t", "i")
1245 .expect_tool_at("read_file", 0) .expect_tool("bash"); let tools = vec!["read_file".into(), "write_file".into(), "bash".into()];
1248 let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1249 assert_eq!(score, 1.0);
1250 }
1251
1252 #[test]
1257 fn keyword_no_expectations_passes() {
1258 let case = EvalCase::new("t", "i");
1259 let (score, _) = KeywordScorer.score(&case, "any output", &[]);
1260 assert_eq!(score, 1.0);
1261 }
1262
1263 #[test]
1264 fn keyword_contains_match() {
1265 let case = EvalCase::new("t", "i")
1266 .expect_output_contains("hello")
1267 .expect_output_contains("world");
1268 let (score, _) = KeywordScorer.score(&case, "Hello World", &[]);
1269 assert_eq!(score, 1.0); }
1271
1272 #[test]
1273 fn keyword_contains_partial_match() {
1274 let case = EvalCase::new("t", "i")
1275 .expect_output_contains("hello")
1276 .expect_output_contains("missing");
1277 let (score, _) = KeywordScorer.score(&case, "hello there", &[]);
1278 assert_eq!(score, 0.5);
1279 }
1280
1281 #[test]
1282 fn keyword_not_contains_match() {
1283 let case = EvalCase::new("t", "i")
1284 .expect_output_not_contains("error")
1285 .expect_output_not_contains("fail");
1286 let (score, _) = KeywordScorer.score(&case, "success!", &[]);
1287 assert_eq!(score, 1.0);
1288 }
1289
1290 #[test]
1291 fn keyword_not_contains_violation() {
1292 let case = EvalCase::new("t", "i").expect_output_not_contains("error");
1293 let (score, details) = KeywordScorer.score(&case, "An Error occurred", &[]);
1294 assert_eq!(score, 0.0); assert!(details[0].contains("FAIL"));
1296 }
1297
1298 #[test]
1299 fn keyword_mixed_contains_and_not_contains() {
1300 let case = EvalCase::new("t", "i")
1301 .expect_output_contains("result")
1302 .expect_output_not_contains("error");
1303 let (score, _) = KeywordScorer.score(&case, "the result is 42", &[]);
1305 assert_eq!(score, 1.0);
1306
1307 let (score, _) = KeywordScorer.score(&case, "no match here", &[]);
1309 assert_eq!(score, 0.5);
1310 }
1311
1312 #[test]
1317 fn similarity_no_reference_passes() {
1318 let case = EvalCase::new("t", "i");
1319 let (score, _) = SimilarityScorer.score(&case, "any output", &[]);
1320 assert_eq!(score, 1.0);
1321 }
1322
1323 #[test]
1324 fn similarity_identical_text() {
1325 let case = EvalCase::new("t", "i").reference_output("hello world");
1326 let (score, _) = SimilarityScorer.score(&case, "hello world", &[]);
1327 assert_eq!(score, 1.0);
1328 }
1329
1330 #[test]
1331 fn similarity_partial_overlap() {
1332 let case =
1333 EvalCase::new("t", "i").reference_output("the quick brown fox jumps over the lazy dog");
1334 let (score, _) = SimilarityScorer.score(&case, "the quick brown cat", &[]);
1335 assert!(score > 0.0);
1336 assert!(score < 1.0);
1337 }
1338
1339 #[test]
1340 fn similarity_no_overlap() {
1341 let case = EvalCase::new("t", "i").reference_output("alpha beta gamma");
1342 let (score, _) = SimilarityScorer.score(&case, "one two three", &[]);
1343 assert_eq!(score, 0.0);
1344 }
1345
1346 #[test]
1347 fn similarity_case_insensitive() {
1348 let case = EvalCase::new("t", "i").reference_output("Hello World");
1349 let (score, _) = SimilarityScorer.score(&case, "hello world", &[]);
1350 assert_eq!(score, 1.0);
1351 }
1352
1353 #[test]
1354 fn similarity_empty_candidate() {
1355 let case = EvalCase::new("t", "i").reference_output("hello world");
1356 let (score, _) = SimilarityScorer.score(&case, "", &[]);
1357 assert_eq!(score, 0.0);
1358 }
1359
1360 #[test]
1361 fn similarity_empty_reference() {
1362 let case = EvalCase::new("t", "i").reference_output("");
1363 let (score, _) = SimilarityScorer.score(&case, "hello world", &[]);
1364 assert_eq!(score, 0.0);
1365 }
1366
1367 #[test]
1372 fn rouge1_identical() {
1373 assert_eq!(rouge1_f1("hello world", "hello world"), 1.0);
1374 }
1375
1376 #[test]
1377 fn rouge1_no_overlap() {
1378 assert_eq!(rouge1_f1("a b c", "x y z"), 0.0);
1379 }
1380
1381 #[test]
1382 fn rouge1_partial() {
1383 assert_eq!(rouge1_f1("the cat", "the dog"), 0.5);
1388 }
1389
1390 #[test]
1391 fn rouge1_empty_candidate() {
1392 assert_eq!(rouge1_f1("", "hello"), 0.0);
1393 }
1394
1395 #[test]
1396 fn rouge1_empty_reference() {
1397 assert_eq!(rouge1_f1("hello", ""), 0.0);
1398 }
1399
1400 #[test]
1405 fn score_result_no_scorers() {
1406 let runner = EvalRunner::new();
1407 let case = EvalCase::new("t", "i");
1408 let result = runner.score_result(&case, "output", &[], None);
1409 assert!(result.passed);
1410 assert!(result.scores.is_empty());
1411 }
1412
1413 #[test]
1414 fn score_result_all_pass() {
1415 let runner = EvalRunner::new()
1416 .scorer(TrajectoryScorer)
1417 .scorer(KeywordScorer);
1418 let case = EvalCase::new("t", "i")
1419 .expect_tool("bash")
1420 .expect_output_contains("done");
1421 let result = runner.score_result(&case, "done!", &["bash".into()], None);
1422 assert!(result.passed);
1423 assert_eq!(result.scores.len(), 2);
1424 assert!(result.scores.iter().all(|s| s.passed));
1425 }
1426
1427 #[test]
1428 fn score_result_trajectory_fails() {
1429 let runner = EvalRunner::new().scorer(TrajectoryScorer);
1430 let case = EvalCase::new("t", "i").expect_tool("bash");
1431 let result = runner.score_result(&case, "output", &["read_file".into()], None);
1432 assert!(!result.passed);
1433 }
1434
1435 #[test]
1436 fn score_result_with_error() {
1437 let runner = EvalRunner::new().scorer(TrajectoryScorer);
1438 let case = EvalCase::new("t", "i");
1439 let result = runner.score_result(&case, "", &[], Some("agent failed".into()));
1440 assert!(!result.passed);
1441 assert_eq!(result.error.as_deref(), Some("agent failed"));
1442 }
1443
1444 #[test]
1445 fn score_result_preserves_actual_data() {
1446 let runner = EvalRunner::new();
1447 let case = EvalCase::new("test-case", "i");
1448 let tools = vec!["bash".into(), "read".into()];
1449 let result = runner.score_result(&case, "my output", &tools, None);
1450 assert_eq!(result.case_name, "test-case");
1451 assert_eq!(result.actual_output, "my output");
1452 assert_eq!(result.actual_tools, vec!["bash", "read"]);
1453 }
1454
1455 #[test]
1460 fn summary_empty_results() {
1461 let summary = EvalSummary::from_results(&[]);
1462 assert_eq!(summary.total, 0);
1463 assert_eq!(summary.passed, 0);
1464 assert_eq!(summary.pass_rate(), 0.0);
1465 }
1466
/// Two passing cases with perfect trajectory scores summarise to a 100% pass
/// rate and an average score of 1.0.
#[test]
fn summary_all_pass() {
    // Both fixtures are identical apart from the case name.
    let passing_case = |name: &'static str| EvalResult {
        case_name: name.into(),
        passed: true,
        scores: vec![ScorerResult {
            scorer: "trajectory".into(),
            score: 1.0,
            passed: true,
            details: vec![],
        }],
        actual_tools: vec![],
        actual_output: String::new(),
        error: None,
    };
    let results = vec![passing_case("a"), passing_case("b")];

    let summary = EvalSummary::from_results(&results);

    assert_eq!(summary.total, 2);
    assert_eq!(summary.passed, 2);
    assert_eq!(summary.failed, 0);
    assert_eq!(summary.pass_rate(), 1.0);
    assert_eq!(summary.avg_score, 1.0);
}
1504
/// One passing, one failing and one errored case are counted in separate
/// buckets, and the pass rate is one third.
#[test]
fn summary_mixed_results() {
    // A case scored by the keyword scorer only; the case verdict mirrors the scorer verdict.
    let keyword_case = |name: &'static str, score: f64, passed: bool| EvalResult {
        case_name: name.into(),
        passed,
        scores: vec![ScorerResult {
            scorer: "keyword".into(),
            score,
            passed,
            details: vec![],
        }],
        actual_tools: vec![],
        actual_output: String::new(),
        error: None,
    };
    // A case that carries an error and no scores.
    let errored_case = EvalResult {
        case_name: "error".into(),
        passed: false,
        scores: vec![],
        actual_tools: vec![],
        actual_output: String::new(),
        error: Some("agent failed".into()),
    };
    let results = vec![
        keyword_case("pass", 1.0, true),
        keyword_case("fail", 0.5, false),
        errored_case,
    ];

    let summary = EvalSummary::from_results(&results);

    assert_eq!(summary.total, 3);
    assert_eq!(summary.passed, 1);
    assert_eq!(summary.failed, 1);
    assert_eq!(summary.errors, 1);
    assert!((summary.pass_rate() - 1.0 / 3.0).abs() < 0.001);
}
1550
/// Per-scorer averages are computed across cases: trajectory (1.0, 0.5) -> 0.75
/// and keyword (0.8, 1.0) -> 0.9.
#[test]
fn summary_scorer_averages() {
    let scored = |scorer: &'static str, score: f64, passed: bool| ScorerResult {
        scorer: scorer.into(),
        score,
        passed,
        details: vec![],
    };
    let case = |name: &'static str, passed: bool, scores: Vec<ScorerResult>| EvalResult {
        case_name: name.into(),
        passed,
        scores,
        actual_tools: vec![],
        actual_output: String::new(),
        error: None,
    };
    let results = vec![
        case(
            "a",
            true,
            vec![
                scored("trajectory", 1.0, true),
                scored("keyword", 0.8, true),
            ],
        ),
        case(
            "b",
            false,
            vec![
                scored("trajectory", 0.5, false),
                scored("keyword", 1.0, true),
            ],
        ),
    ];

    let summary = EvalSummary::from_results(&results);

    // Looks up the average recorded for one scorer; panics if it is missing.
    let average_of = |wanted: &str| -> f64 {
        summary
            .scorer_averages
            .iter()
            .find(|(name, _)| name == wanted)
            .unwrap()
            .1
    };

    assert!((average_of("trajectory") - 0.75).abs() < 0.001);
    assert!((average_of("keyword") - 0.9).abs() < 0.001);
}
1613
/// The `Display` output of a summary mentions the pass count and percentage.
#[test]
fn summary_display() {
    let only_case = EvalResult {
        case_name: "a".into(),
        passed: true,
        scores: vec![ScorerResult {
            scorer: "trajectory".into(),
            score: 1.0,
            passed: true,
            details: vec![],
        }],
        actual_tools: vec![],
        actual_output: String::new(),
        error: None,
    };
    let results = vec![only_case];

    let rendered = EvalSummary::from_results(&results).to_string();

    assert!(rendered.contains("1/1 passed"));
    assert!(rendered.contains("100.0%"));
}
1634
/// Only `ToolCallStarted` events contribute tool names, in emission order;
/// `RunStarted` and `ToolCallCompleted` are ignored.
#[test]
fn collect_tool_calls_extracts_started_events() {
    let started = |tool: &'static str, call_id: &'static str| AgentEvent::ToolCallStarted {
        agent: "a".into(),
        tool_name: tool.into(),
        tool_call_id: call_id.into(),
        input: "{}".into(),
    };
    let run_started = AgentEvent::RunStarted {
        agent: "a".into(),
        task: "t".into(),
    };
    let bash_completed = AgentEvent::ToolCallCompleted {
        agent: "a".into(),
        tool_name: "bash".into(),
        tool_call_id: "c1".into(),
        is_error: false,
        duration_ms: 10,
        output: String::new(),
    };
    let events = vec![
        run_started,
        started("bash", "c1"),
        bash_completed,
        started("read_file", "c2"),
    ];

    let tool_names = collect_tool_calls(&events);

    assert_eq!(tool_names, vec!["bash", "read_file"]);
}
1670
/// An empty event slice yields no tool calls.
#[test]
fn collect_tool_calls_empty_events() {
    let tool_names = collect_tool_calls(&[]);

    assert!(tool_names.is_empty());
}
1676
/// Events sent through the callback end up in the collector it was built
/// from, and can be read back as tool names.
#[test]
fn event_collector_and_callback() {
    let collector = EvalRunner::event_collector();
    let callback = EvalRunner::event_callback(&collector);

    // Sends one `ToolCallStarted` event through the callback.
    let emit_started = |tool: &'static str, call_id: &'static str| {
        callback(AgentEvent::ToolCallStarted {
            agent: "a".into(),
            tool_name: tool.into(),
            tool_call_id: call_id.into(),
            input: "{}".into(),
        });
    };
    emit_started("bash", "c1");
    emit_started("read_file", "c2");

    let tool_names = EvalRunner::collected_tool_calls(&collector);
    assert_eq!(tool_names, vec!["bash", "read_file"]);
}
1702
/// With trajectory, keyword and similarity scorers registered, a run that
/// matches every expectation passes all three.
#[test]
fn runner_full_scoring_pass() {
    let case = EvalCase::new("full", "test")
        .expect_tool("bash")
        .expect_output_contains("result")
        .reference_output("the result is 42");
    let runner = EvalRunner::new()
        .scorer(TrajectoryScorer)
        .scorer(KeywordScorer)
        .scorer(SimilarityScorer);
    let observed_tools: Vec<String> = vec!["bash".into()];

    let outcome = runner.score_result(&case, "the result is 42", &observed_tools, None);

    assert!(outcome.passed);
    assert_eq!(outcome.scores.len(), 3);
    assert!(outcome.scores.iter().all(|entry| entry.passed));
}
1725
/// When neither the tool nor the keyword expectation is met, every scorer
/// fails and so does the case.
#[test]
fn runner_full_scoring_fail() {
    let case = EvalCase::new("fail", "test")
        .expect_tool("bash")
        .expect_output_contains("result");
    let runner = EvalRunner::new()
        .scorer(TrajectoryScorer)
        .scorer(KeywordScorer);
    let observed_tools: Vec<String> = vec!["read_file".into()];

    let outcome = runner.score_result(&case, "no match here", &observed_tools, None);

    assert!(!outcome.passed);
    assert!(outcome.scores.iter().all(|entry| !entry.passed));
}
1742
1743 #[tokio::test]
1748 async fn runner_run_with_mock_agent() {
1749 use crate::llm::LlmProvider;
1750 use crate::llm::types::{CompletionRequest, CompletionResponse, ContentBlock, StopReason};
1751 use std::sync::Mutex;
1752
1753 struct MockProvider {
1754 response: Mutex<Option<String>>,
1755 }
1756
1757 impl LlmProvider for MockProvider {
1758 async fn complete(
1759 &self,
1760 _request: CompletionRequest,
1761 ) -> Result<CompletionResponse, crate::error::Error> {
1762 let text = self
1763 .response
1764 .lock()
1765 .expect("mock")
1766 .take()
1767 .unwrap_or_default();
1768 Ok(CompletionResponse {
1769 content: vec![ContentBlock::Text { text }],
1770 stop_reason: StopReason::EndTurn,
1771 usage: Default::default(),
1772 model: None,
1773 })
1774 }
1775 }
1776
1777 let provider = Arc::new(MockProvider {
1778 response: Mutex::new(Some("hello world".into())),
1779 });
1780 let agent = crate::agent::AgentRunner::builder(provider)
1781 .name("eval-test")
1782 .system_prompt("test")
1783 .max_turns(1)
1784 .build()
1785 .unwrap();
1786
1787 let runner = EvalRunner::new().scorer(KeywordScorer);
1788 let cases = vec![EvalCase::new("greeting", "say hello").expect_output_contains("hello")];
1789
1790 let results = runner.run(&agent, &cases).await;
1791 assert_eq!(results.len(), 1);
1792 assert!(results[0].passed);
1793 assert_eq!(results[0].actual_output, "hello world");
1794 }
1795
1796 #[tokio::test]
1797 async fn run_clears_attached_event_collector_between_cases() {
1798 use crate::llm::LlmProvider;
1799 use crate::llm::types::{CompletionRequest, CompletionResponse, ContentBlock, StopReason};
1800 use std::sync::Mutex;
1801
1802 struct MockProvider;
1803 impl LlmProvider for MockProvider {
1804 async fn complete(
1805 &self,
1806 _request: CompletionRequest,
1807 ) -> Result<CompletionResponse, crate::error::Error> {
1808 Ok(CompletionResponse {
1809 content: vec![ContentBlock::Text { text: "ok".into() }],
1810 stop_reason: StopReason::EndTurn,
1811 usage: Default::default(),
1812 model: None,
1813 })
1814 }
1815 }
1816
1817 struct EventCounter {
1819 collector: EventCollector,
1820 seen: Arc<Mutex<Vec<usize>>>,
1821 }
1822 impl EvalScorer for EventCounter {
1823 fn name(&self) -> &str {
1824 "event-counter"
1825 }
1826 fn score(&self, _c: &EvalCase, _o: &str, _t: &[String]) -> (f64, Vec<String>) {
1827 let n = self.collector.lock().expect("counter lock").len();
1828 self.seen.lock().expect("seen lock").push(n);
1829 (1.0, vec![])
1830 }
1831 fn pass_threshold(&self) -> f64 {
1832 0.0
1833 }
1834 }
1835
1836 let collector = EvalRunner::event_collector();
1837 let agent = crate::agent::AgentRunner::builder(Arc::new(MockProvider))
1838 .name("eval-isolation")
1839 .system_prompt("test")
1840 .max_turns(1)
1841 .on_event(EvalRunner::event_callback(&collector))
1842 .build()
1843 .unwrap();
1844
1845 let seen = Arc::new(Mutex::new(Vec::<usize>::new()));
1846 let runner = EvalRunner::new()
1847 .with_event_collector(collector.clone())
1848 .scorer(EventCounter {
1849 collector: collector.clone(),
1850 seen: seen.clone(),
1851 });
1852
1853 let cases = vec![EvalCase::new("c1", "first"), EvalCase::new("c2", "second")];
1854 let _ = runner.run(&agent, &cases).await;
1855
1856 let seen = seen.lock().expect("seen lock").clone();
1860 assert_eq!(seen.len(), 2, "scorer should run once per case");
1861 assert_eq!(
1862 seen[0], seen[1],
1863 "with_event_collector must clear the collector between cases (saw {seen:?})"
1864 );
1865 assert!(
1866 seen[0] > 0,
1867 "expected at least one captured event per case (saw {seen:?})"
1868 );
1869 }
1870
/// A freshly constructed case has no cost, latency or tool-call budget.
#[test]
fn eval_case_budget_defaults_none() {
    let fresh = EvalCase::new("t", "i");

    assert!(fresh.max_cost_usd.is_none());
    assert!(fresh.max_latency_ms.is_none());
    assert!(fresh.max_tool_calls.is_none());
}
1882
/// Each budget builder stores its argument in the matching field.
#[test]
fn eval_case_budget_builders() {
    let budgeted = EvalCase::new("t", "i")
        .expect_max_cost_usd(0.05)
        .expect_max_latency_ms(5000)
        .expect_max_tool_calls(10);

    assert_eq!(budgeted.max_cost_usd, Some(0.05));
    assert_eq!(budgeted.max_latency_ms, Some(5000));
    assert_eq!(budgeted.max_tool_calls, Some(10));
}
1893
/// The serialised case contains its name and the configured cost budget.
#[test]
fn eval_case_serializes_to_json() {
    let case = EvalCase::new("test", "do it")
        .expect_tool("bash")
        .expect_max_cost_usd(0.01);

    let encoded = serde_json::to_string(&case).unwrap();

    assert!(encoded.contains(r#""name":"test""#));
    assert!(encoded.contains(r#""max_cost_usd":0.01"#));
}
1907
/// The serialised result exposes the `passed` flag and the scorer name.
#[test]
fn eval_result_serializes_to_json() {
    let keyword_score = ScorerResult {
        scorer: "keyword".into(),
        score: 1.0,
        passed: true,
        details: vec!["ok".into()],
    };
    let result = EvalResult {
        case_name: "a".into(),
        passed: true,
        scores: vec![keyword_score],
        actual_tools: vec!["bash".into()],
        actual_output: "done".into(),
        error: None,
    };

    let encoded = serde_json::to_string(&result).unwrap();

    assert!(encoded.contains(r#""passed":true"#));
    assert!(encoded.contains(r#""scorer":"keyword""#));
}
1927
/// The serialised summary exposes the total count and the average score.
#[test]
fn eval_summary_serializes_to_json() {
    let scorer_averages = vec![("keyword".into(), 0.9)];
    let summary = EvalSummary {
        total: 2,
        passed: 1,
        failed: 1,
        errors: 0,
        avg_score: 0.75,
        scorer_averages,
    };

    let encoded = serde_json::to_string(&summary).unwrap();

    assert!(encoded.contains(r#""total":2"#));
    assert!(encoded.contains(r#""avg_score":0.75"#));
}
1942
/// Budget fields that are `None` do not appear in the serialised case.
#[test]
fn eval_case_omits_none_budget_fields() {
    let encoded = serde_json::to_string(&EvalCase::new("t", "i")).unwrap();

    assert!(!encoded.contains("max_cost_usd"));
    assert!(!encoded.contains("max_latency_ms"));
    assert!(!encoded.contains("max_tool_calls"));
}
1951
1952 fn make_llm_response_event(
1957 model: Option<&str>,
1958 input: u32,
1959 output: u32,
1960 latency: u64,
1961 ) -> AgentEvent {
1962 use crate::llm::types::TokenUsage;
1963 AgentEvent::LlmResponse {
1964 agent: "a".into(),
1965 turn: 1,
1966 usage: TokenUsage {
1967 input_tokens: input,
1968 output_tokens: output,
1969 ..Default::default()
1970 },
1971 stop_reason: crate::llm::types::StopReason::EndTurn,
1972 tool_call_count: 0,
1973 text: String::new(),
1974 latency_ms: latency,
1975 model: model.map(|s| s.to_string()),
1976 time_to_first_token_ms: 0,
1977 }
1978 }
1979
/// A small response against a 1.0 USD budget scores above 0.95, and the
/// first detail line reports the total cost.
#[test]
fn cost_scorer_under_budget() {
    let collector = EvalRunner::event_collector();
    let event = make_llm_response_event(Some("claude-sonnet-4-20250514"), 1000, 500, 100);
    // The guard is a temporary, so the lock is released at the end of this statement.
    collector.lock().unwrap().push(event);

    let scorer = CostScorer::new(collector, 1.0);
    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert!(score > 0.95);
    assert!(details[0].contains("total cost:"));
}
1999
/// Ten million output tokens against a 0.01 USD budget scores exactly 0.0.
#[test]
fn cost_scorer_over_budget() {
    let collector = EvalRunner::event_collector();
    let event = make_llm_response_event(Some("claude-sonnet-4-20250514"), 0, 10_000_000, 100);
    collector.lock().unwrap().push(event);

    let scorer = CostScorer::new(collector, 0.01);
    let (score, _) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 0.0);
}
2018
/// A response from an unrecognised model still scores 1.0, and a detail line
/// mentions "unknown model".
#[test]
fn cost_scorer_unknown_model() {
    let collector = EvalRunner::event_collector();
    let event = make_llm_response_event(Some("unknown-model-xyz"), 1000, 1000, 100);
    collector.lock().unwrap().push(event);

    let scorer = CostScorer::new(collector, 1.0);
    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 1.0);
    assert!(details.iter().any(|line| line.contains("unknown model")));
}
2037
/// A response event without a model name scores 1.0.
#[test]
fn cost_scorer_no_model_field() {
    let collector = EvalRunner::event_collector();
    collector
        .lock()
        .unwrap()
        .push(make_llm_response_event(None, 1000, 1000, 100));

    let scorer = CostScorer::new(collector, 1.0);
    let (score, _) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 1.0);
}
2050
/// The per-case `max_cost_usd` takes precedence over the scorer default: a
/// generous 100.0 default with a 0.0001 case budget scores 0.0.
#[test]
fn cost_scorer_case_override() {
    let collector = EvalRunner::event_collector();
    let event = make_llm_response_event(Some("claude-sonnet-4-20250514"), 100_000, 50_000, 100);
    collector.lock().unwrap().push(event);

    let scorer = CostScorer::new(collector, 100.0);
    let tight_case = EvalCase::new("t", "i").expect_max_cost_usd(0.0001);
    let (score, _) = scorer.score(&tight_case, "", &[]);

    assert_eq!(score, 0.0);
}
2068
/// The cost scorer reports a pass threshold of 0.01.
#[test]
fn cost_scorer_pass_threshold() {
    let threshold = CostScorer::new(EvalRunner::event_collector(), 1.0).pass_threshold();

    assert_eq!(threshold, 0.01);
}
2074
/// A zero cost budget scores 0.0 and the first detail line mentions "zero".
#[test]
fn cost_scorer_zero_budget() {
    let collector = EvalRunner::event_collector();
    let scorer = CostScorer::new(collector, 0.0);

    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 0.0);
    assert!(details[0].contains("zero"));
}
2083
/// Two responses totalling 800ms against a 5000ms budget score 0.84, and the
/// first detail line reports "800ms".
#[test]
fn latency_scorer_under_budget() {
    let collector = EvalRunner::event_collector();
    let mut guard = collector.lock().unwrap();
    guard.push(make_llm_response_event(None, 0, 0, 500));
    guard.push(make_llm_response_event(None, 0, 0, 300));
    // Release the lock before the collector is handed to the scorer.
    drop(guard);

    let scorer = LatencyScorer::new(collector, 5000);
    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert!((score - 0.84).abs() < 0.001);
    assert!(details[0].contains("800ms"));
}
2103
/// A 10_000ms response against a 5000ms budget scores exactly 0.0.
#[test]
fn latency_scorer_over_budget() {
    let collector = EvalRunner::event_collector();
    collector
        .lock()
        .unwrap()
        .push(make_llm_response_event(None, 0, 0, 10_000));

    let scorer = LatencyScorer::new(collector, 5000);
    let (score, _) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 0.0);
}
2116
/// The per-case `max_latency_ms` takes precedence over the scorer default:
/// 500ms against a 1000ms case budget scores 0.5 despite a 10_000ms default.
#[test]
fn latency_scorer_case_override() {
    let collector = EvalRunner::event_collector();
    collector
        .lock()
        .unwrap()
        .push(make_llm_response_event(None, 0, 0, 500));

    let scorer = LatencyScorer::new(collector, 10_000);
    let tight_case = EvalCase::new("t", "i").expect_max_latency_ms(1000);
    let (score, _) = scorer.score(&tight_case, "", &[]);

    assert!((score - 0.5).abs() < 0.001);
}
2130
/// With no recorded events the latency scorer returns a perfect 1.0.
#[test]
fn latency_scorer_no_events() {
    let scorer = LatencyScorer::new(EvalRunner::event_collector(), 5000);

    let (score, _) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 1.0);
}
2139
/// The latency scorer reports a pass threshold of 0.01.
#[test]
fn latency_scorer_pass_threshold() {
    let threshold = LatencyScorer::new(EvalRunner::event_collector(), 5000).pass_threshold();

    assert_eq!(threshold, 0.01);
}
2145
/// A zero latency budget scores 0.0 and the first detail line mentions "zero".
#[test]
fn latency_scorer_zero_budget() {
    let collector = EvalRunner::event_collector();
    let scorer = LatencyScorer::new(collector, 0);

    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 0.0);
    assert!(details[0].contains("zero"));
}
2154
/// Three calls against a budget of ten score 0.7, and the first detail line
/// reports the call count.
#[test]
fn tool_call_count_under_budget() {
    let observed_tools = vec![String::from("a"), String::from("b"), String::from("c")];
    let scorer = ToolCallCountScorer::new(10);

    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &observed_tools);

    assert!((score - 0.7).abs() < 0.001);
    assert!(details[0].contains("tool calls: 3"));
}
2169
/// Three calls against a budget of two score exactly 0.0.
#[test]
fn tool_call_count_over_budget() {
    let observed_tools = vec![String::from("a"), String::from("b"), String::from("c")];
    let scorer = ToolCallCountScorer::new(2);

    let (score, _) = scorer.score(&EvalCase::new("t", "i"), "", &observed_tools);

    assert_eq!(score, 0.0);
}
2178
/// No tool calls at all yields a perfect 1.0.
#[test]
fn tool_call_count_zero_calls() {
    let scorer = ToolCallCountScorer::new(10);

    let (score, _) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 1.0);
}
2186
/// The per-case `max_tool_calls` takes precedence over the scorer default:
/// three calls against a case budget of two score 0.0 despite a default of 100.
#[test]
fn tool_call_count_case_override() {
    let scorer = ToolCallCountScorer::new(100);
    let tight_case = EvalCase::new("t", "i").expect_max_tool_calls(2);
    let observed_tools = vec![String::from("a"), String::from("b"), String::from("c")];

    let (score, _) = scorer.score(&tight_case, "", &observed_tools);

    assert_eq!(score, 0.0);
}
2195
/// The tool-call-count scorer reports a pass threshold of 0.01.
#[test]
fn tool_call_count_pass_threshold() {
    let threshold = ToolCallCountScorer::new(10).pass_threshold();

    assert_eq!(threshold, 0.01);
}
2201
/// A zero tool-call budget scores 0.0 and the first detail line mentions "zero".
#[test]
fn tool_call_count_zero_budget() {
    let scorer = ToolCallCountScorer::new(0);

    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 0.0);
    assert!(details[0].contains("zero"));
}
2210
/// A collector holding only a `RunStarted` event scores 1.0 and reports that
/// there were no guardrail denials.
#[test]
fn safety_scorer_no_denials() {
    let collector = EvalRunner::event_collector();
    let run_started = AgentEvent::RunStarted {
        agent: "a".into(),
        task: "t".into(),
    };
    collector.lock().unwrap().push(run_started);

    let scorer = SafetyScorer::new(collector);
    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 1.0);
    assert!(details[0].contains("no guardrail denials"));
}
2231
/// A single guardrail denial scores 0.0 and its reason appears in the details.
#[test]
fn safety_scorer_with_denial() {
    let collector = EvalRunner::event_collector();
    let denial = AgentEvent::GuardrailDenied {
        agent: "a".into(),
        hook: "post_llm".into(),
        reason: "unsafe content".into(),
        tool_name: None,
    };
    collector.lock().unwrap().push(denial);

    let scorer = SafetyScorer::new(collector);
    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 0.0);
    assert!(details[0].contains("unsafe content"));
}
2250
/// A denial that names a tool scores 0.0 and the detail line includes the
/// "(tool: bash)" suffix.
#[test]
fn safety_scorer_tool_denial() {
    let collector = EvalRunner::event_collector();
    let denial = AgentEvent::GuardrailDenied {
        agent: "a".into(),
        hook: "pre_tool".into(),
        reason: "blocked".into(),
        tool_name: Some("bash".into()),
    };
    collector.lock().unwrap().push(denial);

    let scorer = SafetyScorer::new(collector);
    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 0.0);
    assert!(details[0].contains("(tool: bash)"));
}
2269
/// Two denials score 0.0 and produce one detail line each.
#[test]
fn safety_scorer_multiple_denials() {
    let collector = EvalRunner::event_collector();
    let llm_denial = AgentEvent::GuardrailDenied {
        agent: "a".into(),
        hook: "post_llm".into(),
        reason: "reason1".into(),
        tool_name: None,
    };
    let tool_denial = AgentEvent::GuardrailDenied {
        agent: "a".into(),
        hook: "pre_tool".into(),
        reason: "reason2".into(),
        tool_name: Some("bash".into()),
    };
    let mut guard = collector.lock().unwrap();
    guard.push(llm_denial);
    guard.push(tool_denial);
    // Release the lock before the collector is handed to the scorer.
    drop(guard);

    let scorer = SafetyScorer::new(collector);
    let (score, details) = scorer.score(&EvalCase::new("t", "i"), "", &[]);

    assert_eq!(score, 0.0);
    assert_eq!(details.len(), 2);
}
2294
/// The safety scorer reports a pass threshold of 1.0.
#[test]
fn safety_scorer_pass_threshold() {
    let threshold = SafetyScorer::new(EvalRunner::event_collector()).pass_threshold();

    assert_eq!(threshold, 1.0);
}
2300
2301 fn make_eval_result(name: &str, scores: Vec<(&str, f64)>) -> EvalResult {
2306 EvalResult {
2307 case_name: name.into(),
2308 passed: true,
2309 scores: scores
2310 .into_iter()
2311 .map(|(scorer, score)| ScorerResult {
2312 scorer: scorer.into(),
2313 score,
2314 passed: score >= 0.5,
2315 details: vec![],
2316 })
2317 .collect(),
2318 actual_tools: vec![],
2319 actual_output: String::new(),
2320 error: None,
2321 }
2322 }
2323
/// When the candidate scores higher on every case there are no regressions
/// and the candidate wins both cases.
#[test]
fn comparison_no_regressions() {
    let keyword_run = |score_a: f64, score_b: f64| {
        vec![
            make_eval_result("a", vec![("keyword", score_a)]),
            make_eval_result("b", vec![("keyword", score_b)]),
        ]
    };
    let baseline = keyword_run(0.8, 0.6);
    let candidate = keyword_run(0.9, 0.7);

    let cmp = EvalComparison::compare(&baseline, &candidate);

    assert!(!cmp.has_regressions());
    assert_eq!(cmp.candidate_wins(), 2);
    assert_eq!(cmp.baseline_wins(), 0);
    assert_eq!(cmp.ties(), 0);
    assert_eq!(cmp.cases.len(), 2);
}
2341
/// A drop from 0.9 to 0.5 is reported as a regression with a delta of -0.4.
#[test]
fn comparison_with_regression() {
    let baseline = vec![make_eval_result("a", vec![("keyword", 0.9)])];
    let candidate = vec![make_eval_result("a", vec![("keyword", 0.5)])];

    let cmp = EvalComparison::compare(&baseline, &candidate);

    assert!(cmp.has_regressions());
    assert_eq!(cmp.regressions(), vec!["a"]);
    assert_eq!(cmp.baseline_wins(), 1);
    assert_eq!(cmp.candidate_wins(), 0);

    let only_case = &cmp.cases[0];
    assert!(only_case.regressed);
    assert!((only_case.delta - (-0.4)).abs() < 0.001);
}
2354
/// Identical scores on both sides count as a tie, not a regression.
#[test]
fn comparison_ties() {
    let same_run = || vec![make_eval_result("a", vec![("keyword", 0.8)])];
    let (baseline, candidate) = (same_run(), same_run());

    let cmp = EvalComparison::compare(&baseline, &candidate);

    assert!(!cmp.has_regressions());
    assert_eq!(cmp.ties(), 1);
}
2363
/// Cases whose names appear on only one side are left out of the comparison.
#[test]
fn comparison_skips_unmatched_cases() {
    let baseline = vec![make_eval_result("a", vec![("keyword", 0.8)])];
    let candidate = vec![make_eval_result("b", vec![("keyword", 0.9)])];

    let cmp = EvalComparison::compare(&baseline, &candidate);

    assert!(cmp.cases.is_empty());
}
2371
/// One improved case, one regressed case and one unchanged case produce one
/// candidate win, one baseline win and one tie, with only "b" regressed.
#[test]
fn comparison_mixed_results() {
    let baseline = vec![
        make_eval_result("a", vec![("k", 0.8), ("t", 0.6)]),
        make_eval_result("b", vec![("k", 0.5), ("t", 0.9)]),
        make_eval_result("c", vec![("k", 1.0)]),
    ];
    let candidate = vec![
        // "a": both scorers improve.
        make_eval_result("a", vec![("k", 0.9), ("t", 0.8)]),
        // "b": both scorers get worse.
        make_eval_result("b", vec![("k", 0.3), ("t", 0.5)]),
        // "c": unchanged.
        make_eval_result("c", vec![("k", 1.0)]),
    ];

    let cmp = EvalComparison::compare(&baseline, &candidate);

    assert_eq!(cmp.candidate_wins(), 1);
    assert_eq!(cmp.baseline_wins(), 1);
    assert_eq!(cmp.ties(), 1);
    assert_eq!(cmp.regressions(), vec!["b"]);
}
2390
/// The `Display` output flags a regressed case and lists it under "Regressions".
#[test]
fn comparison_display() {
    let baseline = vec![make_eval_result("a", vec![("k", 0.8)])];
    let candidate = vec![make_eval_result("a", vec![("k", 0.6)])];

    let rendered = EvalComparison::compare(&baseline, &candidate).to_string();

    assert!(rendered.contains("REGRESSED"));
    assert!(rendered.contains("Regressions: a"));
}
2400
/// The serialised comparison contains the case name and its `regressed` flag.
#[test]
fn comparison_serializes_to_json() {
    let baseline = vec![make_eval_result("a", vec![("k", 0.8)])];
    let candidate = vec![make_eval_result("a", vec![("k", 0.9)])];
    let cmp = EvalComparison::compare(&baseline, &candidate);

    let encoded = serde_json::to_string(&cmp).unwrap();

    assert!(encoded.contains(r#""case_name":"a""#));
    assert!(encoded.contains(r#""regressed":false"#));
    assert_eq!(cmp.candidate_wins(), 1);
}
2411
/// Comparing two empty result sets yields no cases and no regressions.
#[test]
fn comparison_empty_inputs() {
    let empty = EvalComparison::compare(&[], &[]);

    assert!(empty.cases.is_empty());
    assert!(!empty.has_regressions());
}
2418
/// The average of no scores is 0.0.
#[test]
fn avg_score_empty() {
    let average = avg_score(&[]);

    assert_eq!(average, 0.0);
}
2427
/// The average of a single score is that score.
#[test]
fn avg_score_single() {
    let only = ScorerResult {
        scorer: "k".into(),
        score: 0.7,
        passed: true,
        details: vec![],
    };
    let scores = vec![only];

    let average = avg_score(&scores);

    assert!((average - 0.7).abs() < 0.001);
}
2438
/// The average of 0.6 and 0.8 is 0.7.
#[test]
fn avg_score_multiple() {
    let passing = |scorer: &'static str, score: f64| ScorerResult {
        scorer: scorer.into(),
        score,
        passed: true,
        details: vec![],
    };
    let scores = vec![passing("k", 0.6), passing("t", 0.8)];

    let average = avg_score(&scores);

    assert!((average - 0.7).abs() < 0.001);
}
2457
/// Through the runner, two calls against a budget of five pass with a score of 0.6.
#[test]
fn runner_with_tool_call_count_scorer() {
    let case = EvalCase::new("t", "i");
    let observed_tools = vec![String::from("a"), String::from("b")];
    let runner = EvalRunner::new().scorer(ToolCallCountScorer::new(5));

    let outcome = runner.score_result(&case, "output", &observed_tools, None);

    assert!(outcome.passed);
    assert!((outcome.scores[0].score - 0.6).abs() < 0.001);
}
2472
/// Through the runner, a safety scorer over an empty collector passes with 1.0.
#[test]
fn runner_with_safety_scorer() {
    let collector = EvalRunner::event_collector();
    let safety = SafetyScorer::new(Arc::clone(&collector));
    let runner = EvalRunner::new().scorer(safety);
    let case = EvalCase::new("t", "i");

    let outcome = runner.score_result(&case, "output", &[], None);

    assert!(outcome.passed);
    assert_eq!(outcome.scores[0].score, 1.0);
}
2482
/// `clear_events` empties a collector that previously held two events.
#[test]
fn clear_events_resets_collector() {
    let collector = EvalRunner::event_collector();
    let denial = AgentEvent::GuardrailDenied {
        agent: "a".into(),
        hook: "post_llm".into(),
        reason: "bad".into(),
        tool_name: None,
    };
    let mut guard = collector.lock().unwrap();
    guard.push(make_llm_response_event(None, 0, 0, 1000));
    guard.push(denial);
    // Release the lock so the collector can be locked again below.
    drop(guard);

    assert_eq!(collector.lock().unwrap().len(), 2);

    clear_events(&collector);

    assert!(collector.lock().unwrap().is_empty());
}
2504
/// Clearing the collector between two scoring passes keeps the second pass
/// from including the first pass's latency: 500ms -> 0.5, then 300ms -> 0.7
/// against a 1000ms budget.
#[test]
fn clear_events_fixes_accumulation_between_cases() {
    let collector = EvalRunner::event_collector();
    let scorer = LatencyScorer::new(Arc::clone(&collector), 1000);
    let case = EvalCase::new("t", "i");
    // Records one LLM response with the given latency in the shared collector.
    let record_latency = |latency_ms: u64| {
        collector
            .lock()
            .unwrap()
            .push(make_llm_response_event(None, 0, 0, latency_ms));
    };

    record_latency(500);
    let (first_score, _) = scorer.score(&case, "", &[]);
    assert!((first_score - 0.5).abs() < 0.001);

    clear_events(&collector);

    record_latency(300);
    let (second_score, _) = scorer.score(&case, "", &[]);
    assert!((second_score - 0.7).abs() < 0.001);
}
2533
/// A drop of 0.0005 (0.8005 -> 0.8) is treated as a tie rather than a regression.
#[test]
fn comparison_tiny_delta_is_tie() {
    let baseline = vec![make_eval_result("a", vec![("k", 0.8005)])];
    let candidate = vec![make_eval_result("a", vec![("k", 0.8)])];

    let cmp = EvalComparison::compare(&baseline, &candidate);

    assert!(!cmp.has_regressions());
    assert_eq!(cmp.ties(), 1);
}
2547
/// A drop of 0.01 (0.81 -> 0.8) is large enough to be reported as a regression.
#[test]
fn comparison_significant_delta_is_regression() {
    let baseline = vec![make_eval_result("a", vec![("k", 0.81)])];
    let candidate = vec![make_eval_result("a", vec![("k", 0.8)])];

    let cmp = EvalComparison::compare(&baseline, &candidate);

    assert!(cmp.has_regressions());
    assert_eq!(cmp.regressions(), vec!["a"]);
}
2557
/// A fully populated case survives a JSON serialise/deserialise round trip
/// with every field intact.
#[test]
fn eval_case_serde_round_trip() {
    let original = EvalCase::new("greeting", "Say hello")
        .expect_tool("bash")
        .expect_tool_at("read_file", 1)
        .expect_output_contains("hello")
        .expect_output_not_contains("goodbye")
        .reference_output("Hello there!")
        .expect_max_cost_usd(0.05)
        .expect_max_latency_ms(5000)
        .expect_max_tool_calls(10);

    let encoded = serde_json::to_string(&original).expect("serialize EvalCase");
    let decoded: EvalCase = serde_json::from_str(&encoded).expect("deserialize EvalCase");

    assert_eq!(decoded.name, "greeting");
    assert_eq!(decoded.input, "Say hello");

    // Tool expectations keep their order and their optional position.
    let expected_tools = decoded.expected_tools.as_ref().unwrap();
    assert_eq!(expected_tools.len(), 2);
    assert_eq!(expected_tools[0].name, "bash");
    assert!(expected_tools[0].order.is_none());
    assert_eq!(expected_tools[1].order, Some(1));

    assert_eq!(decoded.output_contains, vec!["hello"]);
    assert_eq!(decoded.output_not_contains, vec!["goodbye"]);
    assert_eq!(decoded.reference_output.as_deref(), Some("Hello there!"));
    assert_eq!(decoded.max_cost_usd, Some(0.05));
    assert_eq!(decoded.max_latency_ms, Some(5000));
    assert_eq!(decoded.max_tool_calls, Some(10));
}
2590
/// JSON with only `name` and `input` deserialises, leaving the optional
/// expectations unset or empty.
#[test]
fn eval_case_deserialize_minimal() {
    let minimal_json = r#"{"name":"simple","input":"do it"}"#;

    let decoded: EvalCase = serde_json::from_str(minimal_json).expect("deserialize minimal");

    assert_eq!(decoded.name, "simple");
    assert_eq!(decoded.input, "do it");
    assert!(decoded.expected_tools.is_none());
    assert!(decoded.output_contains.is_empty());
}
2600
/// A result survives a JSON serialise/deserialise round trip with its scores,
/// tools, output and absent error intact.
#[test]
fn eval_result_serde_round_trip() {
    let keyword_score = ScorerResult {
        scorer: "keyword".into(),
        score: 0.85,
        passed: true,
        details: vec!["OK: found hello".into()],
    };
    let original = EvalResult {
        case_name: "test-case".into(),
        passed: true,
        scores: vec![keyword_score],
        actual_tools: vec!["bash".into(), "read".into()],
        actual_output: "Hello world".into(),
        error: None,
    };

    let encoded = serde_json::to_string(&original).expect("serialize EvalResult");
    let decoded: EvalResult = serde_json::from_str(&encoded).expect("deserialize EvalResult");

    assert_eq!(decoded.case_name, "test-case");
    assert!(decoded.passed);
    assert_eq!(decoded.scores.len(), 1);

    let first_score = &decoded.scores[0];
    assert_eq!(first_score.scorer, "keyword");
    assert!((first_score.score - 0.85).abs() < f64::EPSILON);

    assert_eq!(decoded.actual_tools, vec!["bash", "read"]);
    assert_eq!(decoded.actual_output, "Hello world");
    assert!(decoded.error.is_none());
}
2629
/// A summary survives a JSON round trip with its counts and per-scorer
/// averages intact.
#[test]
fn eval_summary_serde_round_trip() {
    let scorer_averages = vec![("keyword".into(), 0.95), ("trajectory".into(), 0.85)];
    let original = EvalSummary {
        total: 10,
        passed: 8,
        failed: 1,
        errors: 1,
        avg_score: 0.9,
        scorer_averages,
    };

    let encoded = serde_json::to_string(&original).expect("serialize");
    let decoded: EvalSummary = serde_json::from_str(&encoded).expect("deserialize");

    assert_eq!(decoded.total, 10);
    assert_eq!(decoded.passed, 8);
    assert_eq!(decoded.scorer_averages.len(), 2);
}
2646
/// A comparison survives a JSON round trip with its single case, its
/// `regressed` flag and its delta intact.
#[test]
fn eval_comparison_serde_round_trip() {
    let only_case = CaseComparison {
        case_name: "test".into(),
        baseline_avg_score: 0.8,
        candidate_avg_score: 0.9,
        delta: 0.1,
        regressed: false,
    };
    let original = EvalComparison {
        cases: vec![only_case],
    };

    let encoded = serde_json::to_string(&original).expect("serialize");
    let decoded: EvalComparison = serde_json::from_str(&encoded).expect("deserialize");

    assert_eq!(decoded.cases.len(), 1);
    let first = &decoded.cases[0];
    assert!(!first.regressed);
    assert!((first.delta - 0.1).abs() < f64::EPSILON);
}
2664}