Skip to main content

heartbit_core/eval/
mod.rs

1//! Agent evaluation framework.
2//!
3//! Provides tools for measuring agent quality through repeatable test cases.
4//! Inspired by Google ADK's built-in eval: tool trajectory comparison,
5//! response quality scoring, and composable scorers.
6//!
7//! # Quick start
8//!
9//! ```rust,ignore
//! use heartbit::eval::{EvalCase, EvalRunner, EvalSummary, KeywordScorer, TrajectoryScorer};
11//!
12//! let cases = vec![
13//!     EvalCase::new("greeting", "Say hello")
14//!         .expect_output_contains("hello")
15//!         .expect_no_tools(),
16//!     EvalCase::new("file-read", "Read /tmp/test.txt")
17//!         .expect_tool("read_file")
18//!         .expect_output_contains("content"),
19//! ];
20//!
21//! let runner = EvalRunner::new()
22//!     .scorer(TrajectoryScorer)
23//!     .scorer(KeywordScorer);
24//!
25//! let results = runner.run(&agent, &cases).await;
26//! let summary = EvalSummary::from_results(&results);
27//! println!("{summary}");
28//! ```
29
/// Names of all scorers known to the eval framework.
///
/// Each entry is the identifier a scorer reports through its `name()` method
/// (for example `"trajectory"` or `"keyword"`).
pub const KNOWN_SCORERS: &[&str] = &[
    "trajectory",
    "keyword",
    "similarity",
    "cost",
    "latency",
    "tool_call_count",
    "safety",
];
40
41use std::sync::Arc;
42
43use serde::{Deserialize, Serialize};
44
45use crate::agent::events::AgentEvent;
46use crate::error::Error;
47use crate::llm::pricing::estimate_cost;
48
49// ---------------------------------------------------------------------------
50// Core types
51// ---------------------------------------------------------------------------
52
53/// A single evaluation test case.
54#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct EvalCase {
56    /// Human-readable name for the test case.
57    pub name: String,
58    /// The task input to send to the agent.
59    pub input: String,
60    /// Expected tool calls in order (if `Some`). Empty vec means "expect no tools."
61    #[serde(default, skip_serializing_if = "Option::is_none")]
62    pub expected_tools: Option<Vec<ExpectedToolCall>>,
63    /// Strings that should appear in the agent's output.
64    #[serde(default)]
65    pub output_contains: Vec<String>,
66    /// Strings that must NOT appear in the agent's output.
67    #[serde(default)]
68    pub output_not_contains: Vec<String>,
69    /// Optional reference output for similarity scoring.
70    #[serde(default, skip_serializing_if = "Option::is_none")]
71    pub reference_output: Option<String>,
72    /// Maximum acceptable cost in USD for this case.
73    #[serde(default, skip_serializing_if = "Option::is_none")]
74    pub max_cost_usd: Option<f64>,
75    /// Maximum acceptable total LLM latency in milliseconds.
76    #[serde(default, skip_serializing_if = "Option::is_none")]
77    pub max_latency_ms: Option<u64>,
78    /// Maximum acceptable number of tool calls.
79    #[serde(default, skip_serializing_if = "Option::is_none")]
80    pub max_tool_calls: Option<usize>,
81}
82
83/// An expected tool call in a trajectory.
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct ExpectedToolCall {
86    /// Tool name (exact match).
87    pub name: String,
88    /// If `Some(n)`, the tool must be called at position `n` (0-indexed).
89    /// If `None`, the tool must appear anywhere in the trajectory.
90    pub order: Option<usize>,
91}
92
93impl EvalCase {
94    /// Create a new eval case with a name and input task.
95    pub fn new(name: impl Into<String>, input: impl Into<String>) -> Self {
96        Self {
97            name: name.into(),
98            input: input.into(),
99            expected_tools: None,
100            output_contains: Vec::new(),
101            output_not_contains: Vec::new(),
102            reference_output: None,
103            max_cost_usd: None,
104            max_latency_ms: None,
105            max_tool_calls: None,
106        }
107    }
108
109    /// Expect a specific tool to be called (order-independent).
110    pub fn expect_tool(mut self, name: impl Into<String>) -> Self {
111        self.expected_tools
112            .get_or_insert_with(Vec::new)
113            .push(ExpectedToolCall {
114                name: name.into(),
115                order: None,
116            });
117        self
118    }
119
120    /// Expect a tool at a specific position in the trajectory (0-indexed).
121    pub fn expect_tool_at(mut self, name: impl Into<String>, position: usize) -> Self {
122        self.expected_tools
123            .get_or_insert_with(Vec::new)
124            .push(ExpectedToolCall {
125                name: name.into(),
126                order: Some(position),
127            });
128        self
129    }
130
131    /// Expect no tool calls at all.
132    pub fn expect_no_tools(mut self) -> Self {
133        self.expected_tools = Some(Vec::new());
134        self
135    }
136
137    /// Expect the output to contain a string.
138    pub fn expect_output_contains(mut self, text: impl Into<String>) -> Self {
139        self.output_contains.push(text.into());
140        self
141    }
142
143    /// Expect the output to NOT contain a string.
144    pub fn expect_output_not_contains(mut self, text: impl Into<String>) -> Self {
145        self.output_not_contains.push(text.into());
146        self
147    }
148
149    /// Set a reference output for similarity scoring.
150    pub fn reference_output(mut self, text: impl Into<String>) -> Self {
151        self.reference_output = Some(text.into());
152        self
153    }
154
155    /// Set maximum acceptable cost in USD.
156    pub fn expect_max_cost_usd(mut self, max: f64) -> Self {
157        self.max_cost_usd = Some(max);
158        self
159    }
160
161    /// Set maximum acceptable total LLM latency in milliseconds.
162    pub fn expect_max_latency_ms(mut self, max: u64) -> Self {
163        self.max_latency_ms = Some(max);
164        self
165    }
166
167    /// Set maximum acceptable number of tool calls.
168    pub fn expect_max_tool_calls(mut self, max: usize) -> Self {
169        self.max_tool_calls = Some(max);
170        self
171    }
172}
173
174/// Result of evaluating a single test case.
175#[derive(Debug, Clone, Serialize, Deserialize)]
176pub struct EvalResult {
177    /// Name of the test case.
178    pub case_name: String,
179    /// Whether the case passed (all scorers above threshold).
180    pub passed: bool,
181    /// Per-scorer results.
182    pub scores: Vec<ScorerResult>,
183    /// Actual tool calls made by the agent (in order).
184    pub actual_tools: Vec<String>,
185    /// Actual agent output text.
186    pub actual_output: String,
187    /// Error if the agent failed to execute.
188    pub error: Option<String>,
189}
190
191/// Result from a single scorer.
192#[derive(Debug, Clone, Serialize, Deserialize)]
193pub struct ScorerResult {
194    /// Scorer name.
195    pub scorer: String,
196    /// Score value (0.0 to 1.0).
197    pub score: f64,
198    /// Whether this scorer passed.
199    pub passed: bool,
200    /// Human-readable details.
201    pub details: Vec<String>,
202}
203
204/// Aggregate summary of multiple eval results.
205#[derive(Debug, Clone, Serialize, Deserialize)]
206pub struct EvalSummary {
207    /// Total cases evaluated.
208    pub total: usize,
209    /// Cases that passed all scorers.
210    pub passed: usize,
211    /// Cases that failed at least one scorer.
212    pub failed: usize,
213    /// Cases that errored (agent execution failure).
214    pub errors: usize,
215    /// Average score across all cases and scorers.
216    pub avg_score: f64,
217    /// Per-scorer average scores.
218    pub scorer_averages: Vec<(String, f64)>,
219}
220
221impl EvalSummary {
222    /// Compute summary statistics from eval results.
223    pub fn from_results(results: &[EvalResult]) -> Self {
224        let total = results.len();
225        let passed = results.iter().filter(|r| r.passed).count();
226        let errors = results.iter().filter(|r| r.error.is_some()).count();
227        let failed = total - passed - errors;
228
229        // Collect all scores
230        let mut all_scores: Vec<f64> = Vec::new();
231        let mut scorer_totals: std::collections::HashMap<String, (f64, usize)> =
232            std::collections::HashMap::new();
233
234        for result in results {
235            for sr in &result.scores {
236                all_scores.push(sr.score);
237                let entry = scorer_totals.entry(sr.scorer.clone()).or_insert((0.0, 0));
238                entry.0 += sr.score;
239                entry.1 += 1;
240            }
241        }
242
243        let avg_score = if all_scores.is_empty() {
244            0.0
245        } else {
246            all_scores.iter().sum::<f64>() / all_scores.len() as f64
247        };
248
249        let mut scorer_averages: Vec<(String, f64)> = scorer_totals
250            .into_iter()
251            .map(|(name, (sum, count))| (name, sum / count as f64))
252            .collect();
253        scorer_averages.sort_by(|a, b| a.0.cmp(&b.0));
254
255        Self {
256            total,
257            passed,
258            failed,
259            errors,
260            avg_score,
261            scorer_averages,
262        }
263    }
264
265    /// Overall pass rate as a fraction (0.0 to 1.0).
266    pub fn pass_rate(&self) -> f64 {
267        if self.total == 0 {
268            return 0.0;
269        }
270        self.passed as f64 / self.total as f64
271    }
272}
273
274impl std::fmt::Display for EvalSummary {
275    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
276        writeln!(f, "Eval Summary: {}/{} passed", self.passed, self.total)?;
277        writeln!(f, "  Pass rate: {:.1}%", self.pass_rate() * 100.0)?;
278        writeln!(f, "  Avg score: {:.3}", self.avg_score)?;
279        if self.errors > 0 {
280            writeln!(f, "  Errors: {}", self.errors)?;
281        }
282        for (name, avg) in &self.scorer_averages {
283            writeln!(f, "  {name}: {avg:.3}")?;
284        }
285        Ok(())
286    }
287}
288
289// ---------------------------------------------------------------------------
290// EvalScorer trait
291// ---------------------------------------------------------------------------
292
293/// Pluggable scoring function for evaluation.
294///
295/// Scorers evaluate different aspects of agent behavior:
296/// - Tool trajectory correctness
297/// - Output content quality
298/// - Response similarity to reference
299pub trait EvalScorer: Send + Sync {
300    /// Scorer name for reporting.
301    fn name(&self) -> &str;
302
303    /// Score the agent's execution against the eval case.
304    ///
305    /// Returns a score between 0.0 (worst) and 1.0 (best).
306    /// The `details` vec can include human-readable explanations.
307    fn score(&self, case: &EvalCase, output: &str, tool_calls: &[String]) -> (f64, Vec<String>);
308
309    /// Minimum score to pass (default: 1.0 for binary pass/fail).
310    fn pass_threshold(&self) -> f64 {
311        1.0
312    }
313}
314
315// ---------------------------------------------------------------------------
316// TrajectoryScorer
317// ---------------------------------------------------------------------------
318
319/// Scores tool call trajectory against expected tool calls.
320///
321/// Scoring logic:
322/// - If no expected tools are specified (`None`), score is 1.0 (pass).
323/// - If expected tools is empty vec, score is 1.0 only if no tools were called.
324/// - For ordered expectations: exact position match required.
325/// - For unordered expectations: tool must appear anywhere in trajectory.
326/// - Score = matched expectations / total expectations.
327pub struct TrajectoryScorer;
328
329impl EvalScorer for TrajectoryScorer {
330    fn name(&self) -> &str {
331        "trajectory"
332    }
333
334    fn score(&self, case: &EvalCase, _output: &str, tool_calls: &[String]) -> (f64, Vec<String>) {
335        let expected = match &case.expected_tools {
336            None => return (1.0, vec!["no trajectory expectations".into()]),
337            Some(e) => e,
338        };
339
340        // Expect no tools
341        if expected.is_empty() {
342            return if tool_calls.is_empty() {
343                (1.0, vec!["correctly made no tool calls".into()])
344            } else {
345                (
346                    0.0,
347                    vec![format!(
348                        "expected no tools but got: [{}]",
349                        tool_calls.join(", ")
350                    )],
351                )
352            };
353        }
354
355        let mut matched = 0usize;
356        let mut details = Vec::new();
357
358        for exp in expected {
359            if let Some(pos) = exp.order {
360                // Ordered match: check exact position
361                if tool_calls.get(pos).map(|s| s.as_str()) == Some(&exp.name) {
362                    matched += 1;
363                    details.push(format!("OK: {} at position {pos}", exp.name));
364                } else {
365                    let actual = tool_calls.get(pos).map(|s| s.as_str()).unwrap_or("<none>");
366                    details.push(format!(
367                        "FAIL: expected {} at position {pos}, got {actual}",
368                        exp.name
369                    ));
370                }
371            } else {
372                // Unordered match: check presence
373                if tool_calls.iter().any(|t| t == &exp.name) {
374                    matched += 1;
375                    details.push(format!("OK: {} found in trajectory", exp.name));
376                } else {
377                    details.push(format!(
378                        "FAIL: {} not found in [{}]",
379                        exp.name,
380                        tool_calls.join(", ")
381                    ));
382                }
383            }
384        }
385
386        let score = matched as f64 / expected.len() as f64;
387        (score, details)
388    }
389}
390
391// ---------------------------------------------------------------------------
392// KeywordScorer
393// ---------------------------------------------------------------------------
394
395/// Scores output against expected keyword presence/absence.
396///
397/// Score = (contains_matches + not_contains_matches) / total_expectations.
398/// Case-insensitive matching.
399pub struct KeywordScorer;
400
401impl EvalScorer for KeywordScorer {
402    fn name(&self) -> &str {
403        "keyword"
404    }
405
406    fn score(&self, case: &EvalCase, output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
407        let total = case.output_contains.len() + case.output_not_contains.len();
408        if total == 0 {
409            return (1.0, vec!["no keyword expectations".into()]);
410        }
411
412        let lower_output = output.to_lowercase();
413        let mut matched = 0usize;
414        let mut details = Vec::new();
415
416        for keyword in &case.output_contains {
417            if lower_output.contains(&keyword.to_lowercase()) {
418                matched += 1;
419                details.push(format!("OK: output contains \"{keyword}\""));
420            } else {
421                details.push(format!("FAIL: output missing \"{keyword}\""));
422            }
423        }
424
425        for keyword in &case.output_not_contains {
426            if !lower_output.contains(&keyword.to_lowercase()) {
427                matched += 1;
428                details.push(format!("OK: output does not contain \"{keyword}\""));
429            } else {
430                details.push(format!("FAIL: output contains unwanted \"{keyword}\""));
431            }
432        }
433
434        let score = matched as f64 / total as f64;
435        (score, details)
436    }
437}
438
439// ---------------------------------------------------------------------------
440// SimilarityScorer (Rouge-1 unigram overlap)
441// ---------------------------------------------------------------------------
442
443/// Scores output similarity to a reference using unigram overlap (Rouge-1 F1).
444///
445/// If no `reference_output` is set on the case, returns 1.0 (pass).
446/// Uses word-level tokenization with case-insensitive matching.
447pub struct SimilarityScorer;
448
449impl EvalScorer for SimilarityScorer {
450    fn name(&self) -> &str {
451        "similarity"
452    }
453
454    fn score(&self, case: &EvalCase, output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
455        let reference = match &case.reference_output {
456            None => return (1.0, vec!["no reference output".into()]),
457            Some(r) => r,
458        };
459
460        let score = rouge1_f1(output, reference);
461        let details = vec![format!("Rouge-1 F1: {score:.3}")];
462        (score, details)
463    }
464
465    fn pass_threshold(&self) -> f64 {
466        0.3 // Lenient by default
467    }
468}
469
/// Rouge-1 style F1 between `candidate` and `reference`.
///
/// Both texts are split on whitespace and lowercased, then reduced to their
/// sets of distinct tokens (repeated words count once). Precision is the
/// share of candidate tokens found in the reference, recall the share of
/// reference tokens found in the candidate; the result is their harmonic
/// mean. Returns 0.0 when either text has no tokens or nothing overlaps.
fn rouge1_f1(candidate: &str, reference: &str) -> f64 {
    use std::collections::HashSet;

    let distinct_tokens = |text: &str| -> HashSet<String> {
        text.split_whitespace().map(str::to_lowercase).collect()
    };

    let cand = distinct_tokens(candidate);
    let refr = distinct_tokens(reference);
    if cand.is_empty() || refr.is_empty() {
        return 0.0;
    }

    let shared = cand.iter().filter(|token| refr.contains(*token)).count() as f64;
    let precision = shared / cand.len() as f64;
    let recall = shared / refr.len() as f64;

    let denominator = precision + recall;
    if denominator == 0.0 {
        return 0.0;
    }
    2.0 * precision * recall / denominator
}
499
500// ---------------------------------------------------------------------------
501// EvalRunner
502// ---------------------------------------------------------------------------
503
504/// Collects tool call names from `AgentEvent::ToolCallStarted` events.
505fn collect_tool_calls(events: &[AgentEvent]) -> Vec<String> {
506    events
507        .iter()
508        .filter_map(|e| match e {
509            AgentEvent::ToolCallStarted { tool_name, .. } => Some(tool_name.clone()),
510            _ => None,
511        })
512        .collect()
513}
514
515/// Runs evaluation cases against an agent and collects scored results.
516///
517/// The runner wires an `OnEvent` callback to capture the tool call trajectory,
518/// then scores each case using the configured scorers.
519pub struct EvalRunner {
520    scorers: Vec<Box<dyn EvalScorer>>,
521    /// When set, [`EvalRunner::run`] clears this collector before each case
522    /// so event-aware scorers (`CostScorer`, `LatencyScorer`, `SafetyScorer`)
523    /// see only the events from the current case.
524    event_collector: Option<EventCollector>,
525}
526
527impl std::fmt::Debug for EvalRunner {
528    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
529        f.debug_struct("EvalRunner")
530            .field(
531                "scorers",
532                &self.scorers.iter().map(|s| s.name()).collect::<Vec<_>>(),
533            )
534            .field("event_collector", &self.event_collector.is_some())
535            .finish()
536    }
537}
538
539impl Default for EvalRunner {
540    fn default() -> Self {
541        Self::new()
542    }
543}
544
545impl EvalRunner {
546    /// Create a new eval runner with no scorers.
547    ///
548    /// Add scorers with [`scorer`](Self::scorer); each runs against every
549    /// case's actual output to produce a [`ScorerResult`].
550    ///
551    /// # Example
552    ///
553    /// ```rust
554    /// use heartbit_core::eval::{EvalCase, EvalRunner, KeywordScorer};
555    ///
556    /// let runner = EvalRunner::new().scorer(KeywordScorer);
557    /// let case = EvalCase::new("capital", "What is the capital of France?")
558    ///     .expect_output_contains("Paris");
559    /// // No real LLM call here — score the "actual output" directly.
560    /// let result = runner.score_result(&case, "The capital of France is Paris.", &[], None);
561    /// assert!(result.passed);
562    /// ```
563    pub fn new() -> Self {
564        Self {
565            scorers: Vec::new(),
566            event_collector: None,
567        }
568    }
569
570    /// Add a scorer to the runner.
571    pub fn scorer(mut self, scorer: impl EvalScorer + 'static) -> Self {
572        self.scorers.push(Box::new(scorer));
573        self
574    }
575
576    /// Attach an event collector that [`EvalRunner::run`] will clear before
577    /// each case. This is required when running 2+ cases with event-aware
578    /// scorers ([`CostScorer`], [`LatencyScorer`], [`SafetyScorer`]) against
579    /// the same collector — without it, events accumulate across cases and
580    /// make per-case budgets incorrect from the second case onward.
581    ///
582    /// Pass the same collector you wired into the agent via
583    /// [`EvalRunner::event_callback`] / [`build_eval_agent`].
584    pub fn with_event_collector(mut self, collector: EventCollector) -> Self {
585        self.event_collector = Some(collector);
586        self
587    }
588
589    /// Run all eval cases against an agent, returning results.
590    ///
591    /// Each case runs the agent independently (fresh execution per case). When
592    /// an event collector is attached via [`EvalRunner::with_event_collector`],
593    /// it is cleared before each case so event-aware scorers see only the
594    /// events generated by that case.
595    ///
596    /// **Limitation:** This method cannot capture tool call trajectory data
597    /// because the agent's `OnEvent` callback is set at build time. For
598    /// trajectory scoring, build the agent with [`build_eval_agent`] and use
599    /// [`score_result`](EvalRunner::score_result) with the collected events.
600    pub async fn run<P: crate::llm::LlmProvider>(
601        &self,
602        agent: &crate::agent::AgentRunner<P>,
603        cases: &[EvalCase],
604    ) -> Vec<EvalResult> {
605        let mut results = Vec::with_capacity(cases.len());
606        for case in cases {
607            if let Some(collector) = self.event_collector.as_ref() {
608                clear_events(collector);
609            }
610            results.push(self.run_case(agent, case).await);
611        }
612        results
613    }
614
615    /// Run a single eval case.
616    ///
617    /// **Note:** Tool trajectory data is NOT captured here because the agent's
618    /// `OnEvent` callback cannot be changed after construction. Trajectory
619    /// scoring will vacuously pass (no expectations) or fail (expectations
620    /// present but no tools observed). For trajectory scoring, use
621    /// [`build_eval_agent`] + [`EvalRunner::score_result`] instead.
622    async fn run_case<P: crate::llm::LlmProvider>(
623        &self,
624        agent: &crate::agent::AgentRunner<P>,
625        case: &EvalCase,
626    ) -> EvalResult {
627        match agent.execute(&case.input).await {
628            Ok(output) => {
629                // Tool call names are not available from AgentOutput (only count).
630                // Pass empty trajectory — keyword/similarity scoring still works.
631                self.score_result(case, &output.result, &[], None)
632            }
633            Err(e) => EvalResult {
634                case_name: case.name.clone(),
635                passed: false,
636                scores: Vec::new(),
637                actual_tools: Vec::new(),
638                actual_output: String::new(),
639                error: Some(e.to_string()),
640            },
641        }
642    }
643
644    /// Score a case result with pre-collected tool calls.
645    ///
646    /// Use this when you have tool call data from an external source
647    /// (e.g., `OnEvent` callback, audit trail, or manual testing).
648    pub fn score_result(
649        &self,
650        case: &EvalCase,
651        output: &str,
652        tool_calls: &[String],
653        error: Option<String>,
654    ) -> EvalResult {
655        let scores: Vec<ScorerResult> = self
656            .scorers
657            .iter()
658            .map(|scorer| {
659                let (score, details) = scorer.score(case, output, tool_calls);
660                let passed = score >= scorer.pass_threshold();
661                ScorerResult {
662                    scorer: scorer.name().to_string(),
663                    score,
664                    passed,
665                    details,
666                }
667            })
668            .collect();
669
670        let passed = error.is_none() && scores.iter().all(|s| s.passed);
671
672        EvalResult {
673            case_name: case.name.clone(),
674            passed,
675            scores,
676            actual_tools: tool_calls.to_vec(),
677            actual_output: output.to_string(),
678            error,
679        }
680    }
681
682    /// Create an event collector callback for capturing tool call trajectory.
683    ///
684    /// Wire this into `AgentRunnerBuilder::on_event()` before building the agent.
685    /// After execution, call `collected_tool_calls()` on the returned vec.
686    pub fn event_collector() -> EventCollector {
687        Arc::new(std::sync::Mutex::new(Vec::new()))
688    }
689
690    /// Build an `OnEvent` callback that pushes events into the collector.
691    pub fn event_callback(collector: &EventCollector) -> Arc<dyn Fn(AgentEvent) + Send + Sync> {
692        let collector = Arc::clone(collector);
693        Arc::new(move |event| {
694            collector.lock().expect("eval collector lock").push(event);
695        })
696    }
697
698    /// Extract tool call names from a collected event vec.
699    pub fn collected_tool_calls(collector: &EventCollector) -> Vec<String> {
700        let events = collector.lock().expect("eval collector lock");
701        collect_tool_calls(&events)
702    }
703}
704
705/// Shared event collector for eval tool call trajectory capture.
706pub type EventCollector = Arc<std::sync::Mutex<Vec<AgentEvent>>>;
707
708/// Clear all events from a collector.
709///
710/// Call this between eval cases when reusing a single collector across
711/// multiple agent executions. Event-aware scorers (`CostScorer`,
712/// `LatencyScorer`, `SafetyScorer`) read accumulated events, so stale
713/// events from previous cases will corrupt scores if not cleared.
714pub fn clear_events(collector: &EventCollector) {
715    collector.lock().expect("clear_events lock").clear();
716}
717
718/// Build an eval-ready agent with event collection.
719///
720/// Returns `(agent, collector)`. After `agent.execute()`, use
721/// `EvalRunner::collected_tool_calls(&collector)` to get the trajectory.
722///
723/// This is a convenience helper — you can also wire the event callback
724/// manually via `AgentRunnerBuilder::on_event()`.
725pub fn build_eval_agent<P: crate::llm::LlmProvider>(
726    builder: crate::agent::AgentRunnerBuilder<P>,
727) -> Result<(crate::agent::AgentRunner<P>, EventCollector), Error> {
728    let collector = EvalRunner::event_collector();
729    let callback = EvalRunner::event_callback(&collector);
730    let agent = builder.on_event(callback).build()?;
731    Ok((agent, collector))
732}
733
734// ---------------------------------------------------------------------------
735// CostScorer
736// ---------------------------------------------------------------------------
737
738/// Scores agent execution against a cost budget.
739///
740/// Reads `LlmResponse` events from the event collector, estimates cost using
741/// `estimate_cost(model, usage)`, and scores against the budget.
742/// Unknown models contribute $0 (documented, not penalized).
743///
744/// **Important:** The collector accumulates events across cases. When using
745/// [`EvalRunner::run`] with multiple cases, attach the same collector via
746/// [`EvalRunner::with_event_collector`] so per-case events are isolated.
747/// When calling [`EvalRunner::score_result`] manually, call [`clear_events`]
748/// yourself between cases.
749pub struct CostScorer {
750    collector: EventCollector,
751    max_cost_usd: f64,
752}
753
754impl CostScorer {
755    /// Create a cost scorer with a default max cost budget.
756    ///
757    /// The case's `max_cost_usd` overrides this default when set.
758    pub fn new(collector: EventCollector, max_cost_usd: f64) -> Self {
759        Self {
760            collector,
761            max_cost_usd,
762        }
763    }
764}
765
766impl EvalScorer for CostScorer {
767    fn name(&self) -> &str {
768        "cost"
769    }
770
771    fn score(&self, case: &EvalCase, _output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
772        let max = case.max_cost_usd.unwrap_or(self.max_cost_usd);
773        if max <= 0.0 {
774            return (0.0, vec!["max cost budget is zero".into()]);
775        }
776        let events = self.collector.lock().expect("cost collector lock");
777        let mut total_cost = 0.0f64;
778        let mut details = Vec::new();
779
780        for event in events.iter() {
781            if let AgentEvent::LlmResponse { usage, model, .. } = event {
782                let model_name = model.as_deref().unwrap_or("unknown");
783                match estimate_cost(model_name, usage) {
784                    Some(cost) => total_cost += cost,
785                    None => {
786                        details.push(format!("unknown model \"{model_name}\": $0 contributed"));
787                    }
788                }
789            }
790        }
791
792        details.insert(0, format!("total cost: ${total_cost:.6} (max: ${max:.6})"));
793        (budget_score(total_cost, max), details)
794    }
795
796    fn pass_threshold(&self) -> f64 {
797        0.01
798    }
799}
800
801// ---------------------------------------------------------------------------
802// LatencyScorer
803// ---------------------------------------------------------------------------
804
805/// Scores agent execution against a latency budget.
806///
807/// Sums `latency_ms` from `LlmResponse` events and scores against the budget.
808///
809/// **Important:** The collector accumulates events across cases. When using
810/// [`EvalRunner::run`] with multiple cases, attach the same collector via
811/// [`EvalRunner::with_event_collector`] so per-case events are isolated.
812/// When calling [`EvalRunner::score_result`] manually, call [`clear_events`]
813/// yourself between cases.
814pub struct LatencyScorer {
815    collector: EventCollector,
816    max_latency_ms: u64,
817}
818
819impl LatencyScorer {
820    /// Create a latency scorer with a default max latency in milliseconds.
821    ///
822    /// The case's `max_latency_ms` overrides this default when set.
823    pub fn new(collector: EventCollector, max_latency_ms: u64) -> Self {
824        Self {
825            collector,
826            max_latency_ms,
827        }
828    }
829}
830
831impl EvalScorer for LatencyScorer {
832    fn name(&self) -> &str {
833        "latency"
834    }
835
836    fn score(&self, case: &EvalCase, _output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
837        let max = case.max_latency_ms.unwrap_or(self.max_latency_ms);
838        if max == 0 {
839            return (0.0, vec!["max latency budget is zero".into()]);
840        }
841        let events = self.collector.lock().expect("latency collector lock");
842        let total_ms: u64 = events
843            .iter()
844            .filter_map(|e| match e {
845                AgentEvent::LlmResponse { latency_ms, .. } => Some(latency_ms),
846                _ => None,
847            })
848            .sum();
849
850        let details = vec![format!("total latency: {total_ms}ms (max: {max}ms)")];
851        (budget_score(total_ms as f64, max as f64), details)
852    }
853
854    fn pass_threshold(&self) -> f64 {
855        0.01
856    }
857}
858
859// ---------------------------------------------------------------------------
860// ToolCallCountScorer
861// ---------------------------------------------------------------------------
862
/// Budget-based scorer for the number of tool calls an agent made.
///
/// The count is the length of the `tool_calls` slice handed to the scorer,
/// so no `EventCollector` is needed.
pub struct ToolCallCountScorer {
    /// Default call budget, used when the case does not set one.
    max_calls: usize,
}
870
871impl ToolCallCountScorer {
872    /// Create a tool call count scorer with a default maximum.
873    ///
874    /// The case's `max_tool_calls` overrides this default when set.
875    pub fn new(max_calls: usize) -> Self {
876        Self { max_calls }
877    }
878}
879
880impl EvalScorer for ToolCallCountScorer {
881    fn name(&self) -> &str {
882        "tool_call_count"
883    }
884
885    fn score(&self, case: &EvalCase, _output: &str, tool_calls: &[String]) -> (f64, Vec<String>) {
886        let max = case.max_tool_calls.unwrap_or(self.max_calls);
887        if max == 0 {
888            return (0.0, vec!["max tool call budget is zero".into()]);
889        }
890        let count = tool_calls.len();
891        let details = vec![format!("tool calls: {count} (max: {max})")];
892        (budget_score(count as f64, max as f64), details)
893    }
894
895    fn pass_threshold(&self) -> f64 {
896        0.01
897    }
898}
899
900// ---------------------------------------------------------------------------
901// SafetyScorer
902// ---------------------------------------------------------------------------
903
904/// Scores agent execution for guardrail safety.
905///
906/// Checks for `GuardrailDenied` events in the event collector.
907/// Score is 0.0 if any denial occurred, 1.0 otherwise.
908/// Warnings pass (only denials fail).
909///
910/// **Important:** The collector accumulates events across cases. When using
911/// [`EvalRunner::run`] with multiple cases, attach the same collector via
912/// [`EvalRunner::with_event_collector`] so per-case events are isolated.
913/// When calling [`EvalRunner::score_result`] manually, call [`clear_events`]
914/// yourself between cases.
915pub struct SafetyScorer {
916    collector: EventCollector,
917}
918
919impl SafetyScorer {
920    /// Create a safety scorer that reads from the given event collector.
921    pub fn new(collector: EventCollector) -> Self {
922        Self { collector }
923    }
924}
925
926impl EvalScorer for SafetyScorer {
927    fn name(&self) -> &str {
928        "safety"
929    }
930
931    fn score(&self, _case: &EvalCase, _output: &str, _tool_calls: &[String]) -> (f64, Vec<String>) {
932        let events = self.collector.lock().expect("safety collector lock");
933        let mut denials = Vec::new();
934
935        for event in events.iter() {
936            if let AgentEvent::GuardrailDenied {
937                hook,
938                reason,
939                tool_name,
940                ..
941            } = event
942            {
943                let tool_info = tool_name
944                    .as_deref()
945                    .map(|t| format!(" (tool: {t})"))
946                    .unwrap_or_default();
947                denials.push(format!("denied at {hook}{tool_info}: {reason}"));
948            }
949        }
950
951        if denials.is_empty() {
952            (1.0, vec!["no guardrail denials".into()])
953        } else {
954            (0.0, denials)
955        }
956    }
957
958    fn pass_threshold(&self) -> f64 {
959        1.0
960    }
961}
962
963// ---------------------------------------------------------------------------
964// EvalComparison (A/B with regression detection)
965// ---------------------------------------------------------------------------
966
/// Dead band for score comparison: a delta whose magnitude does not exceed
/// this value is treated as a tie rather than a win or a regression.
const REGRESSION_TOLERANCE: f64 = 1e-3;
969
970/// Comparison of two eval runs for A/B testing and regression detection.
971#[derive(Debug, Clone, Serialize, Deserialize)]
972pub struct EvalComparison {
973    /// Per-case comparison results.
974    pub cases: Vec<CaseComparison>,
975}
976
977/// Comparison of a single case between baseline and candidate runs.
978#[derive(Debug, Clone, Serialize, Deserialize)]
979pub struct CaseComparison {
980    /// Test case name.
981    pub case_name: String,
982    /// Average score across all scorers in the baseline run.
983    pub baseline_avg_score: f64,
984    /// Average score across all scorers in the candidate run.
985    pub candidate_avg_score: f64,
986    /// Score delta (candidate - baseline). Negative means regression.
987    pub delta: f64,
988    /// Whether the candidate regressed on this case.
989    pub regressed: bool,
990}
991
992impl EvalComparison {
993    /// Compare baseline and candidate eval results.
994    ///
995    /// Matches results by `case_name`. Cases present in only one run are skipped.
996    pub fn compare(baseline: &[EvalResult], candidate: &[EvalResult]) -> Self {
997        let baseline_map: std::collections::HashMap<&str, &EvalResult> =
998            baseline.iter().map(|r| (r.case_name.as_str(), r)).collect();
999
1000        let cases: Vec<CaseComparison> = candidate
1001            .iter()
1002            .filter_map(|cand_result| {
1003                let base_result = baseline_map.get(cand_result.case_name.as_str())?;
1004                let base_avg = avg_score(&base_result.scores);
1005                let cand_avg = avg_score(&cand_result.scores);
1006                let delta = cand_avg - base_avg;
1007                Some(CaseComparison {
1008                    case_name: cand_result.case_name.clone(),
1009                    baseline_avg_score: base_avg,
1010                    candidate_avg_score: cand_avg,
1011                    delta,
1012                    regressed: delta < -REGRESSION_TOLERANCE,
1013                })
1014            })
1015            .collect();
1016
1017        Self { cases }
1018    }
1019
1020    /// Number of cases where baseline scored higher.
1021    pub fn baseline_wins(&self) -> usize {
1022        self.cases.iter().filter(|c| c.regressed).count()
1023    }
1024
1025    /// Number of cases where candidate scored higher.
1026    pub fn candidate_wins(&self) -> usize {
1027        self.cases
1028            .iter()
1029            .filter(|c| c.delta > REGRESSION_TOLERANCE)
1030            .count()
1031    }
1032
1033    /// Number of cases with equal scores (within tolerance).
1034    pub fn ties(&self) -> usize {
1035        self.cases.len() - self.baseline_wins() - self.candidate_wins()
1036    }
1037
1038    /// Whether any case regressed from baseline to candidate.
1039    pub fn has_regressions(&self) -> bool {
1040        self.cases.iter().any(|c| c.regressed)
1041    }
1042
1043    /// Names of cases where candidate scored lower than baseline.
1044    pub fn regressions(&self) -> Vec<&str> {
1045        self.cases
1046            .iter()
1047            .filter(|c| c.regressed)
1048            .map(|c| c.case_name.as_str())
1049            .collect()
1050    }
1051}
1052
1053impl std::fmt::Display for EvalComparison {
1054    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1055        writeln!(
1056            f,
1057            "A/B Comparison: {} cases ({} baseline wins, {} candidate wins, {} ties)",
1058            self.cases.len(),
1059            self.baseline_wins(),
1060            self.candidate_wins(),
1061            self.ties()
1062        )?;
1063        for c in &self.cases {
1064            let marker = if c.regressed { "REGRESSED" } else { "ok" };
1065            writeln!(
1066                f,
1067                "  {}: baseline={:.3} candidate={:.3} delta={:+.3} [{}]",
1068                c.case_name, c.baseline_avg_score, c.candidate_avg_score, c.delta, marker
1069            )?;
1070        }
1071        let regressions = self.regressions();
1072        if !regressions.is_empty() {
1073            writeln!(f, "  Regressions: {}", regressions.join(", "))?;
1074        }
1075        Ok(())
1076    }
1077}
1078
/// Linear budget score in `[0.0, 1.0]`: 1.0 when `actual` is 0, falling
/// linearly to 0.0 when `actual >= max`.
///
/// A `max` that is zero, negative, or NaN describes no usable budget and
/// scores 0.0. A negative `actual` is capped at 1.0 so the result never
/// leaves the unit interval.
fn budget_score(actual: f64, max: f64) -> f64 {
    if max.is_nan() || max <= 0.0 {
        return 0.0;
    }
    // `f64::max` and `f64::min` ignore a NaN operand, so a NaN `actual`
    // yields 0.0 rather than propagating NaN.
    (1.0 - actual / max).max(0.0).min(1.0)
}
1083
1084/// Average score from a slice of scorer results. Returns 0.0 if empty.
1085fn avg_score(scores: &[ScorerResult]) -> f64 {
1086    if scores.is_empty() {
1087        return 0.0;
1088    }
1089    scores.iter().map(|s| s.score).sum::<f64>() / scores.len() as f64
1090}
1091
1092// ===========================================================================
1093// Tests
1094// ===========================================================================
1095
1096#[cfg(test)]
1097mod tests {
1098    use super::*;
1099
1100    // -----------------------------------------------------------------------
1101    // EvalCase builder tests
1102    // -----------------------------------------------------------------------
1103
1104    #[test]
1105    fn eval_case_new() {
1106        let case = EvalCase::new("test", "do something");
1107        assert_eq!(case.name, "test");
1108        assert_eq!(case.input, "do something");
1109        assert!(case.expected_tools.is_none());
1110        assert!(case.output_contains.is_empty());
1111        assert!(case.output_not_contains.is_empty());
1112        assert!(case.reference_output.is_none());
1113    }
1114
1115    #[test]
1116    fn eval_case_expect_tool() {
1117        let case = EvalCase::new("t", "i")
1118            .expect_tool("bash")
1119            .expect_tool("read_file");
1120        let tools = case.expected_tools.as_ref().unwrap();
1121        assert_eq!(tools.len(), 2);
1122        assert_eq!(tools[0].name, "bash");
1123        assert!(tools[0].order.is_none());
1124        assert_eq!(tools[1].name, "read_file");
1125    }
1126
1127    #[test]
1128    fn eval_case_expect_tool_at() {
1129        let case = EvalCase::new("t", "i")
1130            .expect_tool_at("bash", 0)
1131            .expect_tool_at("read_file", 1);
1132        let tools = case.expected_tools.as_ref().unwrap();
1133        assert_eq!(tools[0].order, Some(0));
1134        assert_eq!(tools[1].order, Some(1));
1135    }
1136
1137    #[test]
1138    fn eval_case_expect_no_tools() {
1139        let case = EvalCase::new("t", "i").expect_no_tools();
1140        let tools = case.expected_tools.as_ref().unwrap();
1141        assert!(tools.is_empty());
1142    }
1143
1144    #[test]
1145    fn eval_case_expect_output() {
1146        let case = EvalCase::new("t", "i")
1147            .expect_output_contains("hello")
1148            .expect_output_not_contains("error");
1149        assert_eq!(case.output_contains, vec!["hello"]);
1150        assert_eq!(case.output_not_contains, vec!["error"]);
1151    }
1152
1153    #[test]
1154    fn eval_case_reference_output() {
1155        let case = EvalCase::new("t", "i").reference_output("expected answer");
1156        assert_eq!(case.reference_output.as_deref(), Some("expected answer"));
1157    }
1158
1159    // -----------------------------------------------------------------------
1160    // TrajectoryScorer tests
1161    // -----------------------------------------------------------------------
1162
1163    #[test]
1164    fn trajectory_no_expectations_passes() {
1165        let case = EvalCase::new("t", "i"); // no expected_tools
1166        let (score, _) = TrajectoryScorer.score(&case, "", &["bash".into()]);
1167        assert_eq!(score, 1.0);
1168    }
1169
1170    #[test]
1171    fn trajectory_expect_no_tools_with_none() {
1172        let case = EvalCase::new("t", "i").expect_no_tools();
1173        let (score, _) = TrajectoryScorer.score(&case, "", &[]);
1174        assert_eq!(score, 1.0);
1175    }
1176
1177    #[test]
1178    fn trajectory_expect_no_tools_but_got_some() {
1179        let case = EvalCase::new("t", "i").expect_no_tools();
1180        let (score, details) = TrajectoryScorer.score(&case, "", &["bash".into()]);
1181        assert_eq!(score, 0.0);
1182        assert!(details[0].contains("expected no tools"));
1183    }
1184
1185    #[test]
1186    fn trajectory_unordered_match() {
1187        let case = EvalCase::new("t", "i")
1188            .expect_tool("read_file")
1189            .expect_tool("bash");
1190        let tools = vec!["bash".into(), "read_file".into()];
1191        let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1192        assert_eq!(score, 1.0);
1193    }
1194
1195    #[test]
1196    fn trajectory_unordered_partial_match() {
1197        let case = EvalCase::new("t", "i")
1198            .expect_tool("read_file")
1199            .expect_tool("bash");
1200        let tools = vec!["bash".into()];
1201        let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1202        assert_eq!(score, 0.5);
1203    }
1204
1205    #[test]
1206    fn trajectory_unordered_no_match() {
1207        let case = EvalCase::new("t", "i").expect_tool("bash");
1208        let tools: Vec<String> = vec!["read_file".into()];
1209        let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1210        assert_eq!(score, 0.0);
1211    }
1212
1213    #[test]
1214    fn trajectory_ordered_exact_match() {
1215        let case = EvalCase::new("t", "i")
1216            .expect_tool_at("read_file", 0)
1217            .expect_tool_at("bash", 1);
1218        let tools = vec!["read_file".into(), "bash".into()];
1219        let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1220        assert_eq!(score, 1.0);
1221    }
1222
1223    #[test]
1224    fn trajectory_ordered_wrong_position() {
1225        let case = EvalCase::new("t", "i")
1226            .expect_tool_at("bash", 0)
1227            .expect_tool_at("read_file", 1);
1228        let tools = vec!["read_file".into(), "bash".into()]; // swapped
1229        let (score, details) = TrajectoryScorer.score(&case, "", &tools);
1230        assert_eq!(score, 0.0);
1231        assert!(details[0].contains("FAIL"));
1232    }
1233
1234    #[test]
1235    fn trajectory_ordered_position_out_of_bounds() {
1236        let case = EvalCase::new("t", "i").expect_tool_at("bash", 5);
1237        let tools = vec!["bash".into()];
1238        let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1239        assert_eq!(score, 0.0);
1240    }
1241
1242    #[test]
1243    fn trajectory_mixed_ordered_unordered() {
1244        let case = EvalCase::new("t", "i")
1245            .expect_tool_at("read_file", 0) // ordered
1246            .expect_tool("bash"); // unordered
1247        let tools = vec!["read_file".into(), "write_file".into(), "bash".into()];
1248        let (score, _) = TrajectoryScorer.score(&case, "", &tools);
1249        assert_eq!(score, 1.0);
1250    }
1251
1252    // -----------------------------------------------------------------------
1253    // KeywordScorer tests
1254    // -----------------------------------------------------------------------
1255
1256    #[test]
1257    fn keyword_no_expectations_passes() {
1258        let case = EvalCase::new("t", "i");
1259        let (score, _) = KeywordScorer.score(&case, "any output", &[]);
1260        assert_eq!(score, 1.0);
1261    }
1262
1263    #[test]
1264    fn keyword_contains_match() {
1265        let case = EvalCase::new("t", "i")
1266            .expect_output_contains("hello")
1267            .expect_output_contains("world");
1268        let (score, _) = KeywordScorer.score(&case, "Hello World", &[]);
1269        assert_eq!(score, 1.0); // case-insensitive
1270    }
1271
1272    #[test]
1273    fn keyword_contains_partial_match() {
1274        let case = EvalCase::new("t", "i")
1275            .expect_output_contains("hello")
1276            .expect_output_contains("missing");
1277        let (score, _) = KeywordScorer.score(&case, "hello there", &[]);
1278        assert_eq!(score, 0.5);
1279    }
1280
1281    #[test]
1282    fn keyword_not_contains_match() {
1283        let case = EvalCase::new("t", "i")
1284            .expect_output_not_contains("error")
1285            .expect_output_not_contains("fail");
1286        let (score, _) = KeywordScorer.score(&case, "success!", &[]);
1287        assert_eq!(score, 1.0);
1288    }
1289
1290    #[test]
1291    fn keyword_not_contains_violation() {
1292        let case = EvalCase::new("t", "i").expect_output_not_contains("error");
1293        let (score, details) = KeywordScorer.score(&case, "An Error occurred", &[]);
1294        assert_eq!(score, 0.0); // case-insensitive
1295        assert!(details[0].contains("FAIL"));
1296    }
1297
1298    #[test]
1299    fn keyword_mixed_contains_and_not_contains() {
1300        let case = EvalCase::new("t", "i")
1301            .expect_output_contains("result")
1302            .expect_output_not_contains("error");
1303        // Both pass
1304        let (score, _) = KeywordScorer.score(&case, "the result is 42", &[]);
1305        assert_eq!(score, 1.0);
1306
1307        // contains fails, not_contains passes
1308        let (score, _) = KeywordScorer.score(&case, "no match here", &[]);
1309        assert_eq!(score, 0.5);
1310    }
1311
1312    // -----------------------------------------------------------------------
1313    // SimilarityScorer tests
1314    // -----------------------------------------------------------------------
1315
1316    #[test]
1317    fn similarity_no_reference_passes() {
1318        let case = EvalCase::new("t", "i");
1319        let (score, _) = SimilarityScorer.score(&case, "any output", &[]);
1320        assert_eq!(score, 1.0);
1321    }
1322
1323    #[test]
1324    fn similarity_identical_text() {
1325        let case = EvalCase::new("t", "i").reference_output("hello world");
1326        let (score, _) = SimilarityScorer.score(&case, "hello world", &[]);
1327        assert_eq!(score, 1.0);
1328    }
1329
1330    #[test]
1331    fn similarity_partial_overlap() {
1332        let case =
1333            EvalCase::new("t", "i").reference_output("the quick brown fox jumps over the lazy dog");
1334        let (score, _) = SimilarityScorer.score(&case, "the quick brown cat", &[]);
1335        assert!(score > 0.0);
1336        assert!(score < 1.0);
1337    }
1338
1339    #[test]
1340    fn similarity_no_overlap() {
1341        let case = EvalCase::new("t", "i").reference_output("alpha beta gamma");
1342        let (score, _) = SimilarityScorer.score(&case, "one two three", &[]);
1343        assert_eq!(score, 0.0);
1344    }
1345
1346    #[test]
1347    fn similarity_case_insensitive() {
1348        let case = EvalCase::new("t", "i").reference_output("Hello World");
1349        let (score, _) = SimilarityScorer.score(&case, "hello world", &[]);
1350        assert_eq!(score, 1.0);
1351    }
1352
1353    #[test]
1354    fn similarity_empty_candidate() {
1355        let case = EvalCase::new("t", "i").reference_output("hello world");
1356        let (score, _) = SimilarityScorer.score(&case, "", &[]);
1357        assert_eq!(score, 0.0);
1358    }
1359
1360    #[test]
1361    fn similarity_empty_reference() {
1362        let case = EvalCase::new("t", "i").reference_output("");
1363        let (score, _) = SimilarityScorer.score(&case, "hello world", &[]);
1364        assert_eq!(score, 0.0);
1365    }
1366
1367    // -----------------------------------------------------------------------
1368    // Rouge-1 F1 unit tests
1369    // -----------------------------------------------------------------------
1370
1371    #[test]
1372    fn rouge1_identical() {
1373        assert_eq!(rouge1_f1("hello world", "hello world"), 1.0);
1374    }
1375
1376    #[test]
1377    fn rouge1_no_overlap() {
1378        assert_eq!(rouge1_f1("a b c", "x y z"), 0.0);
1379    }
1380
1381    #[test]
1382    fn rouge1_partial() {
1383        // Candidate: {the, cat} Reference: {the, dog}
1384        // Overlap: {the} = 1
1385        // Precision: 1/2, Recall: 1/2
1386        // F1: 2 * 0.5 * 0.5 / (0.5 + 0.5) = 0.5
1387        assert_eq!(rouge1_f1("the cat", "the dog"), 0.5);
1388    }
1389
1390    #[test]
1391    fn rouge1_empty_candidate() {
1392        assert_eq!(rouge1_f1("", "hello"), 0.0);
1393    }
1394
1395    #[test]
1396    fn rouge1_empty_reference() {
1397        assert_eq!(rouge1_f1("hello", ""), 0.0);
1398    }
1399
1400    // -----------------------------------------------------------------------
1401    // EvalRunner::score_result tests
1402    // -----------------------------------------------------------------------
1403
1404    #[test]
1405    fn score_result_no_scorers() {
1406        let runner = EvalRunner::new();
1407        let case = EvalCase::new("t", "i");
1408        let result = runner.score_result(&case, "output", &[], None);
1409        assert!(result.passed);
1410        assert!(result.scores.is_empty());
1411    }
1412
1413    #[test]
1414    fn score_result_all_pass() {
1415        let runner = EvalRunner::new()
1416            .scorer(TrajectoryScorer)
1417            .scorer(KeywordScorer);
1418        let case = EvalCase::new("t", "i")
1419            .expect_tool("bash")
1420            .expect_output_contains("done");
1421        let result = runner.score_result(&case, "done!", &["bash".into()], None);
1422        assert!(result.passed);
1423        assert_eq!(result.scores.len(), 2);
1424        assert!(result.scores.iter().all(|s| s.passed));
1425    }
1426
1427    #[test]
1428    fn score_result_trajectory_fails() {
1429        let runner = EvalRunner::new().scorer(TrajectoryScorer);
1430        let case = EvalCase::new("t", "i").expect_tool("bash");
1431        let result = runner.score_result(&case, "output", &["read_file".into()], None);
1432        assert!(!result.passed);
1433    }
1434
1435    #[test]
1436    fn score_result_with_error() {
1437        let runner = EvalRunner::new().scorer(TrajectoryScorer);
1438        let case = EvalCase::new("t", "i");
1439        let result = runner.score_result(&case, "", &[], Some("agent failed".into()));
1440        assert!(!result.passed);
1441        assert_eq!(result.error.as_deref(), Some("agent failed"));
1442    }
1443
1444    #[test]
1445    fn score_result_preserves_actual_data() {
1446        let runner = EvalRunner::new();
1447        let case = EvalCase::new("test-case", "i");
1448        let tools = vec!["bash".into(), "read".into()];
1449        let result = runner.score_result(&case, "my output", &tools, None);
1450        assert_eq!(result.case_name, "test-case");
1451        assert_eq!(result.actual_output, "my output");
1452        assert_eq!(result.actual_tools, vec!["bash", "read"]);
1453    }
1454
1455    // -----------------------------------------------------------------------
1456    // EvalSummary tests
1457    // -----------------------------------------------------------------------
1458
1459    #[test]
1460    fn summary_empty_results() {
1461        let summary = EvalSummary::from_results(&[]);
1462        assert_eq!(summary.total, 0);
1463        assert_eq!(summary.passed, 0);
1464        assert_eq!(summary.pass_rate(), 0.0);
1465    }
1466
1467    #[test]
1468    fn summary_all_pass() {
1469        let results = vec![
1470            EvalResult {
1471                case_name: "a".into(),
1472                passed: true,
1473                scores: vec![ScorerResult {
1474                    scorer: "trajectory".into(),
1475                    score: 1.0,
1476                    passed: true,
1477                    details: vec![],
1478                }],
1479                actual_tools: vec![],
1480                actual_output: String::new(),
1481                error: None,
1482            },
1483            EvalResult {
1484                case_name: "b".into(),
1485                passed: true,
1486                scores: vec![ScorerResult {
1487                    scorer: "trajectory".into(),
1488                    score: 1.0,
1489                    passed: true,
1490                    details: vec![],
1491                }],
1492                actual_tools: vec![],
1493                actual_output: String::new(),
1494                error: None,
1495            },
1496        ];
1497        let summary = EvalSummary::from_results(&results);
1498        assert_eq!(summary.total, 2);
1499        assert_eq!(summary.passed, 2);
1500        assert_eq!(summary.failed, 0);
1501        assert_eq!(summary.pass_rate(), 1.0);
1502        assert_eq!(summary.avg_score, 1.0);
1503    }
1504
1505    #[test]
1506    fn summary_mixed_results() {
1507        let results = vec![
1508            EvalResult {
1509                case_name: "pass".into(),
1510                passed: true,
1511                scores: vec![ScorerResult {
1512                    scorer: "keyword".into(),
1513                    score: 1.0,
1514                    passed: true,
1515                    details: vec![],
1516                }],
1517                actual_tools: vec![],
1518                actual_output: String::new(),
1519                error: None,
1520            },
1521            EvalResult {
1522                case_name: "fail".into(),
1523                passed: false,
1524                scores: vec![ScorerResult {
1525                    scorer: "keyword".into(),
1526                    score: 0.5,
1527                    passed: false,
1528                    details: vec![],
1529                }],
1530                actual_tools: vec![],
1531                actual_output: String::new(),
1532                error: None,
1533            },
1534            EvalResult {
1535                case_name: "error".into(),
1536                passed: false,
1537                scores: vec![],
1538                actual_tools: vec![],
1539                actual_output: String::new(),
1540                error: Some("agent failed".into()),
1541            },
1542        ];
1543        let summary = EvalSummary::from_results(&results);
1544        assert_eq!(summary.total, 3);
1545        assert_eq!(summary.passed, 1);
1546        assert_eq!(summary.failed, 1);
1547        assert_eq!(summary.errors, 1);
1548        assert!((summary.pass_rate() - 1.0 / 3.0).abs() < 0.001);
1549    }
1550
1551    #[test]
1552    fn summary_scorer_averages() {
1553        let results = vec![
1554            EvalResult {
1555                case_name: "a".into(),
1556                passed: true,
1557                scores: vec![
1558                    ScorerResult {
1559                        scorer: "trajectory".into(),
1560                        score: 1.0,
1561                        passed: true,
1562                        details: vec![],
1563                    },
1564                    ScorerResult {
1565                        scorer: "keyword".into(),
1566                        score: 0.8,
1567                        passed: true,
1568                        details: vec![],
1569                    },
1570                ],
1571                actual_tools: vec![],
1572                actual_output: String::new(),
1573                error: None,
1574            },
1575            EvalResult {
1576                case_name: "b".into(),
1577                passed: false,
1578                scores: vec![
1579                    ScorerResult {
1580                        scorer: "trajectory".into(),
1581                        score: 0.5,
1582                        passed: false,
1583                        details: vec![],
1584                    },
1585                    ScorerResult {
1586                        scorer: "keyword".into(),
1587                        score: 1.0,
1588                        passed: true,
1589                        details: vec![],
1590                    },
1591                ],
1592                actual_tools: vec![],
1593                actual_output: String::new(),
1594                error: None,
1595            },
1596        ];
1597        let summary = EvalSummary::from_results(&results);
1598        // trajectory avg: (1.0 + 0.5) / 2 = 0.75
1599        // keyword avg: (0.8 + 1.0) / 2 = 0.9
1600        let traj = summary
1601            .scorer_averages
1602            .iter()
1603            .find(|(n, _)| n == "trajectory")
1604            .unwrap();
1605        assert!((traj.1 - 0.75).abs() < 0.001);
1606        let kw = summary
1607            .scorer_averages
1608            .iter()
1609            .find(|(n, _)| n == "keyword")
1610            .unwrap();
1611        assert!((kw.1 - 0.9).abs() < 0.001);
1612    }
1613
1614    #[test]
1615    fn summary_display() {
1616        let results = vec![EvalResult {
1617            case_name: "a".into(),
1618            passed: true,
1619            scores: vec![ScorerResult {
1620                scorer: "trajectory".into(),
1621                score: 1.0,
1622                passed: true,
1623                details: vec![],
1624            }],
1625            actual_tools: vec![],
1626            actual_output: String::new(),
1627            error: None,
1628        }];
1629        let summary = EvalSummary::from_results(&results);
1630        let display = format!("{summary}");
1631        assert!(display.contains("1/1 passed"));
1632        assert!(display.contains("100.0%"));
1633    }
1634
1635    // -----------------------------------------------------------------------
1636    // collect_tool_calls tests
1637    // -----------------------------------------------------------------------
1638
1639    #[test]
1640    fn collect_tool_calls_extracts_started_events() {
1641        let events = vec![
1642            AgentEvent::RunStarted {
1643                agent: "a".into(),
1644                task: "t".into(),
1645            },
1646            AgentEvent::ToolCallStarted {
1647                agent: "a".into(),
1648                tool_name: "bash".into(),
1649                tool_call_id: "c1".into(),
1650                input: "{}".into(),
1651            },
1652            AgentEvent::ToolCallCompleted {
1653                agent: "a".into(),
1654                tool_name: "bash".into(),
1655                tool_call_id: "c1".into(),
1656                is_error: false,
1657                duration_ms: 10,
1658                output: String::new(),
1659            },
1660            AgentEvent::ToolCallStarted {
1661                agent: "a".into(),
1662                tool_name: "read_file".into(),
1663                tool_call_id: "c2".into(),
1664                input: "{}".into(),
1665            },
1666        ];
1667        let tools = collect_tool_calls(&events);
1668        assert_eq!(tools, vec!["bash", "read_file"]);
1669    }
1670
1671    #[test]
1672    fn collect_tool_calls_empty_events() {
1673        let tools = collect_tool_calls(&[]);
1674        assert!(tools.is_empty());
1675    }
1676
1677    // -----------------------------------------------------------------------
1678    // Event collector tests
1679    // -----------------------------------------------------------------------
1680
1681    #[test]
1682    fn event_collector_and_callback() {
1683        let collector = EvalRunner::event_collector();
1684        let callback = EvalRunner::event_callback(&collector);
1685
1686        callback(AgentEvent::ToolCallStarted {
1687            agent: "a".into(),
1688            tool_name: "bash".into(),
1689            tool_call_id: "c1".into(),
1690            input: "{}".into(),
1691        });
1692        callback(AgentEvent::ToolCallStarted {
1693            agent: "a".into(),
1694            tool_name: "read_file".into(),
1695            tool_call_id: "c2".into(),
1696            input: "{}".into(),
1697        });
1698
1699        let tools = EvalRunner::collected_tool_calls(&collector);
1700        assert_eq!(tools, vec!["bash", "read_file"]);
1701    }
1702
1703    // -----------------------------------------------------------------------
1704    // Integration: EvalRunner with full scoring
1705    // -----------------------------------------------------------------------
1706
1707    #[test]
1708    fn runner_full_scoring_pass() {
1709        let runner = EvalRunner::new()
1710            .scorer(TrajectoryScorer)
1711            .scorer(KeywordScorer)
1712            .scorer(SimilarityScorer);
1713
1714        let case = EvalCase::new("full", "test")
1715            .expect_tool("bash")
1716            .expect_output_contains("result")
1717            .reference_output("the result is 42");
1718
1719        let result = runner.score_result(&case, "the result is 42", &["bash".into()], None);
1720
1721        assert!(result.passed);
1722        assert_eq!(result.scores.len(), 3);
1723        assert!(result.scores.iter().all(|s| s.passed));
1724    }
1725
1726    #[test]
1727    fn runner_full_scoring_fail() {
1728        let runner = EvalRunner::new()
1729            .scorer(TrajectoryScorer)
1730            .scorer(KeywordScorer);
1731
1732        let case = EvalCase::new("fail", "test")
1733            .expect_tool("bash")
1734            .expect_output_contains("result");
1735
1736        let result = runner.score_result(&case, "no match here", &["read_file".into()], None);
1737
1738        assert!(!result.passed);
1739        // Both trajectory and keyword should fail
1740        assert!(result.scores.iter().all(|s| !s.passed));
1741    }
1742
1743    // -----------------------------------------------------------------------
1744    // EvalRunner::run with mock agent
1745    // -----------------------------------------------------------------------
1746
1747    #[tokio::test]
1748    async fn runner_run_with_mock_agent() {
1749        use crate::llm::LlmProvider;
1750        use crate::llm::types::{CompletionRequest, CompletionResponse, ContentBlock, StopReason};
1751        use std::sync::Mutex;
1752
1753        struct MockProvider {
1754            response: Mutex<Option<String>>,
1755        }
1756
1757        impl LlmProvider for MockProvider {
1758            async fn complete(
1759                &self,
1760                _request: CompletionRequest,
1761            ) -> Result<CompletionResponse, crate::error::Error> {
1762                let text = self
1763                    .response
1764                    .lock()
1765                    .expect("mock")
1766                    .take()
1767                    .unwrap_or_default();
1768                Ok(CompletionResponse {
1769                    content: vec![ContentBlock::Text { text }],
1770                    stop_reason: StopReason::EndTurn,
1771                    usage: Default::default(),
1772                    model: None,
1773                })
1774            }
1775        }
1776
1777        let provider = Arc::new(MockProvider {
1778            response: Mutex::new(Some("hello world".into())),
1779        });
1780        let agent = crate::agent::AgentRunner::builder(provider)
1781            .name("eval-test")
1782            .system_prompt("test")
1783            .max_turns(1)
1784            .build()
1785            .unwrap();
1786
1787        let runner = EvalRunner::new().scorer(KeywordScorer);
1788        let cases = vec![EvalCase::new("greeting", "say hello").expect_output_contains("hello")];
1789
1790        let results = runner.run(&agent, &cases).await;
1791        assert_eq!(results.len(), 1);
1792        assert!(results[0].passed);
1793        assert_eq!(results[0].actual_output, "hello world");
1794    }
1795
1796    #[tokio::test]
1797    async fn run_clears_attached_event_collector_between_cases() {
1798        use crate::llm::LlmProvider;
1799        use crate::llm::types::{CompletionRequest, CompletionResponse, ContentBlock, StopReason};
1800        use std::sync::Mutex;
1801
1802        struct MockProvider;
1803        impl LlmProvider for MockProvider {
1804            async fn complete(
1805                &self,
1806                _request: CompletionRequest,
1807            ) -> Result<CompletionResponse, crate::error::Error> {
1808                Ok(CompletionResponse {
1809                    content: vec![ContentBlock::Text { text: "ok".into() }],
1810                    stop_reason: StopReason::EndTurn,
1811                    usage: Default::default(),
1812                    model: None,
1813                })
1814            }
1815        }
1816
1817        // A scorer that records how many events the collector held at score time.
1818        struct EventCounter {
1819            collector: EventCollector,
1820            seen: Arc<Mutex<Vec<usize>>>,
1821        }
1822        impl EvalScorer for EventCounter {
1823            fn name(&self) -> &str {
1824                "event-counter"
1825            }
1826            fn score(&self, _c: &EvalCase, _o: &str, _t: &[String]) -> (f64, Vec<String>) {
1827                let n = self.collector.lock().expect("counter lock").len();
1828                self.seen.lock().expect("seen lock").push(n);
1829                (1.0, vec![])
1830            }
1831            fn pass_threshold(&self) -> f64 {
1832                0.0
1833            }
1834        }
1835
1836        let collector = EvalRunner::event_collector();
1837        let agent = crate::agent::AgentRunner::builder(Arc::new(MockProvider))
1838            .name("eval-isolation")
1839            .system_prompt("test")
1840            .max_turns(1)
1841            .on_event(EvalRunner::event_callback(&collector))
1842            .build()
1843            .unwrap();
1844
1845        let seen = Arc::new(Mutex::new(Vec::<usize>::new()));
1846        let runner = EvalRunner::new()
1847            .with_event_collector(collector.clone())
1848            .scorer(EventCounter {
1849                collector: collector.clone(),
1850                seen: seen.clone(),
1851            });
1852
1853        let cases = vec![EvalCase::new("c1", "first"), EvalCase::new("c2", "second")];
1854        let _ = runner.run(&agent, &cases).await;
1855
1856        // Each case generates the same set of events (RunStarted, LlmRequest,
1857        // LlmResponse, RunCompleted, ...). Without per-case clearing the
1858        // second case would see strictly more events than the first.
1859        let seen = seen.lock().expect("seen lock").clone();
1860        assert_eq!(seen.len(), 2, "scorer should run once per case");
1861        assert_eq!(
1862            seen[0], seen[1],
1863            "with_event_collector must clear the collector between cases (saw {seen:?})"
1864        );
1865        assert!(
1866            seen[0] > 0,
1867            "expected at least one captured event per case (saw {seen:?})"
1868        );
1869    }
1870
1871    // -----------------------------------------------------------------------
1872    // EvalCase budget builder tests
1873    // -----------------------------------------------------------------------
1874
1875    #[test]
1876    fn eval_case_budget_defaults_none() {
1877        let case = EvalCase::new("t", "i");
1878        assert!(case.max_cost_usd.is_none());
1879        assert!(case.max_latency_ms.is_none());
1880        assert!(case.max_tool_calls.is_none());
1881    }
1882
1883    #[test]
1884    fn eval_case_budget_builders() {
1885        let case = EvalCase::new("t", "i")
1886            .expect_max_cost_usd(0.05)
1887            .expect_max_latency_ms(5000)
1888            .expect_max_tool_calls(10);
1889        assert_eq!(case.max_cost_usd, Some(0.05));
1890        assert_eq!(case.max_latency_ms, Some(5000));
1891        assert_eq!(case.max_tool_calls, Some(10));
1892    }
1893
1894    // -----------------------------------------------------------------------
1895    // Serialize tests
1896    // -----------------------------------------------------------------------
1897
1898    #[test]
1899    fn eval_case_serializes_to_json() {
1900        let case = EvalCase::new("test", "do it")
1901            .expect_tool("bash")
1902            .expect_max_cost_usd(0.01);
1903        let json = serde_json::to_string(&case).unwrap();
1904        assert!(json.contains("\"name\":\"test\""));
1905        assert!(json.contains("\"max_cost_usd\":0.01"));
1906    }
1907
1908    #[test]
1909    fn eval_result_serializes_to_json() {
1910        let result = EvalResult {
1911            case_name: "a".into(),
1912            passed: true,
1913            scores: vec![ScorerResult {
1914                scorer: "keyword".into(),
1915                score: 1.0,
1916                passed: true,
1917                details: vec!["ok".into()],
1918            }],
1919            actual_tools: vec!["bash".into()],
1920            actual_output: "done".into(),
1921            error: None,
1922        };
1923        let json = serde_json::to_string(&result).unwrap();
1924        assert!(json.contains("\"passed\":true"));
1925        assert!(json.contains("\"scorer\":\"keyword\""));
1926    }
1927
1928    #[test]
1929    fn eval_summary_serializes_to_json() {
1930        let summary = EvalSummary {
1931            total: 2,
1932            passed: 1,
1933            failed: 1,
1934            errors: 0,
1935            avg_score: 0.75,
1936            scorer_averages: vec![("keyword".into(), 0.9)],
1937        };
1938        let json = serde_json::to_string(&summary).unwrap();
1939        assert!(json.contains("\"total\":2"));
1940        assert!(json.contains("\"avg_score\":0.75"));
1941    }
1942
1943    #[test]
1944    fn eval_case_omits_none_budget_fields() {
1945        let case = EvalCase::new("t", "i");
1946        let json = serde_json::to_string(&case).unwrap();
1947        assert!(!json.contains("max_cost_usd"));
1948        assert!(!json.contains("max_latency_ms"));
1949        assert!(!json.contains("max_tool_calls"));
1950    }
1951
1952    // -----------------------------------------------------------------------
1953    // CostScorer tests
1954    // -----------------------------------------------------------------------
1955
1956    fn make_llm_response_event(
1957        model: Option<&str>,
1958        input: u32,
1959        output: u32,
1960        latency: u64,
1961    ) -> AgentEvent {
1962        use crate::llm::types::TokenUsage;
1963        AgentEvent::LlmResponse {
1964            agent: "a".into(),
1965            turn: 1,
1966            usage: TokenUsage {
1967                input_tokens: input,
1968                output_tokens: output,
1969                ..Default::default()
1970            },
1971            stop_reason: crate::llm::types::StopReason::EndTurn,
1972            tool_call_count: 0,
1973            text: String::new(),
1974            latency_ms: latency,
1975            model: model.map(|s| s.to_string()),
1976            time_to_first_token_ms: 0,
1977        }
1978    }
1979
1980    #[test]
1981    fn cost_scorer_under_budget() {
1982        let collector = EvalRunner::event_collector();
1983        {
1984            let mut events = collector.lock().unwrap();
1985            // claude-sonnet-4-20250514: $3/M input, $15/M output
1986            events.push(make_llm_response_event(
1987                Some("claude-sonnet-4-20250514"),
1988                1000,
1989                500,
1990                100,
1991            ));
1992        }
1993        let scorer = CostScorer::new(collector, 1.0);
1994        let case = EvalCase::new("t", "i");
1995        let (score, details) = scorer.score(&case, "", &[]);
1996        assert!(score > 0.95); // tiny cost relative to $1 budget
1997        assert!(details[0].contains("total cost:"));
1998    }
1999
2000    #[test]
2001    fn cost_scorer_over_budget() {
2002        let collector = EvalRunner::event_collector();
2003        {
2004            let mut events = collector.lock().unwrap();
2005            // 10M output tokens at $15/M = $150
2006            events.push(make_llm_response_event(
2007                Some("claude-sonnet-4-20250514"),
2008                0,
2009                10_000_000,
2010                100,
2011            ));
2012        }
2013        let scorer = CostScorer::new(collector, 0.01);
2014        let case = EvalCase::new("t", "i");
2015        let (score, _) = scorer.score(&case, "", &[]);
2016        assert_eq!(score, 0.0);
2017    }
2018
2019    #[test]
2020    fn cost_scorer_unknown_model() {
2021        let collector = EvalRunner::event_collector();
2022        {
2023            let mut events = collector.lock().unwrap();
2024            events.push(make_llm_response_event(
2025                Some("unknown-model-xyz"),
2026                1000,
2027                1000,
2028                100,
2029            ));
2030        }
2031        let scorer = CostScorer::new(collector, 1.0);
2032        let case = EvalCase::new("t", "i");
2033        let (score, details) = scorer.score(&case, "", &[]);
2034        assert_eq!(score, 1.0); // $0 cost
2035        assert!(details.iter().any(|d| d.contains("unknown model")));
2036    }
2037
2038    #[test]
2039    fn cost_scorer_no_model_field() {
2040        let collector = EvalRunner::event_collector();
2041        {
2042            let mut events = collector.lock().unwrap();
2043            events.push(make_llm_response_event(None, 1000, 1000, 100));
2044        }
2045        let scorer = CostScorer::new(collector, 1.0);
2046        let case = EvalCase::new("t", "i");
2047        let (score, _) = scorer.score(&case, "", &[]);
2048        assert_eq!(score, 1.0); // unknown → $0
2049    }
2050
2051    #[test]
2052    fn cost_scorer_case_override() {
2053        let collector = EvalRunner::event_collector();
2054        {
2055            let mut events = collector.lock().unwrap();
2056            events.push(make_llm_response_event(
2057                Some("claude-sonnet-4-20250514"),
2058                100_000,
2059                50_000,
2060                100,
2061            ));
2062        }
2063        let scorer = CostScorer::new(collector, 100.0); // very high default
2064        let case = EvalCase::new("t", "i").expect_max_cost_usd(0.0001); // tiny override
2065        let (score, _) = scorer.score(&case, "", &[]);
2066        assert_eq!(score, 0.0); // should fail with tiny budget
2067    }
2068
2069    #[test]
2070    fn cost_scorer_pass_threshold() {
2071        let scorer = CostScorer::new(EvalRunner::event_collector(), 1.0);
2072        assert_eq!(scorer.pass_threshold(), 0.01);
2073    }
2074
2075    #[test]
2076    fn cost_scorer_zero_budget() {
2077        let scorer = CostScorer::new(EvalRunner::event_collector(), 0.0);
2078        let case = EvalCase::new("t", "i");
2079        let (score, details) = scorer.score(&case, "", &[]);
2080        assert_eq!(score, 0.0);
2081        assert!(details[0].contains("zero"));
2082    }
2083
2084    // -----------------------------------------------------------------------
2085    // LatencyScorer tests
2086    // -----------------------------------------------------------------------
2087
2088    #[test]
2089    fn latency_scorer_under_budget() {
2090        let collector = EvalRunner::event_collector();
2091        {
2092            let mut events = collector.lock().unwrap();
2093            events.push(make_llm_response_event(None, 0, 0, 500));
2094            events.push(make_llm_response_event(None, 0, 0, 300));
2095        }
2096        let scorer = LatencyScorer::new(collector, 5000);
2097        let case = EvalCase::new("t", "i");
2098        let (score, details) = scorer.score(&case, "", &[]);
2099        // 800ms / 5000ms = 0.16; score = 0.84
2100        assert!((score - 0.84).abs() < 0.001);
2101        assert!(details[0].contains("800ms"));
2102    }
2103
2104    #[test]
2105    fn latency_scorer_over_budget() {
2106        let collector = EvalRunner::event_collector();
2107        {
2108            let mut events = collector.lock().unwrap();
2109            events.push(make_llm_response_event(None, 0, 0, 10_000));
2110        }
2111        let scorer = LatencyScorer::new(collector, 5000);
2112        let case = EvalCase::new("t", "i");
2113        let (score, _) = scorer.score(&case, "", &[]);
2114        assert_eq!(score, 0.0);
2115    }
2116
2117    #[test]
2118    fn latency_scorer_case_override() {
2119        let collector = EvalRunner::event_collector();
2120        {
2121            let mut events = collector.lock().unwrap();
2122            events.push(make_llm_response_event(None, 0, 0, 500));
2123        }
2124        let scorer = LatencyScorer::new(collector, 10_000); // high default
2125        let case = EvalCase::new("t", "i").expect_max_latency_ms(1000); // override
2126        let (score, _) = scorer.score(&case, "", &[]);
2127        // 500 / 1000 = 0.5; score = 0.5
2128        assert!((score - 0.5).abs() < 0.001);
2129    }
2130
2131    #[test]
2132    fn latency_scorer_no_events() {
2133        let collector = EvalRunner::event_collector();
2134        let scorer = LatencyScorer::new(collector, 5000);
2135        let case = EvalCase::new("t", "i");
2136        let (score, _) = scorer.score(&case, "", &[]);
2137        assert_eq!(score, 1.0);
2138    }
2139
2140    #[test]
2141    fn latency_scorer_pass_threshold() {
2142        let scorer = LatencyScorer::new(EvalRunner::event_collector(), 5000);
2143        assert_eq!(scorer.pass_threshold(), 0.01);
2144    }
2145
2146    #[test]
2147    fn latency_scorer_zero_budget() {
2148        let scorer = LatencyScorer::new(EvalRunner::event_collector(), 0);
2149        let case = EvalCase::new("t", "i");
2150        let (score, details) = scorer.score(&case, "", &[]);
2151        assert_eq!(score, 0.0);
2152        assert!(details[0].contains("zero"));
2153    }
2154
2155    // -----------------------------------------------------------------------
2156    // ToolCallCountScorer tests
2157    // -----------------------------------------------------------------------
2158
2159    #[test]
2160    fn tool_call_count_under_budget() {
2161        let scorer = ToolCallCountScorer::new(10);
2162        let case = EvalCase::new("t", "i");
2163        let tools: Vec<String> = vec!["a".into(), "b".into(), "c".into()];
2164        let (score, details) = scorer.score(&case, "", &tools);
2165        // 3/10 = 0.3; score = 0.7
2166        assert!((score - 0.7).abs() < 0.001);
2167        assert!(details[0].contains("tool calls: 3"));
2168    }
2169
2170    #[test]
2171    fn tool_call_count_over_budget() {
2172        let scorer = ToolCallCountScorer::new(2);
2173        let case = EvalCase::new("t", "i");
2174        let tools: Vec<String> = vec!["a".into(), "b".into(), "c".into()];
2175        let (score, _) = scorer.score(&case, "", &tools);
2176        assert_eq!(score, 0.0);
2177    }
2178
2179    #[test]
2180    fn tool_call_count_zero_calls() {
2181        let scorer = ToolCallCountScorer::new(10);
2182        let case = EvalCase::new("t", "i");
2183        let (score, _) = scorer.score(&case, "", &[]);
2184        assert_eq!(score, 1.0);
2185    }
2186
2187    #[test]
2188    fn tool_call_count_case_override() {
2189        let scorer = ToolCallCountScorer::new(100); // high default
2190        let case = EvalCase::new("t", "i").expect_max_tool_calls(2); // tight override
2191        let tools: Vec<String> = vec!["a".into(), "b".into(), "c".into()];
2192        let (score, _) = scorer.score(&case, "", &tools);
2193        assert_eq!(score, 0.0); // 3 > 2
2194    }
2195
2196    #[test]
2197    fn tool_call_count_pass_threshold() {
2198        let scorer = ToolCallCountScorer::new(10);
2199        assert_eq!(scorer.pass_threshold(), 0.01);
2200    }
2201
2202    #[test]
2203    fn tool_call_count_zero_budget() {
2204        let scorer = ToolCallCountScorer::new(0);
2205        let case = EvalCase::new("t", "i");
2206        let (score, details) = scorer.score(&case, "", &[]);
2207        assert_eq!(score, 0.0);
2208        assert!(details[0].contains("zero"));
2209    }
2210
2211    // -----------------------------------------------------------------------
2212    // SafetyScorer tests
2213    // -----------------------------------------------------------------------
2214
2215    #[test]
2216    fn safety_scorer_no_denials() {
2217        let collector = EvalRunner::event_collector();
2218        {
2219            let mut events = collector.lock().unwrap();
2220            events.push(AgentEvent::RunStarted {
2221                agent: "a".into(),
2222                task: "t".into(),
2223            });
2224        }
2225        let scorer = SafetyScorer::new(collector);
2226        let case = EvalCase::new("t", "i");
2227        let (score, details) = scorer.score(&case, "", &[]);
2228        assert_eq!(score, 1.0);
2229        assert!(details[0].contains("no guardrail denials"));
2230    }
2231
2232    #[test]
2233    fn safety_scorer_with_denial() {
2234        let collector = EvalRunner::event_collector();
2235        {
2236            let mut events = collector.lock().unwrap();
2237            events.push(AgentEvent::GuardrailDenied {
2238                agent: "a".into(),
2239                hook: "post_llm".into(),
2240                reason: "unsafe content".into(),
2241                tool_name: None,
2242            });
2243        }
2244        let scorer = SafetyScorer::new(collector);
2245        let case = EvalCase::new("t", "i");
2246        let (score, details) = scorer.score(&case, "", &[]);
2247        assert_eq!(score, 0.0);
2248        assert!(details[0].contains("unsafe content"));
2249    }
2250
2251    #[test]
2252    fn safety_scorer_tool_denial() {
2253        let collector = EvalRunner::event_collector();
2254        {
2255            let mut events = collector.lock().unwrap();
2256            events.push(AgentEvent::GuardrailDenied {
2257                agent: "a".into(),
2258                hook: "pre_tool".into(),
2259                reason: "blocked".into(),
2260                tool_name: Some("bash".into()),
2261            });
2262        }
2263        let scorer = SafetyScorer::new(collector);
2264        let case = EvalCase::new("t", "i");
2265        let (score, details) = scorer.score(&case, "", &[]);
2266        assert_eq!(score, 0.0);
2267        assert!(details[0].contains("(tool: bash)"));
2268    }
2269
2270    #[test]
2271    fn safety_scorer_multiple_denials() {
2272        let collector = EvalRunner::event_collector();
2273        {
2274            let mut events = collector.lock().unwrap();
2275            events.push(AgentEvent::GuardrailDenied {
2276                agent: "a".into(),
2277                hook: "post_llm".into(),
2278                reason: "reason1".into(),
2279                tool_name: None,
2280            });
2281            events.push(AgentEvent::GuardrailDenied {
2282                agent: "a".into(),
2283                hook: "pre_tool".into(),
2284                reason: "reason2".into(),
2285                tool_name: Some("bash".into()),
2286            });
2287        }
2288        let scorer = SafetyScorer::new(collector);
2289        let case = EvalCase::new("t", "i");
2290        let (score, details) = scorer.score(&case, "", &[]);
2291        assert_eq!(score, 0.0);
2292        assert_eq!(details.len(), 2);
2293    }
2294
2295    #[test]
2296    fn safety_scorer_pass_threshold() {
2297        let scorer = SafetyScorer::new(EvalRunner::event_collector());
2298        assert_eq!(scorer.pass_threshold(), 1.0);
2299    }
2300
2301    // -----------------------------------------------------------------------
2302    // EvalComparison tests
2303    // -----------------------------------------------------------------------
2304
2305    fn make_eval_result(name: &str, scores: Vec<(&str, f64)>) -> EvalResult {
2306        EvalResult {
2307            case_name: name.into(),
2308            passed: true,
2309            scores: scores
2310                .into_iter()
2311                .map(|(scorer, score)| ScorerResult {
2312                    scorer: scorer.into(),
2313                    score,
2314                    passed: score >= 0.5,
2315                    details: vec![],
2316                })
2317                .collect(),
2318            actual_tools: vec![],
2319            actual_output: String::new(),
2320            error: None,
2321        }
2322    }
2323
2324    #[test]
2325    fn comparison_no_regressions() {
2326        let baseline = vec![
2327            make_eval_result("a", vec![("keyword", 0.8)]),
2328            make_eval_result("b", vec![("keyword", 0.6)]),
2329        ];
2330        let candidate = vec![
2331            make_eval_result("a", vec![("keyword", 0.9)]),
2332            make_eval_result("b", vec![("keyword", 0.7)]),
2333        ];
2334        let cmp = EvalComparison::compare(&baseline, &candidate);
2335        assert!(!cmp.has_regressions());
2336        assert_eq!(cmp.candidate_wins(), 2);
2337        assert_eq!(cmp.baseline_wins(), 0);
2338        assert_eq!(cmp.ties(), 0);
2339        assert_eq!(cmp.cases.len(), 2);
2340    }
2341
2342    #[test]
2343    fn comparison_with_regression() {
2344        let baseline = vec![make_eval_result("a", vec![("keyword", 0.9)])];
2345        let candidate = vec![make_eval_result("a", vec![("keyword", 0.5)])];
2346        let cmp = EvalComparison::compare(&baseline, &candidate);
2347        assert!(cmp.has_regressions());
2348        assert_eq!(cmp.regressions(), vec!["a"]);
2349        assert_eq!(cmp.baseline_wins(), 1);
2350        assert_eq!(cmp.candidate_wins(), 0);
2351        assert!(cmp.cases[0].regressed);
2352        assert!((cmp.cases[0].delta - (-0.4)).abs() < 0.001);
2353    }
2354
2355    #[test]
2356    fn comparison_ties() {
2357        let baseline = vec![make_eval_result("a", vec![("keyword", 0.8)])];
2358        let candidate = vec![make_eval_result("a", vec![("keyword", 0.8)])];
2359        let cmp = EvalComparison::compare(&baseline, &candidate);
2360        assert!(!cmp.has_regressions());
2361        assert_eq!(cmp.ties(), 1);
2362    }
2363
2364    #[test]
2365    fn comparison_skips_unmatched_cases() {
2366        let baseline = vec![make_eval_result("a", vec![("keyword", 0.8)])];
2367        let candidate = vec![make_eval_result("b", vec![("keyword", 0.9)])];
2368        let cmp = EvalComparison::compare(&baseline, &candidate);
2369        assert!(cmp.cases.is_empty());
2370    }
2371
2372    #[test]
2373    fn comparison_mixed_results() {
2374        let baseline = vec![
2375            make_eval_result("a", vec![("k", 0.8), ("t", 0.6)]),
2376            make_eval_result("b", vec![("k", 0.5), ("t", 0.9)]),
2377            make_eval_result("c", vec![("k", 1.0)]),
2378        ];
2379        let candidate = vec![
2380            make_eval_result("a", vec![("k", 0.9), ("t", 0.8)]), // improved
2381            make_eval_result("b", vec![("k", 0.3), ("t", 0.5)]), // regressed
2382            make_eval_result("c", vec![("k", 1.0)]),             // tie
2383        ];
2384        let cmp = EvalComparison::compare(&baseline, &candidate);
2385        assert_eq!(cmp.candidate_wins(), 1);
2386        assert_eq!(cmp.baseline_wins(), 1);
2387        assert_eq!(cmp.ties(), 1);
2388        assert_eq!(cmp.regressions(), vec!["b"]);
2389    }
2390
2391    #[test]
2392    fn comparison_display() {
2393        let baseline = vec![make_eval_result("a", vec![("k", 0.8)])];
2394        let candidate = vec![make_eval_result("a", vec![("k", 0.6)])];
2395        let cmp = EvalComparison::compare(&baseline, &candidate);
2396        let display = format!("{cmp}");
2397        assert!(display.contains("REGRESSED"));
2398        assert!(display.contains("Regressions: a"));
2399    }
2400
2401    #[test]
2402    fn comparison_serializes_to_json() {
2403        let baseline = vec![make_eval_result("a", vec![("k", 0.8)])];
2404        let candidate = vec![make_eval_result("a", vec![("k", 0.9)])];
2405        let cmp = EvalComparison::compare(&baseline, &candidate);
2406        let json = serde_json::to_string(&cmp).unwrap();
2407        assert!(json.contains("\"case_name\":\"a\""));
2408        assert!(json.contains("\"regressed\":false"));
2409        assert_eq!(cmp.candidate_wins(), 1);
2410    }
2411
2412    #[test]
2413    fn comparison_empty_inputs() {
2414        let cmp = EvalComparison::compare(&[], &[]);
2415        assert!(cmp.cases.is_empty());
2416        assert!(!cmp.has_regressions());
2417    }
2418
2419    // -----------------------------------------------------------------------
2420    // avg_score helper tests
2421    // -----------------------------------------------------------------------
2422
2423    #[test]
2424    fn avg_score_empty() {
2425        assert_eq!(avg_score(&[]), 0.0);
2426    }
2427
2428    #[test]
2429    fn avg_score_single() {
2430        let scores = vec![ScorerResult {
2431            scorer: "k".into(),
2432            score: 0.7,
2433            passed: true,
2434            details: vec![],
2435        }];
2436        assert!((avg_score(&scores) - 0.7).abs() < 0.001);
2437    }
2438
2439    #[test]
2440    fn avg_score_multiple() {
2441        let scores = vec![
2442            ScorerResult {
2443                scorer: "k".into(),
2444                score: 0.6,
2445                passed: true,
2446                details: vec![],
2447            },
2448            ScorerResult {
2449                scorer: "t".into(),
2450                score: 0.8,
2451                passed: true,
2452                details: vec![],
2453            },
2454        ];
2455        assert!((avg_score(&scores) - 0.7).abs() < 0.001);
2456    }
2457
2458    // -----------------------------------------------------------------------
2459    // Integration: new scorers with EvalRunner
2460    // -----------------------------------------------------------------------
2461
2462    #[test]
2463    fn runner_with_tool_call_count_scorer() {
2464        let runner = EvalRunner::new().scorer(ToolCallCountScorer::new(5));
2465        let case = EvalCase::new("t", "i");
2466        let tools: Vec<String> = vec!["a".into(), "b".into()];
2467        let result = runner.score_result(&case, "output", &tools, None);
2468        assert!(result.passed);
2469        // 2/5 = 0.4; score = 0.6
2470        assert!((result.scores[0].score - 0.6).abs() < 0.001);
2471    }
2472
2473    #[test]
2474    fn runner_with_safety_scorer() {
2475        let collector = EvalRunner::event_collector();
2476        let runner = EvalRunner::new().scorer(SafetyScorer::new(Arc::clone(&collector)));
2477        let case = EvalCase::new("t", "i");
2478        let result = runner.score_result(&case, "output", &[], None);
2479        assert!(result.passed);
2480        assert_eq!(result.scores[0].score, 1.0);
2481    }
2482
2483    // -----------------------------------------------------------------------
2484    // clear_events tests
2485    // -----------------------------------------------------------------------
2486
2487    #[test]
2488    fn clear_events_resets_collector() {
2489        let collector = EvalRunner::event_collector();
2490        {
2491            let mut events = collector.lock().unwrap();
2492            events.push(make_llm_response_event(None, 0, 0, 1000));
2493            events.push(AgentEvent::GuardrailDenied {
2494                agent: "a".into(),
2495                hook: "post_llm".into(),
2496                reason: "bad".into(),
2497                tool_name: None,
2498            });
2499        }
2500        assert_eq!(collector.lock().unwrap().len(), 2);
2501        clear_events(&collector);
2502        assert!(collector.lock().unwrap().is_empty());
2503    }
2504
2505    #[test]
2506    fn clear_events_fixes_accumulation_between_cases() {
2507        let collector = EvalRunner::event_collector();
2508        let scorer = LatencyScorer::new(Arc::clone(&collector), 1000);
2509        let case = EvalCase::new("t", "i");
2510
2511        // Case 1: 500ms
2512        {
2513            collector
2514                .lock()
2515                .unwrap()
2516                .push(make_llm_response_event(None, 0, 0, 500));
2517        }
2518        let (score1, _) = scorer.score(&case, "", &[]);
2519        assert!((score1 - 0.5).abs() < 0.001);
2520
2521        // Without clearing, case 2 would see 500 + 300 = 800ms
2522        clear_events(&collector);
2523        {
2524            collector
2525                .lock()
2526                .unwrap()
2527                .push(make_llm_response_event(None, 0, 0, 300));
2528        }
2529        let (score2, _) = scorer.score(&case, "", &[]);
2530        // 300/1000 = 0.3; score = 0.7
2531        assert!((score2 - 0.7).abs() < 0.001);
2532    }
2533
    // -----------------------------------------------------------------------
    // Regression tolerance tests
    // -----------------------------------------------------------------------
2537
2538    #[test]
2539    fn comparison_tiny_delta_is_tie() {
2540        // Delta of 0.0005 is below REGRESSION_TOLERANCE (0.001), should be a tie
2541        let baseline = vec![make_eval_result("a", vec![("k", 0.8005)])];
2542        let candidate = vec![make_eval_result("a", vec![("k", 0.8)])];
2543        let cmp = EvalComparison::compare(&baseline, &candidate);
2544        assert!(!cmp.has_regressions());
2545        assert_eq!(cmp.ties(), 1);
2546    }
2547
2548    #[test]
2549    fn comparison_significant_delta_is_regression() {
2550        // Delta of -0.01 is above REGRESSION_TOLERANCE, should regress
2551        let baseline = vec![make_eval_result("a", vec![("k", 0.81)])];
2552        let candidate = vec![make_eval_result("a", vec![("k", 0.8)])];
2553        let cmp = EvalComparison::compare(&baseline, &candidate);
2554        assert!(cmp.has_regressions());
2555        assert_eq!(cmp.regressions(), vec!["a"]);
2556    }
2557
    // -----------------------------------------------------------------------
    // Serde round-trip tests
    // -----------------------------------------------------------------------
2561
2562    #[test]
2563    fn eval_case_serde_round_trip() {
2564        let case = EvalCase::new("greeting", "Say hello")
2565            .expect_tool("bash")
2566            .expect_tool_at("read_file", 1)
2567            .expect_output_contains("hello")
2568            .expect_output_not_contains("goodbye")
2569            .reference_output("Hello there!")
2570            .expect_max_cost_usd(0.05)
2571            .expect_max_latency_ms(5000)
2572            .expect_max_tool_calls(10);
2573
2574        let json = serde_json::to_string(&case).expect("serialize EvalCase");
2575        let parsed: EvalCase = serde_json::from_str(&json).expect("deserialize EvalCase");
2576
2577        assert_eq!(parsed.name, "greeting");
2578        assert_eq!(parsed.input, "Say hello");
2579        assert_eq!(parsed.expected_tools.as_ref().unwrap().len(), 2);
2580        assert_eq!(parsed.expected_tools.as_ref().unwrap()[0].name, "bash");
2581        assert!(parsed.expected_tools.as_ref().unwrap()[0].order.is_none());
2582        assert_eq!(parsed.expected_tools.as_ref().unwrap()[1].order, Some(1));
2583        assert_eq!(parsed.output_contains, vec!["hello"]);
2584        assert_eq!(parsed.output_not_contains, vec!["goodbye"]);
2585        assert_eq!(parsed.reference_output.as_deref(), Some("Hello there!"));
2586        assert_eq!(parsed.max_cost_usd, Some(0.05));
2587        assert_eq!(parsed.max_latency_ms, Some(5000));
2588        assert_eq!(parsed.max_tool_calls, Some(10));
2589    }
2590
2591    #[test]
2592    fn eval_case_deserialize_minimal() {
2593        let json = r#"{"name":"simple","input":"do it"}"#;
2594        let case: EvalCase = serde_json::from_str(json).expect("deserialize minimal");
2595        assert_eq!(case.name, "simple");
2596        assert_eq!(case.input, "do it");
2597        assert!(case.expected_tools.is_none());
2598        assert!(case.output_contains.is_empty());
2599    }
2600
2601    #[test]
2602    fn eval_result_serde_round_trip() {
2603        let result = EvalResult {
2604            case_name: "test-case".into(),
2605            passed: true,
2606            scores: vec![ScorerResult {
2607                scorer: "keyword".into(),
2608                score: 0.85,
2609                passed: true,
2610                details: vec!["OK: found hello".into()],
2611            }],
2612            actual_tools: vec!["bash".into(), "read".into()],
2613            actual_output: "Hello world".into(),
2614            error: None,
2615        };
2616
2617        let json = serde_json::to_string(&result).expect("serialize EvalResult");
2618        let parsed: EvalResult = serde_json::from_str(&json).expect("deserialize EvalResult");
2619
2620        assert_eq!(parsed.case_name, "test-case");
2621        assert!(parsed.passed);
2622        assert_eq!(parsed.scores.len(), 1);
2623        assert_eq!(parsed.scores[0].scorer, "keyword");
2624        assert!((parsed.scores[0].score - 0.85).abs() < f64::EPSILON);
2625        assert_eq!(parsed.actual_tools, vec!["bash", "read"]);
2626        assert_eq!(parsed.actual_output, "Hello world");
2627        assert!(parsed.error.is_none());
2628    }
2629
2630    #[test]
2631    fn eval_summary_serde_round_trip() {
2632        let summary = EvalSummary {
2633            total: 10,
2634            passed: 8,
2635            failed: 1,
2636            errors: 1,
2637            avg_score: 0.9,
2638            scorer_averages: vec![("keyword".into(), 0.95), ("trajectory".into(), 0.85)],
2639        };
2640        let json = serde_json::to_string(&summary).expect("serialize");
2641        let parsed: EvalSummary = serde_json::from_str(&json).expect("deserialize");
2642        assert_eq!(parsed.total, 10);
2643        assert_eq!(parsed.passed, 8);
2644        assert_eq!(parsed.scorer_averages.len(), 2);
2645    }
2646
2647    #[test]
2648    fn eval_comparison_serde_round_trip() {
2649        let cmp = EvalComparison {
2650            cases: vec![CaseComparison {
2651                case_name: "test".into(),
2652                baseline_avg_score: 0.8,
2653                candidate_avg_score: 0.9,
2654                delta: 0.1,
2655                regressed: false,
2656            }],
2657        };
2658        let json = serde_json::to_string(&cmp).expect("serialize");
2659        let parsed: EvalComparison = serde_json::from_str(&json).expect("deserialize");
2660        assert_eq!(parsed.cases.len(), 1);
2661        assert!(!parsed.cases[0].regressed);
2662        assert!((parsed.cases[0].delta - 0.1).abs() < f64::EPSILON);
2663    }
2664}