Skip to main content

ravenclaws/
eval.rs

1//! RavenClaws
2//!
3//! Provides a framework for defining, running, and scoring evaluation tasks
4//! against LLM agents. Captures full run traces for inspection and debugging.
5//!
6//! # Architecture
7//!
8//! ```text
9//! EvalConfig (TOML file)
10//!   └── Vec<EvalTask>
11//!         ├── prompt + golden answer
12//!         ├── assertions (contains, not_contains, regex, exact)
13//!         └── scoring weights
14//!
15//! EvalRunner
16//!   ├── run_task() → EvalResult (with RunTrace)
17//!   └── run_suite() → EvalReport (summary of all results)
18//!
19//! RunTrace
20//!   ├── steps: Vec<TraceStep>
21//!   ├── llm_calls: Vec<LlmCallTrace>
22//!   └── tool_calls: Vec<ToolCallTrace>
23//! ```
24
25use crate::agent::{run_agent_loop, AgentLoopConfig};
26use crate::error::{RavenClawsError, Result};
27use crate::llm::LLMProviderTrait;
28use serde::{Deserialize, Serialize};
29use std::sync::Arc;
30use tracing::{info, instrument, warn};
31
32// ── Configuration ───────────────────────────────────────────────────────────
33
34/// Configuration for an eval suite — loaded from a TOML file
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct EvalConfig {
37    /// Name of this eval suite
38    #[serde(default = "default_suite_name")]
39    pub name: String,
40    /// Description of what this suite tests
41    #[serde(default)]
42    pub description: String,
43    /// System prompt to use for all tasks in this suite
44    #[serde(default = "default_system_prompt")]
45    pub system_prompt: String,
46    /// Maximum iterations per task
47    #[serde(default = "default_max_iterations")]
48    pub max_iterations: usize,
49    /// List of eval tasks to run
50    #[serde(default)]
51    pub tasks: Vec<EvalTask>,
52}
53
/// Serde default for [`EvalConfig::name`]: the placeholder used by unnamed suites.
fn default_suite_name() -> String {
    String::from("unnamed")
}
57
/// Serde default for [`EvalConfig::system_prompt`].
fn default_system_prompt() -> String {
    String::from("You are a helpful assistant. Be concise and accurate.")
}
61
/// Serde default for [`EvalConfig::max_iterations`].
fn default_max_iterations() -> usize {
    const DEFAULT_MAX_ITERATIONS: usize = 5;
    DEFAULT_MAX_ITERATIONS
}
65
66/// A single eval task with prompt, golden answer, and assertions
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct EvalTask {
69    /// Name of this task (used in reports)
70    pub name: String,
71    /// Description of what this task tests
72    #[serde(default)]
73    pub description: String,
74    /// The prompt to send to the agent
75    pub prompt: String,
76    /// Expected golden answer (used for exact match scoring)
77    #[serde(default)]
78    pub golden: String,
79    /// List of assertions to check against the response
80    #[serde(default)]
81    pub assertions: Vec<Assertion>,
82    /// Weight of this task in the overall score (0.0 - 1.0)
83    #[serde(default = "default_weight")]
84    pub weight: f64,
85    /// Whether this task is required to pass (fails the suite if not)
86    #[serde(default)]
87    pub required: bool,
88}
89
/// Serde default for [`EvalTask::weight`]: full weight.
fn default_weight() -> f64 {
    1.0_f64
}
93
94/// Types of assertions that can be checked against a response
95#[derive(Debug, Clone, Serialize, Deserialize)]
96#[serde(tag = "type", content = "value")]
97pub enum Assertion {
98    /// Response must contain this substring
99    #[serde(rename = "contains")]
100    Contains(String),
101    /// Response must NOT contain this substring
102    #[serde(rename = "not_contains")]
103    NotContains(String),
104    /// Response must exactly match this string
105    #[serde(rename = "exact")]
106    Exact(String),
107    /// Response must match this regex pattern
108    #[serde(rename = "regex")]
109    Regex(String),
110    /// Response must be non-empty
111    #[serde(rename = "non_empty")]
112    NonEmpty,
113    /// Response length must be at least N characters
114    #[serde(rename = "min_length")]
115    MinLength(usize),
116    /// Response length must be at most N characters
117    #[serde(rename = "max_length")]
118    MaxLength(usize),
119    /// A tool with this name must have been called during execution (v0.9.6)
120    #[serde(rename = "tool_called")]
121    ToolCalled(String),
122    /// A tool with this name must NOT have been called during execution (v0.9.6)
123    #[serde(rename = "tool_not_called")]
124    ToolNotCalled(String),
125}
126
127// ── Run Trace ───────────────────────────────────────────────────────────────
128
129/// Full trace of a single agent run — captures every step for inspection
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct RunTrace {
132    /// Task name
133    pub task_name: String,
134    /// When the run started (ISO 8601)
135    pub started_at: String,
136    /// When the run ended (ISO 8601)
137    pub ended_at: String,
138    /// Duration in milliseconds
139    pub duration_ms: u64,
140    /// Number of iterations used
141    pub iterations: usize,
142    /// All steps in chronological order
143    pub steps: Vec<TraceStep>,
144    /// LLM calls made during the run
145    pub llm_calls: Vec<LlmCallTrace>,
146    /// Tool calls made during the run
147    pub tool_calls: Vec<ToolCallTrace>,
148    /// Final response from the agent
149    pub final_response: String,
150}
151
152/// A single step in the agent loop
153#[derive(Debug, Clone, Serialize, Deserialize)]
154pub struct TraceStep {
155    /// Step number (0-based)
156    pub number: usize,
157    /// Type of step
158    pub step_type: StepType,
159    /// Content of the step (LLM response, tool result, etc.)
160    pub content: String,
161    /// Duration of this step in milliseconds
162    pub duration_ms: u64,
163}
164
165/// Type of a trace step
166#[derive(Debug, Clone, Serialize, Deserialize)]
167pub enum StepType {
168    /// LLM thought/response
169    Thought,
170    /// Tool call
171    ToolCall,
172    /// Tool result/observation
173    Observation,
174    /// Final answer
175    Final,
176    /// Error
177    Error,
178}
179
180/// Trace of a single LLM call
181#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct LlmCallTrace {
183    /// Iteration number
184    pub iteration: usize,
185    /// Provider name
186    pub provider: String,
187    /// Model name
188    pub model: String,
189    /// Prompt tokens (if available)
190    pub prompt_tokens: Option<u32>,
191    /// Completion tokens (if available)
192    pub completion_tokens: Option<u32>,
193    /// Duration in milliseconds
194    pub duration_ms: u64,
195    /// Response content (truncated to 1000 chars for storage)
196    pub response_preview: String,
197}
198
199/// Trace of a single tool call
200#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct ToolCallTrace {
202    /// Iteration number
203    pub iteration: usize,
204    /// Tool name
205    pub tool_name: String,
206    /// Arguments (JSON)
207    pub arguments: serde_json::Value,
208    /// Whether the tool succeeded
209    pub success: bool,
210    /// Output preview (truncated to 500 chars)
211    pub output_preview: String,
212    /// Duration in milliseconds
213    pub duration_ms: u64,
214}
215
216// ── Results ─────────────────────────────────────────────────────────────────
217
218/// Result of a single eval task
219#[derive(Debug, Clone, Serialize, Deserialize)]
220pub struct EvalResult {
221    /// Task name
222    pub task_name: String,
223    /// Whether the task passed all assertions
224    pub passed: bool,
225    /// Score (0.0 - 1.0)
226    pub score: f64,
227    /// Number of assertions that passed
228    pub assertions_passed: usize,
229    /// Number of assertions that failed
230    pub assertions_failed: usize,
231    /// Details of each assertion check
232    pub assertion_results: Vec<AssertionResult>,
233    /// Full run trace for inspection
234    pub trace: RunTrace,
235    /// Error message if the task failed to run
236    pub error: Option<String>,
237}
238
239/// Result of a single assertion check
240#[derive(Debug, Clone, Serialize, Deserialize)]
241pub struct AssertionResult {
242    /// The assertion that was checked
243    pub assertion: String,
244    /// Whether it passed
245    pub passed: bool,
246    /// Details about the check
247    pub details: String,
248}
249
250/// Summary report of an entire eval suite run
251#[derive(Debug, Clone, Serialize, Deserialize)]
252pub struct EvalReport {
253    /// Suite name
254    pub suite_name: String,
255    /// When the suite was run (ISO 8601)
256    pub ran_at: String,
257    /// Duration in milliseconds
258    pub duration_ms: u64,
259    /// Overall score (0.0 - 1.0)
260    pub overall_score: f64,
261    /// Number of tasks
262    pub total_tasks: usize,
263    /// Number of tasks that passed
264    pub passed_tasks: usize,
265    /// Number of tasks that failed
266    pub failed_tasks: usize,
267    /// Individual task results
268    pub results: Vec<EvalResult>,
269}
270
271// ── Eval Runner ─────────────────────────────────────────────────────────────
272
273/// Runs eval tasks against an LLM provider and captures traces
274pub struct EvalRunner {
275    /// The LLM provider to test
276    llm: Arc<dyn LLMProviderTrait>,
277    /// Eval configuration
278    config: EvalConfig,
279}
280
281impl EvalRunner {
282    /// Create a new eval runner
283    pub fn new(llm: Arc<dyn LLMProviderTrait>, config: EvalConfig) -> Self {
284        Self { llm, config }
285    }
286
287    /// Run the full eval suite and return a report
288    #[instrument(skip(self), fields(suite = %self.config.name, task_count = self.config.tasks.len()))]
289    pub async fn run_suite(&self) -> EvalReport {
290        let started_at = chrono::Utc::now().to_rfc3339();
291        let suite_start = std::time::Instant::now();
292        let mut results = Vec::with_capacity(self.config.tasks.len());
293
294        info!(
295            suite = %self.config.name,
296            task_count = self.config.tasks.len(),
297            "Starting eval suite"
298        );
299
300        for task in &self.config.tasks {
301            let result = self.run_task(task).await;
302            let passed = result.passed;
303            let name = &result.task_name;
304
305            if passed {
306                info!(task = %name, score = result.score, "Eval task passed");
307            } else {
308                warn!(
309                    task = %name,
310                    score = result.score,
311                    passed = result.assertions_passed,
312                    failed = result.assertions_failed,
313                    "Eval task failed"
314                );
315            }
316
317            results.push(result);
318        }
319
320        let duration_ms = suite_start.elapsed().as_millis() as u64;
321        let total_tasks = results.len();
322        let passed_tasks = results.iter().filter(|r| r.passed).count();
323        let failed_tasks = total_tasks - passed_tasks;
324        let overall_score = if total_tasks > 0 {
325            results
326                .iter()
327                .map(|r| r.score * r.trace.iterations as f64)
328                .sum::<f64>()
329                / results
330                    .iter()
331                    .map(|r| r.trace.iterations as f64)
332                    .sum::<f64>()
333        } else {
334            0.0
335        };
336
337        info!(
338            suite = %self.config.name,
339            passed = passed_tasks,
340            failed = failed_tasks,
341            overall_score = overall_score,
342            duration_ms = duration_ms,
343            "Eval suite completed"
344        );
345
346        EvalReport {
347            suite_name: self.config.name.clone(),
348            ran_at: started_at,
349            duration_ms,
350            overall_score,
351            total_tasks,
352            passed_tasks,
353            failed_tasks,
354            results,
355        }
356    }
357
358    /// Run a single eval task and return the result with trace
359    ///
360    /// Uses the full agent loop (`run_agent_loop`) instead of a single LLM call,
361    /// so eval tasks exercise the complete ReAct loop with tool use, security
362    /// checks, and iteration limits.
363    #[instrument(skip(self), fields(task = %task.name))]
364    async fn run_task(&self, task: &EvalTask) -> EvalResult {
365        let task_start = std::time::Instant::now();
366        let started_at = chrono::Utc::now().to_rfc3339();
367
368        // Build agent loop config from suite settings
369        let agent_config = AgentLoopConfig {
370            max_iterations: self.config.max_iterations,
371            enable_tools: true,
372            require_approval: false,
373            prompt_injection_protection: true,
374            token_lifetime_secs: 0,
375            no_final_required: false,
376            fallback_chain: None,
377            token_budget: None,
378            ravenfabric: None,
379            checkpoint_dir: None,
380            session_id: None,
381            metrics_callback: None,
382            load_manager: None,
383            retry_config: None,
384        };
385
386        // Run the full agent loop (ReAct + tools + security)
387        let result = run_agent_loop(
388            self.llm.clone(),
389            &task.prompt,
390            &self.config.system_prompt,
391            agent_config,
392        )
393        .await;
394
395        let duration_ms = task_start.elapsed().as_millis() as u64;
396
397        match result {
398            Ok(final_response) => {
399                let trace = RunTrace {
400                    task_name: task.name.clone(),
401                    started_at,
402                    ended_at: chrono::Utc::now().to_rfc3339(),
403                    duration_ms,
404                    iterations: self.config.max_iterations, // best-effort; agent loop doesn't expose exact count
405                    steps: vec![TraceStep {
406                        number: 0,
407                        step_type: StepType::Final,
408                        content: final_response.clone(),
409                        duration_ms,
410                    }],
411                    llm_calls: Vec::new(), // agent loop doesn't expose per-call traces
412                    tool_calls: Vec::new(), // agent loop doesn't expose per-call traces
413                    final_response: final_response.clone(),
414                };
415
416                // Run assertions against the final response
417                let (assertion_results, assertions_passed, assertions_failed) =
418                    check_assertions(&final_response, &task.assertions, Some(&trace));
419
420                // Calculate score
421                let score = if task.assertions.is_empty() {
422                    if final_response.is_empty() || final_response.len() < 10 {
423                        0.0
424                    } else {
425                        1.0
426                    }
427                } else if task.assertions.len() == assertions_passed + assertions_failed {
428                    assertions_passed as f64 / task.assertions.len() as f64
429                } else {
430                    0.0
431                };
432
433                let passed = assertions_failed == 0 && !final_response.is_empty();
434
435                EvalResult {
436                    task_name: task.name.clone(),
437                    passed,
438                    score,
439                    assertions_passed,
440                    assertions_failed,
441                    assertion_results,
442                    trace,
443                    error: None,
444                }
445            }
446            Err(e) => {
447                let trace = RunTrace {
448                    task_name: task.name.clone(),
449                    started_at,
450                    ended_at: chrono::Utc::now().to_rfc3339(),
451                    duration_ms,
452                    iterations: 0,
453                    steps: vec![TraceStep {
454                        number: 0,
455                        step_type: StepType::Error,
456                        content: format!("Agent loop failed: {}", e),
457                        duration_ms,
458                    }],
459                    llm_calls: Vec::new(),
460                    tool_calls: Vec::new(),
461                    final_response: String::new(),
462                };
463
464                EvalResult {
465                    task_name: task.name.clone(),
466                    passed: false,
467                    score: 0.0,
468                    assertions_passed: 0,
469                    assertions_failed: 1,
470                    assertion_results: vec![AssertionResult {
471                        assertion: "agent_loop".to_string(),
472                        passed: false,
473                        details: format!("Agent loop failed: {}", e),
474                    }],
475                    trace,
476                    error: Some(e.to_string()),
477                }
478            }
479        }
480    }
481}
482
483// ── Assertion Checking ──────────────────────────────────────────────────────
484
485/// Check all assertions against a response string
486fn check_assertions(
487    response: &str,
488    assertions: &[Assertion],
489    run_trace: Option<&RunTrace>,
490) -> (Vec<AssertionResult>, usize, usize) {
491    let mut results = Vec::with_capacity(assertions.len());
492    let mut passed = 0;
493    let mut failed = 0;
494
495    for assertion in assertions {
496        let result = check_single_assertion(response, assertion, run_trace);
497        if result.passed {
498            passed += 1;
499        } else {
500            failed += 1;
501        }
502        results.push(result);
503    }
504
505    (results, passed, failed)
506}
507
508/// Check a single assertion against a response
509fn check_single_assertion(
510    response: &str,
511    assertion: &Assertion,
512    run_trace: Option<&RunTrace>,
513) -> AssertionResult {
514    match assertion {
515        Assertion::Contains(pattern) => {
516            let passed = response.contains(pattern);
517            AssertionResult {
518                assertion: format!("contains: {}", pattern),
519                passed,
520                details: if passed {
521                    format!("Response contains '{}'", pattern)
522                } else {
523                    format!("Response does not contain '{}'", pattern)
524                },
525            }
526        }
527        Assertion::NotContains(pattern) => {
528            let passed = !response.contains(pattern);
529            AssertionResult {
530                assertion: format!("not_contains: {}", pattern),
531                passed,
532                details: if passed {
533                    format!("Response does not contain '{}'", pattern)
534                } else {
535                    format!("Response contains '{}'", pattern)
536                },
537            }
538        }
539        Assertion::Exact(expected) => {
540            let trimmed_response = response.trim();
541            let passed = trimmed_response == expected.as_str();
542            AssertionResult {
543                assertion: format!("exact: {}", expected),
544                passed,
545                details: if passed {
546                    "Response matches exactly".to_string()
547                } else {
548                    format!(
549                        "Expected '{}', got '{}'",
550                        expected,
551                        trimmed_response.chars().take(100).collect::<String>()
552                    )
553                },
554            }
555        }
556        Assertion::Regex(pattern) => {
557            let re = regex_lite::Regex::new(pattern);
558            match re {
559                Ok(re) => {
560                    let passed = re.is_match(response);
561                    AssertionResult {
562                        assertion: format!("regex: {}", pattern),
563                        passed,
564                        details: if passed {
565                            format!("Response matches pattern '{}'", pattern)
566                        } else {
567                            format!("Response does not match pattern '{}'", pattern)
568                        },
569                    }
570                }
571                Err(e) => AssertionResult {
572                    assertion: format!("regex: {}", pattern),
573                    passed: false,
574                    details: format!("Invalid regex pattern: {}", e),
575                },
576            }
577        }
578        Assertion::NonEmpty => {
579            let passed = !response.is_empty();
580            AssertionResult {
581                assertion: "non_empty".to_string(),
582                passed,
583                details: if passed {
584                    format!("Response is non-empty ({} chars)", response.len())
585                } else {
586                    "Response is empty".to_string()
587                },
588            }
589        }
590        Assertion::MinLength(min) => {
591            let passed = response.len() >= *min;
592            AssertionResult {
593                assertion: format!("min_length: {}", min),
594                passed,
595                details: if passed {
596                    format!("Response length {} >= {}", response.len(), min)
597                } else {
598                    format!("Response length {} < {}", response.len(), min)
599                },
600            }
601        }
602        Assertion::MaxLength(max) => {
603            let passed = response.len() <= *max;
604            AssertionResult {
605                assertion: format!("max_length: {}", max),
606                passed,
607                details: if passed {
608                    format!("Response length {} <= {}", response.len(), max)
609                } else {
610                    format!("Response length {} > {}", response.len(), max)
611                },
612            }
613        }
614        Assertion::ToolCalled(tool_name) => {
615            let tool_calls = run_trace
616                .map(|t| &t.tool_calls)
617                .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
618            let passed = tool_calls.is_some();
619            AssertionResult {
620                assertion: format!("tool_called: {}", tool_name),
621                passed,
622                details: if passed {
623                    format!("Tool '{}' was called", tool_name)
624                } else {
625                    let all_tools: Vec<&str> = run_trace
626                        .map(|t| {
627                            t.tool_calls
628                                .iter()
629                                .map(|tc| tc.tool_name.as_str())
630                                .collect()
631                        })
632                        .unwrap_or_default();
633                    if all_tools.is_empty() {
634                        format!("Tool '{}' was not called (no tools were called)", tool_name)
635                    } else {
636                        format!(
637                            "Tool '{}' was not called (called: {})",
638                            tool_name,
639                            all_tools.join(", ")
640                        )
641                    }
642                },
643            }
644        }
645        Assertion::ToolNotCalled(tool_name) => {
646            let tool_calls = run_trace
647                .map(|t| &t.tool_calls)
648                .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
649            let passed = tool_calls.is_none();
650            AssertionResult {
651                assertion: format!("tool_not_called: {}", tool_name),
652                passed,
653                details: if passed {
654                    format!("Tool '{}' was not called", tool_name)
655                } else {
656                    format!("Tool '{}' was called but should not have been", tool_name)
657                },
658            }
659        }
660    }
661}
662
663// ── Report Formatting ───────────────────────────────────────────────────────
664
665impl EvalReport {
666    /// Format the report as a human-readable string
667    pub fn format_text(&self) -> String {
668        let mut output = String::new();
669
670        output.push_str(&format!("\n🐦‍⬛ Eval Report: {}\n", self.suite_name));
671        output.push_str(&format!("{:-^60}\n", ""));
672        output.push_str(&format!(
673            "Ran at:       {}\n",
674            &self.ran_at[..19].replace('T', " ")
675        ));
676        output.push_str(&format!("Duration:     {} ms\n", self.duration_ms));
677        output.push_str(&format!(
678            "Overall score: {:.1}%\n",
679            self.overall_score * 100.0
680        ));
681        output.push_str(&format!(
682            "Tasks:        {}/{} passed\n",
683            self.passed_tasks, self.total_tasks
684        ));
685        output.push_str(&format!("{:-^60}\n", ""));
686
687        for result in &self.results {
688            output.push_str(&format!(
689                "\n  {} {} — {:.1}%\n",
690                if result.passed { "✅" } else { "❌" },
691                result.task_name,
692                result.score * 100.0
693            ));
694
695            if let Some(ref error) = result.error {
696                output.push_str(&format!("    Error: {}\n", error));
697            }
698
699            if !result.assertion_results.is_empty() {
700                for ar in &result.assertion_results {
701                    output.push_str(&format!(
702                        "    {} {}\n",
703                        if ar.passed { "  ✅" } else { "  ❌" },
704                        ar.details
705                    ));
706                }
707            }
708
709            // Show trace summary
710            let trace = &result.trace;
711            output.push_str(&format!(
712                "    Iterations: {} · LLM calls: {} · Tool calls: {} · Duration: {} ms\n",
713                trace.iterations,
714                trace.llm_calls.len(),
715                trace.tool_calls.len(),
716                trace.duration_ms
717            ));
718
719            // Show response preview
720            let preview: String = trace.final_response.chars().take(200).collect();
721            if !preview.is_empty() {
722                output.push_str(&format!("    Response: {}\n", preview));
723            }
724        }
725
726        output
727    }
728
729    /// Format the report as JSON
730    pub fn format_json(&self) -> serde_json::Value {
731        serde_json::to_value(self).unwrap_or(serde_json::json!({"error": "serialization failed"}))
732    }
733}
734
735// ── Config Loading ──────────────────────────────────────────────────────────
736
737impl EvalConfig {
738    /// Load eval config from a TOML file
739    pub fn from_file(path: &str) -> Result<Self> {
740        let content = std::fs::read_to_string(path).map_err(|e| {
741            RavenClawsError::CommandExecution(format!("Failed to read eval config: {}", e))
742        })?;
743
744        if content.trim().is_empty() {
745            return Err(RavenClawsError::CommandExecution(format!(
746                "Eval config file '{}' is empty — no tasks to run",
747                path
748            )));
749        }
750
751        let config: EvalConfig = toml::from_str(&content).map_err(|e| {
752            RavenClawsError::CommandExecution(format!("Failed to parse eval config: {}", e))
753        })?;
754
755        if config.tasks.is_empty() {
756            return Err(RavenClawsError::CommandExecution(format!(
757                "Eval config file '{}' has no tasks defined",
758                path
759            )));
760        }
761
762        Ok(config)
763    }
764}
765
766// ── Tests ───────────────────────────────────────────────────────────────────
767
768#[cfg(test)]
769mod tests {
770    use super::*;
771
772    #[test]
773    fn test_assertion_contains_pass() {
774        let result = check_single_assertion(
775            "hello world",
776            &Assertion::Contains("world".to_string()),
777            None,
778        );
779        assert!(result.passed);
780        assert!(result.details.contains("contains"));
781    }
782
783    #[test]
784    fn test_assertion_contains_fail() {
785        let result =
786            check_single_assertion("hello world", &Assertion::Contains("foo".to_string()), None);
787        assert!(!result.passed);
788    }
789
790    #[test]
791    fn test_assertion_not_contains_pass() {
792        let result = check_single_assertion(
793            "hello world",
794            &Assertion::NotContains("foo".to_string()),
795            None,
796        );
797        assert!(result.passed);
798    }
799
800    #[test]
801    fn test_assertion_not_contains_fail() {
802        let result = check_single_assertion(
803            "hello world",
804            &Assertion::NotContains("world".to_string()),
805            None,
806        );
807        assert!(!result.passed);
808    }
809
810    #[test]
811    fn test_assertion_exact_pass() {
812        let result = check_single_assertion("hello", &Assertion::Exact("hello".to_string()), None);
813        assert!(result.passed);
814    }
815
816    #[test]
817    fn test_assertion_exact_fail() {
818        let result = check_single_assertion("world", &Assertion::Exact("hello".to_string()), None);
819        assert!(!result.passed);
820    }
821
822    #[test]
823    fn test_assertion_regex_pass() {
824        let result =
825            check_single_assertion("hello 123", &Assertion::Regex(r"\d+".to_string()), None);
826        assert!(result.passed);
827    }
828
829    #[test]
830    fn test_assertion_regex_fail() {
831        let result = check_single_assertion("hello", &Assertion::Regex(r"\d+".to_string()), None);
832        assert!(!result.passed);
833    }
834
835    #[test]
836    fn test_assertion_non_empty_pass() {
837        let result = check_single_assertion("hello", &Assertion::NonEmpty, None);
838        assert!(result.passed);
839    }
840
841    #[test]
842    fn test_assertion_non_empty_fail() {
843        let result = check_single_assertion("", &Assertion::NonEmpty, None);
844        assert!(!result.passed);
845    }
846
847    #[test]
848    fn test_assertion_min_length_pass() {
849        let result = check_single_assertion("hello", &Assertion::MinLength(3), None);
850        assert!(result.passed);
851    }
852
853    #[test]
854    fn test_assertion_min_length_fail() {
855        let result = check_single_assertion("hi", &Assertion::MinLength(5), None);
856        assert!(!result.passed);
857    }
858
859    #[test]
860    fn test_assertion_max_length_pass() {
861        let result = check_single_assertion("hi", &Assertion::MaxLength(5), None);
862        assert!(result.passed);
863    }
864
865    #[test]
866    fn test_assertion_max_length_fail() {
867        let result = check_single_assertion("hello world", &Assertion::MaxLength(5), None);
868        assert!(!result.passed);
869    }
870
871    #[test]
872    fn test_check_assertions_empty() {
873        let (results, passed, failed) = check_assertions("hello", &[], None);
874        assert!(results.is_empty());
875        assert_eq!(passed, 0);
876        assert_eq!(failed, 0);
877    }
878
879    #[test]
880    fn test_check_assertions_multiple() {
881        let assertions = vec![
882            Assertion::Contains("hello".to_string()),
883            Assertion::Contains("world".to_string()),
884            Assertion::NonEmpty,
885        ];
886        let (results, passed, failed) = check_assertions("hello world", &assertions, None);
887        assert_eq!(passed, 3);
888        assert_eq!(failed, 0);
889        assert_eq!(results.len(), 3);
890    }
891
892    #[test]
893    fn test_check_assertions_tool_called() {
894        let trace = RunTrace {
895            task_name: "test".to_string(),
896            started_at: "2026-01-01T00:00:00Z".to_string(),
897            ended_at: "2026-01-01T00:00:01Z".to_string(),
898            duration_ms: 1000,
899            iterations: 1,
900            steps: vec![],
901            llm_calls: vec![],
902            tool_calls: vec![
903                ToolCallTrace {
904                    iteration: 0,
905                    tool_name: "web_search".to_string(),
906                    arguments: serde_json::json!({"query": "test"}),
907                    success: true,
908                    output_preview: "results".to_string(),
909                    duration_ms: 100,
910                },
911                ToolCallTrace {
912                    iteration: 0,
913                    tool_name: "read_file".to_string(),
914                    arguments: serde_json::json!({"path": "/tmp/test"}),
915                    success: true,
916                    output_preview: "content".to_string(),
917                    duration_ms: 50,
918                },
919            ],
920            final_response: "response".to_string(),
921        };
922
923        // ToolCalled — should pass
924        let (results, passed, failed) = check_assertions(
925            "response",
926            &[Assertion::ToolCalled("web_search".to_string())],
927            Some(&trace),
928        );
929        assert_eq!(passed, 1);
930        assert_eq!(failed, 0);
931        assert!(results[0].passed);
932
933        // ToolCalled — should fail (tool not called)
934        let (results, passed, failed) = check_assertions(
935            "response",
936            &[Assertion::ToolCalled("nonexistent".to_string())],
937            Some(&trace),
938        );
939        assert_eq!(passed, 0);
940        assert_eq!(failed, 1);
941        assert!(!results[0].passed);
942
943        // ToolNotCalled — should pass (tool not in list)
944        let (results, passed, failed) = check_assertions(
945            "response",
946            &[Assertion::ToolNotCalled("nonexistent".to_string())],
947            Some(&trace),
948        );
949        assert_eq!(passed, 1);
950        assert_eq!(failed, 0);
951        assert!(results[0].passed);
952
953        // ToolNotCalled — should fail (tool was called)
954        let (results, passed, failed) = check_assertions(
955            "response",
956            &[Assertion::ToolNotCalled("web_search".to_string())],
957            Some(&trace),
958        );
959        assert_eq!(passed, 0);
960        assert_eq!(failed, 1);
961        assert!(!results[0].passed);
962
963        // ToolCalled with no trace — should fail
964        let (results, passed, failed) = check_assertions(
965            "response",
966            &[Assertion::ToolCalled("web_search".to_string())],
967            None,
968        );
969        assert_eq!(passed, 0);
970        assert_eq!(failed, 1);
971        assert!(!results[0].passed);
972    }
973
974    #[test]
975    fn test_eval_config_from_toml() {
976        let toml_str = r#"
977name = "test-suite"
978description = "A test suite"
979system_prompt = "Be concise"
980max_iterations = 3
981
982[[tasks]]
983name = "test-1"
984prompt = "What is 2+2?"
985golden = "4"
986assertions = [{ type = "contains", value = "4" }]
987weight = 1.0
988required = true
989"#;
990
991        let config: EvalConfig = toml::from_str(toml_str).unwrap();
992        assert_eq!(config.name, "test-suite");
993        assert_eq!(config.tasks.len(), 1);
994        assert_eq!(config.tasks[0].name, "test-1");
995        assert_eq!(config.tasks[0].prompt, "What is 2+2?");
996        assert_eq!(config.tasks[0].golden, "4");
997        assert_eq!(config.tasks[0].assertions.len(), 1);
998    }
999
1000    #[test]
1001    fn test_eval_config_defaults() {
1002        let toml_str = r#"
1003[[tasks]]
1004name = "simple"
1005prompt = "Say hello"
1006"#;
1007
1008        let config: EvalConfig = toml::from_str(toml_str).unwrap();
1009        assert_eq!(config.name, "unnamed");
1010        assert_eq!(config.system_prompt, default_system_prompt());
1011        assert_eq!(config.max_iterations, 5);
1012        assert_eq!(config.tasks[0].weight, 1.0);
1013        assert!(!config.tasks[0].required);
1014    }
1015
1016    #[test]
1017    fn test_report_format_text() {
1018        let report = EvalReport {
1019            suite_name: "test".to_string(),
1020            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
1021            duration_ms: 100,
1022            overall_score: 0.75,
1023            total_tasks: 2,
1024            passed_tasks: 1,
1025            failed_tasks: 1,
1026            results: vec![
1027                EvalResult {
1028                    task_name: "pass-task".to_string(),
1029                    passed: true,
1030                    score: 1.0,
1031                    assertions_passed: 2,
1032                    assertions_failed: 0,
1033                    assertion_results: vec![AssertionResult {
1034                        assertion: "contains: hello".to_string(),
1035                        passed: true,
1036                        details: "Response contains 'hello'".to_string(),
1037                    }],
1038                    trace: RunTrace {
1039                        task_name: "pass-task".to_string(),
1040                        started_at: "2026-06-22T12:00:00+00:00".to_string(),
1041                        ended_at: "2026-06-22T12:00:01+00:00".to_string(),
1042                        duration_ms: 50,
1043                        iterations: 1,
1044                        steps: vec![],
1045                        llm_calls: vec![],
1046                        tool_calls: vec![],
1047                        final_response: "hello world".to_string(),
1048                    },
1049                    error: None,
1050                },
1051                EvalResult {
1052                    task_name: "fail-task".to_string(),
1053                    passed: false,
1054                    score: 0.0,
1055                    assertions_passed: 0,
1056                    assertions_failed: 1,
1057                    assertion_results: vec![AssertionResult {
1058                        assertion: "contains: foo".to_string(),
1059                        passed: false,
1060                        details: "Response does not contain 'foo'".to_string(),
1061                    }],
1062                    trace: RunTrace {
1063                        task_name: "fail-task".to_string(),
1064                        started_at: "2026-06-22T12:00:01+00:00".to_string(),
1065                        ended_at: "2026-06-22T12:00:02+00:00".to_string(),
1066                        duration_ms: 50,
1067                        iterations: 1,
1068                        steps: vec![],
1069                        llm_calls: vec![],
1070                        tool_calls: vec![],
1071                        final_response: "bar".to_string(),
1072                    },
1073                    error: None,
1074                },
1075            ],
1076        };
1077
1078        let text = report.format_text();
1079        assert!(text.contains("Eval Report: test"));
1080        assert!(text.contains("75.0%"));
1081        assert!(text.contains("1/2 passed"));
1082        assert!(text.contains("✅ pass-task"));
1083        assert!(text.contains("❌ fail-task"));
1084    }
1085
1086    #[test]
1087    fn test_report_format_json() {
1088        let report = EvalReport {
1089            suite_name: "test".to_string(),
1090            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
1091            duration_ms: 100,
1092            overall_score: 1.0,
1093            total_tasks: 1,
1094            passed_tasks: 1,
1095            failed_tasks: 0,
1096            results: vec![],
1097        };
1098
1099        let json = report.format_json();
1100        assert_eq!(json["suite_name"], "test");
1101        assert_eq!(json["overall_score"], 1.0);
1102    }
1103
1104    #[test]
1105    fn test_eval_config_from_file_not_found() {
1106        let result = EvalConfig::from_file("/tmp/nonexistent-eval-config.toml");
1107        assert!(result.is_err());
1108    }
1109
1110    #[test]
1111    fn test_assertion_regex_invalid_pattern() {
1112        let result =
1113            check_single_assertion("hello", &Assertion::Regex(r"[invalid".to_string()), None);
1114        assert!(!result.passed);
1115        assert!(result.details.contains("Invalid regex"));
1116    }
1117
1118    #[test]
1119    fn test_trace_step_serialization() {
1120        let step = TraceStep {
1121            number: 0,
1122            step_type: StepType::Thought,
1123            content: "test".to_string(),
1124            duration_ms: 100,
1125        };
1126        let json = serde_json::to_string(&step).unwrap();
1127        assert!(json.contains("Thought"));
1128    }
1129
1130    #[test]
1131    fn test_tool_call_trace_serialization() {
1132        let trace = ToolCallTrace {
1133            iteration: 0,
1134            tool_name: "shell_exec".to_string(),
1135            arguments: serde_json::json!({"command": "echo hello"}),
1136            success: true,
1137            output_preview: "hello".to_string(),
1138            duration_ms: 50,
1139        };
1140        let json = serde_json::to_string(&trace).unwrap();
1141        assert!(json.contains("shell_exec"));
1142        assert!(json.contains("echo hello"));
1143    }
1144}