Skip to main content

ravenclaws/
eval.rs

1//! RavenClaws
2//!
3//! Provides a framework for defining, running, and scoring evaluation tasks
4//! against LLM agents. Captures full run traces for inspection and debugging.
5//!
6//! # Architecture
7//!
8//! ```text
9//! EvalConfig (TOML file)
10//!   └── Vec<EvalTask>
11//!         ├── prompt + golden answer
12//!         ├── assertions (contains, not_contains, regex, exact)
13//!         └── scoring weights
14//!
15//! EvalRunner
16//!   ├── run_task() → EvalResult (with RunTrace)
17//!   └── run_suite() → EvalReport (summary of all results)
18//!
19//! RunTrace
20//!   ├── steps: Vec<TraceStep>
21//!   ├── llm_calls: Vec<LlmCallTrace>
22//!   └── tool_calls: Vec<ToolCallTrace>
23//! ```
24
25use crate::agent::{run_agent_loop, AgentLoopConfig};
26use crate::error::{RavenClawsError, Result};
27use crate::llm::LLMProviderTrait;
28use serde::{Deserialize, Serialize};
29use std::sync::Arc;
30use tracing::{info, instrument, warn};
31
32// ── Configuration ───────────────────────────────────────────────────────────
33
34/// Configuration for an eval suite — loaded from a TOML file
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct EvalConfig {
37    /// Name of this eval suite
38    #[serde(default = "default_suite_name")]
39    pub name: String,
40    /// Description of what this suite tests
41    #[serde(default)]
42    pub description: String,
43    /// System prompt to use for all tasks in this suite
44    #[serde(default = "default_system_prompt")]
45    pub system_prompt: String,
46    /// Maximum iterations per task
47    #[serde(default = "default_max_iterations")]
48    pub max_iterations: usize,
49    /// List of eval tasks to run
50    #[serde(default)]
51    pub tasks: Vec<EvalTask>,
52}
53
/// Serde fallback for [`EvalConfig::name`] when the key is absent.
fn default_suite_name() -> String {
    String::from("unnamed")
}
57
/// Serde fallback for [`EvalConfig::system_prompt`] when the key is absent.
fn default_system_prompt() -> String {
    String::from("You are a helpful assistant. Be concise and accurate.")
}
61
/// Serde fallback for [`EvalConfig::max_iterations`] when the key is absent.
fn default_max_iterations() -> usize {
    const FALLBACK_ITERATIONS: usize = 5;
    FALLBACK_ITERATIONS
}
65
66/// A single eval task with prompt, golden answer, and assertions
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct EvalTask {
69    /// Name of this task (used in reports)
70    pub name: String,
71    /// Description of what this task tests
72    #[serde(default)]
73    pub description: String,
74    /// The prompt to send to the agent
75    pub prompt: String,
76    /// Expected golden answer (used for exact match scoring)
77    #[serde(default)]
78    pub golden: String,
79    /// List of assertions to check against the response
80    #[serde(default)]
81    pub assertions: Vec<Assertion>,
82    /// Weight of this task in the overall score (0.0 - 1.0)
83    #[serde(default = "default_weight")]
84    pub weight: f64,
85    /// Whether this task is required to pass (fails the suite if not)
86    #[serde(default)]
87    pub required: bool,
88}
89
/// Serde fallback for [`EvalTask::weight`] when the key is absent.
fn default_weight() -> f64 {
    1.0_f64
}
93
94/// Types of assertions that can be checked against a response
95#[derive(Debug, Clone, Serialize, Deserialize)]
96#[serde(tag = "type", content = "value")]
97pub enum Assertion {
98    /// Response must contain this substring
99    #[serde(rename = "contains")]
100    Contains(String),
101    /// Response must NOT contain this substring
102    #[serde(rename = "not_contains")]
103    NotContains(String),
104    /// Response must exactly match this string
105    #[serde(rename = "exact")]
106    Exact(String),
107    /// Response must match this regex pattern
108    #[serde(rename = "regex")]
109    Regex(String),
110    /// Response must be non-empty
111    #[serde(rename = "non_empty")]
112    NonEmpty,
113    /// Response length must be at least N characters
114    #[serde(rename = "min_length")]
115    MinLength(usize),
116    /// Response length must be at most N characters
117    #[serde(rename = "max_length")]
118    MaxLength(usize),
119    /// A tool with this name must have been called during execution (v0.9.6)
120    #[serde(rename = "tool_called")]
121    ToolCalled(String),
122    /// A tool with this name must NOT have been called during execution (v0.9.6)
123    #[serde(rename = "tool_not_called")]
124    ToolNotCalled(String),
125}
126
127// ── Run Trace ───────────────────────────────────────────────────────────────
128
129/// Full trace of a single agent run — captures every step for inspection
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct RunTrace {
132    /// Task name
133    pub task_name: String,
134    /// When the run started (ISO 8601)
135    pub started_at: String,
136    /// When the run ended (ISO 8601)
137    pub ended_at: String,
138    /// Duration in milliseconds
139    pub duration_ms: u64,
140    /// Number of iterations used
141    pub iterations: usize,
142    /// All steps in chronological order
143    pub steps: Vec<TraceStep>,
144    /// LLM calls made during the run
145    pub llm_calls: Vec<LlmCallTrace>,
146    /// Tool calls made during the run
147    pub tool_calls: Vec<ToolCallTrace>,
148    /// Final response from the agent
149    pub final_response: String,
150}
151
152/// A single step in the agent loop
153#[derive(Debug, Clone, Serialize, Deserialize)]
154pub struct TraceStep {
155    /// Step number (0-based)
156    pub number: usize,
157    /// Type of step
158    pub step_type: StepType,
159    /// Content of the step (LLM response, tool result, etc.)
160    pub content: String,
161    /// Duration of this step in milliseconds
162    pub duration_ms: u64,
163}
164
165/// Type of a trace step
166#[derive(Debug, Clone, Serialize, Deserialize)]
167pub enum StepType {
168    /// LLM thought/response
169    Thought,
170    /// Tool call
171    ToolCall,
172    /// Tool result/observation
173    Observation,
174    /// Final answer
175    Final,
176    /// Error
177    Error,
178}
179
180/// Trace of a single LLM call
181#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct LlmCallTrace {
183    /// Iteration number
184    pub iteration: usize,
185    /// Provider name
186    pub provider: String,
187    /// Model name
188    pub model: String,
189    /// Prompt tokens (if available)
190    pub prompt_tokens: Option<u32>,
191    /// Completion tokens (if available)
192    pub completion_tokens: Option<u32>,
193    /// Duration in milliseconds
194    pub duration_ms: u64,
195    /// Response content (truncated to 1000 chars for storage)
196    pub response_preview: String,
197}
198
199/// Trace of a single tool call
200#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct ToolCallTrace {
202    /// Iteration number
203    pub iteration: usize,
204    /// Tool name
205    pub tool_name: String,
206    /// Arguments (JSON)
207    pub arguments: serde_json::Value,
208    /// Whether the tool succeeded
209    pub success: bool,
210    /// Output preview (truncated to 500 chars)
211    pub output_preview: String,
212    /// Duration in milliseconds
213    pub duration_ms: u64,
214}
215
216// ── Results ─────────────────────────────────────────────────────────────────
217
218/// Result of a single eval task
219#[derive(Debug, Clone, Serialize, Deserialize)]
220pub struct EvalResult {
221    /// Task name
222    pub task_name: String,
223    /// Whether the task passed all assertions
224    pub passed: bool,
225    /// Score (0.0 - 1.0)
226    pub score: f64,
227    /// Number of assertions that passed
228    pub assertions_passed: usize,
229    /// Number of assertions that failed
230    pub assertions_failed: usize,
231    /// Details of each assertion check
232    pub assertion_results: Vec<AssertionResult>,
233    /// Full run trace for inspection
234    pub trace: RunTrace,
235    /// Error message if the task failed to run
236    pub error: Option<String>,
237}
238
239/// Result of a single assertion check
240#[derive(Debug, Clone, Serialize, Deserialize)]
241pub struct AssertionResult {
242    /// The assertion that was checked
243    pub assertion: String,
244    /// Whether it passed
245    pub passed: bool,
246    /// Details about the check
247    pub details: String,
248}
249
250/// Summary report of an entire eval suite run
251#[derive(Debug, Clone, Serialize, Deserialize)]
252pub struct EvalReport {
253    /// Suite name
254    pub suite_name: String,
255    /// When the suite was run (ISO 8601)
256    pub ran_at: String,
257    /// Duration in milliseconds
258    pub duration_ms: u64,
259    /// Overall score (0.0 - 1.0)
260    pub overall_score: f64,
261    /// Number of tasks
262    pub total_tasks: usize,
263    /// Number of tasks that passed
264    pub passed_tasks: usize,
265    /// Number of tasks that failed
266    pub failed_tasks: usize,
267    /// Individual task results
268    pub results: Vec<EvalResult>,
269}
270
271// ── Eval Runner ─────────────────────────────────────────────────────────────
272
273/// Runs eval tasks against an LLM provider and captures traces
274pub struct EvalRunner {
275    /// The LLM provider to test
276    llm: Arc<dyn LLMProviderTrait>,
277    /// Eval configuration
278    config: EvalConfig,
279}
280
281impl EvalRunner {
282    /// Create a new eval runner
283    pub fn new(llm: Arc<dyn LLMProviderTrait>, config: EvalConfig) -> Self {
284        Self { llm, config }
285    }
286
287    /// Run the full eval suite and return a report
288    #[instrument(skip(self), fields(suite = %self.config.name, task_count = self.config.tasks.len()))]
289    pub async fn run_suite(&self) -> EvalReport {
290        let started_at = chrono::Utc::now().to_rfc3339();
291        let suite_start = std::time::Instant::now();
292        let mut results = Vec::with_capacity(self.config.tasks.len());
293
294        info!(
295            suite = %self.config.name,
296            task_count = self.config.tasks.len(),
297            "Starting eval suite"
298        );
299
300        for task in &self.config.tasks {
301            let result = self.run_task(task).await;
302            let passed = result.passed;
303            let name = &result.task_name;
304
305            if passed {
306                info!(task = %name, score = result.score, "Eval task passed");
307            } else {
308                warn!(
309                    task = %name,
310                    score = result.score,
311                    passed = result.assertions_passed,
312                    failed = result.assertions_failed,
313                    "Eval task failed"
314                );
315            }
316
317            results.push(result);
318        }
319
320        let duration_ms = suite_start.elapsed().as_millis() as u64;
321        let total_tasks = results.len();
322        let passed_tasks = results.iter().filter(|r| r.passed).count();
323        let failed_tasks = total_tasks - passed_tasks;
324        let overall_score = if total_tasks > 0 {
325            results
326                .iter()
327                .map(|r| r.score * r.trace.iterations as f64)
328                .sum::<f64>()
329                / results
330                    .iter()
331                    .map(|r| r.trace.iterations as f64)
332                    .sum::<f64>()
333        } else {
334            0.0
335        };
336
337        info!(
338            suite = %self.config.name,
339            passed = passed_tasks,
340            failed = failed_tasks,
341            overall_score = overall_score,
342            duration_ms = duration_ms,
343            "Eval suite completed"
344        );
345
346        EvalReport {
347            suite_name: self.config.name.clone(),
348            ran_at: started_at,
349            duration_ms,
350            overall_score,
351            total_tasks,
352            passed_tasks,
353            failed_tasks,
354            results,
355        }
356    }
357
358    /// Run a single eval task and return the result with trace
359    ///
360    /// Uses the full agent loop (`run_agent_loop`) instead of a single LLM call,
361    /// so eval tasks exercise the complete ReAct loop with tool use, security
362    /// checks, and iteration limits.
363    #[instrument(skip(self), fields(task = %task.name))]
364    async fn run_task(&self, task: &EvalTask) -> EvalResult {
365        let task_start = std::time::Instant::now();
366        let started_at = chrono::Utc::now().to_rfc3339();
367
368        // Build agent loop config from suite settings
369        let agent_config = AgentLoopConfig {
370            max_iterations: self.config.max_iterations,
371            enable_tools: true,
372            require_approval: false,
373            prompt_injection_protection: true,
374            token_lifetime_secs: 0,
375            no_final_required: false,
376            fallback_chain: None,
377            token_budget: None,
378            ravenfabric: None,
379            checkpoint_dir: None,
380            session_id: None,
381            metrics_callback: None,
382        };
383
384        // Run the full agent loop (ReAct + tools + security)
385        let result = run_agent_loop(
386            self.llm.clone(),
387            &task.prompt,
388            &self.config.system_prompt,
389            agent_config,
390        )
391        .await;
392
393        let duration_ms = task_start.elapsed().as_millis() as u64;
394
395        match result {
396            Ok(final_response) => {
397                let trace = RunTrace {
398                    task_name: task.name.clone(),
399                    started_at,
400                    ended_at: chrono::Utc::now().to_rfc3339(),
401                    duration_ms,
402                    iterations: self.config.max_iterations, // best-effort; agent loop doesn't expose exact count
403                    steps: vec![TraceStep {
404                        number: 0,
405                        step_type: StepType::Final,
406                        content: final_response.clone(),
407                        duration_ms,
408                    }],
409                    llm_calls: Vec::new(), // agent loop doesn't expose per-call traces
410                    tool_calls: Vec::new(), // agent loop doesn't expose per-call traces
411                    final_response: final_response.clone(),
412                };
413
414                // Run assertions against the final response
415                let (assertion_results, assertions_passed, assertions_failed) =
416                    check_assertions(&final_response, &task.assertions, Some(&trace));
417
418                // Calculate score
419                let score = if task.assertions.is_empty() {
420                    if final_response.is_empty() || final_response.len() < 10 {
421                        0.0
422                    } else {
423                        1.0
424                    }
425                } else if task.assertions.len() == assertions_passed + assertions_failed {
426                    assertions_passed as f64 / task.assertions.len() as f64
427                } else {
428                    0.0
429                };
430
431                let passed = assertions_failed == 0 && !final_response.is_empty();
432
433                EvalResult {
434                    task_name: task.name.clone(),
435                    passed,
436                    score,
437                    assertions_passed,
438                    assertions_failed,
439                    assertion_results,
440                    trace,
441                    error: None,
442                }
443            }
444            Err(e) => {
445                let trace = RunTrace {
446                    task_name: task.name.clone(),
447                    started_at,
448                    ended_at: chrono::Utc::now().to_rfc3339(),
449                    duration_ms,
450                    iterations: 0,
451                    steps: vec![TraceStep {
452                        number: 0,
453                        step_type: StepType::Error,
454                        content: format!("Agent loop failed: {}", e),
455                        duration_ms,
456                    }],
457                    llm_calls: Vec::new(),
458                    tool_calls: Vec::new(),
459                    final_response: String::new(),
460                };
461
462                EvalResult {
463                    task_name: task.name.clone(),
464                    passed: false,
465                    score: 0.0,
466                    assertions_passed: 0,
467                    assertions_failed: 1,
468                    assertion_results: vec![AssertionResult {
469                        assertion: "agent_loop".to_string(),
470                        passed: false,
471                        details: format!("Agent loop failed: {}", e),
472                    }],
473                    trace,
474                    error: Some(e.to_string()),
475                }
476            }
477        }
478    }
479}
480
481// ── Assertion Checking ──────────────────────────────────────────────────────
482
483/// Check all assertions against a response string
484fn check_assertions(
485    response: &str,
486    assertions: &[Assertion],
487    run_trace: Option<&RunTrace>,
488) -> (Vec<AssertionResult>, usize, usize) {
489    let mut results = Vec::with_capacity(assertions.len());
490    let mut passed = 0;
491    let mut failed = 0;
492
493    for assertion in assertions {
494        let result = check_single_assertion(response, assertion, run_trace);
495        if result.passed {
496            passed += 1;
497        } else {
498            failed += 1;
499        }
500        results.push(result);
501    }
502
503    (results, passed, failed)
504}
505
506/// Check a single assertion against a response
507fn check_single_assertion(
508    response: &str,
509    assertion: &Assertion,
510    run_trace: Option<&RunTrace>,
511) -> AssertionResult {
512    match assertion {
513        Assertion::Contains(pattern) => {
514            let passed = response.contains(pattern);
515            AssertionResult {
516                assertion: format!("contains: {}", pattern),
517                passed,
518                details: if passed {
519                    format!("Response contains '{}'", pattern)
520                } else {
521                    format!("Response does not contain '{}'", pattern)
522                },
523            }
524        }
525        Assertion::NotContains(pattern) => {
526            let passed = !response.contains(pattern);
527            AssertionResult {
528                assertion: format!("not_contains: {}", pattern),
529                passed,
530                details: if passed {
531                    format!("Response does not contain '{}'", pattern)
532                } else {
533                    format!("Response contains '{}'", pattern)
534                },
535            }
536        }
537        Assertion::Exact(expected) => {
538            let trimmed_response = response.trim();
539            let passed = trimmed_response == expected.as_str();
540            AssertionResult {
541                assertion: format!("exact: {}", expected),
542                passed,
543                details: if passed {
544                    "Response matches exactly".to_string()
545                } else {
546                    format!(
547                        "Expected '{}', got '{}'",
548                        expected,
549                        trimmed_response.chars().take(100).collect::<String>()
550                    )
551                },
552            }
553        }
554        Assertion::Regex(pattern) => {
555            let re = regex_lite::Regex::new(pattern);
556            match re {
557                Ok(re) => {
558                    let passed = re.is_match(response);
559                    AssertionResult {
560                        assertion: format!("regex: {}", pattern),
561                        passed,
562                        details: if passed {
563                            format!("Response matches pattern '{}'", pattern)
564                        } else {
565                            format!("Response does not match pattern '{}'", pattern)
566                        },
567                    }
568                }
569                Err(e) => AssertionResult {
570                    assertion: format!("regex: {}", pattern),
571                    passed: false,
572                    details: format!("Invalid regex pattern: {}", e),
573                },
574            }
575        }
576        Assertion::NonEmpty => {
577            let passed = !response.is_empty();
578            AssertionResult {
579                assertion: "non_empty".to_string(),
580                passed,
581                details: if passed {
582                    format!("Response is non-empty ({} chars)", response.len())
583                } else {
584                    "Response is empty".to_string()
585                },
586            }
587        }
588        Assertion::MinLength(min) => {
589            let passed = response.len() >= *min;
590            AssertionResult {
591                assertion: format!("min_length: {}", min),
592                passed,
593                details: if passed {
594                    format!("Response length {} >= {}", response.len(), min)
595                } else {
596                    format!("Response length {} < {}", response.len(), min)
597                },
598            }
599        }
600        Assertion::MaxLength(max) => {
601            let passed = response.len() <= *max;
602            AssertionResult {
603                assertion: format!("max_length: {}", max),
604                passed,
605                details: if passed {
606                    format!("Response length {} <= {}", response.len(), max)
607                } else {
608                    format!("Response length {} > {}", response.len(), max)
609                },
610            }
611        }
612        Assertion::ToolCalled(tool_name) => {
613            let tool_calls = run_trace
614                .map(|t| &t.tool_calls)
615                .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
616            let passed = tool_calls.is_some();
617            AssertionResult {
618                assertion: format!("tool_called: {}", tool_name),
619                passed,
620                details: if passed {
621                    format!("Tool '{}' was called", tool_name)
622                } else {
623                    let all_tools: Vec<&str> = run_trace
624                        .map(|t| {
625                            t.tool_calls
626                                .iter()
627                                .map(|tc| tc.tool_name.as_str())
628                                .collect()
629                        })
630                        .unwrap_or_default();
631                    if all_tools.is_empty() {
632                        format!("Tool '{}' was not called (no tools were called)", tool_name)
633                    } else {
634                        format!(
635                            "Tool '{}' was not called (called: {})",
636                            tool_name,
637                            all_tools.join(", ")
638                        )
639                    }
640                },
641            }
642        }
643        Assertion::ToolNotCalled(tool_name) => {
644            let tool_calls = run_trace
645                .map(|t| &t.tool_calls)
646                .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
647            let passed = tool_calls.is_none();
648            AssertionResult {
649                assertion: format!("tool_not_called: {}", tool_name),
650                passed,
651                details: if passed {
652                    format!("Tool '{}' was not called", tool_name)
653                } else {
654                    format!("Tool '{}' was called but should not have been", tool_name)
655                },
656            }
657        }
658    }
659}
660
661// ── Report Formatting ───────────────────────────────────────────────────────
662
663impl EvalReport {
664    /// Format the report as a human-readable string
665    pub fn format_text(&self) -> String {
666        let mut output = String::new();
667
668        output.push_str(&format!("\n🐦‍⬛ Eval Report: {}\n", self.suite_name));
669        output.push_str(&format!("{:-^60}\n", ""));
670        output.push_str(&format!(
671            "Ran at:       {}\n",
672            &self.ran_at[..19].replace('T', " ")
673        ));
674        output.push_str(&format!("Duration:     {} ms\n", self.duration_ms));
675        output.push_str(&format!(
676            "Overall score: {:.1}%\n",
677            self.overall_score * 100.0
678        ));
679        output.push_str(&format!(
680            "Tasks:        {}/{} passed\n",
681            self.passed_tasks, self.total_tasks
682        ));
683        output.push_str(&format!("{:-^60}\n", ""));
684
685        for result in &self.results {
686            output.push_str(&format!(
687                "\n  {} {} — {:.1}%\n",
688                if result.passed { "✅" } else { "❌" },
689                result.task_name,
690                result.score * 100.0
691            ));
692
693            if let Some(ref error) = result.error {
694                output.push_str(&format!("    Error: {}\n", error));
695            }
696
697            if !result.assertion_results.is_empty() {
698                for ar in &result.assertion_results {
699                    output.push_str(&format!(
700                        "    {} {}\n",
701                        if ar.passed { "  ✅" } else { "  ❌" },
702                        ar.details
703                    ));
704                }
705            }
706
707            // Show trace summary
708            let trace = &result.trace;
709            output.push_str(&format!(
710                "    Iterations: {} · LLM calls: {} · Tool calls: {} · Duration: {} ms\n",
711                trace.iterations,
712                trace.llm_calls.len(),
713                trace.tool_calls.len(),
714                trace.duration_ms
715            ));
716
717            // Show response preview
718            let preview: String = trace.final_response.chars().take(200).collect();
719            if !preview.is_empty() {
720                output.push_str(&format!("    Response: {}\n", preview));
721            }
722        }
723
724        output
725    }
726
727    /// Format the report as JSON
728    pub fn format_json(&self) -> serde_json::Value {
729        serde_json::to_value(self).unwrap_or(serde_json::json!({"error": "serialization failed"}))
730    }
731}
732
733// ── Config Loading ──────────────────────────────────────────────────────────
734
735impl EvalConfig {
736    /// Load eval config from a TOML file
737    pub fn from_file(path: &str) -> Result<Self> {
738        let content = std::fs::read_to_string(path).map_err(|e| {
739            RavenClawsError::CommandExecution(format!("Failed to read eval config: {}", e))
740        })?;
741
742        if content.trim().is_empty() {
743            return Err(RavenClawsError::CommandExecution(format!(
744                "Eval config file '{}' is empty — no tasks to run",
745                path
746            )));
747        }
748
749        let config: EvalConfig = toml::from_str(&content).map_err(|e| {
750            RavenClawsError::CommandExecution(format!("Failed to parse eval config: {}", e))
751        })?;
752
753        if config.tasks.is_empty() {
754            return Err(RavenClawsError::CommandExecution(format!(
755                "Eval config file '{}' has no tasks defined",
756                path
757            )));
758        }
759
760        Ok(config)
761    }
762}
763
764// ── Tests ───────────────────────────────────────────────────────────────────
765
766#[cfg(test)]
767mod tests {
768    use super::*;
769
770    #[test]
771    fn test_assertion_contains_pass() {
772        let result = check_single_assertion(
773            "hello world",
774            &Assertion::Contains("world".to_string()),
775            None,
776        );
777        assert!(result.passed);
778        assert!(result.details.contains("contains"));
779    }
780
781    #[test]
782    fn test_assertion_contains_fail() {
783        let result =
784            check_single_assertion("hello world", &Assertion::Contains("foo".to_string()), None);
785        assert!(!result.passed);
786    }
787
788    #[test]
789    fn test_assertion_not_contains_pass() {
790        let result = check_single_assertion(
791            "hello world",
792            &Assertion::NotContains("foo".to_string()),
793            None,
794        );
795        assert!(result.passed);
796    }
797
798    #[test]
799    fn test_assertion_not_contains_fail() {
800        let result = check_single_assertion(
801            "hello world",
802            &Assertion::NotContains("world".to_string()),
803            None,
804        );
805        assert!(!result.passed);
806    }
807
808    #[test]
809    fn test_assertion_exact_pass() {
810        let result = check_single_assertion("hello", &Assertion::Exact("hello".to_string()), None);
811        assert!(result.passed);
812    }
813
814    #[test]
815    fn test_assertion_exact_fail() {
816        let result = check_single_assertion("world", &Assertion::Exact("hello".to_string()), None);
817        assert!(!result.passed);
818    }
819
820    #[test]
821    fn test_assertion_regex_pass() {
822        let result =
823            check_single_assertion("hello 123", &Assertion::Regex(r"\d+".to_string()), None);
824        assert!(result.passed);
825    }
826
827    #[test]
828    fn test_assertion_regex_fail() {
829        let result = check_single_assertion("hello", &Assertion::Regex(r"\d+".to_string()), None);
830        assert!(!result.passed);
831    }
832
833    #[test]
834    fn test_assertion_non_empty_pass() {
835        let result = check_single_assertion("hello", &Assertion::NonEmpty, None);
836        assert!(result.passed);
837    }
838
839    #[test]
840    fn test_assertion_non_empty_fail() {
841        let result = check_single_assertion("", &Assertion::NonEmpty, None);
842        assert!(!result.passed);
843    }
844
845    #[test]
846    fn test_assertion_min_length_pass() {
847        let result = check_single_assertion("hello", &Assertion::MinLength(3), None);
848        assert!(result.passed);
849    }
850
851    #[test]
852    fn test_assertion_min_length_fail() {
853        let result = check_single_assertion("hi", &Assertion::MinLength(5), None);
854        assert!(!result.passed);
855    }
856
857    #[test]
858    fn test_assertion_max_length_pass() {
859        let result = check_single_assertion("hi", &Assertion::MaxLength(5), None);
860        assert!(result.passed);
861    }
862
863    #[test]
864    fn test_assertion_max_length_fail() {
865        let result = check_single_assertion("hello world", &Assertion::MaxLength(5), None);
866        assert!(!result.passed);
867    }
868
869    #[test]
870    fn test_check_assertions_empty() {
871        let (results, passed, failed) = check_assertions("hello", &[], None);
872        assert!(results.is_empty());
873        assert_eq!(passed, 0);
874        assert_eq!(failed, 0);
875    }
876
877    #[test]
878    fn test_check_assertions_multiple() {
879        let assertions = vec![
880            Assertion::Contains("hello".to_string()),
881            Assertion::Contains("world".to_string()),
882            Assertion::NonEmpty,
883        ];
884        let (results, passed, failed) = check_assertions("hello world", &assertions, None);
885        assert_eq!(passed, 3);
886        assert_eq!(failed, 0);
887        assert_eq!(results.len(), 3);
888    }
889
890    #[test]
891    fn test_check_assertions_tool_called() {
892        let trace = RunTrace {
893            task_name: "test".to_string(),
894            started_at: "2026-01-01T00:00:00Z".to_string(),
895            ended_at: "2026-01-01T00:00:01Z".to_string(),
896            duration_ms: 1000,
897            iterations: 1,
898            steps: vec![],
899            llm_calls: vec![],
900            tool_calls: vec![
901                ToolCallTrace {
902                    iteration: 0,
903                    tool_name: "web_search".to_string(),
904                    arguments: serde_json::json!({"query": "test"}),
905                    success: true,
906                    output_preview: "results".to_string(),
907                    duration_ms: 100,
908                },
909                ToolCallTrace {
910                    iteration: 0,
911                    tool_name: "read_file".to_string(),
912                    arguments: serde_json::json!({"path": "/tmp/test"}),
913                    success: true,
914                    output_preview: "content".to_string(),
915                    duration_ms: 50,
916                },
917            ],
918            final_response: "response".to_string(),
919        };
920
921        // ToolCalled — should pass
922        let (results, passed, failed) = check_assertions(
923            "response",
924            &[Assertion::ToolCalled("web_search".to_string())],
925            Some(&trace),
926        );
927        assert_eq!(passed, 1);
928        assert_eq!(failed, 0);
929        assert!(results[0].passed);
930
931        // ToolCalled — should fail (tool not called)
932        let (results, passed, failed) = check_assertions(
933            "response",
934            &[Assertion::ToolCalled("nonexistent".to_string())],
935            Some(&trace),
936        );
937        assert_eq!(passed, 0);
938        assert_eq!(failed, 1);
939        assert!(!results[0].passed);
940
941        // ToolNotCalled — should pass (tool not in list)
942        let (results, passed, failed) = check_assertions(
943            "response",
944            &[Assertion::ToolNotCalled("nonexistent".to_string())],
945            Some(&trace),
946        );
947        assert_eq!(passed, 1);
948        assert_eq!(failed, 0);
949        assert!(results[0].passed);
950
951        // ToolNotCalled — should fail (tool was called)
952        let (results, passed, failed) = check_assertions(
953            "response",
954            &[Assertion::ToolNotCalled("web_search".to_string())],
955            Some(&trace),
956        );
957        assert_eq!(passed, 0);
958        assert_eq!(failed, 1);
959        assert!(!results[0].passed);
960
961        // ToolCalled with no trace — should fail
962        let (results, passed, failed) = check_assertions(
963            "response",
964            &[Assertion::ToolCalled("web_search".to_string())],
965            None,
966        );
967        assert_eq!(passed, 0);
968        assert_eq!(failed, 1);
969        assert!(!results[0].passed);
970    }
971
972    #[test]
973    fn test_eval_config_from_toml() {
974        let toml_str = r#"
975name = "test-suite"
976description = "A test suite"
977system_prompt = "Be concise"
978max_iterations = 3
979
980[[tasks]]
981name = "test-1"
982prompt = "What is 2+2?"
983golden = "4"
984assertions = [{ type = "contains", value = "4" }]
985weight = 1.0
986required = true
987"#;
988
989        let config: EvalConfig = toml::from_str(toml_str).unwrap();
990        assert_eq!(config.name, "test-suite");
991        assert_eq!(config.tasks.len(), 1);
992        assert_eq!(config.tasks[0].name, "test-1");
993        assert_eq!(config.tasks[0].prompt, "What is 2+2?");
994        assert_eq!(config.tasks[0].golden, "4");
995        assert_eq!(config.tasks[0].assertions.len(), 1);
996    }
997
998    #[test]
999    fn test_eval_config_defaults() {
1000        let toml_str = r#"
1001[[tasks]]
1002name = "simple"
1003prompt = "Say hello"
1004"#;
1005
1006        let config: EvalConfig = toml::from_str(toml_str).unwrap();
1007        assert_eq!(config.name, "unnamed");
1008        assert_eq!(config.system_prompt, default_system_prompt());
1009        assert_eq!(config.max_iterations, 5);
1010        assert_eq!(config.tasks[0].weight, 1.0);
1011        assert!(!config.tasks[0].required);
1012    }
1013
1014    #[test]
1015    fn test_report_format_text() {
1016        let report = EvalReport {
1017            suite_name: "test".to_string(),
1018            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
1019            duration_ms: 100,
1020            overall_score: 0.75,
1021            total_tasks: 2,
1022            passed_tasks: 1,
1023            failed_tasks: 1,
1024            results: vec![
1025                EvalResult {
1026                    task_name: "pass-task".to_string(),
1027                    passed: true,
1028                    score: 1.0,
1029                    assertions_passed: 2,
1030                    assertions_failed: 0,
1031                    assertion_results: vec![AssertionResult {
1032                        assertion: "contains: hello".to_string(),
1033                        passed: true,
1034                        details: "Response contains 'hello'".to_string(),
1035                    }],
1036                    trace: RunTrace {
1037                        task_name: "pass-task".to_string(),
1038                        started_at: "2026-06-22T12:00:00+00:00".to_string(),
1039                        ended_at: "2026-06-22T12:00:01+00:00".to_string(),
1040                        duration_ms: 50,
1041                        iterations: 1,
1042                        steps: vec![],
1043                        llm_calls: vec![],
1044                        tool_calls: vec![],
1045                        final_response: "hello world".to_string(),
1046                    },
1047                    error: None,
1048                },
1049                EvalResult {
1050                    task_name: "fail-task".to_string(),
1051                    passed: false,
1052                    score: 0.0,
1053                    assertions_passed: 0,
1054                    assertions_failed: 1,
1055                    assertion_results: vec![AssertionResult {
1056                        assertion: "contains: foo".to_string(),
1057                        passed: false,
1058                        details: "Response does not contain 'foo'".to_string(),
1059                    }],
1060                    trace: RunTrace {
1061                        task_name: "fail-task".to_string(),
1062                        started_at: "2026-06-22T12:00:01+00:00".to_string(),
1063                        ended_at: "2026-06-22T12:00:02+00:00".to_string(),
1064                        duration_ms: 50,
1065                        iterations: 1,
1066                        steps: vec![],
1067                        llm_calls: vec![],
1068                        tool_calls: vec![],
1069                        final_response: "bar".to_string(),
1070                    },
1071                    error: None,
1072                },
1073            ],
1074        };
1075
1076        let text = report.format_text();
1077        assert!(text.contains("Eval Report: test"));
1078        assert!(text.contains("75.0%"));
1079        assert!(text.contains("1/2 passed"));
1080        assert!(text.contains("✅ pass-task"));
1081        assert!(text.contains("❌ fail-task"));
1082    }
1083
1084    #[test]
1085    fn test_report_format_json() {
1086        let report = EvalReport {
1087            suite_name: "test".to_string(),
1088            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
1089            duration_ms: 100,
1090            overall_score: 1.0,
1091            total_tasks: 1,
1092            passed_tasks: 1,
1093            failed_tasks: 0,
1094            results: vec![],
1095        };
1096
1097        let json = report.format_json();
1098        assert_eq!(json["suite_name"], "test");
1099        assert_eq!(json["overall_score"], 1.0);
1100    }
1101
1102    #[test]
1103    fn test_eval_config_from_file_not_found() {
1104        let result = EvalConfig::from_file("/tmp/nonexistent-eval-config.toml");
1105        assert!(result.is_err());
1106    }
1107
1108    #[test]
1109    fn test_assertion_regex_invalid_pattern() {
1110        let result =
1111            check_single_assertion("hello", &Assertion::Regex(r"[invalid".to_string()), None);
1112        assert!(!result.passed);
1113        assert!(result.details.contains("Invalid regex"));
1114    }
1115
1116    #[test]
1117    fn test_trace_step_serialization() {
1118        let step = TraceStep {
1119            number: 0,
1120            step_type: StepType::Thought,
1121            content: "test".to_string(),
1122            duration_ms: 100,
1123        };
1124        let json = serde_json::to_string(&step).unwrap();
1125        assert!(json.contains("Thought"));
1126    }
1127
1128    #[test]
1129    fn test_tool_call_trace_serialization() {
1130        let trace = ToolCallTrace {
1131            iteration: 0,
1132            tool_name: "shell_exec".to_string(),
1133            arguments: serde_json::json!({"command": "echo hello"}),
1134            success: true,
1135            output_preview: "hello".to_string(),
1136            duration_ms: 50,
1137        };
1138        let json = serde_json::to_string(&trace).unwrap();
1139        assert!(json.contains("shell_exec"));
1140        assert!(json.contains("echo hello"));
1141    }
1142}