ravenclaws/
eval.rs

1//! RavenClaws
2//!
3//! Provides a framework for defining, running, and scoring evaluation tasks
4//! against LLM agents. Captures full run traces for inspection and debugging.
5//!
6//! # Architecture
7//!
8//! ```text
9//! EvalConfig (TOML file)
10//!   └── Vec<EvalTask>
11//!         ├── prompt + golden answer
//!         ├── assertions (contains, not_contains, exact, regex, non_empty,
//!         │     min_length, max_length, tool_called, tool_not_called)
13//!         └── scoring weights
14//!
15//! EvalRunner
16//!   ├── run_task() → EvalResult (with RunTrace)
17//!   └── run_suite() → EvalReport (summary of all results)
18//!
19//! RunTrace
20//!   ├── steps: Vec<TraceStep>
21//!   ├── llm_calls: Vec<LlmCallTrace>
22//!   └── tool_calls: Vec<ToolCallTrace>
23//! ```
24
25use crate::agent::{run_agent_loop, AgentLoopConfig};
26use crate::error::{RavenClawsError, Result};
27use crate::llm::LLMProviderTrait;
28use serde::{Deserialize, Serialize};
29use std::sync::Arc;
30use tracing::{info, instrument, warn};
31
32// ── Configuration ───────────────────────────────────────────────────────────
33
/// Configuration for an eval suite — loaded from a TOML file.
///
/// Every field carries a serde default, so a document that omits all of them
/// still deserializes (with an empty `tasks` list). `EvalConfig::from_file`
/// additionally rejects configs whose task list is empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalConfig {
    /// Name of this eval suite; defaults to `"unnamed"` when omitted
    #[serde(default = "default_suite_name")]
    pub name: String,
    /// Description of what this suite tests; defaults to an empty string
    #[serde(default)]
    pub description: String,
    /// System prompt to use for all tasks in this suite; a generic
    /// "helpful assistant" prompt is used when omitted
    #[serde(default = "default_system_prompt")]
    pub system_prompt: String,
    /// Maximum iterations per task; defaults to 5 and is passed to the agent
    /// loop as its iteration limit
    #[serde(default = "default_max_iterations")]
    pub max_iterations: usize,
    /// List of eval tasks to run, in execution order; defaults to empty
    #[serde(default)]
    pub tasks: Vec<EvalTask>,
}
53
/// Serde default for `EvalConfig::name`.
fn default_suite_name() -> String {
    String::from("unnamed")
}
57
/// Serde default for `EvalConfig::system_prompt`.
fn default_system_prompt() -> String {
    String::from("You are a helpful assistant. Be concise and accurate.")
}
61
/// Serde default for `EvalConfig::max_iterations`.
fn default_max_iterations() -> usize {
    const DEFAULT_MAX_ITERATIONS: usize = 5;
    DEFAULT_MAX_ITERATIONS
}
65
/// A single eval task with prompt, golden answer, and assertions.
///
/// `name` and `prompt` have no serde default and must be present in the
/// config; every other field is optional.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalTask {
    /// Name of this task (used in reports and trace records)
    pub name: String,
    /// Description of what this task tests; defaults to an empty string
    #[serde(default)]
    pub description: String,
    /// The prompt to send to the agent
    pub prompt: String,
    /// Expected golden answer (used for exact match scoring)
    // NOTE(review): no code visible in this file reads `golden` — confirm
    // where it is consumed.
    #[serde(default)]
    pub golden: String,
    /// List of assertions to check against the response; defaults to empty
    #[serde(default)]
    pub assertions: Vec<Assertion>,
    /// Weight of this task in the overall score (0.0 - 1.0); defaults to 1.0
    // NOTE(review): the documented range is not validated on load.
    #[serde(default = "default_weight")]
    pub weight: f64,
    /// Whether this task is required to pass (fails the suite if not);
    /// defaults to `false`
    // NOTE(review): no code visible in this file reads `required` — confirm
    // where the suite-level failure is enforced.
    #[serde(default)]
    pub required: bool,
}
89
/// Serde default for `EvalTask::weight`.
fn default_weight() -> f64 {
    const DEFAULT_WEIGHT: f64 = 1.0;
    DEFAULT_WEIGHT
}
93
94/// Types of assertions that can be checked against a response
95#[derive(Debug, Clone, Serialize, Deserialize)]
96#[serde(tag = "type", content = "value")]
97pub enum Assertion {
98    /// Response must contain this substring
99    #[serde(rename = "contains")]
100    Contains(String),
101    /// Response must NOT contain this substring
102    #[serde(rename = "not_contains")]
103    NotContains(String),
104    /// Response must exactly match this string
105    #[serde(rename = "exact")]
106    Exact(String),
107    /// Response must match this regex pattern
108    #[serde(rename = "regex")]
109    Regex(String),
110    /// Response must be non-empty
111    #[serde(rename = "non_empty")]
112    NonEmpty,
113    /// Response length must be at least N characters
114    #[serde(rename = "min_length")]
115    MinLength(usize),
116    /// Response length must be at most N characters
117    #[serde(rename = "max_length")]
118    MaxLength(usize),
119    /// A tool with this name must have been called during execution (v0.9.6)
120    #[serde(rename = "tool_called")]
121    ToolCalled(String),
122    /// A tool with this name must NOT have been called during execution (v0.9.6)
123    #[serde(rename = "tool_not_called")]
124    ToolNotCalled(String),
125}
126
127// ── Run Trace ───────────────────────────────────────────────────────────────
128
/// Full trace of a single agent run — captures every step for inspection.
///
/// As populated by `EvalRunner::run_task` in this file, a trace currently
/// holds exactly one step (`Final` on success, `Error` on failure) and empty
/// `llm_calls` / `tool_calls`, because the agent loop does not expose
/// per-call details to the runner.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunTrace {
    /// Task name
    pub task_name: String,
    /// When the run started (ISO 8601)
    pub started_at: String,
    /// When the run ended (ISO 8601)
    pub ended_at: String,
    /// Duration in milliseconds
    pub duration_ms: u64,
    /// Number of iterations used. `run_task` records the configured
    /// `max_iterations` on success (a best-effort value, not an exact count)
    /// and 0 when the agent loop fails.
    pub iterations: usize,
    /// All steps in chronological order
    pub steps: Vec<TraceStep>,
    /// LLM calls made during the run
    pub llm_calls: Vec<LlmCallTrace>,
    /// Tool calls made during the run; consulted by the `tool_called` and
    /// `tool_not_called` assertions
    pub tool_calls: Vec<ToolCallTrace>,
    /// Final response from the agent (empty when the run failed)
    pub final_response: String,
}
151
/// A single step in the agent loop, as recorded in `RunTrace::steps`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceStep {
    /// Step number (0-based)
    pub number: usize,
    /// Type of step
    pub step_type: StepType,
    /// Content of the step (LLM response, tool result, error message, etc.)
    pub content: String,
    /// Duration of this step in milliseconds
    pub duration_ms: u64,
}
164
/// Type of a trace step.
///
/// The runner in this file only emits `Final` and `Error`; the remaining
/// variants are not constructed by any code visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum StepType {
    /// LLM thought/response
    Thought,
    /// Tool call
    ToolCall,
    /// Tool result/observation
    Observation,
    /// Final answer
    Final,
    /// Error
    Error,
}
179
/// Trace of a single LLM call.
// NOTE(review): nothing visible in this file constructs this type; the
// truncation described on `response_preview` is the producer's job — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmCallTrace {
    /// Iteration number
    pub iteration: usize,
    /// Provider name
    pub provider: String,
    /// Model name
    pub model: String,
    /// Prompt tokens (if available)
    pub prompt_tokens: Option<u32>,
    /// Completion tokens (if available)
    pub completion_tokens: Option<u32>,
    /// Duration in milliseconds
    pub duration_ms: u64,
    /// Response content (truncated to 1000 chars for storage)
    pub response_preview: String,
}
198
/// Trace of a single tool call.
///
/// The `tool_called` / `tool_not_called` assertions match on `tool_name`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCallTrace {
    /// Iteration number
    pub iteration: usize,
    /// Tool name (compared for exact equality by the tool assertions)
    pub tool_name: String,
    /// Arguments (JSON)
    pub arguments: serde_json::Value,
    /// Whether the tool succeeded
    pub success: bool,
    /// Output preview (truncated to 500 chars)
    // NOTE(review): no truncation happens in this file; presumably done by
    // whoever builds the trace — confirm.
    pub output_preview: String,
    /// Duration in milliseconds
    pub duration_ms: u64,
}
215
216// ── Results ─────────────────────────────────────────────────────────────────
217
/// Result of a single eval task, including the trace of the run behind it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
    /// Task name
    pub task_name: String,
    /// Whether the task passed all assertions. The runner also requires a
    /// non-empty final response, and sets this to `false` when the agent
    /// loop errors.
    pub passed: bool,
    /// Score (0.0 - 1.0). With assertions this is the fraction that passed;
    /// it is 0.0 when the agent loop errors.
    pub score: f64,
    /// Number of assertions that passed
    pub assertions_passed: usize,
    /// Number of assertions that failed. An agent-loop failure is reported
    /// as one failed synthetic `agent_loop` assertion.
    pub assertions_failed: usize,
    /// Details of each assertion check
    pub assertion_results: Vec<AssertionResult>,
    /// Full run trace for inspection
    pub trace: RunTrace,
    /// Error message if the task failed to run; `None` otherwise
    pub error: Option<String>,
}
238
/// Result of a single assertion check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AssertionResult {
    /// The assertion that was checked, rendered as a `kind: value` label
    /// (for example `contains: foo`)
    pub assertion: String,
    /// Whether it passed
    pub passed: bool,
    /// Human-readable explanation of the outcome; this is the text shown per
    /// assertion in the text report
    pub details: String,
}
249
/// Summary report of an entire eval suite run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// Suite name
    pub suite_name: String,
    /// When the suite was run (ISO 8601); the runner stores the RFC 3339
    /// timestamp taken when the suite started
    pub ran_at: String,
    /// Duration in milliseconds
    pub duration_ms: u64,
    /// Overall score (0.0 - 1.0)
    pub overall_score: f64,
    /// Number of tasks
    pub total_tasks: usize,
    /// Number of tasks that passed
    pub passed_tasks: usize,
    /// Number of tasks that failed (`total_tasks - passed_tasks`)
    pub failed_tasks: usize,
    /// Individual task results, in the order the tasks were run
    pub results: Vec<EvalResult>,
}
270
271// ── Eval Runner ─────────────────────────────────────────────────────────────
272
/// Runs eval tasks against an LLM provider and captures traces.
///
/// Construct with `EvalRunner::new`, then call `run_suite` to execute every
/// task in the configuration sequentially.
pub struct EvalRunner {
    /// The LLM provider to test (shared handle, cloned for each task run)
    llm: Arc<dyn LLMProviderTrait>,
    /// Eval configuration: suite-wide settings plus the task list
    config: EvalConfig,
}
280
281impl EvalRunner {
282    /// Create a new eval runner
283    pub fn new(llm: Arc<dyn LLMProviderTrait>, config: EvalConfig) -> Self {
284        Self { llm, config }
285    }
286
287    /// Run the full eval suite and return a report
288    #[instrument(skip(self), fields(suite = %self.config.name, task_count = self.config.tasks.len()))]
289    pub async fn run_suite(&self) -> EvalReport {
290        let started_at = chrono::Utc::now().to_rfc3339();
291        let suite_start = std::time::Instant::now();
292        let mut results = Vec::with_capacity(self.config.tasks.len());
293
294        info!(
295            suite = %self.config.name,
296            task_count = self.config.tasks.len(),
297            "Starting eval suite"
298        );
299
300        for task in &self.config.tasks {
301            let result = self.run_task(task).await;
302            let passed = result.passed;
303            let name = &result.task_name;
304
305            if passed {
306                info!(task = %name, score = result.score, "Eval task passed");
307            } else {
308                warn!(
309                    task = %name,
310                    score = result.score,
311                    passed = result.assertions_passed,
312                    failed = result.assertions_failed,
313                    "Eval task failed"
314                );
315            }
316
317            results.push(result);
318        }
319
320        let duration_ms = suite_start.elapsed().as_millis() as u64;
321        let total_tasks = results.len();
322        let passed_tasks = results.iter().filter(|r| r.passed).count();
323        let failed_tasks = total_tasks - passed_tasks;
324        let overall_score = if total_tasks > 0 {
325            results
326                .iter()
327                .map(|r| r.score * r.trace.iterations as f64)
328                .sum::<f64>()
329                / results
330                    .iter()
331                    .map(|r| r.trace.iterations as f64)
332                    .sum::<f64>()
333        } else {
334            0.0
335        };
336
337        info!(
338            suite = %self.config.name,
339            passed = passed_tasks,
340            failed = failed_tasks,
341            overall_score = overall_score,
342            duration_ms = duration_ms,
343            "Eval suite completed"
344        );
345
346        EvalReport {
347            suite_name: self.config.name.clone(),
348            ran_at: started_at,
349            duration_ms,
350            overall_score,
351            total_tasks,
352            passed_tasks,
353            failed_tasks,
354            results,
355        }
356    }
357
358    /// Run a single eval task and return the result with trace
359    ///
360    /// Uses the full agent loop (`run_agent_loop`) instead of a single LLM call,
361    /// so eval tasks exercise the complete ReAct loop with tool use, security
362    /// checks, and iteration limits.
363    #[instrument(skip(self), fields(task = %task.name))]
364    async fn run_task(&self, task: &EvalTask) -> EvalResult {
365        let task_start = std::time::Instant::now();
366        let started_at = chrono::Utc::now().to_rfc3339();
367
368        // Build agent loop config from suite settings
369        let agent_config = AgentLoopConfig {
370            max_iterations: self.config.max_iterations,
371            enable_tools: true,
372            require_approval: false,
373            prompt_injection_protection: true,
374            token_lifetime_secs: 0,
375            no_final_required: false,
376            fallback_chain: None,
377            token_budget: None,
378            ravenfabric: None,
379            checkpoint_dir: None,
380            session_id: None,
381            metrics_callback: None,
382            load_manager: None,
383        };
384
385        // Run the full agent loop (ReAct + tools + security)
386        let result = run_agent_loop(
387            self.llm.clone(),
388            &task.prompt,
389            &self.config.system_prompt,
390            agent_config,
391        )
392        .await;
393
394        let duration_ms = task_start.elapsed().as_millis() as u64;
395
396        match result {
397            Ok(final_response) => {
398                let trace = RunTrace {
399                    task_name: task.name.clone(),
400                    started_at,
401                    ended_at: chrono::Utc::now().to_rfc3339(),
402                    duration_ms,
403                    iterations: self.config.max_iterations, // best-effort; agent loop doesn't expose exact count
404                    steps: vec![TraceStep {
405                        number: 0,
406                        step_type: StepType::Final,
407                        content: final_response.clone(),
408                        duration_ms,
409                    }],
410                    llm_calls: Vec::new(), // agent loop doesn't expose per-call traces
411                    tool_calls: Vec::new(), // agent loop doesn't expose per-call traces
412                    final_response: final_response.clone(),
413                };
414
415                // Run assertions against the final response
416                let (assertion_results, assertions_passed, assertions_failed) =
417                    check_assertions(&final_response, &task.assertions, Some(&trace));
418
419                // Calculate score
420                let score = if task.assertions.is_empty() {
421                    if final_response.is_empty() || final_response.len() < 10 {
422                        0.0
423                    } else {
424                        1.0
425                    }
426                } else if task.assertions.len() == assertions_passed + assertions_failed {
427                    assertions_passed as f64 / task.assertions.len() as f64
428                } else {
429                    0.0
430                };
431
432                let passed = assertions_failed == 0 && !final_response.is_empty();
433
434                EvalResult {
435                    task_name: task.name.clone(),
436                    passed,
437                    score,
438                    assertions_passed,
439                    assertions_failed,
440                    assertion_results,
441                    trace,
442                    error: None,
443                }
444            }
445            Err(e) => {
446                let trace = RunTrace {
447                    task_name: task.name.clone(),
448                    started_at,
449                    ended_at: chrono::Utc::now().to_rfc3339(),
450                    duration_ms,
451                    iterations: 0,
452                    steps: vec![TraceStep {
453                        number: 0,
454                        step_type: StepType::Error,
455                        content: format!("Agent loop failed: {}", e),
456                        duration_ms,
457                    }],
458                    llm_calls: Vec::new(),
459                    tool_calls: Vec::new(),
460                    final_response: String::new(),
461                };
462
463                EvalResult {
464                    task_name: task.name.clone(),
465                    passed: false,
466                    score: 0.0,
467                    assertions_passed: 0,
468                    assertions_failed: 1,
469                    assertion_results: vec![AssertionResult {
470                        assertion: "agent_loop".to_string(),
471                        passed: false,
472                        details: format!("Agent loop failed: {}", e),
473                    }],
474                    trace,
475                    error: Some(e.to_string()),
476                }
477            }
478        }
479    }
480}
481
482// ── Assertion Checking ──────────────────────────────────────────────────────
483
484/// Check all assertions against a response string
485fn check_assertions(
486    response: &str,
487    assertions: &[Assertion],
488    run_trace: Option<&RunTrace>,
489) -> (Vec<AssertionResult>, usize, usize) {
490    let mut results = Vec::with_capacity(assertions.len());
491    let mut passed = 0;
492    let mut failed = 0;
493
494    for assertion in assertions {
495        let result = check_single_assertion(response, assertion, run_trace);
496        if result.passed {
497            passed += 1;
498        } else {
499            failed += 1;
500        }
501        results.push(result);
502    }
503
504    (results, passed, failed)
505}
506
507/// Check a single assertion against a response
508fn check_single_assertion(
509    response: &str,
510    assertion: &Assertion,
511    run_trace: Option<&RunTrace>,
512) -> AssertionResult {
513    match assertion {
514        Assertion::Contains(pattern) => {
515            let passed = response.contains(pattern);
516            AssertionResult {
517                assertion: format!("contains: {}", pattern),
518                passed,
519                details: if passed {
520                    format!("Response contains '{}'", pattern)
521                } else {
522                    format!("Response does not contain '{}'", pattern)
523                },
524            }
525        }
526        Assertion::NotContains(pattern) => {
527            let passed = !response.contains(pattern);
528            AssertionResult {
529                assertion: format!("not_contains: {}", pattern),
530                passed,
531                details: if passed {
532                    format!("Response does not contain '{}'", pattern)
533                } else {
534                    format!("Response contains '{}'", pattern)
535                },
536            }
537        }
538        Assertion::Exact(expected) => {
539            let trimmed_response = response.trim();
540            let passed = trimmed_response == expected.as_str();
541            AssertionResult {
542                assertion: format!("exact: {}", expected),
543                passed,
544                details: if passed {
545                    "Response matches exactly".to_string()
546                } else {
547                    format!(
548                        "Expected '{}', got '{}'",
549                        expected,
550                        trimmed_response.chars().take(100).collect::<String>()
551                    )
552                },
553            }
554        }
555        Assertion::Regex(pattern) => {
556            let re = regex_lite::Regex::new(pattern);
557            match re {
558                Ok(re) => {
559                    let passed = re.is_match(response);
560                    AssertionResult {
561                        assertion: format!("regex: {}", pattern),
562                        passed,
563                        details: if passed {
564                            format!("Response matches pattern '{}'", pattern)
565                        } else {
566                            format!("Response does not match pattern '{}'", pattern)
567                        },
568                    }
569                }
570                Err(e) => AssertionResult {
571                    assertion: format!("regex: {}", pattern),
572                    passed: false,
573                    details: format!("Invalid regex pattern: {}", e),
574                },
575            }
576        }
577        Assertion::NonEmpty => {
578            let passed = !response.is_empty();
579            AssertionResult {
580                assertion: "non_empty".to_string(),
581                passed,
582                details: if passed {
583                    format!("Response is non-empty ({} chars)", response.len())
584                } else {
585                    "Response is empty".to_string()
586                },
587            }
588        }
589        Assertion::MinLength(min) => {
590            let passed = response.len() >= *min;
591            AssertionResult {
592                assertion: format!("min_length: {}", min),
593                passed,
594                details: if passed {
595                    format!("Response length {} >= {}", response.len(), min)
596                } else {
597                    format!("Response length {} < {}", response.len(), min)
598                },
599            }
600        }
601        Assertion::MaxLength(max) => {
602            let passed = response.len() <= *max;
603            AssertionResult {
604                assertion: format!("max_length: {}", max),
605                passed,
606                details: if passed {
607                    format!("Response length {} <= {}", response.len(), max)
608                } else {
609                    format!("Response length {} > {}", response.len(), max)
610                },
611            }
612        }
613        Assertion::ToolCalled(tool_name) => {
614            let tool_calls = run_trace
615                .map(|t| &t.tool_calls)
616                .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
617            let passed = tool_calls.is_some();
618            AssertionResult {
619                assertion: format!("tool_called: {}", tool_name),
620                passed,
621                details: if passed {
622                    format!("Tool '{}' was called", tool_name)
623                } else {
624                    let all_tools: Vec<&str> = run_trace
625                        .map(|t| {
626                            t.tool_calls
627                                .iter()
628                                .map(|tc| tc.tool_name.as_str())
629                                .collect()
630                        })
631                        .unwrap_or_default();
632                    if all_tools.is_empty() {
633                        format!("Tool '{}' was not called (no tools were called)", tool_name)
634                    } else {
635                        format!(
636                            "Tool '{}' was not called (called: {})",
637                            tool_name,
638                            all_tools.join(", ")
639                        )
640                    }
641                },
642            }
643        }
644        Assertion::ToolNotCalled(tool_name) => {
645            let tool_calls = run_trace
646                .map(|t| &t.tool_calls)
647                .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
648            let passed = tool_calls.is_none();
649            AssertionResult {
650                assertion: format!("tool_not_called: {}", tool_name),
651                passed,
652                details: if passed {
653                    format!("Tool '{}' was not called", tool_name)
654                } else {
655                    format!("Tool '{}' was called but should not have been", tool_name)
656                },
657            }
658        }
659    }
660}
661
662// ── Report Formatting ───────────────────────────────────────────────────────
663
664impl EvalReport {
665    /// Format the report as a human-readable string
666    pub fn format_text(&self) -> String {
667        let mut output = String::new();
668
669        output.push_str(&format!("\n🐦‍⬛ Eval Report: {}\n", self.suite_name));
670        output.push_str(&format!("{:-^60}\n", ""));
671        output.push_str(&format!(
672            "Ran at:       {}\n",
673            &self.ran_at[..19].replace('T', " ")
674        ));
675        output.push_str(&format!("Duration:     {} ms\n", self.duration_ms));
676        output.push_str(&format!(
677            "Overall score: {:.1}%\n",
678            self.overall_score * 100.0
679        ));
680        output.push_str(&format!(
681            "Tasks:        {}/{} passed\n",
682            self.passed_tasks, self.total_tasks
683        ));
684        output.push_str(&format!("{:-^60}\n", ""));
685
686        for result in &self.results {
687            output.push_str(&format!(
688                "\n  {} {} — {:.1}%\n",
689                if result.passed { "✅" } else { "❌" },
690                result.task_name,
691                result.score * 100.0
692            ));
693
694            if let Some(ref error) = result.error {
695                output.push_str(&format!("    Error: {}\n", error));
696            }
697
698            if !result.assertion_results.is_empty() {
699                for ar in &result.assertion_results {
700                    output.push_str(&format!(
701                        "    {} {}\n",
702                        if ar.passed { "  ✅" } else { "  ❌" },
703                        ar.details
704                    ));
705                }
706            }
707
708            // Show trace summary
709            let trace = &result.trace;
710            output.push_str(&format!(
711                "    Iterations: {} · LLM calls: {} · Tool calls: {} · Duration: {} ms\n",
712                trace.iterations,
713                trace.llm_calls.len(),
714                trace.tool_calls.len(),
715                trace.duration_ms
716            ));
717
718            // Show response preview
719            let preview: String = trace.final_response.chars().take(200).collect();
720            if !preview.is_empty() {
721                output.push_str(&format!("    Response: {}\n", preview));
722            }
723        }
724
725        output
726    }
727
728    /// Format the report as JSON
729    pub fn format_json(&self) -> serde_json::Value {
730        serde_json::to_value(self).unwrap_or(serde_json::json!({"error": "serialization failed"}))
731    }
732}
733
734// ── Config Loading ──────────────────────────────────────────────────────────
735
736impl EvalConfig {
737    /// Load eval config from a TOML file
738    pub fn from_file(path: &str) -> Result<Self> {
739        let content = std::fs::read_to_string(path).map_err(|e| {
740            RavenClawsError::CommandExecution(format!("Failed to read eval config: {}", e))
741        })?;
742
743        if content.trim().is_empty() {
744            return Err(RavenClawsError::CommandExecution(format!(
745                "Eval config file '{}' is empty — no tasks to run",
746                path
747            )));
748        }
749
750        let config: EvalConfig = toml::from_str(&content).map_err(|e| {
751            RavenClawsError::CommandExecution(format!("Failed to parse eval config: {}", e))
752        })?;
753
754        if config.tasks.is_empty() {
755            return Err(RavenClawsError::CommandExecution(format!(
756                "Eval config file '{}' has no tasks defined",
757                path
758            )));
759        }
760
761        Ok(config)
762    }
763}
764
765// ── Tests ───────────────────────────────────────────────────────────────────
766
767#[cfg(test)]
768mod tests {
769    use super::*;
770
771    #[test]
772    fn test_assertion_contains_pass() {
773        let result = check_single_assertion(
774            "hello world",
775            &Assertion::Contains("world".to_string()),
776            None,
777        );
778        assert!(result.passed);
779        assert!(result.details.contains("contains"));
780    }
781
782    #[test]
783    fn test_assertion_contains_fail() {
784        let result =
785            check_single_assertion("hello world", &Assertion::Contains("foo".to_string()), None);
786        assert!(!result.passed);
787    }
788
789    #[test]
790    fn test_assertion_not_contains_pass() {
791        let result = check_single_assertion(
792            "hello world",
793            &Assertion::NotContains("foo".to_string()),
794            None,
795        );
796        assert!(result.passed);
797    }
798
799    #[test]
800    fn test_assertion_not_contains_fail() {
801        let result = check_single_assertion(
802            "hello world",
803            &Assertion::NotContains("world".to_string()),
804            None,
805        );
806        assert!(!result.passed);
807    }
808
809    #[test]
810    fn test_assertion_exact_pass() {
811        let result = check_single_assertion("hello", &Assertion::Exact("hello".to_string()), None);
812        assert!(result.passed);
813    }
814
815    #[test]
816    fn test_assertion_exact_fail() {
817        let result = check_single_assertion("world", &Assertion::Exact("hello".to_string()), None);
818        assert!(!result.passed);
819    }
820
821    #[test]
822    fn test_assertion_regex_pass() {
823        let result =
824            check_single_assertion("hello 123", &Assertion::Regex(r"\d+".to_string()), None);
825        assert!(result.passed);
826    }
827
828    #[test]
829    fn test_assertion_regex_fail() {
830        let result = check_single_assertion("hello", &Assertion::Regex(r"\d+".to_string()), None);
831        assert!(!result.passed);
832    }
833
834    #[test]
835    fn test_assertion_non_empty_pass() {
836        let result = check_single_assertion("hello", &Assertion::NonEmpty, None);
837        assert!(result.passed);
838    }
839
840    #[test]
841    fn test_assertion_non_empty_fail() {
842        let result = check_single_assertion("", &Assertion::NonEmpty, None);
843        assert!(!result.passed);
844    }
845
846    #[test]
847    fn test_assertion_min_length_pass() {
848        let result = check_single_assertion("hello", &Assertion::MinLength(3), None);
849        assert!(result.passed);
850    }
851
852    #[test]
853    fn test_assertion_min_length_fail() {
854        let result = check_single_assertion("hi", &Assertion::MinLength(5), None);
855        assert!(!result.passed);
856    }
857
858    #[test]
859    fn test_assertion_max_length_pass() {
860        let result = check_single_assertion("hi", &Assertion::MaxLength(5), None);
861        assert!(result.passed);
862    }
863
864    #[test]
865    fn test_assertion_max_length_fail() {
866        let result = check_single_assertion("hello world", &Assertion::MaxLength(5), None);
867        assert!(!result.passed);
868    }
869
870    #[test]
871    fn test_check_assertions_empty() {
872        let (results, passed, failed) = check_assertions("hello", &[], None);
873        assert!(results.is_empty());
874        assert_eq!(passed, 0);
875        assert_eq!(failed, 0);
876    }
877
878    #[test]
879    fn test_check_assertions_multiple() {
880        let assertions = vec![
881            Assertion::Contains("hello".to_string()),
882            Assertion::Contains("world".to_string()),
883            Assertion::NonEmpty,
884        ];
885        let (results, passed, failed) = check_assertions("hello world", &assertions, None);
886        assert_eq!(passed, 3);
887        assert_eq!(failed, 0);
888        assert_eq!(results.len(), 3);
889    }
890
891    #[test]
892    fn test_check_assertions_tool_called() {
893        let trace = RunTrace {
894            task_name: "test".to_string(),
895            started_at: "2026-01-01T00:00:00Z".to_string(),
896            ended_at: "2026-01-01T00:00:01Z".to_string(),
897            duration_ms: 1000,
898            iterations: 1,
899            steps: vec![],
900            llm_calls: vec![],
901            tool_calls: vec![
902                ToolCallTrace {
903                    iteration: 0,
904                    tool_name: "web_search".to_string(),
905                    arguments: serde_json::json!({"query": "test"}),
906                    success: true,
907                    output_preview: "results".to_string(),
908                    duration_ms: 100,
909                },
910                ToolCallTrace {
911                    iteration: 0,
912                    tool_name: "read_file".to_string(),
913                    arguments: serde_json::json!({"path": "/tmp/test"}),
914                    success: true,
915                    output_preview: "content".to_string(),
916                    duration_ms: 50,
917                },
918            ],
919            final_response: "response".to_string(),
920        };
921
922        // ToolCalled — should pass
923        let (results, passed, failed) = check_assertions(
924            "response",
925            &[Assertion::ToolCalled("web_search".to_string())],
926            Some(&trace),
927        );
928        assert_eq!(passed, 1);
929        assert_eq!(failed, 0);
930        assert!(results[0].passed);
931
932        // ToolCalled — should fail (tool not called)
933        let (results, passed, failed) = check_assertions(
934            "response",
935            &[Assertion::ToolCalled("nonexistent".to_string())],
936            Some(&trace),
937        );
938        assert_eq!(passed, 0);
939        assert_eq!(failed, 1);
940        assert!(!results[0].passed);
941
942        // ToolNotCalled — should pass (tool not in list)
943        let (results, passed, failed) = check_assertions(
944            "response",
945            &[Assertion::ToolNotCalled("nonexistent".to_string())],
946            Some(&trace),
947        );
948        assert_eq!(passed, 1);
949        assert_eq!(failed, 0);
950        assert!(results[0].passed);
951
952        // ToolNotCalled — should fail (tool was called)
953        let (results, passed, failed) = check_assertions(
954            "response",
955            &[Assertion::ToolNotCalled("web_search".to_string())],
956            Some(&trace),
957        );
958        assert_eq!(passed, 0);
959        assert_eq!(failed, 1);
960        assert!(!results[0].passed);
961
962        // ToolCalled with no trace — should fail
963        let (results, passed, failed) = check_assertions(
964            "response",
965            &[Assertion::ToolCalled("web_search".to_string())],
966            None,
967        );
968        assert_eq!(passed, 0);
969        assert_eq!(failed, 1);
970        assert!(!results[0].passed);
971    }
972
973    #[test]
974    fn test_eval_config_from_toml() {
975        let toml_str = r#"
976name = "test-suite"
977description = "A test suite"
978system_prompt = "Be concise"
979max_iterations = 3
980
981[[tasks]]
982name = "test-1"
983prompt = "What is 2+2?"
984golden = "4"
985assertions = [{ type = "contains", value = "4" }]
986weight = 1.0
987required = true
988"#;
989
990        let config: EvalConfig = toml::from_str(toml_str).unwrap();
991        assert_eq!(config.name, "test-suite");
992        assert_eq!(config.tasks.len(), 1);
993        assert_eq!(config.tasks[0].name, "test-1");
994        assert_eq!(config.tasks[0].prompt, "What is 2+2?");
995        assert_eq!(config.tasks[0].golden, "4");
996        assert_eq!(config.tasks[0].assertions.len(), 1);
997    }
998
999    #[test]
1000    fn test_eval_config_defaults() {
1001        let toml_str = r#"
1002[[tasks]]
1003name = "simple"
1004prompt = "Say hello"
1005"#;
1006
1007        let config: EvalConfig = toml::from_str(toml_str).unwrap();
1008        assert_eq!(config.name, "unnamed");
1009        assert_eq!(config.system_prompt, default_system_prompt());
1010        assert_eq!(config.max_iterations, 5);
1011        assert_eq!(config.tasks[0].weight, 1.0);
1012        assert!(!config.tasks[0].required);
1013    }
1014
1015    #[test]
1016    fn test_report_format_text() {
1017        let report = EvalReport {
1018            suite_name: "test".to_string(),
1019            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
1020            duration_ms: 100,
1021            overall_score: 0.75,
1022            total_tasks: 2,
1023            passed_tasks: 1,
1024            failed_tasks: 1,
1025            results: vec![
1026                EvalResult {
1027                    task_name: "pass-task".to_string(),
1028                    passed: true,
1029                    score: 1.0,
1030                    assertions_passed: 2,
1031                    assertions_failed: 0,
1032                    assertion_results: vec![AssertionResult {
1033                        assertion: "contains: hello".to_string(),
1034                        passed: true,
1035                        details: "Response contains 'hello'".to_string(),
1036                    }],
1037                    trace: RunTrace {
1038                        task_name: "pass-task".to_string(),
1039                        started_at: "2026-06-22T12:00:00+00:00".to_string(),
1040                        ended_at: "2026-06-22T12:00:01+00:00".to_string(),
1041                        duration_ms: 50,
1042                        iterations: 1,
1043                        steps: vec![],
1044                        llm_calls: vec![],
1045                        tool_calls: vec![],
1046                        final_response: "hello world".to_string(),
1047                    },
1048                    error: None,
1049                },
1050                EvalResult {
1051                    task_name: "fail-task".to_string(),
1052                    passed: false,
1053                    score: 0.0,
1054                    assertions_passed: 0,
1055                    assertions_failed: 1,
1056                    assertion_results: vec![AssertionResult {
1057                        assertion: "contains: foo".to_string(),
1058                        passed: false,
1059                        details: "Response does not contain 'foo'".to_string(),
1060                    }],
1061                    trace: RunTrace {
1062                        task_name: "fail-task".to_string(),
1063                        started_at: "2026-06-22T12:00:01+00:00".to_string(),
1064                        ended_at: "2026-06-22T12:00:02+00:00".to_string(),
1065                        duration_ms: 50,
1066                        iterations: 1,
1067                        steps: vec![],
1068                        llm_calls: vec![],
1069                        tool_calls: vec![],
1070                        final_response: "bar".to_string(),
1071                    },
1072                    error: None,
1073                },
1074            ],
1075        };
1076
1077        let text = report.format_text();
1078        assert!(text.contains("Eval Report: test"));
1079        assert!(text.contains("75.0%"));
1080        assert!(text.contains("1/2 passed"));
1081        assert!(text.contains("✅ pass-task"));
1082        assert!(text.contains("❌ fail-task"));
1083    }
1084
1085    #[test]
1086    fn test_report_format_json() {
1087        let report = EvalReport {
1088            suite_name: "test".to_string(),
1089            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
1090            duration_ms: 100,
1091            overall_score: 1.0,
1092            total_tasks: 1,
1093            passed_tasks: 1,
1094            failed_tasks: 0,
1095            results: vec![],
1096        };
1097
1098        let json = report.format_json();
1099        assert_eq!(json["suite_name"], "test");
1100        assert_eq!(json["overall_score"], 1.0);
1101    }
1102
1103    #[test]
1104    fn test_eval_config_from_file_not_found() {
1105        let result = EvalConfig::from_file("/tmp/nonexistent-eval-config.toml");
1106        assert!(result.is_err());
1107    }
1108
1109    #[test]
1110    fn test_assertion_regex_invalid_pattern() {
1111        let result =
1112            check_single_assertion("hello", &Assertion::Regex(r"[invalid".to_string()), None);
1113        assert!(!result.passed);
1114        assert!(result.details.contains("Invalid regex"));
1115    }
1116
1117    #[test]
1118    fn test_trace_step_serialization() {
1119        let step = TraceStep {
1120            number: 0,
1121            step_type: StepType::Thought,
1122            content: "test".to_string(),
1123            duration_ms: 100,
1124        };
1125        let json = serde_json::to_string(&step).unwrap();
1126        assert!(json.contains("Thought"));
1127    }
1128
1129    #[test]
1130    fn test_tool_call_trace_serialization() {
1131        let trace = ToolCallTrace {
1132            iteration: 0,
1133            tool_name: "shell_exec".to_string(),
1134            arguments: serde_json::json!({"command": "echo hello"}),
1135            success: true,
1136            output_preview: "hello".to_string(),
1137            duration_ms: 50,
1138        };
1139        let json = serde_json::to_string(&trace).unwrap();
1140        assert!(json.contains("shell_exec"));
1141        assert!(json.contains("echo hello"));
1142    }
1143}
ravenclaws/eval.rs

ravenclaws/
eval.rs