1use crate::agent::{run_agent_loop, AgentLoopConfig};
26use crate::error::{RavenClawsError, Result};
27use crate::llm::LLMProviderTrait;
28use serde::{Deserialize, Serialize};
29use std::sync::Arc;
30use tracing::{info, instrument, warn};
31
/// Configuration for one evaluation suite.
///
/// Every field carries a serde default, so a config document may omit any of
/// them and still deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalConfig {
    /// Suite name; filled in by `default_suite_name()` when omitted.
    #[serde(default = "default_suite_name")]
    pub name: String,
    /// Free-form description of the suite; empty when omitted.
    #[serde(default)]
    pub description: String,
    /// System prompt passed to the agent loop for every task; filled in by
    /// `default_system_prompt()` when omitted.
    #[serde(default = "default_system_prompt")]
    pub system_prompt: String,
    /// Iteration cap forwarded to the agent loop configuration; filled in by
    /// `default_max_iterations()` when omitted.
    #[serde(default = "default_max_iterations")]
    pub max_iterations: usize,
    /// Tasks that make up the suite; empty when omitted.
    #[serde(default)]
    pub tasks: Vec<EvalTask>,
}
53
/// Serde default for `EvalConfig::name`.
fn default_suite_name() -> String {
    String::from("unnamed")
}
57
/// Serde default for `EvalConfig::system_prompt`.
fn default_system_prompt() -> String {
    String::from("You are a helpful assistant. Be concise and accurate.")
}
61
/// Serde default for `EvalConfig::max_iterations`.
fn default_max_iterations() -> usize {
    const DEFAULT_MAX_ITERATIONS: usize = 5;
    DEFAULT_MAX_ITERATIONS
}
65
/// A single evaluation task: one prompt plus the checks applied to the reply.
///
/// `name` and `prompt` are mandatory; every other field has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalTask {
    /// Task name; copied into the task's result and trace.
    pub name: String,
    /// Free-form description of the task; empty when omitted.
    #[serde(default)]
    pub description: String,
    /// User prompt sent to the agent loop.
    pub prompt: String,
    /// Reference ("golden") answer; empty when omitted.
    // NOTE(review): nothing in the visible code reads this field — confirm
    // whether it is informational only.
    #[serde(default)]
    pub golden: String,
    /// Assertions evaluated against the agent's final response; empty when
    /// omitted.
    #[serde(default)]
    pub assertions: Vec<Assertion>,
    /// Relative weight of the task; filled in by `default_weight()` when
    /// omitted.
    #[serde(default = "default_weight")]
    pub weight: f64,
    /// Marks the task as required; `false` when omitted.
    // NOTE(review): nothing in the visible code enforces this flag — confirm
    // where it is consumed.
    #[serde(default)]
    pub required: bool,
}
89
/// Serde default for `EvalTask::weight`.
fn default_weight() -> f64 {
    const UNIT_WEIGHT: f64 = 1.0;
    UNIT_WEIGHT
}
93
/// A check applied to an agent run.
///
/// Serialized adjacently tagged: the variant goes in `type` and its payload
/// (if any) in `value`, e.g. `{ type = "contains", value = "4" }`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", content = "value")]
pub enum Assertion {
    /// Passes when the response contains the given substring.
    #[serde(rename = "contains")]
    Contains(String),
    /// Passes when the response does not contain the given substring.
    #[serde(rename = "not_contains")]
    NotContains(String),
    /// Passes when the response, with surrounding whitespace trimmed, equals
    /// the given string.
    #[serde(rename = "exact")]
    Exact(String),
    /// Passes when the response matches the given regular expression; a
    /// pattern that fails to compile makes the assertion fail.
    #[serde(rename = "regex")]
    Regex(String),
    /// Passes when the response is not the empty string.
    #[serde(rename = "non_empty")]
    NonEmpty,
    /// Passes when the response length is at least the given value.
    #[serde(rename = "min_length")]
    MinLength(usize),
    /// Passes when the response length is at most the given value.
    #[serde(rename = "max_length")]
    MaxLength(usize),
    /// Passes when the run trace records a call to the named tool; fails when
    /// no trace is available.
    #[serde(rename = "tool_called")]
    ToolCalled(String),
    /// Passes when the run trace records no call to the named tool (including
    /// when no trace is available).
    #[serde(rename = "tool_not_called")]
    ToolNotCalled(String),
}
126
/// Record of a single task run, attached to that task's `EvalResult`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunTrace {
    /// Name of the task this trace belongs to.
    pub task_name: String,
    /// Start timestamp as a string; the runner writes an RFC 3339 UTC time.
    pub started_at: String,
    /// End timestamp as a string; the runner writes an RFC 3339 UTC time.
    pub ended_at: String,
    /// Wall-clock duration of the run in milliseconds.
    pub duration_ms: u64,
    /// Iteration count for the run.
    // NOTE(review): the runner in this file stores the configured maximum on
    // success and 0 on failure rather than a measured count — confirm.
    pub iterations: usize,
    /// Recorded steps of the run.
    pub steps: Vec<TraceStep>,
    /// Recorded LLM calls; the runner in this file leaves this empty.
    pub llm_calls: Vec<LlmCallTrace>,
    /// Recorded tool calls; consulted by the tool-call assertions. The runner
    /// in this file leaves this empty.
    pub tool_calls: Vec<ToolCallTrace>,
    /// Final response text of the agent; empty when the run failed.
    pub final_response: String,
}
151
/// One step inside a `RunTrace`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceStep {
    /// Step number within the trace.
    pub number: usize,
    /// Kind of step.
    pub step_type: StepType,
    /// Text associated with the step (e.g. the final response or an error
    /// message).
    pub content: String,
    /// Duration attributed to the step, in milliseconds.
    pub duration_ms: u64,
}
164
/// Kind of a `TraceStep`.
///
/// Serialized under the variant name as written (no serde renaming).
// NOTE(review): only `Final` and `Error` are produced by the visible runner;
// the meaning of the other variants is inferred from their names.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum StepType {
    /// Intermediate reasoning step.
    Thought,
    /// Tool invocation step.
    ToolCall,
    /// Observation step.
    Observation,
    /// Final answer of a successful run.
    Final,
    /// Failure of the run.
    Error,
}
179
/// Record of one LLM call inside a run.
// NOTE(review): nothing in the visible code constructs this type; the field
// descriptions below are inferred from names and types — confirm with the
// producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmCallTrace {
    /// Iteration in which the call was made.
    pub iteration: usize,
    /// Provider name.
    pub provider: String,
    /// Model name.
    pub model: String,
    /// Prompt token count, if available.
    pub prompt_tokens: Option<u32>,
    /// Completion token count, if available.
    pub completion_tokens: Option<u32>,
    /// Call duration in milliseconds.
    pub duration_ms: u64,
    /// Preview of the response text.
    pub response_preview: String,
}
198
/// Record of one tool call inside a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCallTrace {
    /// Iteration in which the tool was called.
    pub iteration: usize,
    /// Tool name; this is what the tool-call assertions compare against.
    pub tool_name: String,
    /// Arguments of the call as arbitrary JSON.
    pub arguments: serde_json::Value,
    /// Whether the call succeeded.
    pub success: bool,
    /// Preview of the tool output.
    pub output_preview: String,
    /// Call duration in milliseconds.
    pub duration_ms: u64,
}
215
/// Outcome of running one `EvalTask`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
    /// Name of the task that was run.
    pub task_name: String,
    /// Whether the task passed.
    pub passed: bool,
    /// Task score; the runner produces values between 0.0 and 1.0 and the
    /// text report prints it as a percentage.
    pub score: f64,
    /// Number of assertions that passed.
    pub assertions_passed: usize,
    /// Number of assertions that failed.
    pub assertions_failed: usize,
    /// Per-assertion outcomes.
    pub assertion_results: Vec<AssertionResult>,
    /// Trace of the run.
    pub trace: RunTrace,
    /// Error message when the agent loop itself failed; `None` otherwise.
    pub error: Option<String>,
}
238
/// Outcome of a single assertion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AssertionResult {
    /// Label of the assertion, e.g. `contains: hello`.
    pub assertion: String,
    /// Whether the assertion held.
    pub passed: bool,
    /// Human-readable explanation of the outcome; shown in the text report.
    pub details: String,
}
249
/// Aggregated outcome of a whole evaluation suite.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// Name of the suite that was run.
    pub suite_name: String,
    /// Timestamp string for the run; the runner writes the RFC 3339 UTC time
    /// at which the suite started.
    pub ran_at: String,
    /// Wall-clock duration of the whole suite in milliseconds.
    pub duration_ms: u64,
    /// Aggregate score; the text report prints it as a percentage.
    pub overall_score: f64,
    /// Number of tasks that were run.
    pub total_tasks: usize,
    /// Number of tasks that passed.
    pub passed_tasks: usize,
    /// Number of tasks that did not pass.
    pub failed_tasks: usize,
    /// Per-task results, in the order the tasks were run.
    pub results: Vec<EvalResult>,
}
270
/// Runs the tasks of an `EvalConfig` against an LLM provider.
pub struct EvalRunner {
    /// Provider handed to the agent loop for every task.
    llm: Arc<dyn LLMProviderTrait>,
    /// Suite configuration (system prompt, iteration cap, tasks).
    config: EvalConfig,
}
280
281impl EvalRunner {
282 pub fn new(llm: Arc<dyn LLMProviderTrait>, config: EvalConfig) -> Self {
284 Self { llm, config }
285 }
286
287 #[instrument(skip(self), fields(suite = %self.config.name, task_count = self.config.tasks.len()))]
289 pub async fn run_suite(&self) -> EvalReport {
290 let started_at = chrono::Utc::now().to_rfc3339();
291 let suite_start = std::time::Instant::now();
292 let mut results = Vec::with_capacity(self.config.tasks.len());
293
294 info!(
295 suite = %self.config.name,
296 task_count = self.config.tasks.len(),
297 "Starting eval suite"
298 );
299
300 for task in &self.config.tasks {
301 let result = self.run_task(task).await;
302 let passed = result.passed;
303 let name = &result.task_name;
304
305 if passed {
306 info!(task = %name, score = result.score, "Eval task passed");
307 } else {
308 warn!(
309 task = %name,
310 score = result.score,
311 passed = result.assertions_passed,
312 failed = result.assertions_failed,
313 "Eval task failed"
314 );
315 }
316
317 results.push(result);
318 }
319
320 let duration_ms = suite_start.elapsed().as_millis() as u64;
321 let total_tasks = results.len();
322 let passed_tasks = results.iter().filter(|r| r.passed).count();
323 let failed_tasks = total_tasks - passed_tasks;
324 let overall_score = if total_tasks > 0 {
325 results
326 .iter()
327 .map(|r| r.score * r.trace.iterations as f64)
328 .sum::<f64>()
329 / results
330 .iter()
331 .map(|r| r.trace.iterations as f64)
332 .sum::<f64>()
333 } else {
334 0.0
335 };
336
337 info!(
338 suite = %self.config.name,
339 passed = passed_tasks,
340 failed = failed_tasks,
341 overall_score = overall_score,
342 duration_ms = duration_ms,
343 "Eval suite completed"
344 );
345
346 EvalReport {
347 suite_name: self.config.name.clone(),
348 ran_at: started_at,
349 duration_ms,
350 overall_score,
351 total_tasks,
352 passed_tasks,
353 failed_tasks,
354 results,
355 }
356 }
357
358 #[instrument(skip(self), fields(task = %task.name))]
364 async fn run_task(&self, task: &EvalTask) -> EvalResult {
365 let task_start = std::time::Instant::now();
366 let started_at = chrono::Utc::now().to_rfc3339();
367
368 let agent_config = AgentLoopConfig {
370 max_iterations: self.config.max_iterations,
371 enable_tools: true,
372 require_approval: false,
373 prompt_injection_protection: true,
374 token_lifetime_secs: 0,
375 no_final_required: false,
376 fallback_chain: None,
377 token_budget: None,
378 ravenfabric: None,
379 checkpoint_dir: None,
380 session_id: None,
381 metrics_callback: None,
382 };
383
384 let result = run_agent_loop(
386 self.llm.clone(),
387 &task.prompt,
388 &self.config.system_prompt,
389 agent_config,
390 )
391 .await;
392
393 let duration_ms = task_start.elapsed().as_millis() as u64;
394
395 match result {
396 Ok(final_response) => {
397 let trace = RunTrace {
398 task_name: task.name.clone(),
399 started_at,
400 ended_at: chrono::Utc::now().to_rfc3339(),
401 duration_ms,
402 iterations: self.config.max_iterations, steps: vec![TraceStep {
404 number: 0,
405 step_type: StepType::Final,
406 content: final_response.clone(),
407 duration_ms,
408 }],
409 llm_calls: Vec::new(), tool_calls: Vec::new(), final_response: final_response.clone(),
412 };
413
414 let (assertion_results, assertions_passed, assertions_failed) =
416 check_assertions(&final_response, &task.assertions, Some(&trace));
417
418 let score = if task.assertions.is_empty() {
420 if final_response.is_empty() || final_response.len() < 10 {
421 0.0
422 } else {
423 1.0
424 }
425 } else if task.assertions.len() == assertions_passed + assertions_failed {
426 assertions_passed as f64 / task.assertions.len() as f64
427 } else {
428 0.0
429 };
430
431 let passed = assertions_failed == 0 && !final_response.is_empty();
432
433 EvalResult {
434 task_name: task.name.clone(),
435 passed,
436 score,
437 assertions_passed,
438 assertions_failed,
439 assertion_results,
440 trace,
441 error: None,
442 }
443 }
444 Err(e) => {
445 let trace = RunTrace {
446 task_name: task.name.clone(),
447 started_at,
448 ended_at: chrono::Utc::now().to_rfc3339(),
449 duration_ms,
450 iterations: 0,
451 steps: vec![TraceStep {
452 number: 0,
453 step_type: StepType::Error,
454 content: format!("Agent loop failed: {}", e),
455 duration_ms,
456 }],
457 llm_calls: Vec::new(),
458 tool_calls: Vec::new(),
459 final_response: String::new(),
460 };
461
462 EvalResult {
463 task_name: task.name.clone(),
464 passed: false,
465 score: 0.0,
466 assertions_passed: 0,
467 assertions_failed: 1,
468 assertion_results: vec![AssertionResult {
469 assertion: "agent_loop".to_string(),
470 passed: false,
471 details: format!("Agent loop failed: {}", e),
472 }],
473 trace,
474 error: Some(e.to_string()),
475 }
476 }
477 }
478 }
479}
480
481fn check_assertions(
485 response: &str,
486 assertions: &[Assertion],
487 run_trace: Option<&RunTrace>,
488) -> (Vec<AssertionResult>, usize, usize) {
489 let mut results = Vec::with_capacity(assertions.len());
490 let mut passed = 0;
491 let mut failed = 0;
492
493 for assertion in assertions {
494 let result = check_single_assertion(response, assertion, run_trace);
495 if result.passed {
496 passed += 1;
497 } else {
498 failed += 1;
499 }
500 results.push(result);
501 }
502
503 (results, passed, failed)
504}
505
506fn check_single_assertion(
508 response: &str,
509 assertion: &Assertion,
510 run_trace: Option<&RunTrace>,
511) -> AssertionResult {
512 match assertion {
513 Assertion::Contains(pattern) => {
514 let passed = response.contains(pattern);
515 AssertionResult {
516 assertion: format!("contains: {}", pattern),
517 passed,
518 details: if passed {
519 format!("Response contains '{}'", pattern)
520 } else {
521 format!("Response does not contain '{}'", pattern)
522 },
523 }
524 }
525 Assertion::NotContains(pattern) => {
526 let passed = !response.contains(pattern);
527 AssertionResult {
528 assertion: format!("not_contains: {}", pattern),
529 passed,
530 details: if passed {
531 format!("Response does not contain '{}'", pattern)
532 } else {
533 format!("Response contains '{}'", pattern)
534 },
535 }
536 }
537 Assertion::Exact(expected) => {
538 let trimmed_response = response.trim();
539 let passed = trimmed_response == expected.as_str();
540 AssertionResult {
541 assertion: format!("exact: {}", expected),
542 passed,
543 details: if passed {
544 "Response matches exactly".to_string()
545 } else {
546 format!(
547 "Expected '{}', got '{}'",
548 expected,
549 trimmed_response.chars().take(100).collect::<String>()
550 )
551 },
552 }
553 }
554 Assertion::Regex(pattern) => {
555 let re = regex_lite::Regex::new(pattern);
556 match re {
557 Ok(re) => {
558 let passed = re.is_match(response);
559 AssertionResult {
560 assertion: format!("regex: {}", pattern),
561 passed,
562 details: if passed {
563 format!("Response matches pattern '{}'", pattern)
564 } else {
565 format!("Response does not match pattern '{}'", pattern)
566 },
567 }
568 }
569 Err(e) => AssertionResult {
570 assertion: format!("regex: {}", pattern),
571 passed: false,
572 details: format!("Invalid regex pattern: {}", e),
573 },
574 }
575 }
576 Assertion::NonEmpty => {
577 let passed = !response.is_empty();
578 AssertionResult {
579 assertion: "non_empty".to_string(),
580 passed,
581 details: if passed {
582 format!("Response is non-empty ({} chars)", response.len())
583 } else {
584 "Response is empty".to_string()
585 },
586 }
587 }
588 Assertion::MinLength(min) => {
589 let passed = response.len() >= *min;
590 AssertionResult {
591 assertion: format!("min_length: {}", min),
592 passed,
593 details: if passed {
594 format!("Response length {} >= {}", response.len(), min)
595 } else {
596 format!("Response length {} < {}", response.len(), min)
597 },
598 }
599 }
600 Assertion::MaxLength(max) => {
601 let passed = response.len() <= *max;
602 AssertionResult {
603 assertion: format!("max_length: {}", max),
604 passed,
605 details: if passed {
606 format!("Response length {} <= {}", response.len(), max)
607 } else {
608 format!("Response length {} > {}", response.len(), max)
609 },
610 }
611 }
612 Assertion::ToolCalled(tool_name) => {
613 let tool_calls = run_trace
614 .map(|t| &t.tool_calls)
615 .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
616 let passed = tool_calls.is_some();
617 AssertionResult {
618 assertion: format!("tool_called: {}", tool_name),
619 passed,
620 details: if passed {
621 format!("Tool '{}' was called", tool_name)
622 } else {
623 let all_tools: Vec<&str> = run_trace
624 .map(|t| {
625 t.tool_calls
626 .iter()
627 .map(|tc| tc.tool_name.as_str())
628 .collect()
629 })
630 .unwrap_or_default();
631 if all_tools.is_empty() {
632 format!("Tool '{}' was not called (no tools were called)", tool_name)
633 } else {
634 format!(
635 "Tool '{}' was not called (called: {})",
636 tool_name,
637 all_tools.join(", ")
638 )
639 }
640 },
641 }
642 }
643 Assertion::ToolNotCalled(tool_name) => {
644 let tool_calls = run_trace
645 .map(|t| &t.tool_calls)
646 .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
647 let passed = tool_calls.is_none();
648 AssertionResult {
649 assertion: format!("tool_not_called: {}", tool_name),
650 passed,
651 details: if passed {
652 format!("Tool '{}' was not called", tool_name)
653 } else {
654 format!("Tool '{}' was called but should not have been", tool_name)
655 },
656 }
657 }
658 }
659}
660
661impl EvalReport {
664 pub fn format_text(&self) -> String {
666 let mut output = String::new();
667
668 output.push_str(&format!("\n🐦⬛ Eval Report: {}\n", self.suite_name));
669 output.push_str(&format!("{:-^60}\n", ""));
670 output.push_str(&format!(
671 "Ran at: {}\n",
672 &self.ran_at[..19].replace('T', " ")
673 ));
674 output.push_str(&format!("Duration: {} ms\n", self.duration_ms));
675 output.push_str(&format!(
676 "Overall score: {:.1}%\n",
677 self.overall_score * 100.0
678 ));
679 output.push_str(&format!(
680 "Tasks: {}/{} passed\n",
681 self.passed_tasks, self.total_tasks
682 ));
683 output.push_str(&format!("{:-^60}\n", ""));
684
685 for result in &self.results {
686 output.push_str(&format!(
687 "\n {} {} — {:.1}%\n",
688 if result.passed { "✅" } else { "❌" },
689 result.task_name,
690 result.score * 100.0
691 ));
692
693 if let Some(ref error) = result.error {
694 output.push_str(&format!(" Error: {}\n", error));
695 }
696
697 if !result.assertion_results.is_empty() {
698 for ar in &result.assertion_results {
699 output.push_str(&format!(
700 " {} {}\n",
701 if ar.passed { " ✅" } else { " ❌" },
702 ar.details
703 ));
704 }
705 }
706
707 let trace = &result.trace;
709 output.push_str(&format!(
710 " Iterations: {} · LLM calls: {} · Tool calls: {} · Duration: {} ms\n",
711 trace.iterations,
712 trace.llm_calls.len(),
713 trace.tool_calls.len(),
714 trace.duration_ms
715 ));
716
717 let preview: String = trace.final_response.chars().take(200).collect();
719 if !preview.is_empty() {
720 output.push_str(&format!(" Response: {}\n", preview));
721 }
722 }
723
724 output
725 }
726
727 pub fn format_json(&self) -> serde_json::Value {
729 serde_json::to_value(self).unwrap_or(serde_json::json!({"error": "serialization failed"}))
730 }
731}
732
733impl EvalConfig {
736 pub fn from_file(path: &str) -> Result<Self> {
738 let content = std::fs::read_to_string(path).map_err(|e| {
739 RavenClawsError::CommandExecution(format!("Failed to read eval config: {}", e))
740 })?;
741
742 if content.trim().is_empty() {
743 return Err(RavenClawsError::CommandExecution(format!(
744 "Eval config file '{}' is empty — no tasks to run",
745 path
746 )));
747 }
748
749 let config: EvalConfig = toml::from_str(&content).map_err(|e| {
750 RavenClawsError::CommandExecution(format!("Failed to parse eval config: {}", e))
751 })?;
752
753 if config.tasks.is_empty() {
754 return Err(RavenClawsError::CommandExecution(format!(
755 "Eval config file '{}' has no tasks defined",
756 path
757 )));
758 }
759
760 Ok(config)
761 }
762}
763
// Unit tests for assertion checking, config parsing, report formatting and
// trace serialization. None of them invoke the agent loop or an LLM.
#[cfg(test)]
mod tests {
    use super::*;

    // `contains` passes when the substring is present and says so in `details`.
    #[test]
    fn test_assertion_contains_pass() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::Contains("world".to_string()),
            None,
        );
        assert!(result.passed);
        assert!(result.details.contains("contains"));
    }

    // `contains` fails when the substring is absent.
    #[test]
    fn test_assertion_contains_fail() {
        let result =
            check_single_assertion("hello world", &Assertion::Contains("foo".to_string()), None);
        assert!(!result.passed);
    }

    // `not_contains` passes when the substring is absent.
    #[test]
    fn test_assertion_not_contains_pass() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::NotContains("foo".to_string()),
            None,
        );
        assert!(result.passed);
    }

    // `not_contains` fails when the substring is present.
    #[test]
    fn test_assertion_not_contains_fail() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::NotContains("world".to_string()),
            None,
        );
        assert!(!result.passed);
    }

    // `exact` passes on an identical response.
    #[test]
    fn test_assertion_exact_pass() {
        let result = check_single_assertion("hello", &Assertion::Exact("hello".to_string()), None);
        assert!(result.passed);
    }

    // `exact` fails on a different response.
    #[test]
    fn test_assertion_exact_fail() {
        let result = check_single_assertion("world", &Assertion::Exact("hello".to_string()), None);
        assert!(!result.passed);
    }

    // `regex` passes when the pattern matches somewhere in the response.
    #[test]
    fn test_assertion_regex_pass() {
        let result =
            check_single_assertion("hello 123", &Assertion::Regex(r"\d+".to_string()), None);
        assert!(result.passed);
    }

    // `regex` fails when the pattern matches nowhere.
    #[test]
    fn test_assertion_regex_fail() {
        let result = check_single_assertion("hello", &Assertion::Regex(r"\d+".to_string()), None);
        assert!(!result.passed);
    }

    // `non_empty` passes on any non-empty response.
    #[test]
    fn test_assertion_non_empty_pass() {
        let result = check_single_assertion("hello", &Assertion::NonEmpty, None);
        assert!(result.passed);
    }

    // `non_empty` fails on the empty string.
    #[test]
    fn test_assertion_non_empty_fail() {
        let result = check_single_assertion("", &Assertion::NonEmpty, None);
        assert!(!result.passed);
    }

    // `min_length` passes when the response is long enough.
    #[test]
    fn test_assertion_min_length_pass() {
        let result = check_single_assertion("hello", &Assertion::MinLength(3), None);
        assert!(result.passed);
    }

    // `min_length` fails when the response is too short.
    #[test]
    fn test_assertion_min_length_fail() {
        let result = check_single_assertion("hi", &Assertion::MinLength(5), None);
        assert!(!result.passed);
    }

    // `max_length` passes when the response is short enough.
    #[test]
    fn test_assertion_max_length_pass() {
        let result = check_single_assertion("hi", &Assertion::MaxLength(5), None);
        assert!(result.passed);
    }

    // `max_length` fails when the response is too long.
    #[test]
    fn test_assertion_max_length_fail() {
        let result = check_single_assertion("hello world", &Assertion::MaxLength(5), None);
        assert!(!result.passed);
    }

    // An empty assertion list yields no results and zero counts.
    #[test]
    fn test_check_assertions_empty() {
        let (results, passed, failed) = check_assertions("hello", &[], None);
        assert!(results.is_empty());
        assert_eq!(passed, 0);
        assert_eq!(failed, 0);
    }

    // Several passing assertions are all counted and all reported.
    #[test]
    fn test_check_assertions_multiple() {
        let assertions = vec![
            Assertion::Contains("hello".to_string()),
            Assertion::Contains("world".to_string()),
            Assertion::NonEmpty,
        ];
        let (results, passed, failed) = check_assertions("hello world", &assertions, None);
        assert_eq!(passed, 3);
        assert_eq!(failed, 0);
        assert_eq!(results.len(), 3);
    }

    // Tool-call assertions against a trace that recorded `web_search` and
    // `read_file`, plus the no-trace case.
    #[test]
    fn test_check_assertions_tool_called() {
        let trace = RunTrace {
            task_name: "test".to_string(),
            started_at: "2026-01-01T00:00:00Z".to_string(),
            ended_at: "2026-01-01T00:00:01Z".to_string(),
            duration_ms: 1000,
            iterations: 1,
            steps: vec![],
            llm_calls: vec![],
            tool_calls: vec![
                ToolCallTrace {
                    iteration: 0,
                    tool_name: "web_search".to_string(),
                    arguments: serde_json::json!({"query": "test"}),
                    success: true,
                    output_preview: "results".to_string(),
                    duration_ms: 100,
                },
                ToolCallTrace {
                    iteration: 0,
                    tool_name: "read_file".to_string(),
                    arguments: serde_json::json!({"path": "/tmp/test"}),
                    success: true,
                    output_preview: "content".to_string(),
                    duration_ms: 50,
                },
            ],
            final_response: "response".to_string(),
        };

        // A recorded tool satisfies `tool_called`.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("web_search".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 1);
        assert_eq!(failed, 0);
        assert!(results[0].passed);

        // An unrecorded tool fails `tool_called`.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("nonexistent".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);

        // An unrecorded tool satisfies `tool_not_called`.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolNotCalled("nonexistent".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 1);
        assert_eq!(failed, 0);
        assert!(results[0].passed);

        // A recorded tool fails `tool_not_called`.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolNotCalled("web_search".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);

        // Without a trace, `tool_called` cannot be satisfied.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("web_search".to_string())],
            None,
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);
    }

    // A fully specified TOML document deserializes into the expected config.
    #[test]
    fn test_eval_config_from_toml() {
        let toml_str = r#"
name = "test-suite"
description = "A test suite"
system_prompt = "Be concise"
max_iterations = 3

[[tasks]]
name = "test-1"
prompt = "What is 2+2?"
golden = "4"
assertions = [{ type = "contains", value = "4" }]
weight = 1.0
required = true
"#;

        let config: EvalConfig = toml::from_str(toml_str).unwrap();
        assert_eq!(config.name, "test-suite");
        assert_eq!(config.tasks.len(), 1);
        assert_eq!(config.tasks[0].name, "test-1");
        assert_eq!(config.tasks[0].prompt, "What is 2+2?");
        assert_eq!(config.tasks[0].golden, "4");
        assert_eq!(config.tasks[0].assertions.len(), 1);
    }

    // Omitted fields fall back to their serde defaults.
    #[test]
    fn test_eval_config_defaults() {
        let toml_str = r#"
[[tasks]]
name = "simple"
prompt = "Say hello"
"#;

        let config: EvalConfig = toml::from_str(toml_str).unwrap();
        assert_eq!(config.name, "unnamed");
        assert_eq!(config.system_prompt, default_system_prompt());
        assert_eq!(config.max_iterations, 5);
        assert_eq!(config.tasks[0].weight, 1.0);
        assert!(!config.tasks[0].required);
    }

    // The text report shows the suite name, score, pass count and per-task
    // pass/fail markers.
    #[test]
    fn test_report_format_text() {
        let report = EvalReport {
            suite_name: "test".to_string(),
            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
            duration_ms: 100,
            overall_score: 0.75,
            total_tasks: 2,
            passed_tasks: 1,
            failed_tasks: 1,
            results: vec![
                EvalResult {
                    task_name: "pass-task".to_string(),
                    passed: true,
                    score: 1.0,
                    assertions_passed: 2,
                    assertions_failed: 0,
                    assertion_results: vec![AssertionResult {
                        assertion: "contains: hello".to_string(),
                        passed: true,
                        details: "Response contains 'hello'".to_string(),
                    }],
                    trace: RunTrace {
                        task_name: "pass-task".to_string(),
                        started_at: "2026-06-22T12:00:00+00:00".to_string(),
                        ended_at: "2026-06-22T12:00:01+00:00".to_string(),
                        duration_ms: 50,
                        iterations: 1,
                        steps: vec![],
                        llm_calls: vec![],
                        tool_calls: vec![],
                        final_response: "hello world".to_string(),
                    },
                    error: None,
                },
                EvalResult {
                    task_name: "fail-task".to_string(),
                    passed: false,
                    score: 0.0,
                    assertions_passed: 0,
                    assertions_failed: 1,
                    assertion_results: vec![AssertionResult {
                        assertion: "contains: foo".to_string(),
                        passed: false,
                        details: "Response does not contain 'foo'".to_string(),
                    }],
                    trace: RunTrace {
                        task_name: "fail-task".to_string(),
                        started_at: "2026-06-22T12:00:01+00:00".to_string(),
                        ended_at: "2026-06-22T12:00:02+00:00".to_string(),
                        duration_ms: 50,
                        iterations: 1,
                        steps: vec![],
                        llm_calls: vec![],
                        tool_calls: vec![],
                        final_response: "bar".to_string(),
                    },
                    error: None,
                },
            ],
        };

        let text = report.format_text();
        assert!(text.contains("Eval Report: test"));
        assert!(text.contains("75.0%"));
        assert!(text.contains("1/2 passed"));
        assert!(text.contains("✅ pass-task"));
        assert!(text.contains("❌ fail-task"));
    }

    // The JSON report exposes the struct fields under their own names.
    #[test]
    fn test_report_format_json() {
        let report = EvalReport {
            suite_name: "test".to_string(),
            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
            duration_ms: 100,
            overall_score: 1.0,
            total_tasks: 1,
            passed_tasks: 1,
            failed_tasks: 0,
            results: vec![],
        };

        let json = report.format_json();
        assert_eq!(json["suite_name"], "test");
        assert_eq!(json["overall_score"], 1.0);
    }

    // Loading a missing file is reported as an error rather than a panic.
    #[test]
    fn test_eval_config_from_file_not_found() {
        let result = EvalConfig::from_file("/tmp/nonexistent-eval-config.toml");
        assert!(result.is_err());
    }

    // A pattern that does not compile fails the assertion with an explanation.
    #[test]
    fn test_assertion_regex_invalid_pattern() {
        let result =
            check_single_assertion("hello", &Assertion::Regex(r"[invalid".to_string()), None);
        assert!(!result.passed);
        assert!(result.details.contains("Invalid regex"));
    }

    // `StepType` serializes under its variant name.
    #[test]
    fn test_trace_step_serialization() {
        let step = TraceStep {
            number: 0,
            step_type: StepType::Thought,
            content: "test".to_string(),
            duration_ms: 100,
        };
        let json = serde_json::to_string(&step).unwrap();
        assert!(json.contains("Thought"));
    }

    // Tool name and arguments both appear in the serialized tool-call trace.
    #[test]
    fn test_tool_call_trace_serialization() {
        let trace = ToolCallTrace {
            iteration: 0,
            tool_name: "shell_exec".to_string(),
            arguments: serde_json::json!({"command": "echo hello"}),
            success: true,
            output_preview: "hello".to_string(),
            duration_ms: 50,
        };
        let json = serde_json::to_string(&trace).unwrap();
        assert!(json.contains("shell_exec"));
        assert!(json.contains("echo hello"));
    }
}