use crate::agent::{run_agent_loop, AgentLoopConfig};
use crate::error::{RavenClawsError, Result};
use crate::llm::LLMProviderTrait;
use serde::{Deserialize, Serialize};
use std::fmt::Write as _;
use std::sync::Arc;
use tracing::{info, instrument, warn};

32#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct EvalConfig {
37 #[serde(default = "default_suite_name")]
39 pub name: String,
40 #[serde(default)]
42 pub description: String,
43 #[serde(default = "default_system_prompt")]
45 pub system_prompt: String,
46 #[serde(default = "default_max_iterations")]
48 pub max_iterations: usize,
49 #[serde(default)]
51 pub tasks: Vec<EvalTask>,
52}
53
/// Serde default for [`EvalConfig::name`].
fn default_suite_name() -> String {
    String::from("unnamed")
}

/// Serde default for [`EvalConfig::system_prompt`].
fn default_system_prompt() -> String {
    String::from("You are a helpful assistant. Be concise and accurate.")
}

/// Serde default for [`EvalConfig::max_iterations`].
fn default_max_iterations() -> usize {
    5
}

66#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct EvalTask {
69 pub name: String,
71 #[serde(default)]
73 pub description: String,
74 pub prompt: String,
76 #[serde(default)]
78 pub golden: String,
79 #[serde(default)]
81 pub assertions: Vec<Assertion>,
82 #[serde(default = "default_weight")]
84 pub weight: f64,
85 #[serde(default)]
87 pub required: bool,
88}
89
/// Serde default for [`EvalTask::weight`].
fn default_weight() -> f64 {
    1.0
}

94#[derive(Debug, Clone, Serialize, Deserialize)]
96#[serde(tag = "type", content = "value")]
97pub enum Assertion {
98 #[serde(rename = "contains")]
100 Contains(String),
101 #[serde(rename = "not_contains")]
103 NotContains(String),
104 #[serde(rename = "exact")]
106 Exact(String),
107 #[serde(rename = "regex")]
109 Regex(String),
110 #[serde(rename = "non_empty")]
112 NonEmpty,
113 #[serde(rename = "min_length")]
115 MinLength(usize),
116 #[serde(rename = "max_length")]
118 MaxLength(usize),
119 #[serde(rename = "tool_called")]
121 ToolCalled(String),
122 #[serde(rename = "tool_not_called")]
124 ToolNotCalled(String),
125}
126
127#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct RunTrace {
132 pub task_name: String,
134 pub started_at: String,
136 pub ended_at: String,
138 pub duration_ms: u64,
140 pub iterations: usize,
142 pub steps: Vec<TraceStep>,
144 pub llm_calls: Vec<LlmCallTrace>,
146 pub tool_calls: Vec<ToolCallTrace>,
148 pub final_response: String,
150}
151
152#[derive(Debug, Clone, Serialize, Deserialize)]
154pub struct TraceStep {
155 pub number: usize,
157 pub step_type: StepType,
159 pub content: String,
161 pub duration_ms: u64,
163}
164
165#[derive(Debug, Clone, Serialize, Deserialize)]
167pub enum StepType {
168 Thought,
170 ToolCall,
172 Observation,
174 Final,
176 Error,
178}
179
180#[derive(Debug, Clone, Serialize, Deserialize)]
182pub struct LlmCallTrace {
183 pub iteration: usize,
185 pub provider: String,
187 pub model: String,
189 pub prompt_tokens: Option<u32>,
191 pub completion_tokens: Option<u32>,
193 pub duration_ms: u64,
195 pub response_preview: String,
197}
198
199#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct ToolCallTrace {
202 pub iteration: usize,
204 pub tool_name: String,
206 pub arguments: serde_json::Value,
208 pub success: bool,
210 pub output_preview: String,
212 pub duration_ms: u64,
214}
215
216#[derive(Debug, Clone, Serialize, Deserialize)]
220pub struct EvalResult {
221 pub task_name: String,
223 pub passed: bool,
225 pub score: f64,
227 pub assertions_passed: usize,
229 pub assertions_failed: usize,
231 pub assertion_results: Vec<AssertionResult>,
233 pub trace: RunTrace,
235 pub error: Option<String>,
237}
238
239#[derive(Debug, Clone, Serialize, Deserialize)]
241pub struct AssertionResult {
242 pub assertion: String,
244 pub passed: bool,
246 pub details: String,
248}
249
250#[derive(Debug, Clone, Serialize, Deserialize)]
252pub struct EvalReport {
253 pub suite_name: String,
255 pub ran_at: String,
257 pub duration_ms: u64,
259 pub overall_score: f64,
261 pub total_tasks: usize,
263 pub passed_tasks: usize,
265 pub failed_tasks: usize,
267 pub results: Vec<EvalResult>,
269}
270
271pub struct EvalRunner {
275 llm: Arc<dyn LLMProviderTrait>,
277 config: EvalConfig,
279}
280
281impl EvalRunner {
282 pub fn new(llm: Arc<dyn LLMProviderTrait>, config: EvalConfig) -> Self {
284 Self { llm, config }
285 }
286
287 #[instrument(skip(self), fields(suite = %self.config.name, task_count = self.config.tasks.len()))]
289 pub async fn run_suite(&self) -> EvalReport {
290 let started_at = chrono::Utc::now().to_rfc3339();
291 let suite_start = std::time::Instant::now();
292 let mut results = Vec::with_capacity(self.config.tasks.len());
293
294 info!(
295 suite = %self.config.name,
296 task_count = self.config.tasks.len(),
297 "Starting eval suite"
298 );
299
300 for task in &self.config.tasks {
301 let result = self.run_task(task).await;
302 let passed = result.passed;
303 let name = &result.task_name;
304
305 if passed {
306 info!(task = %name, score = result.score, "Eval task passed");
307 } else {
308 warn!(
309 task = %name,
310 score = result.score,
311 passed = result.assertions_passed,
312 failed = result.assertions_failed,
313 "Eval task failed"
314 );
315 }
316
317 results.push(result);
318 }
319
320 let duration_ms = suite_start.elapsed().as_millis() as u64;
321 let total_tasks = results.len();
322 let passed_tasks = results.iter().filter(|r| r.passed).count();
323 let failed_tasks = total_tasks - passed_tasks;
324 let overall_score = if total_tasks > 0 {
325 results
326 .iter()
327 .map(|r| r.score * r.trace.iterations as f64)
328 .sum::<f64>()
329 / results
330 .iter()
331 .map(|r| r.trace.iterations as f64)
332 .sum::<f64>()
333 } else {
334 0.0
335 };
336
337 info!(
338 suite = %self.config.name,
339 passed = passed_tasks,
340 failed = failed_tasks,
341 overall_score = overall_score,
342 duration_ms = duration_ms,
343 "Eval suite completed"
344 );
345
346 EvalReport {
347 suite_name: self.config.name.clone(),
348 ran_at: started_at,
349 duration_ms,
350 overall_score,
351 total_tasks,
352 passed_tasks,
353 failed_tasks,
354 results,
355 }
356 }
357
358 #[instrument(skip(self), fields(task = %task.name))]
364 async fn run_task(&self, task: &EvalTask) -> EvalResult {
365 let task_start = std::time::Instant::now();
366 let started_at = chrono::Utc::now().to_rfc3339();
367
368 let agent_config = AgentLoopConfig {
370 max_iterations: self.config.max_iterations,
371 enable_tools: true,
372 require_approval: false,
373 prompt_injection_protection: true,
374 token_lifetime_secs: 0,
375 no_final_required: false,
376 fallback_chain: None,
377 token_budget: None,
378 ravenfabric: None,
379 checkpoint_dir: None,
380 session_id: None,
381 metrics_callback: None,
382 load_manager: None,
383 };
384
385 let result = run_agent_loop(
387 self.llm.clone(),
388 &task.prompt,
389 &self.config.system_prompt,
390 agent_config,
391 )
392 .await;
393
394 let duration_ms = task_start.elapsed().as_millis() as u64;
395
396 match result {
397 Ok(final_response) => {
398 let trace = RunTrace {
399 task_name: task.name.clone(),
400 started_at,
401 ended_at: chrono::Utc::now().to_rfc3339(),
402 duration_ms,
403 iterations: self.config.max_iterations, steps: vec![TraceStep {
405 number: 0,
406 step_type: StepType::Final,
407 content: final_response.clone(),
408 duration_ms,
409 }],
410 llm_calls: Vec::new(), tool_calls: Vec::new(), final_response: final_response.clone(),
413 };
414
415 let (assertion_results, assertions_passed, assertions_failed) =
417 check_assertions(&final_response, &task.assertions, Some(&trace));
418
419 let score = if task.assertions.is_empty() {
421 if final_response.is_empty() || final_response.len() < 10 {
422 0.0
423 } else {
424 1.0
425 }
426 } else if task.assertions.len() == assertions_passed + assertions_failed {
427 assertions_passed as f64 / task.assertions.len() as f64
428 } else {
429 0.0
430 };
431
432 let passed = assertions_failed == 0 && !final_response.is_empty();
433
434 EvalResult {
435 task_name: task.name.clone(),
436 passed,
437 score,
438 assertions_passed,
439 assertions_failed,
440 assertion_results,
441 trace,
442 error: None,
443 }
444 }
445 Err(e) => {
446 let trace = RunTrace {
447 task_name: task.name.clone(),
448 started_at,
449 ended_at: chrono::Utc::now().to_rfc3339(),
450 duration_ms,
451 iterations: 0,
452 steps: vec![TraceStep {
453 number: 0,
454 step_type: StepType::Error,
455 content: format!("Agent loop failed: {}", e),
456 duration_ms,
457 }],
458 llm_calls: Vec::new(),
459 tool_calls: Vec::new(),
460 final_response: String::new(),
461 };
462
463 EvalResult {
464 task_name: task.name.clone(),
465 passed: false,
466 score: 0.0,
467 assertions_passed: 0,
468 assertions_failed: 1,
469 assertion_results: vec![AssertionResult {
470 assertion: "agent_loop".to_string(),
471 passed: false,
472 details: format!("Agent loop failed: {}", e),
473 }],
474 trace,
475 error: Some(e.to_string()),
476 }
477 }
478 }
479 }
480}
481
482fn check_assertions(
486 response: &str,
487 assertions: &[Assertion],
488 run_trace: Option<&RunTrace>,
489) -> (Vec<AssertionResult>, usize, usize) {
490 let mut results = Vec::with_capacity(assertions.len());
491 let mut passed = 0;
492 let mut failed = 0;
493
494 for assertion in assertions {
495 let result = check_single_assertion(response, assertion, run_trace);
496 if result.passed {
497 passed += 1;
498 } else {
499 failed += 1;
500 }
501 results.push(result);
502 }
503
504 (results, passed, failed)
505}
506
507fn check_single_assertion(
509 response: &str,
510 assertion: &Assertion,
511 run_trace: Option<&RunTrace>,
512) -> AssertionResult {
513 match assertion {
514 Assertion::Contains(pattern) => {
515 let passed = response.contains(pattern);
516 AssertionResult {
517 assertion: format!("contains: {}", pattern),
518 passed,
519 details: if passed {
520 format!("Response contains '{}'", pattern)
521 } else {
522 format!("Response does not contain '{}'", pattern)
523 },
524 }
525 }
526 Assertion::NotContains(pattern) => {
527 let passed = !response.contains(pattern);
528 AssertionResult {
529 assertion: format!("not_contains: {}", pattern),
530 passed,
531 details: if passed {
532 format!("Response does not contain '{}'", pattern)
533 } else {
534 format!("Response contains '{}'", pattern)
535 },
536 }
537 }
538 Assertion::Exact(expected) => {
539 let trimmed_response = response.trim();
540 let passed = trimmed_response == expected.as_str();
541 AssertionResult {
542 assertion: format!("exact: {}", expected),
543 passed,
544 details: if passed {
545 "Response matches exactly".to_string()
546 } else {
547 format!(
548 "Expected '{}', got '{}'",
549 expected,
550 trimmed_response.chars().take(100).collect::<String>()
551 )
552 },
553 }
554 }
555 Assertion::Regex(pattern) => {
556 let re = regex_lite::Regex::new(pattern);
557 match re {
558 Ok(re) => {
559 let passed = re.is_match(response);
560 AssertionResult {
561 assertion: format!("regex: {}", pattern),
562 passed,
563 details: if passed {
564 format!("Response matches pattern '{}'", pattern)
565 } else {
566 format!("Response does not match pattern '{}'", pattern)
567 },
568 }
569 }
570 Err(e) => AssertionResult {
571 assertion: format!("regex: {}", pattern),
572 passed: false,
573 details: format!("Invalid regex pattern: {}", e),
574 },
575 }
576 }
577 Assertion::NonEmpty => {
578 let passed = !response.is_empty();
579 AssertionResult {
580 assertion: "non_empty".to_string(),
581 passed,
582 details: if passed {
583 format!("Response is non-empty ({} chars)", response.len())
584 } else {
585 "Response is empty".to_string()
586 },
587 }
588 }
589 Assertion::MinLength(min) => {
590 let passed = response.len() >= *min;
591 AssertionResult {
592 assertion: format!("min_length: {}", min),
593 passed,
594 details: if passed {
595 format!("Response length {} >= {}", response.len(), min)
596 } else {
597 format!("Response length {} < {}", response.len(), min)
598 },
599 }
600 }
601 Assertion::MaxLength(max) => {
602 let passed = response.len() <= *max;
603 AssertionResult {
604 assertion: format!("max_length: {}", max),
605 passed,
606 details: if passed {
607 format!("Response length {} <= {}", response.len(), max)
608 } else {
609 format!("Response length {} > {}", response.len(), max)
610 },
611 }
612 }
613 Assertion::ToolCalled(tool_name) => {
614 let tool_calls = run_trace
615 .map(|t| &t.tool_calls)
616 .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
617 let passed = tool_calls.is_some();
618 AssertionResult {
619 assertion: format!("tool_called: {}", tool_name),
620 passed,
621 details: if passed {
622 format!("Tool '{}' was called", tool_name)
623 } else {
624 let all_tools: Vec<&str> = run_trace
625 .map(|t| {
626 t.tool_calls
627 .iter()
628 .map(|tc| tc.tool_name.as_str())
629 .collect()
630 })
631 .unwrap_or_default();
632 if all_tools.is_empty() {
633 format!("Tool '{}' was not called (no tools were called)", tool_name)
634 } else {
635 format!(
636 "Tool '{}' was not called (called: {})",
637 tool_name,
638 all_tools.join(", ")
639 )
640 }
641 },
642 }
643 }
644 Assertion::ToolNotCalled(tool_name) => {
645 let tool_calls = run_trace
646 .map(|t| &t.tool_calls)
647 .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
648 let passed = tool_calls.is_none();
649 AssertionResult {
650 assertion: format!("tool_not_called: {}", tool_name),
651 passed,
652 details: if passed {
653 format!("Tool '{}' was not called", tool_name)
654 } else {
655 format!("Tool '{}' was called but should not have been", tool_name)
656 },
657 }
658 }
659 }
660}
661
662impl EvalReport {
665 pub fn format_text(&self) -> String {
667 let mut output = String::new();
668
669 output.push_str(&format!("\n🐦⬛ Eval Report: {}\n", self.suite_name));
670 output.push_str(&format!("{:-^60}\n", ""));
671 output.push_str(&format!(
672 "Ran at: {}\n",
673 &self.ran_at[..19].replace('T', " ")
674 ));
675 output.push_str(&format!("Duration: {} ms\n", self.duration_ms));
676 output.push_str(&format!(
677 "Overall score: {:.1}%\n",
678 self.overall_score * 100.0
679 ));
680 output.push_str(&format!(
681 "Tasks: {}/{} passed\n",
682 self.passed_tasks, self.total_tasks
683 ));
684 output.push_str(&format!("{:-^60}\n", ""));
685
686 for result in &self.results {
687 output.push_str(&format!(
688 "\n {} {} — {:.1}%\n",
689 if result.passed { "✅" } else { "❌" },
690 result.task_name,
691 result.score * 100.0
692 ));
693
694 if let Some(ref error) = result.error {
695 output.push_str(&format!(" Error: {}\n", error));
696 }
697
698 if !result.assertion_results.is_empty() {
699 for ar in &result.assertion_results {
700 output.push_str(&format!(
701 " {} {}\n",
702 if ar.passed { " ✅" } else { " ❌" },
703 ar.details
704 ));
705 }
706 }
707
708 let trace = &result.trace;
710 output.push_str(&format!(
711 " Iterations: {} · LLM calls: {} · Tool calls: {} · Duration: {} ms\n",
712 trace.iterations,
713 trace.llm_calls.len(),
714 trace.tool_calls.len(),
715 trace.duration_ms
716 ));
717
718 let preview: String = trace.final_response.chars().take(200).collect();
720 if !preview.is_empty() {
721 output.push_str(&format!(" Response: {}\n", preview));
722 }
723 }
724
725 output
726 }
727
728 pub fn format_json(&self) -> serde_json::Value {
730 serde_json::to_value(self).unwrap_or(serde_json::json!({"error": "serialization failed"}))
731 }
732}
733
734impl EvalConfig {
737 pub fn from_file(path: &str) -> Result<Self> {
739 let content = std::fs::read_to_string(path).map_err(|e| {
740 RavenClawsError::CommandExecution(format!("Failed to read eval config: {}", e))
741 })?;
742
743 if content.trim().is_empty() {
744 return Err(RavenClawsError::CommandExecution(format!(
745 "Eval config file '{}' is empty — no tasks to run",
746 path
747 )));
748 }
749
750 let config: EvalConfig = toml::from_str(&content).map_err(|e| {
751 RavenClawsError::CommandExecution(format!("Failed to parse eval config: {}", e))
752 })?;
753
754 if config.tasks.is_empty() {
755 return Err(RavenClawsError::CommandExecution(format!(
756 "Eval config file '{}' has no tasks defined",
757 path
758 )));
759 }
760
761 Ok(config)
762 }
763}
764
/// Unit tests for assertion checking, config parsing, and report formatting.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_assertion_contains_pass() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::Contains("world".to_string()),
            None,
        );
        assert!(result.passed);
        assert!(result.details.contains("contains"));
    }

    #[test]
    fn test_assertion_contains_fail() {
        let result =
            check_single_assertion("hello world", &Assertion::Contains("foo".to_string()), None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_not_contains_pass() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::NotContains("foo".to_string()),
            None,
        );
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_not_contains_fail() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::NotContains("world".to_string()),
            None,
        );
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_exact_pass() {
        let result = check_single_assertion("hello", &Assertion::Exact("hello".to_string()), None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_exact_fail() {
        let result = check_single_assertion("world", &Assertion::Exact("hello".to_string()), None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_regex_pass() {
        let result =
            check_single_assertion("hello 123", &Assertion::Regex(r"\d+".to_string()), None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_regex_fail() {
        let result = check_single_assertion("hello", &Assertion::Regex(r"\d+".to_string()), None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_non_empty_pass() {
        let result = check_single_assertion("hello", &Assertion::NonEmpty, None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_non_empty_fail() {
        let result = check_single_assertion("", &Assertion::NonEmpty, None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_min_length_pass() {
        let result = check_single_assertion("hello", &Assertion::MinLength(3), None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_min_length_fail() {
        let result = check_single_assertion("hi", &Assertion::MinLength(5), None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_max_length_pass() {
        let result = check_single_assertion("hi", &Assertion::MaxLength(5), None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_max_length_fail() {
        let result = check_single_assertion("hello world", &Assertion::MaxLength(5), None);
        assert!(!result.passed);
    }

    #[test]
    fn test_check_assertions_empty() {
        let (results, passed, failed) = check_assertions("hello", &[], None);
        assert!(results.is_empty());
        assert_eq!(passed, 0);
        assert_eq!(failed, 0);
    }

    #[test]
    fn test_check_assertions_multiple() {
        let assertions = vec![
            Assertion::Contains("hello".to_string()),
            Assertion::Contains("world".to_string()),
            Assertion::NonEmpty,
        ];
        let (results, passed, failed) = check_assertions("hello world", &assertions, None);
        assert_eq!(passed, 3);
        assert_eq!(failed, 0);
        assert_eq!(results.len(), 3);
    }

    #[test]
    fn test_check_assertions_tool_called() {
        let trace = RunTrace {
            task_name: "test".to_string(),
            started_at: "2026-01-01T00:00:00Z".to_string(),
            ended_at: "2026-01-01T00:00:01Z".to_string(),
            duration_ms: 1000,
            iterations: 1,
            steps: vec![],
            llm_calls: vec![],
            tool_calls: vec![
                ToolCallTrace {
                    iteration: 0,
                    tool_name: "web_search".to_string(),
                    arguments: serde_json::json!({"query": "test"}),
                    success: true,
                    output_preview: "results".to_string(),
                    duration_ms: 100,
                },
                ToolCallTrace {
                    iteration: 0,
                    tool_name: "read_file".to_string(),
                    arguments: serde_json::json!({"path": "/tmp/test"}),
                    success: true,
                    output_preview: "content".to_string(),
                    duration_ms: 50,
                },
            ],
            final_response: "response".to_string(),
        };

        // A tool present in the trace satisfies `tool_called`.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("web_search".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 1);
        assert_eq!(failed, 0);
        assert!(results[0].passed);

        // A tool absent from the trace fails `tool_called`.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("nonexistent".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);

        // A tool absent from the trace satisfies `tool_not_called`.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolNotCalled("nonexistent".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 1);
        assert_eq!(failed, 0);
        assert!(results[0].passed);

        // A tool present in the trace fails `tool_not_called`.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolNotCalled("web_search".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);

        // Without a trace, no tool counts as called.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("web_search".to_string())],
            None,
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);
    }

    #[test]
    fn test_eval_config_from_toml() {
        let toml_str = r#"
name = "test-suite"
description = "A test suite"
system_prompt = "Be concise"
max_iterations = 3

[[tasks]]
name = "test-1"
prompt = "What is 2+2?"
golden = "4"
assertions = [{ type = "contains", value = "4" }]
weight = 1.0
required = true
"#;

        let config: EvalConfig = toml::from_str(toml_str).unwrap();
        assert_eq!(config.name, "test-suite");
        assert_eq!(config.tasks.len(), 1);
        assert_eq!(config.tasks[0].name, "test-1");
        assert_eq!(config.tasks[0].prompt, "What is 2+2?");
        assert_eq!(config.tasks[0].golden, "4");
        assert_eq!(config.tasks[0].assertions.len(), 1);
    }

    #[test]
    fn test_eval_config_defaults() {
        let toml_str = r#"
[[tasks]]
name = "simple"
prompt = "Say hello"
"#;

        let config: EvalConfig = toml::from_str(toml_str).unwrap();
        assert_eq!(config.name, "unnamed");
        assert_eq!(config.system_prompt, default_system_prompt());
        assert_eq!(config.max_iterations, 5);
        assert_eq!(config.tasks[0].weight, 1.0);
        assert!(!config.tasks[0].required);
    }

    #[test]
    fn test_report_format_text() {
        let report = EvalReport {
            suite_name: "test".to_string(),
            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
            duration_ms: 100,
            overall_score: 0.75,
            total_tasks: 2,
            passed_tasks: 1,
            failed_tasks: 1,
            results: vec![
                EvalResult {
                    task_name: "pass-task".to_string(),
                    passed: true,
                    score: 1.0,
                    assertions_passed: 2,
                    assertions_failed: 0,
                    assertion_results: vec![AssertionResult {
                        assertion: "contains: hello".to_string(),
                        passed: true,
                        details: "Response contains 'hello'".to_string(),
                    }],
                    trace: RunTrace {
                        task_name: "pass-task".to_string(),
                        started_at: "2026-06-22T12:00:00+00:00".to_string(),
                        ended_at: "2026-06-22T12:00:01+00:00".to_string(),
                        duration_ms: 50,
                        iterations: 1,
                        steps: vec![],
                        llm_calls: vec![],
                        tool_calls: vec![],
                        final_response: "hello world".to_string(),
                    },
                    error: None,
                },
                EvalResult {
                    task_name: "fail-task".to_string(),
                    passed: false,
                    score: 0.0,
                    assertions_passed: 0,
                    assertions_failed: 1,
                    assertion_results: vec![AssertionResult {
                        assertion: "contains: foo".to_string(),
                        passed: false,
                        details: "Response does not contain 'foo'".to_string(),
                    }],
                    trace: RunTrace {
                        task_name: "fail-task".to_string(),
                        started_at: "2026-06-22T12:00:01+00:00".to_string(),
                        ended_at: "2026-06-22T12:00:02+00:00".to_string(),
                        duration_ms: 50,
                        iterations: 1,
                        steps: vec![],
                        llm_calls: vec![],
                        tool_calls: vec![],
                        final_response: "bar".to_string(),
                    },
                    error: None,
                },
            ],
        };

        let text = report.format_text();
        assert!(text.contains("Eval Report: test"));
        assert!(text.contains("75.0%"));
        assert!(text.contains("1/2 passed"));
        assert!(text.contains("✅ pass-task"));
        assert!(text.contains("❌ fail-task"));
    }

    #[test]
    fn test_report_format_json() {
        let report = EvalReport {
            suite_name: "test".to_string(),
            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
            duration_ms: 100,
            overall_score: 1.0,
            total_tasks: 1,
            passed_tasks: 1,
            failed_tasks: 0,
            results: vec![],
        };

        let json = report.format_json();
        assert_eq!(json["suite_name"], "test");
        assert_eq!(json["overall_score"], 1.0);
    }

    #[test]
    fn test_eval_config_from_file_not_found() {
        let result = EvalConfig::from_file("/tmp/nonexistent-eval-config.toml");
        assert!(result.is_err());
    }

    #[test]
    fn test_assertion_regex_invalid_pattern() {
        let result =
            check_single_assertion("hello", &Assertion::Regex(r"[invalid".to_string()), None);
        assert!(!result.passed);
        assert!(result.details.contains("Invalid regex"));
    }

    #[test]
    fn test_trace_step_serialization() {
        let step = TraceStep {
            number: 0,
            step_type: StepType::Thought,
            content: "test".to_string(),
            duration_ms: 100,
        };
        let json = serde_json::to_string(&step).unwrap();
        assert!(json.contains("Thought"));
    }

    #[test]
    fn test_tool_call_trace_serialization() {
        let trace = ToolCallTrace {
            iteration: 0,
            tool_name: "shell_exec".to_string(),
            arguments: serde_json::json!({"command": "echo hello"}),
            success: true,
            output_preview: "hello".to_string(),
            duration_ms: 50,
        };
        let json = serde_json::to_string(&trace).unwrap();
        assert!(json.contains("shell_exec"));
        assert!(json.contains("echo hello"));
    }
}