1use crate::agent::{run_agent_loop, AgentLoopConfig};
26use crate::error::{RavenClawsError, Result};
27use crate::llm::LLMProviderTrait;
28use serde::{Deserialize, Serialize};
29use std::sync::Arc;
30use tracing::{info, instrument, warn};
31
/// Definition of an eval suite: suite-wide settings plus the tasks to run.
///
/// Deserializable via serde; every field has a default, so a document
/// may omit any of them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalConfig {
    /// Suite name; falls back to `"unnamed"` when omitted.
    #[serde(default = "default_suite_name")]
    pub name: String,
    /// Free-form description; empty when omitted.
    #[serde(default)]
    pub description: String,
    /// System prompt handed to the agent loop for every task.
    #[serde(default = "default_system_prompt")]
    pub system_prompt: String,
    /// Iteration cap forwarded to the agent loop configuration; defaults to 5.
    #[serde(default = "default_max_iterations")]
    pub max_iterations: usize,
    /// Tasks of the suite; empty when omitted.
    #[serde(default)]
    pub tasks: Vec<EvalTask>,
}
53
/// Serde default for [`EvalConfig::name`].
fn default_suite_name() -> String {
    String::from("unnamed")
}
57
/// Serde default for [`EvalConfig::system_prompt`].
fn default_system_prompt() -> String {
    String::from("You are a helpful assistant. Be concise and accurate.")
}
61
/// Serde default for [`EvalConfig::max_iterations`].
fn default_max_iterations() -> usize {
    const DEFAULT_MAX_ITERATIONS: usize = 5;
    DEFAULT_MAX_ITERATIONS
}
65
/// A single eval task: a prompt plus the checks applied to the response.
///
/// Only `name` and `prompt` are mandatory when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalTask {
    /// Task name; copied into the task's result and trace.
    pub name: String,
    /// Free-form description; empty when omitted.
    #[serde(default)]
    pub description: String,
    /// Prompt passed to the agent loop for this task.
    pub prompt: String,
    /// NOTE(review): presumably a reference ("golden") answer — confirm how
    /// it is meant to be consumed. Empty when omitted.
    #[serde(default)]
    pub golden: String,
    /// Assertions checked against the agent's final response.
    #[serde(default)]
    pub assertions: Vec<Assertion>,
    /// Defaults to `1.0`.
    /// NOTE(review): presumably the task's relative weight when scores are
    /// aggregated — confirm against the scoring code.
    #[serde(default = "default_weight")]
    pub weight: f64,
    /// Defaults to `false`.
    /// NOTE(review): presumably marks the task as mandatory for the suite —
    /// confirm against the callers.
    #[serde(default)]
    pub required: bool,
}
89
/// Serde default for [`EvalTask::weight`].
fn default_weight() -> f64 {
    const DEFAULT_WEIGHT: f64 = 1.0;
    DEFAULT_WEIGHT
}
93
94#[derive(Debug, Clone, Serialize, Deserialize)]
96#[serde(tag = "type", content = "value")]
97pub enum Assertion {
98 #[serde(rename = "contains")]
100 Contains(String),
101 #[serde(rename = "not_contains")]
103 NotContains(String),
104 #[serde(rename = "exact")]
106 Exact(String),
107 #[serde(rename = "regex")]
109 Regex(String),
110 #[serde(rename = "non_empty")]
112 NonEmpty,
113 #[serde(rename = "min_length")]
115 MinLength(usize),
116 #[serde(rename = "max_length")]
118 MaxLength(usize),
119 #[serde(rename = "tool_called")]
121 ToolCalled(String),
122 #[serde(rename = "tool_not_called")]
124 ToolNotCalled(String),
125}
126
/// Record of one task execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunTrace {
    /// Name of the task this trace belongs to.
    pub task_name: String,
    /// Start timestamp as a string (the runner writes RFC 3339, UTC).
    pub started_at: String,
    /// End timestamp as a string (the runner writes RFC 3339, UTC).
    pub ended_at: String,
    /// Wall-clock duration of the run in milliseconds.
    pub duration_ms: u64,
    /// Iteration count recorded for the run.
    pub iterations: usize,
    /// Recorded steps of the run.
    pub steps: Vec<TraceStep>,
    /// Recorded LLM calls.
    pub llm_calls: Vec<LlmCallTrace>,
    /// Recorded tool calls; consulted by the tool-call assertions.
    pub tool_calls: Vec<ToolCallTrace>,
    /// Final response text; empty when the run failed.
    pub final_response: String,
}
151
/// One step recorded in a [`RunTrace`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceStep {
    /// Step number (the runner starts at 0).
    pub number: usize,
    /// What kind of step this is.
    pub step_type: StepType,
    /// Text content of the step.
    pub content: String,
    /// Duration attributed to the step, in milliseconds.
    pub duration_ms: u64,
}
164
/// Kind of a [`TraceStep`].
///
/// The runner in this file only emits `Final` (successful run) and `Error`
/// (failed agent loop).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum StepType {
    /// NOTE(review): presumably intermediate model reasoning — confirm.
    Thought,
    /// NOTE(review): presumably a tool invocation — confirm.
    ToolCall,
    /// NOTE(review): presumably a tool result fed back to the model — confirm.
    Observation,
    /// The final response of a successful run.
    Final,
    /// The agent loop returned an error.
    Error,
}
179
/// Record of a single LLM call within a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlmCallTrace {
    /// Iteration in which the call was made.
    pub iteration: usize,
    /// Provider name.
    pub provider: String,
    /// Model name.
    pub model: String,
    /// Prompt token count, when available.
    pub prompt_tokens: Option<u32>,
    /// Completion token count, when available.
    pub completion_tokens: Option<u32>,
    /// Duration of the call in milliseconds.
    pub duration_ms: u64,
    /// Preview of the response text.
    pub response_preview: String,
}
198
/// Record of a single tool call within a run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolCallTrace {
    /// Iteration in which the call was made.
    pub iteration: usize,
    /// Tool name; this is what the tool-call assertions compare against.
    pub tool_name: String,
    /// Arguments of the call as arbitrary JSON.
    pub arguments: serde_json::Value,
    /// Whether the call succeeded.
    pub success: bool,
    /// Preview of the tool output.
    pub output_preview: String,
    /// Duration of the call in milliseconds.
    pub duration_ms: u64,
}
215
/// Outcome of running one task.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
    /// Name of the task.
    pub task_name: String,
    /// Whether the task counts as passed.
    pub passed: bool,
    /// Task score; the runner produces values between 0.0 and 1.0.
    pub score: f64,
    /// Number of assertions that passed.
    pub assertions_passed: usize,
    /// Number of assertions that failed.
    pub assertions_failed: usize,
    /// Per-assertion outcomes.
    pub assertion_results: Vec<AssertionResult>,
    /// Trace of the run.
    pub trace: RunTrace,
    /// Error text when the agent loop failed, otherwise `None`.
    pub error: Option<String>,
}
238
/// Outcome of checking a single assertion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AssertionResult {
    /// Label of the checked assertion, e.g. `contains: foo`.
    pub assertion: String,
    /// Whether the assertion held.
    pub passed: bool,
    /// Human-readable explanation of the outcome.
    pub details: String,
}
249
/// Aggregated result of running an eval suite.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
    /// Name of the suite.
    pub suite_name: String,
    /// Timestamp string for the run (the runner writes the suite start time
    /// as RFC 3339, UTC).
    pub ran_at: String,
    /// Wall-clock duration of the whole suite in milliseconds.
    pub duration_ms: u64,
    /// Aggregate score; rendered as a percentage (`* 100`) in the text report.
    pub overall_score: f64,
    /// Number of tasks that were run.
    pub total_tasks: usize,
    /// Number of tasks that passed.
    pub passed_tasks: usize,
    /// Number of tasks that did not pass.
    pub failed_tasks: usize,
    /// Per-task results.
    pub results: Vec<EvalResult>,
}
270
/// Runs the tasks of an [`EvalConfig`] against an LLM provider.
pub struct EvalRunner {
    /// Provider handed to the agent loop for every task.
    llm: Arc<dyn LLMProviderTrait>,
    /// Suite definition: settings and tasks.
    config: EvalConfig,
}
280
281impl EvalRunner {
282 pub fn new(llm: Arc<dyn LLMProviderTrait>, config: EvalConfig) -> Self {
284 Self { llm, config }
285 }
286
287 #[instrument(skip(self), fields(suite = %self.config.name, task_count = self.config.tasks.len()))]
289 pub async fn run_suite(&self) -> EvalReport {
290 let started_at = chrono::Utc::now().to_rfc3339();
291 let suite_start = std::time::Instant::now();
292 let mut results = Vec::with_capacity(self.config.tasks.len());
293
294 info!(
295 suite = %self.config.name,
296 task_count = self.config.tasks.len(),
297 "Starting eval suite"
298 );
299
300 for task in &self.config.tasks {
301 let result = self.run_task(task).await;
302 let passed = result.passed;
303 let name = &result.task_name;
304
305 if passed {
306 info!(task = %name, score = result.score, "Eval task passed");
307 } else {
308 warn!(
309 task = %name,
310 score = result.score,
311 passed = result.assertions_passed,
312 failed = result.assertions_failed,
313 "Eval task failed"
314 );
315 }
316
317 results.push(result);
318 }
319
320 let duration_ms = suite_start.elapsed().as_millis() as u64;
321 let total_tasks = results.len();
322 let passed_tasks = results.iter().filter(|r| r.passed).count();
323 let failed_tasks = total_tasks - passed_tasks;
324 let overall_score = if total_tasks > 0 {
325 results
326 .iter()
327 .map(|r| r.score * r.trace.iterations as f64)
328 .sum::<f64>()
329 / results
330 .iter()
331 .map(|r| r.trace.iterations as f64)
332 .sum::<f64>()
333 } else {
334 0.0
335 };
336
337 info!(
338 suite = %self.config.name,
339 passed = passed_tasks,
340 failed = failed_tasks,
341 overall_score = overall_score,
342 duration_ms = duration_ms,
343 "Eval suite completed"
344 );
345
346 EvalReport {
347 suite_name: self.config.name.clone(),
348 ran_at: started_at,
349 duration_ms,
350 overall_score,
351 total_tasks,
352 passed_tasks,
353 failed_tasks,
354 results,
355 }
356 }
357
358 #[instrument(skip(self), fields(task = %task.name))]
364 async fn run_task(&self, task: &EvalTask) -> EvalResult {
365 let task_start = std::time::Instant::now();
366 let started_at = chrono::Utc::now().to_rfc3339();
367
368 let agent_config = AgentLoopConfig {
370 max_iterations: self.config.max_iterations,
371 enable_tools: true,
372 require_approval: false,
373 prompt_injection_protection: true,
374 token_lifetime_secs: 0,
375 no_final_required: false,
376 fallback_chain: None,
377 token_budget: None,
378 ravenfabric: None,
379 checkpoint_dir: None,
380 session_id: None,
381 metrics_callback: None,
382 load_manager: None,
383 retry_config: None,
384 };
385
386 let result = run_agent_loop(
388 self.llm.clone(),
389 &task.prompt,
390 &self.config.system_prompt,
391 agent_config,
392 )
393 .await;
394
395 let duration_ms = task_start.elapsed().as_millis() as u64;
396
397 match result {
398 Ok(final_response) => {
399 let trace = RunTrace {
400 task_name: task.name.clone(),
401 started_at,
402 ended_at: chrono::Utc::now().to_rfc3339(),
403 duration_ms,
404 iterations: self.config.max_iterations, steps: vec![TraceStep {
406 number: 0,
407 step_type: StepType::Final,
408 content: final_response.clone(),
409 duration_ms,
410 }],
411 llm_calls: Vec::new(), tool_calls: Vec::new(), final_response: final_response.clone(),
414 };
415
416 let (assertion_results, assertions_passed, assertions_failed) =
418 check_assertions(&final_response, &task.assertions, Some(&trace));
419
420 let score = if task.assertions.is_empty() {
422 if final_response.is_empty() || final_response.len() < 10 {
423 0.0
424 } else {
425 1.0
426 }
427 } else if task.assertions.len() == assertions_passed + assertions_failed {
428 assertions_passed as f64 / task.assertions.len() as f64
429 } else {
430 0.0
431 };
432
433 let passed = assertions_failed == 0 && !final_response.is_empty();
434
435 EvalResult {
436 task_name: task.name.clone(),
437 passed,
438 score,
439 assertions_passed,
440 assertions_failed,
441 assertion_results,
442 trace,
443 error: None,
444 }
445 }
446 Err(e) => {
447 let trace = RunTrace {
448 task_name: task.name.clone(),
449 started_at,
450 ended_at: chrono::Utc::now().to_rfc3339(),
451 duration_ms,
452 iterations: 0,
453 steps: vec![TraceStep {
454 number: 0,
455 step_type: StepType::Error,
456 content: format!("Agent loop failed: {}", e),
457 duration_ms,
458 }],
459 llm_calls: Vec::new(),
460 tool_calls: Vec::new(),
461 final_response: String::new(),
462 };
463
464 EvalResult {
465 task_name: task.name.clone(),
466 passed: false,
467 score: 0.0,
468 assertions_passed: 0,
469 assertions_failed: 1,
470 assertion_results: vec![AssertionResult {
471 assertion: "agent_loop".to_string(),
472 passed: false,
473 details: format!("Agent loop failed: {}", e),
474 }],
475 trace,
476 error: Some(e.to_string()),
477 }
478 }
479 }
480 }
481}
482
483fn check_assertions(
487 response: &str,
488 assertions: &[Assertion],
489 run_trace: Option<&RunTrace>,
490) -> (Vec<AssertionResult>, usize, usize) {
491 let mut results = Vec::with_capacity(assertions.len());
492 let mut passed = 0;
493 let mut failed = 0;
494
495 for assertion in assertions {
496 let result = check_single_assertion(response, assertion, run_trace);
497 if result.passed {
498 passed += 1;
499 } else {
500 failed += 1;
501 }
502 results.push(result);
503 }
504
505 (results, passed, failed)
506}
507
508fn check_single_assertion(
510 response: &str,
511 assertion: &Assertion,
512 run_trace: Option<&RunTrace>,
513) -> AssertionResult {
514 match assertion {
515 Assertion::Contains(pattern) => {
516 let passed = response.contains(pattern);
517 AssertionResult {
518 assertion: format!("contains: {}", pattern),
519 passed,
520 details: if passed {
521 format!("Response contains '{}'", pattern)
522 } else {
523 format!("Response does not contain '{}'", pattern)
524 },
525 }
526 }
527 Assertion::NotContains(pattern) => {
528 let passed = !response.contains(pattern);
529 AssertionResult {
530 assertion: format!("not_contains: {}", pattern),
531 passed,
532 details: if passed {
533 format!("Response does not contain '{}'", pattern)
534 } else {
535 format!("Response contains '{}'", pattern)
536 },
537 }
538 }
539 Assertion::Exact(expected) => {
540 let trimmed_response = response.trim();
541 let passed = trimmed_response == expected.as_str();
542 AssertionResult {
543 assertion: format!("exact: {}", expected),
544 passed,
545 details: if passed {
546 "Response matches exactly".to_string()
547 } else {
548 format!(
549 "Expected '{}', got '{}'",
550 expected,
551 trimmed_response.chars().take(100).collect::<String>()
552 )
553 },
554 }
555 }
556 Assertion::Regex(pattern) => {
557 let re = regex_lite::Regex::new(pattern);
558 match re {
559 Ok(re) => {
560 let passed = re.is_match(response);
561 AssertionResult {
562 assertion: format!("regex: {}", pattern),
563 passed,
564 details: if passed {
565 format!("Response matches pattern '{}'", pattern)
566 } else {
567 format!("Response does not match pattern '{}'", pattern)
568 },
569 }
570 }
571 Err(e) => AssertionResult {
572 assertion: format!("regex: {}", pattern),
573 passed: false,
574 details: format!("Invalid regex pattern: {}", e),
575 },
576 }
577 }
578 Assertion::NonEmpty => {
579 let passed = !response.is_empty();
580 AssertionResult {
581 assertion: "non_empty".to_string(),
582 passed,
583 details: if passed {
584 format!("Response is non-empty ({} chars)", response.len())
585 } else {
586 "Response is empty".to_string()
587 },
588 }
589 }
590 Assertion::MinLength(min) => {
591 let passed = response.len() >= *min;
592 AssertionResult {
593 assertion: format!("min_length: {}", min),
594 passed,
595 details: if passed {
596 format!("Response length {} >= {}", response.len(), min)
597 } else {
598 format!("Response length {} < {}", response.len(), min)
599 },
600 }
601 }
602 Assertion::MaxLength(max) => {
603 let passed = response.len() <= *max;
604 AssertionResult {
605 assertion: format!("max_length: {}", max),
606 passed,
607 details: if passed {
608 format!("Response length {} <= {}", response.len(), max)
609 } else {
610 format!("Response length {} > {}", response.len(), max)
611 },
612 }
613 }
614 Assertion::ToolCalled(tool_name) => {
615 let tool_calls = run_trace
616 .map(|t| &t.tool_calls)
617 .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
618 let passed = tool_calls.is_some();
619 AssertionResult {
620 assertion: format!("tool_called: {}", tool_name),
621 passed,
622 details: if passed {
623 format!("Tool '{}' was called", tool_name)
624 } else {
625 let all_tools: Vec<&str> = run_trace
626 .map(|t| {
627 t.tool_calls
628 .iter()
629 .map(|tc| tc.tool_name.as_str())
630 .collect()
631 })
632 .unwrap_or_default();
633 if all_tools.is_empty() {
634 format!("Tool '{}' was not called (no tools were called)", tool_name)
635 } else {
636 format!(
637 "Tool '{}' was not called (called: {})",
638 tool_name,
639 all_tools.join(", ")
640 )
641 }
642 },
643 }
644 }
645 Assertion::ToolNotCalled(tool_name) => {
646 let tool_calls = run_trace
647 .map(|t| &t.tool_calls)
648 .filter(|calls| calls.iter().any(|tc| tc.tool_name == *tool_name));
649 let passed = tool_calls.is_none();
650 AssertionResult {
651 assertion: format!("tool_not_called: {}", tool_name),
652 passed,
653 details: if passed {
654 format!("Tool '{}' was not called", tool_name)
655 } else {
656 format!("Tool '{}' was called but should not have been", tool_name)
657 },
658 }
659 }
660 }
661}
662
663impl EvalReport {
666 pub fn format_text(&self) -> String {
668 let mut output = String::new();
669
670 output.push_str(&format!("\n🐦⬛ Eval Report: {}\n", self.suite_name));
671 output.push_str(&format!("{:-^60}\n", ""));
672 output.push_str(&format!(
673 "Ran at: {}\n",
674 &self.ran_at[..19].replace('T', " ")
675 ));
676 output.push_str(&format!("Duration: {} ms\n", self.duration_ms));
677 output.push_str(&format!(
678 "Overall score: {:.1}%\n",
679 self.overall_score * 100.0
680 ));
681 output.push_str(&format!(
682 "Tasks: {}/{} passed\n",
683 self.passed_tasks, self.total_tasks
684 ));
685 output.push_str(&format!("{:-^60}\n", ""));
686
687 for result in &self.results {
688 output.push_str(&format!(
689 "\n {} {} — {:.1}%\n",
690 if result.passed { "✅" } else { "❌" },
691 result.task_name,
692 result.score * 100.0
693 ));
694
695 if let Some(ref error) = result.error {
696 output.push_str(&format!(" Error: {}\n", error));
697 }
698
699 if !result.assertion_results.is_empty() {
700 for ar in &result.assertion_results {
701 output.push_str(&format!(
702 " {} {}\n",
703 if ar.passed { " ✅" } else { " ❌" },
704 ar.details
705 ));
706 }
707 }
708
709 let trace = &result.trace;
711 output.push_str(&format!(
712 " Iterations: {} · LLM calls: {} · Tool calls: {} · Duration: {} ms\n",
713 trace.iterations,
714 trace.llm_calls.len(),
715 trace.tool_calls.len(),
716 trace.duration_ms
717 ));
718
719 let preview: String = trace.final_response.chars().take(200).collect();
721 if !preview.is_empty() {
722 output.push_str(&format!(" Response: {}\n", preview));
723 }
724 }
725
726 output
727 }
728
729 pub fn format_json(&self) -> serde_json::Value {
731 serde_json::to_value(self).unwrap_or(serde_json::json!({"error": "serialization failed"}))
732 }
733}
734
735impl EvalConfig {
738 pub fn from_file(path: &str) -> Result<Self> {
740 let content = std::fs::read_to_string(path).map_err(|e| {
741 RavenClawsError::CommandExecution(format!("Failed to read eval config: {}", e))
742 })?;
743
744 if content.trim().is_empty() {
745 return Err(RavenClawsError::CommandExecution(format!(
746 "Eval config file '{}' is empty — no tasks to run",
747 path
748 )));
749 }
750
751 let config: EvalConfig = toml::from_str(&content).map_err(|e| {
752 RavenClawsError::CommandExecution(format!("Failed to parse eval config: {}", e))
753 })?;
754
755 if config.tasks.is_empty() {
756 return Err(RavenClawsError::CommandExecution(format!(
757 "Eval config file '{}' has no tasks defined",
758 path
759 )));
760 }
761
762 Ok(config)
763 }
764}
765
// Unit tests for assertion checking, config deserialization, report
// formatting and trace serialization. None of them invokes the agent loop.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Substring assertions ---

    #[test]
    fn test_assertion_contains_pass() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::Contains("world".to_string()),
            None,
        );
        assert!(result.passed);
        assert!(result.details.contains("contains"));
    }

    #[test]
    fn test_assertion_contains_fail() {
        let result =
            check_single_assertion("hello world", &Assertion::Contains("foo".to_string()), None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_not_contains_pass() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::NotContains("foo".to_string()),
            None,
        );
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_not_contains_fail() {
        let result = check_single_assertion(
            "hello world",
            &Assertion::NotContains("world".to_string()),
            None,
        );
        assert!(!result.passed);
    }

    // --- Exact and regex assertions ---

    #[test]
    fn test_assertion_exact_pass() {
        let result = check_single_assertion("hello", &Assertion::Exact("hello".to_string()), None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_exact_fail() {
        let result = check_single_assertion("world", &Assertion::Exact("hello".to_string()), None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_regex_pass() {
        let result =
            check_single_assertion("hello 123", &Assertion::Regex(r"\d+".to_string()), None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_regex_fail() {
        let result = check_single_assertion("hello", &Assertion::Regex(r"\d+".to_string()), None);
        assert!(!result.passed);
    }

    // --- Emptiness and length assertions ---

    #[test]
    fn test_assertion_non_empty_pass() {
        let result = check_single_assertion("hello", &Assertion::NonEmpty, None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_non_empty_fail() {
        let result = check_single_assertion("", &Assertion::NonEmpty, None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_min_length_pass() {
        let result = check_single_assertion("hello", &Assertion::MinLength(3), None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_min_length_fail() {
        let result = check_single_assertion("hi", &Assertion::MinLength(5), None);
        assert!(!result.passed);
    }

    #[test]
    fn test_assertion_max_length_pass() {
        let result = check_single_assertion("hi", &Assertion::MaxLength(5), None);
        assert!(result.passed);
    }

    #[test]
    fn test_assertion_max_length_fail() {
        let result = check_single_assertion("hello world", &Assertion::MaxLength(5), None);
        assert!(!result.passed);
    }

    // --- Batch checking ---

    #[test]
    fn test_check_assertions_empty() {
        let (results, passed, failed) = check_assertions("hello", &[], None);
        assert!(results.is_empty());
        assert_eq!(passed, 0);
        assert_eq!(failed, 0);
    }

    #[test]
    fn test_check_assertions_multiple() {
        let assertions = vec![
            Assertion::Contains("hello".to_string()),
            Assertion::Contains("world".to_string()),
            Assertion::NonEmpty,
        ];
        let (results, passed, failed) = check_assertions("hello world", &assertions, None);
        assert_eq!(passed, 3);
        assert_eq!(failed, 0);
        assert_eq!(results.len(), 3);
    }

    // --- Tool-call assertions against a trace with two recorded calls ---

    #[test]
    fn test_check_assertions_tool_called() {
        let trace = RunTrace {
            task_name: "test".to_string(),
            started_at: "2026-01-01T00:00:00Z".to_string(),
            ended_at: "2026-01-01T00:00:01Z".to_string(),
            duration_ms: 1000,
            iterations: 1,
            steps: vec![],
            llm_calls: vec![],
            tool_calls: vec![
                ToolCallTrace {
                    iteration: 0,
                    tool_name: "web_search".to_string(),
                    arguments: serde_json::json!({"query": "test"}),
                    success: true,
                    output_preview: "results".to_string(),
                    duration_ms: 100,
                },
                ToolCallTrace {
                    iteration: 0,
                    tool_name: "read_file".to_string(),
                    arguments: serde_json::json!({"path": "/tmp/test"}),
                    success: true,
                    output_preview: "content".to_string(),
                    duration_ms: 50,
                },
            ],
            final_response: "response".to_string(),
        };

        // A recorded tool satisfies ToolCalled.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("web_search".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 1);
        assert_eq!(failed, 0);
        assert!(results[0].passed);

        // An unrecorded tool fails ToolCalled.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("nonexistent".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);

        // An unrecorded tool satisfies ToolNotCalled.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolNotCalled("nonexistent".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 1);
        assert_eq!(failed, 0);
        assert!(results[0].passed);

        // A recorded tool fails ToolNotCalled.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolNotCalled("web_search".to_string())],
            Some(&trace),
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);

        // Without a trace, ToolCalled cannot be satisfied.
        let (results, passed, failed) = check_assertions(
            "response",
            &[Assertion::ToolCalled("web_search".to_string())],
            None,
        );
        assert_eq!(passed, 0);
        assert_eq!(failed, 1);
        assert!(!results[0].passed);
    }

    // --- Config deserialization ---

    #[test]
    fn test_eval_config_from_toml() {
        let toml_str = r#"
name = "test-suite"
description = "A test suite"
system_prompt = "Be concise"
max_iterations = 3

[[tasks]]
name = "test-1"
prompt = "What is 2+2?"
golden = "4"
assertions = [{ type = "contains", value = "4" }]
weight = 1.0
required = true
"#;

        let config: EvalConfig = toml::from_str(toml_str).unwrap();
        assert_eq!(config.name, "test-suite");
        assert_eq!(config.tasks.len(), 1);
        assert_eq!(config.tasks[0].name, "test-1");
        assert_eq!(config.tasks[0].prompt, "What is 2+2?");
        assert_eq!(config.tasks[0].golden, "4");
        assert_eq!(config.tasks[0].assertions.len(), 1);
    }

    // Omitted fields must fall back to the serde defaults.
    #[test]
    fn test_eval_config_defaults() {
        let toml_str = r#"
[[tasks]]
name = "simple"
prompt = "Say hello"
"#;

        let config: EvalConfig = toml::from_str(toml_str).unwrap();
        assert_eq!(config.name, "unnamed");
        assert_eq!(config.system_prompt, default_system_prompt());
        assert_eq!(config.max_iterations, 5);
        assert_eq!(config.tasks[0].weight, 1.0);
        assert!(!config.tasks[0].required);
    }

    // --- Report formatting ---

    #[test]
    fn test_report_format_text() {
        let report = EvalReport {
            suite_name: "test".to_string(),
            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
            duration_ms: 100,
            overall_score: 0.75,
            total_tasks: 2,
            passed_tasks: 1,
            failed_tasks: 1,
            results: vec![
                EvalResult {
                    task_name: "pass-task".to_string(),
                    passed: true,
                    score: 1.0,
                    assertions_passed: 2,
                    assertions_failed: 0,
                    assertion_results: vec![AssertionResult {
                        assertion: "contains: hello".to_string(),
                        passed: true,
                        details: "Response contains 'hello'".to_string(),
                    }],
                    trace: RunTrace {
                        task_name: "pass-task".to_string(),
                        started_at: "2026-06-22T12:00:00+00:00".to_string(),
                        ended_at: "2026-06-22T12:00:01+00:00".to_string(),
                        duration_ms: 50,
                        iterations: 1,
                        steps: vec![],
                        llm_calls: vec![],
                        tool_calls: vec![],
                        final_response: "hello world".to_string(),
                    },
                    error: None,
                },
                EvalResult {
                    task_name: "fail-task".to_string(),
                    passed: false,
                    score: 0.0,
                    assertions_passed: 0,
                    assertions_failed: 1,
                    assertion_results: vec![AssertionResult {
                        assertion: "contains: foo".to_string(),
                        passed: false,
                        details: "Response does not contain 'foo'".to_string(),
                    }],
                    trace: RunTrace {
                        task_name: "fail-task".to_string(),
                        started_at: "2026-06-22T12:00:01+00:00".to_string(),
                        ended_at: "2026-06-22T12:00:02+00:00".to_string(),
                        duration_ms: 50,
                        iterations: 1,
                        steps: vec![],
                        llm_calls: vec![],
                        tool_calls: vec![],
                        final_response: "bar".to_string(),
                    },
                    error: None,
                },
            ],
        };

        let text = report.format_text();
        assert!(text.contains("Eval Report: test"));
        assert!(text.contains("75.0%"));
        assert!(text.contains("1/2 passed"));
        assert!(text.contains("✅ pass-task"));
        assert!(text.contains("❌ fail-task"));
    }

    #[test]
    fn test_report_format_json() {
        let report = EvalReport {
            suite_name: "test".to_string(),
            ran_at: "2026-06-22T12:00:00+00:00".to_string(),
            duration_ms: 100,
            overall_score: 1.0,
            total_tasks: 1,
            passed_tasks: 1,
            failed_tasks: 0,
            results: vec![],
        };

        let json = report.format_json();
        assert_eq!(json["suite_name"], "test");
        assert_eq!(json["overall_score"], 1.0);
    }

    // --- Error paths ---

    #[test]
    fn test_eval_config_from_file_not_found() {
        let result = EvalConfig::from_file("/tmp/nonexistent-eval-config.toml");
        assert!(result.is_err());
    }

    // An unparsable pattern must fail the assertion rather than panic.
    #[test]
    fn test_assertion_regex_invalid_pattern() {
        let result =
            check_single_assertion("hello", &Assertion::Regex(r"[invalid".to_string()), None);
        assert!(!result.passed);
        assert!(result.details.contains("Invalid regex"));
    }

    // --- Trace serialization ---

    #[test]
    fn test_trace_step_serialization() {
        let step = TraceStep {
            number: 0,
            step_type: StepType::Thought,
            content: "test".to_string(),
            duration_ms: 100,
        };
        let json = serde_json::to_string(&step).unwrap();
        assert!(json.contains("Thought"));
    }

    #[test]
    fn test_tool_call_trace_serialization() {
        let trace = ToolCallTrace {
            iteration: 0,
            tool_name: "shell_exec".to_string(),
            arguments: serde_json::json!({"command": "echo hello"}),
            success: true,
            output_preview: "hello".to_string(),
            duration_ms: 50,
        };
        let json = serde_json::to_string(&trace).unwrap();
        assert!(json.contains("shell_exec"));
        assert!(json.contains("echo hello"));
    }
}