//! Evaluation toolkit: datasets of question/context/answer cases, scoring
//! metrics (answer relevance, faithfulness, context recall, answer length),
//! a runner that applies each metric to every case, and aggregated reports.

pub mod dataset;
pub mod evaluator;
pub mod metrics;
pub mod report;
pub mod runner;

pub use dataset::{EvalCase, EvalDataset};
pub use evaluator::{EvalMetric, MetricScore};
pub use metrics::{AnswerLength, AnswerRelevance, ContextRecall, Faithfulness};
pub use report::{CaseResult, EvalReport};
pub use runner::EvalRunner;

pub mod handlers;
pub mod pipeline_sse;
pub mod plugin;
pub mod semantic;

pub use plugin::EvalPlugin;
pub use semantic::{EvalFault, EvalRunEvent, EvalState};

#[cfg(test)]
mod tests {
    use super::*;

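    /// Builds a two-case dataset: one answer that closely tracks its
    /// reference and one loose paraphrase, giving the metric tests below
    /// non-trivial inputs.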
    fn sample_dataset() -> EvalDataset {
        let mut ds = EvalDataset::new();
        ds.add_case(EvalCase {
            question: "What is Rust programming language?".to_string(),
            context: "Rust is a systems programming language focused on safety and performance."
                .to_string(),
            answer: "Rust is a systems programming language that prioritizes safety and speed."
                .to_string(),
            reference: Some(
                "Rust is a systems programming language focused on safety, speed, and concurrency."
                    .to_string(),
            ),
        });
        ds.add_case(EvalCase {
            question: "What is VIL?".to_string(),
            context: "VIL is a process-oriented server framework built in Rust.".to_string(),
            answer: "VIL is a framework for building servers using Rust.".to_string(),
            reference: Some(
                "VIL is a process-oriented server framework built in Rust.".to_string(),
            ),
        });
        ds
    }

    #[test]
    fn test_answer_relevance_scoring() {
        let metric = AnswerRelevance;
        let score = metric.evaluate(
            "What is Rust?",
            "Rust is a systems programming language.",
            "",
            None,
        );
        assert_eq!(score.name, "answer_relevance");
        assert!(score.score >= 0.0 && score.score <= 1.0);
        assert!(score.score > 0.0);
    }

    #[test]
    fn test_faithfulness_scoring() {
        let metric = Faithfulness;
        let score = metric.evaluate(
            "",
            "Rust is a systems language.",
            "Rust is a systems programming language focused on safety.",
            None,
        );
        assert_eq!(score.name, "faithfulness");
        assert!(score.score >= 0.0 && score.score <= 1.0);
        assert!(score.score > 0.0);
    }

    #[test]
    fn test_context_recall_with_reference() {
        let metric = ContextRecall;
        let score = metric.evaluate(
            "",
            "",
            "Rust is safe and fast.",
            Some("Rust is safe, fast, and concurrent."),
        );
        assert_eq!(score.name, "context_recall");
        assert!(score.score > 0.0);
    }

    #[test]
    fn test_context_recall_no_reference() {
        let metric = ContextRecall;
        let score = metric.evaluate("", "", "Rust is safe.", None);
        assert_eq!(score.score, 0.0);
    }

    #[test]
    fn test_answer_length_too_short() {
        let metric = AnswerLength::default();
        let score = metric.evaluate("", "Yes.", "", None);
        assert!(score.score < 1.0);
    }

    #[test]
    fn test_answer_length_good() {
        let metric = AnswerLength::default();
        let score = metric.evaluate(
            "",
            "Rust is a systems programming language focused on safety and performance.",
            "",
            None,
        );
        assert_eq!(score.score, 1.0);
    }

    #[test]
    fn test_dataset_loading_from_json() {
        let json = r#"{
            "cases": [
                {
                    "question": "What is Rust?",
                    "context": "Rust is a language.",
                    "answer": "Rust is great.",
                    "reference": null
                }
            ]
        }"#;
        let ds = EvalDataset::from_json(json).unwrap();
        assert_eq!(ds.len(), 1);
        assert_eq!(ds.cases[0].question, "What is Rust?");
    }

    #[test]
    fn test_runner_with_multiple_metrics() {
        let ds = sample_dataset();
        let runner = EvalRunner::new(ds)
            .add_metric(Box::new(AnswerRelevance))
            .add_metric(Box::new(Faithfulness))
            .add_metric(Box::new(ContextRecall))
            .add_metric(Box::new(AnswerLength::default()));
        let report = runner.run();
        assert_eq!(report.case_count(), 2);
        assert_eq!(report.results[0].scores.len(), 4);
        assert!(report.summary.contains_key("answer_relevance"));
        assert!(report.summary.contains_key("faithfulness"));
        assert!(report.summary.contains_key("context_recall"));
        assert!(report.summary.contains_key("answer_length"));
    }
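
    // Minimal single-metric sketch, assuming (as the multi-metric test above
    // suggests) that each registered metric contributes exactly one
    // MetricScore per case, named after that metric.
    #[test]
    fn test_runner_single_metric_score_count() {
        let runner = EvalRunner::new(sample_dataset()).add_metric(Box::new(Faithfulness));
        let report = runner.run();
        assert_eq!(report.case_count(), 2);
        for case_result in &report.results {
            assert_eq!(case_result.scores.len(), 1);
            assert_eq!(case_result.scores[0].name, "faithfulness");
        }
    }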

    #[test]
    fn test_empty_dataset() {
        let ds = EvalDataset::new();
        assert!(ds.is_empty());
        let runner = EvalRunner::new(ds).add_metric(Box::new(AnswerRelevance));
        let report = runner.run();
        assert_eq!(report.case_count(), 0);
        assert!(report.summary.is_empty());
    }

    #[test]
    fn test_report_aggregation() {
        let ds = sample_dataset();
        let runner = EvalRunner::new(ds)
            .add_metric(Box::new(AnswerRelevance))
            .add_metric(Box::new(Faithfulness));
        let report = runner.run();

        // Recompute the relevance average by hand from the per-case scores
        // and check it matches the report's aggregate.
        let relevance_avg = report.metric_average("answer_relevance").unwrap();
        let case_scores: Vec<f32> = report
            .results
            .iter()
            .map(|r| {
                r.scores
                    .iter()
                    .find(|s| s.name == "answer_relevance")
                    .unwrap()
                    .score
            })
            .collect();
        let expected_avg = case_scores.iter().sum::<f32>() / case_scores.len() as f32;
        assert!((relevance_avg - expected_avg).abs() < f32::EPSILON);
    }

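    // Shared invariant for every built-in metric: scores are normalized to
    // [0, 1]. The test below sweeps all cases and metrics in one pass.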
    #[test]
    fn test_metric_scores_in_range() {
        let ds = sample_dataset();
        let runner = EvalRunner::new(ds)
            .add_metric(Box::new(AnswerRelevance))
            .add_metric(Box::new(Faithfulness))
            .add_metric(Box::new(ContextRecall))
            .add_metric(Box::new(AnswerLength::default()));
        let report = runner.run();

        for case_result in &report.results {
            for score in &case_result.scores {
                assert!(
                    score.score >= 0.0 && score.score <= 1.0,
                    "Metric {} score {} out of [0,1] range",
                    score.name,
                    score.score
                );
            }
        }
    }

    #[test]
    fn test_dataset_roundtrip_json() {
        let ds = sample_dataset();
        let json = ds.to_json().unwrap();
        let ds2 = EvalDataset::from_json(&json).unwrap();
        assert_eq!(ds2.len(), ds.len());
        assert_eq!(ds2.cases[0].question, ds.cases[0].question);
    }
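
    // Sketch of a basic mutation check using only APIs exercised above:
    // add_case should grow the dataset and flip is_empty().
    #[test]
    fn test_dataset_add_case_updates_len() {
        let mut ds = EvalDataset::new();
        assert!(ds.is_empty());
        ds.add_case(EvalCase {
            question: "Q".to_string(),
            context: "C".to_string(),
            answer: "A".to_string(),
            reference: None,
        });
        assert_eq!(ds.len(), 1);
        assert!(!ds.is_empty());
    }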
}