// vil_eval/src/lib.rs
//! VIL Evaluation Framework (H10).
//!
//! Provides metrics, datasets, batch evaluation, and reporting for LLM output quality.
//!
//! ```
//! use vil_eval::{EvalDataset, EvalCase, EvalRunner, AnswerRelevance};
//!
//! let mut dataset = EvalDataset::new();
//! dataset.add_case(EvalCase {
//!     question: "What is Rust?".to_string(),
//!     context: "Rust is a systems programming language.".to_string(),
//!     answer: "Rust is a systems programming language focused on safety.".to_string(),
//!     reference: None,
//! });
//! let runner = EvalRunner::new(dataset).add_metric(Box::new(AnswerRelevance));
//! let report = runner.run();
//! assert_eq!(report.case_count(), 1);
//! ```

pub mod dataset;
pub mod evaluator;
pub mod metrics;
pub mod report;
pub mod runner;

pub use dataset::{EvalCase, EvalDataset};
pub use evaluator::{EvalMetric, MetricScore};
pub use metrics::{AnswerLength, AnswerRelevance, ContextRecall, Faithfulness};
pub use report::{CaseResult, EvalReport};
pub use runner::EvalRunner;

// VIL integration layer
pub mod handlers;
pub mod pipeline_sse;
pub mod plugin;
pub mod semantic;

pub use plugin::EvalPlugin;
pub use semantic::{EvalFault, EvalRunEvent, EvalState};

#[cfg(test)]
mod tests {
    use super::*;

    /// Two-case fixture (one Rust question, one VIL question) shared by the
    /// runner, report, and range tests. Both cases carry a reference answer
    /// so reference-dependent metrics (e.g. `ContextRecall`) produce scores.
    fn sample_dataset() -> EvalDataset {
        let mut ds = EvalDataset::new();
        ds.add_case(EvalCase {
            question: "What is Rust programming language?".to_string(),
            context: "Rust is a systems programming language focused on safety and performance."
                .to_string(),
            answer: "Rust is a systems programming language that prioritizes safety and speed."
                .to_string(),
            reference: Some(
                "Rust is a systems programming language focused on safety, speed, and concurrency."
                    .to_string(),
            ),
        });
        ds.add_case(EvalCase {
            question: "What is VIL?".to_string(),
            context: "VIL is a process-oriented server framework built in Rust.".to_string(),
            answer: "VIL is a framework for building servers using Rust.".to_string(),
            reference: Some(
                "VIL is a process-oriented server framework built in Rust.".to_string(),
            ),
        });
        ds
    }

    #[test]
    fn test_answer_relevance_scoring() {
        let metric = AnswerRelevance;
        let score = metric.evaluate(
            "What is Rust?",
            "Rust is a systems programming language.",
            "",
            None,
        );
        assert_eq!(score.name, "answer_relevance");
        // Scores are normalized to [0, 1].
        assert!((0.0..=1.0).contains(&score.score));
        assert!(score.score > 0.0); // "Rust" and "is" overlap
    }

    #[test]
    fn test_faithfulness_scoring() {
        let metric = Faithfulness;
        let score = metric.evaluate(
            "",
            "Rust is a systems language.",
            "Rust is a systems programming language focused on safety.",
            None,
        );
        assert_eq!(score.name, "faithfulness");
        assert!((0.0..=1.0).contains(&score.score));
        assert!(score.score > 0.0);
    }

    #[test]
    fn test_context_recall_with_reference() {
        let metric = ContextRecall;
        let score = metric.evaluate(
            "",
            "",
            "Rust is safe and fast.",
            Some("Rust is safe, fast, and concurrent."),
        );
        assert_eq!(score.name, "context_recall");
        assert!(score.score > 0.0);
    }

    /// Without a reference answer there is nothing to recall against, so the
    /// metric must degrade to a zero score rather than panic.
    #[test]
    fn test_context_recall_no_reference() {
        let metric = ContextRecall;
        let score = metric.evaluate("", "", "Rust is safe.", None);
        assert_eq!(score.score, 0.0);
    }

    #[test]
    fn test_answer_length_too_short() {
        let metric = AnswerLength::default();
        let score = metric.evaluate("", "Yes.", "", None);
        assert!(score.score < 1.0);
    }

    #[test]
    fn test_answer_length_good() {
        let metric = AnswerLength::default();
        let score = metric.evaluate(
            "",
            "Rust is a systems programming language focused on safety and performance.",
            "",
            None,
        );
        assert_eq!(score.score, 1.0);
    }

    #[test]
    fn test_dataset_loading_from_json() {
        let json = r#"{
            "cases": [
                {
                    "question": "What is Rust?",
                    "context": "Rust is a language.",
                    "answer": "Rust is great.",
                    "reference": null
                }
            ]
        }"#;
        let ds = EvalDataset::from_json(json).unwrap();
        assert_eq!(ds.len(), 1);
        assert_eq!(ds.cases[0].question, "What is Rust?");
    }

    #[test]
    fn test_runner_with_multiple_metrics() {
        let ds = sample_dataset();
        let runner = EvalRunner::new(ds)
            .add_metric(Box::new(AnswerRelevance))
            .add_metric(Box::new(Faithfulness))
            .add_metric(Box::new(ContextRecall))
            .add_metric(Box::new(AnswerLength::default()));
        let report = runner.run();
        assert_eq!(report.case_count(), 2);
        // Every case gets one score per registered metric.
        assert_eq!(report.results[0].scores.len(), 4);
        assert!(report.summary.contains_key("answer_relevance"));
        assert!(report.summary.contains_key("faithfulness"));
        assert!(report.summary.contains_key("context_recall"));
        assert!(report.summary.contains_key("answer_length"));
    }

    #[test]
    fn test_empty_dataset() {
        let ds = EvalDataset::new();
        assert!(ds.is_empty());
        let runner = EvalRunner::new(ds).add_metric(Box::new(AnswerRelevance));
        let report = runner.run();
        assert_eq!(report.case_count(), 0);
        assert!(report.summary.is_empty());
    }

    #[test]
    fn test_report_aggregation() {
        let ds = sample_dataset();
        let runner = EvalRunner::new(ds)
            .add_metric(Box::new(AnswerRelevance))
            .add_metric(Box::new(Faithfulness));
        let report = runner.run();

        // Recompute the per-case mean independently and compare it with the
        // report's aggregated value.
        let relevance_avg = report.metric_average("answer_relevance").unwrap();
        let case_scores: Vec<f32> = report
            .results
            .iter()
            .map(|r| {
                r.scores
                    .iter()
                    .find(|s| s.name == "answer_relevance")
                    .unwrap()
                    .score
            })
            .collect();
        let expected_avg = case_scores.iter().sum::<f32>() / case_scores.len() as f32;
        // NOTE: f32::EPSILON is too strict here — any difference in summation
        // order or intermediate precision inside `metric_average` exceeds
        // machine epsilon. Use a small absolute tolerance instead.
        assert!(
            (relevance_avg - expected_avg).abs() < 1e-5,
            "aggregated average {relevance_avg} differs from recomputed {expected_avg}"
        );
    }

    #[test]
    fn test_metric_scores_in_range() {
        let ds = sample_dataset();
        let runner = EvalRunner::new(ds)
            .add_metric(Box::new(AnswerRelevance))
            .add_metric(Box::new(Faithfulness))
            .add_metric(Box::new(ContextRecall))
            .add_metric(Box::new(AnswerLength::default()));
        let report = runner.run();

        for case_result in &report.results {
            for score in &case_result.scores {
                assert!(
                    (0.0..=1.0).contains(&score.score),
                    "Metric {} score {} out of [0,1] range",
                    score.name,
                    score.score
                );
            }
        }
    }

    /// Serializing a dataset to JSON and parsing it back must preserve the
    /// case count and case contents.
    #[test]
    fn test_dataset_roundtrip_json() {
        let ds = sample_dataset();
        let json = ds.to_json().unwrap();
        let ds2 = EvalDataset::from_json(&json).unwrap();
        assert_eq!(ds2.len(), ds.len());
        assert_eq!(ds2.cases[0].question, ds.cases[0].question);
    }
}