1use serde::{Deserialize, Serialize};
2
3use crate::SearchMode;
4
5#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
6pub struct EvalJudgment {
7 pub path: String,
8 pub relevance: u8,
9}
10
11#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
12pub struct EvalImportReport {
13 pub dataset: String,
14 pub source: String,
15 pub output_dir: String,
16 pub corpus_dir: String,
17 pub manifest_path: String,
18 pub default_space: String,
19 pub collection: String,
20 pub document_count: usize,
21 pub query_count: usize,
22 pub judgment_count: usize,
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
26pub struct EvalDataset {
27 pub cases: Vec<EvalCase>,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
31pub struct EvalCase {
32 pub query: String,
33 #[serde(default)]
34 pub space: Option<String>,
35 #[serde(default)]
36 pub collections: Vec<String>,
37 #[serde(default)]
38 pub judgments: Vec<EvalJudgment>,
39}
40
41#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
42pub struct EvalRunReport {
43 pub total_cases: usize,
44 pub modes: Vec<EvalModeReport>,
45 pub failed_modes: Vec<EvalModeFailure>,
46}
47
48#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
49pub struct EvalModeReport {
50 pub mode: SearchMode,
51 pub no_rerank: bool,
52 pub ndcg_at_10: f32,
53 pub recall_at_10: f32,
54 pub mrr_at_10: f32,
55 pub latency_p50_ms: u64,
56 pub latency_p95_ms: u64,
57 pub queries: Vec<EvalQueryReport>,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
61pub struct EvalModeFailure {
62 pub mode: SearchMode,
63 pub no_rerank: bool,
64 pub error: String,
65}
66
67#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
68pub struct EvalQueryReport {
69 pub query: String,
70 pub space: Option<String>,
71 pub collections: Vec<String>,
72 pub judgments: Vec<EvalJudgment>,
73 pub returned_paths: Vec<String>,
74 pub matched_paths: Vec<String>,
75 pub first_relevant_rank: Option<usize>,
76 pub elapsed_ms: u64,
77}
78
#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::{
        EvalCase, EvalDataset, EvalImportReport, EvalJudgment, EvalModeFailure, EvalModeReport,
        EvalQueryReport, EvalRunReport,
    };
    use crate::SearchMode;

    /// Builds a judgment fixture from a borrowed path and a relevance value.
    fn judgment(path: &str, relevance: u8) -> EvalJudgment {
        EvalJudgment {
            path: String::from(path),
            relevance,
        }
    }

    /// A dataset holding one fully populated case serializes to the expected
    /// JSON object.
    #[test]
    fn eval_dataset_serializes_minimal_case_shape() {
        let case = EvalCase {
            query: String::from("trait object vs generic"),
            space: Some(String::from("bench")),
            collections: vec![String::from("rust")],
            judgments: vec![
                judgment("rust/traits.md", 2),
                judgment("rust/generics.md", 1),
            ],
        };
        let dataset = EvalDataset { cases: vec![case] };

        let actual = serde_json::to_value(&dataset).expect("serialize eval dataset");
        let expected = json!({
            "cases": [
                {
                    "query": "trait object vs generic",
                    "space": "bench",
                    "collections": ["rust"],
                    "judgments": [
                        {"path": "rust/traits.md", "relevance": 2},
                        {"path": "rust/generics.md", "relevance": 1}
                    ]
                }
            ]
        });

        assert_eq!(actual, expected);
    }

    /// A run report with one successful mode and one failed mode serializes
    /// its metrics, per-query details and failure entry.
    #[test]
    fn eval_run_report_serializes_mode_metrics_and_queries() {
        let query_report = EvalQueryReport {
            query: String::from("trait object vs generic"),
            space: Some(String::from("bench")),
            collections: vec![String::from("rust")],
            judgments: vec![judgment("rust/traits.md", 1)],
            returned_paths: vec![String::from("rust/traits.md")],
            matched_paths: vec![String::from("rust/traits.md")],
            first_relevant_rank: Some(1),
            elapsed_ms: 3,
        };
        let keyword_report = EvalModeReport {
            mode: SearchMode::Keyword,
            no_rerank: true,
            ndcg_at_10: 1.0,
            recall_at_10: 1.0,
            mrr_at_10: 1.0,
            latency_p50_ms: 3,
            latency_p95_ms: 4,
            queries: vec![query_report],
        };
        let deep_failure = EvalModeFailure {
            mode: SearchMode::Deep,
            no_rerank: false,
            error: String::from("model not available"),
        };
        let report = EvalRunReport {
            total_cases: 1,
            modes: vec![keyword_report],
            failed_modes: vec![deep_failure],
        };

        let actual = serde_json::to_value(&report).expect("serialize eval report");
        let expected = json!({
            "total_cases": 1,
            "modes": [
                {
                    "mode": "Keyword",
                    "no_rerank": true,
                    "ndcg_at_10": 1.0,
                    "recall_at_10": 1.0,
                    "mrr_at_10": 1.0,
                    "latency_p50_ms": 3,
                    "latency_p95_ms": 4,
                    "queries": [
                        {
                            "query": "trait object vs generic",
                            "space": "bench",
                            "collections": ["rust"],
                            "judgments": [
                                {
                                    "path": "rust/traits.md",
                                    "relevance": 1
                                }
                            ],
                            "returned_paths": ["rust/traits.md"],
                            "matched_paths": ["rust/traits.md"],
                            "first_relevant_rank": 1,
                            "elapsed_ms": 3
                        }
                    ]
                }
            ],
            "failed_modes": [
                {
                    "mode": "Deep",
                    "no_rerank": false,
                    "error": "model not available"
                }
            ]
        });

        assert_eq!(actual, expected);
    }

    /// An import report serializes every string field and count under its
    /// field name.
    #[test]
    fn eval_import_report_serializes_paths_and_counts() {
        let report = EvalImportReport {
            dataset: String::from("scifact"),
            source: String::from("/tmp/scifact-source"),
            output_dir: String::from("/tmp/scifact-bench"),
            corpus_dir: String::from("/tmp/scifact-bench/corpus"),
            manifest_path: String::from("/tmp/scifact-bench/eval.toml"),
            default_space: String::from("bench"),
            collection: String::from("scifact"),
            document_count: 5_183,
            query_count: 300,
            judgment_count: 1_109,
        };

        let actual = serde_json::to_value(&report).expect("serialize import report");
        let expected = json!({
            "dataset": "scifact",
            "source": "/tmp/scifact-source",
            "output_dir": "/tmp/scifact-bench",
            "corpus_dir": "/tmp/scifact-bench/corpus",
            "manifest_path": "/tmp/scifact-bench/eval.toml",
            "default_space": "bench",
            "collection": "scifact",
            "document_count": 5183,
            "query_count": 300,
            "judgment_count": 1109
        });

        assert_eq!(actual, expected);
    }
}