// kbolt_types/eval.rs

1use serde::{Deserialize, Serialize};
2
3use crate::SearchMode;
4
5#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
6pub struct EvalJudgment {
7    pub path: String,
8    pub relevance: u8,
9}
10
11#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
12pub struct EvalImportReport {
13    pub dataset: String,
14    pub source: String,
15    pub output_dir: String,
16    pub corpus_dir: String,
17    pub manifest_path: String,
18    pub default_space: String,
19    pub collection: String,
20    pub document_count: usize,
21    pub query_count: usize,
22    pub judgment_count: usize,
23}
24
25#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
26pub struct EvalDataset {
27    pub cases: Vec<EvalCase>,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
31pub struct EvalCase {
32    pub query: String,
33    #[serde(default)]
34    pub space: Option<String>,
35    #[serde(default)]
36    pub collections: Vec<String>,
37    #[serde(default)]
38    pub judgments: Vec<EvalJudgment>,
39}
40
41#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
42pub struct EvalRunReport {
43    pub total_cases: usize,
44    pub modes: Vec<EvalModeReport>,
45    pub failed_modes: Vec<EvalModeFailure>,
46}
47
48#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
49pub struct EvalModeReport {
50    pub mode: SearchMode,
51    pub no_rerank: bool,
52    pub ndcg_at_10: f32,
53    pub recall_at_10: f32,
54    pub mrr_at_10: f32,
55    pub latency_p50_ms: u64,
56    pub latency_p95_ms: u64,
57    pub queries: Vec<EvalQueryReport>,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
61pub struct EvalModeFailure {
62    pub mode: SearchMode,
63    pub no_rerank: bool,
64    pub error: String,
65}
66
67#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
68pub struct EvalQueryReport {
69    pub query: String,
70    pub space: Option<String>,
71    pub collections: Vec<String>,
72    pub judgments: Vec<EvalJudgment>,
73    pub returned_paths: Vec<String>,
74    pub matched_paths: Vec<String>,
75    pub first_relevant_rank: Option<usize>,
76    pub elapsed_ms: u64,
77}
78
/// Serialization-shape tests: each test pins the exact JSON produced (or
/// accepted) by the eval types so wire-format changes are caught.
#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::{
        EvalCase, EvalDataset, EvalImportReport, EvalJudgment, EvalModeFailure, EvalModeReport,
        EvalQueryReport, EvalRunReport,
    };
    use crate::SearchMode;

    /// A dataset with one fully populated case serializes to the expected JSON.
    #[test]
    fn eval_dataset_serializes_minimal_case_shape() {
        let value = serde_json::to_value(EvalDataset {
            cases: vec![EvalCase {
                query: "trait object vs generic".to_string(),
                space: Some("bench".to_string()),
                collections: vec!["rust".to_string()],
                judgments: vec![
                    EvalJudgment {
                        path: "rust/traits.md".to_string(),
                        relevance: 2,
                    },
                    EvalJudgment {
                        path: "rust/generics.md".to_string(),
                        relevance: 1,
                    },
                ],
            }],
        })
        .expect("serialize eval dataset");

        assert_eq!(
            value,
            json!({
                "cases": [
                    {
                        "query": "trait object vs generic",
                        "space": "bench",
                        "collections": ["rust"],
                        "judgments": [
                            {"path": "rust/traits.md", "relevance": 2},
                            {"path": "rust/generics.md", "relevance": 1}
                        ]
                    }
                ]
            })
        );
    }

    /// A case with only `query` present deserializes, with every
    /// `#[serde(default)]` field taking its default value.
    #[test]
    fn eval_case_deserializes_missing_optional_fields_as_defaults() {
        let case: EvalCase = serde_json::from_value(json!({
            "query": "trait object vs generic"
        }))
        .expect("deserialize eval case with only a query");

        assert_eq!(
            case,
            EvalCase {
                query: "trait object vs generic".to_string(),
                space: None,
                collections: Vec::new(),
                judgments: Vec::new(),
            }
        );
    }

    /// A run report with one successful mode and one failed mode serializes
    /// metrics, per-query details, and failures to the expected JSON.
    #[test]
    fn eval_run_report_serializes_mode_metrics_and_queries() {
        let value = serde_json::to_value(EvalRunReport {
            total_cases: 1,
            modes: vec![EvalModeReport {
                mode: SearchMode::Keyword,
                no_rerank: true,
                ndcg_at_10: 1.0,
                recall_at_10: 1.0,
                mrr_at_10: 1.0,
                latency_p50_ms: 3,
                latency_p95_ms: 4,
                queries: vec![EvalQueryReport {
                    query: "trait object vs generic".to_string(),
                    space: Some("bench".to_string()),
                    collections: vec!["rust".to_string()],
                    judgments: vec![EvalJudgment {
                        path: "rust/traits.md".to_string(),
                        relevance: 1,
                    }],
                    returned_paths: vec!["rust/traits.md".to_string()],
                    matched_paths: vec!["rust/traits.md".to_string()],
                    first_relevant_rank: Some(1),
                    elapsed_ms: 3,
                }],
            }],
            failed_modes: vec![EvalModeFailure {
                mode: SearchMode::Deep,
                no_rerank: false,
                error: "model not available".to_string(),
            }],
        })
        .expect("serialize eval report");

        assert_eq!(
            value,
            json!({
                "total_cases": 1,
                "modes": [
                    {
                        "mode": "Keyword",
                        "no_rerank": true,
                        "ndcg_at_10": 1.0,
                        "recall_at_10": 1.0,
                        "mrr_at_10": 1.0,
                        "latency_p50_ms": 3,
                        "latency_p95_ms": 4,
                        "queries": [
                            {
                                "query": "trait object vs generic",
                                "space": "bench",
                                "collections": ["rust"],
                                "judgments": [
                                    {
                                        "path": "rust/traits.md",
                                        "relevance": 1
                                    }
                                ],
                                "returned_paths": ["rust/traits.md"],
                                "matched_paths": ["rust/traits.md"],
                                "first_relevant_rank": 1,
                                "elapsed_ms": 3
                            }
                        ]
                    }
                ],
                "failed_modes": [
                    {
                        "mode": "Deep",
                        "no_rerank": false,
                        "error": "model not available"
                    }
                ]
            })
        );
    }

    /// An import report serializes every path and count field under its own
    /// name.
    #[test]
    fn eval_import_report_serializes_paths_and_counts() {
        let value = serde_json::to_value(EvalImportReport {
            dataset: "scifact".to_string(),
            source: "/tmp/scifact-source".to_string(),
            output_dir: "/tmp/scifact-bench".to_string(),
            corpus_dir: "/tmp/scifact-bench/corpus".to_string(),
            manifest_path: "/tmp/scifact-bench/eval.toml".to_string(),
            default_space: "bench".to_string(),
            collection: "scifact".to_string(),
            document_count: 5_183,
            query_count: 300,
            judgment_count: 1_109,
        })
        .expect("serialize import report");

        assert_eq!(
            value,
            json!({
                "dataset": "scifact",
                "source": "/tmp/scifact-source",
                "output_dir": "/tmp/scifact-bench",
                "corpus_dir": "/tmp/scifact-bench/corpus",
                "manifest_path": "/tmp/scifact-bench/eval.toml",
                "default_space": "bench",
                "collection": "scifact",
                "document_count": 5183,
                "query_count": 300,
                "judgment_count": 1109
            })
        );
    }
}