Skip to main content

kbolt_core/
eval_import.rs

1use std::collections::{HashMap, HashSet};
2use std::fs;
3use std::io::{BufRead, BufReader};
4use std::path::{Path, PathBuf};
5
6use kbolt_types::{EvalCase, EvalDataset, EvalImportReport, EvalJudgment, KboltError};
7use serde::Deserialize;
8
9use crate::Result;
10
/// Space name stamped on every generated eval case and echoed in the import report.
const DEFAULT_SPACE: &str = "bench";
/// Name of the sub-directory of the output directory that receives the rendered corpus files.
const CORPUS_DIRNAME: &str = "corpus";
/// File name of the TOML manifest written into the output directory.
const MANIFEST_FILENAME: &str = "eval.toml";
14
15#[derive(Debug, Deserialize)]
16struct BeirCorpusRecord {
17    #[serde(rename = "_id")]
18    id: String,
19    #[serde(default)]
20    title: Option<String>,
21    text: String,
22}
23
24#[derive(Debug, Deserialize)]
25struct BeirQueryRecord {
26    #[serde(rename = "_id")]
27    id: String,
28    text: String,
29}
30
31pub fn import_beir(
32    dataset: &str,
33    source: &Path,
34    output: &Path,
35    collection: Option<&str>,
36) -> Result<EvalImportReport> {
37    let dataset = normalize_import_name("dataset", dataset)?;
38    let collection = normalize_import_name("collection", collection.unwrap_or(dataset))?;
39    let layout = validate_beir_layout(dataset, source)?;
40    prepare_output_dir(output)?;
41
42    let corpus_dir = output.join(CORPUS_DIRNAME);
43    fs::create_dir_all(&corpus_dir)?;
44
45    let document_ids = materialize_beir_corpus(dataset, &layout.corpus, &corpus_dir)?;
46    let queries = load_beir_queries(dataset, &layout.queries)?;
47    let (judgments_by_query, judgment_count) =
48        load_beir_qrels(dataset, collection, &layout.qrels, &document_ids, &queries)?;
49    let eval_dataset = build_eval_dataset(dataset, collection, queries, judgments_by_query)?;
50    let query_count = eval_dataset.cases.len();
51    let manifest_path = output.join(MANIFEST_FILENAME);
52    fs::write(&manifest_path, toml::to_string_pretty(&eval_dataset)?)?;
53
54    Ok(EvalImportReport {
55        dataset: dataset.to_string(),
56        source: source.display().to_string(),
57        output_dir: output.display().to_string(),
58        corpus_dir: corpus_dir.display().to_string(),
59        manifest_path: manifest_path.display().to_string(),
60        default_space: DEFAULT_SPACE.to_string(),
61        collection: collection.to_string(),
62        document_count: document_ids.len(),
63        query_count,
64        judgment_count,
65    })
66}
67
/// Paths of the three input files located inside a validated BEIR source directory.
struct BeirLayout {
    /// `<source>/corpus.jsonl`
    corpus: PathBuf,
    /// `<source>/queries.jsonl`
    queries: PathBuf,
    /// `<source>/qrels/test.tsv`
    qrels: PathBuf,
}
73
74fn validate_beir_layout(dataset: &str, source: &Path) -> Result<BeirLayout> {
75    if !source.is_dir() {
76        return Err(KboltError::InvalidInput(format!(
77            "{dataset} source must be a directory: {}",
78            source.display()
79        ))
80        .into());
81    }
82
83    let corpus = source.join("corpus.jsonl");
84    let queries = source.join("queries.jsonl");
85    let qrels = source.join("qrels").join("test.tsv");
86
87    for (label, path) in [
88        ("corpus.jsonl", &corpus),
89        ("queries.jsonl", &queries),
90        ("qrels/test.tsv", &qrels),
91    ] {
92        if !path.is_file() {
93            return Err(KboltError::InvalidInput(format!(
94                "invalid {dataset} source {}: missing {label}",
95                source.display()
96            ))
97            .into());
98        }
99    }
100
101    Ok(BeirLayout {
102        corpus,
103        queries,
104        qrels,
105    })
106}
107
108fn prepare_output_dir(output: &Path) -> Result<()> {
109    if output.exists() {
110        if !output.is_dir() {
111            return Err(KboltError::InvalidInput(format!(
112                "eval import output must be a directory: {}",
113                output.display()
114            ))
115            .into());
116        }
117        if fs::read_dir(output)?.next().transpose()?.is_some() {
118            return Err(KboltError::InvalidInput(format!(
119                "eval import output directory must be empty: {}",
120                output.display()
121            ))
122            .into());
123        }
124        return Ok(());
125    }
126
127    fs::create_dir_all(output)?;
128    Ok(())
129}
130
131fn materialize_beir_corpus(
132    dataset: &str,
133    corpus_file: &Path,
134    corpus_dir: &Path,
135) -> Result<HashSet<String>> {
136    let records = read_jsonl::<BeirCorpusRecord>(corpus_file, "corpus")?;
137    if records.is_empty() {
138        return Err(KboltError::InvalidInput(format!(
139            "{dataset} corpus is empty: {}",
140            corpus_file.display()
141        ))
142        .into());
143    }
144
145    let mut ids = HashSet::with_capacity(records.len());
146    for record in records {
147        validate_record_id("corpus document", &record.id)?;
148        if !ids.insert(record.id.clone()) {
149            return Err(KboltError::InvalidInput(format!(
150                "duplicate corpus document id '{}'",
151                record.id
152            ))
153            .into());
154        }
155        let document_path = corpus_dir.join(format!("{}.md", record.id));
156        fs::write(document_path, render_corpus_document(&record))?;
157    }
158
159    Ok(ids)
160}
161
162fn render_corpus_document(record: &BeirCorpusRecord) -> String {
163    let title = record.title.as_deref().map(str::trim).unwrap_or_default();
164    let text = record.text.trim();
165    if title.is_empty() {
166        format!("{text}\n")
167    } else {
168        format!("# {title}\n\n{text}\n")
169    }
170}
171
172fn load_beir_queries(dataset: &str, queries_file: &Path) -> Result<Vec<BeirQueryRecord>> {
173    let queries = read_jsonl::<BeirQueryRecord>(queries_file, "queries")?;
174    if queries.is_empty() {
175        return Err(KboltError::InvalidInput(format!(
176            "{dataset} queries are empty: {}",
177            queries_file.display()
178        ))
179        .into());
180    }
181
182    let mut seen = HashSet::with_capacity(queries.len());
183    for query in &queries {
184        validate_record_id("query", &query.id)?;
185        if !seen.insert(query.id.clone()) {
186            return Err(
187                KboltError::InvalidInput(format!("duplicate query id '{}'", query.id)).into(),
188            );
189        }
190        if query.text.trim().is_empty() {
191            return Err(
192                KboltError::InvalidInput(format!("query '{}' has empty text", query.id)).into(),
193            );
194        }
195    }
196
197    Ok(queries)
198}
199
200fn load_beir_qrels(
201    dataset: &str,
202    collection: &str,
203    qrels_file: &Path,
204    document_ids: &HashSet<String>,
205    queries: &[BeirQueryRecord],
206) -> Result<(HashMap<String, Vec<EvalJudgment>>, usize)> {
207    let query_ids = queries
208        .iter()
209        .map(|query| query.id.as_str())
210        .collect::<HashSet<_>>();
211    let file = fs::File::open(qrels_file)?;
212    let reader = BufReader::new(file);
213    let mut judgments_by_query: HashMap<String, Vec<EvalJudgment>> = HashMap::new();
214    let mut seen_pairs = HashSet::new();
215    let mut judgment_count = 0;
216
217    for (index, line) in reader.lines().enumerate() {
218        let line_number = index + 1;
219        let line = line?;
220        let trimmed = line.trim();
221        if trimmed.is_empty() {
222            continue;
223        }
224        if line_number == 1 && trimmed.eq_ignore_ascii_case("query-id\tcorpus-id\tscore") {
225            continue;
226        }
227
228        let fields = trimmed.split('\t').collect::<Vec<_>>();
229        if fields.len() != 3 {
230            return Err(KboltError::InvalidInput(format!(
231                "invalid {dataset} qrels line {} in {}: expected 3 tab-separated fields",
232                line_number,
233                qrels_file.display()
234            ))
235            .into());
236        }
237
238        let query_id = fields[0].trim();
239        let document_id = fields[1].trim();
240        let relevance = fields[2].trim().parse::<u8>().map_err(|err| {
241            KboltError::InvalidInput(format!(
242                "invalid relevance '{}' on qrels line {} in {}: {err}",
243                fields[2].trim(),
244                line_number,
245                qrels_file.display()
246            ))
247        })?;
248
249        validate_record_id("query", query_id)?;
250        validate_record_id("corpus document", document_id)?;
251
252        if !query_ids.contains(query_id) {
253            return Err(KboltError::InvalidInput(format!(
254                "qrels references unknown query id '{}' in {}",
255                query_id,
256                qrels_file.display()
257            ))
258            .into());
259        }
260        if !document_ids.contains(document_id) {
261            return Err(KboltError::InvalidInput(format!(
262                "qrels references unknown corpus document id '{}' in {}",
263                document_id,
264                qrels_file.display()
265            ))
266            .into());
267        }
268        if relevance == 0 {
269            continue;
270        }
271        if !seen_pairs.insert((query_id.to_string(), document_id.to_string())) {
272            return Err(KboltError::InvalidInput(format!(
273                "duplicate qrels pair '{} -> {}' in {}",
274                query_id,
275                document_id,
276                qrels_file.display()
277            ))
278            .into());
279        }
280
281        judgments_by_query
282            .entry(query_id.to_string())
283            .or_default()
284            .push(EvalJudgment {
285                path: format!("{collection}/{document_id}.md"),
286                relevance,
287            });
288        judgment_count += 1;
289    }
290
291    Ok((judgments_by_query, judgment_count))
292}
293
294fn build_eval_dataset(
295    dataset: &str,
296    collection: &str,
297    queries: Vec<BeirQueryRecord>,
298    mut judgments_by_query: HashMap<String, Vec<EvalJudgment>>,
299) -> Result<EvalDataset> {
300    let mut cases = Vec::new();
301    for query in queries {
302        let Some(mut judgments) = judgments_by_query.remove(&query.id) else {
303            continue;
304        };
305        judgments.sort_by(|left, right| {
306            right
307                .relevance
308                .cmp(&left.relevance)
309                .then_with(|| left.path.cmp(&right.path))
310        });
311        cases.push(EvalCase {
312            query: query.text.trim().to_string(),
313            space: Some(DEFAULT_SPACE.to_string()),
314            collections: vec![collection.to_string()],
315            judgments,
316        });
317    }
318
319    if cases.is_empty() {
320        return Err(KboltError::InvalidInput(format!(
321            "{dataset} qrels did not produce any positive judged queries"
322        ))
323        .into());
324    }
325
326    Ok(EvalDataset { cases })
327}
328
329fn read_jsonl<T>(path: &Path, label: &str) -> Result<Vec<T>>
330where
331    T: for<'de> Deserialize<'de>,
332{
333    let file = fs::File::open(path)?;
334    let reader = BufReader::new(file);
335    let mut records = Vec::new();
336    for (index, line) in reader.lines().enumerate() {
337        let line_number = index + 1;
338        let line = line?;
339        let trimmed = line.trim();
340        if trimmed.is_empty() {
341            continue;
342        }
343        let record = serde_json::from_str(trimmed).map_err(|err| {
344            KboltError::InvalidInput(format!(
345                "invalid {label} jsonl line {} in {}: {err}",
346                line_number,
347                path.display()
348            ))
349        })?;
350        records.push(record);
351    }
352    Ok(records)
353}
354
355fn validate_record_id(kind: &str, value: &str) -> Result<()> {
356    let trimmed = value.trim();
357    if trimmed.is_empty() {
358        return Err(KboltError::InvalidInput(format!("{kind} id must not be empty")).into());
359    }
360    if trimmed == "." || trimmed == ".." || trimmed.contains('/') || trimmed.contains('\\') {
361        return Err(KboltError::InvalidInput(format!(
362            "{kind} id '{}' is not a valid filesystem-safe identifier",
363            value
364        ))
365        .into());
366    }
367    Ok(())
368}
369
370fn normalize_import_name<'a>(kind: &str, value: &'a str) -> Result<&'a str> {
371    let trimmed = value.trim();
372    if trimmed.is_empty() {
373        return Err(KboltError::InvalidInput(format!("{kind} name must not be empty")).into());
374    }
375    validate_record_id(kind, trimmed)?;
376    Ok(trimmed)
377}
378
#[cfg(test)]
mod tests {
    use std::fs;
    use std::path::{Path, PathBuf};

    use tempfile::tempdir;

    use crate::eval_store::load_eval_dataset_with_file;

    use super::import_beir;

    /// A full import writes the rendered corpus, the manifest and an accurate report.
    #[test]
    fn import_beir_materializes_corpus_and_manifest() {
        let scratch = tempdir().expect("create tempdir");
        let (source, output) = bundle_paths(scratch.path());
        write_beir_fixture(&source);

        let report = import_beir("scifact", &source, &output, None).expect("import benchmark");

        assert_eq!(report.dataset, "scifact");
        assert_eq!(report.default_space, "bench");
        assert_eq!(report.collection, "scifact");
        assert_eq!(report.document_count, 2);
        assert_eq!(report.query_count, 2);
        assert_eq!(report.judgment_count, 3);

        let rendered = fs::read_to_string(output.join("corpus/10.md")).expect("read corpus doc");
        assert_eq!(rendered, "# Alpha Evidence\n\nAlpha evidence text.\n");

        let dataset = load_eval_dataset_with_file(scratch.path(), Some(&output.join("eval.toml")))
            .expect("load imported manifest");
        assert_eq!(dataset.cases.len(), 2);

        let first_case = &dataset.cases[0];
        assert_eq!(first_case.space.as_deref(), Some("bench"));
        assert_eq!(first_case.collections, vec!["scifact".to_string()]);

        let top_judgment = &first_case.judgments[0];
        assert_eq!(top_judgment.path, "scifact/10.md");
        assert_eq!(top_judgment.relevance, 2);
    }

    /// An explicit collection name replaces the dataset name in the report and manifest.
    #[test]
    fn import_beir_honors_collection_override() {
        let scratch = tempdir().expect("create tempdir");
        let (source, output) = bundle_paths(scratch.path());
        write_beir_fixture(&source);

        let report =
            import_beir("fiqa", &source, &output, Some("finance")).expect("import benchmark");

        assert_eq!(report.dataset, "fiqa");
        assert_eq!(report.collection, "finance");

        let dataset = load_eval_dataset_with_file(scratch.path(), Some(&output.join("eval.toml")))
            .expect("load imported manifest");
        let first_case = &dataset.cases[0];
        assert_eq!(first_case.collections, vec!["finance".to_string()]);
        assert_eq!(first_case.judgments[0].path, "finance/10.md");
    }

    /// A corpus record without a title is rendered as plain text, without a heading.
    #[test]
    fn import_beir_accepts_missing_titles() {
        let scratch = tempdir().expect("create tempdir");
        let (source, output) = bundle_paths(scratch.path());
        write_beir_fixture_with_missing_title(&source);

        import_beir("fiqa", &source, &output, None).expect("import benchmark");

        let rendered = fs::read_to_string(output.join("corpus/10.md")).expect("read corpus doc");
        assert_eq!(rendered, "Alpha evidence text.\n");
    }

    /// An output directory that already holds a file is refused.
    #[test]
    fn import_beir_rejects_nonempty_output_directory() {
        let scratch = tempdir().expect("create tempdir");
        let (source, output) = bundle_paths(scratch.path());
        write_beir_fixture(&source);
        fs::create_dir_all(&output).expect("create output");
        fs::write(output.join("keep.txt"), "existing").expect("write sentinel");

        let err =
            import_beir("fiqa", &source, &output, None).expect_err("nonempty output should fail");

        let message = err.to_string();
        assert!(
            message.contains("eval import output directory must be empty"),
            "unexpected error: {err}"
        );
    }

    /// A source with an empty `qrels` directory (no `test.tsv`) is refused.
    #[test]
    fn import_beir_rejects_missing_test_split() {
        let scratch = tempdir().expect("create tempdir");
        let (source, output) = bundle_paths(scratch.path());
        write_source_files(
            &source,
            concat!(
                r#"{"_id":"10","title":"Alpha Evidence","text":"Alpha evidence text."}"#,
                "\n"
            ),
            concat!(r#"{"_id":"q1","text":"alpha evidence"}"#, "\n"),
            None,
        );

        let err = import_beir("fiqa", &source, &output, None)
            .expect_err("missing qrels/test.tsv should fail");

        let message = err.to_string();
        assert!(
            message.contains("missing qrels/test.tsv"),
            "unexpected error: {err}"
        );
    }

    /// Returns the `source` and `output` paths used by every test, both under `root`.
    fn bundle_paths(root: &Path) -> (PathBuf, PathBuf) {
        (root.join("source"), root.join("output"))
    }

    /// Creates `root/qrels` and writes the corpus, the queries and (when given) the qrels file.
    fn write_source_files(root: &Path, corpus: &str, queries: &str, qrels: Option<&str>) {
        fs::create_dir_all(root.join("qrels")).expect("create qrels dir");
        fs::write(root.join("corpus.jsonl"), corpus).expect("write corpus");
        fs::write(root.join("queries.jsonl"), queries).expect("write queries");
        if let Some(qrels) = qrels {
            fs::write(root.join("qrels").join("test.tsv"), qrels).expect("write qrels");
        }
    }

    /// Two titled documents, two queries, three positive judgments and one zero judgment.
    fn write_beir_fixture(root: &Path) {
        write_source_files(
            root,
            concat!(
                r#"{"_id":"10","title":"Alpha Evidence","text":"Alpha evidence text."}"#,
                "\n",
                r#"{"_id":"20","title":"Beta Study","text":"Beta study text."}"#,
                "\n"
            ),
            concat!(
                r#"{"_id":"q1","text":"alpha evidence"}"#,
                "\n",
                r#"{"_id":"q2","text":"beta study"}"#,
                "\n"
            ),
            Some(concat!(
                "query-id\tcorpus-id\tscore\n",
                "q1\t10\t2\n",
                "q1\t20\t1\n",
                "q2\t20\t1\n",
                "q2\t10\t0\n"
            )),
        );
    }

    /// A single untitled document with one query and one positive judgment.
    fn write_beir_fixture_with_missing_title(root: &Path) {
        write_source_files(
            root,
            concat!(r#"{"_id":"10","text":"Alpha evidence text."}"#, "\n"),
            concat!(r#"{"_id":"q1","text":"alpha evidence"}"#, "\n"),
            Some(concat!("query-id\tcorpus-id\tscore\n", "q1\t10\t2\n")),
        );
    }
}