1use std::collections::{HashMap, HashSet};
2use std::fs;
3use std::io::{BufRead, BufReader};
4use std::path::{Path, PathBuf};
5
6use kbolt_types::{EvalCase, EvalDataset, EvalImportReport, EvalJudgment, KboltError};
7use serde::Deserialize;
8
9use crate::Result;
10
/// Space name reported by the importer and stamped on every generated eval case.
const DEFAULT_SPACE: &str = "bench";
/// Name of the subdirectory (inside the output directory) that receives one
/// Markdown file per corpus document.
const CORPUS_DIRNAME: &str = "corpus";
/// File name of the TOML eval manifest written at the root of the output directory.
const MANIFEST_FILENAME: &str = "eval.toml";
14
/// One JSON object from a BEIR `corpus.jsonl` file (one object per line).
#[derive(Debug, Deserialize)]
struct BeirCorpusRecord {
    /// Document identifier, read from the JSON `_id` key.
    #[serde(rename = "_id")]
    id: String,
    /// Optional document title; a missing key deserializes to `None`.
    #[serde(default)]
    title: Option<String>,
    /// Document body text.
    text: String,
}
23
/// One JSON object from a BEIR `queries.jsonl` file (one object per line).
#[derive(Debug, Deserialize)]
struct BeirQueryRecord {
    /// Query identifier, read from the JSON `_id` key.
    #[serde(rename = "_id")]
    id: String,
    /// The query text.
    text: String,
}
30
31pub fn import_beir(
32 dataset: &str,
33 source: &Path,
34 output: &Path,
35 collection: Option<&str>,
36) -> Result<EvalImportReport> {
37 let dataset = normalize_import_name("dataset", dataset)?;
38 let collection = normalize_import_name("collection", collection.unwrap_or(dataset))?;
39 let layout = validate_beir_layout(dataset, source)?;
40 prepare_output_dir(output)?;
41
42 let corpus_dir = output.join(CORPUS_DIRNAME);
43 fs::create_dir_all(&corpus_dir)?;
44
45 let document_ids = materialize_beir_corpus(dataset, &layout.corpus, &corpus_dir)?;
46 let queries = load_beir_queries(dataset, &layout.queries)?;
47 let (judgments_by_query, judgment_count) =
48 load_beir_qrels(dataset, collection, &layout.qrels, &document_ids, &queries)?;
49 let eval_dataset = build_eval_dataset(dataset, collection, queries, judgments_by_query)?;
50 let query_count = eval_dataset.cases.len();
51 let manifest_path = output.join(MANIFEST_FILENAME);
52 fs::write(&manifest_path, toml::to_string_pretty(&eval_dataset)?)?;
53
54 Ok(EvalImportReport {
55 dataset: dataset.to_string(),
56 source: source.display().to_string(),
57 output_dir: output.display().to_string(),
58 corpus_dir: corpus_dir.display().to_string(),
59 manifest_path: manifest_path.display().to_string(),
60 default_space: DEFAULT_SPACE.to_string(),
61 collection: collection.to_string(),
62 document_count: document_ids.len(),
63 query_count,
64 judgment_count,
65 })
66}
67
/// Paths of the three input files that make up a BEIR source directory.
struct BeirLayout {
    /// Path to `corpus.jsonl`.
    corpus: PathBuf,
    /// Path to `queries.jsonl`.
    queries: PathBuf,
    /// Path to `qrels/test.tsv`.
    qrels: PathBuf,
}
73
74fn validate_beir_layout(dataset: &str, source: &Path) -> Result<BeirLayout> {
75 if !source.is_dir() {
76 return Err(KboltError::InvalidInput(format!(
77 "{dataset} source must be a directory: {}",
78 source.display()
79 ))
80 .into());
81 }
82
83 let corpus = source.join("corpus.jsonl");
84 let queries = source.join("queries.jsonl");
85 let qrels = source.join("qrels").join("test.tsv");
86
87 for (label, path) in [
88 ("corpus.jsonl", &corpus),
89 ("queries.jsonl", &queries),
90 ("qrels/test.tsv", &qrels),
91 ] {
92 if !path.is_file() {
93 return Err(KboltError::InvalidInput(format!(
94 "invalid {dataset} source {}: missing {label}",
95 source.display()
96 ))
97 .into());
98 }
99 }
100
101 Ok(BeirLayout {
102 corpus,
103 queries,
104 qrels,
105 })
106}
107
108fn prepare_output_dir(output: &Path) -> Result<()> {
109 if output.exists() {
110 if !output.is_dir() {
111 return Err(KboltError::InvalidInput(format!(
112 "eval import output must be a directory: {}",
113 output.display()
114 ))
115 .into());
116 }
117 if fs::read_dir(output)?.next().transpose()?.is_some() {
118 return Err(KboltError::InvalidInput(format!(
119 "eval import output directory must be empty: {}",
120 output.display()
121 ))
122 .into());
123 }
124 return Ok(());
125 }
126
127 fs::create_dir_all(output)?;
128 Ok(())
129}
130
131fn materialize_beir_corpus(
132 dataset: &str,
133 corpus_file: &Path,
134 corpus_dir: &Path,
135) -> Result<HashSet<String>> {
136 let records = read_jsonl::<BeirCorpusRecord>(corpus_file, "corpus")?;
137 if records.is_empty() {
138 return Err(KboltError::InvalidInput(format!(
139 "{dataset} corpus is empty: {}",
140 corpus_file.display()
141 ))
142 .into());
143 }
144
145 let mut ids = HashSet::with_capacity(records.len());
146 for record in records {
147 validate_record_id("corpus document", &record.id)?;
148 if !ids.insert(record.id.clone()) {
149 return Err(KboltError::InvalidInput(format!(
150 "duplicate corpus document id '{}'",
151 record.id
152 ))
153 .into());
154 }
155 let document_path = corpus_dir.join(format!("{}.md", record.id));
156 fs::write(document_path, render_corpus_document(&record))?;
157 }
158
159 Ok(ids)
160}
161
162fn render_corpus_document(record: &BeirCorpusRecord) -> String {
163 let title = record.title.as_deref().map(str::trim).unwrap_or_default();
164 let text = record.text.trim();
165 if title.is_empty() {
166 format!("{text}\n")
167 } else {
168 format!("# {title}\n\n{text}\n")
169 }
170}
171
172fn load_beir_queries(dataset: &str, queries_file: &Path) -> Result<Vec<BeirQueryRecord>> {
173 let queries = read_jsonl::<BeirQueryRecord>(queries_file, "queries")?;
174 if queries.is_empty() {
175 return Err(KboltError::InvalidInput(format!(
176 "{dataset} queries are empty: {}",
177 queries_file.display()
178 ))
179 .into());
180 }
181
182 let mut seen = HashSet::with_capacity(queries.len());
183 for query in &queries {
184 validate_record_id("query", &query.id)?;
185 if !seen.insert(query.id.clone()) {
186 return Err(
187 KboltError::InvalidInput(format!("duplicate query id '{}'", query.id)).into(),
188 );
189 }
190 if query.text.trim().is_empty() {
191 return Err(
192 KboltError::InvalidInput(format!("query '{}' has empty text", query.id)).into(),
193 );
194 }
195 }
196
197 Ok(queries)
198}
199
200fn load_beir_qrels(
201 dataset: &str,
202 collection: &str,
203 qrels_file: &Path,
204 document_ids: &HashSet<String>,
205 queries: &[BeirQueryRecord],
206) -> Result<(HashMap<String, Vec<EvalJudgment>>, usize)> {
207 let query_ids = queries
208 .iter()
209 .map(|query| query.id.as_str())
210 .collect::<HashSet<_>>();
211 let file = fs::File::open(qrels_file)?;
212 let reader = BufReader::new(file);
213 let mut judgments_by_query: HashMap<String, Vec<EvalJudgment>> = HashMap::new();
214 let mut seen_pairs = HashSet::new();
215 let mut judgment_count = 0;
216
217 for (index, line) in reader.lines().enumerate() {
218 let line_number = index + 1;
219 let line = line?;
220 let trimmed = line.trim();
221 if trimmed.is_empty() {
222 continue;
223 }
224 if line_number == 1 && trimmed.eq_ignore_ascii_case("query-id\tcorpus-id\tscore") {
225 continue;
226 }
227
228 let fields = trimmed.split('\t').collect::<Vec<_>>();
229 if fields.len() != 3 {
230 return Err(KboltError::InvalidInput(format!(
231 "invalid {dataset} qrels line {} in {}: expected 3 tab-separated fields",
232 line_number,
233 qrels_file.display()
234 ))
235 .into());
236 }
237
238 let query_id = fields[0].trim();
239 let document_id = fields[1].trim();
240 let relevance = fields[2].trim().parse::<u8>().map_err(|err| {
241 KboltError::InvalidInput(format!(
242 "invalid relevance '{}' on qrels line {} in {}: {err}",
243 fields[2].trim(),
244 line_number,
245 qrels_file.display()
246 ))
247 })?;
248
249 validate_record_id("query", query_id)?;
250 validate_record_id("corpus document", document_id)?;
251
252 if !query_ids.contains(query_id) {
253 return Err(KboltError::InvalidInput(format!(
254 "qrels references unknown query id '{}' in {}",
255 query_id,
256 qrels_file.display()
257 ))
258 .into());
259 }
260 if !document_ids.contains(document_id) {
261 return Err(KboltError::InvalidInput(format!(
262 "qrels references unknown corpus document id '{}' in {}",
263 document_id,
264 qrels_file.display()
265 ))
266 .into());
267 }
268 if relevance == 0 {
269 continue;
270 }
271 if !seen_pairs.insert((query_id.to_string(), document_id.to_string())) {
272 return Err(KboltError::InvalidInput(format!(
273 "duplicate qrels pair '{} -> {}' in {}",
274 query_id,
275 document_id,
276 qrels_file.display()
277 ))
278 .into());
279 }
280
281 judgments_by_query
282 .entry(query_id.to_string())
283 .or_default()
284 .push(EvalJudgment {
285 path: format!("{collection}/{document_id}.md"),
286 relevance,
287 });
288 judgment_count += 1;
289 }
290
291 Ok((judgments_by_query, judgment_count))
292}
293
294fn build_eval_dataset(
295 dataset: &str,
296 collection: &str,
297 queries: Vec<BeirQueryRecord>,
298 mut judgments_by_query: HashMap<String, Vec<EvalJudgment>>,
299) -> Result<EvalDataset> {
300 let mut cases = Vec::new();
301 for query in queries {
302 let Some(mut judgments) = judgments_by_query.remove(&query.id) else {
303 continue;
304 };
305 judgments.sort_by(|left, right| {
306 right
307 .relevance
308 .cmp(&left.relevance)
309 .then_with(|| left.path.cmp(&right.path))
310 });
311 cases.push(EvalCase {
312 query: query.text.trim().to_string(),
313 space: Some(DEFAULT_SPACE.to_string()),
314 collections: vec![collection.to_string()],
315 judgments,
316 });
317 }
318
319 if cases.is_empty() {
320 return Err(KboltError::InvalidInput(format!(
321 "{dataset} qrels did not produce any positive judged queries"
322 ))
323 .into());
324 }
325
326 Ok(EvalDataset { cases })
327}
328
329fn read_jsonl<T>(path: &Path, label: &str) -> Result<Vec<T>>
330where
331 T: for<'de> Deserialize<'de>,
332{
333 let file = fs::File::open(path)?;
334 let reader = BufReader::new(file);
335 let mut records = Vec::new();
336 for (index, line) in reader.lines().enumerate() {
337 let line_number = index + 1;
338 let line = line?;
339 let trimmed = line.trim();
340 if trimmed.is_empty() {
341 continue;
342 }
343 let record = serde_json::from_str(trimmed).map_err(|err| {
344 KboltError::InvalidInput(format!(
345 "invalid {label} jsonl line {} in {}: {err}",
346 line_number,
347 path.display()
348 ))
349 })?;
350 records.push(record);
351 }
352 Ok(records)
353}
354
355fn validate_record_id(kind: &str, value: &str) -> Result<()> {
356 let trimmed = value.trim();
357 if trimmed.is_empty() {
358 return Err(KboltError::InvalidInput(format!("{kind} id must not be empty")).into());
359 }
360 if trimmed == "." || trimmed == ".." || trimmed.contains('/') || trimmed.contains('\\') {
361 return Err(KboltError::InvalidInput(format!(
362 "{kind} id '{}' is not a valid filesystem-safe identifier",
363 value
364 ))
365 .into());
366 }
367 Ok(())
368}
369
370fn normalize_import_name<'a>(kind: &str, value: &'a str) -> Result<&'a str> {
371 let trimmed = value.trim();
372 if trimmed.is_empty() {
373 return Err(KboltError::InvalidInput(format!("{kind} name must not be empty")).into());
374 }
375 validate_record_id(kind, trimmed)?;
376 Ok(trimmed)
377}
378
/// End-to-end tests for `import_beir`, run against small BEIR fixtures written
/// into a temporary directory.
#[cfg(test)]
mod tests {
    use std::fs;
    use std::path::Path;

    use tempfile::tempdir;

    use crate::eval_store::load_eval_dataset_with_file;

    use super::import_beir;

    /// Happy path: the report counts, the rendered corpus document and the
    /// reloaded manifest all reflect the fixture.
    #[test]
    fn import_beir_materializes_corpus_and_manifest() {
        let tmp = tempdir().expect("create tempdir");
        let source = tmp.path().join("source");
        let output = tmp.path().join("output");
        write_beir_fixture(&source);

        let report = import_beir("scifact", &source, &output, None).expect("import benchmark");

        assert_eq!(report.dataset, "scifact");
        assert_eq!(report.default_space, "bench");
        // With no override the collection falls back to the dataset name.
        assert_eq!(report.collection, "scifact");
        assert_eq!(report.document_count, 2);
        assert_eq!(report.query_count, 2);
        // The fixture has four qrels rows; the one scored 0 is not counted.
        assert_eq!(report.judgment_count, 3);
        assert_eq!(
            fs::read_to_string(output.join("corpus/10.md")).expect("read corpus doc"),
            "# Alpha Evidence\n\nAlpha evidence text.\n"
        );

        let dataset = load_eval_dataset_with_file(tmp.path(), Some(&output.join("eval.toml")))
            .expect("load imported manifest");
        assert_eq!(dataset.cases.len(), 2);
        assert_eq!(dataset.cases[0].space.as_deref(), Some("bench"));
        assert_eq!(dataset.cases[0].collections, vec!["scifact".to_string()]);
        // Judgments are sorted by descending relevance, so the score-2 row is first.
        assert_eq!(dataset.cases[0].judgments[0].path, "scifact/10.md");
        assert_eq!(dataset.cases[0].judgments[0].relevance, 2);
    }

    /// An explicit collection name replaces the dataset name in the report,
    /// the case collections and the judgment paths.
    #[test]
    fn import_beir_honors_collection_override() {
        let tmp = tempdir().expect("create tempdir");
        let source = tmp.path().join("source");
        let output = tmp.path().join("output");
        write_beir_fixture(&source);

        let report =
            import_beir("fiqa", &source, &output, Some("finance")).expect("import benchmark");

        assert_eq!(report.dataset, "fiqa");
        assert_eq!(report.collection, "finance");

        let dataset = load_eval_dataset_with_file(tmp.path(), Some(&output.join("eval.toml")))
            .expect("load imported manifest");
        assert_eq!(dataset.cases[0].collections, vec!["finance".to_string()]);
        assert_eq!(dataset.cases[0].judgments[0].path, "finance/10.md");
    }

    /// A corpus record without a `title` key is rendered as body text only,
    /// with no heading line.
    #[test]
    fn import_beir_accepts_missing_titles() {
        let tmp = tempdir().expect("create tempdir");
        let source = tmp.path().join("source");
        let output = tmp.path().join("output");
        write_beir_fixture_with_missing_title(&source);

        import_beir("fiqa", &source, &output, None).expect("import benchmark");

        assert_eq!(
            fs::read_to_string(output.join("corpus/10.md")).expect("read corpus doc"),
            "Alpha evidence text.\n"
        );
    }

    /// The import refuses to write into an output directory that already
    /// contains a file.
    #[test]
    fn import_beir_rejects_nonempty_output_directory() {
        let tmp = tempdir().expect("create tempdir");
        let source = tmp.path().join("source");
        let output = tmp.path().join("output");
        write_beir_fixture(&source);
        fs::create_dir_all(&output).expect("create output");
        fs::write(output.join("keep.txt"), "existing").expect("write sentinel");

        let err =
            import_beir("fiqa", &source, &output, None).expect_err("nonempty output should fail");

        assert!(
            err.to_string()
                .contains("eval import output directory must be empty"),
            "unexpected error: {err}"
        );
    }

    /// A source directory whose `qrels/` folder lacks `test.tsv` is rejected.
    #[test]
    fn import_beir_rejects_missing_test_split() {
        let tmp = tempdir().expect("create tempdir");
        let source = tmp.path().join("source");
        // Create the qrels directory but deliberately no test.tsv inside it.
        fs::create_dir_all(source.join("qrels")).expect("create qrels dir");
        fs::write(
            source.join("corpus.jsonl"),
            "{\"_id\":\"10\",\"title\":\"Alpha Evidence\",\"text\":\"Alpha evidence text.\"}\n",
        )
        .expect("write corpus");
        fs::write(
            source.join("queries.jsonl"),
            "{\"_id\":\"q1\",\"text\":\"alpha evidence\"}\n",
        )
        .expect("write queries");

        let err = import_beir("fiqa", &source, &tmp.path().join("output"), None)
            .expect_err("missing qrels/test.tsv should fail");

        assert!(
            err.to_string().contains("missing qrels/test.tsv"),
            "unexpected error: {err}"
        );
    }

    /// Writes a complete fixture under `root`: two titled documents, two
    /// queries, and a qrels file with a header, three positive rows and one
    /// row scored 0.
    fn write_beir_fixture(root: &Path) {
        fs::create_dir_all(root.join("qrels")).expect("create qrels dir");
        fs::write(
            root.join("corpus.jsonl"),
            concat!(
                "{\"_id\":\"10\",\"title\":\"Alpha Evidence\",\"text\":\"Alpha evidence text.\"}\n",
                "{\"_id\":\"20\",\"title\":\"Beta Study\",\"text\":\"Beta study text.\"}\n"
            ),
        )
        .expect("write corpus");
        fs::write(
            root.join("queries.jsonl"),
            concat!(
                "{\"_id\":\"q1\",\"text\":\"alpha evidence\"}\n",
                "{\"_id\":\"q2\",\"text\":\"beta study\"}\n"
            ),
        )
        .expect("write queries");
        fs::write(
            root.join("qrels").join("test.tsv"),
            concat!(
                "query-id\tcorpus-id\tscore\n",
                "q1\t10\t2\n",
                "q1\t20\t1\n",
                "q2\t20\t1\n",
                "q2\t10\t0\n"
            ),
        )
        .expect("write qrels");
    }

    /// Writes a minimal fixture under `root` whose single corpus document has
    /// no `title` key.
    fn write_beir_fixture_with_missing_title(root: &Path) {
        fs::create_dir_all(root.join("qrels")).expect("create qrels dir");
        fs::write(
            root.join("corpus.jsonl"),
            "{\"_id\":\"10\",\"text\":\"Alpha evidence text.\"}\n",
        )
        .expect("write corpus");
        fs::write(
            root.join("queries.jsonl"),
            "{\"_id\":\"q1\",\"text\":\"alpha evidence\"}\n",
        )
        .expect("write queries");
        fs::write(
            root.join("qrels").join("test.tsv"),
            concat!("query-id\tcorpus-id\tscore\n", "q1\t10\t2\n"),
        )
        .expect("write qrels");
    }
}