1use camino::Utf8Path;
29use thiserror::Error;
30
31use crate::{Ref, RefParseError};
32
/// One input entry whose identifier parsed successfully.
///
/// Pairs the parsed [`Ref`] with the key of the entry it was read from, when
/// the input format provides one.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct ParsedEntry {
    /// The identifier extracted from the entry.
    pub ref_: Ref,
    /// Key of the originating entry. For CSL-JSON this is the entry's `id`
    /// (a string, or a number rendered as text); plain ref lines have no key
    /// and always carry `None`.
    pub entry_key: Option<String>,
}
51
/// Reasons an input, or one entry of it, could not be turned into a
/// [`ParsedEntry`].
#[derive(Debug, Clone, Error, PartialEq, Eq)]
#[non_exhaustive]
pub enum ParseError {
    /// The entry carries none of the identifier sources the CSL-JSON parser
    /// inspects (DOI field, arXiv `eprint`, arXiv mention in `note`).
    #[error("entry has no DOI / arXiv id (entry_key={entry_key:?})")]
    NoIdentifier {
        /// Key of the offending entry, when it has one.
        entry_key: Option<String>,
    },
    /// An identifier candidate was found but [`Ref::parse`] rejected it.
    #[error(
        "entry identifier {raw:?} did not parse as a Ref \
         (entry_key={entry_key:?}): {source}"
    )]
    InvalidRef {
        /// The exact text that was handed to [`Ref::parse`].
        raw: String,
        /// Key of the offending entry, when it has one.
        entry_key: Option<String>,
        /// The underlying parse failure.
        #[source]
        source: RefParseError,
    },
    /// The input as a whole could not be deserialised in the given format.
    #[error("input did not deserialise as {format}: {message}")]
    Decode {
        /// Wire name of the format that was attempted (e.g. `"csl-json"`).
        format: &'static str,
        /// The deserialiser's error, rendered as text.
        message: String,
    },
    /// The format is recognised but no parser exists for it yet.
    #[error(
        "{format} parsing is not yet implemented — \
         re-export as CSL-JSON from your reference manager, \
         or wait for the BibLaTeX slice (ADR-0030 D2 follow-up)"
    )]
    UnsupportedFormat {
        /// Wire name of the unsupported format (e.g. `"bibtex"`).
        format: &'static str,
    },
}
109
/// Input formats the parser distinguishes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Format {
    /// Let [`detect_format`] pick the format from the file extension and,
    /// failing that, from the content.
    Auto,
    /// Plain text with one reference per line; blank lines and lines starting
    /// with `#` are skipped.
    Refs,
    /// A CSL-JSON array of entry objects.
    CslJson,
    /// BibTeX / BibLaTeX. Detected, but [`parse_input`] currently reports it
    /// as [`ParseError::UnsupportedFormat`].
    Bibtex,
}
129
130impl Format {
131 pub fn as_wire(&self) -> &'static str {
134 match self {
135 Format::Auto => "auto",
136 Format::Refs => "refs",
137 Format::CslJson => "csl-json",
138 Format::Bibtex => "bibtex",
139 }
140 }
141}
142
143pub fn detect_format(path: Option<&Utf8Path>, content: &str) -> Format {
151 if let Some(p) = path {
152 let ext = p.extension().unwrap_or_default().to_ascii_lowercase();
153 match ext.as_str() {
154 "bib" | "biblatex" => return Format::Bibtex,
155 "json" | "csl" => return Format::CslJson,
156 _ => {}
157 }
158 }
159 for line in content.lines() {
161 let trimmed = line.trim();
162 if trimmed.is_empty() || trimmed.starts_with('#') {
163 continue;
164 }
165 if trimmed.starts_with('@') {
166 return Format::Bibtex;
167 }
168 if trimmed.starts_with('[') || trimmed.starts_with('{') {
169 return Format::CslJson;
170 }
171 break;
172 }
173 Format::Refs
174}
175
176pub fn parse_input(
187 text: &str,
188 format: Format,
189 path: Option<&Utf8Path>,
190) -> Vec<Result<ParsedEntry, ParseError>> {
191 let resolved = match format {
192 Format::Auto => detect_format(path, text),
193 other => other,
194 };
195 match resolved {
196 Format::Refs | Format::Auto => parse_plain_refs(text),
197 Format::CslJson => parse_csl_json(text),
198 Format::Bibtex => vec![Err(ParseError::UnsupportedFormat { format: "bibtex" })],
199 }
200}
201
202pub fn parse_plain_refs(text: &str) -> Vec<Result<ParsedEntry, ParseError>> {
206 let mut out = Vec::new();
207 for raw_line in text.lines() {
208 let line = raw_line.trim();
209 if line.is_empty() || line.starts_with('#') {
210 continue;
211 }
212 out.push(match Ref::parse(line) {
213 Ok(ref_) => Ok(ParsedEntry {
214 ref_,
215 entry_key: None,
216 }),
217 Err(e) => Err(ParseError::InvalidRef {
218 raw: line.to_string(),
219 entry_key: None,
220 source: e,
221 }),
222 });
223 }
224 out
225}
226
227pub fn parse_csl_json(text: &str) -> Vec<Result<ParsedEntry, ParseError>> {
243 let parsed: serde_json::Result<Vec<serde_json::Value>> = serde_json::from_str(text);
244 let entries = match parsed {
245 Ok(arr) => arr,
246 Err(e) => {
247 return vec![Err(ParseError::Decode {
248 format: "csl-json",
249 message: e.to_string(),
250 })]
251 }
252 };
253 let mut out = Vec::with_capacity(entries.len());
254 for entry in entries {
255 let entry_key = entry.get("id").and_then(|v| {
259 if let Some(s) = v.as_str() {
260 Some(s.to_string())
261 } else if v.is_number() {
262 Some(v.to_string())
263 } else {
264 None
265 }
266 });
267 out.push(parse_csl_entry(&entry, entry_key));
268 }
269 out
270}
271
272fn parse_csl_entry(
275 entry: &serde_json::Value,
276 entry_key: Option<String>,
277) -> Result<ParsedEntry, ParseError> {
278 if let Some(doi) = entry
282 .get("DOI")
283 .or_else(|| entry.get("doi"))
284 .and_then(|v| v.as_str())
285 {
286 let raw = doi.trim();
287 if !raw.is_empty() {
288 return match Ref::parse(raw) {
289 Ok(ref_) => Ok(ParsedEntry { ref_, entry_key }),
290 Err(e) => Err(ParseError::InvalidRef {
291 raw: raw.to_string(),
292 entry_key,
293 source: e,
294 }),
295 };
296 }
297 }
298 let is_arxiv = entry
301 .get("archivePrefix")
302 .or_else(|| entry.get("archive_prefix"))
303 .and_then(|v| v.as_str())
304 .map(|s| s.eq_ignore_ascii_case("arxiv"))
305 .unwrap_or(false);
306 if is_arxiv {
307 if let Some(eprint) = entry.get("eprint").and_then(|v| v.as_str()) {
308 let raw = eprint.trim();
309 if !raw.is_empty() {
310 let with_scheme = if raw.to_ascii_lowercase().starts_with("arxiv:") {
311 raw.to_string()
312 } else {
313 format!("arxiv:{raw}")
314 };
315 return match Ref::parse(&with_scheme) {
316 Ok(ref_) => Ok(ParsedEntry { ref_, entry_key }),
317 Err(e) => Err(ParseError::InvalidRef {
318 raw: with_scheme,
319 entry_key,
320 source: e,
321 }),
322 };
323 }
324 }
325 }
326 if let Some(note) = entry.get("note").and_then(|v| v.as_str()) {
332 if let Some(idx) = note.to_ascii_lowercase().find("arxiv:") {
333 let tail = ¬e[idx + "arxiv:".len()..];
334 let id: String = tail
338 .chars()
339 .take_while(|c| matches!(c, '0'..='9' | '.' | '/' | 'a'..='z' | 'A'..='Z' | '-'))
340 .collect();
341 if !id.is_empty() {
342 let with_scheme = format!("arxiv:{id}");
343 return match Ref::parse(&with_scheme) {
344 Ok(ref_) => Ok(ParsedEntry { ref_, entry_key }),
345 Err(e) => Err(ParseError::InvalidRef {
346 raw: with_scheme,
347 entry_key,
348 source: e,
349 }),
350 };
351 }
352 }
353 }
354 Err(ParseError::NoIdentifier { entry_key })
355}
356
// Unit tests for format detection and the plain-refs / CSL-JSON parsers.
#[cfg(test)]
// Test code unwraps and panics deliberately: a failed expectation should
// abort the test with a message.
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    // --- detect_format: the extension decides, content is not consulted ---

    #[test]
    fn detect_by_bib_extension() {
        let p = Utf8Path::new("/tmp/library.bib");
        assert_eq!(detect_format(Some(p), ""), Format::Bibtex);
    }

    #[test]
    fn detect_by_json_extension() {
        let p = Utf8Path::new("/tmp/library.json");
        assert_eq!(detect_format(Some(p), ""), Format::CslJson);
    }

    #[test]
    fn detect_by_csl_extension() {
        let p = Utf8Path::new("/tmp/library.csl");
        assert_eq!(detect_format(Some(p), ""), Format::CslJson);
    }

    // --- detect_format: no path, so the first meaningful line decides ---

    #[test]
    fn detect_by_fingerprint_bibtex_at_sign() {
        // The leading comment and blank line must be skipped before the `@`.
        let body = "# comment\n\n@article{foo,\n doi = {10.1/x}\n}\n";
        assert_eq!(detect_format(None, body), Format::Bibtex);
    }

    #[test]
    fn detect_by_fingerprint_csl_json_array() {
        let body = "[{\"id\":\"foo\",\"DOI\":\"10.1/x\"}]";
        assert_eq!(detect_format(None, body), Format::CslJson);
    }

    #[test]
    fn detect_by_fingerprint_falls_through_to_refs() {
        let body = "doi:10.1234/foo\narxiv:2401.12345\n";
        assert_eq!(detect_format(None, body), Format::Refs);
    }

    // --- parse_plain_refs ---

    #[test]
    fn plain_refs_parses_mix_with_comments_and_blanks() {
        // Comments and the blank line produce no element; the indented line
        // is trimmed before parsing.
        let body = "\
# header comment
doi:10.1234/foo

 arxiv:2401.12345
# trailing comment
";
        let parsed = parse_plain_refs(body);
        assert_eq!(parsed.len(), 2);
        let okays: Vec<_> = parsed.into_iter().filter_map(Result::ok).collect();
        assert!(matches!(okays[0].ref_, Ref::Doi(_)));
        assert!(matches!(okays[1].ref_, Ref::Arxiv(_)));
        assert!(okays.iter().all(|e| e.entry_key.is_none()));
    }

    #[test]
    fn plain_refs_surface_per_line_invalid_refs() {
        // A bad line becomes its own error element and does not stop parsing.
        let body = "doi:10.1234/foo\nnot-a-ref\narxiv:2401.12345\n";
        let parsed = parse_plain_refs(body);
        assert_eq!(parsed.len(), 3);
        assert!(parsed[0].is_ok());
        assert!(matches!(parsed[1], Err(ParseError::InvalidRef { .. })));
        assert!(parsed[2].is_ok());
    }

    // --- parse_csl_json: identifier selection ---

    #[test]
    fn csl_json_picks_doi_when_present() {
        let body = r#"[{"id":"foo2024","DOI":"10.1234/foo"}]"#;
        let parsed = parse_csl_json(body);
        assert_eq!(parsed.len(), 1);
        let entry = parsed.into_iter().next().unwrap().expect("entry parses");
        assert!(matches!(entry.ref_, Ref::Doi(_)));
        // The CSL `id` is carried through as the entry key.
        assert_eq!(entry.entry_key.as_deref(), Some("foo2024"));
    }

    #[test]
    fn csl_json_accepts_lowercase_doi_field() {
        let body = r#"[{"id":"x","doi":"10.5555/bar"}]"#;
        let parsed = parse_csl_json(body);
        let entry = parsed.into_iter().next().unwrap().expect("entry parses");
        assert!(matches!(entry.ref_, Ref::Doi(_)));
    }

    #[test]
    fn csl_json_picks_arxiv_via_archive_prefix_and_eprint() {
        let body = r#"[{"id":"arx","archivePrefix":"arXiv","eprint":"2401.12345"}]"#;
        let parsed = parse_csl_json(body);
        let entry = parsed.into_iter().next().unwrap().expect("entry parses");
        assert!(matches!(entry.ref_, Ref::Arxiv(_)));
    }

    #[test]
    fn csl_json_arxiv_archive_prefix_is_case_insensitive() {
        let body = r#"[{"id":"arx","archivePrefix":"ARXIV","eprint":"2401.12345"}]"#;
        let parsed = parse_csl_json(body);
        let entry = parsed.into_iter().next().unwrap().expect("entry parses");
        assert!(matches!(entry.ref_, Ref::Arxiv(_)));
    }

    #[test]
    fn csl_json_doi_beats_arxiv_when_both_present() {
        let body = r#"[{
            "id":"both",
            "DOI":"10.1234/foo",
            "archivePrefix":"arXiv",
            "eprint":"2401.12345"
        }]"#;
        let parsed = parse_csl_json(body);
        let entry = parsed.into_iter().next().unwrap().expect("entry parses");
        assert!(matches!(entry.ref_, Ref::Doi(_)));
    }

    #[test]
    fn csl_json_arxiv_from_note_field() {
        // No DOI and no eprint: the id is picked out of the free-text note.
        let body = r#"[{"id":"znote","note":"Comment: 12 pages. arXiv:2401.12345"}]"#;
        let parsed = parse_csl_json(body);
        let entry = parsed.into_iter().next().unwrap().expect("entry parses");
        assert!(matches!(entry.ref_, Ref::Arxiv(_)));
    }

    // --- parse_csl_json: error reporting ---

    #[test]
    fn csl_json_entry_without_any_identifier_yields_no_identifier_error() {
        let body = r#"[{"id":"empty","title":"no ids here"}]"#;
        let parsed = parse_csl_json(body);
        assert!(matches!(
            parsed.into_iter().next().unwrap(),
            Err(ParseError::NoIdentifier { .. })
        ));
    }

    #[test]
    fn csl_json_invalid_doi_surface_as_invalid_ref_per_entry() {
        let body = r#"[{"id":"bad","DOI":"not-a-doi"}]"#;
        let parsed = parse_csl_json(body);
        match &parsed[0] {
            // The error keeps both the rejected text and the entry key.
            Err(ParseError::InvalidRef { raw, entry_key, .. }) => {
                assert_eq!(raw, "not-a-doi");
                assert_eq!(entry_key.as_deref(), Some("bad"));
            }
            other => panic!("expected InvalidRef, got {other:?}"),
        }
    }

    #[test]
    fn csl_json_top_level_malformed_yields_single_decode_error() {
        let body = "{this is not JSON}";
        let parsed = parse_csl_json(body);
        assert_eq!(parsed.len(), 1);
        assert!(matches!(
            parsed[0],
            Err(ParseError::Decode {
                format: "csl-json",
                ..
            })
        ));
    }

    #[test]
    fn csl_json_non_array_top_level_yields_decode_error() {
        // Valid JSON, but a bare object rather than an array of entries.
        let body = r#"{"id":"x","DOI":"10.1/x"}"#;
        let parsed = parse_csl_json(body);
        assert!(matches!(
            parsed[0],
            Err(ParseError::Decode {
                format: "csl-json",
                ..
            })
        ));
    }

    // --- parse_input dispatch ---

    #[test]
    fn parse_input_auto_dispatches_csl_json_by_content() {
        let body = r#"[{"id":"foo","DOI":"10.1234/foo"}]"#;
        let parsed = parse_input(body, Format::Auto, None);
        assert_eq!(parsed.len(), 1);
        assert!(matches!(
            parsed[0],
            Ok(ParsedEntry {
                ref_: Ref::Doi(_),
                ..
            })
        ));
    }

    #[test]
    fn parse_input_auto_dispatches_refs_by_content() {
        let body = "doi:10.1234/foo\n";
        let parsed = parse_input(body, Format::Auto, None);
        assert_eq!(parsed.len(), 1);
        assert!(matches!(
            parsed[0],
            Ok(ParsedEntry {
                ref_: Ref::Doi(_),
                ..
            })
        ));
    }

    #[test]
    fn parse_input_bibtex_returns_unsupported_format_error() {
        let body = "@article{foo, doi={10.1234/x}}";
        let parsed = parse_input(body, Format::Bibtex, None);
        assert_eq!(parsed.len(), 1);
        assert!(matches!(
            parsed[0],
            Err(ParseError::UnsupportedFormat { format: "bibtex" })
        ));
    }

    #[test]
    fn parse_input_auto_with_path_uses_extension() {
        let body = "[]";
        let parsed = parse_input(body, Format::Auto, Some(Utf8Path::new("foo.csl")));
        assert_eq!(
            parsed.len(),
            0,
            "empty array yields zero entries: {parsed:?}"
        );
    }

    // --- Format::as_wire ---

    #[test]
    fn format_wire_strings_are_stable() {
        // Pins the exact wire names so an accidental rename fails here.
        assert_eq!(Format::Auto.as_wire(), "auto");
        assert_eq!(Format::Refs.as_wire(), "refs");
        assert_eq!(Format::CslJson.as_wire(), "csl-json");
        assert_eq!(Format::Bibtex.as_wire(), "bibtex");
    }
}