use camino::Utf8Path;
use thiserror::Error;
use crate::{Ref, RefParseError};
/// One successfully parsed input entry: the resolved reference together with
/// the key the entry carried in its source, when it had one.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct ParsedEntry {
/// The reference obtained by running `Ref::parse` on the entry's identifier.
pub ref_: Ref,
/// Key of the entry in its source. For CSL-JSON this is the `id` field
/// (a string, or a number rendered as text); it is `None` for plain ref
/// lines and for CSL-JSON entries without a string/number `id`.
pub entry_key: Option<String>,
}
/// Failures reported while turning input text into [`ParsedEntry`] values.
///
/// The parsers in this module return one `Result` per entry, so most variants
/// describe a single entry; `Decode` and `UnsupportedFormat` describe the
/// input as a whole and are returned as the only element of the result list.
#[derive(Debug, Clone, Error, PartialEq, Eq)]
#[non_exhaustive]
pub enum ParseError {
/// A CSL-JSON entry offered no usable DOI, arXiv `eprint`, or `arxiv:`
/// token in its `note` field.
#[error("entry has no DOI / arXiv id (entry_key={entry_key:?})")]
NoIdentifier {
/// Key of the offending entry, if it had one.
entry_key: Option<String>,
},
/// An identifier was found but `Ref::parse` rejected it.
#[error(
"entry identifier {raw:?} did not parse as a Ref \
(entry_key={entry_key:?}): {source}"
)]
InvalidRef {
/// The exact text handed to `Ref::parse` (including an `arxiv:` prefix
/// when the parser added one).
raw: String,
/// Key of the offending entry, if it had one.
entry_key: Option<String>,
/// The underlying parse failure.
#[source]
source: RefParseError,
},
/// The input as a whole could not be deserialised in the named format.
#[error("input did not deserialise as {format}: {message}")]
Decode {
/// Name of the format that was attempted (e.g. `"csl-json"`).
format: &'static str,
/// Text of the deserialiser's error.
message: String,
},
/// The requested or detected format is recognised but has no parser yet.
#[error(
"{format} parsing is not yet implemented — \
re-export as CSL-JSON from your reference manager, \
or wait for the BibLaTeX slice (ADR-0030 D2 follow-up)"
)]
UnsupportedFormat {
/// Name of the unsupported format (e.g. `"bibtex"`).
format: &'static str,
},
}
/// Input formats the parser front-end can be asked to handle.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum Format {
    /// Pick the format from the file extension or the content.
    Auto,
    /// Plain text, one reference per line.
    Refs,
    /// CSL-JSON.
    CslJson,
    /// BibTeX / BibLaTeX.
    Bibtex,
}

impl Format {
    /// Returns the stable, lowercase wire name of this format.
    pub fn as_wire(&self) -> &'static str {
        match *self {
            Self::Bibtex => "bibtex",
            Self::CslJson => "csl-json",
            Self::Refs => "refs",
            Self::Auto => "auto",
        }
    }
}
/// Guesses the input format from the file extension, falling back to the content.
///
/// The extension, compared ASCII case-insensitively, wins when it is one of
/// `bib` / `biblatex` (BibTeX) or `json` / `csl` (CSL-JSON). Otherwise the first
/// line that is neither blank nor a `#` comment decides: a leading `@` means
/// BibTeX, a leading `[` or `{` means CSL-JSON, anything else (or no such line)
/// means plain refs. This function never returns [`Format::Auto`].
pub fn detect_format(path: Option<&Utf8Path>, content: &str) -> Format {
    if let Some(ext) = path.and_then(|p| p.extension()) {
        let ext_is = |candidate: &str| ext.eq_ignore_ascii_case(candidate);
        if ext_is("bib") || ext_is("biblatex") {
            return Format::Bibtex;
        }
        if ext_is("json") || ext_is("csl") {
            return Format::CslJson;
        }
    }

    // Only the first significant line is inspected; later lines never matter.
    let first_significant = content
        .lines()
        .map(str::trim)
        .find(|candidate| !candidate.is_empty() && !candidate.starts_with('#'));

    match first_significant.and_then(|candidate| candidate.chars().next()) {
        Some('@') => Format::Bibtex,
        Some('[' | '{') => Format::CslJson,
        _ => Format::Refs,
    }
}
/// Parses `text` in the given `format` and returns one result per entry.
///
/// With [`Format::Auto`] the concrete format is chosen by [`detect_format`]
/// from `path` and `text`; any other format is used as given and `path` is
/// ignored. BibTeX input yields a single [`ParseError::UnsupportedFormat`].
pub fn parse_input(
    text: &str,
    format: Format,
    path: Option<&Utf8Path>,
) -> Vec<Result<ParsedEntry, ParseError>> {
    let effective = if format == Format::Auto {
        detect_format(path, text)
    } else {
        format
    };

    match effective {
        Format::CslJson => parse_csl_json(text),
        Format::Bibtex => vec![Err(ParseError::UnsupportedFormat { format: "bibtex" })],
        // `detect_format` never yields `Auto`; the arm is listed so the match
        // stays exhaustive, and it takes the plain-refs path.
        Format::Auto | Format::Refs => parse_plain_refs(text),
    }
}
/// Parses plain-text input holding one reference per line.
///
/// Lines are trimmed; blank lines and lines starting with `#` are skipped.
/// Every remaining line produces exactly one result, in input order: the parsed
/// entry, or [`ParseError::InvalidRef`] carrying the trimmed line. Entries from
/// this format never have an `entry_key`.
pub fn parse_plain_refs(text: &str) -> Vec<Result<ParsedEntry, ParseError>> {
    text.lines()
        .map(str::trim)
        .filter(|candidate| !candidate.is_empty() && !candidate.starts_with('#'))
        .map(|candidate| {
            Ref::parse(candidate)
                .map(|ref_| ParsedEntry {
                    ref_,
                    entry_key: None,
                })
                .map_err(|source| ParseError::InvalidRef {
                    raw: candidate.to_string(),
                    entry_key: None,
                    source,
                })
        })
        .collect()
}
/// Parses CSL-JSON input, which must be a top-level JSON array of entries.
///
/// If the text does not deserialise as an array, the result is a single
/// [`ParseError::Decode`]. Otherwise each array element yields one result, in
/// order. An element's `id` becomes its `entry_key` when it is a string or a
/// number (numbers are rendered as text); any other `id` gives `None`.
pub fn parse_csl_json(text: &str) -> Vec<Result<ParsedEntry, ParseError>> {
    let items: Vec<serde_json::Value> = match serde_json::from_str(text) {
        Ok(items) => items,
        Err(err) => {
            let failure = ParseError::Decode {
                format: "csl-json",
                message: err.to_string(),
            };
            return vec![Err(failure)];
        }
    };

    items
        .iter()
        .map(|item| {
            let key = match item.get("id") {
                Some(serde_json::Value::String(text_id)) => Some(text_id.clone()),
                Some(numeric_id @ serde_json::Value::Number(_)) => Some(numeric_id.to_string()),
                _ => None,
            };
            parse_csl_entry(item, key)
        })
        .collect()
}
/// Extracts a single reference from one CSL-JSON entry.
///
/// Identifier sources are tried in priority order, and the first one that
/// supplies a non-empty candidate decides the outcome (success or
/// [`ParseError::InvalidRef`]); later sources are not consulted after that:
///
/// 1. The `DOI` field (or `doi` when `DOI` is absent), trimmed.
/// 2. The `eprint` field, trimmed, but only when `archivePrefix` /
///    `archive_prefix` equals `arxiv` ASCII case-insensitively. An `arxiv:`
///    scheme is prepended unless the value already starts with one.
/// 3. The first `arxiv:` token (ASCII case-insensitive) in the `note` field;
///    the id is the run of ASCII letters, digits, `.`, `/` and `-` after it,
///    with trailing `.`, `/` and `-` removed so that sentence punctuation is
///    not mistaken for part of the id.
///
/// Only string-valued fields are considered.
///
/// # Errors
///
/// * [`ParseError::InvalidRef`] when the chosen candidate is rejected by
///   `Ref::parse`.
/// * [`ParseError::NoIdentifier`] when no source supplies a candidate.
fn parse_csl_entry(
    entry: &serde_json::Value,
    entry_key: Option<String>,
) -> Result<ParsedEntry, ParseError> {
    const ARXIV_SCHEME: &str = "arxiv:";

    // 1. DOI takes precedence over every arXiv source.
    let doi = entry
        .get("DOI")
        .or_else(|| entry.get("doi"))
        .and_then(|v| v.as_str())
        .map(str::trim)
        .filter(|s| !s.is_empty());
    if let Some(doi) = doi {
        return csl_identifier_to_entry(doi, entry_key);
    }

    // 2. `eprint` is only trusted as an arXiv id when the archive says so.
    let archive = entry
        .get("archivePrefix")
        .or_else(|| entry.get("archive_prefix"))
        .and_then(|v| v.as_str());
    if matches!(archive, Some(prefix) if prefix.eq_ignore_ascii_case("arxiv")) {
        let eprint = entry
            .get("eprint")
            .and_then(|v| v.as_str())
            .map(str::trim)
            .filter(|s| !s.is_empty());
        if let Some(eprint) = eprint {
            let has_scheme = matches!(
                eprint.get(..ARXIV_SCHEME.len()),
                Some(head) if head.eq_ignore_ascii_case(ARXIV_SCHEME)
            );
            let with_scheme = if has_scheme {
                eprint.to_string()
            } else {
                format!("arxiv:{eprint}")
            };
            return csl_identifier_to_entry(&with_scheme, entry_key);
        }
    }

    // 3. Free-text note, e.g. "Comment: 12 pages. arXiv:2401.12345".
    if let Some(note) = entry.get("note").and_then(|v| v.as_str()) {
        // ASCII lowercasing never changes byte lengths, so an offset found in
        // the lowercased copy is a valid offset into `note` itself.
        if let Some(idx) = note.to_ascii_lowercase().find(ARXIV_SCHEME) {
            let tail = &note[idx + ARXIV_SCHEME.len()..];
            let end = tail
                .find(|c: char| !(c.is_ascii_alphanumeric() || matches!(c, '.' | '/' | '-')))
                .unwrap_or(tail.len());
            // Drop punctuation that merely ends the sentence ("arXiv:2401.12345.").
            let id = tail[..end].trim_end_matches(|c: char| matches!(c, '.' | '/' | '-'));
            if !id.is_empty() {
                return csl_identifier_to_entry(&format!("arxiv:{id}"), entry_key);
            }
        }
    }

    Err(ParseError::NoIdentifier { entry_key })
}

/// Parses `raw` with `Ref::parse` and pairs the outcome with `entry_key`.
///
/// A rejection becomes [`ParseError::InvalidRef`] carrying the exact text that
/// was offered to the parser.
fn csl_identifier_to_entry(
    raw: &str,
    entry_key: Option<String>,
) -> Result<ParsedEntry, ParseError> {
    match Ref::parse(raw) {
        Ok(ref_) => Ok(ParsedEntry { ref_, entry_key }),
        Err(source) => Err(ParseError::InvalidRef {
            raw: raw.to_string(),
            entry_key,
            source,
        }),
    }
}
// Unit tests for format detection, the plain-refs and CSL-JSON parsers, and
// the `parse_input` dispatcher.
#[cfg(test)]
// Tests use `unwrap` / `expect` / `panic!` on purpose so unexpected results fail loudly.
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
use super::*;
// --- detect_format: extension-based detection (content is empty) ---
#[test]
fn detect_by_bib_extension() {
let p = Utf8Path::new("/tmp/library.bib");
assert_eq!(detect_format(Some(p), ""), Format::Bibtex);
}
#[test]
fn detect_by_json_extension() {
let p = Utf8Path::new("/tmp/library.json");
assert_eq!(detect_format(Some(p), ""), Format::CslJson);
}
#[test]
fn detect_by_csl_extension() {
let p = Utf8Path::new("/tmp/library.csl");
assert_eq!(detect_format(Some(p), ""), Format::CslJson);
}
// --- detect_format: content-based detection (no path) ---
// Leading comment and blank lines are skipped before the `@` is seen.
#[test]
fn detect_by_fingerprint_bibtex_at_sign() {
let body = "# comment\n\n@article{foo,\n doi = {10.1/x}\n}\n";
assert_eq!(detect_format(None, body), Format::Bibtex);
}
#[test]
fn detect_by_fingerprint_csl_json_array() {
let body = "[{\"id\":\"foo\",\"DOI\":\"10.1/x\"}]";
assert_eq!(detect_format(None, body), Format::CslJson);
}
// Anything that is not `@`, `[` or `{` is treated as plain refs.
#[test]
fn detect_by_fingerprint_falls_through_to_refs() {
let body = "doi:10.1234/foo\narxiv:2401.12345\n";
assert_eq!(detect_format(None, body), Format::Refs);
}
// --- parse_plain_refs ---
// Comment and blank lines produce no result; order of entries is preserved.
#[test]
fn plain_refs_parses_mix_with_comments_and_blanks() {
let body = "\
# header comment
doi:10.1234/foo
arxiv:2401.12345
# trailing comment
";
let parsed = parse_plain_refs(body);
assert_eq!(parsed.len(), 2);
let okays: Vec<_> = parsed.into_iter().filter_map(Result::ok).collect();
assert!(matches!(okays[0].ref_, Ref::Doi(_)));
assert!(matches!(okays[1].ref_, Ref::Arxiv(_)));
assert!(okays.iter().all(|e| e.entry_key.is_none()));
}
// A bad line yields an error in its own slot without affecting its neighbours.
#[test]
fn plain_refs_surface_per_line_invalid_refs() {
let body = "doi:10.1234/foo\nnot-a-ref\narxiv:2401.12345\n";
let parsed = parse_plain_refs(body);
assert_eq!(parsed.len(), 3);
assert!(parsed[0].is_ok());
assert!(matches!(parsed[1], Err(ParseError::InvalidRef { .. })));
assert!(parsed[2].is_ok());
}
// --- parse_csl_json: identifier selection ---
// The entry's `id` is carried through as `entry_key`.
#[test]
fn csl_json_picks_doi_when_present() {
let body = r#"[{"id":"foo2024","DOI":"10.1234/foo"}]"#;
let parsed = parse_csl_json(body);
assert_eq!(parsed.len(), 1);
let entry = parsed.into_iter().next().unwrap().expect("entry parses");
assert!(matches!(entry.ref_, Ref::Doi(_)));
assert_eq!(entry.entry_key.as_deref(), Some("foo2024"));
}
#[test]
fn csl_json_accepts_lowercase_doi_field() {
let body = r#"[{"id":"x","doi":"10.5555/bar"}]"#;
let parsed = parse_csl_json(body);
let entry = parsed.into_iter().next().unwrap().expect("entry parses");
assert!(matches!(entry.ref_, Ref::Doi(_)));
}
#[test]
fn csl_json_picks_arxiv_via_archive_prefix_and_eprint() {
let body = r#"[{"id":"arx","archivePrefix":"arXiv","eprint":"2401.12345"}]"#;
let parsed = parse_csl_json(body);
let entry = parsed.into_iter().next().unwrap().expect("entry parses");
assert!(matches!(entry.ref_, Ref::Arxiv(_)));
}
#[test]
fn csl_json_arxiv_archive_prefix_is_case_insensitive() {
let body = r#"[{"id":"arx","archivePrefix":"ARXIV","eprint":"2401.12345"}]"#;
let parsed = parse_csl_json(body);
let entry = parsed.into_iter().next().unwrap().expect("entry parses");
assert!(matches!(entry.ref_, Ref::Arxiv(_)));
}
// DOI has priority over the arXiv eprint when an entry carries both.
#[test]
fn csl_json_doi_beats_arxiv_when_both_present() {
let body = r#"[{
"id":"both",
"DOI":"10.1234/foo",
"archivePrefix":"arXiv",
"eprint":"2401.12345"
}]"#;
let parsed = parse_csl_json(body);
let entry = parsed.into_iter().next().unwrap().expect("entry parses");
assert!(matches!(entry.ref_, Ref::Doi(_)));
}
// An `arXiv:` token embedded in free-text `note` is picked up as a last resort.
#[test]
fn csl_json_arxiv_from_note_field() {
let body = r#"[{"id":"znote","note":"Comment: 12 pages. arXiv:2401.12345"}]"#;
let parsed = parse_csl_json(body);
let entry = parsed.into_iter().next().unwrap().expect("entry parses");
assert!(matches!(entry.ref_, Ref::Arxiv(_)));
}
// --- parse_csl_json: error reporting ---
#[test]
fn csl_json_entry_without_any_identifier_yields_no_identifier_error() {
let body = r#"[{"id":"empty","title":"no ids here"}]"#;
let parsed = parse_csl_json(body);
assert!(matches!(
parsed.into_iter().next().unwrap(),
Err(ParseError::NoIdentifier { .. })
));
}
// The error carries both the rejected text and the entry's key.
#[test]
fn csl_json_invalid_doi_surface_as_invalid_ref_per_entry() {
let body = r#"[{"id":"bad","DOI":"not-a-doi"}]"#;
let parsed = parse_csl_json(body);
match &parsed[0] {
Err(ParseError::InvalidRef { raw, entry_key, .. }) => {
assert_eq!(raw, "not-a-doi");
assert_eq!(entry_key.as_deref(), Some("bad"));
}
other => panic!("expected InvalidRef, got {other:?}"),
}
}
// Input that is not JSON at all collapses to exactly one `Decode` error.
#[test]
fn csl_json_top_level_malformed_yields_single_decode_error() {
let body = "{this is not JSON}";
let parsed = parse_csl_json(body);
assert_eq!(parsed.len(), 1);
assert!(matches!(
parsed[0],
Err(ParseError::Decode {
format: "csl-json",
..
})
));
}
// Valid JSON whose top level is an object (not an array) is also a `Decode` error.
#[test]
fn csl_json_non_array_top_level_yields_decode_error() {
let body = r#"{"id":"x","DOI":"10.1/x"}"#;
let parsed = parse_csl_json(body);
assert!(matches!(
parsed[0],
Err(ParseError::Decode {
format: "csl-json",
..
})
));
}
// --- parse_input: dispatch ---
#[test]
fn parse_input_auto_dispatches_csl_json_by_content() {
let body = r#"[{"id":"foo","DOI":"10.1234/foo"}]"#;
let parsed = parse_input(body, Format::Auto, None);
assert_eq!(parsed.len(), 1);
assert!(matches!(
parsed[0],
Ok(ParsedEntry {
ref_: Ref::Doi(_),
..
})
));
}
#[test]
fn parse_input_auto_dispatches_refs_by_content() {
let body = "doi:10.1234/foo\n";
let parsed = parse_input(body, Format::Auto, None);
assert_eq!(parsed.len(), 1);
assert!(matches!(
parsed[0],
Ok(ParsedEntry {
ref_: Ref::Doi(_),
..
})
));
}
// Explicit BibTeX is recognised but reported as unsupported.
#[test]
fn parse_input_bibtex_returns_unsupported_format_error() {
let body = "@article{foo, doi={10.1234/x}}";
let parsed = parse_input(body, Format::Bibtex, None);
assert_eq!(parsed.len(), 1);
assert!(matches!(
parsed[0],
Err(ParseError::UnsupportedFormat { format: "bibtex" })
));
}
// With a `.csl` path, an empty JSON array parses to zero entries.
#[test]
fn parse_input_auto_with_path_uses_extension() {
let body = "[]";
let parsed = parse_input(body, Format::Auto, Some(Utf8Path::new("foo.csl")));
assert_eq!(
parsed.len(),
0,
"empty array yields zero entries: {parsed:?}"
);
}
// --- Format::as_wire ---
// Pins the wire names so an accidental rename is caught.
#[test]
fn format_wire_strings_are_stable() {
assert_eq!(Format::Auto.as_wire(), "auto");
assert_eq!(Format::Refs.as_wire(), "refs");
assert_eq!(Format::CslJson.as_wire(), "csl-json");
assert_eq!(Format::Bibtex.as_wire(), "bibtex");
}
}