Skip to main content

doiget_core/
refs.rs

1//! Bibliography input adapters per ADR-0030.
2//!
3//! Parses three input shapes into an iterator of `Ref`s with optional
4//! `entry_key` provenance back to the source bibliography:
5//!
6//! - **Plain refs**: one `doi:…` / `arxiv:…` / bare-DOI / bare-arXiv id
7//!   per line, with `#`-prefixed comments and blank lines tolerated.
8//!   The existing `doiget batch <refs.txt>` shape.
9//! - **CSL-JSON**: a JSON array of entries with `id` (citation key),
10//!   `DOI`, and optionally `archivePrefix = "arXiv"` + `eprint`
11//!   fields. Parsed via the workspace's existing `serde_json` — no
12//!   new dependency.
13//! - **BibTeX / BibLaTeX (.bib)**: deferred to a follow-up slice (the
14//!   `biblatex` crate adds cargo-vet exemption churn that is
15//!   independent of the slice 1 wire shape; users with a `.bib`
16//!   library can re-export it as CSL-JSON from Zotero today).
17//!
//! Identifier-pick priority per ADR-0030 D3: `doi` > `arxiv` > `pmid`.
//! (The PMID adapter is parked until the `Ref::Pmid` variant lands in a
//! later slice; the current code carries the rule through without
//! producing a `Pmid` ref.)
22//!
23//! Parse-error policy per ADR-0030 D5: a single entry's failure is
24//! captured per-entry and does NOT abort the whole batch. The caller
25//! decides whether to skip-and-warn (default) or fail-closed
26//! (`--strict`).
27
28use camino::Utf8Path;
29use thiserror::Error;
30
31use crate::{Ref, RefParseError};
32
33/// One successfully-parsed bibliography entry.
34///
35/// `entry_key` echoes the source bibliography's citation key
36/// (BibTeX `@article{KEY,…}` / CSL-JSON `"id"`) so downstream
37/// automation can bridge the fetch outcome back to the originating
38/// reference — the load-bearing field for the Zotero / Mendeley
39/// "attach fetched PDF to this reference" workflow per ADR-0030 §6.
40#[derive(Debug, Clone, PartialEq, Eq)]
41#[non_exhaustive]
42pub struct ParsedEntry {
43    /// The identifier the adapter chose for this entry (`Ref::Doi` /
44    /// `Ref::Arxiv`).
45    pub ref_: Ref,
46    /// The source bibliography's citation key, when one is available.
47    /// `None` for plain-refs input (no key concept) and for any
48    /// future input shape that lacks per-entry keys.
49    pub entry_key: Option<String>,
50}
51
52/// Why a single bibliography entry failed to produce a `Ref`.
53///
54/// Closed-enum so the failure-class can be exposed at the
55/// `docs/ERRORS.md` §3 INVALID_REF surface without leaking parser
56/// internals.
57#[derive(Debug, Clone, Error, PartialEq, Eq)]
58#[non_exhaustive]
59pub enum ParseError {
60    /// The line did not contain a `doi:` / `arxiv:` / bare-DOI /
61    /// bare-arXiv id — empty (after trimming) or just a comment.
62    /// Plain-refs path filters these out silently; CSL-JSON path
63    /// emits this when an entry has no resolvable identifier.
64    #[error("entry has no DOI / arXiv id (entry_key={entry_key:?})")]
65    NoIdentifier {
66        /// The source bibliography's citation key, when known.
67        entry_key: Option<String>,
68    },
69    /// The identifier was present but `Ref::parse` rejected it
70    /// (malformed DOI suffix, invalid arXiv id shape, etc.).
71    #[error(
72        "entry identifier {raw:?} did not parse as a Ref \
73         (entry_key={entry_key:?}): {source}"
74    )]
75    InvalidRef {
76        /// The raw identifier string the parser saw.
77        raw: String,
78        /// The source bibliography's citation key, when known.
79        entry_key: Option<String>,
80        /// The structured `Ref::parse` failure.
81        #[source]
82        source: RefParseError,
83    },
84    /// The whole input did not deserialise — CSL-JSON that is not a
85    /// JSON array, top-level malformed JSON, etc. This is a
86    /// whole-input failure, not a per-entry failure; callers receive
87    /// it as the sole `Err` element of the result iterator.
88    #[error("input did not deserialise as {format}: {message}")]
89    Decode {
90        /// Which parser branch produced the failure (`"csl-json"`).
91        format: &'static str,
92        /// `serde_json::Error::to_string()`.
93        message: String,
94    },
95    /// Format requested or detected, but the parser for that format is
96    /// not yet shipped. Today this is the `.bib` / BibLaTeX path —
97    /// users should re-export their library as CSL-JSON from Zotero
98    /// until slice 2 ships.
99    #[error(
100        "{format} parsing is not yet implemented — \
101         re-export as CSL-JSON from your reference manager, \
102         or wait for the BibLaTeX slice (ADR-0030 D2 follow-up)"
103    )]
104    UnsupportedFormat {
105        /// The format token (`"bibtex"`).
106        format: &'static str,
107    },
108}
109
/// Selects which input-shape parser to run, per ADR-0030 D4.
///
/// [`Format::Auto`] asks for detection from the path extension and/or a
/// content fingerprint; every other variant names a parser explicitly
/// and bypasses detection.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum Format {
    /// Detect the shape: file extension first when a path is supplied,
    /// then content fingerprint, finally defaulting to
    /// [`Format::Refs`].
    Auto,
    /// Plain refs: one identifier per line, with `#` comments and blank
    /// lines allowed.
    Refs,
    /// A CSL-JSON array, see <https://citationstyles.org/>.
    CslJson,
    /// BibTeX / BibLaTeX. Not supported yet — the parser ships in a
    /// follow-up slice.
    Bibtex,
}
129
130impl Format {
131    /// Wire token used by the CLI `--format` flag and the MCP tool
132    /// input schema's `format` field per ADR-0030 §6.
133    pub fn as_wire(&self) -> &'static str {
134        match self {
135            Format::Auto => "auto",
136            Format::Refs => "refs",
137            Format::CslJson => "csl-json",
138            Format::Bibtex => "bibtex",
139        }
140    }
141}
142
143/// Detect the input format per ADR-0030 D4.
144///
145/// Precedence: file extension first (when `path` is `Some`), then
146/// content fingerprint, then fallback to [`Format::Refs`]. The
147/// caller's explicit `--format` flag should short-circuit this
148/// function — it is the slowest of the three precedence rules in the
149/// ADR.
150pub fn detect_format(path: Option<&Utf8Path>, content: &str) -> Format {
151    if let Some(p) = path {
152        let ext = p.extension().unwrap_or_default().to_ascii_lowercase();
153        match ext.as_str() {
154            "bib" | "biblatex" => return Format::Bibtex,
155            "json" | "csl" => return Format::CslJson,
156            _ => {}
157        }
158    }
159    // Content fingerprint: peek the first non-blank, non-comment line.
160    for line in content.lines() {
161        let trimmed = line.trim();
162        if trimmed.is_empty() || trimmed.starts_with('#') {
163            continue;
164        }
165        if trimmed.starts_with('@') {
166            return Format::Bibtex;
167        }
168        if trimmed.starts_with('[') || trimmed.starts_with('{') {
169            return Format::CslJson;
170        }
171        break;
172    }
173    Format::Refs
174}
175
176/// Parse `text` per `format`, dispatching to the matching shape
177/// parser. `path` is consulted only when `format == Format::Auto` to
178/// drive [`detect_format`].
179///
180/// Returns one element per discovered entry — `Ok` for entries that
181/// produced a `Ref`, `Err` for per-entry failures the caller should
182/// surface as a JSONL `INVALID_REF` line. A whole-input decode
183/// failure ([`ParseError::Decode`]) is returned as a single-element
184/// `Err` so the caller's exit-code path treats it as one parse error
185/// rather than zero.
186pub fn parse_input(
187    text: &str,
188    format: Format,
189    path: Option<&Utf8Path>,
190) -> Vec<Result<ParsedEntry, ParseError>> {
191    let resolved = match format {
192        Format::Auto => detect_format(path, text),
193        other => other,
194    };
195    match resolved {
196        Format::Refs | Format::Auto => parse_plain_refs(text),
197        Format::CslJson => parse_csl_json(text),
198        Format::Bibtex => vec![Err(ParseError::UnsupportedFormat { format: "bibtex" })],
199    }
200}
201
202/// Parse plain refs — the existing batch input format. One ref per
203/// non-blank, non-comment line. `entry_key` is always `None` for this
204/// shape; plain refs have no citation-key concept.
205pub fn parse_plain_refs(text: &str) -> Vec<Result<ParsedEntry, ParseError>> {
206    let mut out = Vec::new();
207    for raw_line in text.lines() {
208        let line = raw_line.trim();
209        if line.is_empty() || line.starts_with('#') {
210            continue;
211        }
212        out.push(match Ref::parse(line) {
213            Ok(ref_) => Ok(ParsedEntry {
214                ref_,
215                entry_key: None,
216            }),
217            Err(e) => Err(ParseError::InvalidRef {
218                raw: line.to_string(),
219                entry_key: None,
220                source: e,
221            }),
222        });
223    }
224    out
225}
226
227/// Parse a CSL-JSON document — a JSON array of objects, each with at
228/// least an `id` (citation key) and one of `DOI`, or `archivePrefix`
229/// + `eprint` (arXiv).
230///
231/// Identifier-pick priority per ADR-0030 D3:
232///
233/// 1. `DOI` field (case-sensitive per the CSL-JSON spec but Zotero
234///    sometimes emits `doi` lowercase — we accept both).
235/// 2. `archivePrefix == "arXiv"` (case-insensitive) + `eprint`
236///    (or `note: "arXiv:..."` shape Zotero emits).
237/// 3. (PMID parking — `Ref::Pmid` not yet defined; PMIDs in CSL-JSON
238///    are recorded as parse failures with `NoIdentifier` until the
239///    variant lands.)
240///
241/// `entry_key` is the `id` field verbatim.
242pub fn parse_csl_json(text: &str) -> Vec<Result<ParsedEntry, ParseError>> {
243    let parsed: serde_json::Result<Vec<serde_json::Value>> = serde_json::from_str(text);
244    let entries = match parsed {
245        Ok(arr) => arr,
246        Err(e) => {
247            return vec![Err(ParseError::Decode {
248                format: "csl-json",
249                message: e.to_string(),
250            })]
251        }
252    };
253    let mut out = Vec::with_capacity(entries.len());
254    for entry in entries {
255        // `id` is usually a string in real-world Zotero exports but
256        // the spec allows numeric ids too — stringify either form so
257        // the operator can find the entry in their library.
258        let entry_key = entry.get("id").and_then(|v| {
259            if let Some(s) = v.as_str() {
260                Some(s.to_string())
261            } else if v.is_number() {
262                Some(v.to_string())
263            } else {
264                None
265            }
266        });
267        out.push(parse_csl_entry(&entry, entry_key));
268    }
269    out
270}
271
272/// Pick the highest-priority identifier on a single CSL-JSON entry
273/// and parse it. Honors ADR-0030 D3 priority.
274fn parse_csl_entry(
275    entry: &serde_json::Value,
276    entry_key: Option<String>,
277) -> Result<ParsedEntry, ParseError> {
278    // Priority 1: DOI (both `DOI` per spec and `doi` lowercase per
279    // real-world exports). Zotero emits uppercase; Mendeley sometimes
280    // lowercase.
281    if let Some(doi) = entry
282        .get("DOI")
283        .or_else(|| entry.get("doi"))
284        .and_then(|v| v.as_str())
285    {
286        let raw = doi.trim();
287        if !raw.is_empty() {
288            return match Ref::parse(raw) {
289                Ok(ref_) => Ok(ParsedEntry { ref_, entry_key }),
290                Err(e) => Err(ParseError::InvalidRef {
291                    raw: raw.to_string(),
292                    entry_key,
293                    source: e,
294                }),
295            };
296        }
297    }
298    // Priority 2: arXiv — `archivePrefix == "arXiv"` (CSL extension)
299    // OR the Zotero-specific `note: "arXiv:..."` shape.
300    let is_arxiv = entry
301        .get("archivePrefix")
302        .or_else(|| entry.get("archive_prefix"))
303        .and_then(|v| v.as_str())
304        .map(|s| s.eq_ignore_ascii_case("arxiv"))
305        .unwrap_or(false);
306    if is_arxiv {
307        if let Some(eprint) = entry.get("eprint").and_then(|v| v.as_str()) {
308            let raw = eprint.trim();
309            if !raw.is_empty() {
310                let with_scheme = if raw.to_ascii_lowercase().starts_with("arxiv:") {
311                    raw.to_string()
312                } else {
313                    format!("arxiv:{raw}")
314                };
315                return match Ref::parse(&with_scheme) {
316                    Ok(ref_) => Ok(ParsedEntry { ref_, entry_key }),
317                    Err(e) => Err(ParseError::InvalidRef {
318                        raw: with_scheme,
319                        entry_key,
320                        source: e,
321                    }),
322                };
323            }
324        }
325    }
326    // Fallback: scan `note` for an embedded `arXiv:NNNN.NNNNN` —
327    // Zotero often stores the arXiv id there instead of a typed
328    // field. The pattern is intentionally narrow (must follow the
329    // canonical "arXiv:" prefix); free-text DOIs in notes are NOT
330    // mined here.
331    if let Some(note) = entry.get("note").and_then(|v| v.as_str()) {
332        if let Some(idx) = note.to_ascii_lowercase().find("arxiv:") {
333            let tail = &note[idx + "arxiv:".len()..];
334            // Take chars matching the arXiv id alphabet (digits / dot /
335            // slash / letters / hyphen) — stop at the first separator
336            // so the rest of the note is ignored.
337            let id: String = tail
338                .chars()
339                .take_while(|c| matches!(c, '0'..='9' | '.' | '/' | 'a'..='z' | 'A'..='Z' | '-'))
340                .collect();
341            if !id.is_empty() {
342                let with_scheme = format!("arxiv:{id}");
343                return match Ref::parse(&with_scheme) {
344                    Ok(ref_) => Ok(ParsedEntry { ref_, entry_key }),
345                    Err(e) => Err(ParseError::InvalidRef {
346                        raw: with_scheme,
347                        entry_key,
348                        source: e,
349                    }),
350                };
351            }
352        }
353    }
354    Err(ParseError::NoIdentifier { entry_key })
355}
356
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    /// Parses `body` as CSL-JSON and unwraps its first entry, panicking
    /// when the result is empty or the first entry failed to parse.
    fn first_csl_entry(body: &str) -> ParsedEntry {
        parse_csl_json(body)
            .into_iter()
            .next()
            .unwrap()
            .expect("entry parses")
    }

    // ---- detect_format ---------------------------------------------

    #[test]
    fn detect_by_bib_extension() {
        let detected = detect_format(Some(Utf8Path::new("/tmp/library.bib")), "");
        assert_eq!(detected, Format::Bibtex);
    }

    #[test]
    fn detect_by_json_extension() {
        let detected = detect_format(Some(Utf8Path::new("/tmp/library.json")), "");
        assert_eq!(detected, Format::CslJson);
    }

    #[test]
    fn detect_by_csl_extension() {
        let detected = detect_format(Some(Utf8Path::new("/tmp/library.csl")), "");
        assert_eq!(detected, Format::CslJson);
    }

    #[test]
    fn detect_by_fingerprint_bibtex_at_sign() {
        // Leading comment and blank line must be skipped before the
        // `@` fingerprint is seen.
        let detected = detect_format(None, "# comment\n\n@article{foo,\n  doi = {10.1/x}\n}\n");
        assert_eq!(detected, Format::Bibtex);
    }

    #[test]
    fn detect_by_fingerprint_csl_json_array() {
        let detected = detect_format(None, "[{\"id\":\"foo\",\"DOI\":\"10.1/x\"}]");
        assert_eq!(detected, Format::CslJson);
    }

    #[test]
    fn detect_by_fingerprint_falls_through_to_refs() {
        let detected = detect_format(None, "doi:10.1234/foo\narxiv:2401.12345\n");
        assert_eq!(detected, Format::Refs);
    }

    // ---- plain refs ------------------------------------------------

    #[test]
    fn plain_refs_parses_mix_with_comments_and_blanks() {
        let body = "\
# header comment
doi:10.1234/foo

   arxiv:2401.12345
# trailing comment
";
        let results = parse_plain_refs(body);
        assert_eq!(results.len(), 2);
        let entries: Vec<ParsedEntry> = results.into_iter().filter_map(|r| r.ok()).collect();
        // Both lines parse, in order, and neither carries a citation key.
        assert!(matches!(
            entries.as_slice(),
            [
                ParsedEntry {
                    ref_: Ref::Doi(_),
                    entry_key: None,
                },
                ParsedEntry {
                    ref_: Ref::Arxiv(_),
                    entry_key: None,
                },
            ]
        ));
    }

    #[test]
    fn plain_refs_surface_per_line_invalid_refs() {
        let results = parse_plain_refs("doi:10.1234/foo\nnot-a-ref\narxiv:2401.12345\n");
        // The bad middle line is reported in place; its neighbours
        // still parse.
        assert!(matches!(
            results.as_slice(),
            [Ok(_), Err(ParseError::InvalidRef { .. }), Ok(_)]
        ));
    }

    // ---- CSL-JSON --------------------------------------------------

    #[test]
    fn csl_json_picks_doi_when_present() {
        let mut results = parse_csl_json(r#"[{"id":"foo2024","DOI":"10.1234/foo"}]"#);
        assert_eq!(results.len(), 1);
        let entry = results.remove(0).expect("entry parses");
        assert!(matches!(entry.ref_, Ref::Doi(_)));
        assert_eq!(entry.entry_key.as_deref(), Some("foo2024"));
    }

    #[test]
    fn csl_json_accepts_lowercase_doi_field() {
        // Mendeley exports sometimes lowercase the field name.
        let entry = first_csl_entry(r#"[{"id":"x","doi":"10.5555/bar"}]"#);
        assert!(matches!(entry.ref_, Ref::Doi(_)));
    }

    #[test]
    fn csl_json_picks_arxiv_via_archive_prefix_and_eprint() {
        let entry =
            first_csl_entry(r#"[{"id":"arx","archivePrefix":"arXiv","eprint":"2401.12345"}]"#);
        assert!(matches!(entry.ref_, Ref::Arxiv(_)));
    }

    #[test]
    fn csl_json_arxiv_archive_prefix_is_case_insensitive() {
        let entry =
            first_csl_entry(r#"[{"id":"arx","archivePrefix":"ARXIV","eprint":"2401.12345"}]"#);
        assert!(matches!(entry.ref_, Ref::Arxiv(_)));
    }

    #[test]
    fn csl_json_doi_beats_arxiv_when_both_present() {
        // ADR-0030 D3: priority is DOI > arXiv > PMID.
        let entry = first_csl_entry(
            r#"[{
            "id":"both",
            "DOI":"10.1234/foo",
            "archivePrefix":"arXiv",
            "eprint":"2401.12345"
        }]"#,
        );
        assert!(matches!(entry.ref_, Ref::Doi(_)));
    }

    #[test]
    fn csl_json_arxiv_from_note_field() {
        // Zotero often dumps "arXiv:NNNN.NNNNN" into the note field
        // instead of a typed field.
        let entry =
            first_csl_entry(r#"[{"id":"znote","note":"Comment: 12 pages. arXiv:2401.12345"}]"#);
        assert!(matches!(entry.ref_, Ref::Arxiv(_)));
    }

    #[test]
    fn csl_json_entry_without_any_identifier_yields_no_identifier_error() {
        let first = parse_csl_json(r#"[{"id":"empty","title":"no ids here"}]"#)
            .into_iter()
            .next()
            .unwrap();
        assert!(matches!(first, Err(ParseError::NoIdentifier { .. })));
    }

    #[test]
    fn csl_json_invalid_doi_surface_as_invalid_ref_per_entry() {
        let results = parse_csl_json(r#"[{"id":"bad","DOI":"not-a-doi"}]"#);
        match &results[0] {
            Err(ParseError::InvalidRef { raw, entry_key, .. }) => {
                assert_eq!(raw, "not-a-doi");
                assert_eq!(entry_key.as_deref(), Some("bad"));
            }
            other => panic!("expected InvalidRef, got {other:?}"),
        }
    }

    #[test]
    fn csl_json_top_level_malformed_yields_single_decode_error() {
        let results = parse_csl_json("{this is not JSON}");
        // Exactly one element, and it is the whole-input decode error.
        assert!(matches!(
            results.as_slice(),
            [Err(ParseError::Decode {
                format: "csl-json",
                ..
            })]
        ));
    }

    #[test]
    fn csl_json_non_array_top_level_yields_decode_error() {
        // A single-entry object (not an array) is not a valid CSL-JSON
        // document by the spec — the top level MUST be an array even
        // for a single entry.
        let results = parse_csl_json(r#"{"id":"x","DOI":"10.1/x"}"#);
        assert!(matches!(
            results.as_slice(),
            [
                Err(ParseError::Decode {
                    format: "csl-json",
                    ..
                }),
                ..
            ]
        ));
    }

    // ---- parse_input dispatch -------------------------------------

    #[test]
    fn parse_input_auto_dispatches_csl_json_by_content() {
        let results = parse_input(r#"[{"id":"foo","DOI":"10.1234/foo"}]"#, Format::Auto, None);
        assert!(matches!(
            results.as_slice(),
            [Ok(ParsedEntry {
                ref_: Ref::Doi(_),
                ..
            })]
        ));
    }

    #[test]
    fn parse_input_auto_dispatches_refs_by_content() {
        let results = parse_input("doi:10.1234/foo\n", Format::Auto, None);
        assert!(matches!(
            results.as_slice(),
            [Ok(ParsedEntry {
                ref_: Ref::Doi(_),
                ..
            })]
        ));
    }

    #[test]
    fn parse_input_bibtex_returns_unsupported_format_error() {
        let results = parse_input("@article{foo, doi={10.1234/x}}", Format::Bibtex, None);
        assert!(matches!(
            results.as_slice(),
            [Err(ParseError::UnsupportedFormat { format: "bibtex" })]
        ));
    }

    #[test]
    fn parse_input_auto_with_path_uses_extension() {
        let parsed = parse_input("[]", Format::Auto, Some(Utf8Path::new("foo.csl")));
        assert!(
            parsed.is_empty(),
            "empty array yields zero entries: {parsed:?}"
        );
    }

    // ---- Format::as_wire ------------------------------------------

    #[test]
    fn format_wire_strings_are_stable() {
        // Pinned because the strings appear in the CLI --format flag,
        // the MCP tool input schema, and the JSON-Lines parse-error
        // records (ADR-0030 §6). A drift would be a wire-format break.
        let expected = [
            (Format::Auto, "auto"),
            (Format::Refs, "refs"),
            (Format::CslJson, "csl-json"),
            (Format::Bibtex, "bibtex"),
        ];
        for (format, wire) in expected {
            assert_eq!(format.as_wire(), wire);
        }
    }
}