Skip to main content

doiget_core/
lib.rs

1//! # doiget-core
2//!
3//! Core library for [doiget](https://github.com/sotashimozono/doiget): an Open Access
4//! first paper-fetcher with strict capability gating, fail-closed provenance logging,
5//! and a BiblioFetch.jl-compatible store layout.
6//!
7//! Phase 0 ships only this skeleton. Real implementations land in Phase 1.
8//! See `docs/PUBLIC_API.md` for the semver-locked surface and `docs/ARCHITECTURE.md`
9//! for the high-level design.
10
11#![warn(missing_docs)]
12#![forbid(unsafe_code)]
13
14use serde::{Deserialize, Serialize};
15use sha2::Digest;
16
17// --- Modules ---
18pub mod canonical;
19pub mod discovery;
20pub mod dry_run;
21pub mod http;
22pub mod orchestrator;
23pub mod paper_tex_source;
24pub mod paper_text;
25pub mod provenance;
26pub mod rate_limiter;
27pub mod refs;
28pub mod resolver_cache;
29pub mod source;
30pub mod sources;
31pub mod store;
32pub mod user_extension;
33pub mod verify_config;
34
35// Phase 4 citation graph (ADR-0010). Compile-gated by the `citation`
36// Cargo feature, which itself enables the `metadata` feature so the
37// Tier-2 source impls are available.
38#[cfg(feature = "citation")]
39pub mod citation_graph;
40
41// Re-export the canonical-tuple audit-identity types at the crate root
42// per ADR-0024 / `docs/PUBLIC_API.md` §1. The types themselves live in
43// the [`canonical`] submodule.
44pub use crate::canonical::{CanonicalRef, SourceType};
45
/// Crate version, read from `CARGO_PKG_VERSION` at compile time. Used by
/// `doiget-cli --version` and `doiget_health`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// TOML schema version this build writes. See `docs/STORE.md` §3.
pub const SCHEMA_VERSION: &str = "1.0";

/// Hard-coded cap on the number of concurrent fetches. See `docs/LEGAL.md`
/// §6 safeguard 8.
pub const MAX_CONCURRENT_FETCHES: u32 = 5;

/// Hard-coded cap on the fetch rate, in fetches per second. See
/// `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_FETCHES_PER_SECOND: f32 = 5.0;

/// Maximum batch size for `doiget batch` and `doiget_batch_fetch`.
pub const MCP_BATCH_MAX_SIZE: usize = 100;

/// Slice 2 alias for [`MCP_BATCH_MAX_SIZE`] using the
/// spec-language name (`docs/MCP_TOOLS.md` §1 / Slice 2 plan). The
/// numeric value MUST equal [`MCP_BATCH_MAX_SIZE`]; an internal test
/// pins the equivalence so the two constants cannot drift.
pub const MAX_BATCH_REFS: usize = MCP_BATCH_MAX_SIZE;

/// Maximum queued MCP requests beyond `MAX_CONCURRENT_FETCHES`. Excess returns
/// `ErrorCode::RateLimited`. See `docs/SECURITY.md` §1.4 / `docs/MCP_TOOLS.md`.
pub const MCP_QUEUE_DEPTH_MAX: usize = 100;

/// MCP server stdin-EOF graceful-shutdown deadline, in seconds. See ADR-0001
/// and `docs/MCP_TOOLS.md` §8.
pub const MCP_STDIN_EOF_SHUTDOWN_SEC: u64 = 5;

/// Maximum DOI suffix length accepted at validation, in bytes (the parser
/// compares it against the suffix's byte length). See `docs/SECURITY.md` §1.1.
pub const DOI_SUFFIX_MAX_LEN: usize = 256;

/// Maximum PDF body size accepted by the fetcher, in bytes (100 MB,
/// decimal). See `docs/SECURITY.md` §1.2 (Oversized PDF).
pub const PDF_MAX_BYTES: u64 = 100_000_000;

/// Time-to-live, in days, for entries in `~/.cache/doiget/resolver/`. See
/// `docs/CACHE.md` §3.
pub const RESOLVER_CACHE_TTL_DAYS: u32 = 7;

/// Time-to-live, in days, for entries in `~/.cache/doiget/citations/`. See
/// `docs/CACHE.md` §3.
pub const CITATION_CACHE_TTL_DAYS: u32 = 30;
89
90// ---------------------------------------------------------------------------
91// Ref
92// ---------------------------------------------------------------------------
93
/// A reference to a paper, either by DOI or arXiv id.
///
/// See `docs/SECURITY.md` §1.1 for input-validation rules.
///
/// Wire format: adjacently tagged. The lowercase variant name is stored
/// under `kind` and the bare identifier under `id`, e.g.
/// `{"kind": "doi", "id": "10.1234/example"}` or
/// `{"kind": "arxiv", "id": "2401.12345"}`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind", content = "id")]
pub enum Ref {
    /// A DOI (e.g., `10.1234/example`).
    Doi(Doi),
    /// An arXiv id (e.g., `2401.12345`).
    Arxiv(ArxivId),
}
105
/// A validated DOI string.
///
/// Construct via `Doi::parse(s)` (Phase 1+). The inner field is intentionally
/// `pub(crate)` to forbid bypass construction; tests inside `doiget-core` may
/// still use `Doi(s)` for fixture purposes.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"10.1234/example"`.
///
/// NOTE(review): the derived `Deserialize` wraps whatever string it is given
/// and does not run the `Doi::parse` validation, so a deserialized `Doi` is
/// only as trustworthy as its input — confirm that values read from
/// untrusted sources are re-validated before use.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Doi(pub(crate) String);
116
/// A validated arXiv id string.
///
/// Construct via `ArxivId::parse(s)` (Phase 1+). Inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"2401.12345"`.
///
/// NOTE(review): the derived `Deserialize` wraps whatever string it is given
/// and does not run the `ArxivId::parse` validation, so a deserialized
/// `ArxivId` is only as trustworthy as its input — confirm that values read
/// from untrusted sources are re-validated before use.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct ArxivId(pub(crate) String);
125
126impl Doi {
127    /// Returns the DOI as a string slice.
128    pub fn as_str(&self) -> &str {
129        &self.0
130    }
131
132    /// Parses and validates a DOI string per `docs/SECURITY.md` §1.1.
133    ///
134    /// Accepts:
135    /// - Bare DOIs: `10.<registrant>/<suffix>` where `<registrant>` is 4–9
136    ///   digits and `<suffix>` is a non-empty sequence of characters drawn
137    ///   from `[A-Za-z0-9._/():-]` (the `:` covers legacy Kluwer
138    ///   `10.1023/A:NNNN` and EDP Sciences `10.1051/jphys:NNNN` DOIs).
139    /// - The `doi:` URI scheme prefix; it is stripped before validation, so
140    ///   the stored value never carries a scheme. (Matches the convention
141    ///   established in `docs/SAFEKEY.md` §3 step 0.)
142    ///
143    /// Rejects:
144    /// - Inputs missing the literal `10.` prefix (after optional scheme
145    ///   strip).
146    /// - Suffixes longer than [`DOI_SUFFIX_MAX_LEN`] bytes.
147    /// - Empty suffixes.
148    /// - Any character outside the suffix charset above (including control
149    ///   characters, whitespace, and non-ASCII).
150    ///
151    /// # Errors
152    ///
153    /// Returns a [`RefParseError`] variant that names the specific rejection
154    /// category. Tier 1+ callers should map any [`RefParseError`] to
155    /// [`ErrorCode::InvalidRef`] when surfacing to MCP / CLI.
156    pub fn parse(s: &str) -> Result<Self, RefParseError> {
157        let stripped = parse::strip_doi_scheme(s);
158        parse::validate_doi(stripped)?;
159        Ok(Doi(stripped.to_string()))
160    }
161}
162
163impl std::fmt::Display for ArxivId {
164    /// Displays the validated id as its canonical string (e.g.
165    /// `2401.12345`) so it can be interpolated into messages — notably the
166    /// `FetchError::TextUnavailable` `#[error]` template (review #318).
167    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
168        f.write_str(&self.0)
169    }
170}
171
172impl ArxivId {
173    /// Returns the arXiv id as a string slice.
174    pub fn as_str(&self) -> &str {
175        &self.0
176    }
177
178    /// Parses and validates an arXiv id per `docs/SECURITY.md` §1.1 and the
179    /// pattern published in `docs/MCP_TOOLS.md`.
180    ///
181    /// Accepts:
182    /// - New-style ids: `YYMM.NNNNN[vN]` where the date block is 4 digits, the
183    ///   sequence number is 4–5 digits, and the optional version `vN` is one
184    ///   or more digits. Examples: `2401.12345`, `2401.12345v2`.
185    /// - Old-style ids: `subject-class/YYMMNNN[vN]` where the subject class
186    ///   is a lowercase token (with optional internal hyphens and an
187    ///   optional `.XX` two-uppercase-letter group), and the numeric body
188    ///   is exactly 7 digits with optional `vN`. Examples:
189    ///   `cond-mat/9501001`, `astro-ph.CO/0703123v2`.
190    /// - The `arxiv:` / `arXiv:` URI scheme prefix; it is stripped before
191    ///   validation.
192    ///
193    /// Rejects:
194    /// - Inputs that match neither the new-style nor old-style shape.
195    /// - Inputs containing characters outside the per-shape charset
196    ///   (control chars, whitespace, non-ASCII).
197    /// - Empty input.
198    ///
199    /// # Errors
200    ///
201    /// Returns a [`RefParseError`] variant that names the specific rejection
202    /// category.
203    pub fn parse(s: &str) -> Result<Self, RefParseError> {
204        let stripped = parse::strip_arxiv_scheme(s);
205        parse::validate_arxiv(stripped)?;
206        Ok(ArxivId(stripped.to_string()))
207    }
208}
209
210impl Ref {
211    /// Parses a string into a [`Ref`], auto-detecting DOI vs arXiv.
212    ///
213    /// Detection rules:
214    /// 1. If the input begins with the case-insensitive `doi:` scheme, the
215    ///    remainder is parsed as a DOI.
216    /// 2. If the input begins with the `arxiv:` or `arXiv:` scheme, the
217    ///    remainder is parsed as an arXiv id.
218    /// 3. Otherwise, if the input starts with `10.` it is treated as a bare
219    ///    DOI; this matches the heuristic in `docs/SAFEKEY.md` §4 (Julia
220    ///    reference) and is stable because DOIs always begin `10.`.
221    /// 4. Failing all of the above, parsing falls back to arXiv.
222    ///
223    /// The returned [`Ref`] never carries the URI scheme — `as_str()` on the
224    /// inner `Doi` / `ArxivId` is always the bare identifier.
225    ///
226    /// # Errors
227    ///
228    /// Returns a [`RefParseError`] from the underlying [`Doi::parse`] or
229    /// [`ArxivId::parse`] call. When the input has an explicit scheme
230    /// (`doi:` / `arxiv:`), the matching parser is dispatched and its error
231    /// surfaces directly. When the input is bare and ambiguous, the
232    /// heuristic in rule 3/4 selects the parser; an unparsable bare input
233    /// surfaces the arXiv parser's error (a non-`10.` ref that also fails
234    /// arXiv validation is never a valid DOI).
235    pub fn parse(s: &str) -> Result<Self, RefParseError> {
236        // Reject empty up front so all three parsers see a meaningful slice;
237        // without this, `strip_*_scheme("")` returns "" and we'd get a
238        // confusing "missing 10. prefix" error for empty input.
239        if s.is_empty() {
240            return Err(RefParseError::Empty);
241        }
242
243        if parse::has_doi_scheme(s) {
244            return Doi::parse(s).map(Ref::Doi);
245        }
246        if parse::has_arxiv_scheme(s) {
247            return ArxivId::parse(s).map(Ref::Arxiv);
248        }
249        if s.starts_with("10.") {
250            return Doi::parse(s).map(Ref::Doi);
251        }
252        ArxivId::parse(s).map(Ref::Arxiv)
253    }
254}
255
256// ---------------------------------------------------------------------------
257// Parser internals
258// ---------------------------------------------------------------------------
259
260mod parse {
261    use super::{RefParseError, DOI_SUFFIX_MAX_LEN};
262
263    /// Case-insensitive `doi:` prefix detector. Matches both `doi:` and
264    /// `DOI:` (and any case mix); the spec in `docs/SAFEKEY.md` §3 only
265    /// names the lowercase form, but the field convention is to be lenient
266    /// in what we accept (the scheme is dropped at the boundary anyway).
267    pub(crate) fn has_doi_scheme(s: &str) -> bool {
268        s.len() >= 4 && s.is_char_boundary(4) && s[..4].eq_ignore_ascii_case("doi:")
269    }
270
271    /// Case-insensitive `arxiv:` prefix detector. Accepts `arxiv:`,
272    /// `arXiv:` (the form used in `docs/MCP_TOOLS.md`), and any other case
273    /// mix.
274    pub(crate) fn has_arxiv_scheme(s: &str) -> bool {
275        s.len() >= 6 && s.is_char_boundary(6) && s[..6].eq_ignore_ascii_case("arxiv:")
276    }
277
278    pub(crate) fn strip_doi_scheme(s: &str) -> &str {
279        if has_doi_scheme(s) {
280            &s[4..]
281        } else {
282            s
283        }
284    }
285
286    pub(crate) fn strip_arxiv_scheme(s: &str) -> &str {
287        if has_arxiv_scheme(s) {
288            &s[6..]
289        } else {
290            s
291        }
292    }
293
294    /// DOI suffix charset per `docs/SECURITY.md` §1.1:
295    /// `[A-Za-z0-9._/():-]`. The forward slash is permitted inside the
296    /// suffix (e.g. `10.1016/...`); the registrant separator is the
297    /// *first* `/` and the suffix is everything after it.
298    ///
299    /// `:` is permitted because two large real publisher DOI families use
300    /// it in the suffix — legacy Kluwer/Springer (`10.1023/A:NNNNNNNNNN`)
301    /// and EDP Sciences / Journal de Physique
302    /// (`10.1051/jphys:NNNNNNNNNNNNNNNNN`). It adds no path-traversal
303    /// capability: traversal requires composing `/` and `.` into `../`,
304    /// and both characters are already in the suffix charset. In addition,
305    /// `safekey` independently escapes every char outside `[A-Za-z0-9._-]`
306    /// before any filesystem use, so `:` never reaches a path literally.
307    /// See ADR-0026 and `docs/SECURITY.md` §1.1.
308    fn is_doi_suffix_char(c: char) -> bool {
309        matches!(c,
310            'A'..='Z' | 'a'..='z' | '0'..='9'
311            | '.' | '_' | '/' | '(' | ')' | '-' | ':'
312        )
313    }
314
315    pub(crate) fn validate_doi(s: &str) -> Result<(), RefParseError> {
316        if s.is_empty() {
317            return Err(RefParseError::Empty);
318        }
319
320        // Must begin with literal "10."; the registrant is 4–9 digits up
321        // to the first '/'. After that, everything is suffix.
322        let rest = s
323            .strip_prefix("10.")
324            .ok_or(RefParseError::MissingDoiPrefix)?;
325        let slash_idx = rest
326            .find('/')
327            .ok_or(RefParseError::MissingDoiSuffixSeparator)?;
328        let registrant = &rest[..slash_idx];
329        let suffix = &rest[slash_idx + 1..];
330
331        // Registrant: 4–9 ASCII digits.
332        if registrant.len() < 4
333            || registrant.len() > 9
334            || !registrant.chars().all(|c| c.is_ascii_digit())
335        {
336            return Err(RefParseError::InvalidDoiRegistrant);
337        }
338
339        // Suffix: non-empty, charset-restricted, length-bounded.
340        if suffix.is_empty() {
341            return Err(RefParseError::EmptyDoiSuffix);
342        }
343        if suffix.len() > DOI_SUFFIX_MAX_LEN {
344            return Err(RefParseError::DoiSuffixTooLong {
345                len: suffix.len(),
346                max: DOI_SUFFIX_MAX_LEN,
347            });
348        }
349        if let Some(bad) = suffix.chars().find(|c| !is_doi_suffix_char(*c)) {
350            return Err(RefParseError::InvalidDoiSuffixChar { ch: bad });
351        }
352        Ok(())
353    }
354
355    /// Validates an arXiv id (with the `arxiv:` / `arXiv:` scheme already
356    /// stripped). Tries the new-style shape first, then the old-style.
357    pub(crate) fn validate_arxiv(s: &str) -> Result<(), RefParseError> {
358        if s.is_empty() {
359            return Err(RefParseError::Empty);
360        }
361        if validate_arxiv_new(s).is_ok() || validate_arxiv_old(s).is_ok() {
362            return Ok(());
363        }
364        Err(RefParseError::InvalidArxivShape)
365    }
366
367    /// New-style arXiv id: `YYMM.NNNNN[vN]`.
368    fn validate_arxiv_new(s: &str) -> Result<(), ()> {
369        let dot_idx = s.find('.').ok_or(())?;
370        let head = &s[..dot_idx];
371        let tail = &s[dot_idx + 1..];
372
373        // Head: exactly 4 ASCII digits.
374        if head.len() != 4 || !head.chars().all(|c| c.is_ascii_digit()) {
375            return Err(());
376        }
377
378        // Tail: 4–5 digits, then optional `v` followed by ≥1 digits.
379        let bytes = tail.as_bytes();
380        let mut i = 0;
381        while i < bytes.len() && bytes[i].is_ascii_digit() {
382            i += 1;
383        }
384        let digits_len = i;
385        if !(4..=5).contains(&digits_len) {
386            return Err(());
387        }
388        if i == bytes.len() {
389            return Ok(());
390        }
391        // Optional version suffix.
392        if bytes[i] != b'v' {
393            return Err(());
394        }
395        i += 1;
396        let v_start = i;
397        while i < bytes.len() && bytes[i].is_ascii_digit() {
398            i += 1;
399        }
400        if i == v_start || i != bytes.len() {
401            return Err(());
402        }
403        Ok(())
404    }
405
406    /// Old-style arXiv id: `subject-class/YYMMNNN[vN]`.
407    /// Subject class: `[a-z]([a-z-]*[a-z])?(\.[A-Z]{2})?`.
408    fn validate_arxiv_old(s: &str) -> Result<(), ()> {
409        let slash_idx = s.find('/').ok_or(())?;
410        let class = &s[..slash_idx];
411        let id = &s[slash_idx + 1..];
412
413        // Class: starts with [a-z], body is [a-z-], optional `.XX` (two
414        // ASCII upper).
415        let (core_class, dot_part) = match class.find('.') {
416            Some(d) => (&class[..d], Some(&class[d + 1..])),
417            None => (class, None),
418        };
419        if core_class.is_empty()
420            || !core_class
421                .chars()
422                .all(|c| c.is_ascii_lowercase() || c == '-')
423            || core_class.starts_with('-')
424            || core_class.ends_with('-')
425        {
426            return Err(());
427        }
428        if let Some(dp) = dot_part {
429            if dp.len() != 2 || !dp.chars().all(|c| c.is_ascii_uppercase()) {
430                return Err(());
431            }
432        }
433
434        // Id: 7 digits, optional `vN`.
435        let bytes = id.as_bytes();
436        let mut i = 0;
437        while i < bytes.len() && bytes[i].is_ascii_digit() {
438            i += 1;
439        }
440        if i != 7 {
441            return Err(());
442        }
443        if i == bytes.len() {
444            return Ok(());
445        }
446        if bytes[i] != b'v' {
447            return Err(());
448        }
449        i += 1;
450        let v_start = i;
451        while i < bytes.len() && bytes[i].is_ascii_digit() {
452            i += 1;
453        }
454        if i == v_start || i != bytes.len() {
455            return Err(());
456        }
457        Ok(())
458    }
459}
460
461// ---------------------------------------------------------------------------
462// RefParseError
463// ---------------------------------------------------------------------------
464
/// Reasons a `Doi::parse` / `ArxivId::parse` / `Ref::parse` call can fail.
///
/// Each variant maps to one rejection category in `docs/SECURITY.md` §1.1.
/// All variants funnel to [`ErrorCode::InvalidRef`] when surfacing to MCP /
/// CLI; the granular shape is preserved for tests and for future log
/// breadcrumbs. The `From<RefParseError> for ErrorCode` impl below makes
/// `?` propagation collapse to `INVALID_REF` automatically, satisfying
/// `docs/PUBLIC_API.md` §4.
///
/// Marked `#[non_exhaustive]` so adding new categories is a non-breaking
/// change. Pattern-match with a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
#[non_exhaustive]
pub enum RefParseError {
    /// Input was empty. Shared by all three parsers; for the scheme-aware
    /// parsers this also covers input that is empty once the scheme has been
    /// stripped (e.g. a bare `doi:`).
    #[error("empty input")]
    Empty,
    /// Input did not begin with the required `10.` literal (after any
    /// scheme strip).
    #[error("DOI must begin with '10.'")]
    MissingDoiPrefix,
    /// Input started with `10.` but had no `/` separator between
    /// registrant and suffix.
    #[error("DOI must contain '/' between registrant and suffix")]
    MissingDoiSuffixSeparator,
    /// Registrant was not 4–9 ASCII digits.
    #[error("DOI registrant must be 4–9 ASCII digits")]
    InvalidDoiRegistrant,
    /// DOI suffix was empty.
    #[error("DOI suffix is empty")]
    EmptyDoiSuffix,
    /// DOI suffix exceeded `DOI_SUFFIX_MAX_LEN` bytes. The length check runs
    /// before the charset check, so an over-long suffix that also contains
    /// illegal characters is reported as this variant.
    #[error("DOI suffix is {len} bytes; maximum is {max}")]
    DoiSuffixTooLong {
        /// Observed suffix length, in bytes.
        len: usize,
        /// Hard upper bound (always [`DOI_SUFFIX_MAX_LEN`]).
        max: usize,
    },
    /// DOI suffix contained a character outside `[A-Za-z0-9._/():-]`.
    #[error("DOI suffix contains invalid character {ch:?}")]
    InvalidDoiSuffixChar {
        /// The first offending character.
        ch: char,
    },
    /// Input matched neither the new-style nor old-style arXiv shape. Every
    /// non-empty arXiv rejection, including illegal characters, is reported
    /// through this single variant.
    #[error("input does not match any known arXiv id shape")]
    InvalidArxivShape,
}
514
515impl From<RefParseError> for ErrorCode {
516    fn from(_: RefParseError) -> Self {
517        // All parse failures collapse to INVALID_REF at the public boundary,
518        // matching `docs/PUBLIC_API.md` §4 and `docs/SECURITY.md` §1.1.
519        ErrorCode::InvalidRef
520    }
521}
522
523// ---------------------------------------------------------------------------
524// Safekey
525// ---------------------------------------------------------------------------
526
/// A filesystem-safe key derived deterministically from a `Ref`.
///
/// See `docs/SAFEKEY.md` for the full algorithm and reference test vectors.
/// Construct via `Ref::safekey()` (Phase 1+); inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"doi_10.1234_example"`.
///
/// NOTE(review): the derived `Deserialize` wraps whatever string it is given;
/// nothing in this type checks that it is a key `Ref::safekey()` could have
/// produced — confirm that deserialized keys are not used to build paths
/// without re-validation.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Safekey(pub(crate) String);
536
537impl Safekey {
538    /// Returns the safekey as a string slice.
539    pub fn as_str(&self) -> &str {
540        &self.0
541    }
542}
543
544impl Ref {
545    /// Returns the bare identifier string usable as a provenance `ref` field.
546    ///
547    /// Equivalent to `Doi::as_str` / `ArxivId::as_str` dispatched on the
548    /// variant — the URI scheme (`doi:` / `arxiv:`) is never present in the
549    /// inner identifiers (it is stripped at parse time), so the result is
550    /// always the bare DOI or arXiv id. Used by the CLI / MCP orchestrators
551    /// to populate the `ref` column of provenance log rows
552    /// (`docs/PROVENANCE_LOG.md` §3) without re-matching the variant.
553    pub fn as_input_str(&self) -> &str {
554        match self {
555            Ref::Doi(d) => d.as_str(),
556            Ref::Arxiv(a) => a.as_str(),
557        }
558    }
559
560    /// Derives a deterministic, filesystem-safe key from this reference.
561    ///
562    /// The algorithm is the NORMATIVE binding spec in `docs/SAFEKEY.md` §3.
563    /// Both Rust and Julia implementations MUST produce bit-identical output
564    /// for every entry in `tests/fixtures/safekey/vectors.json`.
565    ///
566    /// # Algorithm summary
567    ///
568    /// 1. Prefix with `doi_` or `arxiv_` (per variant).
569    /// 2. Replace any character outside `[A-Za-z0-9._-]` with `_`.
570    /// 3. Collapse consecutive `_` runs to a single `_`.
571    /// 4. Trim leading/trailing `_`.
572    /// 5. If the result exceeds 192 bytes, take the first 192 bytes plus
573    ///    `_` plus the first 8 hex chars of `SHA-256(raw)` (where `raw` is
574    ///    the step-1 output, before escaping).
575    ///
576    /// The bound on `as_str()` after step 4 is pure ASCII (steps 1-3 produce
577    /// only ASCII bytes), so the byte-slice in step 5 cannot split a
578    /// multibyte char.
579    pub fn safekey(&self) -> Safekey {
580        // Step 0: prefix per variant. Doi/ArxivId hold the bare identifier
581        // (no `doi:` / `arxiv:` URI scheme — that is stripped by Ref::parse,
582        // not relevant here).
583        let raw = match self {
584            Ref::Doi(d) => format!("doi_{}", d.as_str()),
585            Ref::Arxiv(a) => format!("arxiv_{}", a.as_str()),
586        };
587
588        // Step 1: replace unsafe chars with '_'. Non-ASCII chars (emitted by
589        // String::chars() as full Unicode code points) all hit the wildcard
590        // arm and become a single '_'.
591        let escaped: String = raw
592            .chars()
593            .map(|c| match c {
594                'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' => c,
595                _ => '_',
596            })
597            .collect();
598
599        // Step 2: collapse consecutive '_' runs to a single '_'.
600        let mut collapsed = String::with_capacity(escaped.len());
601        let mut last_was_underscore = false;
602        for c in escaped.chars() {
603            if c == '_' {
604                if !last_was_underscore {
605                    collapsed.push('_');
606                }
607                last_was_underscore = true;
608            } else {
609                collapsed.push(c);
610                last_was_underscore = false;
611            }
612        }
613
614        // Step 3: trim leading/trailing '_'.
615        let trimmed = collapsed.trim_matches('_');
616
617        // Step 4: length-bound. After steps 1-3 `trimmed` is pure ASCII, so
618        // `len()` (bytes) == char count and `&trimmed[..192]` is char-safe.
619        let key = if trimmed.len() > 192 {
620            let digest = sha2::Sha256::digest(raw.as_bytes());
621            let hash = hex::encode(&digest[..4]);
622            format!("{}_{}", &trimmed[..192], hash)
623        } else {
624            trimmed.to_string()
625        };
626
627        Safekey(key)
628    }
629}
630
631// ---------------------------------------------------------------------------
632// ErrorCode
633// ---------------------------------------------------------------------------
634
/// The closed set of error codes doiget surfaces.
///
/// See `docs/ERRORS.md` for the persona × code matrix.
///
/// Marked `#[non_exhaustive]` so adding new variants is a minor (not major)
/// version bump.
///
/// Wire form: each variant serializes as its `SCREAMING_SNAKE_CASE` name
/// (e.g. `InvalidRef` → `"INVALID_REF"`). [`ErrorCode::as_wire`] returns the
/// same tokens as `&'static str`; its match has no wildcard arm, so a new
/// variant needs an arm added there as well.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[non_exhaustive]
pub enum ErrorCode {
    /// DOI / arXiv id failed validation. Every [`RefParseError`] converts to
    /// this code.
    InvalidRef,
    /// Tier 1 sources reported no OA URL.
    NoOaAvailable,
    /// Internal rate cap or upstream 429.
    RateLimited,
    /// Transport / DNS / TLS failure.
    NetworkError,
    /// A metadata source authoritatively reported that the identifier
    /// does not exist. Network-independent and reproducible, so `doiget
    /// verify` treats it as a definite dead reference (fails the run even
    /// without `--strict`) rather than a tolerable blip — distinct from
    /// the transient [`Self::NetworkError`], [`Self::RateLimited`], and
    /// [`Self::FetchTimeout`].
    ///
    /// Sources: an HTTP `404` / `410` / `451` from a metadata API, or a
    /// source-specific absence signal (e.g. arXiv returns HTTP 200 with an
    /// empty `<feed>` for an unknown id, surfaced via `FetchError::NotFound`).
    ///
    /// Caveat (DOI fan-out): for a DOI this is emitted only when the
    /// configured metadata sources (Crossref, then Unpaywall) all fail to
    /// resolve it and at least one authoritatively 404s. A DOI registered
    /// only outside that set (e.g. a DataCite-only dataset DOI) can
    /// therefore be reported `NotFound` even though it exists in a
    /// registry doiget does not query.
    NotFound,
    /// A name filter (author / venue / publisher) matched MORE than one
    /// entity with no clear winner, so it could not be resolved to a single
    /// id. Distinct from [`Self::NotFound`] ("matched nothing"): an agent
    /// should *narrow* the name (add a first name / fuller title) rather
    /// than conclude the entity does not exist. The accompanying error
    /// message lists the candidate matches. Wire form: `"AMBIGUOUS"`.
    /// Raised by `doiget search`'s name-filter resolution (ADR-0031 D5).
    Ambiguous,
    /// Filesystem write failed.
    StoreError,
    /// Provenance log write failed; the fetch was aborted.
    LogError,
    /// Source not granted by the runtime `CapabilityProfile`.
    CapabilityDenied,
    /// Per-request timeout exceeded.
    FetchTimeout,
    /// Store entry's `schema_version` is ahead of this build.
    SchemaTooNew,
    /// Could not acquire `flock` within 5 s.
    LockTimeout,
    /// Bug — please open an issue.
    InternalError,
    /// Feature is spec'd but not yet wired in this Phase. Distinct from
    /// [`Self::InternalError`] (which signals a bug) and
    /// [`Self::CapabilityDenied`] (which signals a runtime config gate).
    /// Returned by stubs that exist to pin the public surface ahead of
    /// orchestrator implementation, so an agent can react with "wait for
    /// next minor release" rather than "report a bug" or "tweak my
    /// capability profile". Wire form: `"NOT_IMPLEMENTED"`.
    NotImplemented,
    /// The identifier is valid and resolvable, but the **requested
    /// representation** is not available from its source — currently the
    /// ar5iv HTML render consulted by `doiget text` (a 200 with no
    /// extractable prose: the paper was never converted to HTML).
    ///
    /// Deliberately distinct from the neighbouring codes so an agent does
    /// not misdiagnose a missing render as a bad reference (issue #302):
    /// it is NOT [`Self::NotFound`] (the id *does* exist), NOT
    /// [`Self::NoOaAvailable`] (the paper may well be OA — only this one
    /// representation is missing), and NOT [`Self::NetworkError`] (the
    /// fetch succeeded). The actionable branch is "fetch the PDF instead",
    /// not "fix the identifier". Wire form: `"TEXT_UNAVAILABLE"`.
    TextUnavailable,
}
715
716impl ErrorCode {
717    /// The `SCREAMING_SNAKE_CASE` wire token for this code, as a
718    /// `&'static str`. Identical to the serde representation but
719    /// allocation-free and usable where a borrowed string with a
720    /// `'static` lifetime is required — notably the provenance log
721    /// `error_code` column (`docs/PROVENANCE_LOG.md` §3), so a failure
722    /// row records the *actual* mapped code instead of a hand-written
723    /// literal that can drift from this enum (issue #118).
724    #[must_use]
725    pub fn as_wire(&self) -> &'static str {
726        match self {
727            ErrorCode::InvalidRef => "INVALID_REF",
728            ErrorCode::NoOaAvailable => "NO_OA_AVAILABLE",
729            ErrorCode::RateLimited => "RATE_LIMITED",
730            ErrorCode::NetworkError => "NETWORK_ERROR",
731            ErrorCode::NotFound => "NOT_FOUND",
732            ErrorCode::Ambiguous => "AMBIGUOUS",
733            ErrorCode::StoreError => "STORE_ERROR",
734            ErrorCode::LogError => "LOG_ERROR",
735            ErrorCode::CapabilityDenied => "CAPABILITY_DENIED",
736            ErrorCode::FetchTimeout => "FETCH_TIMEOUT",
737            ErrorCode::SchemaTooNew => "SCHEMA_TOO_NEW",
738            ErrorCode::LockTimeout => "LOCK_TIMEOUT",
739            ErrorCode::InternalError => "INTERNAL_ERROR",
740            ErrorCode::NotImplemented => "NOT_IMPLEMENTED",
741            ErrorCode::TextUnavailable => "TEXT_UNAVAILABLE",
742        }
743    }
744}
745
746// ---------------------------------------------------------------------------
747// DenialReason / DenialContext (ADR-0023)
748// ---------------------------------------------------------------------------
749
/// Closed-set reasons a denial-class error envelope can carry on its
/// optional `denial_context.reason` field.
///
/// Wire form (JSON / MCP) is `snake_case` — e.g. `"redirect_not_in_allowlist"`.
/// The wire string is derived mechanically from the variant identifier by
/// `#[serde(rename_all = "snake_case")]`, so renaming a variant also renames
/// it on the wire; each variant below spells out its resulting wire string.
///
/// The set is **closed** per ADR-0023 §2: adding a new variant is a minor
/// semver bump; renaming or repurposing one is a breaking change. Mirrors
/// the stability rule that already governs [`ErrorCode`].
///
/// See [`DenialContext`] for the surrounding struct, `docs/ERRORS.md` §3.1
/// for the wire surface, and `docs/PUBLIC_API.md` §8 for the
/// semver-locked surface contract.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DenialReason {
    /// Redirect target host did not match the source's allowlist
    /// (`HttpError::RedirectDenied`).
    ///
    /// Wire: `"redirect_not_in_allowlist"`.
    RedirectNotInAllowlist,
    /// Redirect target had a non-HTTPS scheme (`HttpError::InsecureRedirect`).
    ///
    /// Wire: `"insecure_scheme"`.
    InsecureScheme,
    /// Source produced a URL whose host is on a future blocklist.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the future
    /// per-source URL host-blocklist guard once that component lands
    /// (post-Phase-1 supply-chain hardening; see
    /// `docs/REDIRECT_ALLOWLIST.md` §4 for the staging plan).
    ///
    /// Wire: `"host_in_block_list"` — note the underscore between `block`
    /// and `list`, which follows from the `BlockList` capitalisation of the
    /// variant name.
    HostInBlockList,
    /// Body exceeded [`PDF_MAX_BYTES`] (`HttpError::OversizedBody`).
    ///
    /// Wire: `"size_cap_exceeded"`.
    SizeCapExceeded,
    /// Store entry's `schema_version` is ahead of this binary.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// `FsStore` schema-rejection path once the read-side bump check
    /// lands (it currently only writes the current `SCHEMA_VERSION`).
    ///
    /// Wire: `"schema_drift"`.
    SchemaDrift,
    /// Source not in the runtime [`CapabilityProfile`]
    /// (`FetchError::NotEligible`).
    ///
    /// Wire: `"capability_not_granted"`.
    CapabilityNotGranted,
    /// Rate limiter rejected the call inside the current window.
    ///
    /// Reserved — no producer wired yet. Will be emitted by
    /// [`RateLimiter`](crate::rate_limiter::RateLimiter) once the
    /// limiter surfaces structured denials (Phase 2+; today the
    /// limiter only sleeps to enforce the window).
    ///
    /// Wire: `"rate_limit_window"`.
    RateLimitWindow,
    /// SSRF guard rejected a private / link-local / cloud-metadata address.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// future SSRF pre-flight check (post-Phase-1 supply-chain
    /// hardening; the workspace currently relies on rustls + the
    /// HTTPS-only redirect policy to keep the attack surface small).
    ///
    /// Wire: `"ssrf_private_address"`.
    SsrfPrivateAddress,
    /// Response Content-Type / magic-byte mismatch (`HttpError::NotAPdf`).
    ///
    /// Wire: `"content_type_mismatch"`.
    ContentTypeMismatch,
}
804
/// Structured machine-parseable companion to `error.message` for
/// recoverable denials.
///
/// The field is **optional and additive** on the public error envelope —
/// every previously-shipped `{code, message}` envelope remains valid, and
/// agents that ignore this struct continue to work. When present, it
/// carries the concrete parameters an LLM agent can use to plan a recovery
/// (e.g. "the redirect to `evil.example.com` was denied because it is not
/// in the crossref allowlist") without text-mining `error.message`.
///
/// ## Wire shape
///
/// `#[serde(deny_unknown_fields)]`: a payload carrying a key this struct
/// does not declare is rejected on deserialize, so forward-compatible field
/// additions on the wire are forbidden by design — adding a field to this
/// struct is a **breaking** change. This is why the type is **not**
/// `#[non_exhaustive]` (per `docs/PUBLIC_API.md` §8): both extension
/// channels — Rust struct construction outside the crate AND wire-level
/// extension — must follow the same rule.
///
/// All fields except `reason` are optional. Producers populate the fields
/// relevant to the reason and leave the rest at `None`; consumers MUST
/// tolerate any subset of fields being present. Optional fields are
/// skipped on serialize when `None` and accepted as missing on deserialize
/// via `#[serde(default, skip_serializing_if = "Option::is_none")]`.
///
/// [`Self::expected`] is `Option<Vec<String>>` rather than `Vec<String>`
/// so the producer can distinguish "this reason has no allowlist channel"
/// (`None` → field absent on the wire) from "this is the explicit list of
/// acceptable values, possibly empty" (`Some(vec![])` → `"expected":[]` on
/// the wire). The previous `Vec<String>` shape collapsed both states
/// into "field omitted", which an LLM agent could not safely disambiguate.
///
/// Mapping table: see ADR-0023 §4, plus the
/// `From<&HttpError> for Option<DenialContext>` and
/// `From<&FetchError> for Option<DenialContext>` impls in
/// [`crate::http`] / [`crate::source`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DenialContext {
    /// Closed-enum reason code; the only required field. Deserialization
    /// fails when it is missing because it carries no `#[serde(default)]`.
    pub reason: DenialReason,
    /// Resolver source key (e.g. `"crossref"`) when one is in scope.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Concrete value the producer attempted (host, path, hex magic bytes,
    /// scheme prefix). Shape is reason-specific; consumers MUST treat it
    /// as opaque text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attempted: Option<String>,
    /// Allowlist entries / acceptable values. `Option<Vec<String>>` so the
    /// producer can distinguish "this reason has no allowlist channel"
    /// (`None`, field absent on the wire) from "this is the explicit list
    /// of acceptable values, possibly empty" (`Some(vec![])`, `"expected":[]`
    /// on the wire). The inner `Vec<String>` is used even when only one
    /// value is meaningful (e.g. `Some(vec!["%PDF-".into()])`) so the
    /// format does not have to flip when multiple values are acceptable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected: Option<Vec<String>>,
    /// Redirect-chain hop position, 0-indexed. `u8` because the chain is
    /// hard-capped at [`crate::http`]'s `MAX_REDIRECTS` (= 10) and any
    /// larger value indicates a bug.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hop_index: Option<u8>,
    /// Size or rate cap value (e.g. [`PDF_MAX_BYTES`]).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cap: Option<u64>,
    /// Observed value that is compared against [`Self::cap`] (e.g. the
    /// response byte count when [`Self::cap`] is the byte cap, or the row's
    /// schema_version when [`Self::cap`] is the binary's).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub actual: Option<u64>,
}
875
876// ---------------------------------------------------------------------------
877// ResolvedCandidate / ResolveResult (Issue #242)
878// ---------------------------------------------------------------------------
879
/// A candidate paper resolved from a bibliographic citation string.
///
/// One entry of [`ResolveResult::candidates`]. The type derives `PartialEq`
/// but not `Eq` because [`Self::score`] is an `f64`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ResolvedCandidate {
    /// Resolved DOI.
    pub doi: String,
    /// Title of the resolved candidate.
    pub title: String,
    /// First author or primary author representation.
    pub author: String,
    /// Publication year, if resolved; `None` when the year is unknown.
    pub year: Option<i32>,
    /// Token similarity overlap score in `0.0..=1.0`.
    pub score: f64,
    /// Resolving metadata source (e.g. `"crossref"`).
    pub source: String,
}
896
/// The result structure returned by bibliographic citation resolution.
///
/// Pairs the caller's original query with the candidates found for it.
/// Like [`ResolvedCandidate`], it derives `PartialEq` but not `Eq`, since
/// the candidates carry an `f64` score.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ResolveResult {
    /// The original query bibliographic citation string.
    pub query: String,
    /// Ranked candidate list (highest score first, thresholded to >= 0.5).
    pub candidates: Vec<ResolvedCandidate>,
}
905
906// ---------------------------------------------------------------------------
907// CapabilityProfile (placeholder; full impl in Phase 1)
908// ---------------------------------------------------------------------------
909
/// Marker for the always-on Open Access tier. See `docs/CAPABILITY.md`.
///
/// A unit struct with no data: it is the type of [`CapabilityProfile::oa`],
/// so the Tier 1 slot of the profile has no "off" state to represent.
#[derive(Debug, Clone, Copy)]
pub struct AlwaysOn;
913
/// Which Tier 2 metadata sources are enabled this session. See `docs/CAPABILITY.md`.
///
/// The derived `Default` leaves every source disabled (`false`).
/// [`CapabilityProfile::from_env`] sets a flag to `true` only when the
/// source's env var is present **and** the `metadata` Cargo feature is
/// compiled in.
///
/// `#[non_exhaustive]`: code outside this crate cannot build the struct
/// with a literal and must match it with a `..` rest pattern.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct MetadataAccess {
    /// Phase 4+; enabled by `DOIGET_ENABLE_OPENALEX`.
    pub openalex: bool,
    /// Phase 4+; enabled by `DOIGET_ENABLE_S2`.
    pub semantic_scholar: bool,
    /// Phase 4+; enabled by `DOIGET_ENABLE_DOAJ`.
    pub doaj: bool,
}
925
/// Process-wide rate limits. Hard-coded; not configurable.
///
/// Construct only via [`RateLimits::HARD_CODED`]. The struct fields are
/// `pub(crate)` so downstream code cannot synthesize a `RateLimits` with
/// different values, which would weaken `docs/LEGAL.md` §6 safeguard 8.
/// Read the values through the accessor methods of the same names.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct RateLimits {
    /// Maximum number of fetches allowed in flight at once.
    pub(crate) max_concurrent_fetches: u32,
    /// Maximum fetch attempts per second across all sources.
    pub(crate) max_fetches_per_second: f32,
    /// Per-source pause between consecutive requests, in milliseconds.
    pub(crate) per_source_backoff_ms: u64,
}
938
939impl RateLimits {
940    /// The single, hard-coded set of rate limits. There is no other public
941    /// constructor — see the type-level docs.
942    pub const HARD_CODED: Self = Self {
943        max_concurrent_fetches: MAX_CONCURRENT_FETCHES,
944        max_fetches_per_second: MAX_FETCHES_PER_SECOND,
945        per_source_backoff_ms: 200,
946    };
947
948    /// Maximum number of concurrent fetches in flight.
949    pub const fn max_concurrent_fetches(&self) -> u32 {
950        self.max_concurrent_fetches
951    }
952
953    /// Maximum fetch attempts per second across all sources.
954    pub const fn max_fetches_per_second(&self) -> f32 {
955        self.max_fetches_per_second
956    }
957
958    /// Per-source backoff in milliseconds between consecutive requests.
959    pub const fn per_source_backoff_ms(&self) -> u64 {
960        self.per_source_backoff_ms
961    }
962}
963
/// A successful TDM grant.
///
/// Carries the validated API key (`docs/CAPABILITY.md` §1) so that the key
/// flows from the startup capability gate into the source, rather than each
/// TDM source re-reading the env var at fetch time (issue #153 — an env
/// mutation between startup and fetch is otherwise undetectable).
///
/// The `api_key` field exists only when at least one `tdm-*` Cargo feature
/// is compiled in (the `secrecy` dependency is `optional = true` and gated
/// on those features per ADR-0002, so default release binaries contain no
/// TDM code path at all). The struct is `#[non_exhaustive]`; the
/// `tdm-*`-gated `api_key` field is therefore additive, not breaking, for
/// builds that toggle the feature set.
///
/// `docs/CAPABILITY.md` §1 specifies the type as `Secret<String>`; that is
/// the `secrecy` 0.9 spelling. The workspace pins `secrecy` 0.10, whose
/// equivalent owned-string secret type is `secrecy::SecretString`
/// (`= SecretBox<str>`). CAPABILITY.md §1 has been updated to match the
/// 0.10 API. `Debug` redacts the value.
///
/// Implements `Default` so in-crate test fixtures using
/// `TdmGrant { agree_env_var: ..., ..Default::default() }` keep compiling;
/// the default `api_key` is an empty secret. The default is a fixture
/// convenience only: [`CapabilityProfile::from_env`] never produces a grant
/// from an empty key.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct TdmGrant {
    /// The publisher API key, validated present at startup by
    /// [`CapabilityProfile::from_env`]. Wrapped in
    /// `secrecy::SecretString` so `Debug` never prints it; use
    /// `secrecy::ExposeSecret::expose_secret` at the point of use.
    ///
    /// Only present when a `tdm-*` feature is compiled in (see the
    /// type-level docs and ADR-0002).
    #[cfg(any(
        feature = "tdm-elsevier",
        feature = "tdm-aps",
        feature = "tdm-springer"
    ))]
    pub api_key: secrecy::SecretString,
    /// Which env var the user used to acknowledge the publisher's ToS
    /// (the name of the variable, e.g. `DOIGET_AGREE_TDM_ELSEVIER`, not
    /// its value).
    pub agree_env_var: String,
    /// When the agreement env var was first observed at startup. Stamped
    /// with the current UTC time at the moment the grant is constructed.
    pub agreed_at: chrono::DateTime<chrono::Utc>,
}
1008
1009impl Default for TdmGrant {
1010    fn default() -> Self {
1011        Self {
1012            #[cfg(any(
1013                feature = "tdm-elsevier",
1014                feature = "tdm-aps",
1015                feature = "tdm-springer"
1016            ))]
1017            api_key: secrecy::SecretString::from(String::new()),
1018            agree_env_var: String::new(),
1019            agreed_at: chrono::Utc::now(),
1020        }
1021    }
1022}
1023
/// Runtime gate for which sources may be invoked. See `docs/CAPABILITY.md`.
///
/// Marked `#[non_exhaustive]` so adding new capability classes (new fields)
/// is non-breaking. Code outside this crate that destructures the struct
/// must name only the documented fields and end the pattern with `..`.
///
/// **Construction**: external callers use [`CapabilityProfile::from_env()`].
/// Struct-literal construction is blocked outside this crate by
/// `#[non_exhaustive]`; this is intentional — the type's safety guarantees
/// rely on the resolution rules in `from_env`. `Default` is **not yet**
/// implemented; Phase 1 will add it once the field set stabilizes.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CapabilityProfile {
    /// Tier 1 OA sources are always permitted.
    pub oa: AlwaysOn,
    /// Tier 2 metadata access (Phase 4+).
    pub metadata: MetadataAccess,
    /// Elsevier Tier 3 grant. Tier 3 grants are populated only when both
    /// env var and feature compile-in are set.
    pub tdm_elsevier: Option<TdmGrant>,
    /// APS Tier 3 grant. Tier 3 grants are populated only when both env var
    /// and feature compile-in are set.
    pub tdm_aps: Option<TdmGrant>,
    /// Springer Tier 3 grant. Tier 3 grants are populated only when both
    /// env var and feature compile-in are set.
    pub tdm_springer: Option<TdmGrant>,
    /// Hard-coded rate limits for this process.
    pub rate_limits: RateLimits,
}
1050
/// Errors that can arise during `CapabilityProfile::from_env`.
///
/// Both variants describe a misconfigured Tier 3 (TDM) env-var pair: the
/// agreement variable and the key variable must either both be supplied
/// correctly or both be left out.
#[derive(Debug, thiserror::Error)]
pub enum CapabilityError {
    /// User set the agree env var but provided no key. See `docs/CAPABILITY.md` §2.
    ///
    /// Also raised when the key variable is set to an empty string, which
    /// the resolver treats as "not set".
    #[error("env {agree_var} is set but {key_var} is missing")]
    AgreedButNoKey {
        /// The agreement env var the user set.
        agree_var: String,
        /// The key env var that should accompany it.
        key_var: String,
    },
    /// Key env var is set but user has not agreed. See `docs/CAPABILITY.md` §2.
    ///
    /// "Not agreed" covers both an unset agreement variable and one set to
    /// any value other than the literal `1`.
    // NOTE(review): the rendered message names the agreement variable twice
    // ("key for <agree_var> is present but <agree_var> is not set to '1'")
    // because this variant carries no key-variable name. Left unchanged
    // here; callers or tests may depend on the exact text.
    #[error("key for {agree_var} is present but {agree_var} is not set to '1'")]
    KeyButNotAgreed {
        /// The agreement env var the user must set to `1` before the key takes effect.
        agree_var: String,
    },
}
1069
1070impl CapabilityProfile {
1071    /// Read the runtime profile from environment variables.
1072    ///
1073    /// Implements the resolution algorithm specified in
1074    /// [`docs/CAPABILITY.md`](../../../docs/CAPABILITY.md) §2.
1075    ///
1076    /// # Tier 1 (Open Access)
1077    ///
1078    /// Always permitted; not gated on any env var or feature.
1079    ///
1080    /// # Tier 2 (metadata)
1081    ///
1082    /// Each metadata source becomes available when its env var is set
1083    /// (presence-checked, value ignored) **and** the `metadata` Cargo feature
1084    /// was compiled in. If the env var is set but the feature is not compiled
1085    /// in, a `tracing::warn!` is emitted and the source is left disabled —
1086    /// this is not an error so that users can move binaries between machines
1087    /// (or switch feature sets between cargo invocations) without breaking
1088    /// startup. See `docs/CAPABILITY.md` §3 for the env var list.
1089    ///
1090    /// # Tier 3 (TDM)
1091    ///
1092    /// For each publisher in `{ELSEVIER, APS, SPRINGER}`, the
1093    /// `DOIGET_AGREE_TDM_<X>` agreement env var is paired with
1094    /// `DOIGET_KEY_<X>`. Resolution rules (per `docs/CAPABILITY.md` §2):
1095    ///
1096    /// - both unset → `tdm_<x> = None` (no error);
1097    /// - `agree == "1"` and key set → `Some(TdmGrant { .. })` (subject to the
1098    ///   feature gate below);
1099    /// - `agree == "1"` and key unset → [`CapabilityError::AgreedButNoKey`];
1100    /// - key set but `agree` unset (or `agree != "1"`) →
1101    ///   [`CapabilityError::KeyButNotAgreed`].
1102    ///
1103    /// When both env vars are set correctly **but** the corresponding
1104    /// `tdm-<x>` Cargo feature is not compiled in, this function emits a
1105    /// `tracing::warn!` and sets the grant to `None` rather than returning an
1106    /// error — same rationale as for the Tier 2 warn-and-skip behavior.
1107    ///
1108    /// # Precondition: tracing subscriber must be installed first
1109    ///
1110    /// Warn breadcrumbs are delivered via `tracing::warn!`. Callers MUST
1111    /// install a `tracing-subscriber` (or equivalent) **before** invoking
1112    /// this function, otherwise warnings are silently dropped. The
1113    /// `doiget-cli` binary does this in `main.rs`.
1114    ///
1115    /// # Errors
1116    ///
1117    /// Returns [`CapabilityError::AgreedButNoKey`] or
1118    /// [`CapabilityError::KeyButNotAgreed`] when the TDM env-var pair for any
1119    /// publisher is misconfigured. See the variant docs for the precise
1120    /// trigger conditions.
1121    ///
1122    /// # Note on `api_key` storage
1123    ///
1124    /// When a `tdm-*` feature is compiled in, [`TdmGrant`] carries the
1125    /// validated key as `secrecy::SecretString` (issue #153). The key is
1126    /// read exactly once here, at startup; TDM sources consume it from the
1127    /// grant and never re-read the env var at fetch time. This makes the
1128    /// grant a true startup attestation — an env mutation between startup
1129    /// and fetch can no longer silently change the credential in flight.
1130    /// See the [`TdmGrant`] doc-comment and `docs/CAPABILITY.md` §1/§2.
1131    pub fn from_env() -> Result<Self, CapabilityError> {
1132        // Issue #153: the validated API key is now threaded through
1133        // `TdmGrant` (as `secrecy::SecretString`, behind the `tdm-*`
1134        // features) by `resolve_tdm_grant` below — sources no longer
1135        // re-read the key env var at fetch time. See the `TdmGrant`
1136        // doc-comment and `docs/CAPABILITY.md` §1/§2.
1137
1138        // -- Tier 2 metadata -------------------------------------------------
1139        let metadata = MetadataAccess {
1140            openalex: resolve_metadata_flag(
1141                "DOIGET_ENABLE_OPENALEX",
1142                "metadata",
1143                cfg!(feature = "metadata"),
1144            ),
1145            semantic_scholar: resolve_metadata_flag(
1146                "DOIGET_ENABLE_S2",
1147                "metadata",
1148                cfg!(feature = "metadata"),
1149            ),
1150            doaj: resolve_metadata_flag(
1151                "DOIGET_ENABLE_DOAJ",
1152                "metadata",
1153                cfg!(feature = "metadata"),
1154            ),
1155        };
1156
1157        // -- Tier 3 TDM grants ----------------------------------------------
1158        let tdm_elsevier = resolve_tdm_grant(
1159            "DOIGET_AGREE_TDM_ELSEVIER",
1160            "DOIGET_KEY_ELSEVIER",
1161            "tdm-elsevier",
1162            cfg!(feature = "tdm-elsevier"),
1163        )?;
1164        let tdm_aps = resolve_tdm_grant(
1165            "DOIGET_AGREE_TDM_APS",
1166            "DOIGET_KEY_APS",
1167            "tdm-aps",
1168            cfg!(feature = "tdm-aps"),
1169        )?;
1170        let tdm_springer = resolve_tdm_grant(
1171            "DOIGET_AGREE_TDM_SPRINGER",
1172            "DOIGET_KEY_SPRINGER",
1173            "tdm-springer",
1174            cfg!(feature = "tdm-springer"),
1175        )?;
1176
1177        Ok(Self {
1178            oa: AlwaysOn,
1179            metadata,
1180            tdm_elsevier,
1181            tdm_aps,
1182            tdm_springer,
1183            rate_limits: RateLimits::HARD_CODED,
1184        })
1185    }
1186}
1187
1188/// Resolve a Tier 2 metadata flag from its env var and compile-in feature.
1189///
1190/// Returns `true` only when both the env var is present and the feature is
1191/// compiled in. When the env var is set without the feature, emits a
1192/// `tracing::warn!` and returns `false` — see [`CapabilityProfile::from_env`]
1193/// for the rationale (binaries may move between hosts / feature sets).
1194fn resolve_metadata_flag(env_var: &str, feature: &str, feature_enabled: bool) -> bool {
1195    let env_set = std::env::var_os(env_var).is_some();
1196    match (env_set, feature_enabled) {
1197        (true, true) => true,
1198        (true, false) => {
1199            tracing::warn!(
1200                env_var,
1201                feature,
1202                "{} is set but feature {} was not compiled in; the source will be unavailable",
1203                env_var,
1204                feature
1205            );
1206            false
1207        }
1208        (false, _) => false,
1209    }
1210}
1211
1212/// Resolve a Tier 3 TDM grant from the `agree`/`key` env-var pair and the
1213/// per-publisher Cargo feature.
1214///
1215/// Implements the rules in `docs/CAPABILITY.md` §2:
1216///
1217/// - both unset → `Ok(None)`.
1218/// - `agree == "1"` and `key` set → `Ok(Some(TdmGrant { .. }))` (when the
1219///   feature is enabled), or warn-and-`Ok(None)` (when the feature is not
1220///   compiled in).
1221/// - `agree == "1"` and `key` unset →
1222///   [`CapabilityError::AgreedButNoKey`].
1223/// - `key` set and `agree` unset OR `agree` set to anything other than `"1"`
1224///   → [`CapabilityError::KeyButNotAgreed`].
1225fn resolve_tdm_grant(
1226    agree_var: &str,
1227    key_var: &str,
1228    feature: &str,
1229    feature_enabled: bool,
1230) -> Result<Option<TdmGrant>, CapabilityError> {
1231    // `agree` is "agreed" iff the value is exactly the literal "1"; any other
1232    // value (including "true", "yes", empty) is treated as not-agreed per
1233    // `docs/CAPABILITY.md` §2.
1234    let agree_raw = std::env::var(agree_var).ok();
1235    let agreed = matches!(agree_raw.as_deref(), Some("1"));
1236    let agree_present = agree_raw.is_some();
1237    // Read the key value once, at startup, so the validated key flows
1238    // through `TdmGrant` and sources never re-read the env (issue #153).
1239    // An empty value is treated as "not set" — an empty API key cannot
1240    // authenticate, and silently constructing a grant around it would
1241    // mask the misconfiguration the AgreedButNoKey rule exists to surface.
1242    let key_value = std::env::var(key_var).ok().filter(|v| !v.is_empty());
1243
1244    match (agreed, agree_present, key_value) {
1245        (true, _, Some(key)) => {
1246            if feature_enabled {
1247                Ok(Some(build_tdm_grant(agree_var, key)))
1248            } else {
1249                // `key` is dropped here; under no-tdm builds it is the only
1250                // consumer of the owned `String`, which is intended.
1251                let _ = key;
1252                tracing::warn!(
1253                    env_var = agree_var,
1254                    feature,
1255                    "{} is set but feature {} was not compiled in; the source will be unavailable",
1256                    agree_var,
1257                    feature
1258                );
1259                Ok(None)
1260            }
1261        }
1262        (true, _, None) => Err(CapabilityError::AgreedButNoKey {
1263            agree_var: agree_var.to_string(),
1264            key_var: key_var.to_string(),
1265        }),
1266        // agree set to non-"1", key also set: KeyButNotAgreed (the key would
1267        // otherwise authorize the source without an explicit agreement).
1268        (false, true, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1269            agree_var: agree_var.to_string(),
1270        }),
1271        // agree unset, key set: KeyButNotAgreed (same rule).
1272        (false, false, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1273            agree_var: agree_var.to_string(),
1274        }),
1275        // agree set to non-"1" and no key: treat as no-grant. The user
1276        // expressed something but did not opt in and provided no credential,
1277        // so silent skip is the safe default (no source enabled).
1278        (false, true, None) => Ok(None),
1279        // Neither env var set: no grant, no error.
1280        (false, false, None) => Ok(None),
1281    }
1282}
1283
1284/// Construct a [`TdmGrant`] from the validated agreement var and key value.
1285///
1286/// Split out so the `tdm-*`-gated `api_key` field is populated in exactly
1287/// one place. When no `tdm-*` feature is compiled in the `key` is consumed
1288/// (dropped) here — the grant is still produced so that startup attestation
1289/// behavior (the warn-and-skip path) does not change shape between feature
1290/// sets.
1291fn build_tdm_grant(agree_var: &str, key: String) -> TdmGrant {
1292    #[cfg(any(
1293        feature = "tdm-elsevier",
1294        feature = "tdm-aps",
1295        feature = "tdm-springer"
1296    ))]
1297    {
1298        TdmGrant {
1299            api_key: secrecy::SecretString::from(key),
1300            agree_env_var: agree_var.to_string(),
1301            agreed_at: chrono::Utc::now(),
1302        }
1303    }
1304    #[cfg(not(any(
1305        feature = "tdm-elsevier",
1306        feature = "tdm-aps",
1307        feature = "tdm-springer"
1308    )))]
1309    {
1310        let _ = key;
1311        TdmGrant {
1312            agree_env_var: agree_var.to_string(),
1313            agreed_at: chrono::Utc::now(),
1314        }
1315    }
1316}
1317
1318// ---------------------------------------------------------------------------
1319// Tests — one smoke test per legally-load-bearing constant. See
1320// `docs/LEGAL.md` §6 safeguard 8 and `docs/PHASES.md` §4. These also keep the
1321// `cargo test --workspace` job from being a false-green during Phase 0.
1322// ---------------------------------------------------------------------------
1323
1324// `expect`/`unwrap` are idiomatic in tests where panics double as assertions.
1325// The workspace lints deny them in production code; relax for the test module
1326// only.
1327#[cfg(test)]
1328#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1329mod tests {
1330    use super::*;
1331
#[test]
fn rate_limits_hard_coded_match_legal_safeguards() {
    // These exact values are the ones named by docs/LEGAL.md §6 safeguard 8.
    let limits = RateLimits::HARD_CODED;

    assert_eq!(limits.max_concurrent_fetches(), 5);

    // Float limit: compare within machine epsilon rather than with `==`.
    let rate_drift = (limits.max_fetches_per_second() - 5.0).abs();
    assert!(rate_drift < f32::EPSILON);

    assert_eq!(limits.per_source_backoff_ms(), 200);
}
1339
#[test]
fn batch_size_caps_match_security_doc() {
    // Values come from docs/SECURITY.md §1.4 + docs/MCP_TOOLS.md.
    assert_eq!(MCP_BATCH_MAX_SIZE, 100, "MCP_BATCH_MAX_SIZE");
    assert_eq!(MCP_QUEUE_DEPTH_MAX, 100, "MCP_QUEUE_DEPTH_MAX");
    assert_eq!(DOI_SUFFIX_MAX_LEN, 256, "DOI_SUFFIX_MAX_LEN");
    assert_eq!(MCP_STDIN_EOF_SHUTDOWN_SEC, 5, "MCP_STDIN_EOF_SHUTDOWN_SEC");

    // Slice 2: MAX_BATCH_REFS is the spec-language alias of
    // MCP_BATCH_MAX_SIZE, so the two must stay numerically equal.
    assert_eq!(
        MAX_BATCH_REFS, MCP_BATCH_MAX_SIZE,
        "MAX_BATCH_REFS must alias MCP_BATCH_MAX_SIZE"
    );
}
1351
#[test]
fn schema_version_is_pinned_to_1_0() {
    // docs/STORE.md §3: Phase 0/1 writes exactly "1.0". Moving to 1.1
    // (minor, backward-compat additions) means updating this test and the
    // cross-tool compat fixtures in the same change.
    let pinned = "1.0";
    assert_eq!(SCHEMA_VERSION, pinned);
}
1359
1360    // -----------------------------------------------------------------
1361    // CapabilityProfile::from_env — Phase 1 resolution algorithm tests.
1362    //
1363    // These tests mutate process-global env state via std::env::set_var /
1364    // remove_var, so each test holds an `EnvGuard` RAII drop guard that
1365    // captures the pre-test value of every env var it touches and restores
1366    // it on drop (even on panic). They also use `#[serial_test::serial]` so
1367    // that no two tests in this module touch env state concurrently — the
1368    // workspace's test runner defaults to multi-threaded.
1369    //
1370    // Spec: docs/CAPABILITY.md §2 (resolution algorithm) and §3 (env var
1371    // reference table).
1372    // -----------------------------------------------------------------
1373
/// Scoped override of a single environment variable.
///
/// Creating a guard records what `var` held at that moment (possibly
/// nothing); dropping the guard puts that state back, re-setting the old
/// value or removing the variable if it was absent. Use one guard per
/// touched var per test.
struct EnvGuard {
    var: &'static str,
    prior: Option<std::ffi::OsString>,
}

impl EnvGuard {
    /// Record the current state of `var`, then remove it. Call `set`
    /// afterwards to install a value on top.
    fn unset(var: &'static str) -> Self {
        Self::install(var, None)
    }

    /// Record the current state of `var`, then set it to `value`.
    fn set(var: &'static str, value: &str) -> Self {
        Self::install(var, Some(value))
    }

    /// Shared constructor: snapshot `var`, apply `replacement` (`None`
    /// removes the variable), and only then build the guard.
    fn install(var: &'static str, replacement: Option<&str>) -> Self {
        let prior = std::env::var_os(var);
        // Env mutation: the tests using this guard are serialized via
        // `#[serial_test::serial]`. Changing the environment is sound when
        // no other thread reads or writes it concurrently.
        match replacement {
            Some(value) => std::env::set_var(var, value),
            None => std::env::remove_var(var),
        }
        Self { var, prior }
    }
}

impl Drop for EnvGuard {
    /// Restore the state recorded at construction.
    fn drop(&mut self) {
        if let Some(previous) = self.prior.as_ref() {
            std::env::set_var(self.var, previous);
        } else {
            std::env::remove_var(self.var);
        }
    }
}
1408
1409    /// Convenience: unset every Tier 2 / Tier 3 env var the resolution
1410    /// algorithm reads, returning a vector of guards that restore them on
1411    /// drop. Callers can then `EnvGuard::set` individual vars on top.
1412    fn unset_all_capability_env_vars() -> Vec<EnvGuard> {
1413        [
1414            "DOIGET_ENABLE_OPENALEX",
1415            "DOIGET_ENABLE_S2",
1416            "DOIGET_ENABLE_DOAJ",
1417            "DOIGET_AGREE_TDM_ELSEVIER",
1418            "DOIGET_KEY_ELSEVIER",
1419            "DOIGET_AGREE_TDM_APS",
1420            "DOIGET_KEY_APS",
1421            "DOIGET_AGREE_TDM_SPRINGER",
1422            "DOIGET_KEY_SPRINGER",
1423        ]
1424        .iter()
1425        .map(|v| EnvGuard::unset(v))
1426        .collect()
1427    }
1428
#[test]
#[serial_test::serial]
fn from_env_no_env_vars_set_returns_tier_1_only() {
    // Rule: when every relevant env var is unset, the profile has no TDM
    // grant and no metadata source enabled, while the hard-coded rate
    // limits still apply. (Replaces the old Phase 0 stub test.)
    let _restore = unset_all_capability_env_vars();

    let profile = CapabilityProfile::from_env().expect("clean env never errors");

    let grants = [
        &profile.tdm_elsevier,
        &profile.tdm_aps,
        &profile.tdm_springer,
    ];
    assert!(grants.iter().all(|grant| grant.is_none()));

    let metadata_flags = [
        profile.metadata.openalex,
        profile.metadata.semantic_scholar,
        profile.metadata.doaj,
    ];
    assert!(metadata_flags.iter().all(|&enabled| !enabled));

    assert_eq!(profile.rate_limits.max_concurrent_fetches(), 5);
}
1446
#[test]
#[serial_test::serial]
fn from_env_no_tdm_returns_tier_1_profile() {
    // Rule (CAPABILITY.md §2): with every TDM env var unset, resolution
    // succeeds and each `tdm_*` field is `None`.
    let _restore = unset_all_capability_env_vars();

    let profile = CapabilityProfile::from_env().expect("no TDM env -> Ok");

    for grant in [
        &profile.tdm_elsevier,
        &profile.tdm_aps,
        &profile.tdm_springer,
    ]
    .iter()
    {
        assert!(grant.is_none());
    }
}
1459
1460    #[test]
1461    #[serial_test::serial]
1462    fn from_env_agreed_but_no_key_errs() {
1463        // Rule (CAPABILITY.md §2): agree=1 + key unset -> AgreedButNoKey.
1464        let _g = unset_all_capability_env_vars();
1465        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1466
1467        let result = CapabilityProfile::from_env();
1468        match result {
1469            Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
1470                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1471                assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
1472            }
1473            other => panic!("expected AgreedButNoKey, got {:?}", other),
1474        }
1475    }
1476
1477    #[test]
1478    #[serial_test::serial]
1479    fn from_env_agreed_but_empty_key_errs() {
1480        // Security-adjacent (PR #161 review): an *empty* key string is
1481        // treated as "not set" by `resolve_tdm_grant`. With agree=1 and
1482        // DOIGET_KEY_ELSEVIER="" the misconfiguration must surface as
1483        // AgreedButNoKey, not silently build a grant around an empty
1484        // secret that could never authenticate.
1485        let _g = unset_all_capability_env_vars();
1486        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1487        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
1488
1489        let result = CapabilityProfile::from_env();
1490        match result {
1491            Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
1492                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1493                assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
1494            }
1495            other => panic!("expected AgreedButNoKey for empty key, got {:?}", other),
1496        }
1497    }
1498
1499    #[test]
1500    #[serial_test::serial]
1501    fn from_env_empty_key_without_agree_is_no_grant() {
1502        // Security-adjacent (PR #161 review): an empty key with the
1503        // agree var unset is indistinguishable from "no key at all".
1504        // It must resolve to Ok(None) (no grant, no error) — an empty
1505        // string must NOT trip the KeyButNotAgreed leaked-credential
1506        // rule, since there is no credential.
1507        let _g = unset_all_capability_env_vars();
1508        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
1509
1510        let p = CapabilityProfile::from_env()
1511            .expect("empty key + agree unset must be Ok(None), not an error");
1512        assert!(
1513            p.tdm_elsevier.is_none(),
1514            "empty DOIGET_KEY_ELSEVIER with no agree var must yield no grant"
1515        );
1516        assert!(p.tdm_aps.is_none());
1517        assert!(p.tdm_springer.is_none());
1518    }
1519
1520    #[test]
1521    #[serial_test::serial]
1522    fn from_env_key_but_not_agreed_errs() {
1523        // Rule (CAPABILITY.md §2): key set + agree unset -> KeyButNotAgreed.
1524        // A leaked DOIGET_KEY_ELSEVIER must not silently enable a source.
1525        let _g = unset_all_capability_env_vars();
1526        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1527
1528        let result = CapabilityProfile::from_env();
1529        match result {
1530            Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
1531                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1532            }
1533            other => panic!("expected KeyButNotAgreed, got {:?}", other),
1534        }
1535    }
1536
1537    #[test]
1538    #[serial_test::serial]
1539    fn from_env_agree_not_one_errs() {
1540        // Rule (CAPABILITY.md §2): the agree var must be exactly "1". Any
1541        // other value (here: "true") is treated as not-agreed; combined
1542        // with a key set, that triggers KeyButNotAgreed.
1543        let _g = unset_all_capability_env_vars();
1544        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "true");
1545        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1546
1547        let result = CapabilityProfile::from_env();
1548        match result {
1549            Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
1550                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1551            }
1552            other => panic!("expected KeyButNotAgreed, got {:?}", other),
1553        }
1554    }
1555
1556    #[test]
1557    #[serial_test::serial]
1558    fn from_env_both_set_correctly_returns_grant() {
1559        // Rule (CAPABILITY.md §2): agree=1 + key set -> Some(TdmGrant) when
1560        // the corresponding feature is compiled in; else None (warn-and-skip).
1561        // The feature gate for elsevier is `tdm-elsevier`; this test asserts
1562        // both branches via `cfg!`.
1563        let _g = unset_all_capability_env_vars();
1564        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1565        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1566
1567        let p = CapabilityProfile::from_env().expect("agree=1 + key -> Ok");
1568
1569        if cfg!(feature = "tdm-elsevier") {
1570            let grant = p
1571                .tdm_elsevier
1572                .as_ref()
1573                .expect("feature tdm-elsevier compiled in -> Some(TdmGrant)");
1574            assert_eq!(grant.agree_env_var, "DOIGET_AGREE_TDM_ELSEVIER");
1575            // Issue #153 / PR #161 review: prove the key was actually
1576            // threaded into TdmGrant::api_key at startup (not just that
1577            // the agree var was recorded). The field is cfg-gated to
1578            // the same `tdm-*` set as the assertion below, so gate the
1579            // check identically.
1580            #[cfg(any(
1581                feature = "tdm-elsevier",
1582                feature = "tdm-aps",
1583                feature = "tdm-springer"
1584            ))]
1585            {
1586                use secrecy::ExposeSecret as _;
1587                assert_eq!(
1588                    grant.api_key.expose_secret(),
1589                    "sk-test",
1590                    "the DOIGET_KEY_ELSEVIER value must be threaded into \
1591                     TdmGrant::api_key (issue #153)"
1592                );
1593            }
1594        } else {
1595            assert!(
1596                p.tdm_elsevier.is_none(),
1597                "feature tdm-elsevier NOT compiled in -> None (warn-and-skip)"
1598            );
1599        }
1600    }
1601
1602    #[test]
1603    #[serial_test::serial]
1604    fn from_env_metadata_env_warns_without_feature() {
1605        // Rule (CAPABILITY.md §2): metadata env var without the `metadata`
1606        // feature -> source disabled (warn-and-skip, not an error).
1607        // We don't capture the tracing warn here; we just assert the field
1608        // is `false` when the feature is absent and `true` when present.
1609        let _g = unset_all_capability_env_vars();
1610        let _enable = EnvGuard::set("DOIGET_ENABLE_OPENALEX", "1");
1611
1612        let p = CapabilityProfile::from_env().expect("metadata env never errors");
1613
1614        if cfg!(feature = "metadata") {
1615            assert!(p.metadata.openalex);
1616        } else {
1617            assert!(!p.metadata.openalex);
1618        }
1619    }
1620
1621    // -----------------------------------------------------------------
1622    // Safekey reference vectors (docs/SAFEKEY.md §3, NORMATIVE).
1623    //
    // The vectors.json file is the binding cross-tool contract with
    // BiblioFetch.jl: every entry MUST round-trip identically through
    // both implementations. Phase 0 initially shipped 13 entries while
    // the full 100-entry set was gated on the BiblioFetch.jl pre-flight
    // (ADR-0007 Status: Proposed at the time of the Phase 1
    // implementation); the reference-vector test below now requires
    // exactly 100 entries.
    //
    // The vectors test was written before `Ref::parse` (W3-A work) was
    // available and still bypasses it: it branches on the input prefix
    // (`doi:` / `arxiv:`) and constructs the variant directly via the
    // in-crate `pub(crate)` tuple constructor.
1634    // -----------------------------------------------------------------
1635
1636    #[derive(Deserialize)]
1637    struct SafekeyVector {
1638        input: String,
1639        expected: String,
1640    }
1641
1642    #[derive(Deserialize)]
1643    struct SafekeyVectorFile {
1644        vectors: Vec<SafekeyVector>,
1645    }
1646
1647    /// In-crate test helper: build a `Ref` from the user-facing form used
1648    /// in the vectors file, by stripping the `doi:` / `arxiv:` URI scheme
1649    /// and wrapping the remainder. This bypasses validation; it is fine
1650    /// here because the vectors are hand-curated and the test asserts the
1651    /// derivation algorithm, not parser semantics.
1652    fn ref_from_vector_input(input: &str) -> Ref {
1653        if let Some(rest) = input.strip_prefix("doi:") {
1654            Ref::Doi(Doi(rest.to_string()))
1655        } else if let Some(rest) = input.strip_prefix("arxiv:") {
1656            Ref::Arxiv(ArxivId(rest.to_string()))
1657        } else {
1658            panic!(
1659                "vectors.json entry has unknown ref scheme (expected doi: or arxiv: prefix): {}",
1660                input
1661            );
1662        }
1663    }
1664
1665    #[test]
1666    fn safekey_matches_reference_vectors() {
1667        // include_str! resolves relative to the file containing this macro
1668        // call (crates/doiget-core/src/lib.rs), so we go up three levels
1669        // to reach the workspace root, then down to tests/fixtures.
1670        let raw = include_str!("../../../tests/fixtures/safekey/vectors.json");
1671        let parsed: SafekeyVectorFile =
1672            serde_json::from_str(raw).expect("vectors.json is valid JSON matching schema");
1673
1674        // Phase 0 final ships the full NORMATIVE 100-entry set
1675        // (docs/SAFEKEY.md §5). The fixture is the binding cross-tool
1676        // contract with BiblioFetch.jl; tightening the count guard to
1677        // `== 100` ensures the set cannot silently grow or shrink without
1678        // a coordinated ADR bump (per docs/SAFEKEY.md status block).
1679        assert_eq!(
1680            parsed.vectors.len(),
1681            100,
1682            "vectors.json MUST be exactly 100 entries (NORMATIVE per docs/SAFEKEY.md §5); got {}",
1683            parsed.vectors.len()
1684        );
1685
1686        let mut failures: Vec<String> = Vec::new();
1687        for v in &parsed.vectors {
1688            let r = ref_from_vector_input(&v.input);
1689            let got = r.safekey().as_str().to_string();
1690            if got != v.expected {
1691                failures.push(format!(
1692                    "input={:?}\n  expected={:?}\n  got     ={:?}",
1693                    v.input, v.expected, got
1694                ));
1695            }
1696        }
1697
1698        assert!(
1699            failures.is_empty(),
1700            "{}/{} safekey reference vectors failed:\n{}",
1701            failures.len(),
1702            parsed.vectors.len(),
1703            failures.join("\n")
1704        );
1705    }
1706
1707    #[test]
1708    fn safekey_truncates_long_inputs_with_sha256_suffix() {
1709        // Construct a synthetic DOI whose suffix produces a `trimmed` longer than
1710        // 192 chars after step 3. 220 ASCII-safe chars + the `doi_10.1234/`
1711        // prefix easily exceeds 192. The resulting key must be exactly 201 chars:
1712        // 192 (trimmed prefix) + 1 (`_` separator) + 8 (hex of first 4 bytes of
1713        // SHA-256(raw)). Per docs/SAFEKEY.md §3 step 5.
1714        let suffix = "a".repeat(220);
1715        let doi = Doi(format!("10.1234/{}", suffix));
1716        let key = Ref::Doi(doi).safekey();
1717        let s = key.as_str();
1718
1719        // Shape: <192 ASCII chars from {A-Za-z0-9._-}> + "_" + <8 hex chars>
1720        assert_eq!(
1721            s.len(),
1722            201,
1723            "expected 201-char truncated key, got {}: {}",
1724            s.len(),
1725            s
1726        );
1727        assert_eq!(&s[192..193], "_", "expected '_' separator at byte 192");
1728        let hash_part = &s[193..];
1729        assert_eq!(hash_part.len(), 8, "hash suffix must be 8 hex chars");
1730        assert!(
1731            hash_part
1732                .chars()
1733                .all(|c| c.is_ascii_hexdigit() && !c.is_ascii_uppercase()),
1734            "hash suffix must be lowercase hex: {}",
1735            hash_part
1736        );
1737
1738        // Determinism: same input twice must produce the same key.
1739        let key2 = Ref::Doi(Doi(format!("10.1234/{}", "a".repeat(220)))).safekey();
1740        assert_eq!(s, key2.as_str(), "safekey must be deterministic");
1741
1742        // Hash content: must equal hex(sha256(raw)[..4]) where raw is the
1743        // pre-escape prefixed form per docs/SAFEKEY.md §3 step 5.
1744        use sha2::Digest;
1745        let raw = format!("doi_10.1234/{}", "a".repeat(220));
1746        let expected_hash = {
1747            let digest = sha2::Sha256::digest(raw.as_bytes());
1748            format!(
1749                "{:02x}{:02x}{:02x}{:02x}",
1750                digest[0], digest[1], digest[2], digest[3]
1751            )
1752        };
1753        assert_eq!(
1754            hash_part, expected_hash,
1755            "hash must match SHA-256 of raw form"
1756        );
1757    }
1758
1759    // -----------------------------------------------------------------
1760    // Doi::parse / ArxivId::parse / Ref::parse — Phase 1 W3-A.
1761    // Spec: docs/SECURITY.md §1.1 (input validation). The rejection
1762    // category set is the binding contract; each test case below names
1763    // which rule it exercises in a comment.
1764    // -----------------------------------------------------------------
1765
1766    // ---- Doi::parse happy paths (≥6) --------------------------------
1767
1768    #[test]
1769    fn doi_parse_accepts_bare_canonical_form() {
1770        // Rule: "10.<registrant>/<suffix>" is the canonical bare form.
1771        let d = Doi::parse("10.1234/example").expect("canonical bare DOI");
1772        assert_eq!(d.as_str(), "10.1234/example");
1773    }
1774
1775    #[test]
1776    fn doi_parse_accepts_doi_uri_scheme() {
1777        // Rule: the `doi:` scheme is stripped at construction; as_str
1778        // never carries it (matches docs/SAFEKEY.md §3 step 0).
1779        let d = Doi::parse("doi:10.1234/example").expect("doi: scheme accepted");
1780        assert_eq!(d.as_str(), "10.1234/example");
1781    }
1782
1783    #[test]
1784    fn doi_parse_accepts_complex_real_world_suffix() {
1785        // Rule: suffix charset includes `.`, `(`, `)`, `-`. From a real
1786        // PhysRevLett DOI used elsewhere in the test fixture set.
1787        let d = Doi::parse("10.1103/PhysRevLett.130.200601").expect("real-world PhysRev DOI");
1788        assert_eq!(d.as_str(), "10.1103/PhysRevLett.130.200601");
1789    }
1790
1791    #[test]
1792    fn doi_parse_accepts_parens_in_suffix() {
1793        // Rule: `(` and `)` are explicitly listed in the spec charset.
1794        let d = Doi::parse("10.1016/S0370-1573(98)00122-3").expect("parens in suffix");
1795        assert_eq!(d.as_str(), "10.1016/S0370-1573(98)00122-3");
1796    }
1797
1798    #[test]
1799    fn doi_parse_accepts_nested_slashes_in_suffix() {
1800        // Rule: `/` is a suffix character; only the first `/` is the
1801        // registrant/suffix separator.
1802        let d = Doi::parse("10.1234/foo/bar/baz").expect("nested slashes");
1803        assert_eq!(d.as_str(), "10.1234/foo/bar/baz");
1804    }
1805
1806    #[test]
1807    fn doi_parse_accepts_colon_in_legacy_kluwer_suffix() {
1808        // #194: legacy Kluwer/Springer DOIs (`10.1023/A:NNNNNNNNNN`)
1809        // carry a `:` in the suffix. Real DOI: "Entanglement, Quantum
1810        // Phase Transitions, and DMRG" (Kluwer, 2002).
1811        let d = Doi::parse("10.1023/A:1019601218492").expect("legacy Kluwer colon DOI");
1812        assert_eq!(d.as_str(), "10.1023/A:1019601218492");
1813    }
1814
1815    #[test]
1816    fn doi_parse_accepts_colon_in_edp_jphys_suffix() {
1817        // #194: EDP Sciences / Journal de Physique legacy corpus uses
1818        // `10.1051/jphys:NNNNNNNNNNNNNNNNN`. Real DOIs from the dogfood
1819        // Ising-RG run; both resolve at doi.org and via Crossref.
1820        let d = Doi::parse("10.1051/jphys:0198900500120136500").expect("EDP jphys colon DOI");
1821        assert_eq!(d.as_str(), "10.1051/jphys:0198900500120136500");
1822        let d2 = Doi::parse("doi:10.1051/jphys:0198500460100164500").expect("scheme + colon");
1823        assert_eq!(d2.as_str(), "10.1051/jphys:0198500460100164500");
1824    }
1825
1826    #[test]
1827    fn doi_parse_rejects_semicolon_in_suffix() {
1828        // #194 / ADR-0026: `;` is the natural ASCII neighbor of `:` and
1829        // is explicitly EXCLUDED from the suffix charset extension
1830        // (ADR-0026 §"Out of scope"). This test guards against an
1831        // over-broad `matches!` arm (e.g. an accidental `':'..=';'` range
1832        // typo) re-admitting `;` along with `:`.
1833        let result = Doi::parse("10.1234/foo;bar");
1834        assert!(
1835            matches!(result, Err(RefParseError::InvalidDoiSuffixChar { ch: ';' })),
1836            "expected InvalidDoiSuffixChar with ch=';', got {:?}",
1837            result
1838        );
1839    }
1840
1841    #[test]
1842    fn doi_parse_accepts_suffix_at_max_len_boundary() {
1843        // Rule: a suffix of exactly DOI_SUFFIX_MAX_LEN bytes is accepted;
1844        // 1 byte more is rejected (covered separately below).
1845        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN);
1846        let input = format!("10.1234/{}", suffix);
1847        let d = Doi::parse(&input).expect("suffix at max len");
1848        assert_eq!(d.as_str().len(), "10.1234/".len() + DOI_SUFFIX_MAX_LEN);
1849    }
1850
1851    #[test]
1852    fn doi_parse_uri_scheme_is_case_insensitive() {
1853        // Rule: be lenient on scheme casing; the scheme is stripped
1854        // either way so the stored form is identical.
1855        let d = Doi::parse("DOI:10.1234/example").expect("uppercase scheme");
1856        assert_eq!(d.as_str(), "10.1234/example");
1857    }
1858
1859    // ---- Doi::parse rejection paths (≥6) ----------------------------
1860
1861    #[test]
1862    fn doi_parse_rejects_missing_10_prefix() {
1863        // Rule: must start with "10." literal.
1864        assert_eq!(
1865            Doi::parse("11.1234/example"),
1866            Err(RefParseError::MissingDoiPrefix)
1867        );
1868    }
1869
1870    #[test]
1871    fn doi_parse_rejects_empty_input() {
1872        // Rule: empty inputs are not valid DOIs.
1873        assert_eq!(Doi::parse(""), Err(RefParseError::Empty));
1874    }
1875
1876    #[test]
1877    fn doi_parse_rejects_missing_suffix_separator() {
1878        // Rule: must contain a `/` between registrant and suffix.
1879        assert_eq!(
1880            Doi::parse("10.1234"),
1881            Err(RefParseError::MissingDoiSuffixSeparator)
1882        );
1883    }
1884
1885    #[test]
1886    fn doi_parse_rejects_empty_suffix() {
1887        // Rule: suffix must be non-empty.
1888        assert_eq!(Doi::parse("10.1234/"), Err(RefParseError::EmptyDoiSuffix));
1889    }
1890
1891    #[test]
1892    fn doi_parse_rejects_invalid_registrant_too_short() {
1893        // Rule: registrant must be 4–9 digits.
1894        assert_eq!(
1895            Doi::parse("10.12/example"),
1896            Err(RefParseError::InvalidDoiRegistrant)
1897        );
1898    }
1899
1900    #[test]
1901    fn doi_parse_rejects_non_digit_registrant() {
1902        // Rule: registrant chars must all be ASCII digits.
1903        assert_eq!(
1904            Doi::parse("10.12ab/example"),
1905            Err(RefParseError::InvalidDoiRegistrant)
1906        );
1907    }
1908
1909    #[test]
1910    fn doi_parse_rejects_control_char_in_suffix() {
1911        // Rule (from docs/SECURITY.md §1.1, log-injection mitigation):
1912        // control chars are not in the suffix charset; reject before they
1913        // can reach the provenance log.
1914        let result = Doi::parse("10.1234/foo\nbar");
1915        assert!(
1916            matches!(
1917                result,
1918                Err(RefParseError::InvalidDoiSuffixChar { ch: '\n' })
1919            ),
1920            "got {:?}",
1921            result
1922        );
1923    }
1924
1925    #[test]
1926    fn doi_parse_rejects_suffix_over_max_len() {
1927        // Rule: DOI_SUFFIX_MAX_LEN + 1 bytes is rejected.
1928        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 1);
1929        let input = format!("10.1234/{}", suffix);
1930        let result = Doi::parse(&input);
1931        match result {
1932            Err(RefParseError::DoiSuffixTooLong { len, max }) => {
1933                assert_eq!(len, DOI_SUFFIX_MAX_LEN + 1);
1934                assert_eq!(max, DOI_SUFFIX_MAX_LEN);
1935            }
1936            other => panic!("expected DoiSuffixTooLong, got {:?}", other),
1937        }
1938    }
1939
1940    #[test]
1941    fn doi_parse_rejects_non_ascii_in_suffix() {
1942        // Rule: spec charset is ASCII-only; non-ASCII becomes an
1943        // InvalidDoiSuffixChar (consistent with safekey behavior of
1944        // collapsing such chars to '_', which is a downstream concern).
1945        let result = Doi::parse("10.1234/物理学");
1946        assert!(
1947            matches!(result, Err(RefParseError::InvalidDoiSuffixChar { .. })),
1948            "got {:?}",
1949            result
1950        );
1951    }
1952
1953    // ---- ArxivId::parse happy paths (≥6) ----------------------------
1954
1955    #[test]
1956    fn arxiv_parse_accepts_new_style_4_digit_seq() {
1957        // Rule: new-style YYMM.NNNN (4-digit sequence number).
1958        let a = ArxivId::parse("0704.0001").expect("new-style 4-digit seq");
1959        assert_eq!(a.as_str(), "0704.0001");
1960    }
1961
1962    #[test]
1963    fn arxiv_parse_accepts_new_style_5_digit_seq() {
1964        // Rule: new-style YYMM.NNNNN (5-digit sequence number, post-2015).
1965        let a = ArxivId::parse("2401.12345").expect("new-style 5-digit seq");
1966        assert_eq!(a.as_str(), "2401.12345");
1967    }
1968
1969    #[test]
1970    fn arxiv_parse_accepts_new_style_with_version() {
1971        // Rule: optional `vN` version suffix.
1972        let a = ArxivId::parse("2401.12345v2").expect("with version");
1973        assert_eq!(a.as_str(), "2401.12345v2");
1974    }
1975
1976    #[test]
1977    fn arxiv_parse_accepts_old_style() {
1978        // Rule: old-style subject-class/YYMMNNN.
1979        let a = ArxivId::parse("cond-mat/9501001").expect("old-style cond-mat");
1980        assert_eq!(a.as_str(), "cond-mat/9501001");
1981    }
1982
1983    #[test]
1984    fn arxiv_parse_accepts_old_style_with_subclass_and_version() {
1985        // Rule: old-style subject-class may have a `.XX` two-upper subclass
1986        // and an optional `vN` suffix.
1987        let a = ArxivId::parse("astro-ph.CO/0703123v2").expect("old-style with subclass + version");
1988        assert_eq!(a.as_str(), "astro-ph.CO/0703123v2");
1989    }
1990
1991    #[test]
1992    fn arxiv_parse_accepts_arxiv_uri_scheme() {
1993        // Rule: `arxiv:` / `arXiv:` scheme is stripped at construction.
1994        let a = ArxivId::parse("arxiv:2401.12345").expect("arxiv: scheme");
1995        assert_eq!(a.as_str(), "2401.12345");
1996    }
1997
1998    #[test]
1999    fn arxiv_parse_accepts_arxiv_uri_scheme_mixed_case() {
2000        // Rule: scheme case-insensitive; matches the `arXiv:` form named
2001        // in docs/MCP_TOOLS.md.
2002        let a = ArxivId::parse("arXiv:2401.12345v2").expect("arXiv: scheme");
2003        assert_eq!(a.as_str(), "2401.12345v2");
2004    }
2005
2006    // ---- ArxivId::parse rejection paths (≥6) ------------------------
2007
2008    #[test]
2009    fn arxiv_parse_rejects_empty_input() {
2010        // Rule: empty rejected up-front.
2011        assert_eq!(ArxivId::parse(""), Err(RefParseError::Empty));
2012    }
2013
2014    #[test]
2015    fn arxiv_parse_rejects_no_dot_or_slash() {
2016        // Rule: must contain `.` (new-style) or `/` (old-style).
2017        assert_eq!(
2018            ArxivId::parse("notanarxivid"),
2019            Err(RefParseError::InvalidArxivShape)
2020        );
2021    }
2022
2023    #[test]
2024    fn arxiv_parse_rejects_new_style_wrong_head_length() {
2025        // Rule: head must be exactly 4 digits.
2026        assert_eq!(
2027            ArxivId::parse("240.12345"),
2028            Err(RefParseError::InvalidArxivShape)
2029        );
2030    }
2031
2032    #[test]
2033    fn arxiv_parse_rejects_new_style_seq_too_short() {
2034        // Rule: seq must be 4–5 digits.
2035        assert_eq!(
2036            ArxivId::parse("2401.123"),
2037            Err(RefParseError::InvalidArxivShape)
2038        );
2039    }
2040
2041    #[test]
2042    fn arxiv_parse_rejects_old_style_wrong_id_length() {
2043        // Rule: old-style id is exactly 7 digits.
2044        assert_eq!(
2045            ArxivId::parse("cond-mat/95001"),
2046            Err(RefParseError::InvalidArxivShape)
2047        );
2048    }
2049
2050    #[test]
2051    fn arxiv_parse_rejects_invalid_version_suffix() {
2052        // Rule: version suffix is `v` followed by ≥1 digits, nothing else.
2053        assert_eq!(
2054            ArxivId::parse("2401.12345v"),
2055            Err(RefParseError::InvalidArxivShape)
2056        );
2057    }
2058
2059    #[test]
2060    fn arxiv_parse_rejects_control_char() {
2061        // Rule (docs/SECURITY.md §1.1 log-injection): no control chars.
2062        assert_eq!(
2063            ArxivId::parse("2401.12345\n"),
2064            Err(RefParseError::InvalidArxivShape)
2065        );
2066    }
2067
2068    #[test]
2069    fn arxiv_parse_rejects_non_ascii() {
2070        // Rule: ASCII-only.
2071        assert_eq!(
2072            ArxivId::parse("2401.物理"),
2073            Err(RefParseError::InvalidArxivShape)
2074        );
2075    }
2076
2077    // ---- Ref::parse happy paths (≥6) --------------------------------
2078
2079    #[test]
2080    fn ref_parse_dispatches_doi_scheme_to_doi() {
2081        // Detection rule 1: explicit `doi:` scheme.
2082        match Ref::parse("doi:10.1234/example").expect("doi: dispatched to Doi") {
2083            Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/example"),
2084            other => panic!("expected Ref::Doi, got {:?}", other),
2085        }
2086    }
2087
2088    #[test]
2089    fn ref_parse_dispatches_arxiv_scheme_to_arxiv() {
2090        // Detection rule 2: explicit `arxiv:` scheme.
2091        match Ref::parse("arxiv:2401.12345").expect("arxiv: dispatched to Arxiv") {
2092            Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
2093            other => panic!("expected Ref::Arxiv, got {:?}", other),
2094        }
2095    }
2096
2097    #[test]
2098    fn ref_parse_dispatches_arxiv_mixed_case_scheme() {
2099        // Detection rule 2 (case-insensitive): `arXiv:` form.
2100        match Ref::parse("arXiv:cond-mat/9501001").expect("arXiv: dispatched") {
2101            Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
2102            other => panic!("expected Ref::Arxiv, got {:?}", other),
2103        }
2104    }
2105
2106    #[test]
2107    fn ref_parse_bare_doi_resolves_to_doi() {
2108        // Detection rule 3: bare input starting with `10.` is a DOI.
2109        match Ref::parse("10.1234/foo").expect("bare DOI") {
2110            Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/foo"),
2111            other => panic!("expected Ref::Doi, got {:?}", other),
2112        }
2113    }
2114
2115    #[test]
2116    fn ref_parse_bare_arxiv_new_resolves_to_arxiv() {
2117        // Detection rule 4: bare input not starting with `10.` falls
2118        // through to arXiv. Tests the ambiguous-input branch named in the
2119        // PR brief: `2401.12345` should resolve to ArxivId.
2120        match Ref::parse("2401.12345").expect("bare new-style arXiv") {
2121            Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
2122            other => panic!("expected Ref::Arxiv, got {:?}", other),
2123        }
2124    }
2125
2126    #[test]
2127    fn ref_parse_bare_arxiv_old_resolves_to_arxiv() {
2128        // Detection rule 4: bare old-style arXiv id.
2129        match Ref::parse("cond-mat/9501001").expect("bare old-style arXiv") {
2130            Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
2131            other => panic!("expected Ref::Arxiv, got {:?}", other),
2132        }
2133    }
2134
2135    // ---- Ref::parse rejection paths (≥6) ----------------------------
2136
2137    #[test]
2138    fn ref_parse_rejects_empty() {
2139        // Rule: empty up-front.
2140        assert_eq!(Ref::parse(""), Err(RefParseError::Empty));
2141    }
2142
2143    #[test]
2144    fn ref_parse_doi_scheme_with_invalid_doi_propagates_doi_error() {
2145        // When the scheme is explicit, we surface the parser's error
2146        // verbatim — not a generic "shape mismatch".
2147        assert_eq!(
2148            Ref::parse("doi:10.1234"),
2149            Err(RefParseError::MissingDoiSuffixSeparator)
2150        );
2151    }
2152
2153    #[test]
2154    fn ref_parse_arxiv_scheme_with_invalid_arxiv_propagates_arxiv_error() {
2155        assert_eq!(
2156            Ref::parse("arxiv:notanid"),
2157            Err(RefParseError::InvalidArxivShape)
2158        );
2159    }
2160
2161    #[test]
2162    fn ref_parse_bare_with_10_prefix_uses_doi_errors() {
2163        // Bare `10.…` heuristic: DOI parser is dispatched and its error
2164        // surfaces (here: bad registrant).
2165        assert_eq!(
2166            Ref::parse("10.12/x"),
2167            Err(RefParseError::InvalidDoiRegistrant)
2168        );
2169    }
2170
2171    #[test]
2172    fn ref_parse_bare_without_10_prefix_uses_arxiv_errors() {
2173        // Bare ambiguous fallback: ArxivId parser is dispatched and its
2174        // error surfaces. `1.2.3` is neither a DOI nor an arXiv shape.
2175        assert_eq!(Ref::parse("1.2.3"), Err(RefParseError::InvalidArxivShape));
2176    }
2177
2178    #[test]
2179    fn ref_parse_rejects_doi_scheme_with_oversized_suffix() {
2180        // Length-bound: DOI suffix > DOI_SUFFIX_MAX_LEN through Ref::parse
2181        // surfaces DoiSuffixTooLong, not a generic InvalidArxivShape.
2182        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 5);
2183        let input = format!("doi:10.1234/{}", suffix);
2184        match Ref::parse(&input) {
2185            Err(RefParseError::DoiSuffixTooLong { .. }) => {}
2186            other => panic!("expected DoiSuffixTooLong, got {:?}", other),
2187        }
2188    }
2189
2190    #[test]
2191    fn ref_parse_round_trip_via_serde_preserves_inner_string() {
2192        // Wire-format check: Doi/ArxivId are #[serde(transparent)], and a
2193        // round-trip through Ref::parse → serde_json → Ref must preserve
2194        // the inner identifier. Guards against accidental scheme leakage
2195        // into the stored form.
2196        let r = Ref::parse("doi:10.1234/example").expect("parse ok");
2197        let json = serde_json::to_string(&r).expect("serialize");
2198        // The transparent inner value is the bare identifier (no `doi:`).
2199        assert!(
2200            json.contains("10.1234/example") && !json.contains("doi:"),
2201            "scheme leaked into wire form: {}",
2202            json
2203        );
2204    }
2205
2206    #[test]
2207    fn ref_parse_error_maps_to_invalid_ref_error_code() {
2208        // Public-API contract (docs/PUBLIC_API.md §4): all parse failures
2209        // collapse to ErrorCode::InvalidRef at the public boundary.
2210        let err: ErrorCode = RefParseError::Empty.into();
2211        assert_eq!(err, ErrorCode::InvalidRef);
2212        let err2: ErrorCode = RefParseError::MissingDoiPrefix.into();
2213        assert_eq!(err2, ErrorCode::InvalidRef);
2214    }
2215
2216    // -----------------------------------------------------------------
2217    // DenialReason / DenialContext (ADR-0023) — wire-shape tests.
2218    // -----------------------------------------------------------------
2219
2220    #[test]
2221    fn denial_reason_serializes_snake_case() {
2222        // ADR-0023 §2 / docs/PUBLIC_API.md §8: wire form is snake_case.
2223        let s = serde_json::to_string(&DenialReason::RedirectNotInAllowlist).expect("ser");
2224        assert_eq!(s, "\"redirect_not_in_allowlist\"");
2225        let s = serde_json::to_string(&DenialReason::SizeCapExceeded).expect("ser");
2226        assert_eq!(s, "\"size_cap_exceeded\"");
2227        let s = serde_json::to_string(&DenialReason::ContentTypeMismatch).expect("ser");
2228        assert_eq!(s, "\"content_type_mismatch\"");
2229    }
2230
2231    #[test]
2232    fn denial_reason_round_trip_via_serde() {
2233        // Round-trip every closed-set variant so adding a new variant
2234        // forces this test to be updated (the closed-set contract).
2235        for r in [
2236            DenialReason::RedirectNotInAllowlist,
2237            DenialReason::InsecureScheme,
2238            DenialReason::HostInBlockList,
2239            DenialReason::SizeCapExceeded,
2240            DenialReason::SchemaDrift,
2241            DenialReason::CapabilityNotGranted,
2242            DenialReason::RateLimitWindow,
2243            DenialReason::SsrfPrivateAddress,
2244            DenialReason::ContentTypeMismatch,
2245        ] {
2246            let s = serde_json::to_string(&r).expect("ser");
2247            let back: DenialReason = serde_json::from_str(&s).expect("de");
2248            assert_eq!(back, r, "round-trip mismatch for {:?} -> {}", r, s);
2249        }
2250    }
2251
2252    #[test]
2253    fn denial_context_round_trips_full_shape() {
2254        // A populated context (the redirect-denied case from ADR-0023 §1
2255        // example) survives a JSON round-trip. Whole-struct equality
2256        // exercises the `PartialEq` derive added per ADR-0023 §3 (added
2257        // in the multi-agent review feedback PR — see ADR-0023 history).
2258        let dc = DenialContext {
2259            reason: DenialReason::RedirectNotInAllowlist,
2260            source: Some("crossref".to_string()),
2261            attempted: Some("evil.example.com".to_string()),
2262            expected: Some(vec![
2263                "api.crossref.org".to_string(),
2264                "*.crossref.org".to_string(),
2265            ]),
2266            hop_index: Some(1),
2267            cap: None,
2268            actual: None,
2269        };
2270        let s = serde_json::to_string(&dc).expect("ser");
2271        let back: DenialContext = serde_json::from_str(&s).expect("de");
2272        assert_eq!(back, dc);
2273    }
2274
2275    #[test]
2276    fn denial_context_serialize_elides_empty_fields() {
2277        // `skip_serializing_if = "Option::is_none"` must keep the wire form
2278        // lean: every `None` field MUST NOT appear on the wire. Reason is
2279        // always present.
2280        let dc = DenialContext {
2281            reason: DenialReason::CapabilityNotGranted,
2282            source: None,
2283            attempted: None,
2284            expected: None,
2285            hop_index: None,
2286            cap: None,
2287            actual: None,
2288        };
2289        let s = serde_json::to_string(&dc).expect("ser");
2290        assert_eq!(s, "{\"reason\":\"capability_not_granted\"}");
2291    }
2292
2293    #[test]
2294    fn denial_context_expected_some_empty_vec_preserves_explicit_empty_allowlist() {
2295        // Post-refinement disambiguation: `expected: Some(vec![])` is the
2296        // "explicit empty allowlist" signal and MUST survive the wire as
2297        // `"expected":[]`. Only `expected: None` is skipped on serialize.
2298        // This is the bug the previous `Vec<String>` shape masked.
2299        let dc = DenialContext {
2300            reason: DenialReason::RedirectNotInAllowlist,
2301            source: Some("crossref".to_string()),
2302            attempted: Some("evil.example.com".to_string()),
2303            expected: Some(Vec::new()),
2304            hop_index: None,
2305            cap: None,
2306            actual: None,
2307        };
2308        let s = serde_json::to_string(&dc).expect("ser");
2309        assert!(
2310            s.contains("\"expected\":[]"),
2311            "expected:[] must survive on the wire (got: {s})"
2312        );
2313        let back: DenialContext = serde_json::from_str(&s).expect("de");
2314        assert_eq!(back.expected, Some(Vec::new()));
2315    }
2316
2317    #[test]
2318    fn denial_context_deserialize_tolerates_missing_optional_fields() {
2319        // Consumer-side contract (ADR-0023 §3): consumers MUST tolerate
2320        // any subset of fields being present. Missing optional fields
2321        // deserialize to their defaults via `#[serde(default)]`.
2322        let wire = r#"{"reason":"size_cap_exceeded","cap":104857600,"actual":209715200}"#;
2323        let dc: DenialContext = serde_json::from_str(wire).expect("de");
2324        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
2325        assert_eq!(dc.cap, Some(104857600));
2326        assert_eq!(dc.actual, Some(209715200));
2327        assert!(dc.source.is_none());
2328        assert!(dc.attempted.is_none());
2329        assert!(dc.expected.is_none());
2330        assert!(dc.hop_index.is_none());
2331    }
2332
2333    #[test]
2334    fn full_error_envelope_with_denial_context_serializes_to_pinned_json() {
2335        // Pins the byte-exact wire shape of the full failure envelope
2336        // documented in docs/ERRORS.md §3 + §3.1 and ADR-0023 §1. A
2337        // future regression that flips key order or skip-rules anywhere
2338        // in the chain breaks this test loudly.
2339        //
2340        // Note: serde_json's `Map` (used by `json!`) sorts keys
2341        // alphabetically when the `preserve_order` feature is NOT
2342        // enabled (we do not enable it). Embedding a `DenialContext`
2343        // via `json!` first re-serialises it through the same alphabet-
2344        // sorted Map path, so the inner field order is also alphabetical
2345        // here — NOT the struct field-order produced by direct
2346        // `to_string(&DenialContext)`. This is by design: the public
2347        // wire shape is canonicalised by serde_json's Map ordering, so
2348        // the byte-exact pin below documents that exact canonicalisation.
2349        let denial = DenialContext {
2350            reason: DenialReason::RedirectNotInAllowlist,
2351            source: Some("crossref".into()),
2352            attempted: Some("evil.example.com".into()),
2353            expected: Some(vec!["api.crossref.org".into(), "*.crossref.org".into()]),
2354            hop_index: Some(1),
2355            cap: None,
2356            actual: None,
2357        };
2358        let envelope = serde_json::json!({
2359            "ok": false,
2360            "error": {
2361                "code": ErrorCode::NetworkError,
2362                "message": "redirect target evil.example.com not in allowlist for source crossref",
2363                "denial_context": denial,
2364            }
2365        });
2366        let actual = serde_json::to_string(&envelope).expect("serialize envelope");
2367        let expected = r#"{"error":{"code":"NETWORK_ERROR","denial_context":{"attempted":"evil.example.com","expected":["api.crossref.org","*.crossref.org"],"hop_index":1,"reason":"redirect_not_in_allowlist","source":"crossref"},"message":"redirect target evil.example.com not in allowlist for source crossref"},"ok":false}"#;
2368        assert_eq!(actual, expected);
2369    }
2370
2371    #[test]
2372    fn denial_context_rejects_unknown_fields() {
2373        // `#[serde(deny_unknown_fields)]` (ADR-0023 §3, PUBLIC_API.md §8):
2374        // an unknown field on the wire MUST be a deserialize error so
2375        // forward-compat field additions stay a breaking change.
2376        let wire = r#"{"reason":"capability_not_granted","banana":1}"#;
2377        let result: Result<DenialContext, _> = serde_json::from_str(wire);
2378        assert!(
2379            result.is_err(),
2380            "deny_unknown_fields must reject 'banana': {:?}",
2381            result.map(|d| d.reason),
2382        );
2383    }
2384}