Skip to main content

doiget_core/
lib.rs

1//! # doiget-core
2//!
3//! Core library for [doiget](https://github.com/sotashimozono/doiget): an Open Access
4//! first paper-fetcher with strict capability gating, fail-closed provenance logging,
5//! and a BiblioFetch.jl-compatible store layout.
6//!
7//! Phase 0 ships only this skeleton. Real implementations land in Phase 1.
8//! See `docs/PUBLIC_API.md` for the semver-locked surface and `docs/ARCHITECTURE.md`
9//! for the high-level design.
10
11#![warn(missing_docs)]
12#![forbid(unsafe_code)]
13
14use serde::{Deserialize, Serialize};
15use sha2::Digest;
16
17// --- Modules ---
18pub mod canonical;
19pub mod dry_run;
20pub mod http;
21pub mod orchestrator;
22pub mod provenance;
23pub mod rate_limiter;
24pub mod refs;
25pub mod resolver_cache;
26pub mod source;
27pub mod sources;
28pub mod store;
29pub mod user_extension;
30pub mod verify_config;
31
32// Phase 4 citation graph (ADR-0010). Compile-gated by the `citation`
33// Cargo feature, which itself enables the `metadata` feature so the
34// Tier-2 source impls are available.
35#[cfg(feature = "citation")]
36pub mod citation_graph;
37
38// Re-export the canonical-tuple audit-identity types at the crate root
39// per ADR-0024 / `docs/PUBLIC_API.md` §1. The types themselves live in
40// the [`canonical`] submodule.
41pub use crate::canonical::{CanonicalRef, SourceType};
42
/// Crate version, taken from `CARGO_PKG_VERSION` at compile time. Used by
/// `doiget-cli --version` and `doiget_health`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// TOML schema version this build writes. See `docs/STORE.md` §3.
pub const SCHEMA_VERSION: &str = "1.0";

/// Hard-coded cap on concurrent fetches. See `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_CONCURRENT_FETCHES: u32 = 5;

/// Hard-coded cap on fetches per second. See `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_FETCHES_PER_SECOND: f32 = 5.0;

/// Maximum batch size for `doiget batch` and `doiget_batch_fetch`.
pub const MCP_BATCH_MAX_SIZE: usize = 100;

/// Slice 2 alias for [`MCP_BATCH_MAX_SIZE`] using the
/// spec-language name (`docs/MCP_TOOLS.md` §1 / Slice 2 plan). The
/// numeric value MUST equal [`MCP_BATCH_MAX_SIZE`]; it is defined by
/// direct assignment from that constant, and an internal test
/// pins the equivalence so the two constants cannot drift.
pub const MAX_BATCH_REFS: usize = MCP_BATCH_MAX_SIZE;

/// Maximum queued MCP requests beyond `MAX_CONCURRENT_FETCHES`. Excess returns
/// `ErrorCode::RateLimited`. See `docs/SECURITY.md` §1.4 / `docs/MCP_TOOLS.md`.
pub const MCP_QUEUE_DEPTH_MAX: usize = 100;

/// MCP server stdin-EOF graceful-shutdown deadline, in seconds. See ADR-0001
/// and `docs/MCP_TOOLS.md` §8.
pub const MCP_STDIN_EOF_SHUTDOWN_SEC: u64 = 5;

/// Maximum DOI suffix length accepted at validation, in bytes (the part
/// after the first `/`). See `docs/SECURITY.md` §1.1.
pub const DOI_SUFFIX_MAX_LEN: usize = 256;

/// Maximum PDF body size accepted by the fetcher, in bytes. The value is
/// 10^8 bytes (100 MB decimal, not 100 MiB). See
/// `docs/SECURITY.md` §1.2 (Oversized PDF).
pub const PDF_MAX_BYTES: u64 = 100_000_000;

/// Time-to-live, in days, for entries in `~/.cache/doiget/resolver/`. See
/// `docs/CACHE.md` §3.
pub const RESOLVER_CACHE_TTL_DAYS: u32 = 7;

/// Time-to-live, in days, for entries in `~/.cache/doiget/citations/`. See
/// `docs/CACHE.md` §3.
pub const CITATION_CACHE_TTL_DAYS: u32 = 30;
86
87// ---------------------------------------------------------------------------
88// Ref
89// ---------------------------------------------------------------------------
90
/// A reference to a paper, either by DOI or arXiv id.
///
/// See `docs/SECURITY.md` §1.1 for input-validation rules.
///
/// Wire format: adjacently tagged. The lowercase variant name goes under
/// `kind` and the bare identifier under `id`, e.g.
/// `{"kind":"doi","id":"10.1234/example"}` or
/// `{"kind":"arxiv","id":"2401.12345"}`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind", content = "id")]
pub enum Ref {
    /// A DOI (e.g., `10.1234/example`).
    Doi(Doi),
    /// An arXiv id (e.g., `2401.12345`).
    Arxiv(ArxivId),
}
102
103/// A validated DOI string.
104///
105/// Construct via `Doi::parse(s)` (Phase 1+). The inner field is intentionally
106/// `pub(crate)` to forbid bypass construction; tests inside `doiget-core` may
107/// still use `Doi(s)` for fixture purposes.
108///
109/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"10.1234/example"`.
110#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
111#[serde(transparent)]
112pub struct Doi(pub(crate) String);
113
114/// A validated arXiv id string.
115///
116/// Construct via `ArxivId::parse(s)` (Phase 1+). Inner field is `pub(crate)`.
117///
118/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"2401.12345"`.
119#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
120#[serde(transparent)]
121pub struct ArxivId(pub(crate) String);
122
123impl Doi {
124    /// Returns the DOI as a string slice.
125    pub fn as_str(&self) -> &str {
126        &self.0
127    }
128
129    /// Parses and validates a DOI string per `docs/SECURITY.md` §1.1.
130    ///
131    /// Accepts:
132    /// - Bare DOIs: `10.<registrant>/<suffix>` where `<registrant>` is 4–9
133    ///   digits and `<suffix>` is a non-empty sequence of characters drawn
134    ///   from `[A-Za-z0-9._/():-]` (the `:` covers legacy Kluwer
135    ///   `10.1023/A:NNNN` and EDP Sciences `10.1051/jphys:NNNN` DOIs).
136    /// - The `doi:` URI scheme prefix; it is stripped before validation, so
137    ///   the stored value never carries a scheme. (Matches the convention
138    ///   established in `docs/SAFEKEY.md` §3 step 0.)
139    ///
140    /// Rejects:
141    /// - Inputs missing the literal `10.` prefix (after optional scheme
142    ///   strip).
143    /// - Suffixes longer than [`DOI_SUFFIX_MAX_LEN`] bytes.
144    /// - Empty suffixes.
145    /// - Any character outside the suffix charset above (including control
146    ///   characters, whitespace, and non-ASCII).
147    ///
148    /// # Errors
149    ///
150    /// Returns a [`RefParseError`] variant that names the specific rejection
151    /// category. Tier 1+ callers should map any [`RefParseError`] to
152    /// [`ErrorCode::InvalidRef`] when surfacing to MCP / CLI.
153    pub fn parse(s: &str) -> Result<Self, RefParseError> {
154        let stripped = parse::strip_doi_scheme(s);
155        parse::validate_doi(stripped)?;
156        Ok(Doi(stripped.to_string()))
157    }
158}
159
160impl ArxivId {
161    /// Returns the arXiv id as a string slice.
162    pub fn as_str(&self) -> &str {
163        &self.0
164    }
165
166    /// Parses and validates an arXiv id per `docs/SECURITY.md` §1.1 and the
167    /// pattern published in `docs/MCP_TOOLS.md`.
168    ///
169    /// Accepts:
170    /// - New-style ids: `YYMM.NNNNN[vN]` where the date block is 4 digits, the
171    ///   sequence number is 4–5 digits, and the optional version `vN` is one
172    ///   or more digits. Examples: `2401.12345`, `2401.12345v2`.
173    /// - Old-style ids: `subject-class/YYMMNNN[vN]` where the subject class
174    ///   is a lowercase token (with optional internal hyphens and an
175    ///   optional `.XX` two-uppercase-letter group), and the numeric body
176    ///   is exactly 7 digits with optional `vN`. Examples:
177    ///   `cond-mat/9501001`, `astro-ph.CO/0703123v2`.
178    /// - The `arxiv:` / `arXiv:` URI scheme prefix; it is stripped before
179    ///   validation.
180    ///
181    /// Rejects:
182    /// - Inputs that match neither the new-style nor old-style shape.
183    /// - Inputs containing characters outside the per-shape charset
184    ///   (control chars, whitespace, non-ASCII).
185    /// - Empty input.
186    ///
187    /// # Errors
188    ///
189    /// Returns a [`RefParseError`] variant that names the specific rejection
190    /// category.
191    pub fn parse(s: &str) -> Result<Self, RefParseError> {
192        let stripped = parse::strip_arxiv_scheme(s);
193        parse::validate_arxiv(stripped)?;
194        Ok(ArxivId(stripped.to_string()))
195    }
196}
197
198impl Ref {
199    /// Parses a string into a [`Ref`], auto-detecting DOI vs arXiv.
200    ///
201    /// Detection rules:
202    /// 1. If the input begins with the case-insensitive `doi:` scheme, the
203    ///    remainder is parsed as a DOI.
204    /// 2. If the input begins with the `arxiv:` or `arXiv:` scheme, the
205    ///    remainder is parsed as an arXiv id.
206    /// 3. Otherwise, if the input starts with `10.` it is treated as a bare
207    ///    DOI; this matches the heuristic in `docs/SAFEKEY.md` §4 (Julia
208    ///    reference) and is stable because DOIs always begin `10.`.
209    /// 4. Failing all of the above, parsing falls back to arXiv.
210    ///
211    /// The returned [`Ref`] never carries the URI scheme — `as_str()` on the
212    /// inner `Doi` / `ArxivId` is always the bare identifier.
213    ///
214    /// # Errors
215    ///
216    /// Returns a [`RefParseError`] from the underlying [`Doi::parse`] or
217    /// [`ArxivId::parse`] call. When the input has an explicit scheme
218    /// (`doi:` / `arxiv:`), the matching parser is dispatched and its error
219    /// surfaces directly. When the input is bare and ambiguous, the
220    /// heuristic in rule 3/4 selects the parser; an unparsable bare input
221    /// surfaces the arXiv parser's error (a non-`10.` ref that also fails
222    /// arXiv validation is never a valid DOI).
223    pub fn parse(s: &str) -> Result<Self, RefParseError> {
224        // Reject empty up front so all three parsers see a meaningful slice;
225        // without this, `strip_*_scheme("")` returns "" and we'd get a
226        // confusing "missing 10. prefix" error for empty input.
227        if s.is_empty() {
228            return Err(RefParseError::Empty);
229        }
230
231        if parse::has_doi_scheme(s) {
232            return Doi::parse(s).map(Ref::Doi);
233        }
234        if parse::has_arxiv_scheme(s) {
235            return ArxivId::parse(s).map(Ref::Arxiv);
236        }
237        if s.starts_with("10.") {
238            return Doi::parse(s).map(Ref::Doi);
239        }
240        ArxivId::parse(s).map(Ref::Arxiv)
241    }
242}
243
244// ---------------------------------------------------------------------------
245// Parser internals
246// ---------------------------------------------------------------------------
247
248mod parse {
249    use super::{RefParseError, DOI_SUFFIX_MAX_LEN};
250
251    /// Case-insensitive `doi:` prefix detector. Matches both `doi:` and
252    /// `DOI:` (and any case mix); the spec in `docs/SAFEKEY.md` §3 only
253    /// names the lowercase form, but the field convention is to be lenient
254    /// in what we accept (the scheme is dropped at the boundary anyway).
255    pub(crate) fn has_doi_scheme(s: &str) -> bool {
256        s.len() >= 4 && s.is_char_boundary(4) && s[..4].eq_ignore_ascii_case("doi:")
257    }
258
259    /// Case-insensitive `arxiv:` prefix detector. Accepts `arxiv:`,
260    /// `arXiv:` (the form used in `docs/MCP_TOOLS.md`), and any other case
261    /// mix.
262    pub(crate) fn has_arxiv_scheme(s: &str) -> bool {
263        s.len() >= 6 && s.is_char_boundary(6) && s[..6].eq_ignore_ascii_case("arxiv:")
264    }
265
266    pub(crate) fn strip_doi_scheme(s: &str) -> &str {
267        if has_doi_scheme(s) {
268            &s[4..]
269        } else {
270            s
271        }
272    }
273
274    pub(crate) fn strip_arxiv_scheme(s: &str) -> &str {
275        if has_arxiv_scheme(s) {
276            &s[6..]
277        } else {
278            s
279        }
280    }
281
282    /// DOI suffix charset per `docs/SECURITY.md` §1.1:
283    /// `[A-Za-z0-9._/():-]`. The forward slash is permitted inside the
284    /// suffix (e.g. `10.1016/...`); the registrant separator is the
285    /// *first* `/` and the suffix is everything after it.
286    ///
287    /// `:` is permitted because two large real publisher DOI families use
288    /// it in the suffix — legacy Kluwer/Springer (`10.1023/A:NNNNNNNNNN`)
289    /// and EDP Sciences / Journal de Physique
290    /// (`10.1051/jphys:NNNNNNNNNNNNNNNNN`). It adds no path-traversal
291    /// capability: traversal requires composing `/` and `.` into `../`,
292    /// and both characters are already in the suffix charset. In addition,
293    /// `safekey` independently escapes every char outside `[A-Za-z0-9._-]`
294    /// before any filesystem use, so `:` never reaches a path literally.
295    /// See ADR-0026 and `docs/SECURITY.md` §1.1.
296    fn is_doi_suffix_char(c: char) -> bool {
297        matches!(c,
298            'A'..='Z' | 'a'..='z' | '0'..='9'
299            | '.' | '_' | '/' | '(' | ')' | '-' | ':'
300        )
301    }
302
303    pub(crate) fn validate_doi(s: &str) -> Result<(), RefParseError> {
304        if s.is_empty() {
305            return Err(RefParseError::Empty);
306        }
307
308        // Must begin with literal "10."; the registrant is 4–9 digits up
309        // to the first '/'. After that, everything is suffix.
310        let rest = s
311            .strip_prefix("10.")
312            .ok_or(RefParseError::MissingDoiPrefix)?;
313        let slash_idx = rest
314            .find('/')
315            .ok_or(RefParseError::MissingDoiSuffixSeparator)?;
316        let registrant = &rest[..slash_idx];
317        let suffix = &rest[slash_idx + 1..];
318
319        // Registrant: 4–9 ASCII digits.
320        if registrant.len() < 4
321            || registrant.len() > 9
322            || !registrant.chars().all(|c| c.is_ascii_digit())
323        {
324            return Err(RefParseError::InvalidDoiRegistrant);
325        }
326
327        // Suffix: non-empty, charset-restricted, length-bounded.
328        if suffix.is_empty() {
329            return Err(RefParseError::EmptyDoiSuffix);
330        }
331        if suffix.len() > DOI_SUFFIX_MAX_LEN {
332            return Err(RefParseError::DoiSuffixTooLong {
333                len: suffix.len(),
334                max: DOI_SUFFIX_MAX_LEN,
335            });
336        }
337        if let Some(bad) = suffix.chars().find(|c| !is_doi_suffix_char(*c)) {
338            return Err(RefParseError::InvalidDoiSuffixChar { ch: bad });
339        }
340        Ok(())
341    }
342
343    /// Validates an arXiv id (with the `arxiv:` / `arXiv:` scheme already
344    /// stripped). Tries the new-style shape first, then the old-style.
345    pub(crate) fn validate_arxiv(s: &str) -> Result<(), RefParseError> {
346        if s.is_empty() {
347            return Err(RefParseError::Empty);
348        }
349        if validate_arxiv_new(s).is_ok() || validate_arxiv_old(s).is_ok() {
350            return Ok(());
351        }
352        Err(RefParseError::InvalidArxivShape)
353    }
354
355    /// New-style arXiv id: `YYMM.NNNNN[vN]`.
356    fn validate_arxiv_new(s: &str) -> Result<(), ()> {
357        let dot_idx = s.find('.').ok_or(())?;
358        let head = &s[..dot_idx];
359        let tail = &s[dot_idx + 1..];
360
361        // Head: exactly 4 ASCII digits.
362        if head.len() != 4 || !head.chars().all(|c| c.is_ascii_digit()) {
363            return Err(());
364        }
365
366        // Tail: 4–5 digits, then optional `v` followed by ≥1 digits.
367        let bytes = tail.as_bytes();
368        let mut i = 0;
369        while i < bytes.len() && bytes[i].is_ascii_digit() {
370            i += 1;
371        }
372        let digits_len = i;
373        if !(4..=5).contains(&digits_len) {
374            return Err(());
375        }
376        if i == bytes.len() {
377            return Ok(());
378        }
379        // Optional version suffix.
380        if bytes[i] != b'v' {
381            return Err(());
382        }
383        i += 1;
384        let v_start = i;
385        while i < bytes.len() && bytes[i].is_ascii_digit() {
386            i += 1;
387        }
388        if i == v_start || i != bytes.len() {
389            return Err(());
390        }
391        Ok(())
392    }
393
394    /// Old-style arXiv id: `subject-class/YYMMNNN[vN]`.
395    /// Subject class: `[a-z]([a-z-]*[a-z])?(\.[A-Z]{2})?`.
396    fn validate_arxiv_old(s: &str) -> Result<(), ()> {
397        let slash_idx = s.find('/').ok_or(())?;
398        let class = &s[..slash_idx];
399        let id = &s[slash_idx + 1..];
400
401        // Class: starts with [a-z], body is [a-z-], optional `.XX` (two
402        // ASCII upper).
403        let (core_class, dot_part) = match class.find('.') {
404            Some(d) => (&class[..d], Some(&class[d + 1..])),
405            None => (class, None),
406        };
407        if core_class.is_empty()
408            || !core_class
409                .chars()
410                .all(|c| c.is_ascii_lowercase() || c == '-')
411            || core_class.starts_with('-')
412            || core_class.ends_with('-')
413        {
414            return Err(());
415        }
416        if let Some(dp) = dot_part {
417            if dp.len() != 2 || !dp.chars().all(|c| c.is_ascii_uppercase()) {
418                return Err(());
419            }
420        }
421
422        // Id: 7 digits, optional `vN`.
423        let bytes = id.as_bytes();
424        let mut i = 0;
425        while i < bytes.len() && bytes[i].is_ascii_digit() {
426            i += 1;
427        }
428        if i != 7 {
429            return Err(());
430        }
431        if i == bytes.len() {
432            return Ok(());
433        }
434        if bytes[i] != b'v' {
435            return Err(());
436        }
437        i += 1;
438        let v_start = i;
439        while i < bytes.len() && bytes[i].is_ascii_digit() {
440            i += 1;
441        }
442        if i == v_start || i != bytes.len() {
443            return Err(());
444        }
445        Ok(())
446    }
447}
448
449// ---------------------------------------------------------------------------
450// RefParseError
451// ---------------------------------------------------------------------------
452
/// Reasons a `Doi::parse` / `ArxivId::parse` / `Ref::parse` call can fail.
///
/// Each variant maps to one rejection category in `docs/SECURITY.md` §1.1.
/// All variants funnel to [`ErrorCode::InvalidRef`] when surfacing to MCP /
/// CLI; the granular shape is preserved for tests and for future log
/// breadcrumbs. The `From<RefParseError> for ErrorCode` impl below makes
/// `?` propagation collapse to `INVALID_REF` automatically, satisfying
/// `docs/PUBLIC_API.md` §4.
///
/// Marked `#[non_exhaustive]` so adding new categories is a non-breaking
/// change. Pattern-match with a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
#[non_exhaustive]
pub enum RefParseError {
    /// Input was empty, or became empty once a `doi:` / `arxiv:` scheme
    /// was stripped.
    #[error("empty input")]
    Empty,
    /// Input did not begin with the required `10.` literal (after any
    /// scheme strip).
    #[error("DOI must begin with '10.'")]
    MissingDoiPrefix,
    /// Input started with `10.` but had no `/` separator between
    /// registrant and suffix.
    #[error("DOI must contain '/' between registrant and suffix")]
    MissingDoiSuffixSeparator,
    /// Registrant (the text between `10.` and the first `/`) was not 4–9
    /// ASCII digits.
    #[error("DOI registrant must be 4–9 ASCII digits")]
    InvalidDoiRegistrant,
    /// DOI suffix (the text after the first `/`) was empty.
    #[error("DOI suffix is empty")]
    EmptyDoiSuffix,
    /// DOI suffix exceeded `DOI_SUFFIX_MAX_LEN` bytes.
    #[error("DOI suffix is {len} bytes; maximum is {max}")]
    DoiSuffixTooLong {
        /// Observed suffix length, in bytes.
        len: usize,
        /// Hard upper bound (always [`DOI_SUFFIX_MAX_LEN`]).
        max: usize,
    },
    /// DOI suffix contained a character outside `[A-Za-z0-9._/():-]`.
    #[error("DOI suffix contains invalid character {ch:?}")]
    InvalidDoiSuffixChar {
        /// The first offending character.
        ch: char,
    },
    /// Input matched neither the new-style nor old-style arXiv shape.
    /// This is the only arXiv-specific category: the arXiv validator does
    /// not report which part of the shape failed.
    #[error("input does not match any known arXiv id shape")]
    InvalidArxivShape,
}
502
503impl From<RefParseError> for ErrorCode {
504    fn from(_: RefParseError) -> Self {
505        // All parse failures collapse to INVALID_REF at the public boundary,
506        // matching `docs/PUBLIC_API.md` §4 and `docs/SECURITY.md` §1.1.
507        ErrorCode::InvalidRef
508    }
509}
510
511// ---------------------------------------------------------------------------
512// Safekey
513// ---------------------------------------------------------------------------
514
/// A filesystem-safe key derived deterministically from a `Ref`.
///
/// See `docs/SAFEKEY.md` for the full algorithm and reference test vectors.
/// Construct via `Ref::safekey()` (Phase 1+); inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"doi_10.1234_example"`.
///
/// NOTE(review): the derived transparent `Deserialize` wraps whatever
/// string is on the wire without re-checking that it is a well-formed
/// safekey, so a deserialized value is only as trustworthy as its source.
/// Confirm that no caller builds a filesystem path from a `Safekey` that
/// was deserialized from untrusted input.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Safekey(pub(crate) String);

impl Safekey {
    /// Returns the safekey as a string slice.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}
531
532impl Ref {
533    /// Returns the bare identifier string usable as a provenance `ref` field.
534    ///
535    /// Equivalent to `Doi::as_str` / `ArxivId::as_str` dispatched on the
536    /// variant — the URI scheme (`doi:` / `arxiv:`) is never present in the
537    /// inner identifiers (it is stripped at parse time), so the result is
538    /// always the bare DOI or arXiv id. Used by the CLI / MCP orchestrators
539    /// to populate the `ref` column of provenance log rows
540    /// (`docs/PROVENANCE_LOG.md` §3) without re-matching the variant.
541    pub fn as_input_str(&self) -> &str {
542        match self {
543            Ref::Doi(d) => d.as_str(),
544            Ref::Arxiv(a) => a.as_str(),
545        }
546    }
547
548    /// Derives a deterministic, filesystem-safe key from this reference.
549    ///
550    /// The algorithm is the NORMATIVE binding spec in `docs/SAFEKEY.md` §3.
551    /// Both Rust and Julia implementations MUST produce bit-identical output
552    /// for every entry in `tests/fixtures/safekey/vectors.json`.
553    ///
554    /// # Algorithm summary
555    ///
556    /// 1. Prefix with `doi_` or `arxiv_` (per variant).
557    /// 2. Replace any character outside `[A-Za-z0-9._-]` with `_`.
558    /// 3. Collapse consecutive `_` runs to a single `_`.
559    /// 4. Trim leading/trailing `_`.
560    /// 5. If the result exceeds 192 bytes, take the first 192 bytes plus
561    ///    `_` plus the first 8 hex chars of `SHA-256(raw)` (where `raw` is
562    ///    the step-1 output, before escaping).
563    ///
564    /// The bound on `as_str()` after step 4 is pure ASCII (steps 1-3 produce
565    /// only ASCII bytes), so the byte-slice in step 5 cannot split a
566    /// multibyte char.
567    pub fn safekey(&self) -> Safekey {
568        // Step 0: prefix per variant. Doi/ArxivId hold the bare identifier
569        // (no `doi:` / `arxiv:` URI scheme — that is stripped by Ref::parse,
570        // not relevant here).
571        let raw = match self {
572            Ref::Doi(d) => format!("doi_{}", d.as_str()),
573            Ref::Arxiv(a) => format!("arxiv_{}", a.as_str()),
574        };
575
576        // Step 1: replace unsafe chars with '_'. Non-ASCII chars (emitted by
577        // String::chars() as full Unicode code points) all hit the wildcard
578        // arm and become a single '_'.
579        let escaped: String = raw
580            .chars()
581            .map(|c| match c {
582                'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' => c,
583                _ => '_',
584            })
585            .collect();
586
587        // Step 2: collapse consecutive '_' runs to a single '_'.
588        let mut collapsed = String::with_capacity(escaped.len());
589        let mut last_was_underscore = false;
590        for c in escaped.chars() {
591            if c == '_' {
592                if !last_was_underscore {
593                    collapsed.push('_');
594                }
595                last_was_underscore = true;
596            } else {
597                collapsed.push(c);
598                last_was_underscore = false;
599            }
600        }
601
602        // Step 3: trim leading/trailing '_'.
603        let trimmed = collapsed.trim_matches('_');
604
605        // Step 4: length-bound. After steps 1-3 `trimmed` is pure ASCII, so
606        // `len()` (bytes) == char count and `&trimmed[..192]` is char-safe.
607        let key = if trimmed.len() > 192 {
608            let digest = sha2::Sha256::digest(raw.as_bytes());
609            let hash = hex::encode(&digest[..4]);
610            format!("{}_{}", &trimmed[..192], hash)
611        } else {
612            trimmed.to_string()
613        };
614
615        Safekey(key)
616    }
617}
618
619// ---------------------------------------------------------------------------
620// ErrorCode
621// ---------------------------------------------------------------------------
622
/// The closed set of error codes doiget surfaces.
///
/// "Closed" means every code a given release can emit is listed here; the
/// wire token for each is the variant name in `SCREAMING_SNAKE_CASE` (see
/// [`ErrorCode::as_wire`]).
///
/// See `docs/ERRORS.md` for the persona × code matrix.
///
/// Marked `#[non_exhaustive]` so adding new variants is a minor (not major)
/// version bump. Downstream `match`es therefore need a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[non_exhaustive]
pub enum ErrorCode {
    /// DOI / arXiv id failed validation. Wire form: `"INVALID_REF"`.
    InvalidRef,
    /// Tier 1 sources reported no OA URL. Wire form: `"NO_OA_AVAILABLE"`.
    NoOaAvailable,
    /// Internal rate cap or upstream 429. Wire form: `"RATE_LIMITED"`.
    RateLimited,
    /// Transport / DNS / TLS failure. Wire form: `"NETWORK_ERROR"`.
    NetworkError,
    /// A metadata source authoritatively reported that the identifier
    /// does not exist. Network-independent and reproducible, so `doiget
    /// verify` treats it as a definite dead reference (fails the run even
    /// without `--strict`) rather than a tolerable blip — distinct from
    /// the transient [`Self::NetworkError`], [`Self::RateLimited`], and
    /// [`Self::FetchTimeout`].
    ///
    /// Sources: an HTTP `404` / `410` / `451` from a metadata API, or a
    /// source-specific absence signal (e.g. arXiv returns HTTP 200 with an
    /// empty `<feed>` for an unknown id, surfaced via `FetchError::NotFound`).
    ///
    /// Caveat (DOI fan-out): for a DOI this is emitted only when the
    /// configured metadata sources (Crossref, then Unpaywall) all fail to
    /// resolve it and at least one authoritatively 404s. A DOI registered
    /// only outside that set (e.g. a DataCite-only dataset DOI) can
    /// therefore be reported `NotFound` even though it exists in a
    /// registry doiget does not query.
    ///
    /// Wire form: `"NOT_FOUND"`.
    NotFound,
    /// Filesystem write failed. Wire form: `"STORE_ERROR"`.
    StoreError,
    /// Provenance log write failed; the fetch was aborted. Wire form:
    /// `"LOG_ERROR"`.
    LogError,
    /// Source not granted by the runtime `CapabilityProfile`. Wire form:
    /// `"CAPABILITY_DENIED"`.
    CapabilityDenied,
    /// Per-request timeout exceeded. Wire form: `"FETCH_TIMEOUT"`.
    FetchTimeout,
    /// Store entry's `schema_version` is ahead of this build. Wire form:
    /// `"SCHEMA_TOO_NEW"`.
    SchemaTooNew,
    /// Could not acquire `flock` within 5 s. Wire form: `"LOCK_TIMEOUT"`.
    LockTimeout,
    /// Bug — please open an issue. Wire form: `"INTERNAL_ERROR"`.
    InternalError,
    /// Feature is spec'd but not yet wired in this Phase. Distinct from
    /// [`Self::InternalError`] (which signals a bug) and
    /// [`Self::CapabilityDenied`] (which signals a runtime config gate).
    /// Returned by stubs that exist to pin the public surface ahead of
    /// orchestrator implementation, so an agent can react with "wait for
    /// next minor release" rather than "report a bug" or "tweak my
    /// capability profile". Wire form: `"NOT_IMPLEMENTED"`.
    NotImplemented,
}
682
683impl ErrorCode {
684    /// The `SCREAMING_SNAKE_CASE` wire token for this code, as a
685    /// `&'static str`. Identical to the serde representation but
686    /// allocation-free and usable where a borrowed string with a
687    /// `'static` lifetime is required — notably the provenance log
688    /// `error_code` column (`docs/PROVENANCE_LOG.md` §3), so a failure
689    /// row records the *actual* mapped code instead of a hand-written
690    /// literal that can drift from this enum (issue #118).
691    #[must_use]
692    pub fn as_wire(&self) -> &'static str {
693        match self {
694            ErrorCode::InvalidRef => "INVALID_REF",
695            ErrorCode::NoOaAvailable => "NO_OA_AVAILABLE",
696            ErrorCode::RateLimited => "RATE_LIMITED",
697            ErrorCode::NetworkError => "NETWORK_ERROR",
698            ErrorCode::NotFound => "NOT_FOUND",
699            ErrorCode::StoreError => "STORE_ERROR",
700            ErrorCode::LogError => "LOG_ERROR",
701            ErrorCode::CapabilityDenied => "CAPABILITY_DENIED",
702            ErrorCode::FetchTimeout => "FETCH_TIMEOUT",
703            ErrorCode::SchemaTooNew => "SCHEMA_TOO_NEW",
704            ErrorCode::LockTimeout => "LOCK_TIMEOUT",
705            ErrorCode::InternalError => "INTERNAL_ERROR",
706            ErrorCode::NotImplemented => "NOT_IMPLEMENTED",
707        }
708    }
709}
710
711// ---------------------------------------------------------------------------
712// DenialReason / DenialContext (ADR-0023)
713// ---------------------------------------------------------------------------
714
/// Closed-set reasons a denial-class error envelope can carry on its
/// optional `denial_context.reason` field.
///
/// Wire form (JSON / MCP) is `snake_case` — e.g. `"redirect_not_in_allowlist"`.
/// The set is **closed** per ADR-0023 §2: adding a new variant is a minor
/// semver bump; renaming or repurposing one is a breaking change. Mirrors
/// the stability rule that already governs [`ErrorCode`].
///
/// The wire token is derived from the variant name, so capitalisation is
/// part of the contract: `HostInBlockList` serializes as
/// `"host_in_block_list"` (two words for "block list"), while
/// `RedirectNotInAllowlist` serializes as `"redirect_not_in_allowlist"`
/// (one word for "allowlist").
///
/// See [`DenialContext`] for the surrounding struct, `docs/ERRORS.md` §3.1
/// for the wire surface, and `docs/PUBLIC_API.md` §8 for the
/// semver-locked surface contract.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DenialReason {
    /// Redirect target host did not match the source's allowlist
    /// (`HttpError::RedirectDenied`). Wire form:
    /// `"redirect_not_in_allowlist"`.
    RedirectNotInAllowlist,
    /// Redirect target had a non-HTTPS scheme (`HttpError::InsecureRedirect`).
    /// Wire form: `"insecure_scheme"`.
    InsecureScheme,
    /// Source produced a URL whose host is on a future blocklist.
    /// Wire form: `"host_in_block_list"`.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the future
    /// per-source URL host-blocklist guard once that component lands
    /// (post-Phase-1 supply-chain hardening; see
    /// `docs/REDIRECT_ALLOWLIST.md` §4 for the staging plan).
    HostInBlockList,
    /// Body exceeded [`PDF_MAX_BYTES`] (`HttpError::OversizedBody`).
    /// Wire form: `"size_cap_exceeded"`.
    SizeCapExceeded,
    /// Store entry's `schema_version` is ahead of this binary.
    /// Wire form: `"schema_drift"`.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// `FsStore` schema-rejection path once the read-side bump check
    /// lands (it currently only writes the current `SCHEMA_VERSION`).
    SchemaDrift,
    /// Source not in the runtime [`CapabilityProfile`]
    /// (`FetchError::NotEligible`). Wire form: `"capability_not_granted"`.
    CapabilityNotGranted,
    /// Rate limiter rejected the call inside the current window.
    /// Wire form: `"rate_limit_window"`.
    ///
    /// Reserved — no producer wired yet. Will be emitted by
    /// [`RateLimiter`](crate::rate_limiter::RateLimiter) once the
    /// limiter surfaces structured denials (Phase 2+; today the
    /// limiter only sleeps to enforce the window).
    RateLimitWindow,
    /// SSRF guard rejected a private / link-local / cloud-metadata address.
    /// Wire form: `"ssrf_private_address"`.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// future SSRF pre-flight check (post-Phase-1 supply-chain
    /// hardening; the workspace currently relies on rustls + the
    /// HTTPS-only redirect policy to keep the attack surface small).
    SsrfPrivateAddress,
    /// Response Content-Type / magic-byte mismatch (`HttpError::NotAPdf`).
    /// Wire form: `"content_type_mismatch"`.
    ContentTypeMismatch,
}
769
770/// Structured machine-parseable companion to `error.message` for
771/// recoverable denials.
772///
773/// The field is **optional and additive** on the public error envelope —
774/// every previously-shipped `{code, message}` envelope remains valid, and
775/// agents that ignore this struct continue to work. When present, it
776/// carries the concrete parameters an LLM agent can use to plan a recovery
777/// (e.g. "the redirect to `evil.example.com` was denied because it is not
778/// in the crossref allowlist") without text-mining `error.message`.
779///
780/// ## Wire shape
781///
782/// `#[serde(deny_unknown_fields)]`: forward-compatible field additions on
783/// the wire are forbidden by design — adding a field to this struct is a
784/// **breaking** change. This is why the type is **not** `#[non_exhaustive]`
785/// (per `docs/PUBLIC_API.md` §8): both production rules — Rust struct
786/// construction outside the crate AND wire-level extension — must agree.
787///
788/// All fields except `reason` are optional. Producers populate the fields
789/// relevant to the reason and leave the rest at `None`; consumers MUST
790/// tolerate any subset of fields being present. Optional fields are
791/// skipped on serialize but accepted as missing on deserialize via
792/// `#[serde(default, skip_serializing_if = "Option::is_none")]`.
793///
794/// [`Self::expected`] is `Option<Vec<String>>` rather than `Vec<String>`
795/// so the producer can distinguish "this reason has no allowlist channel"
796/// (`None` → field absent on the wire) from "this is the explicit list of
797/// acceptable values, possibly empty" (`Some(vec![])` → `"expected":[]` on
798/// the wire). The previous `Vec<String>` shape collapsed both states
799/// into "field omitted", which an LLM agent could not safely disambiguate.
800///
801/// Mapping table: see ADR-0023 §4, plus the
802/// `From<&HttpError> for Option<DenialContext>` and
803/// `From<&FetchError> for Option<DenialContext>` impls in
804/// [`crate::http`] / [`crate::source`].
805#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
806#[serde(deny_unknown_fields)]
807pub struct DenialContext {
808    /// Closed-enum reason code; the only required field.
809    pub reason: DenialReason,
810    /// Resolver source key (e.g. `"crossref"`) when one is in scope.
811    #[serde(default, skip_serializing_if = "Option::is_none")]
812    pub source: Option<String>,
813    /// Concrete value the producer attempted (host, path, hex magic bytes,
814    /// scheme prefix). Shape is reason-specific; consumers MUST treat it
815    /// as opaque text.
816    #[serde(default, skip_serializing_if = "Option::is_none")]
817    pub attempted: Option<String>,
818    /// Allowlist entries / acceptable values. `Option<Vec<String>>` so the
819    /// producer can distinguish "this reason has no allowlist channel"
820    /// (`None`, field absent on the wire) from "this is the explicit list
821    /// of acceptable values, possibly empty" (`Some(vec![])`, `"expected":[]`
822    /// on the wire). The inner `Vec<String>` is used even when only one
823    /// value is meaningful (e.g. `Some(vec!["%PDF-".into()])`) so the
824    /// format does not have to flip when multiple values are acceptable.
825    #[serde(default, skip_serializing_if = "Option::is_none")]
826    pub expected: Option<Vec<String>>,
827    /// Redirect-chain hop position, 0-indexed. `u8` because the chain is
828    /// hard-capped at [`crate::http`]'s `MAX_REDIRECTS` (= 10) and any
829    /// larger value indicates a bug.
830    #[serde(default, skip_serializing_if = "Option::is_none")]
831    pub hop_index: Option<u8>,
832    /// Size or rate cap value (e.g. [`PDF_MAX_BYTES`]).
833    #[serde(default, skip_serializing_if = "Option::is_none")]
834    pub cap: Option<u64>,
835    /// Observed value (e.g. response bytes when [`Self::cap`] is the byte
836    /// cap, or row schema_version when [`Self::cap`] is the binary's).
837    #[serde(default, skip_serializing_if = "Option::is_none")]
838    pub actual: Option<u64>,
839}
840
// ---------------------------------------------------------------------------
// ResolvedCandidate / ResolveResult (Issue #242)
// ---------------------------------------------------------------------------
844
845/// A candidate paper resolved from a bibliographic citation string.
846#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
847pub struct ResolvedCandidate {
848    /// Resolved DOI.
849    pub doi: String,
850    /// Title of the resolved candidate.
851    pub title: String,
852    /// First author or primary author representation.
853    pub author: String,
854    /// Publication year, if resolved.
855    pub year: Option<i32>,
856    /// Token similarity overlap score in `0.0..=1.0`.
857    pub score: f64,
858    /// Resolving metadata source (e.g. `"crossref"`).
859    pub source: String,
860}
861
862/// The result structure returned by bibliographic citation resolution.
863#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
864pub struct ResolveResult {
865    /// The original query bibliographic citation string.
866    pub query: String,
867    /// Ranked candidate list (highest score first, thresholded to >= 0.5).
868    pub candidates: Vec<ResolvedCandidate>,
869}
870
// ---------------------------------------------------------------------------
// CapabilityProfile (placeholder; full impl in Phase 1)
// ---------------------------------------------------------------------------
874
/// Zero-sized marker type standing for the Open Access tier, which is
/// always enabled. See `docs/CAPABILITY.md`.
#[derive(Clone, Copy, Debug)]
pub struct AlwaysOn;
878
/// Per-session switches for the Tier 2 metadata sources. See
/// `docs/CAPABILITY.md`.
///
/// The derived `Default` leaves every source disabled.
#[non_exhaustive]
#[derive(Clone, Debug, Default)]
pub struct MetadataAccess {
    /// OpenAlex (Phase 4+); turned on via `DOIGET_ENABLE_OPENALEX`.
    pub openalex: bool,
    /// Semantic Scholar (Phase 4+); turned on via `DOIGET_ENABLE_S2`.
    pub semantic_scholar: bool,
    /// DOAJ (Phase 4+); turned on via `DOIGET_ENABLE_DOAJ`.
    pub doaj: bool,
}
890
/// The process-wide rate limits. They are hard-coded and cannot be
/// configured.
///
/// The only way to obtain a value is [`RateLimits::HARD_CODED`]. All
/// fields are `pub(crate)`, which stops downstream code from building a
/// `RateLimits` with other numbers and thereby weakening
/// `docs/LEGAL.md` §6 safeguard 8.
#[non_exhaustive]
#[derive(Clone, Copy, Debug)]
pub struct RateLimits {
    pub(crate) max_concurrent_fetches: u32,
    pub(crate) max_fetches_per_second: f32,
    pub(crate) per_source_backoff_ms: u64,
}
903
904impl RateLimits {
905    /// The single, hard-coded set of rate limits. There is no other public
906    /// constructor — see the type-level docs.
907    pub const HARD_CODED: Self = Self {
908        max_concurrent_fetches: MAX_CONCURRENT_FETCHES,
909        max_fetches_per_second: MAX_FETCHES_PER_SECOND,
910        per_source_backoff_ms: 200,
911    };
912
913    /// Maximum number of concurrent fetches in flight.
914    pub const fn max_concurrent_fetches(&self) -> u32 {
915        self.max_concurrent_fetches
916    }
917
918    /// Maximum fetch attempts per second across all sources.
919    pub const fn max_fetches_per_second(&self) -> f32 {
920        self.max_fetches_per_second
921    }
922
923    /// Per-source backoff in milliseconds between consecutive requests.
924    pub const fn per_source_backoff_ms(&self) -> u64 {
925        self.per_source_backoff_ms
926    }
927}
928
929/// A successful TDM grant.
930///
931/// Carries the validated API key (`docs/CAPABILITY.md` §1) so that the key
932/// flows from the startup capability gate into the source, rather than each
933/// TDM source re-reading the env var at fetch time (issue #153 — an env
934/// mutation between startup and fetch is otherwise undetectable).
935///
936/// The `api_key` field exists only when at least one `tdm-*` Cargo feature
937/// is compiled in (the `secrecy` dependency is `optional = true` and gated
938/// on those features per ADR-0002, so default release binaries contain no
939/// TDM code path at all). The struct is `#[non_exhaustive]`; the
940/// `tdm-*`-gated `api_key` field is therefore additive, not breaking, for
941/// builds that toggle the feature set.
942///
943/// `docs/CAPABILITY.md` §1 specifies the type as `Secret<String>`; that is
944/// the `secrecy` 0.9 spelling. The workspace pins `secrecy` 0.10, whose
945/// equivalent owned-string secret type is `secrecy::SecretString`
946/// (`= SecretBox<str>`). CAPABILITY.md §1 has been updated to match the
947/// 0.10 API. `Debug` redacts the value.
948///
949/// Implements `Default` so in-crate test fixtures using
950/// `TdmGrant { agree_env_var: ..., ..Default::default() }` keep compiling;
951/// the default `api_key` is an empty secret.
952#[derive(Debug, Clone)]
953#[non_exhaustive]
954pub struct TdmGrant {
955    /// The publisher API key, validated present at startup by
956    /// [`CapabilityProfile::from_env`]. Wrapped in
957    /// `secrecy::SecretString` so `Debug` never prints it; use
958    /// `secrecy::ExposeSecret::expose_secret` at the point of use.
959    ///
960    /// Only present when a `tdm-*` feature is compiled in (see the
961    /// type-level docs and ADR-0002).
962    #[cfg(any(
963        feature = "tdm-elsevier",
964        feature = "tdm-aps",
965        feature = "tdm-springer"
966    ))]
967    pub api_key: secrecy::SecretString,
968    /// Which env var the user used to acknowledge the publisher's ToS.
969    pub agree_env_var: String,
970    /// When the agreement env var was first observed at startup.
971    pub agreed_at: chrono::DateTime<chrono::Utc>,
972}
973
974impl Default for TdmGrant {
975    fn default() -> Self {
976        Self {
977            #[cfg(any(
978                feature = "tdm-elsevier",
979                feature = "tdm-aps",
980                feature = "tdm-springer"
981            ))]
982            api_key: secrecy::SecretString::from(String::new()),
983            agree_env_var: String::new(),
984            agreed_at: chrono::Utc::now(),
985        }
986    }
987}
988
989/// Runtime gate for which sources may be invoked. See `docs/CAPABILITY.md`.
990///
991/// Marked `#[non_exhaustive]` so adding new capability classes is non-breaking.
992/// Pattern-match only against the documented variants and use a wildcard arm.
993///
994/// **Construction**: external callers use [`CapabilityProfile::from_env()`].
995/// Struct-literal construction is blocked outside this crate by
996/// `#[non_exhaustive]`; this is intentional — the type's safety guarantees
997/// rely on the resolution rules in `from_env`. `Default` is **not yet**
998/// implemented; Phase 1 will add it once the field set stabilizes.
999#[derive(Debug, Clone)]
1000#[non_exhaustive]
1001pub struct CapabilityProfile {
1002    /// Tier 1 OA sources are always permitted.
1003    pub oa: AlwaysOn,
1004    /// Tier 2 metadata access (Phase 4+).
1005    pub metadata: MetadataAccess,
1006    /// Tier 3 grants are populated only when both env var and feature compile-in are set.
1007    pub tdm_elsevier: Option<TdmGrant>,
1008    /// Tier 3 grants are populated only when both env var and feature compile-in are set.
1009    pub tdm_aps: Option<TdmGrant>,
1010    /// Tier 3 grants are populated only when both env var and feature compile-in are set.
1011    pub tdm_springer: Option<TdmGrant>,
1012    /// Hard-coded rate limits for this process.
1013    pub rate_limits: RateLimits,
1014}
1015
1016/// Errors that can arise during `CapabilityProfile::from_env`.
1017#[derive(Debug, thiserror::Error)]
1018pub enum CapabilityError {
1019    /// User set the agree env var but provided no key. See `docs/CAPABILITY.md` §2.
1020    #[error("env {agree_var} is set but {key_var} is missing")]
1021    AgreedButNoKey {
1022        /// The agreement env var the user set.
1023        agree_var: String,
1024        /// The key env var that should accompany it.
1025        key_var: String,
1026    },
1027    /// Key env var is set but user has not agreed. See `docs/CAPABILITY.md` §2.
1028    #[error("key for {agree_var} is present but {agree_var} is not set to '1'")]
1029    KeyButNotAgreed {
1030        /// The agreement env var the user must set to `1` before the key takes effect.
1031        agree_var: String,
1032    },
1033}
1034
1035impl CapabilityProfile {
1036    /// Read the runtime profile from environment variables.
1037    ///
1038    /// Implements the resolution algorithm specified in
1039    /// [`docs/CAPABILITY.md`](../../../docs/CAPABILITY.md) §2.
1040    ///
1041    /// # Tier 1 (Open Access)
1042    ///
1043    /// Always permitted; not gated on any env var or feature.
1044    ///
1045    /// # Tier 2 (metadata)
1046    ///
1047    /// Each metadata source becomes available when its env var is set
1048    /// (presence-checked, value ignored) **and** the `metadata` Cargo feature
1049    /// was compiled in. If the env var is set but the feature is not compiled
1050    /// in, a `tracing::warn!` is emitted and the source is left disabled —
1051    /// this is not an error so that users can move binaries between machines
1052    /// (or switch feature sets between cargo invocations) without breaking
1053    /// startup. See `docs/CAPABILITY.md` §3 for the env var list.
1054    ///
1055    /// # Tier 3 (TDM)
1056    ///
1057    /// For each publisher in `{ELSEVIER, APS, SPRINGER}`, the
1058    /// `DOIGET_AGREE_TDM_<X>` agreement env var is paired with
1059    /// `DOIGET_KEY_<X>`. Resolution rules (per `docs/CAPABILITY.md` §2):
1060    ///
1061    /// - both unset → `tdm_<x> = None` (no error);
1062    /// - `agree == "1"` and key set → `Some(TdmGrant { .. })` (subject to the
1063    ///   feature gate below);
1064    /// - `agree == "1"` and key unset → [`CapabilityError::AgreedButNoKey`];
1065    /// - key set but `agree` unset (or `agree != "1"`) →
1066    ///   [`CapabilityError::KeyButNotAgreed`].
1067    ///
1068    /// When both env vars are set correctly **but** the corresponding
1069    /// `tdm-<x>` Cargo feature is not compiled in, this function emits a
1070    /// `tracing::warn!` and sets the grant to `None` rather than returning an
1071    /// error — same rationale as for the Tier 2 warn-and-skip behavior.
1072    ///
1073    /// # Precondition: tracing subscriber must be installed first
1074    ///
1075    /// Warn breadcrumbs are delivered via `tracing::warn!`. Callers MUST
1076    /// install a `tracing-subscriber` (or equivalent) **before** invoking
1077    /// this function, otherwise warnings are silently dropped. The
1078    /// `doiget-cli` binary does this in `main.rs`.
1079    ///
1080    /// # Errors
1081    ///
1082    /// Returns [`CapabilityError::AgreedButNoKey`] or
1083    /// [`CapabilityError::KeyButNotAgreed`] when the TDM env-var pair for any
1084    /// publisher is misconfigured. See the variant docs for the precise
1085    /// trigger conditions.
1086    ///
1087    /// # Note on `api_key` storage
1088    ///
1089    /// When a `tdm-*` feature is compiled in, [`TdmGrant`] carries the
1090    /// validated key as `secrecy::SecretString` (issue #153). The key is
1091    /// read exactly once here, at startup; TDM sources consume it from the
1092    /// grant and never re-read the env var at fetch time. This makes the
1093    /// grant a true startup attestation — an env mutation between startup
1094    /// and fetch can no longer silently change the credential in flight.
1095    /// See the [`TdmGrant`] doc-comment and `docs/CAPABILITY.md` §1/§2.
1096    pub fn from_env() -> Result<Self, CapabilityError> {
1097        // Issue #153: the validated API key is now threaded through
1098        // `TdmGrant` (as `secrecy::SecretString`, behind the `tdm-*`
1099        // features) by `resolve_tdm_grant` below — sources no longer
1100        // re-read the key env var at fetch time. See the `TdmGrant`
1101        // doc-comment and `docs/CAPABILITY.md` §1/§2.
1102
1103        // -- Tier 2 metadata -------------------------------------------------
1104        let metadata = MetadataAccess {
1105            openalex: resolve_metadata_flag(
1106                "DOIGET_ENABLE_OPENALEX",
1107                "metadata",
1108                cfg!(feature = "metadata"),
1109            ),
1110            semantic_scholar: resolve_metadata_flag(
1111                "DOIGET_ENABLE_S2",
1112                "metadata",
1113                cfg!(feature = "metadata"),
1114            ),
1115            doaj: resolve_metadata_flag(
1116                "DOIGET_ENABLE_DOAJ",
1117                "metadata",
1118                cfg!(feature = "metadata"),
1119            ),
1120        };
1121
1122        // -- Tier 3 TDM grants ----------------------------------------------
1123        let tdm_elsevier = resolve_tdm_grant(
1124            "DOIGET_AGREE_TDM_ELSEVIER",
1125            "DOIGET_KEY_ELSEVIER",
1126            "tdm-elsevier",
1127            cfg!(feature = "tdm-elsevier"),
1128        )?;
1129        let tdm_aps = resolve_tdm_grant(
1130            "DOIGET_AGREE_TDM_APS",
1131            "DOIGET_KEY_APS",
1132            "tdm-aps",
1133            cfg!(feature = "tdm-aps"),
1134        )?;
1135        let tdm_springer = resolve_tdm_grant(
1136            "DOIGET_AGREE_TDM_SPRINGER",
1137            "DOIGET_KEY_SPRINGER",
1138            "tdm-springer",
1139            cfg!(feature = "tdm-springer"),
1140        )?;
1141
1142        Ok(Self {
1143            oa: AlwaysOn,
1144            metadata,
1145            tdm_elsevier,
1146            tdm_aps,
1147            tdm_springer,
1148            rate_limits: RateLimits::HARD_CODED,
1149        })
1150    }
1151}
1152
1153/// Resolve a Tier 2 metadata flag from its env var and compile-in feature.
1154///
1155/// Returns `true` only when both the env var is present and the feature is
1156/// compiled in. When the env var is set without the feature, emits a
1157/// `tracing::warn!` and returns `false` — see [`CapabilityProfile::from_env`]
1158/// for the rationale (binaries may move between hosts / feature sets).
1159fn resolve_metadata_flag(env_var: &str, feature: &str, feature_enabled: bool) -> bool {
1160    let env_set = std::env::var_os(env_var).is_some();
1161    match (env_set, feature_enabled) {
1162        (true, true) => true,
1163        (true, false) => {
1164            tracing::warn!(
1165                env_var,
1166                feature,
1167                "{} is set but feature {} was not compiled in; the source will be unavailable",
1168                env_var,
1169                feature
1170            );
1171            false
1172        }
1173        (false, _) => false,
1174    }
1175}
1176
1177/// Resolve a Tier 3 TDM grant from the `agree`/`key` env-var pair and the
1178/// per-publisher Cargo feature.
1179///
1180/// Implements the rules in `docs/CAPABILITY.md` §2:
1181///
1182/// - both unset → `Ok(None)`.
1183/// - `agree == "1"` and `key` set → `Ok(Some(TdmGrant { .. }))` (when the
1184///   feature is enabled), or warn-and-`Ok(None)` (when the feature is not
1185///   compiled in).
1186/// - `agree == "1"` and `key` unset →
1187///   [`CapabilityError::AgreedButNoKey`].
1188/// - `key` set and `agree` unset OR `agree` set to anything other than `"1"`
1189///   → [`CapabilityError::KeyButNotAgreed`].
1190fn resolve_tdm_grant(
1191    agree_var: &str,
1192    key_var: &str,
1193    feature: &str,
1194    feature_enabled: bool,
1195) -> Result<Option<TdmGrant>, CapabilityError> {
1196    // `agree` is "agreed" iff the value is exactly the literal "1"; any other
1197    // value (including "true", "yes", empty) is treated as not-agreed per
1198    // `docs/CAPABILITY.md` §2.
1199    let agree_raw = std::env::var(agree_var).ok();
1200    let agreed = matches!(agree_raw.as_deref(), Some("1"));
1201    let agree_present = agree_raw.is_some();
1202    // Read the key value once, at startup, so the validated key flows
1203    // through `TdmGrant` and sources never re-read the env (issue #153).
1204    // An empty value is treated as "not set" — an empty API key cannot
1205    // authenticate, and silently constructing a grant around it would
1206    // mask the misconfiguration the AgreedButNoKey rule exists to surface.
1207    let key_value = std::env::var(key_var).ok().filter(|v| !v.is_empty());
1208
1209    match (agreed, agree_present, key_value) {
1210        (true, _, Some(key)) => {
1211            if feature_enabled {
1212                Ok(Some(build_tdm_grant(agree_var, key)))
1213            } else {
1214                // `key` is dropped here; under no-tdm builds it is the only
1215                // consumer of the owned `String`, which is intended.
1216                let _ = key;
1217                tracing::warn!(
1218                    env_var = agree_var,
1219                    feature,
1220                    "{} is set but feature {} was not compiled in; the source will be unavailable",
1221                    agree_var,
1222                    feature
1223                );
1224                Ok(None)
1225            }
1226        }
1227        (true, _, None) => Err(CapabilityError::AgreedButNoKey {
1228            agree_var: agree_var.to_string(),
1229            key_var: key_var.to_string(),
1230        }),
1231        // agree set to non-"1", key also set: KeyButNotAgreed (the key would
1232        // otherwise authorize the source without an explicit agreement).
1233        (false, true, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1234            agree_var: agree_var.to_string(),
1235        }),
1236        // agree unset, key set: KeyButNotAgreed (same rule).
1237        (false, false, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1238            agree_var: agree_var.to_string(),
1239        }),
1240        // agree set to non-"1" and no key: treat as no-grant. The user
1241        // expressed something but did not opt in and provided no credential,
1242        // so silent skip is the safe default (no source enabled).
1243        (false, true, None) => Ok(None),
1244        // Neither env var set: no grant, no error.
1245        (false, false, None) => Ok(None),
1246    }
1247}
1248
1249/// Construct a [`TdmGrant`] from the validated agreement var and key value.
1250///
1251/// Split out so the `tdm-*`-gated `api_key` field is populated in exactly
1252/// one place. When no `tdm-*` feature is compiled in the `key` is consumed
1253/// (dropped) here — the grant is still produced so that startup attestation
1254/// behavior (the warn-and-skip path) does not change shape between feature
1255/// sets.
1256fn build_tdm_grant(agree_var: &str, key: String) -> TdmGrant {
1257    #[cfg(any(
1258        feature = "tdm-elsevier",
1259        feature = "tdm-aps",
1260        feature = "tdm-springer"
1261    ))]
1262    {
1263        TdmGrant {
1264            api_key: secrecy::SecretString::from(key),
1265            agree_env_var: agree_var.to_string(),
1266            agreed_at: chrono::Utc::now(),
1267        }
1268    }
1269    #[cfg(not(any(
1270        feature = "tdm-elsevier",
1271        feature = "tdm-aps",
1272        feature = "tdm-springer"
1273    )))]
1274    {
1275        let _ = key;
1276        TdmGrant {
1277            agree_env_var: agree_var.to_string(),
1278            agreed_at: chrono::Utc::now(),
1279        }
1280    }
1281}
1282
// ---------------------------------------------------------------------------
// Tests — one smoke test per legally-load-bearing constant. See
// `docs/LEGAL.md` §6 safeguard 8 and `docs/PHASES.md` §4. These also keep the
// `cargo test --workspace` job from being a false-green during Phase 0.
// ---------------------------------------------------------------------------
1288
1289// `expect`/`unwrap` are idiomatic in tests where panics double as assertions.
1290// The workspace lints deny them in production code; relax for the test module
1291// only.
1292#[cfg(test)]
1293#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1294mod tests {
1295    use super::*;
1296
1297    #[test]
1298    fn rate_limits_hard_coded_match_legal_safeguards() {
1299        // docs/LEGAL.md §6 safeguard 8 names these exact values.
1300        assert_eq!(RateLimits::HARD_CODED.max_concurrent_fetches(), 5);
1301        assert!((RateLimits::HARD_CODED.max_fetches_per_second() - 5.0).abs() < f32::EPSILON);
1302        assert_eq!(RateLimits::HARD_CODED.per_source_backoff_ms(), 200);
1303    }
1304
1305    #[test]
1306    fn batch_size_caps_match_security_doc() {
1307        // docs/SECURITY.md §1.4 + docs/MCP_TOOLS.md.
1308        assert_eq!(MCP_BATCH_MAX_SIZE, 100);
1309        assert_eq!(MCP_QUEUE_DEPTH_MAX, 100);
1310        assert_eq!(DOI_SUFFIX_MAX_LEN, 256);
1311        assert_eq!(MCP_STDIN_EOF_SHUTDOWN_SEC, 5);
1312        // Slice 2: spec-language alias for MCP_BATCH_MAX_SIZE must
1313        // numerically agree with the original constant.
1314        assert_eq!(MAX_BATCH_REFS, MCP_BATCH_MAX_SIZE);
1315    }
1316
1317    #[test]
1318    fn schema_version_is_pinned_to_1_0() {
1319        // docs/STORE.md §3 — Phase 0/1 writes 1.0 exactly.
1320        // A bump to 1.1 (minor, backward-compat additions) requires updating
1321        // both this test and the cross-tool compat fixtures simultaneously.
1322        assert_eq!(SCHEMA_VERSION, "1.0");
1323    }
1324
1325    // -----------------------------------------------------------------
1326    // CapabilityProfile::from_env — Phase 1 resolution algorithm tests.
1327    //
1328    // These tests mutate process-global env state via std::env::set_var /
1329    // remove_var, so each test holds an `EnvGuard` RAII drop guard that
1330    // captures the pre-test value of every env var it touches and restores
1331    // it on drop (even on panic). They also use `#[serial_test::serial]` so
1332    // that no two tests in this module touch env state concurrently — the
1333    // workspace's test runner defaults to multi-threaded.
1334    //
1335    // Spec: docs/CAPABILITY.md §2 (resolution algorithm) and §3 (env var
1336    // reference table).
1337    // -----------------------------------------------------------------
1338
1339    /// RAII guard that captures the prior value of an env var on construction
1340    /// and restores it on drop. Use one guard per touched var per test.
1341    struct EnvGuard {
1342        var: &'static str,
1343        prior: Option<std::ffi::OsString>,
1344    }
1345
1346    impl EnvGuard {
1347        /// Capture and clear `var`. Use `set` afterwards to install a value.
1348        fn unset(var: &'static str) -> Self {
1349            let prior = std::env::var_os(var);
1350            // SAFETY (env mutation): tests are serialized via
1351            // `#[serial_test::serial]`. `remove_var` is sound when no other
1352            // thread reads or writes the environment concurrently.
1353            std::env::remove_var(var);
1354            EnvGuard { var, prior }
1355        }
1356
1357        /// Capture, then set `var` to `value`.
1358        fn set(var: &'static str, value: &str) -> Self {
1359            let prior = std::env::var_os(var);
1360            std::env::set_var(var, value);
1361            EnvGuard { var, prior }
1362        }
1363    }
1364
1365    impl Drop for EnvGuard {
1366        fn drop(&mut self) {
1367            match &self.prior {
1368                Some(v) => std::env::set_var(self.var, v),
1369                None => std::env::remove_var(self.var),
1370            }
1371        }
1372    }
1373
1374    /// Convenience: unset every Tier 2 / Tier 3 env var the resolution
1375    /// algorithm reads, returning a vector of guards that restore them on
1376    /// drop. Callers can then `EnvGuard::set` individual vars on top.
1377    fn unset_all_capability_env_vars() -> Vec<EnvGuard> {
1378        [
1379            "DOIGET_ENABLE_OPENALEX",
1380            "DOIGET_ENABLE_S2",
1381            "DOIGET_ENABLE_DOAJ",
1382            "DOIGET_AGREE_TDM_ELSEVIER",
1383            "DOIGET_KEY_ELSEVIER",
1384            "DOIGET_AGREE_TDM_APS",
1385            "DOIGET_KEY_APS",
1386            "DOIGET_AGREE_TDM_SPRINGER",
1387            "DOIGET_KEY_SPRINGER",
1388        ]
1389        .iter()
1390        .map(|v| EnvGuard::unset(v))
1391        .collect()
1392    }
1393
1394    #[test]
1395    #[serial_test::serial]
1396    fn from_env_no_env_vars_set_returns_tier_1_only() {
1397        // Rule: with every relevant env var unset, the resolved profile has
1398        // all TDM grants `None` and all metadata flags `false`. Hard-coded
1399        // rate limits still apply. (Replaces the old Phase 0 stub test.)
1400        let _g = unset_all_capability_env_vars();
1401
1402        let p = CapabilityProfile::from_env().expect("clean env never errors");
1403        assert!(p.tdm_elsevier.is_none());
1404        assert!(p.tdm_aps.is_none());
1405        assert!(p.tdm_springer.is_none());
1406        assert!(!p.metadata.openalex);
1407        assert!(!p.metadata.semantic_scholar);
1408        assert!(!p.metadata.doaj);
1409        assert_eq!(p.rate_limits.max_concurrent_fetches(), 5);
1410    }
1411
1412    #[test]
1413    #[serial_test::serial]
1414    fn from_env_no_tdm_returns_tier_1_profile() {
1415        // Rule (CAPABILITY.md §2): with every TDM env var unset, all
1416        // `tdm_*` fields are `None` and no error is produced.
1417        let _g = unset_all_capability_env_vars();
1418
1419        let p = CapabilityProfile::from_env().expect("no TDM env -> Ok");
1420        assert!(p.tdm_elsevier.is_none());
1421        assert!(p.tdm_aps.is_none());
1422        assert!(p.tdm_springer.is_none());
1423    }
1424
1425    #[test]
1426    #[serial_test::serial]
1427    fn from_env_agreed_but_no_key_errs() {
1428        // Rule (CAPABILITY.md §2): agree=1 + key unset -> AgreedButNoKey.
1429        let _g = unset_all_capability_env_vars();
1430        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1431
1432        let result = CapabilityProfile::from_env();
1433        match result {
1434            Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
1435                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1436                assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
1437            }
1438            other => panic!("expected AgreedButNoKey, got {:?}", other),
1439        }
1440    }
1441
1442    #[test]
1443    #[serial_test::serial]
1444    fn from_env_agreed_but_empty_key_errs() {
1445        // Security-adjacent (PR #161 review): an *empty* key string is
1446        // treated as "not set" by `resolve_tdm_grant`. With agree=1 and
1447        // DOIGET_KEY_ELSEVIER="" the misconfiguration must surface as
1448        // AgreedButNoKey, not silently build a grant around an empty
1449        // secret that could never authenticate.
1450        let _g = unset_all_capability_env_vars();
1451        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1452        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
1453
1454        let result = CapabilityProfile::from_env();
1455        match result {
1456            Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
1457                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1458                assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
1459            }
1460            other => panic!("expected AgreedButNoKey for empty key, got {:?}", other),
1461        }
1462    }
1463
1464    #[test]
1465    #[serial_test::serial]
1466    fn from_env_empty_key_without_agree_is_no_grant() {
1467        // Security-adjacent (PR #161 review): an empty key with the
1468        // agree var unset is indistinguishable from "no key at all".
1469        // It must resolve to Ok(None) (no grant, no error) — an empty
1470        // string must NOT trip the KeyButNotAgreed leaked-credential
1471        // rule, since there is no credential.
1472        let _g = unset_all_capability_env_vars();
1473        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
1474
1475        let p = CapabilityProfile::from_env()
1476            .expect("empty key + agree unset must be Ok(None), not an error");
1477        assert!(
1478            p.tdm_elsevier.is_none(),
1479            "empty DOIGET_KEY_ELSEVIER with no agree var must yield no grant"
1480        );
1481        assert!(p.tdm_aps.is_none());
1482        assert!(p.tdm_springer.is_none());
1483    }
1484
1485    #[test]
1486    #[serial_test::serial]
1487    fn from_env_key_but_not_agreed_errs() {
1488        // Rule (CAPABILITY.md §2): key set + agree unset -> KeyButNotAgreed.
1489        // A leaked DOIGET_KEY_ELSEVIER must not silently enable a source.
1490        let _g = unset_all_capability_env_vars();
1491        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1492
1493        let result = CapabilityProfile::from_env();
1494        match result {
1495            Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
1496                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1497            }
1498            other => panic!("expected KeyButNotAgreed, got {:?}", other),
1499        }
1500    }
1501
1502    #[test]
1503    #[serial_test::serial]
1504    fn from_env_agree_not_one_errs() {
1505        // Rule (CAPABILITY.md §2): the agree var must be exactly "1". Any
1506        // other value (here: "true") is treated as not-agreed; combined
1507        // with a key set, that triggers KeyButNotAgreed.
1508        let _g = unset_all_capability_env_vars();
1509        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "true");
1510        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1511
1512        let result = CapabilityProfile::from_env();
1513        match result {
1514            Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
1515                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1516            }
1517            other => panic!("expected KeyButNotAgreed, got {:?}", other),
1518        }
1519    }
1520
1521    #[test]
1522    #[serial_test::serial]
1523    fn from_env_both_set_correctly_returns_grant() {
1524        // Rule (CAPABILITY.md §2): agree=1 + key set -> Some(TdmGrant) when
1525        // the corresponding feature is compiled in; else None (warn-and-skip).
1526        // The feature gate for elsevier is `tdm-elsevier`; this test asserts
1527        // both branches via `cfg!`.
1528        let _g = unset_all_capability_env_vars();
1529        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1530        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1531
1532        let p = CapabilityProfile::from_env().expect("agree=1 + key -> Ok");
1533
1534        if cfg!(feature = "tdm-elsevier") {
1535            let grant = p
1536                .tdm_elsevier
1537                .as_ref()
1538                .expect("feature tdm-elsevier compiled in -> Some(TdmGrant)");
1539            assert_eq!(grant.agree_env_var, "DOIGET_AGREE_TDM_ELSEVIER");
1540            // Issue #153 / PR #161 review: prove the key was actually
1541            // threaded into TdmGrant::api_key at startup (not just that
1542            // the agree var was recorded). The field is cfg-gated to
1543            // the same `tdm-*` set as the assertion below, so gate the
1544            // check identically.
1545            #[cfg(any(
1546                feature = "tdm-elsevier",
1547                feature = "tdm-aps",
1548                feature = "tdm-springer"
1549            ))]
1550            {
1551                use secrecy::ExposeSecret as _;
1552                assert_eq!(
1553                    grant.api_key.expose_secret(),
1554                    "sk-test",
1555                    "the DOIGET_KEY_ELSEVIER value must be threaded into \
1556                     TdmGrant::api_key (issue #153)"
1557                );
1558            }
1559        } else {
1560            assert!(
1561                p.tdm_elsevier.is_none(),
1562                "feature tdm-elsevier NOT compiled in -> None (warn-and-skip)"
1563            );
1564        }
1565    }
1566
1567    #[test]
1568    #[serial_test::serial]
1569    fn from_env_metadata_env_warns_without_feature() {
1570        // Rule (CAPABILITY.md §2): metadata env var without the `metadata`
1571        // feature -> source disabled (warn-and-skip, not an error).
1572        // We don't capture the tracing warn here; we just assert the field
1573        // is `false` when the feature is absent and `true` when present.
1574        let _g = unset_all_capability_env_vars();
1575        let _enable = EnvGuard::set("DOIGET_ENABLE_OPENALEX", "1");
1576
1577        let p = CapabilityProfile::from_env().expect("metadata env never errors");
1578
1579        if cfg!(feature = "metadata") {
1580            assert!(p.metadata.openalex);
1581        } else {
1582            assert!(!p.metadata.openalex);
1583        }
1584    }
1585
    // -----------------------------------------------------------------
    // Safekey reference vectors (docs/SAFEKEY.md §3, NORMATIVE).
    //
    // The vectors.json file is the binding cross-tool contract with
    // BiblioFetch.jl: every entry MUST round-trip identically through
    // both implementations. Earlier phases shipped a 13-entry subset
    // while the full set was gated on the BiblioFetch.jl pre-flight
    // (ADR-0007); `safekey_matches_reference_vectors` now requires the
    // fixture to hold exactly 100 entries.
    //
    // The vector test deliberately does not go through `Ref::parse`: it
    // branches on the input prefix (`doi:` / `arxiv:`) and constructs
    // the variant directly via the in-crate `pub(crate)` tuple
    // constructor, so it checks the derivation algorithm rather than
    // parser validation.
    // -----------------------------------------------------------------
1600
1601    #[derive(Deserialize)]
1602    struct SafekeyVector {
1603        input: String,
1604        expected: String,
1605    }
1606
1607    #[derive(Deserialize)]
1608    struct SafekeyVectorFile {
1609        vectors: Vec<SafekeyVector>,
1610    }
1611
1612    /// In-crate test helper: build a `Ref` from the user-facing form used
1613    /// in the vectors file, by stripping the `doi:` / `arxiv:` URI scheme
1614    /// and wrapping the remainder. This bypasses validation; it is fine
1615    /// here because the vectors are hand-curated and the test asserts the
1616    /// derivation algorithm, not parser semantics.
1617    fn ref_from_vector_input(input: &str) -> Ref {
1618        if let Some(rest) = input.strip_prefix("doi:") {
1619            Ref::Doi(Doi(rest.to_string()))
1620        } else if let Some(rest) = input.strip_prefix("arxiv:") {
1621            Ref::Arxiv(ArxivId(rest.to_string()))
1622        } else {
1623            panic!(
1624                "vectors.json entry has unknown ref scheme (expected doi: or arxiv: prefix): {}",
1625                input
1626            );
1627        }
1628    }
1629
1630    #[test]
1631    fn safekey_matches_reference_vectors() {
1632        // include_str! resolves relative to the file containing this macro
1633        // call (crates/doiget-core/src/lib.rs), so we go up three levels
1634        // to reach the workspace root, then down to tests/fixtures.
1635        let raw = include_str!("../../../tests/fixtures/safekey/vectors.json");
1636        let parsed: SafekeyVectorFile =
1637            serde_json::from_str(raw).expect("vectors.json is valid JSON matching schema");
1638
1639        // Phase 0 final ships the full NORMATIVE 100-entry set
1640        // (docs/SAFEKEY.md §5). The fixture is the binding cross-tool
1641        // contract with BiblioFetch.jl; tightening the count guard to
1642        // `== 100` ensures the set cannot silently grow or shrink without
1643        // a coordinated ADR bump (per docs/SAFEKEY.md status block).
1644        assert_eq!(
1645            parsed.vectors.len(),
1646            100,
1647            "vectors.json MUST be exactly 100 entries (NORMATIVE per docs/SAFEKEY.md §5); got {}",
1648            parsed.vectors.len()
1649        );
1650
1651        let mut failures: Vec<String> = Vec::new();
1652        for v in &parsed.vectors {
1653            let r = ref_from_vector_input(&v.input);
1654            let got = r.safekey().as_str().to_string();
1655            if got != v.expected {
1656                failures.push(format!(
1657                    "input={:?}\n  expected={:?}\n  got     ={:?}",
1658                    v.input, v.expected, got
1659                ));
1660            }
1661        }
1662
1663        assert!(
1664            failures.is_empty(),
1665            "{}/{} safekey reference vectors failed:\n{}",
1666            failures.len(),
1667            parsed.vectors.len(),
1668            failures.join("\n")
1669        );
1670    }
1671
1672    #[test]
1673    fn safekey_truncates_long_inputs_with_sha256_suffix() {
1674        // Construct a synthetic DOI whose suffix produces a `trimmed` longer than
1675        // 192 chars after step 3. 220 ASCII-safe chars + the `doi_10.1234/`
1676        // prefix easily exceeds 192. The resulting key must be exactly 201 chars:
1677        // 192 (trimmed prefix) + 1 (`_` separator) + 8 (hex of first 4 bytes of
1678        // SHA-256(raw)). Per docs/SAFEKEY.md §3 step 5.
1679        let suffix = "a".repeat(220);
1680        let doi = Doi(format!("10.1234/{}", suffix));
1681        let key = Ref::Doi(doi).safekey();
1682        let s = key.as_str();
1683
1684        // Shape: <192 ASCII chars from {A-Za-z0-9._-}> + "_" + <8 hex chars>
1685        assert_eq!(
1686            s.len(),
1687            201,
1688            "expected 201-char truncated key, got {}: {}",
1689            s.len(),
1690            s
1691        );
1692        assert_eq!(&s[192..193], "_", "expected '_' separator at byte 192");
1693        let hash_part = &s[193..];
1694        assert_eq!(hash_part.len(), 8, "hash suffix must be 8 hex chars");
1695        assert!(
1696            hash_part
1697                .chars()
1698                .all(|c| c.is_ascii_hexdigit() && !c.is_ascii_uppercase()),
1699            "hash suffix must be lowercase hex: {}",
1700            hash_part
1701        );
1702
1703        // Determinism: same input twice must produce the same key.
1704        let key2 = Ref::Doi(Doi(format!("10.1234/{}", "a".repeat(220)))).safekey();
1705        assert_eq!(s, key2.as_str(), "safekey must be deterministic");
1706
1707        // Hash content: must equal hex(sha256(raw)[..4]) where raw is the
1708        // pre-escape prefixed form per docs/SAFEKEY.md §3 step 5.
1709        use sha2::Digest;
1710        let raw = format!("doi_10.1234/{}", "a".repeat(220));
1711        let expected_hash = {
1712            let digest = sha2::Sha256::digest(raw.as_bytes());
1713            format!(
1714                "{:02x}{:02x}{:02x}{:02x}",
1715                digest[0], digest[1], digest[2], digest[3]
1716            )
1717        };
1718        assert_eq!(
1719            hash_part, expected_hash,
1720            "hash must match SHA-256 of raw form"
1721        );
1722    }
1723
1724    // -----------------------------------------------------------------
1725    // Doi::parse / ArxivId::parse / Ref::parse — Phase 1 W3-A.
1726    // Spec: docs/SECURITY.md §1.1 (input validation). The rejection
1727    // category set is the binding contract; each test case below names
1728    // which rule it exercises in a comment.
1729    // -----------------------------------------------------------------
1730
1731    // ---- Doi::parse happy paths (≥6) --------------------------------
1732
1733    #[test]
1734    fn doi_parse_accepts_bare_canonical_form() {
1735        // Rule: "10.<registrant>/<suffix>" is the canonical bare form.
1736        let d = Doi::parse("10.1234/example").expect("canonical bare DOI");
1737        assert_eq!(d.as_str(), "10.1234/example");
1738    }
1739
1740    #[test]
1741    fn doi_parse_accepts_doi_uri_scheme() {
1742        // Rule: the `doi:` scheme is stripped at construction; as_str
1743        // never carries it (matches docs/SAFEKEY.md §3 step 0).
1744        let d = Doi::parse("doi:10.1234/example").expect("doi: scheme accepted");
1745        assert_eq!(d.as_str(), "10.1234/example");
1746    }
1747
1748    #[test]
1749    fn doi_parse_accepts_complex_real_world_suffix() {
1750        // Rule: suffix charset includes `.`, `(`, `)`, `-`. From a real
1751        // PhysRevLett DOI used elsewhere in the test fixture set.
1752        let d = Doi::parse("10.1103/PhysRevLett.130.200601").expect("real-world PhysRev DOI");
1753        assert_eq!(d.as_str(), "10.1103/PhysRevLett.130.200601");
1754    }
1755
1756    #[test]
1757    fn doi_parse_accepts_parens_in_suffix() {
1758        // Rule: `(` and `)` are explicitly listed in the spec charset.
1759        let d = Doi::parse("10.1016/S0370-1573(98)00122-3").expect("parens in suffix");
1760        assert_eq!(d.as_str(), "10.1016/S0370-1573(98)00122-3");
1761    }
1762
1763    #[test]
1764    fn doi_parse_accepts_nested_slashes_in_suffix() {
1765        // Rule: `/` is a suffix character; only the first `/` is the
1766        // registrant/suffix separator.
1767        let d = Doi::parse("10.1234/foo/bar/baz").expect("nested slashes");
1768        assert_eq!(d.as_str(), "10.1234/foo/bar/baz");
1769    }
1770
1771    #[test]
1772    fn doi_parse_accepts_colon_in_legacy_kluwer_suffix() {
1773        // #194: legacy Kluwer/Springer DOIs (`10.1023/A:NNNNNNNNNN`)
1774        // carry a `:` in the suffix. Real DOI: "Entanglement, Quantum
1775        // Phase Transitions, and DMRG" (Kluwer, 2002).
1776        let d = Doi::parse("10.1023/A:1019601218492").expect("legacy Kluwer colon DOI");
1777        assert_eq!(d.as_str(), "10.1023/A:1019601218492");
1778    }
1779
1780    #[test]
1781    fn doi_parse_accepts_colon_in_edp_jphys_suffix() {
1782        // #194: EDP Sciences / Journal de Physique legacy corpus uses
1783        // `10.1051/jphys:NNNNNNNNNNNNNNNNN`. Real DOIs from the dogfood
1784        // Ising-RG run; both resolve at doi.org and via Crossref.
1785        let d = Doi::parse("10.1051/jphys:0198900500120136500").expect("EDP jphys colon DOI");
1786        assert_eq!(d.as_str(), "10.1051/jphys:0198900500120136500");
1787        let d2 = Doi::parse("doi:10.1051/jphys:0198500460100164500").expect("scheme + colon");
1788        assert_eq!(d2.as_str(), "10.1051/jphys:0198500460100164500");
1789    }
1790
1791    #[test]
1792    fn doi_parse_rejects_semicolon_in_suffix() {
1793        // #194 / ADR-0026: `;` is the natural ASCII neighbor of `:` and
1794        // is explicitly EXCLUDED from the suffix charset extension
1795        // (ADR-0026 §"Out of scope"). This test guards against an
1796        // over-broad `matches!` arm (e.g. an accidental `':'..=';'` range
1797        // typo) re-admitting `;` along with `:`.
1798        let result = Doi::parse("10.1234/foo;bar");
1799        assert!(
1800            matches!(result, Err(RefParseError::InvalidDoiSuffixChar { ch: ';' })),
1801            "expected InvalidDoiSuffixChar with ch=';', got {:?}",
1802            result
1803        );
1804    }
1805
1806    #[test]
1807    fn doi_parse_accepts_suffix_at_max_len_boundary() {
1808        // Rule: a suffix of exactly DOI_SUFFIX_MAX_LEN bytes is accepted;
1809        // 1 byte more is rejected (covered separately below).
1810        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN);
1811        let input = format!("10.1234/{}", suffix);
1812        let d = Doi::parse(&input).expect("suffix at max len");
1813        assert_eq!(d.as_str().len(), "10.1234/".len() + DOI_SUFFIX_MAX_LEN);
1814    }
1815
1816    #[test]
1817    fn doi_parse_uri_scheme_is_case_insensitive() {
1818        // Rule: be lenient on scheme casing; the scheme is stripped
1819        // either way so the stored form is identical.
1820        let d = Doi::parse("DOI:10.1234/example").expect("uppercase scheme");
1821        assert_eq!(d.as_str(), "10.1234/example");
1822    }
1823
1824    // ---- Doi::parse rejection paths (≥6) ----------------------------
1825
1826    #[test]
1827    fn doi_parse_rejects_missing_10_prefix() {
1828        // Rule: must start with "10." literal.
1829        assert_eq!(
1830            Doi::parse("11.1234/example"),
1831            Err(RefParseError::MissingDoiPrefix)
1832        );
1833    }
1834
1835    #[test]
1836    fn doi_parse_rejects_empty_input() {
1837        // Rule: empty inputs are not valid DOIs.
1838        assert_eq!(Doi::parse(""), Err(RefParseError::Empty));
1839    }
1840
1841    #[test]
1842    fn doi_parse_rejects_missing_suffix_separator() {
1843        // Rule: must contain a `/` between registrant and suffix.
1844        assert_eq!(
1845            Doi::parse("10.1234"),
1846            Err(RefParseError::MissingDoiSuffixSeparator)
1847        );
1848    }
1849
1850    #[test]
1851    fn doi_parse_rejects_empty_suffix() {
1852        // Rule: suffix must be non-empty.
1853        assert_eq!(Doi::parse("10.1234/"), Err(RefParseError::EmptyDoiSuffix));
1854    }
1855
1856    #[test]
1857    fn doi_parse_rejects_invalid_registrant_too_short() {
1858        // Rule: registrant must be 4–9 digits.
1859        assert_eq!(
1860            Doi::parse("10.12/example"),
1861            Err(RefParseError::InvalidDoiRegistrant)
1862        );
1863    }
1864
1865    #[test]
1866    fn doi_parse_rejects_non_digit_registrant() {
1867        // Rule: registrant chars must all be ASCII digits.
1868        assert_eq!(
1869            Doi::parse("10.12ab/example"),
1870            Err(RefParseError::InvalidDoiRegistrant)
1871        );
1872    }
1873
1874    #[test]
1875    fn doi_parse_rejects_control_char_in_suffix() {
1876        // Rule (from docs/SECURITY.md §1.1, log-injection mitigation):
1877        // control chars are not in the suffix charset; reject before they
1878        // can reach the provenance log.
1879        let result = Doi::parse("10.1234/foo\nbar");
1880        assert!(
1881            matches!(
1882                result,
1883                Err(RefParseError::InvalidDoiSuffixChar { ch: '\n' })
1884            ),
1885            "got {:?}",
1886            result
1887        );
1888    }
1889
1890    #[test]
1891    fn doi_parse_rejects_suffix_over_max_len() {
1892        // Rule: DOI_SUFFIX_MAX_LEN + 1 bytes is rejected.
1893        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 1);
1894        let input = format!("10.1234/{}", suffix);
1895        let result = Doi::parse(&input);
1896        match result {
1897            Err(RefParseError::DoiSuffixTooLong { len, max }) => {
1898                assert_eq!(len, DOI_SUFFIX_MAX_LEN + 1);
1899                assert_eq!(max, DOI_SUFFIX_MAX_LEN);
1900            }
1901            other => panic!("expected DoiSuffixTooLong, got {:?}", other),
1902        }
1903    }
1904
1905    #[test]
1906    fn doi_parse_rejects_non_ascii_in_suffix() {
1907        // Rule: spec charset is ASCII-only; non-ASCII becomes an
1908        // InvalidDoiSuffixChar (consistent with safekey behavior of
1909        // collapsing such chars to '_', which is a downstream concern).
1910        let result = Doi::parse("10.1234/物理学");
1911        assert!(
1912            matches!(result, Err(RefParseError::InvalidDoiSuffixChar { .. })),
1913            "got {:?}",
1914            result
1915        );
1916    }
1917
1918    // ---- ArxivId::parse happy paths (≥6) ----------------------------
1919
1920    #[test]
1921    fn arxiv_parse_accepts_new_style_4_digit_seq() {
1922        // Rule: new-style YYMM.NNNN (4-digit sequence number).
1923        let a = ArxivId::parse("0704.0001").expect("new-style 4-digit seq");
1924        assert_eq!(a.as_str(), "0704.0001");
1925    }
1926
1927    #[test]
1928    fn arxiv_parse_accepts_new_style_5_digit_seq() {
1929        // Rule: new-style YYMM.NNNNN (5-digit sequence number, post-2015).
1930        let a = ArxivId::parse("2401.12345").expect("new-style 5-digit seq");
1931        assert_eq!(a.as_str(), "2401.12345");
1932    }
1933
1934    #[test]
1935    fn arxiv_parse_accepts_new_style_with_version() {
1936        // Rule: optional `vN` version suffix.
1937        let a = ArxivId::parse("2401.12345v2").expect("with version");
1938        assert_eq!(a.as_str(), "2401.12345v2");
1939    }
1940
1941    #[test]
1942    fn arxiv_parse_accepts_old_style() {
1943        // Rule: old-style subject-class/YYMMNNN.
1944        let a = ArxivId::parse("cond-mat/9501001").expect("old-style cond-mat");
1945        assert_eq!(a.as_str(), "cond-mat/9501001");
1946    }
1947
1948    #[test]
1949    fn arxiv_parse_accepts_old_style_with_subclass_and_version() {
1950        // Rule: old-style subject-class may have a `.XX` two-upper subclass
1951        // and an optional `vN` suffix.
1952        let a = ArxivId::parse("astro-ph.CO/0703123v2").expect("old-style with subclass + version");
1953        assert_eq!(a.as_str(), "astro-ph.CO/0703123v2");
1954    }
1955
1956    #[test]
1957    fn arxiv_parse_accepts_arxiv_uri_scheme() {
1958        // Rule: `arxiv:` / `arXiv:` scheme is stripped at construction.
1959        let a = ArxivId::parse("arxiv:2401.12345").expect("arxiv: scheme");
1960        assert_eq!(a.as_str(), "2401.12345");
1961    }
1962
1963    #[test]
1964    fn arxiv_parse_accepts_arxiv_uri_scheme_mixed_case() {
1965        // Rule: scheme case-insensitive; matches the `arXiv:` form named
1966        // in docs/MCP_TOOLS.md.
1967        let a = ArxivId::parse("arXiv:2401.12345v2").expect("arXiv: scheme");
1968        assert_eq!(a.as_str(), "2401.12345v2");
1969    }
1970
1971    // ---- ArxivId::parse rejection paths (≥6) ------------------------
1972
1973    #[test]
1974    fn arxiv_parse_rejects_empty_input() {
1975        // Rule: empty rejected up-front.
1976        assert_eq!(ArxivId::parse(""), Err(RefParseError::Empty));
1977    }
1978
1979    #[test]
1980    fn arxiv_parse_rejects_no_dot_or_slash() {
1981        // Rule: must contain `.` (new-style) or `/` (old-style).
1982        assert_eq!(
1983            ArxivId::parse("notanarxivid"),
1984            Err(RefParseError::InvalidArxivShape)
1985        );
1986    }
1987
1988    #[test]
1989    fn arxiv_parse_rejects_new_style_wrong_head_length() {
1990        // Rule: head must be exactly 4 digits.
1991        assert_eq!(
1992            ArxivId::parse("240.12345"),
1993            Err(RefParseError::InvalidArxivShape)
1994        );
1995    }
1996
1997    #[test]
1998    fn arxiv_parse_rejects_new_style_seq_too_short() {
1999        // Rule: seq must be 4–5 digits.
2000        assert_eq!(
2001            ArxivId::parse("2401.123"),
2002            Err(RefParseError::InvalidArxivShape)
2003        );
2004    }
2005
2006    #[test]
2007    fn arxiv_parse_rejects_old_style_wrong_id_length() {
2008        // Rule: old-style id is exactly 7 digits.
2009        assert_eq!(
2010            ArxivId::parse("cond-mat/95001"),
2011            Err(RefParseError::InvalidArxivShape)
2012        );
2013    }
2014
2015    #[test]
2016    fn arxiv_parse_rejects_invalid_version_suffix() {
2017        // Rule: version suffix is `v` followed by ≥1 digits, nothing else.
2018        assert_eq!(
2019            ArxivId::parse("2401.12345v"),
2020            Err(RefParseError::InvalidArxivShape)
2021        );
2022    }
2023
2024    #[test]
2025    fn arxiv_parse_rejects_control_char() {
2026        // Rule (docs/SECURITY.md §1.1 log-injection): no control chars.
2027        assert_eq!(
2028            ArxivId::parse("2401.12345\n"),
2029            Err(RefParseError::InvalidArxivShape)
2030        );
2031    }
2032
2033    #[test]
2034    fn arxiv_parse_rejects_non_ascii() {
2035        // Rule: ASCII-only.
2036        assert_eq!(
2037            ArxivId::parse("2401.物理"),
2038            Err(RefParseError::InvalidArxivShape)
2039        );
2040    }
2041
2042    // ---- Ref::parse happy paths (≥6) --------------------------------
2043
2044    #[test]
2045    fn ref_parse_dispatches_doi_scheme_to_doi() {
2046        // Detection rule 1: explicit `doi:` scheme.
2047        match Ref::parse("doi:10.1234/example").expect("doi: dispatched to Doi") {
2048            Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/example"),
2049            other => panic!("expected Ref::Doi, got {:?}", other),
2050        }
2051    }
2052
2053    #[test]
2054    fn ref_parse_dispatches_arxiv_scheme_to_arxiv() {
2055        // Detection rule 2: explicit `arxiv:` scheme.
2056        match Ref::parse("arxiv:2401.12345").expect("arxiv: dispatched to Arxiv") {
2057            Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
2058            other => panic!("expected Ref::Arxiv, got {:?}", other),
2059        }
2060    }
2061
2062    #[test]
2063    fn ref_parse_dispatches_arxiv_mixed_case_scheme() {
2064        // Detection rule 2 (case-insensitive): `arXiv:` form.
2065        match Ref::parse("arXiv:cond-mat/9501001").expect("arXiv: dispatched") {
2066            Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
2067            other => panic!("expected Ref::Arxiv, got {:?}", other),
2068        }
2069    }
2070
2071    #[test]
2072    fn ref_parse_bare_doi_resolves_to_doi() {
2073        // Detection rule 3: bare input starting with `10.` is a DOI.
2074        match Ref::parse("10.1234/foo").expect("bare DOI") {
2075            Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/foo"),
2076            other => panic!("expected Ref::Doi, got {:?}", other),
2077        }
2078    }
2079
2080    #[test]
2081    fn ref_parse_bare_arxiv_new_resolves_to_arxiv() {
2082        // Detection rule 4: bare input not starting with `10.` falls
2083        // through to arXiv. Tests the ambiguous-input branch named in the
2084        // PR brief: `2401.12345` should resolve to ArxivId.
2085        match Ref::parse("2401.12345").expect("bare new-style arXiv") {
2086            Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
2087            other => panic!("expected Ref::Arxiv, got {:?}", other),
2088        }
2089    }
2090
2091    #[test]
2092    fn ref_parse_bare_arxiv_old_resolves_to_arxiv() {
2093        // Detection rule 4: bare old-style arXiv id.
2094        match Ref::parse("cond-mat/9501001").expect("bare old-style arXiv") {
2095            Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
2096            other => panic!("expected Ref::Arxiv, got {:?}", other),
2097        }
2098    }
2099
2100    // ---- Ref::parse rejection paths (≥6) ----------------------------
2101
2102    #[test]
2103    fn ref_parse_rejects_empty() {
2104        // Rule: empty up-front.
2105        assert_eq!(Ref::parse(""), Err(RefParseError::Empty));
2106    }
2107
2108    #[test]
2109    fn ref_parse_doi_scheme_with_invalid_doi_propagates_doi_error() {
2110        // When the scheme is explicit, we surface the parser's error
2111        // verbatim — not a generic "shape mismatch".
2112        assert_eq!(
2113            Ref::parse("doi:10.1234"),
2114            Err(RefParseError::MissingDoiSuffixSeparator)
2115        );
2116    }
2117
2118    #[test]
2119    fn ref_parse_arxiv_scheme_with_invalid_arxiv_propagates_arxiv_error() {
2120        assert_eq!(
2121            Ref::parse("arxiv:notanid"),
2122            Err(RefParseError::InvalidArxivShape)
2123        );
2124    }
2125
2126    #[test]
2127    fn ref_parse_bare_with_10_prefix_uses_doi_errors() {
2128        // Bare `10.…` heuristic: DOI parser is dispatched and its error
2129        // surfaces (here: bad registrant).
2130        assert_eq!(
2131            Ref::parse("10.12/x"),
2132            Err(RefParseError::InvalidDoiRegistrant)
2133        );
2134    }
2135
2136    #[test]
2137    fn ref_parse_bare_without_10_prefix_uses_arxiv_errors() {
2138        // Bare ambiguous fallback: ArxivId parser is dispatched and its
2139        // error surfaces. `1.2.3` is neither a DOI nor an arXiv shape.
2140        assert_eq!(Ref::parse("1.2.3"), Err(RefParseError::InvalidArxivShape));
2141    }
2142
2143    #[test]
2144    fn ref_parse_rejects_doi_scheme_with_oversized_suffix() {
2145        // Length-bound: DOI suffix > DOI_SUFFIX_MAX_LEN through Ref::parse
2146        // surfaces DoiSuffixTooLong, not a generic InvalidArxivShape.
2147        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 5);
2148        let input = format!("doi:10.1234/{}", suffix);
2149        match Ref::parse(&input) {
2150            Err(RefParseError::DoiSuffixTooLong { .. }) => {}
2151            other => panic!("expected DoiSuffixTooLong, got {:?}", other),
2152        }
2153    }
2154
2155    #[test]
2156    fn ref_parse_round_trip_via_serde_preserves_inner_string() {
2157        // Wire-format check: Doi/ArxivId are #[serde(transparent)], and a
2158        // round-trip through Ref::parse → serde_json → Ref must preserve
2159        // the inner identifier. Guards against accidental scheme leakage
2160        // into the stored form.
2161        let r = Ref::parse("doi:10.1234/example").expect("parse ok");
2162        let json = serde_json::to_string(&r).expect("serialize");
2163        // The transparent inner value is the bare identifier (no `doi:`).
2164        assert!(
2165            json.contains("10.1234/example") && !json.contains("doi:"),
2166            "scheme leaked into wire form: {}",
2167            json
2168        );
2169    }
2170
2171    #[test]
2172    fn ref_parse_error_maps_to_invalid_ref_error_code() {
2173        // Public-API contract (docs/PUBLIC_API.md §4): all parse failures
2174        // collapse to ErrorCode::InvalidRef at the public boundary.
2175        let err: ErrorCode = RefParseError::Empty.into();
2176        assert_eq!(err, ErrorCode::InvalidRef);
2177        let err2: ErrorCode = RefParseError::MissingDoiPrefix.into();
2178        assert_eq!(err2, ErrorCode::InvalidRef);
2179    }
2180
2181    // -----------------------------------------------------------------
2182    // DenialReason / DenialContext (ADR-0023) — wire-shape tests.
2183    // -----------------------------------------------------------------
2184
2185    #[test]
2186    fn denial_reason_serializes_snake_case() {
2187        // ADR-0023 §2 / docs/PUBLIC_API.md §8: wire form is snake_case.
2188        let s = serde_json::to_string(&DenialReason::RedirectNotInAllowlist).expect("ser");
2189        assert_eq!(s, "\"redirect_not_in_allowlist\"");
2190        let s = serde_json::to_string(&DenialReason::SizeCapExceeded).expect("ser");
2191        assert_eq!(s, "\"size_cap_exceeded\"");
2192        let s = serde_json::to_string(&DenialReason::ContentTypeMismatch).expect("ser");
2193        assert_eq!(s, "\"content_type_mismatch\"");
2194    }
2195
/// Serializes each listed [`DenialReason`] variant to JSON, parses the
/// text back, and requires the decoded value to equal the original.
#[test]
fn denial_reason_round_trip_via_serde() {
    // The list is meant to cover the whole closed set of variants.
    // NOTE(review): it is maintained by hand — nothing in this test makes
    // the compiler flag a newly added variant, so keep it in sync with the
    // enum definition when a variant is added.
    let variants = [
        DenialReason::RedirectNotInAllowlist,
        DenialReason::InsecureScheme,
        DenialReason::HostInBlockList,
        DenialReason::SizeCapExceeded,
        DenialReason::SchemaDrift,
        DenialReason::CapabilityNotGranted,
        DenialReason::RateLimitWindow,
        DenialReason::SsrfPrivateAddress,
        DenialReason::ContentTypeMismatch,
    ];
    for original in variants {
        let wire = serde_json::to_string(&original).expect("ser");
        let decoded = serde_json::from_str::<DenialReason>(&wire).expect("de");
        assert_eq!(
            decoded, original,
            "round-trip mismatch for {:?} -> {}",
            original, wire
        );
    }
}
2216
/// Round-trips a populated [`DenialContext`] (the redirect-denied example
/// from ADR-0023 §1) through JSON and compares the whole struct, which
/// relies on `DenialContext` implementing `PartialEq`.
#[test]
fn denial_context_round_trips_full_shape() {
    // Allowlist entries reported back to the caller in `expected`.
    let allowlist: Vec<String> = ["api.crossref.org", "*.crossref.org"]
        .iter()
        .map(|host| host.to_string())
        .collect();

    let original = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(allowlist),
        hop_index: Some(1),
        // Size-cap fields do not apply to a redirect denial.
        cap: None,
        actual: None,
    };

    let wire = serde_json::to_string(&original).expect("ser");
    let decoded = serde_json::from_str::<DenialContext>(&wire).expect("de");
    assert_eq!(decoded, original);
}
2239
/// A [`DenialContext`] whose optional fields are all `None` must
/// serialize to an object containing only `reason`: no `None` field may
/// appear on the wire.
#[test]
fn denial_context_serialize_elides_empty_fields() {
    let bare = DenialContext {
        reason: DenialReason::CapabilityNotGranted,
        source: None,
        attempted: None,
        expected: None,
        hop_index: None,
        cap: None,
        actual: None,
    };
    let wire = serde_json::to_string(&bare).expect("ser");
    // Exact text: a single key, nothing else.
    let only_reason = "{\"reason\":\"capability_not_granted\"}";
    assert_eq!(wire, only_reason);
}
2257
/// `expected: Some(vec![])` means "explicit empty allowlist" and must be
/// distinguishable from `expected: None`: it has to appear on the wire as
/// `"expected":[]` and decode back to `Some` of an empty vector.
#[test]
fn denial_context_expected_some_empty_vec_preserves_explicit_empty_allowlist() {
    let empty_allowlist: Vec<String> = Vec::new();
    let ctx = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(empty_allowlist),
        hop_index: None,
        cap: None,
        actual: None,
    };

    let wire = serde_json::to_string(&ctx).expect("ser");
    let has_empty_array = wire.contains("\"expected\":[]");
    assert!(
        has_empty_array,
        "expected:[] must survive on the wire (got: {s})",
        s = wire
    );

    // The decode side must keep the Some/None distinction as well.
    let decoded = serde_json::from_str::<DenialContext>(&wire).expect("de");
    assert_eq!(decoded.expected, Some(Vec::new()));
}
2281
/// Consumer-side contract (ADR-0023 §3): a payload carrying only a subset
/// of the optional fields must still deserialize, with every absent
/// optional field coming back as `None`.
#[test]
fn denial_context_deserialize_tolerates_missing_optional_fields() {
    // Only `reason`, `cap` and `actual` are present in this payload.
    let payload = r#"{"reason":"size_cap_exceeded","cap":104857600,"actual":209715200}"#;
    let DenialContext {
        reason,
        source,
        attempted,
        expected,
        hop_index,
        cap,
        actual,
    } = serde_json::from_str::<DenialContext>(payload).expect("de");

    // Fields that were on the wire keep their values.
    assert_eq!(reason, DenialReason::SizeCapExceeded);
    assert_eq!(cap, Some(104857600));
    assert_eq!(actual, Some(209715200));

    // Fields that were absent decode to `None`.
    assert!(source.is_none());
    assert!(attempted.is_none());
    assert!(expected.is_none());
    assert!(hop_index.is_none());
}
2297
/// Pins the exact JSON text of a full failure envelope (docs/ERRORS.md
/// §3 + §3.1, ADR-0023 §1) carrying a [`DenialContext`], so a change to
/// key order or skip rules anywhere in the chain fails loudly.
///
/// The envelope is assembled with `serde_json::json!`, i.e. as a
/// `serde_json::Value`. Without serde_json's `preserve_order` feature
/// its `Map` keeps keys sorted, and the embedded `DenialContext` is
/// converted to a `Value` too, so every object in the pinned string —
/// including the inner one — has alphabetically ordered keys rather than
/// the struct's field order that a direct `to_string(&DenialContext)`
/// would produce. The pin below therefore depends on `preserve_order`
/// staying disabled for this build.
#[test]
fn full_error_envelope_with_denial_context_serializes_to_pinned_json() {
    let denial = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(vec![
            String::from("api.crossref.org"),
            String::from("*.crossref.org"),
        ]),
        hop_index: Some(1),
        cap: None,
        actual: None,
    };

    // Build the inner `error` object first, then wrap it in the envelope.
    let error_body = serde_json::json!({
        "code": ErrorCode::NetworkError,
        "message": "redirect target evil.example.com not in allowlist for source crossref",
        "denial_context": denial,
    });
    let envelope = serde_json::json!({
        "ok": false,
        "error": error_body,
    });

    let pinned = r#"{"error":{"code":"NETWORK_ERROR","denial_context":{"attempted":"evil.example.com","expected":["api.crossref.org","*.crossref.org"],"hop_index":1,"reason":"redirect_not_in_allowlist","source":"crossref"},"message":"redirect target evil.example.com not in allowlist for source crossref"},"ok":false}"#;
    let rendered = serde_json::to_string(&envelope).expect("serialize envelope");
    assert_eq!(rendered, pinned);
}
2335
/// An unrecognised key on the wire must make [`DenialContext`]
/// deserialization fail (the `deny_unknown_fields` rule from ADR-0023 §3
/// and PUBLIC_API.md §8), so that adding a field remains a breaking
/// change rather than something old consumers silently ignore.
#[test]
fn denial_context_rejects_unknown_fields() {
    // `banana` is not a `DenialContext` field.
    let payload = r#"{"reason":"capability_not_granted","banana":1}"#;
    let outcome = serde_json::from_str::<DenialContext>(payload);
    let rejected = outcome.is_err();
    assert!(
        rejected,
        "deny_unknown_fields must reject 'banana': {:?}",
        // Only evaluated on failure; shows what was wrongly accepted.
        outcome.map(|ctx| ctx.reason),
    );
}
2349}