Skip to main content

doiget_core/
lib.rs

1//! # doiget-core
2//!
3//! Core library for [doiget](https://github.com/sotashimozono/doiget): an Open Access
4//! first paper-fetcher with strict capability gating, fail-closed provenance logging,
5//! and a BiblioFetch.jl-compatible store layout.
6//!
7//! Phase 0 ships only this skeleton. Real implementations land in Phase 1.
8//! See `docs/PUBLIC_API.md` for the semver-locked surface and `docs/ARCHITECTURE.md`
9//! for the high-level design.
10
11#![warn(missing_docs)]
12#![forbid(unsafe_code)]
13
14use serde::{Deserialize, Serialize};
15use sha2::Digest;
16
17// --- Modules ---
18pub mod canonical;
19pub mod dry_run;
20pub mod http;
21pub mod orchestrator;
22pub mod provenance;
23pub mod rate_limiter;
24pub mod source;
25pub mod sources;
26pub mod store;
27
28// Phase 4 citation graph (ADR-0010). Compile-gated by the `citation`
29// Cargo feature, which itself enables the `metadata` feature so the
30// Tier-2 source impls are available.
31#[cfg(feature = "citation")]
32pub mod citation_graph;
33
34// Re-export the canonical-tuple audit-identity types at the crate root
35// per ADR-0024 / `docs/PUBLIC_API.md` §1. The types themselves live in
36// the [`canonical`] submodule.
37pub use crate::canonical::{CanonicalRef, SourceType};
38
39/// Crate version. Used by `doiget-cli --version` and `doiget_health`.
40pub const VERSION: &str = env!("CARGO_PKG_VERSION");
41
/// Schema version stamped into every TOML file this build writes. See
/// `docs/STORE.md` §3.
pub const SCHEMA_VERSION: &str = "1.0";

/// Hard-coded cap on the number of fetches that may be in flight at once.
/// See `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_CONCURRENT_FETCHES: u32 = 5;

/// Hard-coded cap on the fetch start rate, in fetches per second. See
/// `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_FETCHES_PER_SECOND: f32 = 5.0;

/// Largest number of references accepted in one `doiget batch` /
/// `doiget_batch_fetch` call.
pub const MCP_BATCH_MAX_SIZE: usize = 100;

/// Spec-language name for [`MCP_BATCH_MAX_SIZE`] (`docs/MCP_TOOLS.md` §1 /
/// Slice 2 plan).
///
/// Defined *as* [`MCP_BATCH_MAX_SIZE`], so the two values cannot diverge;
/// an internal test additionally pins the equivalence.
pub const MAX_BATCH_REFS: usize = MCP_BATCH_MAX_SIZE;

/// How many MCP requests may wait in the queue once
/// `MAX_CONCURRENT_FETCHES` are already running. Anything beyond that is
/// answered with `ErrorCode::RateLimited`. See `docs/SECURITY.md` §1.4 and
/// `docs/MCP_TOOLS.md`.
pub const MCP_QUEUE_DEPTH_MAX: usize = 100;

/// Seconds the MCP server allows for a graceful shutdown after stdin hits
/// EOF. See ADR-0001 and `docs/MCP_TOOLS.md` §8.
pub const MCP_STDIN_EOF_SHUTDOWN_SEC: u64 = 5;

/// Upper bound, in bytes, on the suffix of a DOI accepted by validation.
/// See `docs/SECURITY.md` §1.1.
pub const DOI_SUFFIX_MAX_LEN: usize = 256;

/// Upper bound, in bytes, on a PDF body the fetcher will accept. See
/// `docs/SECURITY.md` §1.2 (Oversized PDF).
pub const PDF_MAX_BYTES: u64 = 100_000_000;

/// Lifetime, in days, of an entry under `~/.cache/doiget/resolver/`. See
/// `docs/CACHE.md` §3.
pub const RESOLVER_CACHE_TTL_DAYS: u32 = 7;

/// Lifetime, in days, of an entry under `~/.cache/doiget/citations/`. See
/// `docs/CACHE.md` §3.
pub const CITATION_CACHE_TTL_DAYS: u32 = 30;
82
83// ---------------------------------------------------------------------------
84// Ref
85// ---------------------------------------------------------------------------
86
87/// A reference to a paper, either by DOI or arXiv id.
88///
89/// See `docs/SECURITY.md` §1.1 for input-validation rules.
90#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
91#[serde(rename_all = "lowercase", tag = "kind", content = "id")]
92pub enum Ref {
93    /// A DOI (e.g., `10.1234/example`).
94    Doi(Doi),
95    /// An arXiv id (e.g., `2401.12345`).
96    Arxiv(ArxivId),
97}
98
99/// A validated DOI string.
100///
101/// Construct via `Doi::parse(s)` (Phase 1+). The inner field is intentionally
102/// `pub(crate)` to forbid bypass construction; tests inside `doiget-core` may
103/// still use `Doi(s)` for fixture purposes.
104///
105/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"10.1234/example"`.
106#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
107#[serde(transparent)]
108pub struct Doi(pub(crate) String);
109
110/// A validated arXiv id string.
111///
112/// Construct via `ArxivId::parse(s)` (Phase 1+). Inner field is `pub(crate)`.
113///
114/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"2401.12345"`.
115#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
116#[serde(transparent)]
117pub struct ArxivId(pub(crate) String);
118
119impl Doi {
120    /// Returns the DOI as a string slice.
121    pub fn as_str(&self) -> &str {
122        &self.0
123    }
124
125    /// Parses and validates a DOI string per `docs/SECURITY.md` §1.1.
126    ///
127    /// Accepts:
128    /// - Bare DOIs: `10.<registrant>/<suffix>` where `<registrant>` is 4–9
129    ///   digits and `<suffix>` is a non-empty sequence of characters drawn
130    ///   from `[A-Za-z0-9._/():-]` (the `:` covers legacy Kluwer
131    ///   `10.1023/A:NNNN` and EDP Sciences `10.1051/jphys:NNNN` DOIs).
132    /// - The `doi:` URI scheme prefix; it is stripped before validation, so
133    ///   the stored value never carries a scheme. (Matches the convention
134    ///   established in `docs/SAFEKEY.md` §3 step 0.)
135    ///
136    /// Rejects:
137    /// - Inputs missing the literal `10.` prefix (after optional scheme
138    ///   strip).
139    /// - Suffixes longer than [`DOI_SUFFIX_MAX_LEN`] bytes.
140    /// - Empty suffixes.
141    /// - Any character outside the suffix charset above (including control
142    ///   characters, whitespace, and non-ASCII).
143    ///
144    /// # Errors
145    ///
146    /// Returns a [`RefParseError`] variant that names the specific rejection
147    /// category. Tier 1+ callers should map any [`RefParseError`] to
148    /// [`ErrorCode::InvalidRef`] when surfacing to MCP / CLI.
149    pub fn parse(s: &str) -> Result<Self, RefParseError> {
150        let stripped = parse::strip_doi_scheme(s);
151        parse::validate_doi(stripped)?;
152        Ok(Doi(stripped.to_string()))
153    }
154}
155
156impl ArxivId {
157    /// Returns the arXiv id as a string slice.
158    pub fn as_str(&self) -> &str {
159        &self.0
160    }
161
162    /// Parses and validates an arXiv id per `docs/SECURITY.md` §1.1 and the
163    /// pattern published in `docs/MCP_TOOLS.md`.
164    ///
165    /// Accepts:
166    /// - New-style ids: `YYMM.NNNNN[vN]` where the date block is 4 digits, the
167    ///   sequence number is 4–5 digits, and the optional version `vN` is one
168    ///   or more digits. Examples: `2401.12345`, `2401.12345v2`.
169    /// - Old-style ids: `subject-class/YYMMNNN[vN]` where the subject class
170    ///   is a lowercase token (with optional internal hyphens and an
171    ///   optional `.XX` two-uppercase-letter group), and the numeric body
172    ///   is exactly 7 digits with optional `vN`. Examples:
173    ///   `cond-mat/9501001`, `astro-ph.CO/0703123v2`.
174    /// - The `arxiv:` / `arXiv:` URI scheme prefix; it is stripped before
175    ///   validation.
176    ///
177    /// Rejects:
178    /// - Inputs that match neither the new-style nor old-style shape.
179    /// - Inputs containing characters outside the per-shape charset
180    ///   (control chars, whitespace, non-ASCII).
181    /// - Empty input.
182    ///
183    /// # Errors
184    ///
185    /// Returns a [`RefParseError`] variant that names the specific rejection
186    /// category.
187    pub fn parse(s: &str) -> Result<Self, RefParseError> {
188        let stripped = parse::strip_arxiv_scheme(s);
189        parse::validate_arxiv(stripped)?;
190        Ok(ArxivId(stripped.to_string()))
191    }
192}
193
194impl Ref {
195    /// Parses a string into a [`Ref`], auto-detecting DOI vs arXiv.
196    ///
197    /// Detection rules:
198    /// 1. If the input begins with the case-insensitive `doi:` scheme, the
199    ///    remainder is parsed as a DOI.
200    /// 2. If the input begins with the `arxiv:` or `arXiv:` scheme, the
201    ///    remainder is parsed as an arXiv id.
202    /// 3. Otherwise, if the input starts with `10.` it is treated as a bare
203    ///    DOI; this matches the heuristic in `docs/SAFEKEY.md` §4 (Julia
204    ///    reference) and is stable because DOIs always begin `10.`.
205    /// 4. Failing all of the above, parsing falls back to arXiv.
206    ///
207    /// The returned [`Ref`] never carries the URI scheme — `as_str()` on the
208    /// inner `Doi` / `ArxivId` is always the bare identifier.
209    ///
210    /// # Errors
211    ///
212    /// Returns a [`RefParseError`] from the underlying [`Doi::parse`] or
213    /// [`ArxivId::parse`] call. When the input has an explicit scheme
214    /// (`doi:` / `arxiv:`), the matching parser is dispatched and its error
215    /// surfaces directly. When the input is bare and ambiguous, the
216    /// heuristic in rule 3/4 selects the parser; an unparsable bare input
217    /// surfaces the arXiv parser's error (a non-`10.` ref that also fails
218    /// arXiv validation is never a valid DOI).
219    pub fn parse(s: &str) -> Result<Self, RefParseError> {
220        // Reject empty up front so all three parsers see a meaningful slice;
221        // without this, `strip_*_scheme("")` returns "" and we'd get a
222        // confusing "missing 10. prefix" error for empty input.
223        if s.is_empty() {
224            return Err(RefParseError::Empty);
225        }
226
227        if parse::has_doi_scheme(s) {
228            return Doi::parse(s).map(Ref::Doi);
229        }
230        if parse::has_arxiv_scheme(s) {
231            return ArxivId::parse(s).map(Ref::Arxiv);
232        }
233        if s.starts_with("10.") {
234            return Doi::parse(s).map(Ref::Doi);
235        }
236        ArxivId::parse(s).map(Ref::Arxiv)
237    }
238}
239
240// ---------------------------------------------------------------------------
241// Parser internals
242// ---------------------------------------------------------------------------
243
244mod parse {
245    use super::{RefParseError, DOI_SUFFIX_MAX_LEN};
246
247    /// Case-insensitive `doi:` prefix detector. Matches both `doi:` and
248    /// `DOI:` (and any case mix); the spec in `docs/SAFEKEY.md` §3 only
249    /// names the lowercase form, but the field convention is to be lenient
250    /// in what we accept (the scheme is dropped at the boundary anyway).
251    pub(crate) fn has_doi_scheme(s: &str) -> bool {
252        s.len() >= 4 && s.is_char_boundary(4) && s[..4].eq_ignore_ascii_case("doi:")
253    }
254
255    /// Case-insensitive `arxiv:` prefix detector. Accepts `arxiv:`,
256    /// `arXiv:` (the form used in `docs/MCP_TOOLS.md`), and any other case
257    /// mix.
258    pub(crate) fn has_arxiv_scheme(s: &str) -> bool {
259        s.len() >= 6 && s.is_char_boundary(6) && s[..6].eq_ignore_ascii_case("arxiv:")
260    }
261
262    pub(crate) fn strip_doi_scheme(s: &str) -> &str {
263        if has_doi_scheme(s) {
264            &s[4..]
265        } else {
266            s
267        }
268    }
269
270    pub(crate) fn strip_arxiv_scheme(s: &str) -> &str {
271        if has_arxiv_scheme(s) {
272            &s[6..]
273        } else {
274            s
275        }
276    }
277
278    /// DOI suffix charset per `docs/SECURITY.md` §1.1:
279    /// `[A-Za-z0-9._/():-]`. The forward slash is permitted inside the
280    /// suffix (e.g. `10.1016/...`); the registrant separator is the
281    /// *first* `/` and the suffix is everything after it.
282    ///
283    /// `:` is permitted because two large real publisher DOI families use
284    /// it in the suffix — legacy Kluwer/Springer (`10.1023/A:NNNNNNNNNN`)
285    /// and EDP Sciences / Journal de Physique
286    /// (`10.1051/jphys:NNNNNNNNNNNNNNNNN`). It adds no path-traversal
287    /// capability: traversal requires composing `/` and `.` into `../`,
288    /// and both characters are already in the suffix charset. In addition,
289    /// `safekey` independently escapes every char outside `[A-Za-z0-9._-]`
290    /// before any filesystem use, so `:` never reaches a path literally.
291    /// See ADR-0026 and `docs/SECURITY.md` §1.1.
292    fn is_doi_suffix_char(c: char) -> bool {
293        matches!(c,
294            'A'..='Z' | 'a'..='z' | '0'..='9'
295            | '.' | '_' | '/' | '(' | ')' | '-' | ':'
296        )
297    }
298
299    pub(crate) fn validate_doi(s: &str) -> Result<(), RefParseError> {
300        if s.is_empty() {
301            return Err(RefParseError::Empty);
302        }
303
304        // Must begin with literal "10."; the registrant is 4–9 digits up
305        // to the first '/'. After that, everything is suffix.
306        let rest = s
307            .strip_prefix("10.")
308            .ok_or(RefParseError::MissingDoiPrefix)?;
309        let slash_idx = rest
310            .find('/')
311            .ok_or(RefParseError::MissingDoiSuffixSeparator)?;
312        let registrant = &rest[..slash_idx];
313        let suffix = &rest[slash_idx + 1..];
314
315        // Registrant: 4–9 ASCII digits.
316        if registrant.len() < 4
317            || registrant.len() > 9
318            || !registrant.chars().all(|c| c.is_ascii_digit())
319        {
320            return Err(RefParseError::InvalidDoiRegistrant);
321        }
322
323        // Suffix: non-empty, charset-restricted, length-bounded.
324        if suffix.is_empty() {
325            return Err(RefParseError::EmptyDoiSuffix);
326        }
327        if suffix.len() > DOI_SUFFIX_MAX_LEN {
328            return Err(RefParseError::DoiSuffixTooLong {
329                len: suffix.len(),
330                max: DOI_SUFFIX_MAX_LEN,
331            });
332        }
333        if let Some(bad) = suffix.chars().find(|c| !is_doi_suffix_char(*c)) {
334            return Err(RefParseError::InvalidDoiSuffixChar { ch: bad });
335        }
336        Ok(())
337    }
338
339    /// Validates an arXiv id (with the `arxiv:` / `arXiv:` scheme already
340    /// stripped). Tries the new-style shape first, then the old-style.
341    pub(crate) fn validate_arxiv(s: &str) -> Result<(), RefParseError> {
342        if s.is_empty() {
343            return Err(RefParseError::Empty);
344        }
345        if validate_arxiv_new(s).is_ok() || validate_arxiv_old(s).is_ok() {
346            return Ok(());
347        }
348        Err(RefParseError::InvalidArxivShape)
349    }
350
351    /// New-style arXiv id: `YYMM.NNNNN[vN]`.
352    fn validate_arxiv_new(s: &str) -> Result<(), ()> {
353        let dot_idx = s.find('.').ok_or(())?;
354        let head = &s[..dot_idx];
355        let tail = &s[dot_idx + 1..];
356
357        // Head: exactly 4 ASCII digits.
358        if head.len() != 4 || !head.chars().all(|c| c.is_ascii_digit()) {
359            return Err(());
360        }
361
362        // Tail: 4–5 digits, then optional `v` followed by ≥1 digits.
363        let bytes = tail.as_bytes();
364        let mut i = 0;
365        while i < bytes.len() && bytes[i].is_ascii_digit() {
366            i += 1;
367        }
368        let digits_len = i;
369        if !(4..=5).contains(&digits_len) {
370            return Err(());
371        }
372        if i == bytes.len() {
373            return Ok(());
374        }
375        // Optional version suffix.
376        if bytes[i] != b'v' {
377            return Err(());
378        }
379        i += 1;
380        let v_start = i;
381        while i < bytes.len() && bytes[i].is_ascii_digit() {
382            i += 1;
383        }
384        if i == v_start || i != bytes.len() {
385            return Err(());
386        }
387        Ok(())
388    }
389
390    /// Old-style arXiv id: `subject-class/YYMMNNN[vN]`.
391    /// Subject class: `[a-z]([a-z-]*[a-z])?(\.[A-Z]{2})?`.
392    fn validate_arxiv_old(s: &str) -> Result<(), ()> {
393        let slash_idx = s.find('/').ok_or(())?;
394        let class = &s[..slash_idx];
395        let id = &s[slash_idx + 1..];
396
397        // Class: starts with [a-z], body is [a-z-], optional `.XX` (two
398        // ASCII upper).
399        let (core_class, dot_part) = match class.find('.') {
400            Some(d) => (&class[..d], Some(&class[d + 1..])),
401            None => (class, None),
402        };
403        if core_class.is_empty()
404            || !core_class
405                .chars()
406                .all(|c| c.is_ascii_lowercase() || c == '-')
407            || core_class.starts_with('-')
408            || core_class.ends_with('-')
409        {
410            return Err(());
411        }
412        if let Some(dp) = dot_part {
413            if dp.len() != 2 || !dp.chars().all(|c| c.is_ascii_uppercase()) {
414                return Err(());
415            }
416        }
417
418        // Id: 7 digits, optional `vN`.
419        let bytes = id.as_bytes();
420        let mut i = 0;
421        while i < bytes.len() && bytes[i].is_ascii_digit() {
422            i += 1;
423        }
424        if i != 7 {
425            return Err(());
426        }
427        if i == bytes.len() {
428            return Ok(());
429        }
430        if bytes[i] != b'v' {
431            return Err(());
432        }
433        i += 1;
434        let v_start = i;
435        while i < bytes.len() && bytes[i].is_ascii_digit() {
436            i += 1;
437        }
438        if i == v_start || i != bytes.len() {
439            return Err(());
440        }
441        Ok(())
442    }
443}
444
445// ---------------------------------------------------------------------------
446// RefParseError
447// ---------------------------------------------------------------------------
448
449/// Reasons a `Doi::parse` / `ArxivId::parse` / `Ref::parse` call can fail.
450///
451/// Each variant maps to one rejection category in `docs/SECURITY.md` §1.1.
452/// All variants funnel to [`ErrorCode::InvalidRef`] when surfacing to MCP /
453/// CLI; the granular shape is preserved for tests and for future log
454/// breadcrumbs. The `From<RefParseError> for ErrorCode` impl below makes
455/// `?` propagation collapse to `INVALID_REF` automatically, satisfying
456/// `docs/PUBLIC_API.md` §4.
457///
458/// Marked `#[non_exhaustive]` so adding new categories is a non-breaking
459/// change. Pattern-match with a wildcard arm.
460#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
461#[non_exhaustive]
462pub enum RefParseError {
463    /// Input was empty.
464    #[error("empty input")]
465    Empty,
466    /// Input did not begin with the required `10.` literal (after any
467    /// scheme strip).
468    #[error("DOI must begin with '10.'")]
469    MissingDoiPrefix,
470    /// Input started with `10.` but had no `/` separator between
471    /// registrant and suffix.
472    #[error("DOI must contain '/' between registrant and suffix")]
473    MissingDoiSuffixSeparator,
474    /// Registrant was not 4–9 ASCII digits.
475    #[error("DOI registrant must be 4–9 ASCII digits")]
476    InvalidDoiRegistrant,
477    /// DOI suffix was empty.
478    #[error("DOI suffix is empty")]
479    EmptyDoiSuffix,
480    /// DOI suffix exceeded `DOI_SUFFIX_MAX_LEN` bytes.
481    #[error("DOI suffix is {len} bytes; maximum is {max}")]
482    DoiSuffixTooLong {
483        /// Observed suffix length, in bytes.
484        len: usize,
485        /// Hard upper bound (always [`DOI_SUFFIX_MAX_LEN`]).
486        max: usize,
487    },
488    /// DOI suffix contained a character outside `[A-Za-z0-9._/():-]`.
489    #[error("DOI suffix contains invalid character {ch:?}")]
490    InvalidDoiSuffixChar {
491        /// The first offending character.
492        ch: char,
493    },
494    /// Input matched neither the new-style nor old-style arXiv shape.
495    #[error("input does not match any known arXiv id shape")]
496    InvalidArxivShape,
497}
498
499impl From<RefParseError> for ErrorCode {
500    fn from(_: RefParseError) -> Self {
501        // All parse failures collapse to INVALID_REF at the public boundary,
502        // matching `docs/PUBLIC_API.md` §4 and `docs/SECURITY.md` §1.1.
503        ErrorCode::InvalidRef
504    }
505}
506
507// ---------------------------------------------------------------------------
508// Safekey
509// ---------------------------------------------------------------------------
510
511/// A filesystem-safe key derived deterministically from a `Ref`.
512///
513/// See `docs/SAFEKEY.md` for the full algorithm and reference test vectors.
514/// Construct via `Ref::safekey()` (Phase 1+); inner field is `pub(crate)`.
515///
516/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"doi_10.1234_example"`.
517#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
518#[serde(transparent)]
519pub struct Safekey(pub(crate) String);
520
521impl Safekey {
522    /// Returns the safekey as a string slice.
523    pub fn as_str(&self) -> &str {
524        &self.0
525    }
526}
527
528impl Ref {
529    /// Returns the bare identifier string usable as a provenance `ref` field.
530    ///
531    /// Equivalent to `Doi::as_str` / `ArxivId::as_str` dispatched on the
532    /// variant — the URI scheme (`doi:` / `arxiv:`) is never present in the
533    /// inner identifiers (it is stripped at parse time), so the result is
534    /// always the bare DOI or arXiv id. Used by the CLI / MCP orchestrators
535    /// to populate the `ref` column of provenance log rows
536    /// (`docs/PROVENANCE_LOG.md` §3) without re-matching the variant.
537    pub fn as_input_str(&self) -> &str {
538        match self {
539            Ref::Doi(d) => d.as_str(),
540            Ref::Arxiv(a) => a.as_str(),
541        }
542    }
543
544    /// Derives a deterministic, filesystem-safe key from this reference.
545    ///
546    /// The algorithm is the NORMATIVE binding spec in `docs/SAFEKEY.md` §3.
547    /// Both Rust and Julia implementations MUST produce bit-identical output
548    /// for every entry in `tests/fixtures/safekey/vectors.json`.
549    ///
550    /// # Algorithm summary
551    ///
552    /// 1. Prefix with `doi_` or `arxiv_` (per variant).
553    /// 2. Replace any character outside `[A-Za-z0-9._-]` with `_`.
554    /// 3. Collapse consecutive `_` runs to a single `_`.
555    /// 4. Trim leading/trailing `_`.
556    /// 5. If the result exceeds 192 bytes, take the first 192 bytes plus
557    ///    `_` plus the first 8 hex chars of `SHA-256(raw)` (where `raw` is
558    ///    the step-1 output, before escaping).
559    ///
560    /// The bound on `as_str()` after step 4 is pure ASCII (steps 1-3 produce
561    /// only ASCII bytes), so the byte-slice in step 5 cannot split a
562    /// multibyte char.
563    pub fn safekey(&self) -> Safekey {
564        // Step 0: prefix per variant. Doi/ArxivId hold the bare identifier
565        // (no `doi:` / `arxiv:` URI scheme — that is stripped by Ref::parse,
566        // not relevant here).
567        let raw = match self {
568            Ref::Doi(d) => format!("doi_{}", d.as_str()),
569            Ref::Arxiv(a) => format!("arxiv_{}", a.as_str()),
570        };
571
572        // Step 1: replace unsafe chars with '_'. Non-ASCII chars (emitted by
573        // String::chars() as full Unicode code points) all hit the wildcard
574        // arm and become a single '_'.
575        let escaped: String = raw
576            .chars()
577            .map(|c| match c {
578                'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' => c,
579                _ => '_',
580            })
581            .collect();
582
583        // Step 2: collapse consecutive '_' runs to a single '_'.
584        let mut collapsed = String::with_capacity(escaped.len());
585        let mut last_was_underscore = false;
586        for c in escaped.chars() {
587            if c == '_' {
588                if !last_was_underscore {
589                    collapsed.push('_');
590                }
591                last_was_underscore = true;
592            } else {
593                collapsed.push(c);
594                last_was_underscore = false;
595            }
596        }
597
598        // Step 3: trim leading/trailing '_'.
599        let trimmed = collapsed.trim_matches('_');
600
601        // Step 4: length-bound. After steps 1-3 `trimmed` is pure ASCII, so
602        // `len()` (bytes) == char count and `&trimmed[..192]` is char-safe.
603        let key = if trimmed.len() > 192 {
604            let digest = sha2::Sha256::digest(raw.as_bytes());
605            let hash = hex::encode(&digest[..4]);
606            format!("{}_{}", &trimmed[..192], hash)
607        } else {
608            trimmed.to_string()
609        };
610
611        Safekey(key)
612    }
613}
614
615// ---------------------------------------------------------------------------
616// ErrorCode
617// ---------------------------------------------------------------------------
618
619/// The closed set of error codes doiget surfaces.
620///
621/// See `docs/ERRORS.md` for the persona × code matrix.
622///
623/// Marked `#[non_exhaustive]` so adding new variants is a minor (not major)
624/// version bump.
625#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
626#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
627#[non_exhaustive]
628pub enum ErrorCode {
629    /// DOI / arXiv id failed validation.
630    InvalidRef,
631    /// Tier 1 sources reported no OA URL.
632    NoOaAvailable,
633    /// Internal rate cap or upstream 429.
634    RateLimited,
635    /// Transport / DNS / TLS failure.
636    NetworkError,
637    /// Filesystem write failed.
638    StoreError,
639    /// Provenance log write failed; the fetch was aborted.
640    LogError,
641    /// Source not granted by the runtime `CapabilityProfile`.
642    CapabilityDenied,
643    /// Per-request timeout exceeded.
644    FetchTimeout,
645    /// Store entry's `schema_version` is ahead of this build.
646    SchemaTooNew,
647    /// Could not acquire `flock` within 5 s.
648    LockTimeout,
649    /// Bug — please open an issue.
650    InternalError,
651    /// Feature is spec'd but not yet wired in this Phase. Distinct from
652    /// [`Self::InternalError`] (which signals a bug) and
653    /// [`Self::CapabilityDenied`] (which signals a runtime config gate).
654    /// Returned by stubs that exist to pin the public surface ahead of
655    /// orchestrator implementation, so an agent can react with "wait for
656    /// next minor release" rather than "report a bug" or "tweak my
657    /// capability profile". Wire form: `"NOT_IMPLEMENTED"`.
658    NotImplemented,
659}
660
661impl ErrorCode {
662    /// The `SCREAMING_SNAKE_CASE` wire token for this code, as a
663    /// `&'static str`. Identical to the serde representation but
664    /// allocation-free and usable where a borrowed string with a
665    /// `'static` lifetime is required — notably the provenance log
666    /// `error_code` column (`docs/PROVENANCE_LOG.md` §3), so a failure
667    /// row records the *actual* mapped code instead of a hand-written
668    /// literal that can drift from this enum (issue #118).
669    #[must_use]
670    pub fn as_wire(&self) -> &'static str {
671        match self {
672            ErrorCode::InvalidRef => "INVALID_REF",
673            ErrorCode::NoOaAvailable => "NO_OA_AVAILABLE",
674            ErrorCode::RateLimited => "RATE_LIMITED",
675            ErrorCode::NetworkError => "NETWORK_ERROR",
676            ErrorCode::StoreError => "STORE_ERROR",
677            ErrorCode::LogError => "LOG_ERROR",
678            ErrorCode::CapabilityDenied => "CAPABILITY_DENIED",
679            ErrorCode::FetchTimeout => "FETCH_TIMEOUT",
680            ErrorCode::SchemaTooNew => "SCHEMA_TOO_NEW",
681            ErrorCode::LockTimeout => "LOCK_TIMEOUT",
682            ErrorCode::InternalError => "INTERNAL_ERROR",
683            ErrorCode::NotImplemented => "NOT_IMPLEMENTED",
684        }
685    }
686}
687
688// ---------------------------------------------------------------------------
689// DenialReason / DenialContext (ADR-0023)
690// ---------------------------------------------------------------------------
691
692/// Closed-set reasons a denial-class error envelope can carry on its
693/// optional `denial_context.reason` field.
694///
695/// Wire form (JSON / MCP) is `snake_case` — e.g. `"redirect_not_in_allowlist"`.
696/// The set is **closed** per ADR-0023 §2: adding a new variant is a minor
697/// semver bump; renaming or repurposing one is a breaking change. Mirrors
698/// the stability rule that already governs [`ErrorCode`].
699///
700/// See [`DenialContext`] for the surrounding struct, `docs/ERRORS.md` §3.1
701/// for the wire surface, and `docs/PUBLIC_API.md` §8 for the
702/// semver-locked surface contract.
703#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
704#[serde(rename_all = "snake_case")]
705pub enum DenialReason {
706    /// Redirect target host did not match the source's allowlist
707    /// (`HttpError::RedirectDenied`).
708    RedirectNotInAllowlist,
709    /// Redirect target had a non-HTTPS scheme (`HttpError::InsecureRedirect`).
710    InsecureScheme,
711    /// Source produced a URL whose host is on a future blocklist.
712    ///
713    /// Reserved — no producer wired yet. Will be emitted by the future
714    /// per-source URL host-blocklist guard once that component lands
715    /// (post-Phase-1 supply-chain hardening; see
716    /// `docs/REDIRECT_ALLOWLIST.md` §4 for the staging plan).
717    HostInBlockList,
718    /// Body exceeded [`PDF_MAX_BYTES`] (`HttpError::OversizedBody`).
719    SizeCapExceeded,
720    /// Store entry's `schema_version` is ahead of this binary.
721    ///
722    /// Reserved — no producer wired yet. Will be emitted by the
723    /// `FsStore` schema-rejection path once the read-side bump check
724    /// lands (it currently only writes the current `SCHEMA_VERSION`).
725    SchemaDrift,
726    /// Source not in the runtime [`CapabilityProfile`]
727    /// (`FetchError::NotEligible`).
728    CapabilityNotGranted,
729    /// Rate limiter rejected the call inside the current window.
730    ///
731    /// Reserved — no producer wired yet. Will be emitted by
732    /// [`RateLimiter`](crate::rate_limiter::RateLimiter) once the
733    /// limiter surfaces structured denials (Phase 2+; today the
734    /// limiter only sleeps to enforce the window).
735    RateLimitWindow,
736    /// SSRF guard rejected a private / link-local / cloud-metadata address.
737    ///
738    /// Reserved — no producer wired yet. Will be emitted by the
739    /// future SSRF pre-flight check (post-Phase-1 supply-chain
740    /// hardening; the workspace currently relies on rustls + the
741    /// HTTPS-only redirect policy to keep the attack surface small).
742    SsrfPrivateAddress,
743    /// Response Content-Type / magic-byte mismatch (`HttpError::NotAPdf`).
744    ContentTypeMismatch,
745}
746
/// Structured, machine-parseable companion to `error.message` for
/// recoverable denials.
///
/// The envelope field carrying this struct is **optional and additive** —
/// every previously-shipped `{code, message}` envelope remains valid, and
/// agents that ignore this struct continue to work. When present, it
/// carries the concrete parameters an LLM agent can use to plan a recovery
/// (e.g. "the redirect to `evil.example.com` was denied because it is not
/// in the crossref allowlist") without text-mining `error.message`.
///
/// ## Wire shape
///
/// `#[serde(deny_unknown_fields)]`: unknown fields are rejected on
/// deserialize, so forward-compatible field additions on the wire are
/// forbidden by design — adding a field to this struct is a **breaking**
/// change. This is why the type is **not** `#[non_exhaustive]`
/// (per `docs/PUBLIC_API.md` §8): the Rust-side rule (struct construction
/// outside the crate) and the wire-side rule (no extension) must agree.
///
/// All fields except `reason` are optional. Producers populate the fields
/// relevant to the reason and leave the rest at `None`; consumers MUST
/// tolerate any subset of fields being present. Optional fields are
/// skipped on serialize when `None` and accepted as missing on deserialize
/// via `#[serde(default, skip_serializing_if = "Option::is_none")]`.
///
/// [`Self::expected`] is `Option<Vec<String>>` rather than `Vec<String>`
/// so the producer can distinguish "this reason has no allowlist channel"
/// (`None` → field absent on the wire) from "this is the explicit list of
/// acceptable values, possibly empty" (`Some(vec![])` → `"expected":[]` on
/// the wire). The previous `Vec<String>` shape collapsed both states
/// into "field omitted", which an LLM agent could not safely disambiguate.
///
/// Mapping table: see ADR-0023 §4, plus the
/// `From<&HttpError> for Option<DenialContext>` and
/// `From<&FetchError> for Option<DenialContext>` impls in
/// [`crate::http`] / [`crate::source`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DenialContext {
    /// Closed-enum reason code; the only required field.
    pub reason: DenialReason,
    /// Resolver source key (e.g. `"crossref"`) when one is in scope.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Concrete value the producer attempted (host, path, hex magic bytes,
    /// scheme prefix). Shape is reason-specific; consumers MUST treat it
    /// as opaque text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attempted: Option<String>,
    /// Allowlist entries / acceptable values. `None` means the reason has
    /// no allowlist channel (field absent on the wire); `Some(vec![])`
    /// means an explicit, empty list (`"expected":[]` on the wire). The
    /// inner `Vec<String>` is used even when only one value is meaningful
    /// (e.g. `Some(vec!["%PDF-".into()])`) so the format does not have to
    /// flip when multiple values are acceptable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected: Option<Vec<String>>,
    /// Redirect-chain hop position, 0-indexed. `u8` because the chain is
    /// hard-capped at [`crate::http`]'s `MAX_REDIRECTS` (= 10) and any
    /// larger value indicates a bug.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hop_index: Option<u8>,
    /// Size or rate cap value (e.g. [`PDF_MAX_BYTES`]).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cap: Option<u64>,
    /// Observed value to compare against [`Self::cap`] (e.g. response
    /// bytes when `cap` is the byte cap, or the row's schema_version when
    /// `cap` is the binary's).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub actual: Option<u64>,
}
817
818// ---------------------------------------------------------------------------
// CapabilityProfile and supporting capability types (env-driven source gating)
820// ---------------------------------------------------------------------------
821
/// Marker for the always-on Open Access tier. See `docs/CAPABILITY.md`.
///
/// A unit struct: it carries no data and exists only so the Tier 1 slot of
/// [`CapabilityProfile`] has a type that cannot express "disabled".
#[derive(Debug, Clone, Copy)]
pub struct AlwaysOn;
825
/// Which Tier 2 metadata sources are enabled this session. See `docs/CAPABILITY.md`.
///
/// Each flag is resolved by [`CapabilityProfile::from_env`]: it is `true`
/// only when the named env var is present **and** the `metadata` Cargo
/// feature was compiled in. The derived `Default` leaves every source
/// disabled.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct MetadataAccess {
    /// Phase 4+; enabled by `DOIGET_ENABLE_OPENALEX` (plus the `metadata` feature).
    pub openalex: bool,
    /// Phase 4+; enabled by `DOIGET_ENABLE_S2` (plus the `metadata` feature).
    pub semantic_scholar: bool,
    /// Phase 4+; enabled by `DOIGET_ENABLE_DOAJ` (plus the `metadata` feature).
    pub doaj: bool,
}
837
/// Process-wide rate limits. Hard-coded; not configurable.
///
/// Construct only via [`RateLimits::HARD_CODED`]. The struct fields are
/// `pub(crate)` so downstream code cannot synthesize a `RateLimits` with
/// different values, which would weaken `docs/LEGAL.md` §6 safeguard 8.
/// Read the values through the accessor methods of the same names.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct RateLimits {
    /// Maximum number of fetches allowed in flight at once.
    pub(crate) max_concurrent_fetches: u32,
    /// Maximum fetch attempts per second.
    pub(crate) max_fetches_per_second: f32,
    /// Per-source backoff between consecutive requests, in milliseconds.
    pub(crate) per_source_backoff_ms: u64,
}
850
impl RateLimits {
    /// The single, hard-coded set of rate limits. There is no other public
    /// constructor — see the type-level docs.
    ///
    /// The concurrency and per-second values come from the crate-level
    /// `MAX_CONCURRENT_FETCHES` / `MAX_FETCHES_PER_SECOND` constants; the
    /// per-source backoff has no named constant and is fixed here at
    /// 200 ms.
    pub const HARD_CODED: Self = Self {
        max_concurrent_fetches: MAX_CONCURRENT_FETCHES,
        max_fetches_per_second: MAX_FETCHES_PER_SECOND,
        per_source_backoff_ms: 200,
    };

    /// Maximum number of concurrent fetches in flight.
    pub const fn max_concurrent_fetches(&self) -> u32 {
        self.max_concurrent_fetches
    }

    /// Maximum fetch attempts per second across all sources.
    pub const fn max_fetches_per_second(&self) -> f32 {
        self.max_fetches_per_second
    }

    /// Per-source backoff in milliseconds between consecutive requests.
    pub const fn per_source_backoff_ms(&self) -> u64 {
        self.per_source_backoff_ms
    }
}
875
/// A successful TDM grant.
///
/// Carries the validated API key (`docs/CAPABILITY.md` §1) so that the key
/// flows from the startup capability gate into the source, rather than each
/// TDM source re-reading the env var at fetch time (issue #153 — an env
/// mutation between startup and fetch is otherwise undetectable).
///
/// The `api_key` field exists only when at least one `tdm-*` Cargo feature
/// is compiled in (the `secrecy` dependency is `optional = true` and gated
/// on those features per ADR-0002, so default release binaries contain no
/// TDM code path at all). The struct is `#[non_exhaustive]`; the
/// `tdm-*`-gated `api_key` field is therefore additive, not breaking, for
/// builds that toggle the feature set.
///
/// `docs/CAPABILITY.md` §1 originally specified the type as
/// `Secret<String>`, which is the `secrecy` 0.9 spelling. The workspace
/// pins `secrecy` 0.10, whose equivalent owned-string secret type is
/// `secrecy::SecretString` (`= SecretBox<str>`); CAPABILITY.md §1 has been
/// updated to match the 0.10 API. `Debug` redacts the value.
///
/// Implements `Default` so in-crate test fixtures using
/// `TdmGrant { agree_env_var: ..., ..Default::default() }` keep compiling;
/// the default `api_key` is an empty secret.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct TdmGrant {
    /// The publisher API key, validated present at startup by
    /// [`CapabilityProfile::from_env`]. Wrapped in
    /// `secrecy::SecretString` so `Debug` never prints it; use
    /// `secrecy::ExposeSecret::expose_secret` at the point of use.
    ///
    /// Only present when a `tdm-*` feature is compiled in (see the
    /// type-level docs and ADR-0002).
    #[cfg(any(
        feature = "tdm-elsevier",
        feature = "tdm-aps",
        feature = "tdm-springer"
    ))]
    pub api_key: secrecy::SecretString,
    /// Name of the agreement env var (e.g. `DOIGET_AGREE_TDM_ELSEVIER`)
    /// the user set to acknowledge the publisher's ToS.
    pub agree_env_var: String,
    /// When the agreement env var was first observed at startup (UTC).
    pub agreed_at: chrono::DateTime<chrono::Utc>,
}
920
921impl Default for TdmGrant {
922    fn default() -> Self {
923        Self {
924            #[cfg(any(
925                feature = "tdm-elsevier",
926                feature = "tdm-aps",
927                feature = "tdm-springer"
928            ))]
929            api_key: secrecy::SecretString::from(String::new()),
930            agree_env_var: String::new(),
931            agreed_at: chrono::Utc::now(),
932        }
933    }
934}
935
/// Runtime gate for which sources may be invoked. See `docs/CAPABILITY.md`.
///
/// Marked `#[non_exhaustive]` so adding new capability classes (fields) is
/// non-breaking. Downstream code that destructures this struct must name
/// only the documented fields and end the pattern with `..`.
///
/// **Construction**: external callers use [`CapabilityProfile::from_env()`].
/// Struct-literal construction is blocked outside this crate by
/// `#[non_exhaustive]`; this is intentional — the type's safety guarantees
/// rely on the resolution rules in `from_env`. `Default` is **not yet**
/// implemented; Phase 1 will add it once the field set stabilizes.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CapabilityProfile {
    /// Tier 1 OA sources are always permitted.
    pub oa: AlwaysOn,
    /// Tier 2 metadata access (Phase 4+).
    pub metadata: MetadataAccess,
    /// Tier 3 Elsevier grant. Populated only when the
    /// `DOIGET_AGREE_TDM_ELSEVIER` / `DOIGET_KEY_ELSEVIER` env-var pair is
    /// valid and the `tdm-elsevier` feature is compiled in.
    pub tdm_elsevier: Option<TdmGrant>,
    /// Tier 3 APS grant. Populated only when the `DOIGET_AGREE_TDM_APS` /
    /// `DOIGET_KEY_APS` env-var pair is valid and the `tdm-aps` feature is
    /// compiled in.
    pub tdm_aps: Option<TdmGrant>,
    /// Tier 3 Springer grant. Populated only when the
    /// `DOIGET_AGREE_TDM_SPRINGER` / `DOIGET_KEY_SPRINGER` env-var pair is
    /// valid and the `tdm-springer` feature is compiled in.
    pub tdm_springer: Option<TdmGrant>,
    /// Hard-coded rate limits for this process.
    pub rate_limits: RateLimits,
}
962
/// Errors that can arise during `CapabilityProfile::from_env`.
///
/// Both variants describe a misconfigured Tier 3 (TDM) env-var pair and
/// carry the env var *names* involved, never the key value itself.
#[derive(Debug, thiserror::Error)]
pub enum CapabilityError {
    /// User set the agree env var to `1` but provided no key (the key var
    /// is unset or empty). See `docs/CAPABILITY.md` §2.
    #[error("env {agree_var} is set but {key_var} is missing")]
    AgreedButNoKey {
        /// The agreement env var the user set.
        agree_var: String,
        /// The key env var that should accompany it.
        key_var: String,
    },
    /// Key env var is set but user has not agreed (the agree var is unset
    /// or holds anything other than `1`). See `docs/CAPABILITY.md` §2.
    ///
    /// The variant records only the agreement var, so the message
    /// identifies the key by its paired agreement var name.
    #[error("key for {agree_var} is present but {agree_var} is not set to '1'")]
    KeyButNotAgreed {
        /// The agreement env var the user must set to `1` before the key takes effect.
        agree_var: String,
    },
}
981
982impl CapabilityProfile {
983    /// Read the runtime profile from environment variables.
984    ///
985    /// Implements the resolution algorithm specified in
986    /// [`docs/CAPABILITY.md`](../../../docs/CAPABILITY.md) §2.
987    ///
988    /// # Tier 1 (Open Access)
989    ///
990    /// Always permitted; not gated on any env var or feature.
991    ///
992    /// # Tier 2 (metadata)
993    ///
994    /// Each metadata source becomes available when its env var is set
995    /// (presence-checked, value ignored) **and** the `metadata` Cargo feature
996    /// was compiled in. If the env var is set but the feature is not compiled
997    /// in, a `tracing::warn!` is emitted and the source is left disabled —
998    /// this is not an error so that users can move binaries between machines
999    /// (or switch feature sets between cargo invocations) without breaking
1000    /// startup. See `docs/CAPABILITY.md` §3 for the env var list.
1001    ///
1002    /// # Tier 3 (TDM)
1003    ///
1004    /// For each publisher in `{ELSEVIER, APS, SPRINGER}`, the
1005    /// `DOIGET_AGREE_TDM_<X>` agreement env var is paired with
1006    /// `DOIGET_KEY_<X>`. Resolution rules (per `docs/CAPABILITY.md` §2):
1007    ///
1008    /// - both unset → `tdm_<x> = None` (no error);
1009    /// - `agree == "1"` and key set → `Some(TdmGrant { .. })` (subject to the
1010    ///   feature gate below);
1011    /// - `agree == "1"` and key unset → [`CapabilityError::AgreedButNoKey`];
1012    /// - key set but `agree` unset (or `agree != "1"`) →
1013    ///   [`CapabilityError::KeyButNotAgreed`].
1014    ///
1015    /// When both env vars are set correctly **but** the corresponding
1016    /// `tdm-<x>` Cargo feature is not compiled in, this function emits a
1017    /// `tracing::warn!` and sets the grant to `None` rather than returning an
1018    /// error — same rationale as for the Tier 2 warn-and-skip behavior.
1019    ///
1020    /// # Precondition: tracing subscriber must be installed first
1021    ///
1022    /// Warn breadcrumbs are delivered via `tracing::warn!`. Callers MUST
1023    /// install a `tracing-subscriber` (or equivalent) **before** invoking
1024    /// this function, otherwise warnings are silently dropped. The
1025    /// `doiget-cli` binary does this in `main.rs`.
1026    ///
1027    /// # Errors
1028    ///
1029    /// Returns [`CapabilityError::AgreedButNoKey`] or
1030    /// [`CapabilityError::KeyButNotAgreed`] when the TDM env-var pair for any
1031    /// publisher is misconfigured. See the variant docs for the precise
1032    /// trigger conditions.
1033    ///
1034    /// # Note on `api_key` storage
1035    ///
1036    /// When a `tdm-*` feature is compiled in, [`TdmGrant`] carries the
1037    /// validated key as `secrecy::SecretString` (issue #153). The key is
1038    /// read exactly once here, at startup; TDM sources consume it from the
1039    /// grant and never re-read the env var at fetch time. This makes the
1040    /// grant a true startup attestation — an env mutation between startup
1041    /// and fetch can no longer silently change the credential in flight.
1042    /// See the [`TdmGrant`] doc-comment and `docs/CAPABILITY.md` §1/§2.
1043    pub fn from_env() -> Result<Self, CapabilityError> {
1044        // Issue #153: the validated API key is now threaded through
1045        // `TdmGrant` (as `secrecy::SecretString`, behind the `tdm-*`
1046        // features) by `resolve_tdm_grant` below — sources no longer
1047        // re-read the key env var at fetch time. See the `TdmGrant`
1048        // doc-comment and `docs/CAPABILITY.md` §1/§2.
1049
1050        // -- Tier 2 metadata -------------------------------------------------
1051        let metadata = MetadataAccess {
1052            openalex: resolve_metadata_flag(
1053                "DOIGET_ENABLE_OPENALEX",
1054                "metadata",
1055                cfg!(feature = "metadata"),
1056            ),
1057            semantic_scholar: resolve_metadata_flag(
1058                "DOIGET_ENABLE_S2",
1059                "metadata",
1060                cfg!(feature = "metadata"),
1061            ),
1062            doaj: resolve_metadata_flag(
1063                "DOIGET_ENABLE_DOAJ",
1064                "metadata",
1065                cfg!(feature = "metadata"),
1066            ),
1067        };
1068
1069        // -- Tier 3 TDM grants ----------------------------------------------
1070        let tdm_elsevier = resolve_tdm_grant(
1071            "DOIGET_AGREE_TDM_ELSEVIER",
1072            "DOIGET_KEY_ELSEVIER",
1073            "tdm-elsevier",
1074            cfg!(feature = "tdm-elsevier"),
1075        )?;
1076        let tdm_aps = resolve_tdm_grant(
1077            "DOIGET_AGREE_TDM_APS",
1078            "DOIGET_KEY_APS",
1079            "tdm-aps",
1080            cfg!(feature = "tdm-aps"),
1081        )?;
1082        let tdm_springer = resolve_tdm_grant(
1083            "DOIGET_AGREE_TDM_SPRINGER",
1084            "DOIGET_KEY_SPRINGER",
1085            "tdm-springer",
1086            cfg!(feature = "tdm-springer"),
1087        )?;
1088
1089        Ok(Self {
1090            oa: AlwaysOn,
1091            metadata,
1092            tdm_elsevier,
1093            tdm_aps,
1094            tdm_springer,
1095            rate_limits: RateLimits::HARD_CODED,
1096        })
1097    }
1098}
1099
1100/// Resolve a Tier 2 metadata flag from its env var and compile-in feature.
1101///
1102/// Returns `true` only when both the env var is present and the feature is
1103/// compiled in. When the env var is set without the feature, emits a
1104/// `tracing::warn!` and returns `false` — see [`CapabilityProfile::from_env`]
1105/// for the rationale (binaries may move between hosts / feature sets).
1106fn resolve_metadata_flag(env_var: &str, feature: &str, feature_enabled: bool) -> bool {
1107    let env_set = std::env::var_os(env_var).is_some();
1108    match (env_set, feature_enabled) {
1109        (true, true) => true,
1110        (true, false) => {
1111            tracing::warn!(
1112                env_var,
1113                feature,
1114                "{} is set but feature {} was not compiled in; the source will be unavailable",
1115                env_var,
1116                feature
1117            );
1118            false
1119        }
1120        (false, _) => false,
1121    }
1122}
1123
1124/// Resolve a Tier 3 TDM grant from the `agree`/`key` env-var pair and the
1125/// per-publisher Cargo feature.
1126///
1127/// Implements the rules in `docs/CAPABILITY.md` §2:
1128///
1129/// - both unset → `Ok(None)`.
1130/// - `agree == "1"` and `key` set → `Ok(Some(TdmGrant { .. }))` (when the
1131///   feature is enabled), or warn-and-`Ok(None)` (when the feature is not
1132///   compiled in).
1133/// - `agree == "1"` and `key` unset →
1134///   [`CapabilityError::AgreedButNoKey`].
1135/// - `key` set and `agree` unset OR `agree` set to anything other than `"1"`
1136///   → [`CapabilityError::KeyButNotAgreed`].
1137fn resolve_tdm_grant(
1138    agree_var: &str,
1139    key_var: &str,
1140    feature: &str,
1141    feature_enabled: bool,
1142) -> Result<Option<TdmGrant>, CapabilityError> {
1143    // `agree` is "agreed" iff the value is exactly the literal "1"; any other
1144    // value (including "true", "yes", empty) is treated as not-agreed per
1145    // `docs/CAPABILITY.md` §2.
1146    let agree_raw = std::env::var(agree_var).ok();
1147    let agreed = matches!(agree_raw.as_deref(), Some("1"));
1148    let agree_present = agree_raw.is_some();
1149    // Read the key value once, at startup, so the validated key flows
1150    // through `TdmGrant` and sources never re-read the env (issue #153).
1151    // An empty value is treated as "not set" — an empty API key cannot
1152    // authenticate, and silently constructing a grant around it would
1153    // mask the misconfiguration the AgreedButNoKey rule exists to surface.
1154    let key_value = std::env::var(key_var).ok().filter(|v| !v.is_empty());
1155
1156    match (agreed, agree_present, key_value) {
1157        (true, _, Some(key)) => {
1158            if feature_enabled {
1159                Ok(Some(build_tdm_grant(agree_var, key)))
1160            } else {
1161                // `key` is dropped here; under no-tdm builds it is the only
1162                // consumer of the owned `String`, which is intended.
1163                let _ = key;
1164                tracing::warn!(
1165                    env_var = agree_var,
1166                    feature,
1167                    "{} is set but feature {} was not compiled in; the source will be unavailable",
1168                    agree_var,
1169                    feature
1170                );
1171                Ok(None)
1172            }
1173        }
1174        (true, _, None) => Err(CapabilityError::AgreedButNoKey {
1175            agree_var: agree_var.to_string(),
1176            key_var: key_var.to_string(),
1177        }),
1178        // agree set to non-"1", key also set: KeyButNotAgreed (the key would
1179        // otherwise authorize the source without an explicit agreement).
1180        (false, true, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1181            agree_var: agree_var.to_string(),
1182        }),
1183        // agree unset, key set: KeyButNotAgreed (same rule).
1184        (false, false, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1185            agree_var: agree_var.to_string(),
1186        }),
1187        // agree set to non-"1" and no key: treat as no-grant. The user
1188        // expressed something but did not opt in and provided no credential,
1189        // so silent skip is the safe default (no source enabled).
1190        (false, true, None) => Ok(None),
1191        // Neither env var set: no grant, no error.
1192        (false, false, None) => Ok(None),
1193    }
1194}
1195
1196/// Construct a [`TdmGrant`] from the validated agreement var and key value.
1197///
1198/// Split out so the `tdm-*`-gated `api_key` field is populated in exactly
1199/// one place. When no `tdm-*` feature is compiled in the `key` is consumed
1200/// (dropped) here — the grant is still produced so that startup attestation
1201/// behavior (the warn-and-skip path) does not change shape between feature
1202/// sets.
1203fn build_tdm_grant(agree_var: &str, key: String) -> TdmGrant {
1204    #[cfg(any(
1205        feature = "tdm-elsevier",
1206        feature = "tdm-aps",
1207        feature = "tdm-springer"
1208    ))]
1209    {
1210        TdmGrant {
1211            api_key: secrecy::SecretString::from(key),
1212            agree_env_var: agree_var.to_string(),
1213            agreed_at: chrono::Utc::now(),
1214        }
1215    }
1216    #[cfg(not(any(
1217        feature = "tdm-elsevier",
1218        feature = "tdm-aps",
1219        feature = "tdm-springer"
1220    )))]
1221    {
1222        let _ = key;
1223        TdmGrant {
1224            agree_env_var: agree_var.to_string(),
1225            agreed_at: chrono::Utc::now(),
1226        }
1227    }
1228}
1229
1230// ---------------------------------------------------------------------------
1231// Tests — one smoke test per legally-load-bearing constant. See
1232// `docs/LEGAL.md` §6 safeguard 8 and `docs/PHASES.md` §4. These also keep the
1233// `cargo test --workspace` job from being a false-green during Phase 0.
1234// ---------------------------------------------------------------------------
1235
1236// `expect`/`unwrap` are idiomatic in tests where panics double as assertions.
1237// The workspace lints deny them in production code; relax for the test module
1238// only.
1239#[cfg(test)]
1240#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1241mod tests {
1242    use super::*;
1243
1244    #[test]
1245    fn rate_limits_hard_coded_match_legal_safeguards() {
1246        // docs/LEGAL.md §6 safeguard 8 names these exact values.
1247        assert_eq!(RateLimits::HARD_CODED.max_concurrent_fetches(), 5);
1248        assert!((RateLimits::HARD_CODED.max_fetches_per_second() - 5.0).abs() < f32::EPSILON);
1249        assert_eq!(RateLimits::HARD_CODED.per_source_backoff_ms(), 200);
1250    }
1251
    /// Pins the batch / queue / input-size caps to their documented values
    /// so a change to any constant fails the test suite.
    #[test]
    fn batch_size_caps_match_security_doc() {
        // docs/SECURITY.md §1.4 + docs/MCP_TOOLS.md.
        assert_eq!(MCP_BATCH_MAX_SIZE, 100);
        assert_eq!(MCP_QUEUE_DEPTH_MAX, 100);
        assert_eq!(DOI_SUFFIX_MAX_LEN, 256);
        assert_eq!(MCP_STDIN_EOF_SHUTDOWN_SEC, 5);
        // Slice 2: spec-language alias for MCP_BATCH_MAX_SIZE must
        // numerically agree with the original constant.
        assert_eq!(MAX_BATCH_REFS, MCP_BATCH_MAX_SIZE);
    }
1263
    /// Pins the TOML schema version this build writes.
    #[test]
    fn schema_version_is_pinned_to_1_0() {
        // docs/STORE.md §3 — Phase 0/1 writes 1.0 exactly.
        // A bump to 1.1 (minor, backward-compat additions) requires updating
        // both this test and the cross-tool compat fixtures simultaneously.
        assert_eq!(SCHEMA_VERSION, "1.0");
    }
1271
1272    // -----------------------------------------------------------------
1273    // CapabilityProfile::from_env — Phase 1 resolution algorithm tests.
1274    //
1275    // These tests mutate process-global env state via std::env::set_var /
1276    // remove_var, so each test holds an `EnvGuard` RAII drop guard that
1277    // captures the pre-test value of every env var it touches and restores
1278    // it on drop (even on panic). They also use `#[serial_test::serial]` so
1279    // that no two tests in this module touch env state concurrently — the
1280    // workspace's test runner defaults to multi-threaded.
1281    //
1282    // Spec: docs/CAPABILITY.md §2 (resolution algorithm) and §3 (env var
1283    // reference table).
1284    // -----------------------------------------------------------------
1285
    /// RAII guard that captures the prior value of an env var on construction
    /// and restores it on drop. Use one guard per touched var per test.
    struct EnvGuard {
        /// Name of the environment variable this guard protects.
        var: &'static str,
        /// Value observed before the guard mutated the variable; `None`
        /// means the variable was not set at that point.
        prior: Option<std::ffi::OsString>,
    }
1292
1293    impl EnvGuard {
1294        /// Capture and clear `var`. Use `set` afterwards to install a value.
1295        fn unset(var: &'static str) -> Self {
1296            let prior = std::env::var_os(var);
1297            // SAFETY (env mutation): tests are serialized via
1298            // `#[serial_test::serial]`. `remove_var` is sound when no other
1299            // thread reads or writes the environment concurrently.
1300            std::env::remove_var(var);
1301            EnvGuard { var, prior }
1302        }
1303
1304        /// Capture, then set `var` to `value`.
1305        fn set(var: &'static str, value: &str) -> Self {
1306            let prior = std::env::var_os(var);
1307            std::env::set_var(var, value);
1308            EnvGuard { var, prior }
1309        }
1310    }
1311
1312    impl Drop for EnvGuard {
1313        fn drop(&mut self) {
1314            match &self.prior {
1315                Some(v) => std::env::set_var(self.var, v),
1316                None => std::env::remove_var(self.var),
1317            }
1318        }
1319    }
1320
1321    /// Convenience: unset every Tier 2 / Tier 3 env var the resolution
1322    /// algorithm reads, returning a vector of guards that restore them on
1323    /// drop. Callers can then `EnvGuard::set` individual vars on top.
1324    fn unset_all_capability_env_vars() -> Vec<EnvGuard> {
1325        [
1326            "DOIGET_ENABLE_OPENALEX",
1327            "DOIGET_ENABLE_S2",
1328            "DOIGET_ENABLE_DOAJ",
1329            "DOIGET_AGREE_TDM_ELSEVIER",
1330            "DOIGET_KEY_ELSEVIER",
1331            "DOIGET_AGREE_TDM_APS",
1332            "DOIGET_KEY_APS",
1333            "DOIGET_AGREE_TDM_SPRINGER",
1334            "DOIGET_KEY_SPRINGER",
1335        ]
1336        .iter()
1337        .map(|v| EnvGuard::unset(v))
1338        .collect()
1339    }
1340
1341    #[test]
1342    #[serial_test::serial]
1343    fn from_env_no_env_vars_set_returns_tier_1_only() {
1344        // Rule: with every relevant env var unset, the resolved profile has
1345        // all TDM grants `None` and all metadata flags `false`. Hard-coded
1346        // rate limits still apply. (Replaces the old Phase 0 stub test.)
1347        let _g = unset_all_capability_env_vars();
1348
1349        let p = CapabilityProfile::from_env().expect("clean env never errors");
1350        assert!(p.tdm_elsevier.is_none());
1351        assert!(p.tdm_aps.is_none());
1352        assert!(p.tdm_springer.is_none());
1353        assert!(!p.metadata.openalex);
1354        assert!(!p.metadata.semantic_scholar);
1355        assert!(!p.metadata.doaj);
1356        assert_eq!(p.rate_limits.max_concurrent_fetches(), 5);
1357    }
1358
1359    #[test]
1360    #[serial_test::serial]
1361    fn from_env_no_tdm_returns_tier_1_profile() {
1362        // Rule (CAPABILITY.md §2): with every TDM env var unset, all
1363        // `tdm_*` fields are `None` and no error is produced.
1364        let _g = unset_all_capability_env_vars();
1365
1366        let p = CapabilityProfile::from_env().expect("no TDM env -> Ok");
1367        assert!(p.tdm_elsevier.is_none());
1368        assert!(p.tdm_aps.is_none());
1369        assert!(p.tdm_springer.is_none());
1370    }
1371
1372    #[test]
1373    #[serial_test::serial]
1374    fn from_env_agreed_but_no_key_errs() {
1375        // Rule (CAPABILITY.md §2): agree=1 + key unset -> AgreedButNoKey.
1376        let _g = unset_all_capability_env_vars();
1377        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1378
1379        let result = CapabilityProfile::from_env();
1380        match result {
1381            Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
1382                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1383                assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
1384            }
1385            other => panic!("expected AgreedButNoKey, got {:?}", other),
1386        }
1387    }
1388
1389    #[test]
1390    #[serial_test::serial]
1391    fn from_env_agreed_but_empty_key_errs() {
1392        // Security-adjacent (PR #161 review): an *empty* key string is
1393        // treated as "not set" by `resolve_tdm_grant`. With agree=1 and
1394        // DOIGET_KEY_ELSEVIER="" the misconfiguration must surface as
1395        // AgreedButNoKey, not silently build a grant around an empty
1396        // secret that could never authenticate.
1397        let _g = unset_all_capability_env_vars();
1398        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1399        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
1400
1401        let result = CapabilityProfile::from_env();
1402        match result {
1403            Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
1404                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1405                assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
1406            }
1407            other => panic!("expected AgreedButNoKey for empty key, got {:?}", other),
1408        }
1409    }
1410
1411    #[test]
1412    #[serial_test::serial]
1413    fn from_env_empty_key_without_agree_is_no_grant() {
1414        // Security-adjacent (PR #161 review): an empty key with the
1415        // agree var unset is indistinguishable from "no key at all".
1416        // It must resolve to Ok(None) (no grant, no error) — an empty
1417        // string must NOT trip the KeyButNotAgreed leaked-credential
1418        // rule, since there is no credential.
1419        let _g = unset_all_capability_env_vars();
1420        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
1421
1422        let p = CapabilityProfile::from_env()
1423            .expect("empty key + agree unset must be Ok(None), not an error");
1424        assert!(
1425            p.tdm_elsevier.is_none(),
1426            "empty DOIGET_KEY_ELSEVIER with no agree var must yield no grant"
1427        );
1428        assert!(p.tdm_aps.is_none());
1429        assert!(p.tdm_springer.is_none());
1430    }
1431
1432    #[test]
1433    #[serial_test::serial]
1434    fn from_env_key_but_not_agreed_errs() {
1435        // Rule (CAPABILITY.md §2): key set + agree unset -> KeyButNotAgreed.
1436        // A leaked DOIGET_KEY_ELSEVIER must not silently enable a source.
1437        let _g = unset_all_capability_env_vars();
1438        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1439
1440        let result = CapabilityProfile::from_env();
1441        match result {
1442            Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
1443                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1444            }
1445            other => panic!("expected KeyButNotAgreed, got {:?}", other),
1446        }
1447    }
1448
1449    #[test]
1450    #[serial_test::serial]
1451    fn from_env_agree_not_one_errs() {
1452        // Rule (CAPABILITY.md §2): the agree var must be exactly "1". Any
1453        // other value (here: "true") is treated as not-agreed; combined
1454        // with a key set, that triggers KeyButNotAgreed.
1455        let _g = unset_all_capability_env_vars();
1456        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "true");
1457        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1458
1459        let result = CapabilityProfile::from_env();
1460        match result {
1461            Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
1462                assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
1463            }
1464            other => panic!("expected KeyButNotAgreed, got {:?}", other),
1465        }
1466    }
1467
1468    #[test]
1469    #[serial_test::serial]
1470    fn from_env_both_set_correctly_returns_grant() {
1471        // Rule (CAPABILITY.md §2): agree=1 + key set -> Some(TdmGrant) when
1472        // the corresponding feature is compiled in; else None (warn-and-skip).
1473        // The feature gate for elsevier is `tdm-elsevier`; this test asserts
1474        // both branches via `cfg!`.
1475        let _g = unset_all_capability_env_vars();
1476        let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
1477        let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
1478
1479        let p = CapabilityProfile::from_env().expect("agree=1 + key -> Ok");
1480
1481        if cfg!(feature = "tdm-elsevier") {
1482            let grant = p
1483                .tdm_elsevier
1484                .as_ref()
1485                .expect("feature tdm-elsevier compiled in -> Some(TdmGrant)");
1486            assert_eq!(grant.agree_env_var, "DOIGET_AGREE_TDM_ELSEVIER");
1487            // Issue #153 / PR #161 review: prove the key was actually
1488            // threaded into TdmGrant::api_key at startup (not just that
1489            // the agree var was recorded). The field is cfg-gated to
1490            // the same `tdm-*` set as the assertion below, so gate the
1491            // check identically.
1492            #[cfg(any(
1493                feature = "tdm-elsevier",
1494                feature = "tdm-aps",
1495                feature = "tdm-springer"
1496            ))]
1497            {
1498                use secrecy::ExposeSecret as _;
1499                assert_eq!(
1500                    grant.api_key.expose_secret(),
1501                    "sk-test",
1502                    "the DOIGET_KEY_ELSEVIER value must be threaded into \
1503                     TdmGrant::api_key (issue #153)"
1504                );
1505            }
1506        } else {
1507            assert!(
1508                p.tdm_elsevier.is_none(),
1509                "feature tdm-elsevier NOT compiled in -> None (warn-and-skip)"
1510            );
1511        }
1512    }
1513
1514    #[test]
1515    #[serial_test::serial]
1516    fn from_env_metadata_env_warns_without_feature() {
1517        // Rule (CAPABILITY.md §2): metadata env var without the `metadata`
1518        // feature -> source disabled (warn-and-skip, not an error).
1519        // We don't capture the tracing warn here; we just assert the field
1520        // is `false` when the feature is absent and `true` when present.
1521        let _g = unset_all_capability_env_vars();
1522        let _enable = EnvGuard::set("DOIGET_ENABLE_OPENALEX", "1");
1523
1524        let p = CapabilityProfile::from_env().expect("metadata env never errors");
1525
1526        if cfg!(feature = "metadata") {
1527            assert!(p.metadata.openalex);
1528        } else {
1529            assert!(!p.metadata.openalex);
1530        }
1531    }
1532
    // -----------------------------------------------------------------
    // Safekey reference vectors (docs/SAFEKEY.md §3, NORMATIVE).
    //
    // The vectors.json file is the binding cross-tool contract with
    // BiblioFetch.jl: every entry MUST round-trip identically through
    // both implementations. Phase 0 originally shipped 13 entries while
    // the full 100-entry set was gated on the BiblioFetch.jl pre-flight
    // (ADR-0007); the count guard in the vector test below now pins
    // exactly 100 entries.
    //
    // The vector test was written while `Ref::parse` was still concurrent
    // W3-A work, so it branches on the input prefix (`doi:` / `arxiv:`)
    // and constructs the variant directly via the in-crate `pub(crate)`
    // tuple constructor instead of going through the parser.
    // -----------------------------------------------------------------
1547
1548    #[derive(Deserialize)]
1549    struct SafekeyVector {
1550        input: String,
1551        expected: String,
1552    }
1553
1554    #[derive(Deserialize)]
1555    struct SafekeyVectorFile {
1556        vectors: Vec<SafekeyVector>,
1557    }
1558
1559    /// In-crate test helper: build a `Ref` from the user-facing form used
1560    /// in the vectors file, by stripping the `doi:` / `arxiv:` URI scheme
1561    /// and wrapping the remainder. This bypasses validation; it is fine
1562    /// here because the vectors are hand-curated and the test asserts the
1563    /// derivation algorithm, not parser semantics.
1564    fn ref_from_vector_input(input: &str) -> Ref {
1565        if let Some(rest) = input.strip_prefix("doi:") {
1566            Ref::Doi(Doi(rest.to_string()))
1567        } else if let Some(rest) = input.strip_prefix("arxiv:") {
1568            Ref::Arxiv(ArxivId(rest.to_string()))
1569        } else {
1570            panic!(
1571                "vectors.json entry has unknown ref scheme (expected doi: or arxiv: prefix): {}",
1572                input
1573            );
1574        }
1575    }
1576
1577    #[test]
1578    fn safekey_matches_reference_vectors() {
1579        // include_str! resolves relative to the file containing this macro
1580        // call (crates/doiget-core/src/lib.rs), so we go up three levels
1581        // to reach the workspace root, then down to tests/fixtures.
1582        let raw = include_str!("../../../tests/fixtures/safekey/vectors.json");
1583        let parsed: SafekeyVectorFile =
1584            serde_json::from_str(raw).expect("vectors.json is valid JSON matching schema");
1585
1586        // Phase 0 final ships the full NORMATIVE 100-entry set
1587        // (docs/SAFEKEY.md §5). The fixture is the binding cross-tool
1588        // contract with BiblioFetch.jl; tightening the count guard to
1589        // `== 100` ensures the set cannot silently grow or shrink without
1590        // a coordinated ADR bump (per docs/SAFEKEY.md status block).
1591        assert_eq!(
1592            parsed.vectors.len(),
1593            100,
1594            "vectors.json MUST be exactly 100 entries (NORMATIVE per docs/SAFEKEY.md §5); got {}",
1595            parsed.vectors.len()
1596        );
1597
1598        let mut failures: Vec<String> = Vec::new();
1599        for v in &parsed.vectors {
1600            let r = ref_from_vector_input(&v.input);
1601            let got = r.safekey().as_str().to_string();
1602            if got != v.expected {
1603                failures.push(format!(
1604                    "input={:?}\n  expected={:?}\n  got     ={:?}",
1605                    v.input, v.expected, got
1606                ));
1607            }
1608        }
1609
1610        assert!(
1611            failures.is_empty(),
1612            "{}/{} safekey reference vectors failed:\n{}",
1613            failures.len(),
1614            parsed.vectors.len(),
1615            failures.join("\n")
1616        );
1617    }
1618
1619    #[test]
1620    fn safekey_truncates_long_inputs_with_sha256_suffix() {
1621        // Construct a synthetic DOI whose suffix produces a `trimmed` longer than
1622        // 192 chars after step 3. 220 ASCII-safe chars + the `doi_10.1234/`
1623        // prefix easily exceeds 192. The resulting key must be exactly 201 chars:
1624        // 192 (trimmed prefix) + 1 (`_` separator) + 8 (hex of first 4 bytes of
1625        // SHA-256(raw)). Per docs/SAFEKEY.md §3 step 5.
1626        let suffix = "a".repeat(220);
1627        let doi = Doi(format!("10.1234/{}", suffix));
1628        let key = Ref::Doi(doi).safekey();
1629        let s = key.as_str();
1630
1631        // Shape: <192 ASCII chars from {A-Za-z0-9._-}> + "_" + <8 hex chars>
1632        assert_eq!(
1633            s.len(),
1634            201,
1635            "expected 201-char truncated key, got {}: {}",
1636            s.len(),
1637            s
1638        );
1639        assert_eq!(&s[192..193], "_", "expected '_' separator at byte 192");
1640        let hash_part = &s[193..];
1641        assert_eq!(hash_part.len(), 8, "hash suffix must be 8 hex chars");
1642        assert!(
1643            hash_part
1644                .chars()
1645                .all(|c| c.is_ascii_hexdigit() && !c.is_ascii_uppercase()),
1646            "hash suffix must be lowercase hex: {}",
1647            hash_part
1648        );
1649
1650        // Determinism: same input twice must produce the same key.
1651        let key2 = Ref::Doi(Doi(format!("10.1234/{}", "a".repeat(220)))).safekey();
1652        assert_eq!(s, key2.as_str(), "safekey must be deterministic");
1653
1654        // Hash content: must equal hex(sha256(raw)[..4]) where raw is the
1655        // pre-escape prefixed form per docs/SAFEKEY.md §3 step 5.
1656        use sha2::Digest;
1657        let raw = format!("doi_10.1234/{}", "a".repeat(220));
1658        let expected_hash = {
1659            let digest = sha2::Sha256::digest(raw.as_bytes());
1660            format!(
1661                "{:02x}{:02x}{:02x}{:02x}",
1662                digest[0], digest[1], digest[2], digest[3]
1663            )
1664        };
1665        assert_eq!(
1666            hash_part, expected_hash,
1667            "hash must match SHA-256 of raw form"
1668        );
1669    }
1670
1671    // -----------------------------------------------------------------
1672    // Doi::parse / ArxivId::parse / Ref::parse — Phase 1 W3-A.
1673    // Spec: docs/SECURITY.md §1.1 (input validation). The rejection
1674    // category set is the binding contract; each test case below names
1675    // which rule it exercises in a comment.
1676    // -----------------------------------------------------------------
1677
1678    // ---- Doi::parse happy paths (≥6) --------------------------------
1679
1680    #[test]
1681    fn doi_parse_accepts_bare_canonical_form() {
1682        // Rule: "10.<registrant>/<suffix>" is the canonical bare form.
1683        let d = Doi::parse("10.1234/example").expect("canonical bare DOI");
1684        assert_eq!(d.as_str(), "10.1234/example");
1685    }
1686
1687    #[test]
1688    fn doi_parse_accepts_doi_uri_scheme() {
1689        // Rule: the `doi:` scheme is stripped at construction; as_str
1690        // never carries it (matches docs/SAFEKEY.md §3 step 0).
1691        let d = Doi::parse("doi:10.1234/example").expect("doi: scheme accepted");
1692        assert_eq!(d.as_str(), "10.1234/example");
1693    }
1694
1695    #[test]
1696    fn doi_parse_accepts_complex_real_world_suffix() {
1697        // Rule: suffix charset includes `.`, `(`, `)`, `-`. From a real
1698        // PhysRevLett DOI used elsewhere in the test fixture set.
1699        let d = Doi::parse("10.1103/PhysRevLett.130.200601").expect("real-world PhysRev DOI");
1700        assert_eq!(d.as_str(), "10.1103/PhysRevLett.130.200601");
1701    }
1702
1703    #[test]
1704    fn doi_parse_accepts_parens_in_suffix() {
1705        // Rule: `(` and `)` are explicitly listed in the spec charset.
1706        let d = Doi::parse("10.1016/S0370-1573(98)00122-3").expect("parens in suffix");
1707        assert_eq!(d.as_str(), "10.1016/S0370-1573(98)00122-3");
1708    }
1709
1710    #[test]
1711    fn doi_parse_accepts_nested_slashes_in_suffix() {
1712        // Rule: `/` is a suffix character; only the first `/` is the
1713        // registrant/suffix separator.
1714        let d = Doi::parse("10.1234/foo/bar/baz").expect("nested slashes");
1715        assert_eq!(d.as_str(), "10.1234/foo/bar/baz");
1716    }
1717
1718    #[test]
1719    fn doi_parse_accepts_colon_in_legacy_kluwer_suffix() {
1720        // #194: legacy Kluwer/Springer DOIs (`10.1023/A:NNNNNNNNNN`)
1721        // carry a `:` in the suffix. Real DOI: "Entanglement, Quantum
1722        // Phase Transitions, and DMRG" (Kluwer, 2002).
1723        let d = Doi::parse("10.1023/A:1019601218492").expect("legacy Kluwer colon DOI");
1724        assert_eq!(d.as_str(), "10.1023/A:1019601218492");
1725    }
1726
1727    #[test]
1728    fn doi_parse_accepts_colon_in_edp_jphys_suffix() {
1729        // #194: EDP Sciences / Journal de Physique legacy corpus uses
1730        // `10.1051/jphys:NNNNNNNNNNNNNNNNN`. Real DOIs from the dogfood
1731        // Ising-RG run; both resolve at doi.org and via Crossref.
1732        let d = Doi::parse("10.1051/jphys:0198900500120136500").expect("EDP jphys colon DOI");
1733        assert_eq!(d.as_str(), "10.1051/jphys:0198900500120136500");
1734        let d2 = Doi::parse("doi:10.1051/jphys:0198500460100164500").expect("scheme + colon");
1735        assert_eq!(d2.as_str(), "10.1051/jphys:0198500460100164500");
1736    }
1737
1738    #[test]
1739    fn doi_parse_rejects_semicolon_in_suffix() {
1740        // #194 / ADR-0026: `;` is the natural ASCII neighbor of `:` and
1741        // is explicitly EXCLUDED from the suffix charset extension
1742        // (ADR-0026 §"Out of scope"). This test guards against an
1743        // over-broad `matches!` arm (e.g. an accidental `':'..=';'` range
1744        // typo) re-admitting `;` along with `:`.
1745        let result = Doi::parse("10.1234/foo;bar");
1746        assert!(
1747            matches!(result, Err(RefParseError::InvalidDoiSuffixChar { ch: ';' })),
1748            "expected InvalidDoiSuffixChar with ch=';', got {:?}",
1749            result
1750        );
1751    }
1752
1753    #[test]
1754    fn doi_parse_accepts_suffix_at_max_len_boundary() {
1755        // Rule: a suffix of exactly DOI_SUFFIX_MAX_LEN bytes is accepted;
1756        // 1 byte more is rejected (covered separately below).
1757        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN);
1758        let input = format!("10.1234/{}", suffix);
1759        let d = Doi::parse(&input).expect("suffix at max len");
1760        assert_eq!(d.as_str().len(), "10.1234/".len() + DOI_SUFFIX_MAX_LEN);
1761    }
1762
1763    #[test]
1764    fn doi_parse_uri_scheme_is_case_insensitive() {
1765        // Rule: be lenient on scheme casing; the scheme is stripped
1766        // either way so the stored form is identical.
1767        let d = Doi::parse("DOI:10.1234/example").expect("uppercase scheme");
1768        assert_eq!(d.as_str(), "10.1234/example");
1769    }
1770
1771    // ---- Doi::parse rejection paths (≥6) ----------------------------
1772
1773    #[test]
1774    fn doi_parse_rejects_missing_10_prefix() {
1775        // Rule: must start with "10." literal.
1776        assert_eq!(
1777            Doi::parse("11.1234/example"),
1778            Err(RefParseError::MissingDoiPrefix)
1779        );
1780    }
1781
1782    #[test]
1783    fn doi_parse_rejects_empty_input() {
1784        // Rule: empty inputs are not valid DOIs.
1785        assert_eq!(Doi::parse(""), Err(RefParseError::Empty));
1786    }
1787
1788    #[test]
1789    fn doi_parse_rejects_missing_suffix_separator() {
1790        // Rule: must contain a `/` between registrant and suffix.
1791        assert_eq!(
1792            Doi::parse("10.1234"),
1793            Err(RefParseError::MissingDoiSuffixSeparator)
1794        );
1795    }
1796
1797    #[test]
1798    fn doi_parse_rejects_empty_suffix() {
1799        // Rule: suffix must be non-empty.
1800        assert_eq!(Doi::parse("10.1234/"), Err(RefParseError::EmptyDoiSuffix));
1801    }
1802
1803    #[test]
1804    fn doi_parse_rejects_invalid_registrant_too_short() {
1805        // Rule: registrant must be 4–9 digits.
1806        assert_eq!(
1807            Doi::parse("10.12/example"),
1808            Err(RefParseError::InvalidDoiRegistrant)
1809        );
1810    }
1811
1812    #[test]
1813    fn doi_parse_rejects_non_digit_registrant() {
1814        // Rule: registrant chars must all be ASCII digits.
1815        assert_eq!(
1816            Doi::parse("10.12ab/example"),
1817            Err(RefParseError::InvalidDoiRegistrant)
1818        );
1819    }
1820
1821    #[test]
1822    fn doi_parse_rejects_control_char_in_suffix() {
1823        // Rule (from docs/SECURITY.md §1.1, log-injection mitigation):
1824        // control chars are not in the suffix charset; reject before they
1825        // can reach the provenance log.
1826        let result = Doi::parse("10.1234/foo\nbar");
1827        assert!(
1828            matches!(
1829                result,
1830                Err(RefParseError::InvalidDoiSuffixChar { ch: '\n' })
1831            ),
1832            "got {:?}",
1833            result
1834        );
1835    }
1836
1837    #[test]
1838    fn doi_parse_rejects_suffix_over_max_len() {
1839        // Rule: DOI_SUFFIX_MAX_LEN + 1 bytes is rejected.
1840        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 1);
1841        let input = format!("10.1234/{}", suffix);
1842        let result = Doi::parse(&input);
1843        match result {
1844            Err(RefParseError::DoiSuffixTooLong { len, max }) => {
1845                assert_eq!(len, DOI_SUFFIX_MAX_LEN + 1);
1846                assert_eq!(max, DOI_SUFFIX_MAX_LEN);
1847            }
1848            other => panic!("expected DoiSuffixTooLong, got {:?}", other),
1849        }
1850    }
1851
1852    #[test]
1853    fn doi_parse_rejects_non_ascii_in_suffix() {
1854        // Rule: spec charset is ASCII-only; non-ASCII becomes an
1855        // InvalidDoiSuffixChar (consistent with safekey behavior of
1856        // collapsing such chars to '_', which is a downstream concern).
1857        let result = Doi::parse("10.1234/物理学");
1858        assert!(
1859            matches!(result, Err(RefParseError::InvalidDoiSuffixChar { .. })),
1860            "got {:?}",
1861            result
1862        );
1863    }
1864
1865    // ---- ArxivId::parse happy paths (≥6) ----------------------------
1866
1867    #[test]
1868    fn arxiv_parse_accepts_new_style_4_digit_seq() {
1869        // Rule: new-style YYMM.NNNN (4-digit sequence number).
1870        let a = ArxivId::parse("0704.0001").expect("new-style 4-digit seq");
1871        assert_eq!(a.as_str(), "0704.0001");
1872    }
1873
1874    #[test]
1875    fn arxiv_parse_accepts_new_style_5_digit_seq() {
1876        // Rule: new-style YYMM.NNNNN (5-digit sequence number, post-2015).
1877        let a = ArxivId::parse("2401.12345").expect("new-style 5-digit seq");
1878        assert_eq!(a.as_str(), "2401.12345");
1879    }
1880
1881    #[test]
1882    fn arxiv_parse_accepts_new_style_with_version() {
1883        // Rule: optional `vN` version suffix.
1884        let a = ArxivId::parse("2401.12345v2").expect("with version");
1885        assert_eq!(a.as_str(), "2401.12345v2");
1886    }
1887
1888    #[test]
1889    fn arxiv_parse_accepts_old_style() {
1890        // Rule: old-style subject-class/YYMMNNN.
1891        let a = ArxivId::parse("cond-mat/9501001").expect("old-style cond-mat");
1892        assert_eq!(a.as_str(), "cond-mat/9501001");
1893    }
1894
1895    #[test]
1896    fn arxiv_parse_accepts_old_style_with_subclass_and_version() {
1897        // Rule: old-style subject-class may have a `.XX` two-upper subclass
1898        // and an optional `vN` suffix.
1899        let a = ArxivId::parse("astro-ph.CO/0703123v2").expect("old-style with subclass + version");
1900        assert_eq!(a.as_str(), "astro-ph.CO/0703123v2");
1901    }
1902
1903    #[test]
1904    fn arxiv_parse_accepts_arxiv_uri_scheme() {
1905        // Rule: `arxiv:` / `arXiv:` scheme is stripped at construction.
1906        let a = ArxivId::parse("arxiv:2401.12345").expect("arxiv: scheme");
1907        assert_eq!(a.as_str(), "2401.12345");
1908    }
1909
1910    #[test]
1911    fn arxiv_parse_accepts_arxiv_uri_scheme_mixed_case() {
1912        // Rule: scheme case-insensitive; matches the `arXiv:` form named
1913        // in docs/MCP_TOOLS.md.
1914        let a = ArxivId::parse("arXiv:2401.12345v2").expect("arXiv: scheme");
1915        assert_eq!(a.as_str(), "2401.12345v2");
1916    }
1917
1918    // ---- ArxivId::parse rejection paths (≥6) ------------------------
1919
1920    #[test]
1921    fn arxiv_parse_rejects_empty_input() {
1922        // Rule: empty rejected up-front.
1923        assert_eq!(ArxivId::parse(""), Err(RefParseError::Empty));
1924    }
1925
1926    #[test]
1927    fn arxiv_parse_rejects_no_dot_or_slash() {
1928        // Rule: must contain `.` (new-style) or `/` (old-style).
1929        assert_eq!(
1930            ArxivId::parse("notanarxivid"),
1931            Err(RefParseError::InvalidArxivShape)
1932        );
1933    }
1934
1935    #[test]
1936    fn arxiv_parse_rejects_new_style_wrong_head_length() {
1937        // Rule: head must be exactly 4 digits.
1938        assert_eq!(
1939            ArxivId::parse("240.12345"),
1940            Err(RefParseError::InvalidArxivShape)
1941        );
1942    }
1943
1944    #[test]
1945    fn arxiv_parse_rejects_new_style_seq_too_short() {
1946        // Rule: seq must be 4–5 digits.
1947        assert_eq!(
1948            ArxivId::parse("2401.123"),
1949            Err(RefParseError::InvalidArxivShape)
1950        );
1951    }
1952
1953    #[test]
1954    fn arxiv_parse_rejects_old_style_wrong_id_length() {
1955        // Rule: old-style id is exactly 7 digits.
1956        assert_eq!(
1957            ArxivId::parse("cond-mat/95001"),
1958            Err(RefParseError::InvalidArxivShape)
1959        );
1960    }
1961
1962    #[test]
1963    fn arxiv_parse_rejects_invalid_version_suffix() {
1964        // Rule: version suffix is `v` followed by ≥1 digits, nothing else.
1965        assert_eq!(
1966            ArxivId::parse("2401.12345v"),
1967            Err(RefParseError::InvalidArxivShape)
1968        );
1969    }
1970
1971    #[test]
1972    fn arxiv_parse_rejects_control_char() {
1973        // Rule (docs/SECURITY.md §1.1 log-injection): no control chars.
1974        assert_eq!(
1975            ArxivId::parse("2401.12345\n"),
1976            Err(RefParseError::InvalidArxivShape)
1977        );
1978    }
1979
1980    #[test]
1981    fn arxiv_parse_rejects_non_ascii() {
1982        // Rule: ASCII-only.
1983        assert_eq!(
1984            ArxivId::parse("2401.物理"),
1985            Err(RefParseError::InvalidArxivShape)
1986        );
1987    }
1988
1989    // ---- Ref::parse happy paths (≥6) --------------------------------
1990
1991    #[test]
1992    fn ref_parse_dispatches_doi_scheme_to_doi() {
1993        // Detection rule 1: explicit `doi:` scheme.
1994        match Ref::parse("doi:10.1234/example").expect("doi: dispatched to Doi") {
1995            Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/example"),
1996            other => panic!("expected Ref::Doi, got {:?}", other),
1997        }
1998    }
1999
2000    #[test]
2001    fn ref_parse_dispatches_arxiv_scheme_to_arxiv() {
2002        // Detection rule 2: explicit `arxiv:` scheme.
2003        match Ref::parse("arxiv:2401.12345").expect("arxiv: dispatched to Arxiv") {
2004            Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
2005            other => panic!("expected Ref::Arxiv, got {:?}", other),
2006        }
2007    }
2008
2009    #[test]
2010    fn ref_parse_dispatches_arxiv_mixed_case_scheme() {
2011        // Detection rule 2 (case-insensitive): `arXiv:` form.
2012        match Ref::parse("arXiv:cond-mat/9501001").expect("arXiv: dispatched") {
2013            Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
2014            other => panic!("expected Ref::Arxiv, got {:?}", other),
2015        }
2016    }
2017
2018    #[test]
2019    fn ref_parse_bare_doi_resolves_to_doi() {
2020        // Detection rule 3: bare input starting with `10.` is a DOI.
2021        match Ref::parse("10.1234/foo").expect("bare DOI") {
2022            Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/foo"),
2023            other => panic!("expected Ref::Doi, got {:?}", other),
2024        }
2025    }
2026
2027    #[test]
2028    fn ref_parse_bare_arxiv_new_resolves_to_arxiv() {
2029        // Detection rule 4: bare input not starting with `10.` falls
2030        // through to arXiv. Tests the ambiguous-input branch named in the
2031        // PR brief: `2401.12345` should resolve to ArxivId.
2032        match Ref::parse("2401.12345").expect("bare new-style arXiv") {
2033            Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
2034            other => panic!("expected Ref::Arxiv, got {:?}", other),
2035        }
2036    }
2037
2038    #[test]
2039    fn ref_parse_bare_arxiv_old_resolves_to_arxiv() {
2040        // Detection rule 4: bare old-style arXiv id.
2041        match Ref::parse("cond-mat/9501001").expect("bare old-style arXiv") {
2042            Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
2043            other => panic!("expected Ref::Arxiv, got {:?}", other),
2044        }
2045    }
2046
2047    // ---- Ref::parse rejection paths (≥6) ----------------------------
2048
2049    #[test]
2050    fn ref_parse_rejects_empty() {
2051        // Rule: empty up-front.
2052        assert_eq!(Ref::parse(""), Err(RefParseError::Empty));
2053    }
2054
2055    #[test]
2056    fn ref_parse_doi_scheme_with_invalid_doi_propagates_doi_error() {
2057        // When the scheme is explicit, we surface the parser's error
2058        // verbatim — not a generic "shape mismatch".
2059        assert_eq!(
2060            Ref::parse("doi:10.1234"),
2061            Err(RefParseError::MissingDoiSuffixSeparator)
2062        );
2063    }
2064
2065    #[test]
2066    fn ref_parse_arxiv_scheme_with_invalid_arxiv_propagates_arxiv_error() {
2067        assert_eq!(
2068            Ref::parse("arxiv:notanid"),
2069            Err(RefParseError::InvalidArxivShape)
2070        );
2071    }
2072
2073    #[test]
2074    fn ref_parse_bare_with_10_prefix_uses_doi_errors() {
2075        // Bare `10.…` heuristic: DOI parser is dispatched and its error
2076        // surfaces (here: bad registrant).
2077        assert_eq!(
2078            Ref::parse("10.12/x"),
2079            Err(RefParseError::InvalidDoiRegistrant)
2080        );
2081    }
2082
2083    #[test]
2084    fn ref_parse_bare_without_10_prefix_uses_arxiv_errors() {
2085        // Bare ambiguous fallback: ArxivId parser is dispatched and its
2086        // error surfaces. `1.2.3` is neither a DOI nor an arXiv shape.
2087        assert_eq!(Ref::parse("1.2.3"), Err(RefParseError::InvalidArxivShape));
2088    }
2089
2090    #[test]
2091    fn ref_parse_rejects_doi_scheme_with_oversized_suffix() {
2092        // Length-bound: DOI suffix > DOI_SUFFIX_MAX_LEN through Ref::parse
2093        // surfaces DoiSuffixTooLong, not a generic InvalidArxivShape.
2094        let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 5);
2095        let input = format!("doi:10.1234/{}", suffix);
2096        match Ref::parse(&input) {
2097            Err(RefParseError::DoiSuffixTooLong { .. }) => {}
2098            other => panic!("expected DoiSuffixTooLong, got {:?}", other),
2099        }
2100    }
2101
2102    #[test]
2103    fn ref_parse_round_trip_via_serde_preserves_inner_string() {
2104        // Wire-format check: Doi/ArxivId are #[serde(transparent)], and a
2105        // round-trip through Ref::parse → serde_json → Ref must preserve
2106        // the inner identifier. Guards against accidental scheme leakage
2107        // into the stored form.
2108        let r = Ref::parse("doi:10.1234/example").expect("parse ok");
2109        let json = serde_json::to_string(&r).expect("serialize");
2110        // The transparent inner value is the bare identifier (no `doi:`).
2111        assert!(
2112            json.contains("10.1234/example") && !json.contains("doi:"),
2113            "scheme leaked into wire form: {}",
2114            json
2115        );
2116    }
2117
2118    #[test]
2119    fn ref_parse_error_maps_to_invalid_ref_error_code() {
2120        // Public-API contract (docs/PUBLIC_API.md §4): all parse failures
2121        // collapse to ErrorCode::InvalidRef at the public boundary.
2122        let err: ErrorCode = RefParseError::Empty.into();
2123        assert_eq!(err, ErrorCode::InvalidRef);
2124        let err2: ErrorCode = RefParseError::MissingDoiPrefix.into();
2125        assert_eq!(err2, ErrorCode::InvalidRef);
2126    }
2127
2128    // -----------------------------------------------------------------
2129    // DenialReason / DenialContext (ADR-0023) — wire-shape tests.
2130    // -----------------------------------------------------------------
2131
2132    #[test]
2133    fn denial_reason_serializes_snake_case() {
2134        // ADR-0023 §2 / docs/PUBLIC_API.md §8: wire form is snake_case.
2135        let s = serde_json::to_string(&DenialReason::RedirectNotInAllowlist).expect("ser");
2136        assert_eq!(s, "\"redirect_not_in_allowlist\"");
2137        let s = serde_json::to_string(&DenialReason::SizeCapExceeded).expect("ser");
2138        assert_eq!(s, "\"size_cap_exceeded\"");
2139        let s = serde_json::to_string(&DenialReason::ContentTypeMismatch).expect("ser");
2140        assert_eq!(s, "\"content_type_mismatch\"");
2141    }
2142
2143    #[test]
2144    fn denial_reason_round_trip_via_serde() {
2145        // Round-trip every closed-set variant so adding a new variant
2146        // forces this test to be updated (the closed-set contract).
2147        for r in [
2148            DenialReason::RedirectNotInAllowlist,
2149            DenialReason::InsecureScheme,
2150            DenialReason::HostInBlockList,
2151            DenialReason::SizeCapExceeded,
2152            DenialReason::SchemaDrift,
2153            DenialReason::CapabilityNotGranted,
2154            DenialReason::RateLimitWindow,
2155            DenialReason::SsrfPrivateAddress,
2156            DenialReason::ContentTypeMismatch,
2157        ] {
2158            let s = serde_json::to_string(&r).expect("ser");
2159            let back: DenialReason = serde_json::from_str(&s).expect("de");
2160            assert_eq!(back, r, "round-trip mismatch for {:?} -> {}", r, s);
2161        }
2162    }
2163
2164    #[test]
2165    fn denial_context_round_trips_full_shape() {
2166        // A populated context (the redirect-denied case from ADR-0023 §1
2167        // example) survives a JSON round-trip. Whole-struct equality
2168        // exercises the `PartialEq` derive added per ADR-0023 §3 (added
2169        // in the multi-agent review feedback PR — see ADR-0023 history).
2170        let dc = DenialContext {
2171            reason: DenialReason::RedirectNotInAllowlist,
2172            source: Some("crossref".to_string()),
2173            attempted: Some("evil.example.com".to_string()),
2174            expected: Some(vec![
2175                "api.crossref.org".to_string(),
2176                "*.crossref.org".to_string(),
2177            ]),
2178            hop_index: Some(1),
2179            cap: None,
2180            actual: None,
2181        };
2182        let s = serde_json::to_string(&dc).expect("ser");
2183        let back: DenialContext = serde_json::from_str(&s).expect("de");
2184        assert_eq!(back, dc);
2185    }
2186
2187    #[test]
2188    fn denial_context_serialize_elides_empty_fields() {
2189        // `skip_serializing_if = "Option::is_none"` must keep the wire form
2190        // lean: every `None` field MUST NOT appear on the wire. Reason is
2191        // always present.
2192        let dc = DenialContext {
2193            reason: DenialReason::CapabilityNotGranted,
2194            source: None,
2195            attempted: None,
2196            expected: None,
2197            hop_index: None,
2198            cap: None,
2199            actual: None,
2200        };
2201        let s = serde_json::to_string(&dc).expect("ser");
2202        assert_eq!(s, "{\"reason\":\"capability_not_granted\"}");
2203    }
2204
2205    #[test]
2206    fn denial_context_expected_some_empty_vec_preserves_explicit_empty_allowlist() {
2207        // Post-refinement disambiguation: `expected: Some(vec![])` is the
2208        // "explicit empty allowlist" signal and MUST survive the wire as
2209        // `"expected":[]`. Only `expected: None` is skipped on serialize.
2210        // This is the bug the previous `Vec<String>` shape masked.
2211        let dc = DenialContext {
2212            reason: DenialReason::RedirectNotInAllowlist,
2213            source: Some("crossref".to_string()),
2214            attempted: Some("evil.example.com".to_string()),
2215            expected: Some(Vec::new()),
2216            hop_index: None,
2217            cap: None,
2218            actual: None,
2219        };
2220        let s = serde_json::to_string(&dc).expect("ser");
2221        assert!(
2222            s.contains("\"expected\":[]"),
2223            "expected:[] must survive on the wire (got: {s})"
2224        );
2225        let back: DenialContext = serde_json::from_str(&s).expect("de");
2226        assert_eq!(back.expected, Some(Vec::new()));
2227    }
2228
2229    #[test]
2230    fn denial_context_deserialize_tolerates_missing_optional_fields() {
2231        // Consumer-side contract (ADR-0023 §3): consumers MUST tolerate
2232        // any subset of fields being present. Missing optional fields
2233        // deserialize to their defaults via `#[serde(default)]`.
2234        let wire = r#"{"reason":"size_cap_exceeded","cap":104857600,"actual":209715200}"#;
2235        let dc: DenialContext = serde_json::from_str(wire).expect("de");
2236        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
2237        assert_eq!(dc.cap, Some(104857600));
2238        assert_eq!(dc.actual, Some(209715200));
2239        assert!(dc.source.is_none());
2240        assert!(dc.attempted.is_none());
2241        assert!(dc.expected.is_none());
2242        assert!(dc.hop_index.is_none());
2243    }
2244
2245    #[test]
2246    fn full_error_envelope_with_denial_context_serializes_to_pinned_json() {
2247        // Pins the byte-exact wire shape of the full failure envelope
2248        // documented in docs/ERRORS.md §3 + §3.1 and ADR-0023 §1. A
2249        // future regression that flips key order or skip-rules anywhere
2250        // in the chain breaks this test loudly.
2251        //
2252        // Note: serde_json's `Map` (used by `json!`) sorts keys
2253        // alphabetically when the `preserve_order` feature is NOT
2254        // enabled (we do not enable it). Embedding a `DenialContext`
2255        // via `json!` first re-serialises it through the same alphabet-
2256        // sorted Map path, so the inner field order is also alphabetical
2257        // here — NOT the struct field-order produced by direct
2258        // `to_string(&DenialContext)`. This is by design: the public
2259        // wire shape is canonicalised by serde_json's Map ordering, so
2260        // the byte-exact pin below documents that exact canonicalisation.
2261        let denial = DenialContext {
2262            reason: DenialReason::RedirectNotInAllowlist,
2263            source: Some("crossref".into()),
2264            attempted: Some("evil.example.com".into()),
2265            expected: Some(vec!["api.crossref.org".into(), "*.crossref.org".into()]),
2266            hop_index: Some(1),
2267            cap: None,
2268            actual: None,
2269        };
2270        let envelope = serde_json::json!({
2271            "ok": false,
2272            "error": {
2273                "code": ErrorCode::NetworkError,
2274                "message": "redirect target evil.example.com not in allowlist for source crossref",
2275                "denial_context": denial,
2276            }
2277        });
2278        let actual = serde_json::to_string(&envelope).expect("serialize envelope");
2279        let expected = r#"{"error":{"code":"NETWORK_ERROR","denial_context":{"attempted":"evil.example.com","expected":["api.crossref.org","*.crossref.org"],"hop_index":1,"reason":"redirect_not_in_allowlist","source":"crossref"},"message":"redirect target evil.example.com not in allowlist for source crossref"},"ok":false}"#;
2280        assert_eq!(actual, expected);
2281    }
2282
2283    #[test]
2284    fn denial_context_rejects_unknown_fields() {
2285        // `#[serde(deny_unknown_fields)]` (ADR-0023 §3, PUBLIC_API.md §8):
2286        // an unknown field on the wire MUST be a deserialize error so
2287        // forward-compat field additions stay a breaking change.
2288        let wire = r#"{"reason":"capability_not_granted","banana":1}"#;
2289        let result: Result<DenialContext, _> = serde_json::from_str(wire);
2290        assert!(
2291            result.is_err(),
2292            "deny_unknown_fields must reject 'banana': {:?}",
2293            result.map(|d| d.reason),
2294        );
2295    }
2296}