doiget_core/lib.rs
1//! # doiget-core
2//!
3//! Core library for [doiget](https://github.com/sotashimozono/doiget): an Open Access
4//! first paper-fetcher with strict capability gating, fail-closed provenance logging,
5//! and a BiblioFetch.jl-compatible store layout.
6//!
7//! Phase 0 ships only this skeleton. Real implementations land in Phase 1.
8//! See `docs/PUBLIC_API.md` for the semver-locked surface and `docs/ARCHITECTURE.md`
9//! for the high-level design.
10
11#![warn(missing_docs)]
12#![forbid(unsafe_code)]
13
14use serde::{Deserialize, Serialize};
15use sha2::Digest;
16
17// --- Modules ---
18pub mod canonical;
19pub mod dry_run;
20pub mod http;
21pub mod orchestrator;
22pub mod provenance;
23pub mod rate_limiter;
24pub mod refs;
25pub mod source;
26pub mod sources;
27pub mod store;
28pub mod user_extension;
29
30// Phase 4 citation graph (ADR-0010). Compile-gated by the `citation`
31// Cargo feature, which itself enables the `metadata` feature so the
32// Tier-2 source impls are available.
33#[cfg(feature = "citation")]
34pub mod citation_graph;
35
36// Re-export the canonical-tuple audit-identity types at the crate root
37// per ADR-0024 / `docs/PUBLIC_API.md` §1. The types themselves live in
38// the [`canonical`] submodule.
39pub use crate::canonical::{CanonicalRef, SourceType};
40
/// Crate version, captured from `CARGO_PKG_VERSION` at compile time. Used by
/// `doiget-cli --version` and `doiget_health`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// TOML schema version this build writes. See `docs/STORE.md` §3.
pub const SCHEMA_VERSION: &str = "1.0";

/// Hard-coded cap on the number of fetches in flight at the same time.
/// Surfaced through [`RateLimits::max_concurrent_fetches`]. See
/// `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_CONCURRENT_FETCHES: u32 = 5;

/// Hard-coded cap on fetch attempts per second across all sources.
/// Surfaced through [`RateLimits::max_fetches_per_second`]. See
/// `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_FETCHES_PER_SECOND: f32 = 5.0;

/// Maximum batch size for `doiget batch` and `doiget_batch_fetch`.
pub const MCP_BATCH_MAX_SIZE: usize = 100;

/// Slice 2 alias for [`MCP_BATCH_MAX_SIZE`] using the
/// spec-language name (`docs/MCP_TOOLS.md` §1 / Slice 2 plan). The
/// numeric value MUST equal [`MCP_BATCH_MAX_SIZE`]; an internal test
/// pins the equivalence so the two constants cannot drift. (The alias is
/// also defined directly in terms of [`MCP_BATCH_MAX_SIZE`], so the two are
/// equal by construction in this build.)
pub const MAX_BATCH_REFS: usize = MCP_BATCH_MAX_SIZE;

/// Maximum queued MCP requests beyond `MAX_CONCURRENT_FETCHES`. Excess returns
/// `ErrorCode::RateLimited`. See `docs/SECURITY.md` §1.4 / `docs/MCP_TOOLS.md`.
pub const MCP_QUEUE_DEPTH_MAX: usize = 100;

/// MCP server stdin-EOF graceful-shutdown deadline, in seconds. See ADR-0001
/// and `docs/MCP_TOOLS.md` §8.
pub const MCP_STDIN_EOF_SHUTDOWN_SEC: u64 = 5;

/// Maximum DOI suffix length accepted at validation, in bytes (the validator
/// compares against `str::len`). See `docs/SECURITY.md` §1.1.
pub const DOI_SUFFIX_MAX_LEN: usize = 256;

/// Maximum PDF body size accepted by the fetcher, in bytes (100 MB, decimal).
/// See `docs/SECURITY.md` §1.2 (Oversized PDF).
pub const PDF_MAX_BYTES: u64 = 100_000_000;

/// Time-to-live, in days, for entries in `~/.cache/doiget/resolver/`. See
/// `docs/CACHE.md` §3.
pub const RESOLVER_CACHE_TTL_DAYS: u32 = 7;

/// Time-to-live, in days, for entries in `~/.cache/doiget/citations/`. See
/// `docs/CACHE.md` §3.
pub const CITATION_CACHE_TTL_DAYS: u32 = 30;
84
85// ---------------------------------------------------------------------------
86// Ref
87// ---------------------------------------------------------------------------
88
/// A reference to a paper, either by DOI or arXiv id.
///
/// See `docs/SECURITY.md` §1.1 for input-validation rules.
///
/// Wire format: adjacently tagged, with the lowercase variant name under
/// `kind` and the bare identifier under `id`, e.g.
/// `{"kind":"doi","id":"10.1234/example"}` or
/// `{"kind":"arxiv","id":"2401.12345"}` in JSON.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind", content = "id")]
pub enum Ref {
    /// A DOI (e.g., `10.1234/example`).
    Doi(Doi),
    /// An arXiv id (e.g., `2401.12345`).
    Arxiv(ArxivId),
}
100
/// A validated DOI string.
///
/// Construct via `Doi::parse(s)` (Phase 1+). The inner field is intentionally
/// `pub(crate)` to forbid bypass construction; tests inside `doiget-core` may
/// still use `Doi(s)` for fixture purposes.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"10.1234/example"`.
//
// NOTE(review): the derived `Deserialize` (with `serde(transparent)`) builds
// a `Doi` directly from the wire string without calling `Doi::parse`, so a
// deserialized value is not re-validated. Confirm this is intended for every
// place a `Doi` / `Ref` is read from untrusted input; if not, route
// deserialization through the parser (e.g. `serde(try_from = "String")`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Doi(pub(crate) String);
111
/// A validated arXiv id string.
///
/// Construct via `ArxivId::parse(s)` (Phase 1+). Inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"2401.12345"`.
//
// NOTE(review): the derived `Deserialize` (with `serde(transparent)`) builds
// an `ArxivId` directly from the wire string without calling
// `ArxivId::parse`, so a deserialized value is not re-validated. Confirm this
// is intended wherever an `ArxivId` / `Ref` is read from untrusted input.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct ArxivId(pub(crate) String);
120
121impl Doi {
122 /// Returns the DOI as a string slice.
123 pub fn as_str(&self) -> &str {
124 &self.0
125 }
126
127 /// Parses and validates a DOI string per `docs/SECURITY.md` §1.1.
128 ///
129 /// Accepts:
130 /// - Bare DOIs: `10.<registrant>/<suffix>` where `<registrant>` is 4–9
131 /// digits and `<suffix>` is a non-empty sequence of characters drawn
132 /// from `[A-Za-z0-9._/():-]` (the `:` covers legacy Kluwer
133 /// `10.1023/A:NNNN` and EDP Sciences `10.1051/jphys:NNNN` DOIs).
134 /// - The `doi:` URI scheme prefix; it is stripped before validation, so
135 /// the stored value never carries a scheme. (Matches the convention
136 /// established in `docs/SAFEKEY.md` §3 step 0.)
137 ///
138 /// Rejects:
139 /// - Inputs missing the literal `10.` prefix (after optional scheme
140 /// strip).
141 /// - Suffixes longer than [`DOI_SUFFIX_MAX_LEN`] bytes.
142 /// - Empty suffixes.
143 /// - Any character outside the suffix charset above (including control
144 /// characters, whitespace, and non-ASCII).
145 ///
146 /// # Errors
147 ///
148 /// Returns a [`RefParseError`] variant that names the specific rejection
149 /// category. Tier 1+ callers should map any [`RefParseError`] to
150 /// [`ErrorCode::InvalidRef`] when surfacing to MCP / CLI.
151 pub fn parse(s: &str) -> Result<Self, RefParseError> {
152 let stripped = parse::strip_doi_scheme(s);
153 parse::validate_doi(stripped)?;
154 Ok(Doi(stripped.to_string()))
155 }
156}
157
158impl ArxivId {
159 /// Returns the arXiv id as a string slice.
160 pub fn as_str(&self) -> &str {
161 &self.0
162 }
163
164 /// Parses and validates an arXiv id per `docs/SECURITY.md` §1.1 and the
165 /// pattern published in `docs/MCP_TOOLS.md`.
166 ///
167 /// Accepts:
168 /// - New-style ids: `YYMM.NNNNN[vN]` where the date block is 4 digits, the
169 /// sequence number is 4–5 digits, and the optional version `vN` is one
170 /// or more digits. Examples: `2401.12345`, `2401.12345v2`.
171 /// - Old-style ids: `subject-class/YYMMNNN[vN]` where the subject class
172 /// is a lowercase token (with optional internal hyphens and an
173 /// optional `.XX` two-uppercase-letter group), and the numeric body
174 /// is exactly 7 digits with optional `vN`. Examples:
175 /// `cond-mat/9501001`, `astro-ph.CO/0703123v2`.
176 /// - The `arxiv:` / `arXiv:` URI scheme prefix; it is stripped before
177 /// validation.
178 ///
179 /// Rejects:
180 /// - Inputs that match neither the new-style nor old-style shape.
181 /// - Inputs containing characters outside the per-shape charset
182 /// (control chars, whitespace, non-ASCII).
183 /// - Empty input.
184 ///
185 /// # Errors
186 ///
187 /// Returns a [`RefParseError`] variant that names the specific rejection
188 /// category.
189 pub fn parse(s: &str) -> Result<Self, RefParseError> {
190 let stripped = parse::strip_arxiv_scheme(s);
191 parse::validate_arxiv(stripped)?;
192 Ok(ArxivId(stripped.to_string()))
193 }
194}
195
196impl Ref {
197 /// Parses a string into a [`Ref`], auto-detecting DOI vs arXiv.
198 ///
199 /// Detection rules:
200 /// 1. If the input begins with the case-insensitive `doi:` scheme, the
201 /// remainder is parsed as a DOI.
202 /// 2. If the input begins with the `arxiv:` or `arXiv:` scheme, the
203 /// remainder is parsed as an arXiv id.
204 /// 3. Otherwise, if the input starts with `10.` it is treated as a bare
205 /// DOI; this matches the heuristic in `docs/SAFEKEY.md` §4 (Julia
206 /// reference) and is stable because DOIs always begin `10.`.
207 /// 4. Failing all of the above, parsing falls back to arXiv.
208 ///
209 /// The returned [`Ref`] never carries the URI scheme — `as_str()` on the
210 /// inner `Doi` / `ArxivId` is always the bare identifier.
211 ///
212 /// # Errors
213 ///
214 /// Returns a [`RefParseError`] from the underlying [`Doi::parse`] or
215 /// [`ArxivId::parse`] call. When the input has an explicit scheme
216 /// (`doi:` / `arxiv:`), the matching parser is dispatched and its error
217 /// surfaces directly. When the input is bare and ambiguous, the
218 /// heuristic in rule 3/4 selects the parser; an unparsable bare input
219 /// surfaces the arXiv parser's error (a non-`10.` ref that also fails
220 /// arXiv validation is never a valid DOI).
221 pub fn parse(s: &str) -> Result<Self, RefParseError> {
222 // Reject empty up front so all three parsers see a meaningful slice;
223 // without this, `strip_*_scheme("")` returns "" and we'd get a
224 // confusing "missing 10. prefix" error for empty input.
225 if s.is_empty() {
226 return Err(RefParseError::Empty);
227 }
228
229 if parse::has_doi_scheme(s) {
230 return Doi::parse(s).map(Ref::Doi);
231 }
232 if parse::has_arxiv_scheme(s) {
233 return ArxivId::parse(s).map(Ref::Arxiv);
234 }
235 if s.starts_with("10.") {
236 return Doi::parse(s).map(Ref::Doi);
237 }
238 ArxivId::parse(s).map(Ref::Arxiv)
239 }
240}
241
242// ---------------------------------------------------------------------------
243// Parser internals
244// ---------------------------------------------------------------------------
245
246mod parse {
247 use super::{RefParseError, DOI_SUFFIX_MAX_LEN};
248
249 /// Case-insensitive `doi:` prefix detector. Matches both `doi:` and
250 /// `DOI:` (and any case mix); the spec in `docs/SAFEKEY.md` §3 only
251 /// names the lowercase form, but the field convention is to be lenient
252 /// in what we accept (the scheme is dropped at the boundary anyway).
253 pub(crate) fn has_doi_scheme(s: &str) -> bool {
254 s.len() >= 4 && s.is_char_boundary(4) && s[..4].eq_ignore_ascii_case("doi:")
255 }
256
257 /// Case-insensitive `arxiv:` prefix detector. Accepts `arxiv:`,
258 /// `arXiv:` (the form used in `docs/MCP_TOOLS.md`), and any other case
259 /// mix.
260 pub(crate) fn has_arxiv_scheme(s: &str) -> bool {
261 s.len() >= 6 && s.is_char_boundary(6) && s[..6].eq_ignore_ascii_case("arxiv:")
262 }
263
264 pub(crate) fn strip_doi_scheme(s: &str) -> &str {
265 if has_doi_scheme(s) {
266 &s[4..]
267 } else {
268 s
269 }
270 }
271
272 pub(crate) fn strip_arxiv_scheme(s: &str) -> &str {
273 if has_arxiv_scheme(s) {
274 &s[6..]
275 } else {
276 s
277 }
278 }
279
280 /// DOI suffix charset per `docs/SECURITY.md` §1.1:
281 /// `[A-Za-z0-9._/():-]`. The forward slash is permitted inside the
282 /// suffix (e.g. `10.1016/...`); the registrant separator is the
283 /// *first* `/` and the suffix is everything after it.
284 ///
285 /// `:` is permitted because two large real publisher DOI families use
286 /// it in the suffix — legacy Kluwer/Springer (`10.1023/A:NNNNNNNNNN`)
287 /// and EDP Sciences / Journal de Physique
288 /// (`10.1051/jphys:NNNNNNNNNNNNNNNNN`). It adds no path-traversal
289 /// capability: traversal requires composing `/` and `.` into `../`,
290 /// and both characters are already in the suffix charset. In addition,
291 /// `safekey` independently escapes every char outside `[A-Za-z0-9._-]`
292 /// before any filesystem use, so `:` never reaches a path literally.
293 /// See ADR-0026 and `docs/SECURITY.md` §1.1.
294 fn is_doi_suffix_char(c: char) -> bool {
295 matches!(c,
296 'A'..='Z' | 'a'..='z' | '0'..='9'
297 | '.' | '_' | '/' | '(' | ')' | '-' | ':'
298 )
299 }
300
301 pub(crate) fn validate_doi(s: &str) -> Result<(), RefParseError> {
302 if s.is_empty() {
303 return Err(RefParseError::Empty);
304 }
305
306 // Must begin with literal "10."; the registrant is 4–9 digits up
307 // to the first '/'. After that, everything is suffix.
308 let rest = s
309 .strip_prefix("10.")
310 .ok_or(RefParseError::MissingDoiPrefix)?;
311 let slash_idx = rest
312 .find('/')
313 .ok_or(RefParseError::MissingDoiSuffixSeparator)?;
314 let registrant = &rest[..slash_idx];
315 let suffix = &rest[slash_idx + 1..];
316
317 // Registrant: 4–9 ASCII digits.
318 if registrant.len() < 4
319 || registrant.len() > 9
320 || !registrant.chars().all(|c| c.is_ascii_digit())
321 {
322 return Err(RefParseError::InvalidDoiRegistrant);
323 }
324
325 // Suffix: non-empty, charset-restricted, length-bounded.
326 if suffix.is_empty() {
327 return Err(RefParseError::EmptyDoiSuffix);
328 }
329 if suffix.len() > DOI_SUFFIX_MAX_LEN {
330 return Err(RefParseError::DoiSuffixTooLong {
331 len: suffix.len(),
332 max: DOI_SUFFIX_MAX_LEN,
333 });
334 }
335 if let Some(bad) = suffix.chars().find(|c| !is_doi_suffix_char(*c)) {
336 return Err(RefParseError::InvalidDoiSuffixChar { ch: bad });
337 }
338 Ok(())
339 }
340
341 /// Validates an arXiv id (with the `arxiv:` / `arXiv:` scheme already
342 /// stripped). Tries the new-style shape first, then the old-style.
343 pub(crate) fn validate_arxiv(s: &str) -> Result<(), RefParseError> {
344 if s.is_empty() {
345 return Err(RefParseError::Empty);
346 }
347 if validate_arxiv_new(s).is_ok() || validate_arxiv_old(s).is_ok() {
348 return Ok(());
349 }
350 Err(RefParseError::InvalidArxivShape)
351 }
352
353 /// New-style arXiv id: `YYMM.NNNNN[vN]`.
354 fn validate_arxiv_new(s: &str) -> Result<(), ()> {
355 let dot_idx = s.find('.').ok_or(())?;
356 let head = &s[..dot_idx];
357 let tail = &s[dot_idx + 1..];
358
359 // Head: exactly 4 ASCII digits.
360 if head.len() != 4 || !head.chars().all(|c| c.is_ascii_digit()) {
361 return Err(());
362 }
363
364 // Tail: 4–5 digits, then optional `v` followed by ≥1 digits.
365 let bytes = tail.as_bytes();
366 let mut i = 0;
367 while i < bytes.len() && bytes[i].is_ascii_digit() {
368 i += 1;
369 }
370 let digits_len = i;
371 if !(4..=5).contains(&digits_len) {
372 return Err(());
373 }
374 if i == bytes.len() {
375 return Ok(());
376 }
377 // Optional version suffix.
378 if bytes[i] != b'v' {
379 return Err(());
380 }
381 i += 1;
382 let v_start = i;
383 while i < bytes.len() && bytes[i].is_ascii_digit() {
384 i += 1;
385 }
386 if i == v_start || i != bytes.len() {
387 return Err(());
388 }
389 Ok(())
390 }
391
392 /// Old-style arXiv id: `subject-class/YYMMNNN[vN]`.
393 /// Subject class: `[a-z]([a-z-]*[a-z])?(\.[A-Z]{2})?`.
394 fn validate_arxiv_old(s: &str) -> Result<(), ()> {
395 let slash_idx = s.find('/').ok_or(())?;
396 let class = &s[..slash_idx];
397 let id = &s[slash_idx + 1..];
398
399 // Class: starts with [a-z], body is [a-z-], optional `.XX` (two
400 // ASCII upper).
401 let (core_class, dot_part) = match class.find('.') {
402 Some(d) => (&class[..d], Some(&class[d + 1..])),
403 None => (class, None),
404 };
405 if core_class.is_empty()
406 || !core_class
407 .chars()
408 .all(|c| c.is_ascii_lowercase() || c == '-')
409 || core_class.starts_with('-')
410 || core_class.ends_with('-')
411 {
412 return Err(());
413 }
414 if let Some(dp) = dot_part {
415 if dp.len() != 2 || !dp.chars().all(|c| c.is_ascii_uppercase()) {
416 return Err(());
417 }
418 }
419
420 // Id: 7 digits, optional `vN`.
421 let bytes = id.as_bytes();
422 let mut i = 0;
423 while i < bytes.len() && bytes[i].is_ascii_digit() {
424 i += 1;
425 }
426 if i != 7 {
427 return Err(());
428 }
429 if i == bytes.len() {
430 return Ok(());
431 }
432 if bytes[i] != b'v' {
433 return Err(());
434 }
435 i += 1;
436 let v_start = i;
437 while i < bytes.len() && bytes[i].is_ascii_digit() {
438 i += 1;
439 }
440 if i == v_start || i != bytes.len() {
441 return Err(());
442 }
443 Ok(())
444 }
445}
446
447// ---------------------------------------------------------------------------
448// RefParseError
449// ---------------------------------------------------------------------------
450
/// Reasons a `Doi::parse` / `ArxivId::parse` / `Ref::parse` call can fail.
///
/// Each variant maps to one rejection category in `docs/SECURITY.md` §1.1.
/// All variants funnel to [`ErrorCode::InvalidRef`] when surfacing to MCP /
/// CLI; the granular shape is preserved for tests and for future log
/// breadcrumbs. The `From<RefParseError> for ErrorCode` impl below makes
/// `?` propagation collapse to `INVALID_REF` automatically, satisfying
/// `docs/PUBLIC_API.md` §4.
///
/// Marked `#[non_exhaustive]` so adding new categories is a non-breaking
/// change. Pattern-match with a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
#[non_exhaustive]
pub enum RefParseError {
    /// Input was empty. Also returned when nothing remains after the
    /// `doi:` / `arxiv:` scheme has been stripped.
    #[error("empty input")]
    Empty,
    /// Input did not begin with the required `10.` literal (after any
    /// scheme strip).
    #[error("DOI must begin with '10.'")]
    MissingDoiPrefix,
    /// Input started with `10.` but had no `/` separator between
    /// registrant and suffix.
    #[error("DOI must contain '/' between registrant and suffix")]
    MissingDoiSuffixSeparator,
    /// Registrant (the text between `10.` and the first `/`) was not 4–9
    /// ASCII digits.
    #[error("DOI registrant must be 4–9 ASCII digits")]
    InvalidDoiRegistrant,
    /// DOI suffix (the text after the first `/`) was empty.
    #[error("DOI suffix is empty")]
    EmptyDoiSuffix,
    /// DOI suffix exceeded `DOI_SUFFIX_MAX_LEN` bytes. Checked before the
    /// suffix charset, so it takes precedence over
    /// [`Self::InvalidDoiSuffixChar`].
    #[error("DOI suffix is {len} bytes; maximum is {max}")]
    DoiSuffixTooLong {
        /// Observed suffix length, in bytes.
        len: usize,
        /// Hard upper bound (always [`DOI_SUFFIX_MAX_LEN`]).
        max: usize,
    },
    /// DOI suffix contained a character outside `[A-Za-z0-9._/():-]`.
    #[error("DOI suffix contains invalid character {ch:?}")]
    InvalidDoiSuffixChar {
        /// The first offending character.
        ch: char,
    },
    /// Input matched neither the new-style nor old-style arXiv shape.
    #[error("input does not match any known arXiv id shape")]
    InvalidArxivShape,
}
500
501impl From<RefParseError> for ErrorCode {
502 fn from(_: RefParseError) -> Self {
503 // All parse failures collapse to INVALID_REF at the public boundary,
504 // matching `docs/PUBLIC_API.md` §4 and `docs/SECURITY.md` §1.1.
505 ErrorCode::InvalidRef
506 }
507}
508
509// ---------------------------------------------------------------------------
510// Safekey
511// ---------------------------------------------------------------------------
512
/// A filesystem-safe key derived deterministically from a `Ref`.
///
/// See `docs/SAFEKEY.md` for the full algorithm and reference test vectors.
/// Construct via `Ref::safekey()` (Phase 1+); inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"doi_10.1234_example"`.
//
// NOTE(review): the derived `Deserialize` (with `serde(transparent)`) accepts
// any string, so a deserialized `Safekey` has not necessarily been produced
// by `Ref::safekey()`. Confirm callers do not use a deserialized key as a
// path component without re-deriving or checking it.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Safekey(pub(crate) String);
522
523impl Safekey {
524 /// Returns the safekey as a string slice.
525 pub fn as_str(&self) -> &str {
526 &self.0
527 }
528}
529
530impl Ref {
531 /// Returns the bare identifier string usable as a provenance `ref` field.
532 ///
533 /// Equivalent to `Doi::as_str` / `ArxivId::as_str` dispatched on the
534 /// variant — the URI scheme (`doi:` / `arxiv:`) is never present in the
535 /// inner identifiers (it is stripped at parse time), so the result is
536 /// always the bare DOI or arXiv id. Used by the CLI / MCP orchestrators
537 /// to populate the `ref` column of provenance log rows
538 /// (`docs/PROVENANCE_LOG.md` §3) without re-matching the variant.
539 pub fn as_input_str(&self) -> &str {
540 match self {
541 Ref::Doi(d) => d.as_str(),
542 Ref::Arxiv(a) => a.as_str(),
543 }
544 }
545
546 /// Derives a deterministic, filesystem-safe key from this reference.
547 ///
548 /// The algorithm is the NORMATIVE binding spec in `docs/SAFEKEY.md` §3.
549 /// Both Rust and Julia implementations MUST produce bit-identical output
550 /// for every entry in `tests/fixtures/safekey/vectors.json`.
551 ///
552 /// # Algorithm summary
553 ///
554 /// 1. Prefix with `doi_` or `arxiv_` (per variant).
555 /// 2. Replace any character outside `[A-Za-z0-9._-]` with `_`.
556 /// 3. Collapse consecutive `_` runs to a single `_`.
557 /// 4. Trim leading/trailing `_`.
558 /// 5. If the result exceeds 192 bytes, take the first 192 bytes plus
559 /// `_` plus the first 8 hex chars of `SHA-256(raw)` (where `raw` is
560 /// the step-1 output, before escaping).
561 ///
562 /// The bound on `as_str()` after step 4 is pure ASCII (steps 1-3 produce
563 /// only ASCII bytes), so the byte-slice in step 5 cannot split a
564 /// multibyte char.
565 pub fn safekey(&self) -> Safekey {
566 // Step 0: prefix per variant. Doi/ArxivId hold the bare identifier
567 // (no `doi:` / `arxiv:` URI scheme — that is stripped by Ref::parse,
568 // not relevant here).
569 let raw = match self {
570 Ref::Doi(d) => format!("doi_{}", d.as_str()),
571 Ref::Arxiv(a) => format!("arxiv_{}", a.as_str()),
572 };
573
574 // Step 1: replace unsafe chars with '_'. Non-ASCII chars (emitted by
575 // String::chars() as full Unicode code points) all hit the wildcard
576 // arm and become a single '_'.
577 let escaped: String = raw
578 .chars()
579 .map(|c| match c {
580 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' => c,
581 _ => '_',
582 })
583 .collect();
584
585 // Step 2: collapse consecutive '_' runs to a single '_'.
586 let mut collapsed = String::with_capacity(escaped.len());
587 let mut last_was_underscore = false;
588 for c in escaped.chars() {
589 if c == '_' {
590 if !last_was_underscore {
591 collapsed.push('_');
592 }
593 last_was_underscore = true;
594 } else {
595 collapsed.push(c);
596 last_was_underscore = false;
597 }
598 }
599
600 // Step 3: trim leading/trailing '_'.
601 let trimmed = collapsed.trim_matches('_');
602
603 // Step 4: length-bound. After steps 1-3 `trimmed` is pure ASCII, so
604 // `len()` (bytes) == char count and `&trimmed[..192]` is char-safe.
605 let key = if trimmed.len() > 192 {
606 let digest = sha2::Sha256::digest(raw.as_bytes());
607 let hash = hex::encode(&digest[..4]);
608 format!("{}_{}", &trimmed[..192], hash)
609 } else {
610 trimmed.to_string()
611 };
612
613 Safekey(key)
614 }
615}
616
617// ---------------------------------------------------------------------------
618// ErrorCode
619// ---------------------------------------------------------------------------
620
/// The closed set of error codes doiget surfaces.
///
/// See `docs/ERRORS.md` for the persona × code matrix.
///
/// Wire form is `SCREAMING_SNAKE_CASE` (e.g. `InvalidRef` serializes as
/// `"INVALID_REF"`); [`ErrorCode::as_wire`] returns the same token as a
/// `&'static str`.
///
/// Marked `#[non_exhaustive]` so adding new variants is a minor (not major)
/// version bump.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[non_exhaustive]
pub enum ErrorCode {
    /// DOI / arXiv id failed validation. Every [`RefParseError`] converts
    /// to this code.
    InvalidRef,
    /// Tier 1 sources reported no OA URL.
    NoOaAvailable,
    /// Internal rate cap or upstream 429.
    RateLimited,
    /// Transport / DNS / TLS failure.
    NetworkError,
    /// Filesystem write failed.
    StoreError,
    /// Provenance log write failed; the fetch was aborted.
    LogError,
    /// Source not granted by the runtime `CapabilityProfile`.
    CapabilityDenied,
    /// Per-request timeout exceeded.
    FetchTimeout,
    /// Store entry's `schema_version` is ahead of this build.
    SchemaTooNew,
    /// Could not acquire `flock` within 5 s.
    LockTimeout,
    /// Bug — please open an issue.
    InternalError,
    /// Feature is spec'd but not yet wired in this Phase. Distinct from
    /// [`Self::InternalError`] (which signals a bug) and
    /// [`Self::CapabilityDenied`] (which signals a runtime config gate).
    /// Returned by stubs that exist to pin the public surface ahead of
    /// orchestrator implementation, so an agent can react with "wait for
    /// next minor release" rather than "report a bug" or "tweak my
    /// capability profile". Wire form: `"NOT_IMPLEMENTED"`.
    NotImplemented,
}
662
663impl ErrorCode {
664 /// The `SCREAMING_SNAKE_CASE` wire token for this code, as a
665 /// `&'static str`. Identical to the serde representation but
666 /// allocation-free and usable where a borrowed string with a
667 /// `'static` lifetime is required — notably the provenance log
668 /// `error_code` column (`docs/PROVENANCE_LOG.md` §3), so a failure
669 /// row records the *actual* mapped code instead of a hand-written
670 /// literal that can drift from this enum (issue #118).
671 #[must_use]
672 pub fn as_wire(&self) -> &'static str {
673 match self {
674 ErrorCode::InvalidRef => "INVALID_REF",
675 ErrorCode::NoOaAvailable => "NO_OA_AVAILABLE",
676 ErrorCode::RateLimited => "RATE_LIMITED",
677 ErrorCode::NetworkError => "NETWORK_ERROR",
678 ErrorCode::StoreError => "STORE_ERROR",
679 ErrorCode::LogError => "LOG_ERROR",
680 ErrorCode::CapabilityDenied => "CAPABILITY_DENIED",
681 ErrorCode::FetchTimeout => "FETCH_TIMEOUT",
682 ErrorCode::SchemaTooNew => "SCHEMA_TOO_NEW",
683 ErrorCode::LockTimeout => "LOCK_TIMEOUT",
684 ErrorCode::InternalError => "INTERNAL_ERROR",
685 ErrorCode::NotImplemented => "NOT_IMPLEMENTED",
686 }
687 }
688}
689
690// ---------------------------------------------------------------------------
691// DenialReason / DenialContext (ADR-0023)
692// ---------------------------------------------------------------------------
693
/// Closed-set reasons a denial-class error envelope can carry on its
/// optional `denial_context.reason` field.
///
/// Wire form (JSON / MCP) is `snake_case` — e.g. `"redirect_not_in_allowlist"`.
/// The set is **closed** per ADR-0023 §2: adding a new variant is a minor
/// semver bump; renaming or repurposing one is a breaking change. Mirrors
/// the stability rule that already governs [`ErrorCode`].
///
/// See [`DenialContext`] for the surrounding struct, `docs/ERRORS.md` §3.1
/// for the wire surface, and `docs/PUBLIC_API.md` §8 for the
/// semver-locked surface contract.
//
// NOTE(review): unlike `ErrorCode`, this enum is not `#[non_exhaustive]`, so
// a downstream crate can match it exhaustively and adding a variant would
// break that code at compile time. Confirm this is consistent with the
// "adding a new variant is a minor semver bump" rule stated above.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DenialReason {
    /// Redirect target host did not match the source's allowlist
    /// (`HttpError::RedirectDenied`).
    RedirectNotInAllowlist,
    /// Redirect target had a non-HTTPS scheme (`HttpError::InsecureRedirect`).
    InsecureScheme,
    /// Source produced a URL whose host is on a future blocklist.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the future
    /// per-source URL host-blocklist guard once that component lands
    /// (post-Phase-1 supply-chain hardening; see
    /// `docs/REDIRECT_ALLOWLIST.md` §4 for the staging plan).
    HostInBlockList,
    /// Body exceeded [`PDF_MAX_BYTES`] (`HttpError::OversizedBody`).
    SizeCapExceeded,
    /// Store entry's `schema_version` is ahead of this binary.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// `FsStore` schema-rejection path once the read-side bump check
    /// lands (it currently only writes the current `SCHEMA_VERSION`).
    SchemaDrift,
    /// Source not in the runtime [`CapabilityProfile`]
    /// (`FetchError::NotEligible`).
    CapabilityNotGranted,
    /// Rate limiter rejected the call inside the current window.
    ///
    /// Reserved — no producer wired yet. Will be emitted by
    /// [`RateLimiter`](crate::rate_limiter::RateLimiter) once the
    /// limiter surfaces structured denials (Phase 2+; today the
    /// limiter only sleeps to enforce the window).
    RateLimitWindow,
    /// SSRF guard rejected a private / link-local / cloud-metadata address.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// future SSRF pre-flight check (post-Phase-1 supply-chain
    /// hardening; the workspace currently relies on rustls + the
    /// HTTPS-only redirect policy to keep the attack surface small).
    SsrfPrivateAddress,
    /// Response Content-Type / magic-byte mismatch (`HttpError::NotAPdf`).
    ContentTypeMismatch,
}
748
/// Structured machine-parseable companion to `error.message` for
/// recoverable denials.
///
/// The field is **optional and additive** on the public error envelope —
/// every previously-shipped `{code, message}` envelope remains valid, and
/// agents that ignore this struct continue to work. When present, it
/// carries the concrete parameters an LLM agent can use to plan a recovery
/// (e.g. "the redirect to `evil.example.com` was denied because it is not
/// in the crossref allowlist") without text-mining `error.message`.
///
/// ## Wire shape
///
/// `#[serde(deny_unknown_fields)]`: forward-compatible field additions on
/// the wire are forbidden by design — adding a field to this struct is a
/// **breaking** change. This is why the type is **not** `#[non_exhaustive]`
/// (per `docs/PUBLIC_API.md` §8): both production rules — Rust struct
/// construction outside the crate AND wire-level extension — must agree.
///
/// All fields except `reason` are optional. Producers populate the fields
/// relevant to the reason and leave the rest at `None`; consumers MUST
/// tolerate any subset of fields being present. Optional fields are
/// skipped on serialize but accepted as missing on deserialize via
/// `#[serde(default, skip_serializing_if = "Option::is_none")]`.
///
/// [`Self::expected`] is `Option<Vec<String>>` rather than `Vec<String>`
/// so the producer can distinguish "this reason has no allowlist channel"
/// (`None` → field absent on the wire) from "this is the explicit list of
/// acceptable values, possibly empty" (`Some(vec![])` → `"expected":[]` on
/// the wire). The previous `Vec<String>` shape collapsed both states
/// into "field omitted", which an LLM agent could not safely disambiguate.
///
/// Mapping table: see ADR-0023 §4, plus the
/// `From<&HttpError> for Option<DenialContext>` and
/// `From<&FetchError> for Option<DenialContext>` impls in
/// [`crate::http`] / [`crate::source`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DenialContext {
    /// Closed-enum reason code; the only required field.
    pub reason: DenialReason,
    /// Resolver source key (e.g. `"crossref"`) when one is in scope.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Concrete value the producer attempted (host, path, hex magic bytes,
    /// scheme prefix). Shape is reason-specific; consumers MUST treat it
    /// as opaque text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attempted: Option<String>,
    /// Allowlist entries / acceptable values. `Option<Vec<String>>` so the
    /// producer can distinguish "this reason has no allowlist channel"
    /// (`None`, field absent on the wire) from "this is the explicit list
    /// of acceptable values, possibly empty" (`Some(vec![])`, `"expected":[]`
    /// on the wire). The inner `Vec<String>` is used even when only one
    /// value is meaningful (e.g. `Some(vec!["%PDF-".into()])`) so the
    /// format does not have to flip when multiple values are acceptable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected: Option<Vec<String>>,
    /// Redirect-chain hop position, 0-indexed. `u8` because the chain is
    /// hard-capped at [`crate::http`]'s `MAX_REDIRECTS` (= 10) and any
    /// larger value indicates a bug.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hop_index: Option<u8>,
    /// Size or rate cap value (e.g. [`PDF_MAX_BYTES`]).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cap: Option<u64>,
    /// Observed value (e.g. response bytes when [`Self::cap`] is the byte
    /// cap, or row schema_version when [`Self::cap`] is the binary's).
    //
    // NOTE(review): `SCHEMA_VERSION` is a `&str` (`"1.0"`), while this field
    // and `cap` are `u64`. How a schema version is encoded as an integer is
    // not shown in this file — confirm against the producer once the
    // `SchemaDrift` path is wired.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub actual: Option<u64>,
}
819
820// ---------------------------------------------------------------------------
821// CapabilityProfile (placeholder; full impl in Phase 1)
822// ---------------------------------------------------------------------------
823
/// Marker for the always-on Open Access tier. See `docs/CAPABILITY.md`.
///
/// This is a field-less unit struct: it has exactly one value, so a
/// [`CapabilityProfile`] that holds it has no way to represent the tier as
/// switched off.
#[derive(Debug, Clone, Copy)]
pub struct AlwaysOn;
827
/// Which Tier 2 metadata sources are enabled this session. See `docs/CAPABILITY.md`.
///
/// The derived `Default` leaves every source disabled (`false`).
/// [`CapabilityProfile::from_env`] turns a flag on only when the named env
/// var is present **and** the `metadata` Cargo feature is compiled in.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct MetadataAccess {
    /// Phase 4+; enabled by `DOIGET_ENABLE_OPENALEX`.
    pub openalex: bool,
    /// Phase 4+; enabled by `DOIGET_ENABLE_S2`.
    pub semantic_scholar: bool,
    /// Phase 4+; enabled by `DOIGET_ENABLE_DOAJ`.
    pub doaj: bool,
}
839
/// Process-wide rate limits. Hard-coded; not configurable.
///
/// Construct only via [`RateLimits::HARD_CODED`]. The struct fields are
/// `pub(crate)` so downstream code cannot synthesize a `RateLimits` with
/// different values, which would weaken `docs/LEGAL.md` §6 safeguard 8.
/// Read the values through the accessor methods of the same names.
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct RateLimits {
    /// Maximum number of fetches allowed in flight at once.
    pub(crate) max_concurrent_fetches: u32,
    /// Maximum fetch attempts per second.
    pub(crate) max_fetches_per_second: f32,
    /// Per-source backoff between consecutive requests, in milliseconds.
    pub(crate) per_source_backoff_ms: u64,
}
852
853impl RateLimits {
854 /// The single, hard-coded set of rate limits. There is no other public
855 /// constructor — see the type-level docs.
856 pub const HARD_CODED: Self = Self {
857 max_concurrent_fetches: MAX_CONCURRENT_FETCHES,
858 max_fetches_per_second: MAX_FETCHES_PER_SECOND,
859 per_source_backoff_ms: 200,
860 };
861
862 /// Maximum number of concurrent fetches in flight.
863 pub const fn max_concurrent_fetches(&self) -> u32 {
864 self.max_concurrent_fetches
865 }
866
867 /// Maximum fetch attempts per second across all sources.
868 pub const fn max_fetches_per_second(&self) -> f32 {
869 self.max_fetches_per_second
870 }
871
872 /// Per-source backoff in milliseconds between consecutive requests.
873 pub const fn per_source_backoff_ms(&self) -> u64 {
874 self.per_source_backoff_ms
875 }
876}
877
/// A successful TDM grant.
///
/// Carries the validated API key (`docs/CAPABILITY.md` §1) so that the key
/// flows from the startup capability gate into the source, rather than each
/// TDM source re-reading the env var at fetch time (issue #153 — an env
/// mutation between startup and fetch is otherwise undetectable).
///
/// The `api_key` field exists only when at least one `tdm-*` Cargo feature
/// is compiled in (the `secrecy` dependency is `optional = true` and gated
/// on those features per ADR-0002, so default release binaries contain no
/// TDM code path at all). The struct is `#[non_exhaustive]`; the
/// `tdm-*`-gated `api_key` field is therefore additive, not breaking, for
/// builds that toggle the feature set.
///
/// Naming note: earlier revisions of `docs/CAPABILITY.md` §1 spelled the
/// type `Secret<String>`, which is the `secrecy` 0.9 name. The workspace
/// pins `secrecy` 0.10, whose equivalent owned-string secret type is
/// `secrecy::SecretString` (`= SecretBox<str>`), and CAPABILITY.md §1 has
/// been updated to match the 0.10 API. `Debug` redacts the value.
///
/// Implements `Default` so in-crate test fixtures using
/// `TdmGrant { agree_env_var: ..., ..Default::default() }` keep compiling;
/// the default `api_key` is an empty secret.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct TdmGrant {
    /// The publisher API key, validated present at startup by
    /// [`CapabilityProfile::from_env`]. Wrapped in
    /// `secrecy::SecretString` so `Debug` never prints it; use
    /// `secrecy::ExposeSecret::expose_secret` at the point of use.
    ///
    /// Only present when a `tdm-*` feature is compiled in (see the
    /// type-level docs and ADR-0002).
    #[cfg(any(
        feature = "tdm-elsevier",
        feature = "tdm-aps",
        feature = "tdm-springer"
    ))]
    pub api_key: secrecy::SecretString,
    /// Which env var the user used to acknowledge the publisher's ToS.
    pub agree_env_var: String,
    /// When the agreement env var was first observed at startup.
    pub agreed_at: chrono::DateTime<chrono::Utc>,
}
922
/// Placeholder grant: empty agreement var name, `agreed_at` set to the
/// moment of the call, and (when a `tdm-*` feature is compiled in) an empty
/// secret as the API key.
impl Default for TdmGrant {
    fn default() -> Self {
        Self {
            // Same feature gate as the field declaration on `TdmGrant`.
            #[cfg(any(
                feature = "tdm-elsevier",
                feature = "tdm-aps",
                feature = "tdm-springer"
            ))]
            api_key: secrecy::SecretString::from(String::new()),
            agree_env_var: String::new(),
            // Not a fixed epoch: every call observes the current UTC time.
            agreed_at: chrono::Utc::now(),
        }
    }
}
937
/// Runtime gate for which sources may be invoked. See `docs/CAPABILITY.md`.
///
/// Marked `#[non_exhaustive]` so adding new capability classes is non-breaking.
/// When destructuring, name only the documented fields and end the pattern
/// with a `..` rest pattern.
///
/// **Construction**: external callers use [`CapabilityProfile::from_env()`].
/// Struct-literal construction is blocked outside this crate by
/// `#[non_exhaustive]`; this is intentional — the type's safety guarantees
/// rely on the resolution rules in `from_env`. `Default` is **not yet**
/// implemented; Phase 1 will add it once the field set stabilizes.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CapabilityProfile {
    /// Tier 1 OA sources are always permitted.
    pub oa: AlwaysOn,
    /// Tier 2 metadata access (Phase 4+).
    pub metadata: MetadataAccess,
    /// Tier 3 grants are populated only when both env var and feature compile-in are set.
    pub tdm_elsevier: Option<TdmGrant>,
    /// Tier 3 grants are populated only when both env var and feature compile-in are set.
    pub tdm_aps: Option<TdmGrant>,
    /// Tier 3 grants are populated only when both env var and feature compile-in are set.
    pub tdm_springer: Option<TdmGrant>,
    /// Hard-coded rate limits for this process.
    pub rate_limits: RateLimits,
}
964
/// Errors that can arise during `CapabilityProfile::from_env`.
///
/// Both variants describe a misconfigured agree/key env-var pair for a
/// single TDM publisher.
#[derive(Debug, thiserror::Error)]
pub enum CapabilityError {
    /// User set the agree env var but provided no key. See `docs/CAPABILITY.md` §2.
    #[error("env {agree_var} is set but {key_var} is missing")]
    AgreedButNoKey {
        /// The agreement env var the user set.
        agree_var: String,
        /// The key env var that should accompany it.
        key_var: String,
    },
    /// Key env var is set but user has not agreed. See `docs/CAPABILITY.md` §2.
    // NOTE(review): the message interpolates `agree_var` twice ("key for
    // <AGREE_VAR> is present ...") because this variant carries no key-var
    // name; confirm whether the key var should be surfaced to the user.
    #[error("key for {agree_var} is present but {agree_var} is not set to '1'")]
    KeyButNotAgreed {
        /// The agreement env var the user must set to `1` before the key takes effect.
        agree_var: String,
    },
}
983
984impl CapabilityProfile {
985 /// Read the runtime profile from environment variables.
986 ///
987 /// Implements the resolution algorithm specified in
988 /// [`docs/CAPABILITY.md`](../../../docs/CAPABILITY.md) §2.
989 ///
990 /// # Tier 1 (Open Access)
991 ///
992 /// Always permitted; not gated on any env var or feature.
993 ///
994 /// # Tier 2 (metadata)
995 ///
996 /// Each metadata source becomes available when its env var is set
997 /// (presence-checked, value ignored) **and** the `metadata` Cargo feature
998 /// was compiled in. If the env var is set but the feature is not compiled
999 /// in, a `tracing::warn!` is emitted and the source is left disabled —
1000 /// this is not an error so that users can move binaries between machines
1001 /// (or switch feature sets between cargo invocations) without breaking
1002 /// startup. See `docs/CAPABILITY.md` §3 for the env var list.
1003 ///
1004 /// # Tier 3 (TDM)
1005 ///
1006 /// For each publisher in `{ELSEVIER, APS, SPRINGER}`, the
1007 /// `DOIGET_AGREE_TDM_<X>` agreement env var is paired with
1008 /// `DOIGET_KEY_<X>`. Resolution rules (per `docs/CAPABILITY.md` §2):
1009 ///
1010 /// - both unset → `tdm_<x> = None` (no error);
1011 /// - `agree == "1"` and key set → `Some(TdmGrant { .. })` (subject to the
1012 /// feature gate below);
1013 /// - `agree == "1"` and key unset → [`CapabilityError::AgreedButNoKey`];
1014 /// - key set but `agree` unset (or `agree != "1"`) →
1015 /// [`CapabilityError::KeyButNotAgreed`].
1016 ///
1017 /// When both env vars are set correctly **but** the corresponding
1018 /// `tdm-<x>` Cargo feature is not compiled in, this function emits a
1019 /// `tracing::warn!` and sets the grant to `None` rather than returning an
1020 /// error — same rationale as for the Tier 2 warn-and-skip behavior.
1021 ///
1022 /// # Precondition: tracing subscriber must be installed first
1023 ///
1024 /// Warn breadcrumbs are delivered via `tracing::warn!`. Callers MUST
1025 /// install a `tracing-subscriber` (or equivalent) **before** invoking
1026 /// this function, otherwise warnings are silently dropped. The
1027 /// `doiget-cli` binary does this in `main.rs`.
1028 ///
1029 /// # Errors
1030 ///
1031 /// Returns [`CapabilityError::AgreedButNoKey`] or
1032 /// [`CapabilityError::KeyButNotAgreed`] when the TDM env-var pair for any
1033 /// publisher is misconfigured. See the variant docs for the precise
1034 /// trigger conditions.
1035 ///
1036 /// # Note on `api_key` storage
1037 ///
1038 /// When a `tdm-*` feature is compiled in, [`TdmGrant`] carries the
1039 /// validated key as `secrecy::SecretString` (issue #153). The key is
1040 /// read exactly once here, at startup; TDM sources consume it from the
1041 /// grant and never re-read the env var at fetch time. This makes the
1042 /// grant a true startup attestation — an env mutation between startup
1043 /// and fetch can no longer silently change the credential in flight.
1044 /// See the [`TdmGrant`] doc-comment and `docs/CAPABILITY.md` §1/§2.
1045 pub fn from_env() -> Result<Self, CapabilityError> {
1046 // Issue #153: the validated API key is now threaded through
1047 // `TdmGrant` (as `secrecy::SecretString`, behind the `tdm-*`
1048 // features) by `resolve_tdm_grant` below — sources no longer
1049 // re-read the key env var at fetch time. See the `TdmGrant`
1050 // doc-comment and `docs/CAPABILITY.md` §1/§2.
1051
1052 // -- Tier 2 metadata -------------------------------------------------
1053 let metadata = MetadataAccess {
1054 openalex: resolve_metadata_flag(
1055 "DOIGET_ENABLE_OPENALEX",
1056 "metadata",
1057 cfg!(feature = "metadata"),
1058 ),
1059 semantic_scholar: resolve_metadata_flag(
1060 "DOIGET_ENABLE_S2",
1061 "metadata",
1062 cfg!(feature = "metadata"),
1063 ),
1064 doaj: resolve_metadata_flag(
1065 "DOIGET_ENABLE_DOAJ",
1066 "metadata",
1067 cfg!(feature = "metadata"),
1068 ),
1069 };
1070
1071 // -- Tier 3 TDM grants ----------------------------------------------
1072 let tdm_elsevier = resolve_tdm_grant(
1073 "DOIGET_AGREE_TDM_ELSEVIER",
1074 "DOIGET_KEY_ELSEVIER",
1075 "tdm-elsevier",
1076 cfg!(feature = "tdm-elsevier"),
1077 )?;
1078 let tdm_aps = resolve_tdm_grant(
1079 "DOIGET_AGREE_TDM_APS",
1080 "DOIGET_KEY_APS",
1081 "tdm-aps",
1082 cfg!(feature = "tdm-aps"),
1083 )?;
1084 let tdm_springer = resolve_tdm_grant(
1085 "DOIGET_AGREE_TDM_SPRINGER",
1086 "DOIGET_KEY_SPRINGER",
1087 "tdm-springer",
1088 cfg!(feature = "tdm-springer"),
1089 )?;
1090
1091 Ok(Self {
1092 oa: AlwaysOn,
1093 metadata,
1094 tdm_elsevier,
1095 tdm_aps,
1096 tdm_springer,
1097 rate_limits: RateLimits::HARD_CODED,
1098 })
1099 }
1100}
1101
1102/// Resolve a Tier 2 metadata flag from its env var and compile-in feature.
1103///
1104/// Returns `true` only when both the env var is present and the feature is
1105/// compiled in. When the env var is set without the feature, emits a
1106/// `tracing::warn!` and returns `false` — see [`CapabilityProfile::from_env`]
1107/// for the rationale (binaries may move between hosts / feature sets).
1108fn resolve_metadata_flag(env_var: &str, feature: &str, feature_enabled: bool) -> bool {
1109 let env_set = std::env::var_os(env_var).is_some();
1110 match (env_set, feature_enabled) {
1111 (true, true) => true,
1112 (true, false) => {
1113 tracing::warn!(
1114 env_var,
1115 feature,
1116 "{} is set but feature {} was not compiled in; the source will be unavailable",
1117 env_var,
1118 feature
1119 );
1120 false
1121 }
1122 (false, _) => false,
1123 }
1124}
1125
1126/// Resolve a Tier 3 TDM grant from the `agree`/`key` env-var pair and the
1127/// per-publisher Cargo feature.
1128///
1129/// Implements the rules in `docs/CAPABILITY.md` §2:
1130///
1131/// - both unset → `Ok(None)`.
1132/// - `agree == "1"` and `key` set → `Ok(Some(TdmGrant { .. }))` (when the
1133/// feature is enabled), or warn-and-`Ok(None)` (when the feature is not
1134/// compiled in).
1135/// - `agree == "1"` and `key` unset →
1136/// [`CapabilityError::AgreedButNoKey`].
1137/// - `key` set and `agree` unset OR `agree` set to anything other than `"1"`
1138/// → [`CapabilityError::KeyButNotAgreed`].
1139fn resolve_tdm_grant(
1140 agree_var: &str,
1141 key_var: &str,
1142 feature: &str,
1143 feature_enabled: bool,
1144) -> Result<Option<TdmGrant>, CapabilityError> {
1145 // `agree` is "agreed" iff the value is exactly the literal "1"; any other
1146 // value (including "true", "yes", empty) is treated as not-agreed per
1147 // `docs/CAPABILITY.md` §2.
1148 let agree_raw = std::env::var(agree_var).ok();
1149 let agreed = matches!(agree_raw.as_deref(), Some("1"));
1150 let agree_present = agree_raw.is_some();
1151 // Read the key value once, at startup, so the validated key flows
1152 // through `TdmGrant` and sources never re-read the env (issue #153).
1153 // An empty value is treated as "not set" — an empty API key cannot
1154 // authenticate, and silently constructing a grant around it would
1155 // mask the misconfiguration the AgreedButNoKey rule exists to surface.
1156 let key_value = std::env::var(key_var).ok().filter(|v| !v.is_empty());
1157
1158 match (agreed, agree_present, key_value) {
1159 (true, _, Some(key)) => {
1160 if feature_enabled {
1161 Ok(Some(build_tdm_grant(agree_var, key)))
1162 } else {
1163 // `key` is dropped here; under no-tdm builds it is the only
1164 // consumer of the owned `String`, which is intended.
1165 let _ = key;
1166 tracing::warn!(
1167 env_var = agree_var,
1168 feature,
1169 "{} is set but feature {} was not compiled in; the source will be unavailable",
1170 agree_var,
1171 feature
1172 );
1173 Ok(None)
1174 }
1175 }
1176 (true, _, None) => Err(CapabilityError::AgreedButNoKey {
1177 agree_var: agree_var.to_string(),
1178 key_var: key_var.to_string(),
1179 }),
1180 // agree set to non-"1", key also set: KeyButNotAgreed (the key would
1181 // otherwise authorize the source without an explicit agreement).
1182 (false, true, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1183 agree_var: agree_var.to_string(),
1184 }),
1185 // agree unset, key set: KeyButNotAgreed (same rule).
1186 (false, false, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1187 agree_var: agree_var.to_string(),
1188 }),
1189 // agree set to non-"1" and no key: treat as no-grant. The user
1190 // expressed something but did not opt in and provided no credential,
1191 // so silent skip is the safe default (no source enabled).
1192 (false, true, None) => Ok(None),
1193 // Neither env var set: no grant, no error.
1194 (false, false, None) => Ok(None),
1195 }
1196}
1197
1198/// Construct a [`TdmGrant`] from the validated agreement var and key value.
1199///
1200/// Split out so the `tdm-*`-gated `api_key` field is populated in exactly
1201/// one place. When no `tdm-*` feature is compiled in the `key` is consumed
1202/// (dropped) here — the grant is still produced so that startup attestation
1203/// behavior (the warn-and-skip path) does not change shape between feature
1204/// sets.
1205fn build_tdm_grant(agree_var: &str, key: String) -> TdmGrant {
1206 #[cfg(any(
1207 feature = "tdm-elsevier",
1208 feature = "tdm-aps",
1209 feature = "tdm-springer"
1210 ))]
1211 {
1212 TdmGrant {
1213 api_key: secrecy::SecretString::from(key),
1214 agree_env_var: agree_var.to_string(),
1215 agreed_at: chrono::Utc::now(),
1216 }
1217 }
1218 #[cfg(not(any(
1219 feature = "tdm-elsevier",
1220 feature = "tdm-aps",
1221 feature = "tdm-springer"
1222 )))]
1223 {
1224 let _ = key;
1225 TdmGrant {
1226 agree_env_var: agree_var.to_string(),
1227 agreed_at: chrono::Utc::now(),
1228 }
1229 }
1230}
1231
1232// ---------------------------------------------------------------------------
1233// Tests — one smoke test per legally-load-bearing constant. See
1234// `docs/LEGAL.md` §6 safeguard 8 and `docs/PHASES.md` §4. These also keep the
1235// `cargo test --workspace` job from being a false-green during Phase 0.
1236// ---------------------------------------------------------------------------
1237
1238// `expect`/`unwrap` are idiomatic in tests where panics double as assertions.
1239// The workspace lints deny them in production code; relax for the test module
1240// only.
1241#[cfg(test)]
1242#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1243mod tests {
1244 use super::*;
1245
#[test]
fn rate_limits_hard_coded_match_legal_safeguards() {
    // docs/LEGAL.md §6 safeguard 8 names these exact values; any drift in
    // the hard-coded limits must fail here.
    let limits = RateLimits::HARD_CODED;

    assert_eq!(limits.max_concurrent_fetches(), 5);

    // Float comparison: require the value to be within one EPSILON of 5.0.
    let per_second_drift = (limits.max_fetches_per_second() - 5.0).abs();
    assert!(per_second_drift < f32::EPSILON);

    assert_eq!(limits.per_source_backoff_ms(), 200);
}
1253
/// Pins the numeric values of the batch, queue, DOI-length and shutdown
/// constants so a change to any of them has to be made deliberately.
#[test]
fn batch_size_caps_match_security_doc() {
    // docs/SECURITY.md §1.4 + docs/MCP_TOOLS.md.
    assert_eq!(MCP_BATCH_MAX_SIZE, 100);
    assert_eq!(MCP_QUEUE_DEPTH_MAX, 100);
    assert_eq!(DOI_SUFFIX_MAX_LEN, 256);
    assert_eq!(MCP_STDIN_EOF_SHUTDOWN_SEC, 5);
    // Slice 2: spec-language alias for MCP_BATCH_MAX_SIZE must
    // numerically agree with the original constant.
    assert_eq!(MAX_BATCH_REFS, MCP_BATCH_MAX_SIZE);
}
1265
/// Pins [`SCHEMA_VERSION`] to the literal `"1.0"`.
#[test]
fn schema_version_is_pinned_to_1_0() {
    // docs/STORE.md §3 — Phase 0/1 writes 1.0 exactly.
    // A bump to 1.1 (minor, backward-compat additions) requires updating
    // both this test and the cross-tool compat fixtures simultaneously.
    assert_eq!(SCHEMA_VERSION, "1.0");
}
1273
1274 // -----------------------------------------------------------------
1275 // CapabilityProfile::from_env — Phase 1 resolution algorithm tests.
1276 //
1277 // These tests mutate process-global env state via std::env::set_var /
1278 // remove_var, so each test holds an `EnvGuard` RAII drop guard that
1279 // captures the pre-test value of every env var it touches and restores
1280 // it on drop (even on panic). They also use `#[serial_test::serial]` so
1281 // that no two tests in this module touch env state concurrently — the
1282 // workspace's test runner defaults to multi-threaded.
1283 //
1284 // Spec: docs/CAPABILITY.md §2 (resolution algorithm) and §3 (env var
1285 // reference table).
1286 // -----------------------------------------------------------------
1287
/// Drop guard for a single environment variable: remembers the value the
/// variable had when the guard was created and puts it back (or removes the
/// variable again) when the guard is dropped. Use one guard per touched var
/// per test.
struct EnvGuard {
    var: &'static str,
    prior: Option<std::ffi::OsString>,
}

impl EnvGuard {
    /// Snapshot `var`, then remove it from the environment. Call `set`
    /// afterwards to install a value on top.
    fn unset(var: &'static str) -> Self {
        let previous = std::env::var_os(var);
        // Env mutation is process-global; the tests using this guard are
        // serialized via `#[serial_test::serial]`, so no other test thread
        // touches the environment while this runs.
        std::env::remove_var(var);
        Self {
            var,
            prior: previous,
        }
    }

    /// Snapshot `var`, then set it to `value`.
    fn set(var: &'static str, value: &str) -> Self {
        let previous = std::env::var_os(var);
        std::env::set_var(var, value);
        Self {
            var,
            prior: previous,
        }
    }
}

impl Drop for EnvGuard {
    /// Restore the snapshot: re-set the old value if there was one,
    /// otherwise make sure the variable is absent again.
    fn drop(&mut self) {
        if let Some(previous) = self.prior.as_ref() {
            std::env::set_var(self.var, previous);
        } else {
            std::env::remove_var(self.var);
        }
    }
}
1322
1323 /// Convenience: unset every Tier 2 / Tier 3 env var the resolution
1324 /// algorithm reads, returning a vector of guards that restore them on
1325 /// drop. Callers can then `EnvGuard::set` individual vars on top.
1326 fn unset_all_capability_env_vars() -> Vec<EnvGuard> {
1327 [
1328 "DOIGET_ENABLE_OPENALEX",
1329 "DOIGET_ENABLE_S2",
1330 "DOIGET_ENABLE_DOAJ",
1331 "DOIGET_AGREE_TDM_ELSEVIER",
1332 "DOIGET_KEY_ELSEVIER",
1333 "DOIGET_AGREE_TDM_APS",
1334 "DOIGET_KEY_APS",
1335 "DOIGET_AGREE_TDM_SPRINGER",
1336 "DOIGET_KEY_SPRINGER",
1337 ]
1338 .iter()
1339 .map(|v| EnvGuard::unset(v))
1340 .collect()
1341 }
1342
#[test]
#[serial_test::serial]
fn from_env_no_env_vars_set_returns_tier_1_only() {
    // Rule: with every relevant env var unset, the profile carries no TDM
    // grant and no metadata flag, while the hard-coded rate limits still
    // apply. (Replaces the old Phase 0 stub test.)
    let _env = unset_all_capability_env_vars();

    let profile = CapabilityProfile::from_env().expect("clean env never errors");

    // Tier 3: nothing granted.
    assert!(profile.tdm_elsevier.is_none());
    assert!(profile.tdm_aps.is_none());
    assert!(profile.tdm_springer.is_none());

    // Tier 2: nothing enabled.
    let metadata = &profile.metadata;
    assert!(!metadata.openalex);
    assert!(!metadata.semantic_scholar);
    assert!(!metadata.doaj);

    assert_eq!(profile.rate_limits.max_concurrent_fetches(), 5);
}
1360
#[test]
#[serial_test::serial]
fn from_env_no_tdm_returns_tier_1_profile() {
    // Rule (CAPABILITY.md §2): when no TDM env var is set, resolution
    // succeeds and every `tdm_*` field is `None`.
    let _env = unset_all_capability_env_vars();

    let profile = CapabilityProfile::from_env().expect("no TDM env -> Ok");

    assert!(profile.tdm_elsevier.is_none());
    assert!(profile.tdm_aps.is_none());
    assert!(profile.tdm_springer.is_none());
}
1373
#[test]
#[serial_test::serial]
fn from_env_agreed_but_no_key_errs() {
    // Rule (CAPABILITY.md §2): agree=1 + key unset -> AgreedButNoKey.
    let _env = unset_all_capability_env_vars();
    let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");

    match CapabilityProfile::from_env() {
        Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
            // The error must name both halves of the Elsevier pair.
            assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
            assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
        }
        unexpected => panic!("expected AgreedButNoKey, got {:?}", unexpected),
    }
}
1390
#[test]
#[serial_test::serial]
fn from_env_agreed_but_empty_key_errs() {
    // Security-adjacent (PR #161 review): `resolve_tdm_grant` treats an
    // *empty* key string as "not set". So agree=1 together with
    // DOIGET_KEY_ELSEVIER="" has to surface as AgreedButNoKey instead of
    // quietly producing a grant around a secret that could never
    // authenticate.
    let _env = unset_all_capability_env_vars();
    let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");

    match CapabilityProfile::from_env() {
        Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
            assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
            assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
        }
        unexpected => panic!("expected AgreedButNoKey for empty key, got {:?}", unexpected),
    }
}
1412
#[test]
#[serial_test::serial]
fn from_env_empty_key_without_agree_is_no_grant() {
    // Security-adjacent (PR #161 review): with the agree var unset, an
    // empty key is indistinguishable from "no key at all". The outcome
    // must be Ok with no grant — an empty string carries no credential,
    // so it must NOT trigger the KeyButNotAgreed leaked-credential rule.
    let _env = unset_all_capability_env_vars();
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");

    let profile = CapabilityProfile::from_env()
        .expect("empty key + agree unset must be Ok(None), not an error");

    assert!(
        profile.tdm_elsevier.is_none(),
        "empty DOIGET_KEY_ELSEVIER with no agree var must yield no grant"
    );
    assert!(profile.tdm_aps.is_none());
    assert!(profile.tdm_springer.is_none());
}
1433
#[test]
#[serial_test::serial]
fn from_env_key_but_not_agreed_errs() {
    // Rule (CAPABILITY.md §2): key set + agree unset -> KeyButNotAgreed.
    // A leaked DOIGET_KEY_ELSEVIER must never enable a source on its own.
    let _env = unset_all_capability_env_vars();
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");

    match CapabilityProfile::from_env() {
        Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
            assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
        }
        unexpected => panic!("expected KeyButNotAgreed, got {:?}", unexpected),
    }
}
1450
#[test]
#[serial_test::serial]
fn from_env_agree_not_one_errs() {
    // Rule (CAPABILITY.md §2): only the exact value "1" counts as
    // agreement. Anything else (here: "true") is not-agreed, and paired
    // with a key that yields KeyButNotAgreed.
    let _env = unset_all_capability_env_vars();
    let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "true");
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");

    match CapabilityProfile::from_env() {
        Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
            assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
        }
        unexpected => panic!("expected KeyButNotAgreed, got {:?}", unexpected),
    }
}
1469
#[test]
#[serial_test::serial]
fn from_env_both_set_correctly_returns_grant() {
    // Rule (CAPABILITY.md §2): agree=1 + key set -> Some(TdmGrant) when the
    // publisher's feature is compiled in, otherwise None (warn-and-skip).
    // Elsevier is gated by `tdm-elsevier`; both outcomes are checked here
    // by branching on `cfg!`.
    let _env = unset_all_capability_env_vars();
    let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");

    let profile = CapabilityProfile::from_env().expect("agree=1 + key -> Ok");

    if !cfg!(feature = "tdm-elsevier") {
        assert!(
            profile.tdm_elsevier.is_none(),
            "feature tdm-elsevier NOT compiled in -> None (warn-and-skip)"
        );
        return;
    }

    let grant = profile
        .tdm_elsevier
        .as_ref()
        .expect("feature tdm-elsevier compiled in -> Some(TdmGrant)");
    assert_eq!(grant.agree_env_var, "DOIGET_AGREE_TDM_ELSEVIER");

    // Issue #153 / PR #161 review: show that the key itself reached
    // TdmGrant::api_key at startup, not merely that the agree var was
    // recorded. `api_key` only exists when some `tdm-*` feature is compiled
    // in, so this check carries the same cfg gate as the field.
    #[cfg(any(
        feature = "tdm-elsevier",
        feature = "tdm-aps",
        feature = "tdm-springer"
    ))]
    {
        use secrecy::ExposeSecret as _;
        assert_eq!(
            grant.api_key.expose_secret(),
            "sk-test",
            "the DOIGET_KEY_ELSEVIER value must be threaded into \
             TdmGrant::api_key (issue #153)"
        );
    }
}
1515
#[test]
#[serial_test::serial]
fn from_env_metadata_env_warns_without_feature() {
    // Rule (CAPABILITY.md §2): a metadata env var without the `metadata`
    // feature leaves the source disabled (warn-and-skip, not an error).
    // The tracing warning itself is not captured; the test only checks
    // that the flag mirrors whether the feature was compiled in.
    let _env = unset_all_capability_env_vars();
    let _enable = EnvGuard::set("DOIGET_ENABLE_OPENALEX", "1");

    let profile = CapabilityProfile::from_env().expect("metadata env never errors");

    let feature_compiled_in = cfg!(feature = "metadata");
    assert_eq!(profile.metadata.openalex, feature_compiled_in);
}
1534
// -----------------------------------------------------------------
// Safekey reference vectors (docs/SAFEKEY.md §3, NORMATIVE).
//
// The vectors.json file is the binding cross-tool contract with
// BiblioFetch.jl: every entry MUST round-trip identically through
// both implementations. The fixture now carries the full 100-entry
// set, and `safekey_matches_reference_vectors` pins that count
// exactly. (Earlier revisions shipped a 13-entry Phase 0 subset while
// the full set was gated on the BiblioFetch.jl pre-flight, ADR-0007 —
// NOTE(review): confirm the current ADR-0007 status.)
//
// The vector test does not go through `Ref::parse`: it branches on
// the input prefix (`doi:` / `arxiv:`) and constructs the variant
// directly via the in-crate `pub(crate)` tuple constructor, so it
// checks the safekey derivation independently of parser validation.
// -----------------------------------------------------------------
1549
/// One entry of the safekey reference-vector fixture.
#[derive(Deserialize)]
struct SafekeyVector {
    /// User-facing reference with its scheme prefix (`doi:` or `arxiv:`),
    /// as consumed by `ref_from_vector_input`.
    input: String,
    /// The safekey string the derivation must produce for `input`.
    expected: String,
}
1555
/// Top-level shape of the `vectors.json` fixture: an object with a single
/// `vectors` array.
#[derive(Deserialize)]
struct SafekeyVectorFile {
    /// The reference vectors, in file order.
    vectors: Vec<SafekeyVector>,
}
1560
1561 /// In-crate test helper: build a `Ref` from the user-facing form used
1562 /// in the vectors file, by stripping the `doi:` / `arxiv:` URI scheme
1563 /// and wrapping the remainder. This bypasses validation; it is fine
1564 /// here because the vectors are hand-curated and the test asserts the
1565 /// derivation algorithm, not parser semantics.
1566 fn ref_from_vector_input(input: &str) -> Ref {
1567 if let Some(rest) = input.strip_prefix("doi:") {
1568 Ref::Doi(Doi(rest.to_string()))
1569 } else if let Some(rest) = input.strip_prefix("arxiv:") {
1570 Ref::Arxiv(ArxivId(rest.to_string()))
1571 } else {
1572 panic!(
1573 "vectors.json entry has unknown ref scheme (expected doi: or arxiv: prefix): {}",
1574 input
1575 );
1576 }
1577 }
1578
#[test]
fn safekey_matches_reference_vectors() {
    // `include_str!` paths are relative to the file containing the macro
    // call (crates/doiget-core/src/lib.rs): three levels up is the
    // workspace root, then down into tests/fixtures.
    let fixture: SafekeyVectorFile =
        serde_json::from_str(include_str!("../../../tests/fixtures/safekey/vectors.json"))
            .expect("vectors.json is valid JSON matching schema");
    let vectors = &fixture.vectors;

    // Phase 0 final ships the full NORMATIVE 100-entry set
    // (docs/SAFEKEY.md §5). The fixture is the binding cross-tool contract
    // with BiblioFetch.jl; requiring exactly 100 entries means the set
    // cannot silently grow or shrink without a coordinated ADR bump (per
    // the docs/SAFEKEY.md status block).
    assert_eq!(
        vectors.len(),
        100,
        "vectors.json MUST be exactly 100 entries (NORMATIVE per docs/SAFEKEY.md §5); got {}",
        vectors.len()
    );

    // Gather every mismatch first so one run reports all broken vectors.
    let failures: Vec<String> = vectors
        .iter()
        .filter_map(|vector| {
            let got = ref_from_vector_input(&vector.input)
                .safekey()
                .as_str()
                .to_string();
            if got == vector.expected {
                None
            } else {
                Some(format!(
                    "input={:?}\n expected={:?}\n got ={:?}",
                    vector.input, vector.expected, got
                ))
            }
        })
        .collect();

    assert!(
        failures.is_empty(),
        "{}/{} safekey reference vectors failed:\n{}",
        failures.len(),
        vectors.len(),
        failures.join("\n")
    );
}
1620
#[test]
fn safekey_truncates_long_inputs_with_sha256_suffix() {
    // A synthetic DOI with a 220-char ASCII-safe suffix: together with the
    // `doi_10.1234/` prefix the `trimmed` form after step 3 is well over
    // 192 chars. The key must then be exactly 201 chars: 192 (trimmed
    // prefix) + 1 (`_` separator) + 8 (hex of the first 4 bytes of
    // SHA-256(raw)). Per docs/SAFEKEY.md §3 step 5.
    let doi_text = format!("10.1234/{}", "a".repeat(220));

    let key = Ref::Doi(Doi(doi_text.clone())).safekey();
    let s = key.as_str();

    // Shape: <192 ASCII chars from {A-Za-z0-9._-}> + "_" + <8 hex chars>
    assert_eq!(
        s.len(),
        201,
        "expected 201-char truncated key, got {}: {}",
        s.len(),
        s
    );
    assert_eq!(&s[192..193], "_", "expected '_' separator at byte 192");

    let hash_part = &s[193..];
    assert_eq!(hash_part.len(), 8, "hash suffix must be 8 hex chars");
    let is_lower_hex = |c: char| c.is_ascii_hexdigit() && !c.is_ascii_uppercase();
    assert!(
        hash_part.chars().all(is_lower_hex),
        "hash suffix must be lowercase hex: {}",
        hash_part
    );

    // Determinism: deriving from an equal input again gives the same key.
    let key_again = Ref::Doi(Doi(doi_text.clone())).safekey();
    assert_eq!(s, key_again.as_str(), "safekey must be deterministic");

    // Hash content: must equal hex(sha256(raw)[..4]) where raw is the
    // pre-escape prefixed form per docs/SAFEKEY.md §3 step 5.
    use sha2::Digest;
    let raw = format!("doi_{}", doi_text);
    let digest = sha2::Sha256::digest(raw.as_bytes());
    let expected_hash: String = digest
        .iter()
        .take(4)
        .map(|byte| format!("{:02x}", byte))
        .collect();
    assert_eq!(
        hash_part, expected_hash,
        "hash must match SHA-256 of raw form"
    );
}
1672
1673 // -----------------------------------------------------------------
1674 // Doi::parse / ArxivId::parse / Ref::parse — Phase 1 W3-A.
1675 // Spec: docs/SECURITY.md §1.1 (input validation). The rejection
1676 // category set is the binding contract; each test case below names
1677 // which rule it exercises in a comment.
1678 // -----------------------------------------------------------------
1679
1680 // ---- Doi::parse happy paths (≥6) --------------------------------
1681
#[test]
fn doi_parse_accepts_bare_canonical_form() {
    // The bare `10.<registrant>/<suffix>` form is the canonical one and
    // must come back from `as_str` unchanged.
    let input = "10.1234/example";
    let parsed = Doi::parse(input).expect("canonical bare DOI");
    assert_eq!(parsed.as_str(), input);
}
1688
#[test]
fn doi_parse_accepts_doi_uri_scheme() {
    // A leading `doi:` scheme is accepted and removed during parsing, so
    // the stored identifier is the bare DOI (docs/SAFEKEY.md §3 step 0).
    let with_scheme = "doi:10.1234/example";
    let parsed = Doi::parse(with_scheme).expect("doi: scheme accepted");
    assert_eq!(parsed.as_str(), "10.1234/example");
}
1696
#[test]
fn doi_parse_accepts_complex_real_world_suffix() {
    // A real PhysRevLett DOI: the suffix mixes upper/lower-case letters,
    // digits and several `.` separators, and must be preserved verbatim.
    let input = "10.1103/PhysRevLett.130.200601";
    let parsed = Doi::parse(input).expect("real-world PhysRev DOI");
    assert_eq!(parsed.as_str(), input);
}
1704
#[test]
fn doi_parse_accepts_parens_in_suffix() {
    // `(` and `)` are part of the accepted suffix charset; `-` appears in
    // this suffix as well.
    let input = "10.1016/S0370-1573(98)00122-3";
    let parsed = Doi::parse(input).expect("parens in suffix");
    assert_eq!(parsed.as_str(), input);
}
1711
#[test]
fn doi_parse_accepts_nested_slashes_in_suffix() {
    // Only the first `/` separates registrant from suffix; any further
    // `/` characters belong to the suffix and are kept.
    let input = "10.1234/foo/bar/baz";
    let parsed = Doi::parse(input).expect("nested slashes");
    assert_eq!(parsed.as_str(), input);
}
1719
#[test]
fn doi_parse_accepts_colon_in_legacy_kluwer_suffix() {
    // #194: legacy Kluwer/Springer DOIs of the shape
    // `10.1023/A:NNNNNNNNNN` have a `:` inside the suffix. The sample is
    // the DOI of "Entanglement, Quantum Phase Transitions, and DMRG"
    // (Kluwer, 2002).
    let input = "10.1023/A:1019601218492";
    let parsed = Doi::parse(input).expect("legacy Kluwer colon DOI");
    assert_eq!(parsed.as_str(), input);
}
1728
#[test]
fn doi_parse_accepts_colon_in_edp_jphys_suffix() {
    // #194: the EDP Sciences / Journal de Physique legacy corpus uses
    // `10.1051/jphys:NNNNNNNNNNNNNNNNN`. Both samples are real DOIs from
    // the dogfood Ising-RG run; the second also carries a `doi:` scheme,
    // so scheme stripping and the in-suffix `:` are exercised together.
    let cases = [
        (
            "10.1051/jphys:0198900500120136500",
            "10.1051/jphys:0198900500120136500",
            "EDP jphys colon DOI",
        ),
        (
            "doi:10.1051/jphys:0198500460100164500",
            "10.1051/jphys:0198500460100164500",
            "scheme + colon",
        ),
    ];
    for (input, canonical, why) in cases {
        let parsed = Doi::parse(input).expect(why);
        assert_eq!(parsed.as_str(), canonical);
    }
}
1739
#[test]
fn doi_parse_rejects_semicolon_in_suffix() {
    // #194 / ADR-0026: `;` sits right next to `:` in ASCII and is
    // explicitly left OUT of the suffix charset extension (ADR-0026
    // §"Out of scope"). This is a regression guard against an over-broad
    // `matches!` arm (for instance a mistyped `':'..=';'` range) letting
    // `;` in together with `:`.
    let outcome = Doi::parse("10.1234/foo;bar");
    let rejected_semicolon = matches!(
        outcome,
        Err(RefParseError::InvalidDoiSuffixChar { ch: ';' })
    );
    assert!(
        rejected_semicolon,
        "expected InvalidDoiSuffixChar with ch=';', got {:?}",
        outcome
    );
}
1754
#[test]
fn doi_parse_accepts_suffix_at_max_len_boundary() {
    // Upper boundary: a suffix of exactly DOI_SUFFIX_MAX_LEN bytes is
    // still valid. The one-byte-over case has its own rejection test.
    let prefix = "10.1234/";
    let input = format!("{}{}", prefix, "a".repeat(DOI_SUFFIX_MAX_LEN));
    let parsed = Doi::parse(&input).expect("suffix at max len");
    let stored_len = parsed.as_str().len();
    assert_eq!(stored_len, prefix.len() + DOI_SUFFIX_MAX_LEN);
}
1764
#[test]
fn doi_parse_uri_scheme_is_case_insensitive() {
    // Scheme casing is handled leniently: an upper-case `DOI:` is
    // stripped just like `doi:`, giving the same stored form.
    let shouted = "DOI:10.1234/example";
    let parsed = Doi::parse(shouted).expect("uppercase scheme");
    assert_eq!(parsed.as_str(), "10.1234/example");
}
1772
1773 // ---- Doi::parse rejection paths (≥6) ----------------------------
1774
#[test]
fn doi_parse_rejects_missing_10_prefix() {
    // A DOI has to begin with the literal `10.`; `11.` does not qualify.
    let outcome = Doi::parse("11.1234/example");
    assert_eq!(outcome, Err(RefParseError::MissingDoiPrefix));
}
1783
#[test]
fn doi_parse_rejects_empty_input() {
    // The empty string is never a valid DOI.
    let outcome = Doi::parse("");
    assert_eq!(outcome, Err(RefParseError::Empty));
}
1789
#[test]
fn doi_parse_rejects_missing_suffix_separator() {
    // Without a `/` there is no registrant/suffix split to speak of.
    let outcome = Doi::parse("10.1234");
    assert_eq!(outcome, Err(RefParseError::MissingDoiSuffixSeparator));
}
1798
#[test]
fn doi_parse_rejects_empty_suffix() {
    // A separator followed by nothing leaves an empty suffix, which is
    // rejected.
    let outcome = Doi::parse("10.1234/");
    assert_eq!(outcome, Err(RefParseError::EmptyDoiSuffix));
}
1804
#[test]
fn doi_parse_rejects_invalid_registrant_too_short() {
    // The registrant must have 4–9 digits; two digits is too few.
    let outcome = Doi::parse("10.12/example");
    assert_eq!(outcome, Err(RefParseError::InvalidDoiRegistrant));
}
1813
#[test]
fn doi_parse_rejects_non_digit_registrant() {
    // Every registrant character must be an ASCII digit; letters make it
    // invalid even when the length is acceptable.
    let outcome = Doi::parse("10.12ab/example");
    assert_eq!(outcome, Err(RefParseError::InvalidDoiRegistrant));
}
1822
#[test]
fn doi_parse_rejects_control_char_in_suffix() {
    // docs/SECURITY.md §1.1 (log-injection mitigation): control
    // characters are outside the suffix charset and must be refused at
    // parse time, before they could end up in the provenance log. The
    // error has to name the offending character.
    let outcome = Doi::parse("10.1234/foo\nbar");
    let rejected_newline = matches!(
        outcome,
        Err(RefParseError::InvalidDoiSuffixChar { ch: '\n' })
    );
    assert!(rejected_newline, "got {:?}", outcome);
}
1838
#[test]
fn doi_parse_rejects_suffix_over_max_len() {
    // One byte past DOI_SUFFIX_MAX_LEN must be refused, and the error
    // must report both the observed length and the limit.
    let too_long = DOI_SUFFIX_MAX_LEN + 1;
    let input = format!("10.1234/{}", "a".repeat(too_long));
    let outcome = Doi::parse(&input);
    if let Err(RefParseError::DoiSuffixTooLong { len, max }) = &outcome {
        assert_eq!(*len, DOI_SUFFIX_MAX_LEN + 1);
        assert_eq!(*max, DOI_SUFFIX_MAX_LEN);
    } else {
        panic!("expected DoiSuffixTooLong, got {:?}", outcome);
    }
}
1853
#[test]
fn doi_parse_rejects_non_ascii_in_suffix() {
    // The suffix charset is ASCII-only, so non-ASCII input surfaces as
    // InvalidDoiSuffixChar. (The safekey step would collapse such chars
    // to '_', but that is a downstream concern.) The reported character
    // is deliberately not pinned here.
    let outcome = Doi::parse("10.1234/物理学");
    let rejected = matches!(outcome, Err(RefParseError::InvalidDoiSuffixChar { .. }));
    assert!(rejected, "got {:?}", outcome);
}
1866
1867 // ---- ArxivId::parse happy paths (≥6) ----------------------------
1868
#[test]
fn arxiv_parse_accepts_new_style_4_digit_seq() {
    // New-style identifier with a 4-digit sequence number (YYMM.NNNN).
    let input = "0704.0001";
    let parsed = ArxivId::parse(input).expect("new-style 4-digit seq");
    assert_eq!(parsed.as_str(), input);
}
1875
#[test]
fn arxiv_parse_accepts_new_style_5_digit_seq() {
    // New-style identifier with the 5-digit sequence number used
    // post-2015 (YYMM.NNNNN).
    let input = "2401.12345";
    let parsed = ArxivId::parse(input).expect("new-style 5-digit seq");
    assert_eq!(parsed.as_str(), input);
}
1882
#[test]
fn arxiv_parse_accepts_new_style_with_version() {
    // A trailing `vN` version marker is optional and, when present, is
    // kept in the stored identifier.
    let input = "2401.12345v2";
    let parsed = ArxivId::parse(input).expect("with version");
    assert_eq!(parsed.as_str(), input);
}
1889
#[test]
fn arxiv_parse_accepts_old_style() {
    // Old-style identifier: `<subject-class>/YYMMNNN`.
    let input = "cond-mat/9501001";
    let parsed = ArxivId::parse(input).expect("old-style cond-mat");
    assert_eq!(parsed.as_str(), input);
}
1896
#[test]
fn arxiv_parse_accepts_old_style_with_subclass_and_version() {
    // Old-style subject classes may carry a two-uppercase-letter `.XX`
    // subclass, and the id may end in an optional `vN` version.
    let input = "astro-ph.CO/0703123v2";
    let parsed = ArxivId::parse(input).expect("old-style with subclass + version");
    assert_eq!(parsed.as_str(), input);
}
1904
#[test]
fn arxiv_parse_accepts_arxiv_uri_scheme() {
    // A leading `arxiv:` scheme is accepted and stripped; only the bare
    // identifier is stored.
    let with_scheme = "arxiv:2401.12345";
    let parsed = ArxivId::parse(with_scheme).expect("arxiv: scheme");
    assert_eq!(parsed.as_str(), "2401.12345");
}
1911
#[test]
fn arxiv_parse_accepts_arxiv_uri_scheme_mixed_case() {
    // Scheme matching ignores case, so the `arXiv:` spelling named in
    // docs/MCP_TOOLS.md is stripped as well; the version suffix stays.
    let with_scheme = "arXiv:2401.12345v2";
    let parsed = ArxivId::parse(with_scheme).expect("arXiv: scheme");
    assert_eq!(parsed.as_str(), "2401.12345v2");
}
1919
1920 // ---- ArxivId::parse rejection paths (≥6) ------------------------
1921
#[test]
fn arxiv_parse_rejects_empty_input() {
    // Empty input is refused with the dedicated `Empty` error.
    let outcome = ArxivId::parse("");
    assert_eq!(outcome, Err(RefParseError::Empty));
}
1927
#[test]
fn arxiv_parse_rejects_no_dot_or_slash() {
    // An arXiv id needs either a `.` (new-style) or a `/` (old-style);
    // a plain word has neither.
    let outcome = ArxivId::parse("notanarxivid");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1936
#[test]
fn arxiv_parse_rejects_new_style_wrong_head_length() {
    // The part before the `.` must be exactly 4 digits; 3 is rejected.
    let outcome = ArxivId::parse("240.12345");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1945
#[test]
fn arxiv_parse_rejects_new_style_seq_too_short() {
    // The sequence number must be 4–5 digits; 3 digits is too short.
    let outcome = ArxivId::parse("2401.123");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1954
#[test]
fn arxiv_parse_rejects_old_style_wrong_id_length() {
    // The numeric part of an old-style id is exactly 7 digits; this
    // input has only 5.
    let outcome = ArxivId::parse("cond-mat/95001");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1963
#[test]
fn arxiv_parse_rejects_invalid_version_suffix() {
    // A version suffix is `v` plus at least one digit; a bare trailing
    // `v` is malformed.
    let outcome = ArxivId::parse("2401.12345v");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1972
#[test]
fn arxiv_parse_rejects_control_char() {
    // docs/SECURITY.md §1.1 (log-injection): control characters are not
    // allowed, so a trailing newline is rejected rather than accepted.
    let outcome = ArxivId::parse("2401.12345\n");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1981
#[test]
fn arxiv_parse_rejects_non_ascii() {
    // arXiv ids are ASCII-only; non-ASCII characters in the sequence
    // part make the shape invalid.
    let outcome = ArxivId::parse("2401.物理");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1990
1991 // ---- Ref::parse happy paths (≥6) --------------------------------
1992
#[test]
fn ref_parse_dispatches_doi_scheme_to_doi() {
    // Detection rule 1: an explicit `doi:` scheme selects the DOI parser,
    // and the stored value is the bare DOI.
    let parsed = Ref::parse("doi:10.1234/example").expect("doi: dispatched to Doi");
    if let Ref::Doi(doi) = &parsed {
        assert_eq!(doi.as_str(), "10.1234/example");
    } else {
        panic!("expected Ref::Doi, got {:?}", parsed);
    }
}
2001
#[test]
fn ref_parse_dispatches_arxiv_scheme_to_arxiv() {
    // Detection rule 2: an explicit `arxiv:` scheme selects the arXiv
    // parser, and the stored value is the bare id.
    let parsed = Ref::parse("arxiv:2401.12345").expect("arxiv: dispatched to Arxiv");
    if let Ref::Arxiv(arxiv) = &parsed {
        assert_eq!(arxiv.as_str(), "2401.12345");
    } else {
        panic!("expected Ref::Arxiv, got {:?}", parsed);
    }
}
2010
#[test]
fn ref_parse_dispatches_arxiv_mixed_case_scheme() {
    // Detection rule 2, case-insensitive variant: the `arXiv:` spelling
    // also routes to the arXiv parser (old-style id here).
    let parsed = Ref::parse("arXiv:cond-mat/9501001").expect("arXiv: dispatched");
    if let Ref::Arxiv(arxiv) = &parsed {
        assert_eq!(arxiv.as_str(), "cond-mat/9501001");
    } else {
        panic!("expected Ref::Arxiv, got {:?}", parsed);
    }
}
2019
#[test]
fn ref_parse_bare_doi_resolves_to_doi() {
    // Detection rule 3: with no scheme, an input beginning with `10.` is
    // treated as a DOI.
    let parsed = Ref::parse("10.1234/foo").expect("bare DOI");
    if let Ref::Doi(doi) = &parsed {
        assert_eq!(doi.as_str(), "10.1234/foo");
    } else {
        panic!("expected Ref::Doi, got {:?}", parsed);
    }
}
2028
#[test]
fn ref_parse_bare_arxiv_new_resolves_to_arxiv() {
    // Detection rule 4: scheme-less input that does not begin with `10.`
    // falls through to the arXiv parser. This covers the ambiguous-input
    // branch from the PR brief: `2401.12345` must become an ArxivId.
    let parsed = Ref::parse("2401.12345").expect("bare new-style arXiv");
    if let Ref::Arxiv(arxiv) = &parsed {
        assert_eq!(arxiv.as_str(), "2401.12345");
    } else {
        panic!("expected Ref::Arxiv, got {:?}", parsed);
    }
}
2039
#[test]
fn ref_parse_bare_arxiv_old_resolves_to_arxiv() {
    // Detection rule 4 again, this time with a scheme-less old-style id.
    let parsed = Ref::parse("cond-mat/9501001").expect("bare old-style arXiv");
    if let Ref::Arxiv(arxiv) = &parsed {
        assert_eq!(arxiv.as_str(), "cond-mat/9501001");
    } else {
        panic!("expected Ref::Arxiv, got {:?}", parsed);
    }
}
2048
2049 // ---- Ref::parse rejection paths (≥6) ----------------------------
2050
#[test]
fn ref_parse_rejects_empty() {
    // Empty input yields the `Empty` error from `Ref::parse`.
    let outcome = Ref::parse("");
    assert_eq!(outcome, Err(RefParseError::Empty));
}
2056
#[test]
fn ref_parse_doi_scheme_with_invalid_doi_propagates_doi_error() {
    // With an explicit `doi:` scheme the DOI parser's own error is
    // surfaced as-is, rather than a generic "shape mismatch".
    let outcome = Ref::parse("doi:10.1234");
    assert_eq!(outcome, Err(RefParseError::MissingDoiSuffixSeparator));
}
2066
#[test]
fn ref_parse_arxiv_scheme_with_invalid_arxiv_propagates_arxiv_error() {
    // With an explicit `arxiv:` scheme a malformed id reports the arXiv
    // shape error.
    let outcome = Ref::parse("arxiv:notanid");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
2074
#[test]
fn ref_parse_bare_with_10_prefix_uses_doi_errors() {
    // The bare `10.…` heuristic hands the input to the DOI parser, so a
    // too-short registrant shows up as the DOI registrant error.
    let outcome = Ref::parse("10.12/x");
    assert_eq!(outcome, Err(RefParseError::InvalidDoiRegistrant));
}
2084
#[test]
fn ref_parse_bare_without_10_prefix_uses_arxiv_errors() {
    // The ambiguous bare fallback goes to the arXiv parser and its error
    // is what the caller sees. `1.2.3` is neither a DOI nor an arXiv
    // shape.
    let outcome = Ref::parse("1.2.3");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
2091
#[test]
fn ref_parse_rejects_doi_scheme_with_oversized_suffix() {
    // Length bound through the top-level entry point: a DOI suffix longer
    // than DOI_SUFFIX_MAX_LEN must surface as DoiSuffixTooLong, not as a
    // generic InvalidArxivShape.
    let oversized = "a".repeat(DOI_SUFFIX_MAX_LEN + 5);
    let input = format!("doi:10.1234/{}", oversized);
    let outcome = Ref::parse(&input);
    if !matches!(outcome, Err(RefParseError::DoiSuffixTooLong { .. })) {
        panic!("expected DoiSuffixTooLong, got {:?}", outcome);
    }
}
2103
#[test]
fn ref_parse_round_trip_via_serde_preserves_inner_string() {
    // Wire-format check: Doi/ArxivId are #[serde(transparent)], so a
    // parsed reference must serialize with the bare identifier only.
    // This test inspects the serialized JSON: the identifier has to be
    // present and the `doi:` scheme must not have leaked into the stored
    // form.
    let parsed = Ref::parse("doi:10.1234/example").expect("parse ok");
    let json = serde_json::to_string(&parsed).expect("serialize");
    let has_bare_id = json.contains("10.1234/example");
    let has_scheme = json.contains("doi:");
    assert!(
        has_bare_id && !has_scheme,
        "scheme leaked into wire form: {}",
        json
    );
}
2119
#[test]
fn ref_parse_error_maps_to_invalid_ref_error_code() {
    // Public-API contract (docs/PUBLIC_API.md §4): at the public boundary
    // every parse failure collapses to ErrorCode::InvalidRef. Two
    // representative variants are converted and checked.
    let samples = [RefParseError::Empty, RefParseError::MissingDoiPrefix];
    for parse_error in samples {
        let code: ErrorCode = parse_error.into();
        assert_eq!(code, ErrorCode::InvalidRef);
    }
}
2129
2130 // -----------------------------------------------------------------
2131 // DenialReason / DenialContext (ADR-0023) — wire-shape tests.
2132 // -----------------------------------------------------------------
2133
#[test]
fn denial_reason_serializes_snake_case() {
    // ADR-0023 §2 / docs/PUBLIC_API.md §8: variants go on the wire as
    // snake_case JSON strings. Three variants are pinned byte-for-byte.
    let pinned = [
        (
            DenialReason::RedirectNotInAllowlist,
            "\"redirect_not_in_allowlist\"",
        ),
        (DenialReason::SizeCapExceeded, "\"size_cap_exceeded\""),
        (
            DenialReason::ContentTypeMismatch,
            "\"content_type_mismatch\"",
        ),
    ];
    for (reason, wire) in pinned {
        let s = serde_json::to_string(&reason).expect("ser");
        assert_eq!(s, wire);
    }
}
2144
#[test]
fn denial_reason_round_trip_via_serde() {
    // Serialize and deserialize each variant listed below and require
    // the value to come back unchanged. The list is meant to mirror the
    // closed set of variants, so it should be extended whenever a
    // variant is added (the closed-set contract).
    let listed_reasons = [
        DenialReason::RedirectNotInAllowlist,
        DenialReason::InsecureScheme,
        DenialReason::HostInBlockList,
        DenialReason::SizeCapExceeded,
        DenialReason::SchemaDrift,
        DenialReason::CapabilityNotGranted,
        DenialReason::RateLimitWindow,
        DenialReason::SsrfPrivateAddress,
        DenialReason::ContentTypeMismatch,
    ];
    for reason in listed_reasons {
        let wire = serde_json::to_string(&reason).expect("ser");
        let decoded: DenialReason = serde_json::from_str(&wire).expect("de");
        assert_eq!(
            decoded, reason,
            "round-trip mismatch for {:?} -> {}",
            reason, wire
        );
    }
}
2165
#[test]
fn denial_context_round_trips_full_shape() {
    // A populated context — the redirect-denied example from ADR-0023 §1
    // — must survive a JSON round-trip. Comparing whole structs relies on
    // the `PartialEq` derive described in ADR-0023 §3 (introduced by the
    // multi-agent review feedback PR; see the ADR-0023 history).
    let allowlist = vec![
        String::from("api.crossref.org"),
        String::from("*.crossref.org"),
    ];
    let original = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(allowlist),
        hop_index: Some(1),
        cap: None,
        actual: None,
    };
    let wire = serde_json::to_string(&original).expect("ser");
    let decoded: DenialContext = serde_json::from_str(&wire).expect("de");
    assert_eq!(decoded, original);
}
2188
#[test]
fn denial_context_serialize_elides_empty_fields() {
    // `skip_serializing_if = "Option::is_none"` keeps the wire form
    // lean: no `None` field may appear in the output. `reason` is always
    // present, so a context with only a reason serializes to a
    // single-key object.
    let reason_only = DenialContext {
        reason: DenialReason::CapabilityNotGranted,
        source: None,
        attempted: None,
        expected: None,
        hop_index: None,
        cap: None,
        actual: None,
    };
    let wire = serde_json::to_string(&reason_only).expect("ser");
    assert_eq!(wire, "{\"reason\":\"capability_not_granted\"}");
}
2206
#[test]
fn denial_context_expected_some_empty_vec_preserves_explicit_empty_allowlist() {
    // Post-refinement disambiguation: `expected: Some(vec![])` means
    // "explicit empty allowlist" and MUST reach the wire as
    // `"expected":[]`; only `expected: None` is skipped on serialize.
    // The earlier `Vec<String>` shape masked exactly this distinction.
    let empty_allowlist: Vec<String> = Vec::new();
    let context = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(empty_allowlist),
        hop_index: None,
        cap: None,
        actual: None,
    };
    let s = serde_json::to_string(&context).expect("ser");
    let keeps_empty_list = s.contains("\"expected\":[]");
    assert!(
        keeps_empty_list,
        "expected:[] must survive on the wire (got: {s})"
    );

    // The empty list must also come back as `Some(empty)`, not `None`.
    let decoded: DenialContext = serde_json::from_str(&s).expect("de");
    assert_eq!(decoded.expected, Some(Vec::new()));
}
2230
#[test]
fn denial_context_deserialize_tolerates_missing_optional_fields() {
    // Consumer-side contract (ADR-0023 §3): consumers MUST accept any
    // subset of fields. Optional fields absent from the wire fall back to
    // their defaults via `#[serde(default)]`.
    let wire = r#"{"reason":"size_cap_exceeded","cap":104857600,"actual":209715200}"#;
    let decoded = serde_json::from_str::<DenialContext>(wire).expect("de");

    // Fields present on the wire are populated...
    assert_eq!(decoded.reason, DenialReason::SizeCapExceeded);
    assert_eq!(decoded.cap, Some(104_857_600));
    assert_eq!(decoded.actual, Some(209_715_200));

    // ...and every field that was left out is `None`.
    assert!(decoded.source.is_none());
    assert!(decoded.attempted.is_none());
    assert!(decoded.expected.is_none());
    assert!(decoded.hop_index.is_none());
}
2246
#[test]
fn full_error_envelope_with_denial_context_serializes_to_pinned_json() {
    // Byte-exact pin of the full failure envelope documented in
    // docs/ERRORS.md §3 + §3.1 and ADR-0023 §1. Any regression that
    // changes key order or skip rules anywhere in the chain fails this
    // test loudly.
    //
    // On ordering: without the `preserve_order` feature (which is not
    // enabled here) serde_json's `Map`, used by `json!`, sorts keys
    // alphabetically. Embedding a `DenialContext` through `json!`
    // re-serialises it via that same sorted Map, so the inner keys are
    // alphabetical too — NOT the struct field order that a direct
    // `to_string(&DenialContext)` produces. This is by design: the
    // public wire shape is canonicalised by serde_json's Map ordering,
    // and the pinned string below documents that canonicalisation.
    let allowlist = vec![
        String::from("api.crossref.org"),
        String::from("*.crossref.org"),
    ];
    let context = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(allowlist),
        hop_index: Some(1),
        cap: None,
        actual: None,
    };
    let envelope = serde_json::json!({
        "ok": false,
        "error": {
            "code": ErrorCode::NetworkError,
            "message": "redirect target evil.example.com not in allowlist for source crossref",
            "denial_context": context,
        }
    });
    let pinned = r#"{"error":{"code":"NETWORK_ERROR","denial_context":{"attempted":"evil.example.com","expected":["api.crossref.org","*.crossref.org"],"hop_index":1,"reason":"redirect_not_in_allowlist","source":"crossref"},"message":"redirect target evil.example.com not in allowlist for source crossref"},"ok":false}"#;
    let rendered = serde_json::to_string(&envelope).expect("serialize envelope");
    assert_eq!(rendered, pinned);
}
2284
#[test]
fn denial_context_rejects_unknown_fields() {
    // `#[serde(deny_unknown_fields)]` (ADR-0023 §3, PUBLIC_API.md §8):
    // a key the struct does not know about MUST fail deserialization, so
    // that adding a field stays a breaking change rather than being
    // silently ignored by older consumers.
    let wire = r#"{"reason":"capability_not_granted","banana":1}"#;
    let outcome = serde_json::from_str::<DenialContext>(wire);
    if outcome.is_ok() {
        // Report only the decoded `reason`, matching the diagnostic
        // payload of the failure message.
        panic!(
            "deny_unknown_fields must reject 'banana': {:?}",
            outcome.map(|ctx| ctx.reason)
        );
    }
}
2298}