doiget_core/lib.rs
1//! # doiget-core
2//!
3//! Core library for [doiget](https://github.com/sotashimozono/doiget): an Open Access
4//! first paper-fetcher with strict capability gating, fail-closed provenance logging,
5//! and a BiblioFetch.jl-compatible store layout.
6//!
7//! Phase 0 ships only this skeleton. Real implementations land in Phase 1.
8//! See `docs/PUBLIC_API.md` for the semver-locked surface and `docs/ARCHITECTURE.md`
9//! for the high-level design.
10
11#![warn(missing_docs)]
12#![forbid(unsafe_code)]
13
14use serde::{Deserialize, Serialize};
15use sha2::Digest;
16
17// --- Modules ---
18pub mod canonical;
19pub mod dry_run;
20pub mod http;
21pub mod orchestrator;
22pub mod provenance;
23pub mod rate_limiter;
24pub mod refs;
25pub mod resolver_cache;
26pub mod source;
27pub mod sources;
28pub mod store;
29pub mod user_extension;
30pub mod verify_config;
31
32// Phase 4 citation graph (ADR-0010). Compile-gated by the `citation`
33// Cargo feature, which itself enables the `metadata` feature so the
34// Tier-2 source impls are available.
35#[cfg(feature = "citation")]
36pub mod citation_graph;
37
38// Re-export the canonical-tuple audit-identity types at the crate root
39// per ADR-0024 / `docs/PUBLIC_API.md` §1. The types themselves live in
40// the [`canonical`] submodule.
41pub use crate::canonical::{CanonicalRef, SourceType};
42
/// Crate version, taken from `CARGO_PKG_VERSION` at compile time. Used by
/// `doiget-cli --version` and `doiget_health`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");

/// TOML schema version this build writes. See `docs/STORE.md` §3.
pub const SCHEMA_VERSION: &str = "1.0";

/// Hard-coded rate limit: maximum number of concurrent fetches. See
/// `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_CONCURRENT_FETCHES: u32 = 5;

/// Hard-coded rate limit: maximum number of fetches per second. See
/// `docs/LEGAL.md` §6 safeguard 8.
pub const MAX_FETCHES_PER_SECOND: f32 = 5.0;

/// Maximum batch size (number of refs) for `doiget batch` and
/// `doiget_batch_fetch`.
pub const MCP_BATCH_MAX_SIZE: usize = 100;

/// Slice 2 alias for [`MCP_BATCH_MAX_SIZE`] using the
/// spec-language name (`docs/MCP_TOOLS.md` §1 / Slice 2 plan). The
/// numeric value MUST equal [`MCP_BATCH_MAX_SIZE`]; it is defined directly
/// in terms of that constant, and an internal test additionally pins the
/// equivalence so the two constants cannot drift.
pub const MAX_BATCH_REFS: usize = MCP_BATCH_MAX_SIZE;

/// Maximum queued MCP requests beyond `MAX_CONCURRENT_FETCHES`. Excess returns
/// `ErrorCode::RateLimited`. See `docs/SECURITY.md` §1.4 / `docs/MCP_TOOLS.md`.
pub const MCP_QUEUE_DEPTH_MAX: usize = 100;

/// MCP server stdin-EOF graceful-shutdown deadline, in seconds. See ADR-0001
/// and `docs/MCP_TOOLS.md` §8.
pub const MCP_STDIN_EOF_SHUTDOWN_SEC: u64 = 5;

/// Maximum DOI suffix length, in bytes, accepted at validation. See
/// `docs/SECURITY.md` §1.1.
pub const DOI_SUFFIX_MAX_LEN: usize = 256;

/// Maximum PDF body size accepted by the fetcher, in bytes (100 MB,
/// decimal). See `docs/SECURITY.md` §1.2 (Oversized PDF).
pub const PDF_MAX_BYTES: u64 = 100_000_000;

/// Time-to-live, in days, for entries in `~/.cache/doiget/resolver/`. See
/// `docs/CACHE.md` §3.
pub const RESOLVER_CACHE_TTL_DAYS: u32 = 7;

/// Time-to-live, in days, for entries in `~/.cache/doiget/citations/`. See
/// `docs/CACHE.md` §3.
pub const CITATION_CACHE_TTL_DAYS: u32 = 30;
86
87// ---------------------------------------------------------------------------
88// Ref
89// ---------------------------------------------------------------------------
90
/// A reference to a paper, either by DOI or arXiv id.
///
/// See `docs/SECURITY.md` §1.1 for input-validation rules.
///
/// Wire format: serde's adjacently tagged representation, with the
/// lowercase variant name under `kind` and the bare identifier under `id`,
/// e.g. `{"kind":"doi","id":"10.1234/example"}` or
/// `{"kind":"arxiv","id":"2401.12345"}`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind", content = "id")]
pub enum Ref {
    /// A DOI (e.g., `10.1234/example`).
    Doi(Doi),
    /// An arXiv id (e.g., `2401.12345`).
    Arxiv(ArxivId),
}
102
/// A validated DOI string.
///
/// Construct via [`Doi::parse`]. The inner field is intentionally
/// `pub(crate)` to forbid bypass construction; tests inside `doiget-core` may
/// still use `Doi(s)` for fixture purposes.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"10.1234/example"`.
///
/// NOTE(review): the derived `Deserialize` is transparent over `String` and
/// does not go through [`Doi::parse`], so a deserialized value is not
/// re-validated — confirm that callers deserializing untrusted input
/// validate it separately.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Doi(pub(crate) String);
113
/// A validated arXiv id string.
///
/// Construct via [`ArxivId::parse`]. Inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"2401.12345"`.
///
/// NOTE(review): the derived `Deserialize` is transparent over `String` and
/// does not go through [`ArxivId::parse`], so a deserialized value is not
/// re-validated — confirm that callers deserializing untrusted input
/// validate it separately.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct ArxivId(pub(crate) String);
122
123impl Doi {
124 /// Returns the DOI as a string slice.
125 pub fn as_str(&self) -> &str {
126 &self.0
127 }
128
129 /// Parses and validates a DOI string per `docs/SECURITY.md` §1.1.
130 ///
131 /// Accepts:
132 /// - Bare DOIs: `10.<registrant>/<suffix>` where `<registrant>` is 4–9
133 /// digits and `<suffix>` is a non-empty sequence of characters drawn
134 /// from `[A-Za-z0-9._/():-]` (the `:` covers legacy Kluwer
135 /// `10.1023/A:NNNN` and EDP Sciences `10.1051/jphys:NNNN` DOIs).
136 /// - The `doi:` URI scheme prefix; it is stripped before validation, so
137 /// the stored value never carries a scheme. (Matches the convention
138 /// established in `docs/SAFEKEY.md` §3 step 0.)
139 ///
140 /// Rejects:
141 /// - Inputs missing the literal `10.` prefix (after optional scheme
142 /// strip).
143 /// - Suffixes longer than [`DOI_SUFFIX_MAX_LEN`] bytes.
144 /// - Empty suffixes.
145 /// - Any character outside the suffix charset above (including control
146 /// characters, whitespace, and non-ASCII).
147 ///
148 /// # Errors
149 ///
150 /// Returns a [`RefParseError`] variant that names the specific rejection
151 /// category. Tier 1+ callers should map any [`RefParseError`] to
152 /// [`ErrorCode::InvalidRef`] when surfacing to MCP / CLI.
153 pub fn parse(s: &str) -> Result<Self, RefParseError> {
154 let stripped = parse::strip_doi_scheme(s);
155 parse::validate_doi(stripped)?;
156 Ok(Doi(stripped.to_string()))
157 }
158}
159
160impl ArxivId {
161 /// Returns the arXiv id as a string slice.
162 pub fn as_str(&self) -> &str {
163 &self.0
164 }
165
166 /// Parses and validates an arXiv id per `docs/SECURITY.md` §1.1 and the
167 /// pattern published in `docs/MCP_TOOLS.md`.
168 ///
169 /// Accepts:
170 /// - New-style ids: `YYMM.NNNNN[vN]` where the date block is 4 digits, the
171 /// sequence number is 4–5 digits, and the optional version `vN` is one
172 /// or more digits. Examples: `2401.12345`, `2401.12345v2`.
173 /// - Old-style ids: `subject-class/YYMMNNN[vN]` where the subject class
174 /// is a lowercase token (with optional internal hyphens and an
175 /// optional `.XX` two-uppercase-letter group), and the numeric body
176 /// is exactly 7 digits with optional `vN`. Examples:
177 /// `cond-mat/9501001`, `astro-ph.CO/0703123v2`.
178 /// - The `arxiv:` / `arXiv:` URI scheme prefix; it is stripped before
179 /// validation.
180 ///
181 /// Rejects:
182 /// - Inputs that match neither the new-style nor old-style shape.
183 /// - Inputs containing characters outside the per-shape charset
184 /// (control chars, whitespace, non-ASCII).
185 /// - Empty input.
186 ///
187 /// # Errors
188 ///
189 /// Returns a [`RefParseError`] variant that names the specific rejection
190 /// category.
191 pub fn parse(s: &str) -> Result<Self, RefParseError> {
192 let stripped = parse::strip_arxiv_scheme(s);
193 parse::validate_arxiv(stripped)?;
194 Ok(ArxivId(stripped.to_string()))
195 }
196}
197
198impl Ref {
199 /// Parses a string into a [`Ref`], auto-detecting DOI vs arXiv.
200 ///
201 /// Detection rules:
202 /// 1. If the input begins with the case-insensitive `doi:` scheme, the
203 /// remainder is parsed as a DOI.
204 /// 2. If the input begins with the `arxiv:` or `arXiv:` scheme, the
205 /// remainder is parsed as an arXiv id.
206 /// 3. Otherwise, if the input starts with `10.` it is treated as a bare
207 /// DOI; this matches the heuristic in `docs/SAFEKEY.md` §4 (Julia
208 /// reference) and is stable because DOIs always begin `10.`.
209 /// 4. Failing all of the above, parsing falls back to arXiv.
210 ///
211 /// The returned [`Ref`] never carries the URI scheme — `as_str()` on the
212 /// inner `Doi` / `ArxivId` is always the bare identifier.
213 ///
214 /// # Errors
215 ///
216 /// Returns a [`RefParseError`] from the underlying [`Doi::parse`] or
217 /// [`ArxivId::parse`] call. When the input has an explicit scheme
218 /// (`doi:` / `arxiv:`), the matching parser is dispatched and its error
219 /// surfaces directly. When the input is bare and ambiguous, the
220 /// heuristic in rule 3/4 selects the parser; an unparsable bare input
221 /// surfaces the arXiv parser's error (a non-`10.` ref that also fails
222 /// arXiv validation is never a valid DOI).
223 pub fn parse(s: &str) -> Result<Self, RefParseError> {
224 // Reject empty up front so all three parsers see a meaningful slice;
225 // without this, `strip_*_scheme("")` returns "" and we'd get a
226 // confusing "missing 10. prefix" error for empty input.
227 if s.is_empty() {
228 return Err(RefParseError::Empty);
229 }
230
231 if parse::has_doi_scheme(s) {
232 return Doi::parse(s).map(Ref::Doi);
233 }
234 if parse::has_arxiv_scheme(s) {
235 return ArxivId::parse(s).map(Ref::Arxiv);
236 }
237 if s.starts_with("10.") {
238 return Doi::parse(s).map(Ref::Doi);
239 }
240 ArxivId::parse(s).map(Ref::Arxiv)
241 }
242}
243
244// ---------------------------------------------------------------------------
245// Parser internals
246// ---------------------------------------------------------------------------
247
248mod parse {
249 use super::{RefParseError, DOI_SUFFIX_MAX_LEN};
250
251 /// Case-insensitive `doi:` prefix detector. Matches both `doi:` and
252 /// `DOI:` (and any case mix); the spec in `docs/SAFEKEY.md` §3 only
253 /// names the lowercase form, but the field convention is to be lenient
254 /// in what we accept (the scheme is dropped at the boundary anyway).
255 pub(crate) fn has_doi_scheme(s: &str) -> bool {
256 s.len() >= 4 && s.is_char_boundary(4) && s[..4].eq_ignore_ascii_case("doi:")
257 }
258
259 /// Case-insensitive `arxiv:` prefix detector. Accepts `arxiv:`,
260 /// `arXiv:` (the form used in `docs/MCP_TOOLS.md`), and any other case
261 /// mix.
262 pub(crate) fn has_arxiv_scheme(s: &str) -> bool {
263 s.len() >= 6 && s.is_char_boundary(6) && s[..6].eq_ignore_ascii_case("arxiv:")
264 }
265
266 pub(crate) fn strip_doi_scheme(s: &str) -> &str {
267 if has_doi_scheme(s) {
268 &s[4..]
269 } else {
270 s
271 }
272 }
273
274 pub(crate) fn strip_arxiv_scheme(s: &str) -> &str {
275 if has_arxiv_scheme(s) {
276 &s[6..]
277 } else {
278 s
279 }
280 }
281
282 /// DOI suffix charset per `docs/SECURITY.md` §1.1:
283 /// `[A-Za-z0-9._/():-]`. The forward slash is permitted inside the
284 /// suffix (e.g. `10.1016/...`); the registrant separator is the
285 /// *first* `/` and the suffix is everything after it.
286 ///
287 /// `:` is permitted because two large real publisher DOI families use
288 /// it in the suffix — legacy Kluwer/Springer (`10.1023/A:NNNNNNNNNN`)
289 /// and EDP Sciences / Journal de Physique
290 /// (`10.1051/jphys:NNNNNNNNNNNNNNNNN`). It adds no path-traversal
291 /// capability: traversal requires composing `/` and `.` into `../`,
292 /// and both characters are already in the suffix charset. In addition,
293 /// `safekey` independently escapes every char outside `[A-Za-z0-9._-]`
294 /// before any filesystem use, so `:` never reaches a path literally.
295 /// See ADR-0026 and `docs/SECURITY.md` §1.1.
296 fn is_doi_suffix_char(c: char) -> bool {
297 matches!(c,
298 'A'..='Z' | 'a'..='z' | '0'..='9'
299 | '.' | '_' | '/' | '(' | ')' | '-' | ':'
300 )
301 }
302
303 pub(crate) fn validate_doi(s: &str) -> Result<(), RefParseError> {
304 if s.is_empty() {
305 return Err(RefParseError::Empty);
306 }
307
308 // Must begin with literal "10."; the registrant is 4–9 digits up
309 // to the first '/'. After that, everything is suffix.
310 let rest = s
311 .strip_prefix("10.")
312 .ok_or(RefParseError::MissingDoiPrefix)?;
313 let slash_idx = rest
314 .find('/')
315 .ok_or(RefParseError::MissingDoiSuffixSeparator)?;
316 let registrant = &rest[..slash_idx];
317 let suffix = &rest[slash_idx + 1..];
318
319 // Registrant: 4–9 ASCII digits.
320 if registrant.len() < 4
321 || registrant.len() > 9
322 || !registrant.chars().all(|c| c.is_ascii_digit())
323 {
324 return Err(RefParseError::InvalidDoiRegistrant);
325 }
326
327 // Suffix: non-empty, charset-restricted, length-bounded.
328 if suffix.is_empty() {
329 return Err(RefParseError::EmptyDoiSuffix);
330 }
331 if suffix.len() > DOI_SUFFIX_MAX_LEN {
332 return Err(RefParseError::DoiSuffixTooLong {
333 len: suffix.len(),
334 max: DOI_SUFFIX_MAX_LEN,
335 });
336 }
337 if let Some(bad) = suffix.chars().find(|c| !is_doi_suffix_char(*c)) {
338 return Err(RefParseError::InvalidDoiSuffixChar { ch: bad });
339 }
340 Ok(())
341 }
342
343 /// Validates an arXiv id (with the `arxiv:` / `arXiv:` scheme already
344 /// stripped). Tries the new-style shape first, then the old-style.
345 pub(crate) fn validate_arxiv(s: &str) -> Result<(), RefParseError> {
346 if s.is_empty() {
347 return Err(RefParseError::Empty);
348 }
349 if validate_arxiv_new(s).is_ok() || validate_arxiv_old(s).is_ok() {
350 return Ok(());
351 }
352 Err(RefParseError::InvalidArxivShape)
353 }
354
355 /// New-style arXiv id: `YYMM.NNNNN[vN]`.
356 fn validate_arxiv_new(s: &str) -> Result<(), ()> {
357 let dot_idx = s.find('.').ok_or(())?;
358 let head = &s[..dot_idx];
359 let tail = &s[dot_idx + 1..];
360
361 // Head: exactly 4 ASCII digits.
362 if head.len() != 4 || !head.chars().all(|c| c.is_ascii_digit()) {
363 return Err(());
364 }
365
366 // Tail: 4–5 digits, then optional `v` followed by ≥1 digits.
367 let bytes = tail.as_bytes();
368 let mut i = 0;
369 while i < bytes.len() && bytes[i].is_ascii_digit() {
370 i += 1;
371 }
372 let digits_len = i;
373 if !(4..=5).contains(&digits_len) {
374 return Err(());
375 }
376 if i == bytes.len() {
377 return Ok(());
378 }
379 // Optional version suffix.
380 if bytes[i] != b'v' {
381 return Err(());
382 }
383 i += 1;
384 let v_start = i;
385 while i < bytes.len() && bytes[i].is_ascii_digit() {
386 i += 1;
387 }
388 if i == v_start || i != bytes.len() {
389 return Err(());
390 }
391 Ok(())
392 }
393
394 /// Old-style arXiv id: `subject-class/YYMMNNN[vN]`.
395 /// Subject class: `[a-z]([a-z-]*[a-z])?(\.[A-Z]{2})?`.
396 fn validate_arxiv_old(s: &str) -> Result<(), ()> {
397 let slash_idx = s.find('/').ok_or(())?;
398 let class = &s[..slash_idx];
399 let id = &s[slash_idx + 1..];
400
401 // Class: starts with [a-z], body is [a-z-], optional `.XX` (two
402 // ASCII upper).
403 let (core_class, dot_part) = match class.find('.') {
404 Some(d) => (&class[..d], Some(&class[d + 1..])),
405 None => (class, None),
406 };
407 if core_class.is_empty()
408 || !core_class
409 .chars()
410 .all(|c| c.is_ascii_lowercase() || c == '-')
411 || core_class.starts_with('-')
412 || core_class.ends_with('-')
413 {
414 return Err(());
415 }
416 if let Some(dp) = dot_part {
417 if dp.len() != 2 || !dp.chars().all(|c| c.is_ascii_uppercase()) {
418 return Err(());
419 }
420 }
421
422 // Id: 7 digits, optional `vN`.
423 let bytes = id.as_bytes();
424 let mut i = 0;
425 while i < bytes.len() && bytes[i].is_ascii_digit() {
426 i += 1;
427 }
428 if i != 7 {
429 return Err(());
430 }
431 if i == bytes.len() {
432 return Ok(());
433 }
434 if bytes[i] != b'v' {
435 return Err(());
436 }
437 i += 1;
438 let v_start = i;
439 while i < bytes.len() && bytes[i].is_ascii_digit() {
440 i += 1;
441 }
442 if i == v_start || i != bytes.len() {
443 return Err(());
444 }
445 Ok(())
446 }
447}
448
449// ---------------------------------------------------------------------------
450// RefParseError
451// ---------------------------------------------------------------------------
452
/// Reasons a `Doi::parse` / `ArxivId::parse` / `Ref::parse` call can fail.
///
/// Each variant maps to one rejection category in `docs/SECURITY.md` §1.1.
/// All variants funnel to [`ErrorCode::InvalidRef`] when surfacing to MCP /
/// CLI; the granular shape is preserved for tests and for future log
/// breadcrumbs. The `From<RefParseError> for ErrorCode` impl below makes
/// `?` propagation collapse to `INVALID_REF` automatically, satisfying
/// `docs/PUBLIC_API.md` §4.
///
/// The human-readable `Display` text of each variant comes from its
/// `#[error("...")]` attribute (via `thiserror`).
///
/// Marked `#[non_exhaustive]` so adding new categories is a non-breaking
/// change. Pattern-match with a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
#[non_exhaustive]
pub enum RefParseError {
    /// Input was empty (either the whole input, or what remained after a
    /// URI scheme was stripped).
    #[error("empty input")]
    Empty,
    /// Input did not begin with the required `10.` literal (after any
    /// scheme strip).
    #[error("DOI must begin with '10.'")]
    MissingDoiPrefix,
    /// Input started with `10.` but had no `/` separator between
    /// registrant and suffix.
    #[error("DOI must contain '/' between registrant and suffix")]
    MissingDoiSuffixSeparator,
    /// Registrant was not 4–9 ASCII digits.
    #[error("DOI registrant must be 4–9 ASCII digits")]
    InvalidDoiRegistrant,
    /// DOI suffix was empty.
    #[error("DOI suffix is empty")]
    EmptyDoiSuffix,
    /// DOI suffix exceeded `DOI_SUFFIX_MAX_LEN` bytes.
    #[error("DOI suffix is {len} bytes; maximum is {max}")]
    DoiSuffixTooLong {
        /// Observed suffix length, in bytes.
        len: usize,
        /// Hard upper bound (always [`DOI_SUFFIX_MAX_LEN`]).
        max: usize,
    },
    /// DOI suffix contained a character outside `[A-Za-z0-9._/():-]`.
    #[error("DOI suffix contains invalid character {ch:?}")]
    InvalidDoiSuffixChar {
        /// The first offending character.
        ch: char,
    },
    /// Input matched neither the new-style nor old-style arXiv shape.
    #[error("input does not match any known arXiv id shape")]
    InvalidArxivShape,
}
502
503impl From<RefParseError> for ErrorCode {
504 fn from(_: RefParseError) -> Self {
505 // All parse failures collapse to INVALID_REF at the public boundary,
506 // matching `docs/PUBLIC_API.md` §4 and `docs/SECURITY.md` §1.1.
507 ErrorCode::InvalidRef
508 }
509}
510
511// ---------------------------------------------------------------------------
512// Safekey
513// ---------------------------------------------------------------------------
514
/// A filesystem-safe key derived deterministically from a `Ref`.
///
/// See `docs/SAFEKEY.md` for the full algorithm and reference test vectors.
/// Construct via [`Ref::safekey`]; inner field is `pub(crate)`.
///
/// Wire format: bare string (`#[serde(transparent)]`), e.g. `"doi_10.1234_example"`.
///
/// NOTE(review): the derived `Deserialize` is transparent over `String`, so
/// a deserialized value is not checked against the safekey charset or
/// length bound — confirm that callers do not deserialize untrusted input
/// into a path component without re-deriving the key.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Safekey(pub(crate) String);
524
525impl Safekey {
526 /// Returns the safekey as a string slice.
527 pub fn as_str(&self) -> &str {
528 &self.0
529 }
530}
531
532impl Ref {
533 /// Returns the bare identifier string usable as a provenance `ref` field.
534 ///
535 /// Equivalent to `Doi::as_str` / `ArxivId::as_str` dispatched on the
536 /// variant — the URI scheme (`doi:` / `arxiv:`) is never present in the
537 /// inner identifiers (it is stripped at parse time), so the result is
538 /// always the bare DOI or arXiv id. Used by the CLI / MCP orchestrators
539 /// to populate the `ref` column of provenance log rows
540 /// (`docs/PROVENANCE_LOG.md` §3) without re-matching the variant.
541 pub fn as_input_str(&self) -> &str {
542 match self {
543 Ref::Doi(d) => d.as_str(),
544 Ref::Arxiv(a) => a.as_str(),
545 }
546 }
547
548 /// Derives a deterministic, filesystem-safe key from this reference.
549 ///
550 /// The algorithm is the NORMATIVE binding spec in `docs/SAFEKEY.md` §3.
551 /// Both Rust and Julia implementations MUST produce bit-identical output
552 /// for every entry in `tests/fixtures/safekey/vectors.json`.
553 ///
554 /// # Algorithm summary
555 ///
556 /// 1. Prefix with `doi_` or `arxiv_` (per variant).
557 /// 2. Replace any character outside `[A-Za-z0-9._-]` with `_`.
558 /// 3. Collapse consecutive `_` runs to a single `_`.
559 /// 4. Trim leading/trailing `_`.
560 /// 5. If the result exceeds 192 bytes, take the first 192 bytes plus
561 /// `_` plus the first 8 hex chars of `SHA-256(raw)` (where `raw` is
562 /// the step-1 output, before escaping).
563 ///
564 /// The bound on `as_str()` after step 4 is pure ASCII (steps 1-3 produce
565 /// only ASCII bytes), so the byte-slice in step 5 cannot split a
566 /// multibyte char.
567 pub fn safekey(&self) -> Safekey {
568 // Step 0: prefix per variant. Doi/ArxivId hold the bare identifier
569 // (no `doi:` / `arxiv:` URI scheme — that is stripped by Ref::parse,
570 // not relevant here).
571 let raw = match self {
572 Ref::Doi(d) => format!("doi_{}", d.as_str()),
573 Ref::Arxiv(a) => format!("arxiv_{}", a.as_str()),
574 };
575
576 // Step 1: replace unsafe chars with '_'. Non-ASCII chars (emitted by
577 // String::chars() as full Unicode code points) all hit the wildcard
578 // arm and become a single '_'.
579 let escaped: String = raw
580 .chars()
581 .map(|c| match c {
582 'A'..='Z' | 'a'..='z' | '0'..='9' | '.' | '-' | '_' => c,
583 _ => '_',
584 })
585 .collect();
586
587 // Step 2: collapse consecutive '_' runs to a single '_'.
588 let mut collapsed = String::with_capacity(escaped.len());
589 let mut last_was_underscore = false;
590 for c in escaped.chars() {
591 if c == '_' {
592 if !last_was_underscore {
593 collapsed.push('_');
594 }
595 last_was_underscore = true;
596 } else {
597 collapsed.push(c);
598 last_was_underscore = false;
599 }
600 }
601
602 // Step 3: trim leading/trailing '_'.
603 let trimmed = collapsed.trim_matches('_');
604
605 // Step 4: length-bound. After steps 1-3 `trimmed` is pure ASCII, so
606 // `len()` (bytes) == char count and `&trimmed[..192]` is char-safe.
607 let key = if trimmed.len() > 192 {
608 let digest = sha2::Sha256::digest(raw.as_bytes());
609 let hash = hex::encode(&digest[..4]);
610 format!("{}_{}", &trimmed[..192], hash)
611 } else {
612 trimmed.to_string()
613 };
614
615 Safekey(key)
616 }
617}
618
619// ---------------------------------------------------------------------------
620// ErrorCode
621// ---------------------------------------------------------------------------
622
/// The closed set of error codes doiget surfaces.
///
/// See `docs/ERRORS.md` for the persona × code matrix.
///
/// Serialized as `SCREAMING_SNAKE_CASE` (see the `#[serde(rename_all)]`
/// attribute); the wire token is noted on each variant.
///
/// Marked `#[non_exhaustive]` so adding new variants is a minor (not major)
/// version bump.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[non_exhaustive]
pub enum ErrorCode {
    /// DOI / arXiv id failed validation. Wire form: `"INVALID_REF"`.
    InvalidRef,
    /// Tier 1 sources reported no OA URL. Wire form: `"NO_OA_AVAILABLE"`.
    NoOaAvailable,
    /// Internal rate cap or upstream 429. Wire form: `"RATE_LIMITED"`.
    RateLimited,
    /// Transport / DNS / TLS failure. Wire form: `"NETWORK_ERROR"`.
    NetworkError,
    /// Filesystem write failed. Wire form: `"STORE_ERROR"`.
    StoreError,
    /// Provenance log write failed; the fetch was aborted. Wire form:
    /// `"LOG_ERROR"`.
    LogError,
    /// Source not granted by the runtime `CapabilityProfile`. Wire form:
    /// `"CAPABILITY_DENIED"`.
    CapabilityDenied,
    /// Per-request timeout exceeded. Wire form: `"FETCH_TIMEOUT"`.
    FetchTimeout,
    /// Store entry's `schema_version` is ahead of this build. Wire form:
    /// `"SCHEMA_TOO_NEW"`.
    SchemaTooNew,
    /// Could not acquire `flock` within 5 s. Wire form: `"LOCK_TIMEOUT"`.
    LockTimeout,
    /// Bug — please open an issue. Wire form: `"INTERNAL_ERROR"`.
    InternalError,
    /// Feature is spec'd but not yet wired in this Phase. Distinct from
    /// [`Self::InternalError`] (which signals a bug) and
    /// [`Self::CapabilityDenied`] (which signals a runtime config gate).
    /// Returned by stubs that exist to pin the public surface ahead of
    /// orchestrator implementation, so an agent can react with "wait for
    /// next minor release" rather than "report a bug" or "tweak my
    /// capability profile". Wire form: `"NOT_IMPLEMENTED"`.
    NotImplemented,
}
664
665impl ErrorCode {
666 /// The `SCREAMING_SNAKE_CASE` wire token for this code, as a
667 /// `&'static str`. Identical to the serde representation but
668 /// allocation-free and usable where a borrowed string with a
669 /// `'static` lifetime is required — notably the provenance log
670 /// `error_code` column (`docs/PROVENANCE_LOG.md` §3), so a failure
671 /// row records the *actual* mapped code instead of a hand-written
672 /// literal that can drift from this enum (issue #118).
673 #[must_use]
674 pub fn as_wire(&self) -> &'static str {
675 match self {
676 ErrorCode::InvalidRef => "INVALID_REF",
677 ErrorCode::NoOaAvailable => "NO_OA_AVAILABLE",
678 ErrorCode::RateLimited => "RATE_LIMITED",
679 ErrorCode::NetworkError => "NETWORK_ERROR",
680 ErrorCode::StoreError => "STORE_ERROR",
681 ErrorCode::LogError => "LOG_ERROR",
682 ErrorCode::CapabilityDenied => "CAPABILITY_DENIED",
683 ErrorCode::FetchTimeout => "FETCH_TIMEOUT",
684 ErrorCode::SchemaTooNew => "SCHEMA_TOO_NEW",
685 ErrorCode::LockTimeout => "LOCK_TIMEOUT",
686 ErrorCode::InternalError => "INTERNAL_ERROR",
687 ErrorCode::NotImplemented => "NOT_IMPLEMENTED",
688 }
689 }
690}
691
692// ---------------------------------------------------------------------------
693// DenialReason / DenialContext (ADR-0023)
694// ---------------------------------------------------------------------------
695
/// Closed-set reasons a denial-class error envelope can carry on its
/// optional `denial_context.reason` field.
///
/// Wire form (JSON / MCP) is `snake_case` — e.g. `"redirect_not_in_allowlist"`.
/// The set is **closed** per ADR-0023 §2: adding a new variant is a minor
/// semver bump; renaming or repurposing one is a breaking change. Mirrors
/// the stability rule that already governs [`ErrorCode`].
///
/// NOTE(review): unlike [`ErrorCode`], this enum is not marked
/// `#[non_exhaustive]`, so a downstream exhaustive `match` would stop
/// compiling when a variant is added — confirm that this is consistent
/// with the "minor semver bump" rule stated above.
///
/// See [`DenialContext`] for the surrounding struct, `docs/ERRORS.md` §3.1
/// for the wire surface, and `docs/PUBLIC_API.md` §8 for the
/// semver-locked surface contract.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DenialReason {
    /// Redirect target host did not match the source's allowlist
    /// (`HttpError::RedirectDenied`).
    RedirectNotInAllowlist,
    /// Redirect target had a non-HTTPS scheme (`HttpError::InsecureRedirect`).
    InsecureScheme,
    /// Source produced a URL whose host is on a future blocklist.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the future
    /// per-source URL host-blocklist guard once that component lands
    /// (post-Phase-1 supply-chain hardening; see
    /// `docs/REDIRECT_ALLOWLIST.md` §4 for the staging plan).
    HostInBlockList,
    /// Body exceeded [`PDF_MAX_BYTES`] (`HttpError::OversizedBody`).
    SizeCapExceeded,
    /// Store entry's `schema_version` is ahead of this binary.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// `FsStore` schema-rejection path once the read-side bump check
    /// lands (it currently only writes the current `SCHEMA_VERSION`).
    SchemaDrift,
    /// Source not in the runtime [`CapabilityProfile`]
    /// (`FetchError::NotEligible`).
    CapabilityNotGranted,
    /// Rate limiter rejected the call inside the current window.
    ///
    /// Reserved — no producer wired yet. Will be emitted by
    /// [`RateLimiter`](crate::rate_limiter::RateLimiter) once the
    /// limiter surfaces structured denials (Phase 2+; today the
    /// limiter only sleeps to enforce the window).
    RateLimitWindow,
    /// SSRF guard rejected a private / link-local / cloud-metadata address.
    ///
    /// Reserved — no producer wired yet. Will be emitted by the
    /// future SSRF pre-flight check (post-Phase-1 supply-chain
    /// hardening; the workspace currently relies on rustls + the
    /// HTTPS-only redirect policy to keep the attack surface small).
    SsrfPrivateAddress,
    /// Response Content-Type / magic-byte mismatch (`HttpError::NotAPdf`).
    ContentTypeMismatch,
}
750
/// Structured machine-parseable companion to `error.message` for
/// recoverable denials.
///
/// The field is **optional and additive** on the public error envelope —
/// every previously-shipped `{code, message}` envelope remains valid, and
/// agents that ignore this struct continue to work. When present, it
/// carries the concrete parameters an LLM agent can use to plan a recovery
/// (e.g. "the redirect to `evil.example.com` was denied because it is not
/// in the crossref allowlist") without text-mining `error.message`.
///
/// ## Wire shape
///
/// `#[serde(deny_unknown_fields)]`: forward-compatible field additions on
/// the wire are forbidden by design — adding a field to this struct is a
/// **breaking** change. This is why the type is **not** `#[non_exhaustive]`
/// (per `docs/PUBLIC_API.md` §8): both production rules — Rust struct
/// construction outside the crate AND wire-level extension — must agree.
///
/// All fields except `reason` are optional. Producers populate the fields
/// relevant to the reason and leave the rest at `None`; consumers MUST
/// tolerate any subset of fields being present. Optional fields are
/// skipped on serialize but accepted as missing on deserialize via
/// `#[serde(default, skip_serializing_if = "Option::is_none")]`.
///
/// Illustrative example (values are made up; only the field names and the
/// `snake_case` reason token follow from the definitions here):
///
/// ```json
/// {"reason":"redirect_not_in_allowlist","source":"crossref",
///  "attempted":"evil.example.com","expected":["api.crossref.org"],
///  "hop_index":1}
/// ```
///
/// [`Self::expected`] is `Option<Vec<String>>` rather than `Vec<String>`
/// so the producer can distinguish "this reason has no allowlist channel"
/// (`None` → field absent on the wire) from "this is the explicit list of
/// acceptable values, possibly empty" (`Some(vec![])` → `"expected":[]` on
/// the wire). The previous `Vec<String>` shape collapsed both states
/// into "field omitted", which an LLM agent could not safely disambiguate.
///
/// Mapping table: see ADR-0023 §4, plus the
/// `From<&HttpError> for Option<DenialContext>` and
/// `From<&FetchError> for Option<DenialContext>` impls in
/// [`crate::http`] / [`crate::source`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DenialContext {
    /// Closed-enum reason code; the only required field.
    pub reason: DenialReason,
    /// Resolver source key (e.g. `"crossref"`) when one is in scope.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Concrete value the producer attempted (host, path, hex magic bytes,
    /// scheme prefix). Shape is reason-specific; consumers MUST treat it
    /// as opaque text.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attempted: Option<String>,
    /// Allowlist entries / acceptable values. `Option<Vec<String>>` so the
    /// producer can distinguish "this reason has no allowlist channel"
    /// (`None`, field absent on the wire) from "this is the explicit list
    /// of acceptable values, possibly empty" (`Some(vec![])`, `"expected":[]`
    /// on the wire). The inner `Vec<String>` is used even when only one
    /// value is meaningful (e.g. `Some(vec!["%PDF-".into()])`) so the
    /// format does not have to flip when multiple values are acceptable.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub expected: Option<Vec<String>>,
    /// Redirect-chain hop position, 0-indexed. `u8` because the chain is
    /// hard-capped at [`crate::http`]'s `MAX_REDIRECTS` (= 10) and any
    /// larger value indicates a bug.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hop_index: Option<u8>,
    /// Size or rate cap value (e.g. [`PDF_MAX_BYTES`]).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cap: Option<u64>,
    /// Observed value (e.g. response bytes when [`Self::cap`] is the byte
    /// cap, or row schema_version when [`Self::cap`] is the binary's).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub actual: Option<u64>,
}
821
822// ---------------------------------------------------------------------------
823// ResolvedCandidate / ResolveResult (Issue #242)
824// ---------------------------------------------------------------------------
825
826/// A candidate paper resolved from a bibliographic citation string.
827#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
828pub struct ResolvedCandidate {
829 /// Resolved DOI.
830 pub doi: String,
831 /// Title of the resolved candidate.
832 pub title: String,
833 /// First author or primary author representation.
834 pub author: String,
835 /// Publication year, if resolved.
836 pub year: Option<i32>,
837 /// Token similarity overlap score in `0.0..=1.0`.
838 pub score: f64,
839 /// Resolving metadata source (e.g. `"crossref"`).
840 pub source: String,
841}
842
843/// The result structure returned by bibliographic citation resolution.
844#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
845pub struct ResolveResult {
846 /// The original query bibliographic citation string.
847 pub query: String,
848 /// Ranked candidate list (highest score first, thresholded to >= 0.5).
849 pub candidates: Vec<ResolvedCandidate>,
850}
851
852// ---------------------------------------------------------------------------
853// CapabilityProfile (placeholder; full impl in Phase 1)
854// ---------------------------------------------------------------------------
855
/// Unit marker type for the Open Access tier, which is always on.
/// See `docs/CAPABILITY.md`.
#[derive(Debug, Clone, Copy)]
pub struct AlwaysOn;
859
/// The set of Tier 2 metadata sources enabled for this session.
/// See `docs/CAPABILITY.md`.
///
/// The derived `Default` leaves every source disabled.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct MetadataAccess {
    /// OpenAlex (Phase 4+); turned on by `DOIGET_ENABLE_OPENALEX`.
    pub openalex: bool,
    /// Semantic Scholar (Phase 4+); turned on by `DOIGET_ENABLE_S2`.
    pub semantic_scholar: bool,
    /// DOAJ (Phase 4+); turned on by `DOIGET_ENABLE_DOAJ`.
    pub doaj: bool,
}
871
872/// Process-wide rate limits. Hard-coded; not configurable.
873///
874/// Construct only via [`RateLimits::HARD_CODED`]. The struct fields are
875/// `pub(crate)` so downstream code cannot synthesize a `RateLimits` with
876/// different values, which would weaken `docs/LEGAL.md` §6 safeguard 8.
877#[derive(Debug, Clone, Copy)]
878#[non_exhaustive]
879pub struct RateLimits {
880 pub(crate) max_concurrent_fetches: u32,
881 pub(crate) max_fetches_per_second: f32,
882 pub(crate) per_source_backoff_ms: u64,
883}
884
885impl RateLimits {
886 /// The single, hard-coded set of rate limits. There is no other public
887 /// constructor — see the type-level docs.
888 pub const HARD_CODED: Self = Self {
889 max_concurrent_fetches: MAX_CONCURRENT_FETCHES,
890 max_fetches_per_second: MAX_FETCHES_PER_SECOND,
891 per_source_backoff_ms: 200,
892 };
893
894 /// Maximum number of concurrent fetches in flight.
895 pub const fn max_concurrent_fetches(&self) -> u32 {
896 self.max_concurrent_fetches
897 }
898
899 /// Maximum fetch attempts per second across all sources.
900 pub const fn max_fetches_per_second(&self) -> f32 {
901 self.max_fetches_per_second
902 }
903
904 /// Per-source backoff in milliseconds between consecutive requests.
905 pub const fn per_source_backoff_ms(&self) -> u64 {
906 self.per_source_backoff_ms
907 }
908}
909
910/// A successful TDM grant.
911///
912/// Carries the validated API key (`docs/CAPABILITY.md` §1) so that the key
913/// flows from the startup capability gate into the source, rather than each
914/// TDM source re-reading the env var at fetch time (issue #153 — an env
915/// mutation between startup and fetch is otherwise undetectable).
916///
917/// The `api_key` field exists only when at least one `tdm-*` Cargo feature
918/// is compiled in (the `secrecy` dependency is `optional = true` and gated
919/// on those features per ADR-0002, so default release binaries contain no
920/// TDM code path at all). The struct is `#[non_exhaustive]`; the
921/// `tdm-*`-gated `api_key` field is therefore additive, not breaking, for
922/// builds that toggle the feature set.
923///
924/// `docs/CAPABILITY.md` §1 specifies the type as `Secret<String>`; that is
925/// the `secrecy` 0.9 spelling. The workspace pins `secrecy` 0.10, whose
926/// equivalent owned-string secret type is `secrecy::SecretString`
927/// (`= SecretBox<str>`). CAPABILITY.md §1 has been updated to match the
928/// 0.10 API. `Debug` redacts the value.
929///
930/// Implements `Default` so in-crate test fixtures using
931/// `TdmGrant { agree_env_var: ..., ..Default::default() }` keep compiling;
932/// the default `api_key` is an empty secret.
933#[derive(Debug, Clone)]
934#[non_exhaustive]
935pub struct TdmGrant {
936 /// The publisher API key, validated present at startup by
937 /// [`CapabilityProfile::from_env`]. Wrapped in
938 /// `secrecy::SecretString` so `Debug` never prints it; use
939 /// `secrecy::ExposeSecret::expose_secret` at the point of use.
940 ///
941 /// Only present when a `tdm-*` feature is compiled in (see the
942 /// type-level docs and ADR-0002).
943 #[cfg(any(
944 feature = "tdm-elsevier",
945 feature = "tdm-aps",
946 feature = "tdm-springer"
947 ))]
948 pub api_key: secrecy::SecretString,
949 /// Which env var the user used to acknowledge the publisher's ToS.
950 pub agree_env_var: String,
951 /// When the agreement env var was first observed at startup.
952 pub agreed_at: chrono::DateTime<chrono::Utc>,
953}
954
955impl Default for TdmGrant {
956 fn default() -> Self {
957 Self {
958 #[cfg(any(
959 feature = "tdm-elsevier",
960 feature = "tdm-aps",
961 feature = "tdm-springer"
962 ))]
963 api_key: secrecy::SecretString::from(String::new()),
964 agree_env_var: String::new(),
965 agreed_at: chrono::Utc::now(),
966 }
967 }
968}
969
970/// Runtime gate for which sources may be invoked. See `docs/CAPABILITY.md`.
971///
972/// Marked `#[non_exhaustive]` so adding new capability classes is non-breaking.
973/// Pattern-match only against the documented variants and use a wildcard arm.
974///
975/// **Construction**: external callers use [`CapabilityProfile::from_env()`].
976/// Struct-literal construction is blocked outside this crate by
977/// `#[non_exhaustive]`; this is intentional — the type's safety guarantees
978/// rely on the resolution rules in `from_env`. `Default` is **not yet**
979/// implemented; Phase 1 will add it once the field set stabilizes.
980#[derive(Debug, Clone)]
981#[non_exhaustive]
982pub struct CapabilityProfile {
983 /// Tier 1 OA sources are always permitted.
984 pub oa: AlwaysOn,
985 /// Tier 2 metadata access (Phase 4+).
986 pub metadata: MetadataAccess,
987 /// Tier 3 grants are populated only when both env var and feature compile-in are set.
988 pub tdm_elsevier: Option<TdmGrant>,
989 /// Tier 3 grants are populated only when both env var and feature compile-in are set.
990 pub tdm_aps: Option<TdmGrant>,
991 /// Tier 3 grants are populated only when both env var and feature compile-in are set.
992 pub tdm_springer: Option<TdmGrant>,
993 /// Hard-coded rate limits for this process.
994 pub rate_limits: RateLimits,
995}
996
997/// Errors that can arise during `CapabilityProfile::from_env`.
998#[derive(Debug, thiserror::Error)]
999pub enum CapabilityError {
1000 /// User set the agree env var but provided no key. See `docs/CAPABILITY.md` §2.
1001 #[error("env {agree_var} is set but {key_var} is missing")]
1002 AgreedButNoKey {
1003 /// The agreement env var the user set.
1004 agree_var: String,
1005 /// The key env var that should accompany it.
1006 key_var: String,
1007 },
1008 /// Key env var is set but user has not agreed. See `docs/CAPABILITY.md` §2.
1009 #[error("key for {agree_var} is present but {agree_var} is not set to '1'")]
1010 KeyButNotAgreed {
1011 /// The agreement env var the user must set to `1` before the key takes effect.
1012 agree_var: String,
1013 },
1014}
1015
1016impl CapabilityProfile {
1017 /// Read the runtime profile from environment variables.
1018 ///
1019 /// Implements the resolution algorithm specified in
1020 /// [`docs/CAPABILITY.md`](../../../docs/CAPABILITY.md) §2.
1021 ///
1022 /// # Tier 1 (Open Access)
1023 ///
1024 /// Always permitted; not gated on any env var or feature.
1025 ///
1026 /// # Tier 2 (metadata)
1027 ///
1028 /// Each metadata source becomes available when its env var is set
1029 /// (presence-checked, value ignored) **and** the `metadata` Cargo feature
1030 /// was compiled in. If the env var is set but the feature is not compiled
1031 /// in, a `tracing::warn!` is emitted and the source is left disabled —
1032 /// this is not an error so that users can move binaries between machines
1033 /// (or switch feature sets between cargo invocations) without breaking
1034 /// startup. See `docs/CAPABILITY.md` §3 for the env var list.
1035 ///
1036 /// # Tier 3 (TDM)
1037 ///
1038 /// For each publisher in `{ELSEVIER, APS, SPRINGER}`, the
1039 /// `DOIGET_AGREE_TDM_<X>` agreement env var is paired with
1040 /// `DOIGET_KEY_<X>`. Resolution rules (per `docs/CAPABILITY.md` §2):
1041 ///
1042 /// - both unset → `tdm_<x> = None` (no error);
1043 /// - `agree == "1"` and key set → `Some(TdmGrant { .. })` (subject to the
1044 /// feature gate below);
1045 /// - `agree == "1"` and key unset → [`CapabilityError::AgreedButNoKey`];
1046 /// - key set but `agree` unset (or `agree != "1"`) →
1047 /// [`CapabilityError::KeyButNotAgreed`].
1048 ///
1049 /// When both env vars are set correctly **but** the corresponding
1050 /// `tdm-<x>` Cargo feature is not compiled in, this function emits a
1051 /// `tracing::warn!` and sets the grant to `None` rather than returning an
1052 /// error — same rationale as for the Tier 2 warn-and-skip behavior.
1053 ///
1054 /// # Precondition: tracing subscriber must be installed first
1055 ///
1056 /// Warn breadcrumbs are delivered via `tracing::warn!`. Callers MUST
1057 /// install a `tracing-subscriber` (or equivalent) **before** invoking
1058 /// this function, otherwise warnings are silently dropped. The
1059 /// `doiget-cli` binary does this in `main.rs`.
1060 ///
1061 /// # Errors
1062 ///
1063 /// Returns [`CapabilityError::AgreedButNoKey`] or
1064 /// [`CapabilityError::KeyButNotAgreed`] when the TDM env-var pair for any
1065 /// publisher is misconfigured. See the variant docs for the precise
1066 /// trigger conditions.
1067 ///
1068 /// # Note on `api_key` storage
1069 ///
1070 /// When a `tdm-*` feature is compiled in, [`TdmGrant`] carries the
1071 /// validated key as `secrecy::SecretString` (issue #153). The key is
1072 /// read exactly once here, at startup; TDM sources consume it from the
1073 /// grant and never re-read the env var at fetch time. This makes the
1074 /// grant a true startup attestation — an env mutation between startup
1075 /// and fetch can no longer silently change the credential in flight.
1076 /// See the [`TdmGrant`] doc-comment and `docs/CAPABILITY.md` §1/§2.
1077 pub fn from_env() -> Result<Self, CapabilityError> {
1078 // Issue #153: the validated API key is now threaded through
1079 // `TdmGrant` (as `secrecy::SecretString`, behind the `tdm-*`
1080 // features) by `resolve_tdm_grant` below — sources no longer
1081 // re-read the key env var at fetch time. See the `TdmGrant`
1082 // doc-comment and `docs/CAPABILITY.md` §1/§2.
1083
1084 // -- Tier 2 metadata -------------------------------------------------
1085 let metadata = MetadataAccess {
1086 openalex: resolve_metadata_flag(
1087 "DOIGET_ENABLE_OPENALEX",
1088 "metadata",
1089 cfg!(feature = "metadata"),
1090 ),
1091 semantic_scholar: resolve_metadata_flag(
1092 "DOIGET_ENABLE_S2",
1093 "metadata",
1094 cfg!(feature = "metadata"),
1095 ),
1096 doaj: resolve_metadata_flag(
1097 "DOIGET_ENABLE_DOAJ",
1098 "metadata",
1099 cfg!(feature = "metadata"),
1100 ),
1101 };
1102
1103 // -- Tier 3 TDM grants ----------------------------------------------
1104 let tdm_elsevier = resolve_tdm_grant(
1105 "DOIGET_AGREE_TDM_ELSEVIER",
1106 "DOIGET_KEY_ELSEVIER",
1107 "tdm-elsevier",
1108 cfg!(feature = "tdm-elsevier"),
1109 )?;
1110 let tdm_aps = resolve_tdm_grant(
1111 "DOIGET_AGREE_TDM_APS",
1112 "DOIGET_KEY_APS",
1113 "tdm-aps",
1114 cfg!(feature = "tdm-aps"),
1115 )?;
1116 let tdm_springer = resolve_tdm_grant(
1117 "DOIGET_AGREE_TDM_SPRINGER",
1118 "DOIGET_KEY_SPRINGER",
1119 "tdm-springer",
1120 cfg!(feature = "tdm-springer"),
1121 )?;
1122
1123 Ok(Self {
1124 oa: AlwaysOn,
1125 metadata,
1126 tdm_elsevier,
1127 tdm_aps,
1128 tdm_springer,
1129 rate_limits: RateLimits::HARD_CODED,
1130 })
1131 }
1132}
1133
1134/// Resolve a Tier 2 metadata flag from its env var and compile-in feature.
1135///
1136/// Returns `true` only when both the env var is present and the feature is
1137/// compiled in. When the env var is set without the feature, emits a
1138/// `tracing::warn!` and returns `false` — see [`CapabilityProfile::from_env`]
1139/// for the rationale (binaries may move between hosts / feature sets).
1140fn resolve_metadata_flag(env_var: &str, feature: &str, feature_enabled: bool) -> bool {
1141 let env_set = std::env::var_os(env_var).is_some();
1142 match (env_set, feature_enabled) {
1143 (true, true) => true,
1144 (true, false) => {
1145 tracing::warn!(
1146 env_var,
1147 feature,
1148 "{} is set but feature {} was not compiled in; the source will be unavailable",
1149 env_var,
1150 feature
1151 );
1152 false
1153 }
1154 (false, _) => false,
1155 }
1156}
1157
1158/// Resolve a Tier 3 TDM grant from the `agree`/`key` env-var pair and the
1159/// per-publisher Cargo feature.
1160///
1161/// Implements the rules in `docs/CAPABILITY.md` §2:
1162///
1163/// - both unset → `Ok(None)`.
1164/// - `agree == "1"` and `key` set → `Ok(Some(TdmGrant { .. }))` (when the
1165/// feature is enabled), or warn-and-`Ok(None)` (when the feature is not
1166/// compiled in).
1167/// - `agree == "1"` and `key` unset →
1168/// [`CapabilityError::AgreedButNoKey`].
1169/// - `key` set and `agree` unset OR `agree` set to anything other than `"1"`
1170/// → [`CapabilityError::KeyButNotAgreed`].
1171fn resolve_tdm_grant(
1172 agree_var: &str,
1173 key_var: &str,
1174 feature: &str,
1175 feature_enabled: bool,
1176) -> Result<Option<TdmGrant>, CapabilityError> {
1177 // `agree` is "agreed" iff the value is exactly the literal "1"; any other
1178 // value (including "true", "yes", empty) is treated as not-agreed per
1179 // `docs/CAPABILITY.md` §2.
1180 let agree_raw = std::env::var(agree_var).ok();
1181 let agreed = matches!(agree_raw.as_deref(), Some("1"));
1182 let agree_present = agree_raw.is_some();
1183 // Read the key value once, at startup, so the validated key flows
1184 // through `TdmGrant` and sources never re-read the env (issue #153).
1185 // An empty value is treated as "not set" — an empty API key cannot
1186 // authenticate, and silently constructing a grant around it would
1187 // mask the misconfiguration the AgreedButNoKey rule exists to surface.
1188 let key_value = std::env::var(key_var).ok().filter(|v| !v.is_empty());
1189
1190 match (agreed, agree_present, key_value) {
1191 (true, _, Some(key)) => {
1192 if feature_enabled {
1193 Ok(Some(build_tdm_grant(agree_var, key)))
1194 } else {
1195 // `key` is dropped here; under no-tdm builds it is the only
1196 // consumer of the owned `String`, which is intended.
1197 let _ = key;
1198 tracing::warn!(
1199 env_var = agree_var,
1200 feature,
1201 "{} is set but feature {} was not compiled in; the source will be unavailable",
1202 agree_var,
1203 feature
1204 );
1205 Ok(None)
1206 }
1207 }
1208 (true, _, None) => Err(CapabilityError::AgreedButNoKey {
1209 agree_var: agree_var.to_string(),
1210 key_var: key_var.to_string(),
1211 }),
1212 // agree set to non-"1", key also set: KeyButNotAgreed (the key would
1213 // otherwise authorize the source without an explicit agreement).
1214 (false, true, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1215 agree_var: agree_var.to_string(),
1216 }),
1217 // agree unset, key set: KeyButNotAgreed (same rule).
1218 (false, false, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
1219 agree_var: agree_var.to_string(),
1220 }),
1221 // agree set to non-"1" and no key: treat as no-grant. The user
1222 // expressed something but did not opt in and provided no credential,
1223 // so silent skip is the safe default (no source enabled).
1224 (false, true, None) => Ok(None),
1225 // Neither env var set: no grant, no error.
1226 (false, false, None) => Ok(None),
1227 }
1228}
1229
1230/// Construct a [`TdmGrant`] from the validated agreement var and key value.
1231///
1232/// Split out so the `tdm-*`-gated `api_key` field is populated in exactly
1233/// one place. When no `tdm-*` feature is compiled in the `key` is consumed
1234/// (dropped) here — the grant is still produced so that startup attestation
1235/// behavior (the warn-and-skip path) does not change shape between feature
1236/// sets.
1237fn build_tdm_grant(agree_var: &str, key: String) -> TdmGrant {
1238 #[cfg(any(
1239 feature = "tdm-elsevier",
1240 feature = "tdm-aps",
1241 feature = "tdm-springer"
1242 ))]
1243 {
1244 TdmGrant {
1245 api_key: secrecy::SecretString::from(key),
1246 agree_env_var: agree_var.to_string(),
1247 agreed_at: chrono::Utc::now(),
1248 }
1249 }
1250 #[cfg(not(any(
1251 feature = "tdm-elsevier",
1252 feature = "tdm-aps",
1253 feature = "tdm-springer"
1254 )))]
1255 {
1256 let _ = key;
1257 TdmGrant {
1258 agree_env_var: agree_var.to_string(),
1259 agreed_at: chrono::Utc::now(),
1260 }
1261 }
1262}
1263
1264// ---------------------------------------------------------------------------
1265// Tests — one smoke test per legally-load-bearing constant. See
1266// `docs/LEGAL.md` §6 safeguard 8 and `docs/PHASES.md` §4. These also keep the
1267// `cargo test --workspace` job from being a false-green during Phase 0.
1268// ---------------------------------------------------------------------------
1269
1270// `expect`/`unwrap` are idiomatic in tests where panics double as assertions.
1271// The workspace lints deny them in production code; relax for the test module
1272// only.
1273#[cfg(test)]
1274#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1275mod tests {
1276 use super::*;
1277
#[test]
fn rate_limits_hard_coded_match_legal_safeguards() {
    // These exact values are the ones named by docs/LEGAL.md §6 safeguard 8.
    let limits = RateLimits::HARD_CODED;

    assert_eq!(limits.max_concurrent_fetches(), 5);
    // The rate is an f32, so compare within machine epsilon.
    assert!((limits.max_fetches_per_second() - 5.0).abs() < f32::EPSILON);
    assert_eq!(limits.per_source_backoff_ms(), 200);
}
1285
#[test]
fn batch_size_caps_match_security_doc() {
    // Pinned by docs/SECURITY.md §1.4 + docs/MCP_TOOLS.md.
    assert_eq!(MCP_BATCH_MAX_SIZE, 100);
    assert_eq!(MCP_QUEUE_DEPTH_MAX, 100);
    assert_eq!(DOI_SUFFIX_MAX_LEN, 256);
    assert_eq!(MCP_STDIN_EOF_SHUTDOWN_SEC, 5);

    // Slice 2: `MAX_BATCH_REFS` is the spec-language alias for
    // `MCP_BATCH_MAX_SIZE`, so the two must agree numerically.
    assert_eq!(MAX_BATCH_REFS, MCP_BATCH_MAX_SIZE);
}
1297
#[test]
fn schema_version_is_pinned_to_1_0() {
    // docs/STORE.md §3: Phase 0/1 writes exactly 1.0. Moving to 1.1 (a
    // minor bump with backward-compatible additions) means updating this
    // test and the cross-tool compat fixtures at the same time.
    assert_eq!(SCHEMA_VERSION, "1.0");
}
1305
1306 // -----------------------------------------------------------------
1307 // CapabilityProfile::from_env — Phase 1 resolution algorithm tests.
1308 //
1309 // These tests mutate process-global env state via std::env::set_var /
1310 // remove_var, so each test holds an `EnvGuard` RAII drop guard that
1311 // captures the pre-test value of every env var it touches and restores
1312 // it on drop (even on panic). They also use `#[serial_test::serial]` so
1313 // that no two tests in this module touch env state concurrently — the
1314 // workspace's test runner defaults to multi-threaded.
1315 //
1316 // Spec: docs/CAPABILITY.md §2 (resolution algorithm) and §3 (env var
1317 // reference table).
1318 // -----------------------------------------------------------------
1319
/// Drop guard for a single env var: it remembers the value the var had when
/// the guard was created and puts that state back when the guard is dropped.
/// Use one guard per touched var per test.
struct EnvGuard {
    var: &'static str,
    prior: Option<std::ffi::OsString>,
}

impl EnvGuard {
    /// Record the current state of `var` without modifying it.
    fn snapshot(var: &'static str) -> Self {
        Self {
            var,
            prior: std::env::var_os(var),
        }
    }

    /// Capture and clear `var`. Use `set` afterwards to install a value.
    fn unset(var: &'static str) -> Self {
        let guard = Self::snapshot(var);
        // SAFETY (env mutation): tests are serialized via
        // `#[serial_test::serial]`, so no other thread reads or writes the
        // environment while it is being changed.
        std::env::remove_var(var);
        guard
    }

    /// Capture, then set `var` to `value`.
    fn set(var: &'static str, value: &str) -> Self {
        let guard = Self::snapshot(var);
        std::env::set_var(var, value);
        guard
    }
}

impl Drop for EnvGuard {
    /// Restore the captured state: re-install the prior value, or remove
    /// the var if it was not set before.
    fn drop(&mut self) {
        if let Some(previous) = self.prior.as_ref() {
            std::env::set_var(self.var, previous);
        } else {
            std::env::remove_var(self.var);
        }
    }
}
1354
1355 /// Convenience: unset every Tier 2 / Tier 3 env var the resolution
1356 /// algorithm reads, returning a vector of guards that restore them on
1357 /// drop. Callers can then `EnvGuard::set` individual vars on top.
1358 fn unset_all_capability_env_vars() -> Vec<EnvGuard> {
1359 [
1360 "DOIGET_ENABLE_OPENALEX",
1361 "DOIGET_ENABLE_S2",
1362 "DOIGET_ENABLE_DOAJ",
1363 "DOIGET_AGREE_TDM_ELSEVIER",
1364 "DOIGET_KEY_ELSEVIER",
1365 "DOIGET_AGREE_TDM_APS",
1366 "DOIGET_KEY_APS",
1367 "DOIGET_AGREE_TDM_SPRINGER",
1368 "DOIGET_KEY_SPRINGER",
1369 ]
1370 .iter()
1371 .map(|v| EnvGuard::unset(v))
1372 .collect()
1373 }
1374
#[test]
#[serial_test::serial]
fn from_env_no_env_vars_set_returns_tier_1_only() {
    // Rule: when none of the relevant env vars is set, the profile has no
    // TDM grant and no metadata flag enabled, while the hard-coded rate
    // limits still apply. (Replaces the old Phase 0 stub test.)
    let _guards = unset_all_capability_env_vars();

    let profile = CapabilityProfile::from_env().expect("clean env never errors");

    for grant in [
        &profile.tdm_elsevier,
        &profile.tdm_aps,
        &profile.tdm_springer,
    ] {
        assert!(grant.is_none());
    }
    for flag in [
        profile.metadata.openalex,
        profile.metadata.semantic_scholar,
        profile.metadata.doaj,
    ] {
        assert!(!flag);
    }
    assert_eq!(profile.rate_limits.max_concurrent_fetches(), 5);
}
1392
#[test]
#[serial_test::serial]
fn from_env_no_tdm_returns_tier_1_profile() {
    // Rule (CAPABILITY.md §2): if no TDM env var is set, resolution
    // succeeds and every `tdm_*` field is `None`.
    let _guards = unset_all_capability_env_vars();

    let profile = CapabilityProfile::from_env().expect("no TDM env -> Ok");

    for grant in [
        &profile.tdm_elsevier,
        &profile.tdm_aps,
        &profile.tdm_springer,
    ] {
        assert!(grant.is_none());
    }
}
1405
#[test]
#[serial_test::serial]
fn from_env_agreed_but_no_key_errs() {
    // Rule (CAPABILITY.md §2): agree=1 with the key unset must fail with
    // AgreedButNoKey, naming both env vars.
    let _guards = unset_all_capability_env_vars();
    let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");

    match CapabilityProfile::from_env() {
        Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
            assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
            assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
        }
        unexpected => panic!("expected AgreedButNoKey, got {:?}", unexpected),
    }
}
1422
#[test]
#[serial_test::serial]
fn from_env_agreed_but_empty_key_errs() {
    // Security-adjacent (PR #161 review): `resolve_tdm_grant` treats an
    // *empty* key string as "not set". So agree=1 together with
    // DOIGET_KEY_ELSEVIER="" has to surface as AgreedButNoKey rather than
    // silently producing a grant around an empty secret that could never
    // authenticate.
    let _guards = unset_all_capability_env_vars();
    let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");

    match CapabilityProfile::from_env() {
        Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
            assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
            assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
        }
        unexpected => panic!(
            "expected AgreedButNoKey for empty key, got {:?}",
            unexpected
        ),
    }
}
1444
#[test]
#[serial_test::serial]
fn from_env_empty_key_without_agree_is_no_grant() {
    // Security-adjacent (PR #161 review): with the agree var unset, an
    // empty key cannot be told apart from "no key at all". The result must
    // be Ok with no grant and no error: an empty string is not a
    // credential, so it must NOT trigger the KeyButNotAgreed
    // leaked-credential rule.
    let _guards = unset_all_capability_env_vars();
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");

    let profile = CapabilityProfile::from_env()
        .expect("empty key + agree unset must be Ok(None), not an error");

    assert!(
        profile.tdm_elsevier.is_none(),
        "empty DOIGET_KEY_ELSEVIER with no agree var must yield no grant"
    );
    assert!(profile.tdm_aps.is_none());
    assert!(profile.tdm_springer.is_none());
}
1465
#[test]
#[serial_test::serial]
fn from_env_key_but_not_agreed_errs() {
    // Rule (CAPABILITY.md §2): a key with the agree var unset must fail
    // with KeyButNotAgreed. A leaked DOIGET_KEY_ELSEVIER must never enable
    // a source silently.
    let _guards = unset_all_capability_env_vars();
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");

    match CapabilityProfile::from_env() {
        Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
            assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
        }
        unexpected => panic!("expected KeyButNotAgreed, got {:?}", unexpected),
    }
}
1482
#[test]
#[serial_test::serial]
fn from_env_agree_not_one_errs() {
    // Rule (CAPABILITY.md §2): only the exact value "1" counts as an
    // agreement. Anything else (here: "true") is not-agreed, and with a key
    // also set that yields KeyButNotAgreed.
    let _guards = unset_all_capability_env_vars();
    let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "true");
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");

    match CapabilityProfile::from_env() {
        Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
            assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
        }
        unexpected => panic!("expected KeyButNotAgreed, got {:?}", unexpected),
    }
}
1501
#[test]
#[serial_test::serial]
fn from_env_both_set_correctly_returns_grant() {
    // Rule (CAPABILITY.md §2): agree=1 plus a key gives Some(TdmGrant) if
    // the matching feature is compiled in, and None (warn-and-skip)
    // otherwise. Elsevier is gated by `tdm-elsevier`; `cfg!` selects which
    // of the two outcomes this build asserts.
    let _guards = unset_all_capability_env_vars();
    let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
    let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");

    let profile = CapabilityProfile::from_env().expect("agree=1 + key -> Ok");

    if !cfg!(feature = "tdm-elsevier") {
        assert!(
            profile.tdm_elsevier.is_none(),
            "feature tdm-elsevier NOT compiled in -> None (warn-and-skip)"
        );
        return;
    }

    let grant = profile
        .tdm_elsevier
        .as_ref()
        .expect("feature tdm-elsevier compiled in -> Some(TdmGrant)");
    assert_eq!(grant.agree_env_var, "DOIGET_AGREE_TDM_ELSEVIER");

    // Issue #153 / PR #161 review: show that the key really was threaded
    // into TdmGrant::api_key at startup, not merely that the agree var was
    // recorded. The field is cfg-gated on the `tdm-*` feature set, so the
    // check carries the identical gate.
    #[cfg(any(
        feature = "tdm-elsevier",
        feature = "tdm-aps",
        feature = "tdm-springer"
    ))]
    {
        use secrecy::ExposeSecret as _;
        assert_eq!(
            grant.api_key.expose_secret(),
            "sk-test",
            "the DOIGET_KEY_ELSEVIER value must be threaded into \
             TdmGrant::api_key (issue #153)"
        );
    }
}
1547
#[test]
#[serial_test::serial]
fn from_env_metadata_env_warns_without_feature() {
    // Rule (CAPABILITY.md §2): a metadata env var without the `metadata`
    // feature leaves the source disabled (warn-and-skip, not an error).
    // The tracing warning itself is not captured; the test only checks that
    // the flag mirrors whether the feature was compiled in.
    let _guards = unset_all_capability_env_vars();
    let _enable = EnvGuard::set("DOIGET_ENABLE_OPENALEX", "1");

    let profile = CapabilityProfile::from_env().expect("metadata env never errors");

    assert_eq!(profile.metadata.openalex, cfg!(feature = "metadata"));
}
1566
// -----------------------------------------------------------------
// Safekey reference vectors (docs/SAFEKEY.md §3, NORMATIVE).
//
// The vectors.json file is the binding cross-tool contract with
// BiblioFetch.jl: every entry MUST round-trip identically through
// both implementations. Phase 0 initially shipped 13 entries, with the
// full 100-entry set gated on the BiblioFetch.jl pre-flight (ADR-0007
// Status: Proposed at the time of the Phase 1 implementation). The
// fixture now carries the full 100-entry set, and
// `safekey_matches_reference_vectors` pins that count exactly.
//
// `Ref::parse` is concurrent W3-A work and is not on `main` yet, so
// this test branches on the input prefix (`doi:` / `arxiv:`) and
// constructs the variant directly via the in-crate `pub(crate)`
// tuple constructor.
// -----------------------------------------------------------------
1581
1582 #[derive(Deserialize)]
1583 struct SafekeyVector {
1584 input: String,
1585 expected: String,
1586 }
1587
1588 #[derive(Deserialize)]
1589 struct SafekeyVectorFile {
1590 vectors: Vec<SafekeyVector>,
1591 }
1592
1593 /// In-crate test helper: build a `Ref` from the user-facing form used
1594 /// in the vectors file, by stripping the `doi:` / `arxiv:` URI scheme
1595 /// and wrapping the remainder. This bypasses validation; it is fine
1596 /// here because the vectors are hand-curated and the test asserts the
1597 /// derivation algorithm, not parser semantics.
1598 fn ref_from_vector_input(input: &str) -> Ref {
1599 if let Some(rest) = input.strip_prefix("doi:") {
1600 Ref::Doi(Doi(rest.to_string()))
1601 } else if let Some(rest) = input.strip_prefix("arxiv:") {
1602 Ref::Arxiv(ArxivId(rest.to_string()))
1603 } else {
1604 panic!(
1605 "vectors.json entry has unknown ref scheme (expected doi: or arxiv: prefix): {}",
1606 input
1607 );
1608 }
1609 }
1610
#[test]
fn safekey_matches_reference_vectors() {
    // `include_str!` resolves its path relative to the file that contains
    // the macro call (crates/doiget-core/src/lib.rs): three levels up is
    // the workspace root, and from there down into tests/fixtures.
    let raw = include_str!("../../../tests/fixtures/safekey/vectors.json");
    let parsed: SafekeyVectorFile =
        serde_json::from_str(raw).expect("vectors.json is valid JSON matching schema");

    // Phase 0 final ships the full NORMATIVE 100-entry set
    // (docs/SAFEKEY.md §5). The fixture is the binding cross-tool contract
    // with BiblioFetch.jl, so the count guard is an exact `== 100`: the set
    // cannot silently grow or shrink without a coordinated ADR bump (per
    // the docs/SAFEKEY.md status block).
    let total = parsed.vectors.len();
    assert_eq!(
        total, 100,
        "vectors.json MUST be exactly 100 entries (NORMATIVE per docs/SAFEKEY.md §5); got {}",
        total
    );

    // Derive every key and keep a description of each mismatch, so that a
    // single run reports all failing vectors rather than only the first.
    let failures: Vec<String> = parsed
        .vectors
        .iter()
        .filter_map(|vector| {
            let got = ref_from_vector_input(&vector.input)
                .safekey()
                .as_str()
                .to_string();
            if got == vector.expected {
                None
            } else {
                Some(format!(
                    "input={:?}\n expected={:?}\n got ={:?}",
                    vector.input, vector.expected, got
                ))
            }
        })
        .collect();

    assert!(
        failures.is_empty(),
        "{}/{} safekey reference vectors failed:\n{}",
        failures.len(),
        total,
        failures.join("\n")
    );
}
1652
#[test]
fn safekey_truncates_long_inputs_with_sha256_suffix() {
    use sha2::Digest;

    // A synthetic DOI with a 220-char ASCII-safe suffix: together with the
    // `doi_10.1234/` prefix the escaped form is well past 192 chars, so the
    // truncation branch of docs/SAFEKEY.md §3 step 5 must fire. The key is
    // then 192 (kept prefix) + 1 (`_`) + 8 (hex of the first 4 bytes of
    // SHA-256(raw)) = 201 chars.
    let bare_doi = format!("10.1234/{}", "a".repeat(220));
    let key = Ref::Doi(Doi(bare_doi.clone())).safekey();
    let s = key.as_str();

    // Shape check: <192 chars> + "_" + <8 lowercase hex chars>.
    assert_eq!(
        s.len(),
        201,
        "expected 201-char truncated key, got {}: {}",
        s.len(),
        s
    );
    let (head_and_separator, hash_part) = s.split_at(193);
    assert_eq!(
        &head_and_separator[192..],
        "_",
        "expected '_' separator at byte 192"
    );
    assert_eq!(hash_part.len(), 8, "hash suffix must be 8 hex chars");
    let is_lowercase_hex = hash_part
        .chars()
        .all(|c| matches!(c, '0'..='9' | 'a'..='f'));
    assert!(
        is_lowercase_hex,
        "hash suffix must be lowercase hex: {}",
        hash_part
    );

    // Deriving the key again from an equal input must give the same key.
    let key2 = Ref::Doi(Doi(bare_doi.clone())).safekey();
    assert_eq!(s, key2.as_str(), "safekey must be deterministic");

    // The suffix must be hex(sha256(raw)[..4]), where `raw` is the
    // pre-escape prefixed form (docs/SAFEKEY.md §3 step 5).
    let raw = format!("doi_{}", bare_doi);
    let digest = sha2::Sha256::digest(raw.as_bytes());
    let expected_hash: String = digest
        .iter()
        .take(4)
        .map(|byte| format!("{:02x}", byte))
        .collect();
    assert_eq!(
        hash_part, expected_hash,
        "hash must match SHA-256 of raw form"
    );
}
1704
1705 // -----------------------------------------------------------------
1706 // Doi::parse / ArxivId::parse / Ref::parse — Phase 1 W3-A.
1707 // Spec: docs/SECURITY.md §1.1 (input validation). The rejection
1708 // category set is the binding contract; each test case below names
1709 // which rule it exercises in a comment.
1710 // -----------------------------------------------------------------
1711
1712 // ---- Doi::parse happy paths (≥6) --------------------------------
1713
#[test]
fn doi_parse_accepts_bare_canonical_form() {
    // The canonical bare form "10.<registrant>/<suffix>" is accepted and
    // stored unchanged.
    let input = "10.1234/example";
    let parsed = Doi::parse(input).expect("canonical bare DOI");
    assert_eq!(parsed.as_str(), input);
}
1720
#[test]
fn doi_parse_accepts_doi_uri_scheme() {
    // A leading `doi:` scheme is dropped at construction, so `as_str`
    // yields the bare identifier (docs/SAFEKEY.md §3 step 0).
    let bare = "10.1234/example";
    let parsed = Doi::parse("doi:10.1234/example").expect("doi: scheme accepted");
    assert_eq!(parsed.as_str(), bare);
}
1728
#[test]
fn doi_parse_accepts_complex_real_world_suffix() {
    // A real PhysRevLett DOI (also used elsewhere in the fixture set):
    // mixed-case letters, digits and several `.` separators in the suffix
    // are accepted and preserved verbatim.
    let input = "10.1103/PhysRevLett.130.200601";
    let parsed = Doi::parse(input).expect("real-world PhysRev DOI");
    assert_eq!(parsed.as_str(), input);
}
1736
#[test]
fn doi_parse_accepts_parens_in_suffix() {
    // `(` and `)` are explicitly part of the spec's suffix charset.
    let input = "10.1016/S0370-1573(98)00122-3";
    let parsed = Doi::parse(input).expect("parens in suffix");
    assert_eq!(parsed.as_str(), input);
}
1743
#[test]
fn doi_parse_accepts_nested_slashes_in_suffix() {
    // Only the first `/` separates registrant from suffix; later slashes
    // are ordinary suffix characters.
    let input = "10.1234/foo/bar/baz";
    let parsed = Doi::parse(input).expect("nested slashes");
    assert_eq!(parsed.as_str(), input);
}
1751
#[test]
fn doi_parse_accepts_colon_in_legacy_kluwer_suffix() {
    // #194: legacy Kluwer/Springer DOIs of the form `10.1023/A:NNNNNNNNNN`
    // have a `:` in the suffix. The input is a real DOI: "Entanglement,
    // Quantum Phase Transitions, and DMRG" (Kluwer, 2002).
    let input = "10.1023/A:1019601218492";
    let parsed = Doi::parse(input).expect("legacy Kluwer colon DOI");
    assert_eq!(parsed.as_str(), input);
}
1760
#[test]
fn doi_parse_accepts_colon_in_edp_jphys_suffix() {
    // #194: the EDP Sciences / Journal de Physique legacy corpus uses
    // `10.1051/jphys:NNNNNNNNNNNNNNNNN`. Both inputs are real DOIs from the
    // dogfood Ising-RG run and resolve at doi.org and via Crossref.
    // Each row: (input, expected stored form, failure message).
    let cases = [
        (
            "10.1051/jphys:0198900500120136500",
            "10.1051/jphys:0198900500120136500",
            "EDP jphys colon DOI",
        ),
        (
            "doi:10.1051/jphys:0198500460100164500",
            "10.1051/jphys:0198500460100164500",
            "scheme + colon",
        ),
    ];
    for (input, stored, why) in cases {
        let parsed = Doi::parse(input).expect(why);
        assert_eq!(parsed.as_str(), stored);
    }
}
1771
#[test]
fn doi_parse_rejects_semicolon_in_suffix() {
    // #194 / ADR-0026: `;` sits right next to `:` in ASCII and is
    // explicitly EXCLUDED from the suffix charset extension (ADR-0026
    // §"Out of scope"). This guards against an over-broad `matches!` arm
    // (for instance an accidental `':'..=';'` range) letting `;` in
    // together with `:`.
    let result = Doi::parse("10.1234/foo;bar");
    let rejected_semicolon = matches!(
        result,
        Err(RefParseError::InvalidDoiSuffixChar { ch: ';' })
    );
    assert!(
        rejected_semicolon,
        "expected InvalidDoiSuffixChar with ch=';', got {:?}",
        result
    );
}
1786
#[test]
fn doi_parse_accepts_suffix_at_max_len_boundary() {
    // Rule: a suffix of exactly DOI_SUFFIX_MAX_LEN bytes is accepted;
    // 1 byte more is rejected (covered separately below).
    let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN);
    let input = format!("10.1234/{}", suffix);
    let d = Doi::parse(&input).expect("suffix at max len");
    assert_eq!(d.as_str().len(), "10.1234/".len() + DOI_SUFFIX_MAX_LEN);
    // The length check alone would also pass for a parser that altered the
    // content at the boundary, so pin the stored form to the exact input,
    // as the other happy-path tests do.
    assert_eq!(
        d.as_str(),
        input.as_str(),
        "boundary-length DOI must be stored unchanged"
    );
}
1796
#[test]
fn doi_parse_uri_scheme_is_case_insensitive() {
    // Scheme casing is treated leniently: the scheme is stripped in any
    // case, so the stored form is the same bare identifier.
    let bare = "10.1234/example";
    let parsed = Doi::parse("DOI:10.1234/example").expect("uppercase scheme");
    assert_eq!(parsed.as_str(), bare);
}
1804
1805 // ---- Doi::parse rejection paths (≥6) ----------------------------
1806
#[test]
fn doi_parse_rejects_missing_10_prefix() {
    // A DOI must begin with the literal "10.".
    let outcome = Doi::parse("11.1234/example");
    assert_eq!(outcome, Err(RefParseError::MissingDoiPrefix));
}
1815
#[test]
fn doi_parse_rejects_empty_input() {
    // The empty string is never a valid DOI.
    let outcome = Doi::parse("");
    assert_eq!(outcome, Err(RefParseError::Empty));
}
1821
#[test]
fn doi_parse_rejects_missing_suffix_separator() {
    // A `/` between registrant and suffix is mandatory.
    let outcome = Doi::parse("10.1234");
    assert_eq!(outcome, Err(RefParseError::MissingDoiSuffixSeparator));
}
1830
#[test]
fn doi_parse_rejects_empty_suffix() {
    // The suffix after the separator must not be empty.
    let outcome = Doi::parse("10.1234/");
    assert_eq!(outcome, Err(RefParseError::EmptyDoiSuffix));
}
1836
#[test]
fn doi_parse_rejects_invalid_registrant_too_short() {
    // The registrant must be 4–9 digits long; two digits is too short.
    let outcome = Doi::parse("10.12/example");
    assert_eq!(outcome, Err(RefParseError::InvalidDoiRegistrant));
}
1845
#[test]
fn doi_parse_rejects_non_digit_registrant() {
    // Every registrant character must be an ASCII digit.
    let outcome = Doi::parse("10.12ab/example");
    assert_eq!(outcome, Err(RefParseError::InvalidDoiRegistrant));
}
1854
#[test]
fn doi_parse_rejects_control_char_in_suffix() {
    // docs/SECURITY.md §1.1 (log-injection mitigation): control characters
    // are outside the suffix charset and must be rejected before they can
    // reach the provenance log.
    let result = Doi::parse("10.1234/foo\nbar");
    let rejected_newline = matches!(
        result,
        Err(RefParseError::InvalidDoiSuffixChar { ch: '\n' })
    );
    assert!(rejected_newline, "got {:?}", result);
}
1870
#[test]
fn doi_parse_rejects_suffix_over_max_len() {
    // A suffix of DOI_SUFFIX_MAX_LEN + 1 bytes is rejected, and the error
    // reports both the offending length and the limit.
    let input = format!("10.1234/{}", "a".repeat(DOI_SUFFIX_MAX_LEN + 1));
    let outcome = Doi::parse(&input);
    if let Err(RefParseError::DoiSuffixTooLong { len, max }) = &outcome {
        assert_eq!(*len, DOI_SUFFIX_MAX_LEN + 1);
        assert_eq!(*max, DOI_SUFFIX_MAX_LEN);
    } else {
        panic!("expected DoiSuffixTooLong, got {:?}", outcome);
    }
}
1885
#[test]
fn doi_parse_rejects_non_ascii_in_suffix() {
    // The spec charset is ASCII-only, so a non-ASCII suffix character is
    // reported as InvalidDoiSuffixChar. (Safekey derivation collapsing
    // such characters to '_' is a separate, downstream concern.)
    let result = Doi::parse("10.1234/物理学");
    let rejected_char = matches!(result, Err(RefParseError::InvalidDoiSuffixChar { .. }));
    assert!(rejected_char, "got {:?}", result);
}
1898
1899 // ---- ArxivId::parse happy paths (≥6) ----------------------------
1900
#[test]
fn arxiv_parse_accepts_new_style_4_digit_seq() {
    // New-style id with a 4-digit sequence number: YYMM.NNNN.
    let input = "0704.0001";
    let parsed = ArxivId::parse(input).expect("new-style 4-digit seq");
    assert_eq!(parsed.as_str(), input);
}
1907
#[test]
fn arxiv_parse_accepts_new_style_5_digit_seq() {
    // New-style id with a 5-digit sequence number (post-2015): YYMM.NNNNN.
    let input = "2401.12345";
    let parsed = ArxivId::parse(input).expect("new-style 5-digit seq");
    assert_eq!(parsed.as_str(), input);
}
1914
#[test]
fn arxiv_parse_accepts_new_style_with_version() {
    // An optional `vN` version suffix is accepted and kept.
    let input = "2401.12345v2";
    let parsed = ArxivId::parse(input).expect("with version");
    assert_eq!(parsed.as_str(), input);
}
1921
#[test]
fn arxiv_parse_accepts_old_style() {
    // Old-style id: subject-class/YYMMNNN.
    let input = "cond-mat/9501001";
    let parsed = ArxivId::parse(input).expect("old-style cond-mat");
    assert_eq!(parsed.as_str(), input);
}
1928
#[test]
fn arxiv_parse_accepts_old_style_with_subclass_and_version() {
    // Old-style subject classes may carry a two-uppercase-letter `.XX`
    // subclass, and the id may end in an optional `vN`.
    let input = "astro-ph.CO/0703123v2";
    let parsed = ArxivId::parse(input).expect("old-style with subclass + version");
    assert_eq!(parsed.as_str(), input);
}
1936
#[test]
fn arxiv_parse_accepts_arxiv_uri_scheme() {
    // The `arxiv:` / `arXiv:` scheme is removed at construction.
    let bare = "2401.12345";
    let parsed = ArxivId::parse("arxiv:2401.12345").expect("arxiv: scheme");
    assert_eq!(parsed.as_str(), bare);
}
1943
#[test]
fn arxiv_parse_accepts_arxiv_uri_scheme_mixed_case() {
    // The scheme match ignores case, covering the `arXiv:` spelling named
    // in docs/MCP_TOOLS.md.
    let bare = "2401.12345v2";
    let parsed = ArxivId::parse("arXiv:2401.12345v2").expect("arXiv: scheme");
    assert_eq!(parsed.as_str(), bare);
}
1951
1952 // ---- ArxivId::parse rejection paths (≥6) ------------------------
1953
#[test]
fn arxiv_parse_rejects_empty_input() {
    // Empty input is rejected before any shape checks.
    let outcome = ArxivId::parse("");
    assert_eq!(outcome, Err(RefParseError::Empty));
}
1959
#[test]
fn arxiv_parse_rejects_no_dot_or_slash() {
    // An id needs either a `.` (new-style) or a `/` (old-style).
    let outcome = ArxivId::parse("notanarxivid");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1968
#[test]
fn arxiv_parse_rejects_new_style_wrong_head_length() {
    // The part before the dot must be exactly 4 digits; here it is 3.
    let outcome = ArxivId::parse("240.12345");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1977
#[test]
fn arxiv_parse_rejects_new_style_seq_too_short() {
    // The sequence number must be 4–5 digits; 3 digits is too short.
    let outcome = ArxivId::parse("2401.123");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1986
#[test]
fn arxiv_parse_rejects_old_style_wrong_id_length() {
    // The numeric part of an old-style id is exactly 7 digits; here it is 5.
    let outcome = ArxivId::parse("cond-mat/95001");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
1995
#[test]
fn arxiv_parse_rejects_invalid_version_suffix() {
    // A version suffix is `v` plus at least one digit; a bare trailing
    // `v` is malformed.
    let outcome = ArxivId::parse("2401.12345v");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
2004
#[test]
fn arxiv_parse_rejects_control_char() {
    // docs/SECURITY.md §1.1 (log-injection): control characters such as a
    // trailing newline are never accepted.
    let outcome = ArxivId::parse("2401.12345\n");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
2013
#[test]
fn arxiv_parse_rejects_non_ascii() {
    // arXiv ids are ASCII-only.
    let outcome = ArxivId::parse("2401.物理");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
2022
2023 // ---- Ref::parse happy paths (≥6) --------------------------------
2024
#[test]
fn ref_parse_dispatches_doi_scheme_to_doi() {
    // Detection rule 1: an explicit `doi:` scheme selects the DOI parser.
    let parsed = Ref::parse("doi:10.1234/example").expect("doi: dispatched to Doi");
    if let Ref::Doi(doi) = &parsed {
        assert_eq!(doi.as_str(), "10.1234/example");
    } else {
        panic!("expected Ref::Doi, got {:?}", parsed);
    }
}
2033
#[test]
fn ref_parse_dispatches_arxiv_scheme_to_arxiv() {
    // Detection rule 2: an explicit `arxiv:` scheme selects the arXiv parser.
    let parsed = Ref::parse("arxiv:2401.12345").expect("arxiv: dispatched to Arxiv");
    if let Ref::Arxiv(id) = &parsed {
        assert_eq!(id.as_str(), "2401.12345");
    } else {
        panic!("expected Ref::Arxiv, got {:?}", parsed);
    }
}
2042
#[test]
fn ref_parse_dispatches_arxiv_mixed_case_scheme() {
    // Detection rule 2, case-insensitive: the `arXiv:` spelling also
    // dispatches to the arXiv parser.
    let parsed = Ref::parse("arXiv:cond-mat/9501001").expect("arXiv: dispatched");
    if let Ref::Arxiv(id) = &parsed {
        assert_eq!(id.as_str(), "cond-mat/9501001");
    } else {
        panic!("expected Ref::Arxiv, got {:?}", parsed);
    }
}
2051
#[test]
fn ref_parse_bare_doi_resolves_to_doi() {
    // Detection rule 3: scheme-less input beginning with `10.` is a DOI.
    let parsed = Ref::parse("10.1234/foo").expect("bare DOI");
    if let Ref::Doi(doi) = &parsed {
        assert_eq!(doi.as_str(), "10.1234/foo");
    } else {
        panic!("expected Ref::Doi, got {:?}", parsed);
    }
}
2060
#[test]
fn ref_parse_bare_arxiv_new_resolves_to_arxiv() {
    // Detection rule 4: scheme-less input that does not begin with `10.`
    // falls through to the arXiv parser. This is the ambiguous-input
    // branch named in the PR brief: `2401.12345` must become an ArxivId.
    let parsed = Ref::parse("2401.12345").expect("bare new-style arXiv");
    if let Ref::Arxiv(id) = &parsed {
        assert_eq!(id.as_str(), "2401.12345");
    } else {
        panic!("expected Ref::Arxiv, got {:?}", parsed);
    }
}
2071
#[test]
fn ref_parse_bare_arxiv_old_resolves_to_arxiv() {
    // Detection rule 4 again, this time with a bare old-style arXiv id.
    let parsed = Ref::parse("cond-mat/9501001").expect("bare old-style arXiv");
    if let Ref::Arxiv(id) = &parsed {
        assert_eq!(id.as_str(), "cond-mat/9501001");
    } else {
        panic!("expected Ref::Arxiv, got {:?}", parsed);
    }
}
2080
2081 // ---- Ref::parse rejection paths (≥6) ----------------------------
2082
#[test]
fn ref_parse_rejects_empty() {
    // Empty input is rejected before any scheme detection.
    let outcome = Ref::parse("");
    assert_eq!(outcome, Err(RefParseError::Empty));
}
2088
#[test]
fn ref_parse_doi_scheme_with_invalid_doi_propagates_doi_error() {
    // With an explicit scheme, the DOI parser's own error is returned
    // verbatim rather than a generic "shape mismatch".
    let outcome = Ref::parse("doi:10.1234");
    assert_eq!(outcome, Err(RefParseError::MissingDoiSuffixSeparator));
}
2098
#[test]
fn ref_parse_arxiv_scheme_with_invalid_arxiv_propagates_arxiv_error() {
    // With an explicit `arxiv:` scheme, the arXiv parser's error is
    // returned as-is.
    let outcome = Ref::parse("arxiv:notanid");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
2106
#[test]
fn ref_parse_bare_with_10_prefix_uses_doi_errors() {
    // The bare `10.…` heuristic routes to the DOI parser, whose error is
    // what the caller sees (here: a bad registrant).
    let outcome = Ref::parse("10.12/x");
    assert_eq!(outcome, Err(RefParseError::InvalidDoiRegistrant));
}
2116
#[test]
fn ref_parse_bare_without_10_prefix_uses_arxiv_errors() {
    // The ambiguous bare fallback routes to the ArxivId parser, whose
    // error is what the caller sees. `1.2.3` has neither a DOI nor an
    // arXiv shape.
    let outcome = Ref::parse("1.2.3");
    assert_eq!(outcome, Err(RefParseError::InvalidArxivShape));
}
2123
#[test]
fn ref_parse_rejects_doi_scheme_with_oversized_suffix() {
    // Length bound through Ref::parse: a DOI suffix longer than
    // DOI_SUFFIX_MAX_LEN must surface as DoiSuffixTooLong, not as a
    // generic InvalidArxivShape.
    let input = format!("doi:10.1234/{}", "a".repeat(DOI_SUFFIX_MAX_LEN + 5));
    let outcome = Ref::parse(&input);
    assert!(
        matches!(outcome, Err(RefParseError::DoiSuffixTooLong { .. })),
        "expected DoiSuffixTooLong, got {:?}",
        outcome
    );
}
2135
#[test]
fn ref_parse_round_trip_via_serde_preserves_inner_string() {
    // Wire-format check on the serialize side: a `Ref` parsed from a
    // `doi:`-prefixed input must serialize with the bare identifier and
    // without the scheme. Guards against the scheme leaking into the
    // stored form.
    let parsed = Ref::parse("doi:10.1234/example").expect("parse ok");
    let wire = serde_json::to_string(&parsed).expect("serialize");
    let has_bare_id = wire.contains("10.1234/example");
    let has_scheme = wire.contains("doi:");
    assert!(
        has_bare_id && !has_scheme,
        "scheme leaked into wire form: {}",
        wire
    );
}
2151
#[test]
fn ref_parse_error_maps_to_invalid_ref_error_code() {
    // Public-API contract (docs/PUBLIC_API.md §4): all parse failures
    // collapse to ErrorCode::InvalidRef at the public boundary.
    //
    // Exercise every RefParseError variant that the parser tests in this
    // module produce, including the payload-carrying ones, so that
    // remapping any single variant is caught here. Payload values are
    // arbitrary; only the variant matters for the conversion.
    // NOTE(review): if RefParseError has variants not used in this module,
    // add them to the list.
    let failures = [
        RefParseError::Empty,
        RefParseError::MissingDoiPrefix,
        RefParseError::MissingDoiSuffixSeparator,
        RefParseError::EmptyDoiSuffix,
        RefParseError::InvalidDoiRegistrant,
        RefParseError::InvalidDoiSuffixChar { ch: ';' },
        RefParseError::DoiSuffixTooLong {
            len: DOI_SUFFIX_MAX_LEN + 1,
            max: DOI_SUFFIX_MAX_LEN,
        },
        RefParseError::InvalidArxivShape,
    ];
    for failure in failures {
        // Capture the Debug form first: `into()` consumes the error.
        let label = format!("{:?}", failure);
        let code: ErrorCode = failure.into();
        assert_eq!(
            code,
            ErrorCode::InvalidRef,
            "{} must map to ErrorCode::InvalidRef",
            label
        );
    }
}
2161
2162 // -----------------------------------------------------------------
2163 // DenialReason / DenialContext (ADR-0023) — wire-shape tests.
2164 // -----------------------------------------------------------------
2165
#[test]
fn denial_reason_serializes_snake_case() {
    // ADR-0023 §2 / docs/PUBLIC_API.md §8: the wire form is snake_case.
    // Each row pairs a variant with its exact JSON string.
    let cases = [
        (
            DenialReason::RedirectNotInAllowlist,
            "\"redirect_not_in_allowlist\"",
        ),
        (DenialReason::SizeCapExceeded, "\"size_cap_exceeded\""),
        (
            DenialReason::ContentTypeMismatch,
            "\"content_type_mismatch\"",
        ),
    ];
    for (reason, wire) in cases {
        let s = serde_json::to_string(&reason).expect("ser");
        assert_eq!(s, wire);
    }
}
2176
#[test]
fn denial_reason_round_trip_via_serde() {
    // Serialize and deserialize each listed variant and require the
    // result to equal the starting value. The list is maintained by hand:
    // extend it whenever a variant is added to the closed set.
    let variants = [
        DenialReason::RedirectNotInAllowlist,
        DenialReason::InsecureScheme,
        DenialReason::HostInBlockList,
        DenialReason::SizeCapExceeded,
        DenialReason::SchemaDrift,
        DenialReason::CapabilityNotGranted,
        DenialReason::RateLimitWindow,
        DenialReason::SsrfPrivateAddress,
        DenialReason::ContentTypeMismatch,
    ];
    for variant in variants.iter() {
        let wire = serde_json::to_string(variant).expect("ser");
        let back: DenialReason = serde_json::from_str(&wire).expect("de");
        assert_eq!(
            &back, variant,
            "round-trip mismatch for {:?} -> {}",
            variant, wire
        );
    }
}
2197
#[test]
fn denial_context_round_trips_full_shape() {
    // A populated context (the redirect-denied example from ADR-0023 §1)
    // must survive JSON serialization and deserialization unchanged.
    // Comparing whole structs exercises the `PartialEq` derive added per
    // ADR-0023 §3 (multi-agent review feedback PR — see ADR-0023 history).
    let allowlist = vec![
        String::from("api.crossref.org"),
        String::from("*.crossref.org"),
    ];
    let original = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(allowlist),
        hop_index: Some(1),
        cap: None,
        actual: None,
    };
    let wire = serde_json::to_string(&original).expect("ser");
    let restored: DenialContext = serde_json::from_str(&wire).expect("de");
    assert_eq!(restored, original);
}
2220
#[test]
fn denial_context_serialize_elides_empty_fields() {
    // `skip_serializing_if = "Option::is_none"` keeps the wire form lean:
    // no `None` field may appear in the output. `reason` is always
    // present, so it is the only key expected here.
    let reason_only = DenialContext {
        reason: DenialReason::CapabilityNotGranted,
        source: None,
        attempted: None,
        expected: None,
        hop_index: None,
        cap: None,
        actual: None,
    };
    let wire = serde_json::to_string(&reason_only).expect("ser");
    assert_eq!(wire, "{\"reason\":\"capability_not_granted\"}");
}
2238
#[test]
fn denial_context_expected_some_empty_vec_preserves_explicit_empty_allowlist() {
    // Post-refinement disambiguation: `expected: Some(vec![])` means
    // "explicit empty allowlist" and MUST appear on the wire as
    // `"expected":[]`; only `expected: None` is skipped on serialize.
    // The earlier `Vec<String>` field shape hid this distinction.
    let explicit_empty = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(vec![]),
        hop_index: None,
        cap: None,
        actual: None,
    };
    let s = serde_json::to_string(&explicit_empty).expect("ser");
    let keeps_empty_list = s.contains("\"expected\":[]");
    assert!(
        keeps_empty_list,
        "expected:[] must survive on the wire (got: {s})"
    );
    let restored: DenialContext = serde_json::from_str(&s).expect("de");
    assert_eq!(restored.expected, Some(Vec::new()));
}
2262
#[test]
fn denial_context_deserialize_tolerates_missing_optional_fields() {
    // Consumer-side contract (ADR-0023 §3): any subset of the optional
    // fields may be present. Absent ones take their defaults through
    // `#[serde(default)]`.
    let wire = r#"{"reason":"size_cap_exceeded","cap":104857600,"actual":209715200}"#;
    let parsed: DenialContext = serde_json::from_str(wire).expect("de");

    // Fields present on the wire.
    assert_eq!(parsed.reason, DenialReason::SizeCapExceeded);
    assert_eq!(parsed.cap, Some(104_857_600));
    assert_eq!(parsed.actual, Some(209_715_200));

    // Fields absent from the wire.
    assert!(parsed.source.is_none());
    assert!(parsed.attempted.is_none());
    assert!(parsed.expected.is_none());
    assert!(parsed.hop_index.is_none());
}
2278
#[test]
fn full_error_envelope_with_denial_context_serializes_to_pinned_json() {
    // Byte-exact pin of the full failure envelope described in
    // docs/ERRORS.md §3 + §3.1 and ADR-0023 §1, so any regression in key
    // order or skip rules along the chain fails loudly.
    //
    // Key-order note: without the `preserve_order` feature (which we do
    // not enable), serde_json's `Map`, which backs `json!`, keeps keys
    // sorted alphabetically. A `DenialContext` embedded via `json!` is
    // first converted through that same sorted Map, so its fields also
    // come out alphabetically here, NOT in the struct field order that a
    // direct `to_string(&DenialContext)` yields. That is intentional: the
    // public wire shape is canonicalised by the Map ordering, and the
    // pinned string below records exactly that canonical form.
    let denial = DenialContext {
        reason: DenialReason::RedirectNotInAllowlist,
        source: Some(String::from("crossref")),
        attempted: Some(String::from("evil.example.com")),
        expected: Some(vec![
            String::from("api.crossref.org"),
            String::from("*.crossref.org"),
        ]),
        hop_index: Some(1),
        cap: None,
        actual: None,
    };
    let error = serde_json::json!({
        "code": ErrorCode::NetworkError,
        "message": "redirect target evil.example.com not in allowlist for source crossref",
        "denial_context": denial,
    });
    let envelope = serde_json::json!({
        "ok": false,
        "error": error,
    });
    let actual = serde_json::to_string(&envelope).expect("serialize envelope");
    let expected = r#"{"error":{"code":"NETWORK_ERROR","denial_context":{"attempted":"evil.example.com","expected":["api.crossref.org","*.crossref.org"],"hop_index":1,"reason":"redirect_not_in_allowlist","source":"crossref"},"message":"redirect target evil.example.com not in allowlist for source crossref"},"ok":false}"#;
    assert_eq!(actual, expected);
}
2316
#[test]
fn denial_context_rejects_unknown_fields() {
    // `#[serde(deny_unknown_fields)]` (ADR-0023 §3, PUBLIC_API.md §8): a
    // field the struct does not know must fail deserialization, which
    // keeps forward-compat field additions a breaking change.
    let wire = r#"{"reason":"capability_not_granted","banana":1}"#;
    let result = serde_json::from_str::<DenialContext>(wire);
    assert!(
        result.is_err(),
        "deny_unknown_fields must reject 'banana': {:?}",
        result.map(|d| d.reason),
    );
}
2330}