Skip to main content

doiget_core/
http.rs

1// allow: outbound-network
2//! Centralized HTTP client wrapper. All `Source` impls fetch through here.
3//!
4//! Security defaults per `docs/SECURITY.md`:
5//!   - rustls TLS only (no openssl, no native-tls — enforced by `deny.toml`)
6//!   - HTTPS-only redirect policy (file://, data://, http:// rejected)
7//!   - Per-source redirect host allowlist (`docs/REDIRECT_ALLOWLIST.md`)
8//!   - Body size cap ([`crate::PDF_MAX_BYTES`] = 100 MB)
9//!   - Per-request timeouts (connect 10s, read 60s, total 300s)
10//!   - PDF magic-byte check on the first 5 bytes (`%PDF-`)
11//!   - User-Agent: `doiget/<version> (+https://github.com/sotashimozono/doiget)`
12//!
13//! See `docs/SECURITY.md` §1.2-1.3 / §1.10 and `docs/REDIRECT_ALLOWLIST.md`.
14//!
15//! # Architectural note: per-source `reqwest::Client`
16//!
17//! `reqwest::redirect::Policy::custom` receives only an `Attempt` value, which
18//! exposes the next URL and previous URL chain but **not** the original
19//! request's headers. That makes the "tag the request with `X-Doiget-Source`
20//! and inspect it from inside the redirect closure" approach infeasible on
21//! `reqwest 0.13.x`. Instead, [`HttpClient`] holds one
22//! [`reqwest::Client`] per source — each client's redirect closure captures
23//! that source's [`SourceAllowlist`] so cross-source confusion is impossible
24//! by construction.
25
26use std::collections::HashMap;
27use std::sync::Arc;
28use std::sync::Once;
29use std::time::Duration;
30
31use bytes::{Bytes, BytesMut};
32use futures_util::StreamExt;
33use reqwest::redirect::Policy;
34use reqwest::{Client, ClientBuilder, Url};
35use thiserror::Error;
36
37use crate::{PDF_MAX_BYTES, VERSION};
38
/// PDF magic-byte prefix per the PDF 1.7 specification (ISO 32000-1 §7.5.2).
/// `b"%PDF-"`, spelled out as bytes: `%` = 0x25, `P` = 0x50, `D` = 0x44,
/// `F` = 0x46, `-` = 0x2D.
const PDF_MAGIC: [u8; 5] = [0x25, 0x50, 0x44, 0x46, 0x2D];

/// Hard cap on redirect chain length. Matches `reqwest`'s default of 10.
/// Re-asserted here so the value is reviewed alongside the other security
/// defaults in this module rather than inheriting silently from upstream.
const MAX_REDIRECTS: usize = 10;

/// Connect timeout per `docs/SECURITY.md` §1.2 (Slowloris row).
const CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Read (idle-between-bytes) timeout per `docs/SECURITY.md` §1.2.
const READ_TIMEOUT: Duration = Duration::from_secs(60);

/// Total per-request timeout per `docs/SECURITY.md` §1.2.
const TOTAL_TIMEOUT: Duration = Duration::from_secs(300);

/// Max retry attempts AFTER the first try, for transient failures only
/// (connect/timeout/mid-stream network errors and the transient HTTP
/// status set). 3 retries → up to 4 total attempts. See issue #117.
const MAX_FETCH_RETRIES: u32 = 3;

/// Base delay for the exponential backoff (`base * 2^attempt`, jittered).
/// Also the exclusive upper bound of the jitter that `backoff_delay` adds.
const RETRY_BASE_DELAY: Duration = Duration::from_millis(500);

/// Ceiling on the exponential term of a backoff and on a parsed
/// `Retry-After` value. Keeps the worst-case retry chain comfortably
/// inside [`TOTAL_TIMEOUT`].
///
/// NOTE(review): `backoff_delay` adds its jitter *after* clamping to this
/// value, so a single backoff sleep can exceed it by strictly less than
/// [`RETRY_BASE_DELAY`]. `parse_retry_after` clamps exactly.
const RETRY_MAX_DELAY: Duration = Duration::from_secs(30);
68
/// Reports whether an HTTP status code is worth another attempt.
///
/// The retryable set is request-timeout (408), rate-limited (429) and the
/// transient server-side codes 500 / 502 / 503 / 504. A plain 500 is
/// included because upstreams (Crossref/Unpaywall) intermittently 500
/// under load. Every other code — including all 4xx other than 408/429 —
/// is treated as caller/permanent and is never retried.
fn is_transient_status(code: u16) -> bool {
    const RETRYABLE: [u16; 6] = [408, 429, 500, 502, 503, 504];
    RETRYABLE.contains(&code)
}
76
77/// A `reqwest::Error` is transient iff it is a connect or timeout
78/// failure or a mid-body transfer error. Redirect-policy aborts
79/// (allowlist denial), builder errors, and decode errors are NOT
80/// transient — retrying them cannot help and would mask a real denial.
81fn reqwest_is_transient(e: &reqwest::Error) -> bool {
82    (e.is_timeout() || e.is_connect() || e.is_body()) && !e.is_redirect()
83}
84
85/// Parse a `Retry-After` header expressed as integer seconds (the
86/// HTTP-date form is accepted by the RFC but rare for these APIs and
87/// deliberately ignored for the MVP — we fall back to exponential
88/// backoff in that case). Capped at [`RETRY_MAX_DELAY`].
89fn parse_retry_after(headers: &reqwest::header::HeaderMap) -> Option<Duration> {
90    let secs: u64 = headers
91        .get(reqwest::header::RETRY_AFTER)?
92        .to_str()
93        .ok()?
94        .trim()
95        .parse()
96        .ok()?;
97    Some(Duration::from_secs(secs).min(RETRY_MAX_DELAY))
98}
99
100/// Exponential backoff with decorrelated jitter. `RETRY_BASE_DELAY *
101/// 2^attempt`, capped at [`RETRY_MAX_DELAY`], plus 0..base jitter so a
102/// fleet of clients does not thunder back in lockstep. Jitter is derived
103/// from the wall-clock subsec nanos rather than pulling in an RNG
104/// dependency — adequate decorrelation for backoff, not a security
105/// primitive.
106fn backoff_delay(attempt: u32) -> Duration {
107    let factor = 1u64 << attempt.min(20);
108    let base_ms = RETRY_BASE_DELAY.as_millis() as u64;
109    let capped_ms = base_ms
110        .saturating_mul(factor)
111        .min(RETRY_MAX_DELAY.as_millis() as u64);
112    let jitter_ms = std::time::SystemTime::now()
113        .duration_since(std::time::UNIX_EPOCH)
114        .map(|d| (d.subsec_nanos() as u64) % base_ms.max(1))
115        .unwrap_or(0);
116    Duration::from_millis(capped_ms.saturating_add(jitter_ms))
117}
118
119// ---------------------------------------------------------------------------
120// SourceAllowlist
121// ---------------------------------------------------------------------------
122
/// Per-source allowlist entry. Matches the schema in
/// `docs/REDIRECT_ALLOWLIST.md` §2.
///
/// Pairs one source key with the host patterns that redirects issued on
/// behalf of that source may target. Use [`SourceAllowlist::matches`] to
/// test a host against the patterns.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct SourceAllowlist {
    /// Source key. MUST match a `source` value in `docs/SOURCES.md` §1
    /// (e.g. `crossref`, `unpaywall`, `arxiv`).
    pub source: String,
    /// Each pattern is either a literal FQDN or a `*.<suffix>` glob (matches
    /// the suffix and any subdomain — see `docs/REDIRECT_ALLOWLIST.md` §2.2
    /// matching rule). Patterns are compared case-insensitively (ASCII).
    pub redirect_hosts: Vec<String>,
}
136
137impl SourceAllowlist {
138    /// Construct a new allowlist entry.
139    pub fn new(source: impl Into<String>, redirect_hosts: Vec<String>) -> Self {
140        Self {
141            source: source.into(),
142            redirect_hosts,
143        }
144    }
145
146    /// Returns `true` if `host` matches any pattern in this allowlist.
147    ///
148    /// Matching is byte-level on the lowercased ASCII form of the host.
149    /// Callers MUST lowercase upstream; this method also lowercases as a
150    /// defense-in-depth measure but treats the result as ASCII (Punycode
151    /// is the caller's responsibility per `docs/REDIRECT_ALLOWLIST.md`
152    /// §2.2 rule 4).
153    pub fn matches(&self, host: &str) -> bool {
154        let host_lc = host.to_ascii_lowercase();
155        self.redirect_hosts
156            .iter()
157            .any(|pat| host_matches_pattern(&host_lc, pat))
158    }
159}
160
/// Tests one host against one allowlist pattern per
/// `docs/REDIRECT_ALLOWLIST.md` §2.2.
///
/// `host` must already be lowercased by the caller; `pattern` is
/// ASCII-lowercased here. Two pattern forms exist:
///
/// - `*.<suffix>` — matches `<suffix>` itself and any host that ends with
///   `.<suffix>` (the dot boundary is required, so `evilexample.com` does
///   not match `*.example.com`).
/// - anything else — matches only a byte-identical host.
fn host_matches_pattern(host: &str, pattern: &str) -> bool {
    let pattern = pattern.to_ascii_lowercase();
    match pattern.strip_prefix("*.") {
        Some(suffix) => {
            // Either the bare suffix, or `<something>.` followed by it.
            host == suffix
                || matches!(host.strip_suffix(suffix), Some(head) if head.ends_with('.'))
        }
        None => host == pattern,
    }
}
173
174/// Hard-coded Phase 1 allowlist for Tier 1 sources. Sourced from
175/// `docs/REDIRECT_ALLOWLIST.md` §3.
176///
177/// Marked `Phase 1; revisit during real fetches` in the spec — entries
178/// flagged `(unverified)` (e.g. arXiv subdomain redirect behavior) MUST be
179/// confirmed or removed before Phase 1 is closed; see §3.3 of the spec.
180pub fn tier_1_allowlist() -> Vec<SourceAllowlist> {
181    vec![
182        // §3.1 crossref
183        SourceAllowlist::new(
184            "crossref",
185            vec!["api.crossref.org".to_string(), "*.crossref.org".to_string()],
186        ),
187        // §3.2 unpaywall
188        SourceAllowlist::new("unpaywall", vec!["api.unpaywall.org".to_string()]),
189        // §3.3 arxiv
190        SourceAllowlist::new(
191            "arxiv",
192            vec![
193                "arxiv.org".to_string(),
194                "export.arxiv.org".to_string(),
195                "*.arxiv.org".to_string(),
196            ],
197        ),
198    ]
199}
200
201/// Hard-coded Phase 4 allowlist for Tier 2 metadata sources (OpenAlex,
202/// Semantic Scholar, DOAJ). Sourced from `docs/SOURCES.md` §1 (the Tier 2
203/// table) and `docs/REDIRECT_ALLOWLIST.md` §3 (same redirect-allowlist
204/// policy as Tier 1, distinct source keys).
205///
206/// Returned hosts:
207///
208/// - `"openalex"` → `api.openalex.org` (production OpenAlex REST API).
209/// - `"semantic_scholar"` → `api.semanticscholar.org` (S2 Graph API base).
210/// - `"doaj"` → `doaj.org` + `*.doaj.org` (DOAJ public API; wildcard
211///   covers `api.doaj.org` and any v4+ subdomain split).
212///
213/// Per `docs/SOURCES.md` §4 "OpenAlex / Semantic Scholar / DOAJ", these
214/// sources are **metadata-only**: their `Source::fetch` impls MUST
215/// return `pdf_bytes: None`. The redirect closure in [`HttpClient`]
216/// uses this list to deny redirects to off-list hosts under each Tier
217/// 2 source key — identical mechanism to Tier 1, but the per-tool
218/// capability gate (`profile.metadata.openalex` etc.) is layered on
219/// top so the network surface remains capability-aware.
220pub fn tier_2_allowlist() -> Vec<SourceAllowlist> {
221    vec![
222        SourceAllowlist::new("openalex", vec!["api.openalex.org".to_string()]),
223        SourceAllowlist::new(
224            "semantic_scholar",
225            vec!["api.semanticscholar.org".to_string()],
226        ),
227        SourceAllowlist::new(
228            "doaj",
229            vec!["doaj.org".to_string(), "*.doaj.org".to_string()],
230        ),
231    ]
232}
233
/// Phase 5a redirect allowlist for the Springer Nature OA TDM source.
/// Compile-gated by the `tdm-springer` Cargo feature so default release
/// binaries never include the host pattern (per ADR-0002 and
/// `docs/SOURCES.md` §3).
///
/// Returns a single entry:
/// - `"tdm-springer"` → `api.springernature.com` (production base) +
///   `*.springernature.com` (covers load-balancing subdomains; the
///   redirect closure denies anything outside the wildcard).
///
/// Per `docs/SOURCES.md` §4 "TDM sources (Phase 5)", a fetch under this
/// source key requires ALL THREE gates: the Cargo feature compiled in,
/// the `DOIGET_KEY_SPRINGER` env var present, and
/// `DOIGET_AGREE_TDM_SPRINGER=1`. The `CapabilityProfile` gate enforces
/// the env-var pair; this allowlist is the transport gate.
#[cfg(feature = "tdm-springer")]
pub fn tier_3_springer_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts = ["api.springernature.com", "*.springernature.com"]
        .iter()
        .copied()
        .map(String::from)
        .collect();
    vec![SourceAllowlist::new("tdm-springer", redirect_hosts)]
}
259
/// Phase 5b redirect allowlist for the APS Harvest TDM source.
/// Compile-gated by the `tdm-aps` Cargo feature so default release
/// binaries never include the host pattern (per ADR-0002 and
/// `docs/SOURCES.md` §3).
///
/// Returns a single entry:
/// - `"tdm-aps"` → `harvest.aps.org` (production base) + `*.aps.org`
///   (covers load-balancing subdomains; the redirect closure denies
///   anything outside the wildcard).
///
/// Three-gate activation: the Cargo feature compiled in, the
/// `DOIGET_KEY_APS` env var present, and `DOIGET_AGREE_TDM_APS=1`. The
/// `CapabilityProfile` gate enforces the env-var pair; this allowlist is
/// the transport gate.
#[cfg(feature = "tdm-aps")]
pub fn tier_3_aps_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts = ["harvest.aps.org", "*.aps.org"]
        .iter()
        .copied()
        .map(String::from)
        .collect();
    vec![SourceAllowlist::new("tdm-aps", redirect_hosts)]
}
281
/// Phase 5c redirect allowlist for the Elsevier ScienceDirect TDM source.
/// Compile-gated by the `tdm-elsevier` Cargo feature so default release
/// binaries never include the host pattern (per ADR-0002 and
/// `docs/SOURCES.md` §3).
///
/// Returns a single entry:
/// - `"tdm-elsevier"` → `api.elsevier.com` (production base) +
///   `*.elsevier.com` (covers load-balancing subdomains; the redirect
///   closure denies anything outside the wildcard).
///
/// Three-gate activation: the Cargo feature compiled in, the
/// `DOIGET_KEY_ELSEVIER` env var present, and
/// `DOIGET_AGREE_TDM_ELSEVIER=1`. The `CapabilityProfile` gate enforces
/// the env-var pair; this allowlist is the transport gate.
#[cfg(feature = "tdm-elsevier")]
pub fn tier_3_elsevier_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts = ["api.elsevier.com", "*.elsevier.com"]
        .iter()
        .copied()
        .map(String::from)
        .collect();
    vec![SourceAllowlist::new("tdm-elsevier", redirect_hosts)]
}
303
304/// Hard-coded Phase 1 allowlist for the synthetic `"oa-publisher"` source —
305/// the publisher / preprint / repository hosts to which Unpaywall's
306/// `best_oa_location.url` (or `url_for_pdf`) typically resolves.
307///
308/// **Status: informed-best-effort.** Per `docs/REDIRECT_ALLOWLIST.md` §3,
309/// every entry below is a documented OA-publisher host pulled from the
310/// public DOI / OA discovery surface as of this function's authoring; they
311/// are **not** a substitute for empirical validation. Entries marked
312/// `(unverified)` MUST be confirmed by a real fetch or removed before
313/// Phase 1 is closed.
314///
315/// The orchestrator (`doiget-cli::commands::fetch::fetch_doi`) calls
316/// [`HttpClient::fetch_pdf`] under the `"oa-publisher"` source key when
317/// Unpaywall returns an OA URL. If the OA host is not in this list, the
318/// PDF leg is denied (`HttpError::RedirectDenied`) and the orchestrator
319/// falls back to metadata-only success (the `informed-best-effort`
320/// posture from the spec section above).
321pub fn oa_publisher_allowlist() -> Vec<SourceAllowlist> {
322    vec![SourceAllowlist::new(
323        "oa-publisher",
324        vec![
325            // Springer Nature OA imprints. Springer / SpringerOpen / Nature
326            // OA URLs all resolve under one of these registrable suffixes.
327            // (unverified) — confirm by replaying real Unpaywall responses.
328            "*.springer.com".to_string(),
329            "*.springeropen.com".to_string(),
330            "*.springernature.com".to_string(),
331            "*.nature.com".to_string(),
332            // Wiley OA. (unverified)
333            "*.wiley.com".to_string(),
334            // Elsevier OA route only — the TDM gated path is a separate
335            // source (`tdm-elsevier`, Phase 5c) and is not covered here.
336            // (unverified)
337            "*.elsevier.com".to_string(),
338            "*.sciencedirect.com".to_string(),
339            // Frontiers. (unverified)
340            "*.frontiersin.org".to_string(),
341            // MDPI. (unverified)
342            "*.mdpi.com".to_string(),
343            // PLOS. (unverified)
344            "*.plos.org".to_string(),
345            // Preprint servers — biorxiv / medrxiv. (unverified)
346            "*.biorxiv.org".to_string(),
347            "*.medrxiv.org".to_string(),
348            // Europe PMC + NIH PMC. (unverified)
349            "europepmc.org".to_string(),
350            "*.europepmc.org".to_string(),
351            "*.nih.gov".to_string(),
352            "*.ncbi.nlm.nih.gov".to_string(),
353            // Physics-society / diamond-OA hosts. UNLIKE the entries
354            // above, these are EMPIRICALLY VERIFIED: a real `doiget batch`
355            // over 30 OpenAlex-OA finite-temperature-MPS DOIs observed
356            // Unpaywall `best_oa_location` resolving to these hosts and
357            // being denied (#193, REDIRECT_ALLOWLIST.md §3.4, ADR-0027).
358            // APS — journals.aps.org / link.aps.org (green & gold OA;
359            // society host; `*.aps.org` is also trusted under the separate
360            // `tdm-aps` Tier-3 source key WHEN that feature is compiled
361            // in — `tier_3_aps_allowlist` is `#[cfg(feature = "tdm-aps")]`
362            // and absent from default release builds).
363            "*.aps.org".to_string(),
364            // SciPost — diamond OA, community-run physics publisher.
365            "scipost.org".to_string(),
366            "*.scipost.org".to_string(),
367            // IOP Publishing — iopscience.iop.org (New J. Phys. etc.).
368            "*.iop.org".to_string(),
369            // arXiv — already on the `arxiv` tier-1 allowlist, but the
370            // Unpaywall-driven path uses the `oa-publisher` source key,
371            // so we mirror the host list here too. See REDIRECT_ALLOWLIST.md
372            // §3.3 for the underlying entries.
373            "arxiv.org".to_string(),
374            "*.arxiv.org".to_string(),
375        ],
376    )]
377}
378
379// ---------------------------------------------------------------------------
380// HttpError
381// ---------------------------------------------------------------------------
382
/// Errors that can arise during HTTP fetches.
///
/// Four variants — [`HttpError::RedirectDenied`],
/// [`HttpError::InsecureRedirect`], [`HttpError::OversizedBody`] and
/// [`HttpError::NotAPdf`] — are "denials" that convert to a
/// `crate::DenialContext` through the `From<&HttpError>` impl in this
/// module. [`HttpError::Network`] converts only when one of those denials
/// is found in its source chain; the remaining variants never do.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum HttpError {
    /// Transport / DNS / TLS failure or other `reqwest`-level error. Note
    /// that `reqwest` surfaces a redirect-policy abort (via `Attempt::error`)
    /// as a `reqwest::Error` carrying the source error — callers seeing
    /// `Network` for what they believed was a redirect violation should
    /// inspect the inner error chain.
    #[error("network error: {0}")]
    Network(#[from] reqwest::Error),
    /// Redirect target host did not match any pattern in the source's
    /// `redirect_hosts`. See `docs/REDIRECT_ALLOWLIST.md` §2.2.
    ///
    /// Field naming: `source_key` rather than `source` because `thiserror`
    /// auto-treats a field literally named `source` as a `#[source]` error
    /// chain link (which would require the field to implement `std::error::Error`).
    ///
    /// `expected_hosts` carries a snapshot of the source's allowlist
    /// patterns at the time of the denial — populated for the structured
    /// `denial_context.expected` channel introduced by ADR-0023 §4
    /// (NORMATIVE mapping table). Cloning the patterns into the error
    /// keeps the `From<&HttpError> for Option<DenialContext>` impl from
    /// having to re-look-up the allowlist by `source_key`. May be empty
    /// when the rejection happened before any allowlist was matched
    /// (e.g. URL had no host component at all).
    #[error("redirect target {host} not in allowlist for source {source_key}")]
    RedirectDenied {
        /// Source key whose allowlist rejected the redirect.
        source_key: String,
        /// The lowercased host that was rejected.
        host: String,
        /// Snapshot of the source's `redirect_hosts` at denial time.
        /// Surfaces as `denial_context.expected` (ADR-0023 §4).
        expected_hosts: Vec<String>,
    },
    /// Redirect target had a scheme other than `https`. See
    /// `docs/SECURITY.md` §1.3.
    #[error("redirect to non-HTTPS scheme: {scheme}")]
    InsecureRedirect {
        /// The disallowed scheme (e.g. `http`, `file`, `data`).
        scheme: String,
    },
    /// Body would exceed [`PDF_MAX_BYTES`] either by a `Content-Length`
    /// hint or by accumulated streamed bytes. See `docs/SECURITY.md` §1.2.
    #[error("body too large: {actual} bytes (cap = {cap})")]
    OversizedBody {
        /// Observed size (header value or accumulated bytes).
        actual: u64,
        /// Hard upper bound (always [`PDF_MAX_BYTES`]).
        cap: u64,
    },
    /// PDF magic-byte mismatch — the body does not start with `%PDF-`.
    /// We deliberately do NOT use `Content-Type` (publishers misbehave —
    /// the magic byte is the trustworthy signal per `docs/SECURITY.md`
    /// §1.2 "Magic-byte mismatch" row).
    #[error("PDF magic-byte mismatch: got {got:?}")]
    NotAPdf {
        /// First five bytes of the response body (zero-padded if shorter).
        got: [u8; 5],
    },
    /// Server returned a non-2xx status.
    #[error("HTTP {status} from {url}")]
    HttpStatus {
        /// HTTP status code.
        status: u16,
        /// The URL that produced the status.
        url: String,
    },
    /// No allowlist entry exists for this source. The caller asked
    /// [`HttpClient`] to fetch on behalf of a source that wasn't passed to
    /// [`HttpClient::new`].
    ///
    /// See note on `RedirectDenied` for why the field is `source_key`.
    #[error("no allowlist registered for source {source_key}")]
    UnknownSource {
        /// The unregistered source key.
        source_key: String,
    },
    /// A header name or value passed to
    /// [`HttpClient::fetch_bytes_with_headers`] was not a valid HTTP
    /// header. The header parser only accepts the visible-ASCII subset
    /// per RFC 7230 §3.2; control characters and non-ASCII bytes are
    /// rejected before the request is even built. Surfaces as
    /// `ErrorCode::InternalError` at the public boundary (callers
    /// supplying bad headers are responsible for fixing the call site;
    /// not a denial in the ADR-0023 sense).
    #[error("invalid HTTP header `{name}`: {reason}")]
    InvalidHeader {
        /// The header name as supplied by the caller.
        name: String,
        /// `"name"` or `"value"` — which side failed parsing.
        reason: String,
    },
}
478
479// ---------------------------------------------------------------------------
480// HttpError -> Option<DenialContext>  (ADR-0023 §4 mapping table)
481// ---------------------------------------------------------------------------
482
483/// Map an [`HttpError`] reference to the structured [`crate::DenialContext`]
484/// channel introduced by ADR-0023.
485///
486/// Returns `Some(_)` for the four denial classes named in ADR-0023 §4
487/// (`RedirectDenied`, `OversizedBody`, `NotAPdf`, `InsecureRedirect`) and
488/// `None` for every other variant — `Network`, `HttpStatus`,
489/// `UnknownSource` are not denials in the ADR-0023 sense (they are
490/// transport / upstream / programming-error signals, not allowlist or
491/// cap rejections).
492///
493/// The `&HttpError` borrow form is used (rather than `HttpError`) so the
494/// caller — typically the orchestrator that already needs the original
495/// error for `error.message` and the `From<HttpError> for ErrorCode`
496/// collapse — does not have to clone the error to produce the optional
497/// structured side-channel.
498impl From<&HttpError> for Option<crate::DenialContext> {
499    fn from(e: &HttpError) -> Self {
500        use crate::{DenialContext, DenialReason};
501        match e {
502            HttpError::RedirectDenied {
503                source_key,
504                host,
505                expected_hosts,
506            } => Some(DenialContext {
507                reason: DenialReason::RedirectNotInAllowlist,
508                source: Some(source_key.clone()),
509                attempted: Some(host.clone()),
510                expected: Some(expected_hosts.clone()),
511                hop_index: None,
512                cap: None,
513                actual: None,
514            }),
515            HttpError::OversizedBody { actual, cap } => Some(DenialContext {
516                reason: DenialReason::SizeCapExceeded,
517                source: None,
518                attempted: None,
519                // The size-cap reason has no allowlist channel; use
520                // `None` to signal "field not populated by producer"
521                // rather than `Some(vec![])` (which would mean "explicit
522                // empty allowlist"). See `DenialContext::expected` docs.
523                expected: None,
524                hop_index: None,
525                cap: Some(*cap),
526                actual: Some(*actual),
527            }),
528            HttpError::NotAPdf { got } => Some(DenialContext {
529                reason: DenialReason::ContentTypeMismatch,
530                source: None,
531                // ADR-0023 §4 mapping table: hex-encode the first 5 bytes
532                // for the `attempted` field. `format!("{:02x}...")` is
533                // chosen over `hex::encode` to avoid pulling the
534                // additional dep into this conversion path; the result is
535                // bit-identical (lowercase, zero-padded).
536                attempted: Some(format!(
537                    "{:02x}{:02x}{:02x}{:02x}{:02x}",
538                    got[0], got[1], got[2], got[3], got[4]
539                )),
540                expected: Some(vec!["%PDF-".to_string()]),
541                hop_index: None,
542                cap: None,
543                actual: None,
544            }),
545            HttpError::InsecureRedirect { scheme } => Some(DenialContext {
546                reason: DenialReason::InsecureScheme,
547                source: None,
548                attempted: Some(format!("{}:...", scheme)),
549                expected: Some(vec!["https".to_string()]),
550                hop_index: None,
551                cap: None,
552                actual: None,
553            }),
554            // `reqwest` wraps a custom error returned by the redirect
555            // policy closure (`attempt.error(HttpError::RedirectDenied{..})`
556            // / `attempt.error(HttpError::InsecureRedirect{..})`) inside a
557            // `reqwest::Error`, which surfaces here as `HttpError::Network`.
558            // Without source-chain walking, production redirect denials —
559            // the most operationally important denial class — would never
560            // produce a `DenialContext`, defeating the whole point of
561            // ADR-0023.
562            //
563            // Walk the `std::error::Error::source()` chain on the inner
564            // `reqwest::Error` and downcast each link to `&HttpError`. If
565            // a wrapped `HttpError` is found, recurse via this same `From`
566            // impl. Otherwise the network error is a "real" transport /
567            // DNS / TLS failure with no denial semantics — return `None`.
568            //
569            // `std::error::Error::source(e)` is fully-qualified to
570            // disambiguate against the inherent (and unrelated)
571            // `reqwest::Error::source()`.
572            HttpError::Network(e) => {
573                let mut source: Option<&(dyn std::error::Error + 'static)> =
574                    std::error::Error::source(e);
575                while let Some(s) = source {
576                    if let Some(http_err) = s.downcast_ref::<HttpError>() {
577                        return Option::<crate::DenialContext>::from(http_err);
578                    }
579                    source = s.source();
580                }
581                None
582            }
583            // The remaining variants are not "denials" in the ADR-0023
584            // sense — HttpStatus/UnknownSource are upstream / programming-
585            // error signals; InvalidHeader is a caller-bug signal.
586            HttpError::HttpStatus { .. }
587            | HttpError::UnknownSource { .. }
588            | HttpError::InvalidHeader { .. } => None,
589        }
590    }
591}
592
593// ---------------------------------------------------------------------------
594// HttpClient
595// ---------------------------------------------------------------------------
596
/// Workspace-wide HTTP client with the security defaults applied.
///
/// Internally holds one `reqwest::Client` per source. Construct via
/// [`HttpClient::new`] with the full set of allowlists the calling process
/// will need.
///
/// Both fields are behind `Arc`, so cloning an `HttpClient` only bumps two
/// reference counts.
#[derive(Clone, Debug)]
pub struct HttpClient {
    /// One [`reqwest::Client`] per source. Each client carries a redirect
    /// policy that captures only that source's allowlist. `Arc` so cloning
    /// is cheap.
    clients: Arc<HashMap<String, Client>>,
    /// The exact [`SourceAllowlist`] each per-source client was built from,
    /// keyed by source. The redirect closure inside each `reqwest::Client`
    /// captures its allowlist *by move*, so it cannot be read back from the
    /// client itself. This map keeps the identical `SourceAllowlist`
    /// available to callers that must perform a *pre-fetch* host check on a
    /// metadata-discovered URL (issue #145 / `docs/REDIRECT_ALLOWLIST.md`
    /// §1: the allowlist is consulted "on the OA URL discovered through
    /// metadata sources before the actual PDF fetch is issued", not only on
    /// redirect hops). Storing the same value here — rather than re-deriving
    /// it from [`oa_publisher_allowlist`] at the call site — guarantees the
    /// pre-check and the redirect closure can never drift, and that the
    /// check works under the test constructors too (which register a
    /// wiremock host as the allowlist).
    ///
    /// Exposed read-only through [`HttpClient::source_allowlist`].
    allowlists: Arc<HashMap<String, SourceAllowlist>>,
}
623
624impl HttpClient {
625    /// Build a client with rustls + redirect-allowlist + size cap +
626    /// timeouts.
627    ///
628    /// `allowlists` MUST cover every source whose URL might be passed in;
629    /// fetches against unregistered sources return
630    /// [`HttpError::UnknownSource`].
631    ///
632    /// # Errors
633    ///
634    /// Returns the underlying `reqwest::Error` if `ClientBuilder::build`
635    /// fails (typically a TLS-backend init failure).
636    pub fn new(allowlists: Vec<SourceAllowlist>) -> Result<Self, reqwest::Error> {
637        let mut clients = HashMap::with_capacity(allowlists.len());
638        let mut allowlist_map = HashMap::with_capacity(allowlists.len());
639        for entry in allowlists {
640            let source = entry.source.clone();
641            // Keep the *same* allowlist value both inside the redirect
642            // closure (via `build_client`) and queryable on the client
643            // (issue #145 pre-fetch check). `build_client` takes the
644            // allowlist by value, so clone once for the side table first.
645            allowlist_map.insert(source.clone(), entry.clone());
646            let client = build_client(entry)?;
647            clients.insert(source, client);
648        }
649        Ok(Self {
650            clients: Arc::new(clients),
651            allowlists: Arc::new(allowlist_map),
652        })
653    }
654
655    /// The [`SourceAllowlist`] this client was built with for `source`, or
656    /// `None` if `source` was not registered.
657    ///
658    /// This is the *identical* value captured by the per-source redirect
659    /// closure (see [`HttpClient`]'s `allowlists` field doc). It exists so
660    /// the orchestrator can apply the `docs/REDIRECT_ALLOWLIST.md` §1
661    /// pre-fetch host check on a metadata-discovered OA URL — the URL that
662    /// is fetched *without* necessarily passing through a redirect hop —
663    /// using the same source of truth the redirect closure uses, so the two
664    /// can never disagree. Callers MUST use this for the `"oa-publisher"`
665    /// leg only; the initial template-constructed URL is exempt per
666    /// `docs/REDIRECT_ALLOWLIST.md` §6.
667    pub fn source_allowlist(&self, source: &str) -> Option<&SourceAllowlist> {
668        self.allowlists.get(source)
669    }
670
671    /// Fetch a URL, treating it as a JSON or text body. Caps at
672    /// [`PDF_MAX_BYTES`].
673    ///
674    /// Returns the response body bytes plus the effective final URL after
675    /// redirects (post-allowlist verification — every hop has already been
676    /// validated by the time this returns).
677    ///
678    /// # Errors
679    ///
680    /// Any [`HttpError`] variant.
681    pub async fn fetch_bytes(&self, source: &str, url: Url) -> Result<(Bytes, Url), HttpError> {
682        self.fetch_inner(source, url, &[], false).await
683    }
684
685    /// Like [`Self::fetch_bytes`] but attaches additional request
686    /// headers to the outgoing GET. The headers are validated up-front
687    /// against the visible-ASCII subset (RFC 7230 §3.2); any failure
688    /// returns [`HttpError::InvalidHeader`] before the request is sent.
689    ///
690    /// Used by Tier-3 TDM sources that authenticate via a header
691    /// (APS Harvest `X-API-Key`, Elsevier ScienceDirect `X-ELS-APIKey`).
692    /// Header values appear on the wire only — they are never logged.
693    ///
694    /// # Errors
695    ///
696    /// Any [`HttpError`] variant including [`HttpError::InvalidHeader`].
697    pub async fn fetch_bytes_with_headers(
698        &self,
699        source: &str,
700        url: Url,
701        headers: &[(&str, &str)],
702    ) -> Result<(Bytes, Url), HttpError> {
703        self.fetch_inner(source, url, headers, false).await
704    }
705
706    /// Fetch a URL expected to be a PDF. Same as [`Self::fetch_bytes`] plus
707    /// the magic-byte check on the first 5 bytes
708    /// (`%PDF-` = `[0x25, 0x50, 0x44, 0x46, 0x2D]`). Mismatch returns
709    /// [`HttpError::NotAPdf`].
710    ///
711    /// # Errors
712    ///
713    /// Any [`HttpError`] variant including [`HttpError::NotAPdf`].
714    pub async fn fetch_pdf(&self, source: &str, url: Url) -> Result<(Bytes, Url), HttpError> {
715        self.fetch_inner(source, url, &[], true).await
716    }
717
718    async fn fetch_inner(
719        &self,
720        source: &str,
721        url: Url,
722        headers: &[(&str, &str)],
723        check_pdf_magic: bool,
724    ) -> Result<(Bytes, Url), HttpError> {
725        let client = self
726            .clients
727            .get(source)
728            .ok_or_else(|| HttpError::UnknownSource {
729                source_key: source.to_string(),
730            })?;
731
732        // Parse headers up-front so an invalid name/value fails BEFORE
733        // we touch the network. `HeaderName::from_bytes` / `HeaderValue::from_str`
734        // accept the visible-ASCII subset only (RFC 7230 §3.2).
735        let mut header_map = reqwest::header::HeaderMap::with_capacity(headers.len());
736        for (name, value) in headers {
737            let hn = reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|_| {
738                HttpError::InvalidHeader {
739                    name: (*name).to_string(),
740                    reason: "name".to_string(),
741                }
742            })?;
743            let hv = reqwest::header::HeaderValue::from_str(value).map_err(|_| {
744                HttpError::InvalidHeader {
745                    name: (*name).to_string(),
746                    reason: "value".to_string(),
747                }
748            })?;
749            header_map.insert(hn, hv);
750        }
751
752        // Bounded retry loop (issue #117). Only transient classes are
753        // retried — connect/timeout/mid-stream network errors and the
754        // transient HTTP status set. Allowlist denials, NotAPdf,
755        // OversizedBody, 4xx (non-408/429) are deterministic and return
756        // on the first occurrence. GET is idempotent so a retried
757        // attempt re-streams the body from scratch.
758        let mut attempt: u32 = 0;
759        loop {
760            let send_result = client
761                .get(url.clone())
762                .headers(header_map.clone())
763                .send()
764                .await;
765            let response = match send_result {
766                Ok(r) => r,
767                Err(e) => {
768                    if attempt < MAX_FETCH_RETRIES && reqwest_is_transient(&e) {
769                        let d = backoff_delay(attempt);
770                        tracing::warn!(
771                            source,
772                            attempt,
773                            delay_ms = d.as_millis() as u64,
774                            error = %e,
775                            "transient send failure; retrying"
776                        );
777                        tokio::time::sleep(d).await;
778                        attempt += 1;
779                        continue;
780                    }
781                    return Err(HttpError::Network(e));
782                }
783            };
784            let final_url = response.url().clone();
785
786            // Status check before body read so we can fail fast.
787            let status = response.status();
788            if !status.is_success() {
789                let code = status.as_u16();
790                if attempt < MAX_FETCH_RETRIES && is_transient_status(code) {
791                    // Prefer the server's `Retry-After` over our backoff
792                    // when present (429/503 commonly carry it).
793                    let d = parse_retry_after(response.headers())
794                        .unwrap_or_else(|| backoff_delay(attempt));
795                    tracing::warn!(
796                        source,
797                        attempt,
798                        status = code,
799                        delay_ms = d.as_millis() as u64,
800                        "transient HTTP status; retrying"
801                    );
802                    tokio::time::sleep(d).await;
803                    attempt += 1;
804                    continue;
805                }
806                return Err(HttpError::HttpStatus {
807                    status: code,
808                    // Issue #146: Springer Nature authenticates via an
809                    // `api_key` URL query parameter (no header path
810                    // upstream). This error string is logged and may
811                    // surface to the user, so strip any `api_key`
812                    // value before it leaves the client. No other
813                    // source puts a secret in the query string, so
814                    // this is a no-op for them.
815                    url: redact_api_key_query(&final_url),
816                });
817            }
818
819            // Content-Length fast-path: if header is present and exceeds
820            // the cap, fail without reading any body (deterministic — not
821            // retried). Per `docs/SECURITY.md` §1.2.
822            if let Some(len) = response.content_length() {
823                if len > PDF_MAX_BYTES {
824                    return Err(HttpError::OversizedBody {
825                        actual: len,
826                        cap: PDF_MAX_BYTES,
827                    });
828                }
829            }
830
831            // Stream body and enforce the cap as bytes accumulate. A
832            // mid-stream transport error is transient (retry); an
833            // oversized body is deterministic (return).
834            let mut buf = BytesMut::new();
835            let mut stream = response.bytes_stream();
836            let mut oversized_at: Option<u64> = None;
837            let mut stream_err: Option<reqwest::Error> = None;
838            while let Some(chunk) = stream.next().await {
839                let chunk = match chunk {
840                    Ok(c) => c,
841                    Err(e) => {
842                        stream_err = Some(e);
843                        break;
844                    }
845                };
846                let projected = (buf.len() as u64).saturating_add(chunk.len() as u64);
847                if projected > PDF_MAX_BYTES {
848                    oversized_at = Some(projected);
849                    break;
850                }
851                buf.extend_from_slice(&chunk);
852            }
853            if let Some(actual) = oversized_at {
854                return Err(HttpError::OversizedBody {
855                    actual,
856                    cap: PDF_MAX_BYTES,
857                });
858            }
859            if let Some(e) = stream_err {
860                if attempt < MAX_FETCH_RETRIES && reqwest_is_transient(&e) {
861                    let d = backoff_delay(attempt);
862                    tracing::warn!(
863                        source,
864                        attempt,
865                        delay_ms = d.as_millis() as u64,
866                        error = %e,
867                        "transient mid-stream failure; retrying"
868                    );
869                    tokio::time::sleep(d).await;
870                    attempt += 1;
871                    continue;
872                }
873                return Err(HttpError::Network(e));
874            }
875            let body = buf.freeze();
876
877            if check_pdf_magic {
878                let mut got = [0u8; 5];
879                let n = body.len().min(5);
880                got[..n].copy_from_slice(&body[..n]);
881                if got != PDF_MAGIC {
882                    return Err(HttpError::NotAPdf { got });
883                }
884            }
885
886            return Ok((body, final_url));
887        }
888    }
889}
890
891/// Return `url` rendered as a string with the value of any `api_key`
892/// query parameter replaced by `REDACTED` (issue #146).
893///
894/// Springer Nature's TDM API authenticates **only** via an `api_key`
895/// query parameter — there is no header-auth path upstream — so the key
896/// is unavoidably in the request URL. This keeps it out of *our* log
897/// and error sinks (the `HttpError::HttpStatus` string in particular,
898/// which is `tracing`-logged and can surface to the user). It is a
899/// structural no-op for every other source, none of which carry a
900/// secret in the query string. Other pairs and their order are
901/// preserved; a URL with no `api_key` pair is rendered unchanged.
902fn redact_api_key_query(url: &url::Url) -> String {
903    const API_KEY_PARAM: &str = "api_key";
904    if url.query_pairs().all(|(k, _)| k != API_KEY_PARAM) {
905        return url.to_string();
906    }
907    let mut redacted = url.clone();
908    let pairs: Vec<(String, String)> = url
909        .query_pairs()
910        .map(|(k, v)| {
911            if k == API_KEY_PARAM {
912                (k.into_owned(), "REDACTED".to_string())
913            } else {
914                (k.into_owned(), v.into_owned())
915            }
916        })
917        .collect();
918    redacted.query_pairs_mut().clear().extend_pairs(pairs);
919    redacted.to_string()
920}
921
922/// Test-oriented [`HttpClient`] constructor. Originally `cfg(test)`; now
923/// also reachable from the `doiget-cli` orchestrator's integration tests
924/// (which live outside this crate and therefore cannot see `cfg(test)`-gated
925/// items). The constructor name retains its `for_tests_allow_http` signal —
926/// production code MUST use [`HttpClient::new`] with [`tier_1_allowlist`].
927#[allow(clippy::expect_used)]
928impl HttpClient {
929    /// Build a test-oriented `HttpClient` against an `http://` wiremock
930    /// origin. The redirect closure still rejects insecure schemes — we only
931    /// relax `https_only` at the connection level so wiremock can serve.
932    /// This is acceptable because the redirect closure (which is the
933    /// security-load-bearing path) is exercised by the
934    /// `redirect_to_http_is_rejected_by_closure` test below.
935    ///
936    /// Production callers MUST use [`HttpClient::new`] with
937    /// [`tier_1_allowlist`] — the `for_tests_allow_http` suffix is the load-
938    /// bearing signal that this constructor lifts the initial-leg HTTPS-only
939    /// requirement.
940    pub fn new_for_tests_allow_http(source: &str, allowlist_host: &str) -> Self {
941        let allowlist = SourceAllowlist::new(source, vec![allowlist_host.to_string()]);
942        let client = build_client_allow_http(allowlist.clone()).expect("test client builds");
943        let mut map = HashMap::new();
944        let mut allowlist_map = HashMap::new();
945        allowlist_map.insert(allowlist.source.clone(), allowlist.clone());
946        map.insert(allowlist.source.clone(), client);
947        Self {
948            clients: Arc::new(map),
949            allowlists: Arc::new(allowlist_map),
950        }
951    }
952
953    /// Multi-source variant of [`HttpClient::new_for_tests_allow_http`].
954    ///
955    /// Builds a relaxed-`https_only` client per `(source, allowlist_host)`
956    /// pair. Used by the `doiget-cli` orchestrator's integration tests when
957    /// more than one upstream needs to be wiremocked simultaneously
958    /// (e.g. Crossref + Unpaywall against two different mock servers).
959    /// Production callers MUST use [`HttpClient::new`] with
960    /// [`tier_1_allowlist`].
961    pub fn new_for_tests_allow_http_multi(entries: &[(&str, &str)]) -> Self {
962        let mut map = HashMap::with_capacity(entries.len());
963        let mut allowlist_map = HashMap::with_capacity(entries.len());
964        for (source, host) in entries {
965            let allowlist = SourceAllowlist::new(*source, vec![host.to_string()]);
966            let client = build_client_allow_http(allowlist.clone()).expect("test client builds");
967            allowlist_map.insert(allowlist.source.clone(), allowlist.clone());
968            map.insert(allowlist.source.clone(), client);
969        }
970        Self {
971            clients: Arc::new(map),
972            allowlists: Arc::new(allowlist_map),
973        }
974    }
975}
976
977fn build_client_allow_http(allowlist: SourceAllowlist) -> Result<Client, reqwest::Error> {
978    ensure_crypto_provider();
979    let allowlist_for_closure = allowlist.clone();
980    let redirect_policy = Policy::custom(move |attempt| {
981        let scheme = attempt.url().scheme().to_string();
982        let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
983        let prev_count = attempt.previous().len();
984        if scheme != "https" {
985            return attempt.error(HttpError::InsecureRedirect { scheme });
986        }
987        if prev_count >= MAX_REDIRECTS {
988            return attempt.stop();
989        }
990        let host = match host_opt {
991            Some(h) => h,
992            None => {
993                return attempt.error(HttpError::RedirectDenied {
994                    source_key: allowlist_for_closure.source.clone(),
995                    host: String::new(),
996                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
997                });
998            }
999        };
1000        if !allowlist_for_closure.matches(&host) {
1001            return attempt.error(HttpError::RedirectDenied {
1002                source_key: allowlist_for_closure.source.clone(),
1003                host,
1004                expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1005            });
1006        }
1007        attempt.follow()
1008    });
1009    ClientBuilder::new()
1010        // `https_only(false)` only at this scope — production builders
1011        // (the public `HttpClient::new`) keep it on.
1012        .https_only(false)
1013        .redirect(redirect_policy)
1014        .connect_timeout(CONNECT_TIMEOUT)
1015        .timeout(TOTAL_TIMEOUT)
1016        .read_timeout(READ_TIMEOUT)
1017        .user_agent(format!(
1018            "doiget/{} (+https://github.com/sotashimozono/doiget)",
1019            VERSION
1020        ))
1021        .tls_backend_rustls()
1022        .build()
1023}
1024
1025// ---------------------------------------------------------------------------
1026// ClientBuilder helpers
1027// ---------------------------------------------------------------------------
1028
1029/// Install the `ring` `rustls` crypto provider as the process default,
1030/// exactly once.
1031///
1032/// reqwest is built with the `rustls-no-provider` feature (ADR-0020
1033/// Amendment 1: drop aws-lc-rs so `cargo install` needs no cmake/C
1034/// toolchain and musl-static builds cleanly). With no bundled provider,
1035/// `reqwest::ClientBuilder::build` calls
1036/// `rustls::crypto::CryptoProvider::get_default()` and **panics**
1037/// (`"No provider set"`) unless a process-default provider was installed
1038/// first. Every client constructor below calls this; the `Once` makes it
1039/// safe to invoke from many sites and from concurrent tests.
1040fn ensure_crypto_provider() {
1041    static INIT: Once = Once::new();
1042    INIT.call_once(|| {
1043        // `install_default` errors only if a provider is already set;
1044        // under `Once` that is unreachable, but ignore it rather than
1045        // panic (another linked crate could have installed one first).
1046        let _ = rustls::crypto::ring::default_provider().install_default();
1047    });
1048}
1049
1050fn build_client(allowlist: SourceAllowlist) -> Result<Client, reqwest::Error> {
1051    ensure_crypto_provider();
1052
1053    let user_agent = format!(
1054        "doiget/{} (+https://github.com/sotashimozono/doiget)",
1055        VERSION
1056    );
1057
1058    // Redirect policy: capture the per-source allowlist by value. The
1059    // closure is called for every redirect hop — there is no global
1060    // fallback, every hop is checked. Hard cap at MAX_REDIRECTS via the
1061    // attempt counter (mirrors reqwest's built-in limit).
1062    let allowlist_for_closure = allowlist.clone();
1063    let redirect_policy = Policy::custom(move |attempt| {
1064        // Inspect the candidate URL via owned copies so we can move
1065        // `attempt` into `error()` / `follow()` / `stop()` later without
1066        // the borrow checker complaining about an outstanding borrow of
1067        // `attempt`.
1068        let scheme = attempt.url().scheme().to_string();
1069        let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1070        let prev_count = attempt.previous().len();
1071
1072        // 1. Reject non-HTTPS up front. The `https_only(true)` builder
1073        //    flag below also catches this, but we want the dedicated
1074        //    `InsecureRedirect` error path (not a generic `https_only`
1075        //    abort) — see `docs/SECURITY.md` §1.3.
1076        if scheme != "https" {
1077            return attempt.error(HttpError::InsecureRedirect { scheme });
1078        }
1079
1080        // 2. Hop limit (`docs/SECURITY.md` §1.3 redirect_limit row).
1081        if prev_count >= MAX_REDIRECTS {
1082            return attempt.stop();
1083        }
1084
1085        // 3. Allowlist check on the candidate target host.
1086        //    `host_str()` is `None` for URLs without a host (e.g. data
1087        //    URIs); treat that as an allowlist miss.
1088        let host = match host_opt {
1089            Some(h) => h,
1090            None => {
1091                return attempt.error(HttpError::RedirectDenied {
1092                    source_key: allowlist_for_closure.source.clone(),
1093                    host: String::new(),
1094                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1095                });
1096            }
1097        };
1098        if !allowlist_for_closure.matches(&host) {
1099            return attempt.error(HttpError::RedirectDenied {
1100                source_key: allowlist_for_closure.source.clone(),
1101                host,
1102                expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1103            });
1104        }
1105
1106        attempt.follow()
1107    });
1108
1109    ClientBuilder::new()
1110        .https_only(true)
1111        .redirect(redirect_policy)
1112        .connect_timeout(CONNECT_TIMEOUT)
1113        .timeout(TOTAL_TIMEOUT)
1114        .read_timeout(READ_TIMEOUT)
1115        .user_agent(user_agent)
1116        // `tls_backend_rustls()` is the non-deprecated equivalent of the
1117        // older `use_rustls_tls()`. The workspace pins reqwest with
1118        // `rustls-no-provider` (ADR-0020 Amendment 1), so this is a
1119        // re-assertion at builder level rather than a feature switch; the
1120        // `ring` provider installed by `ensure_crypto_provider()` above
1121        // is what reqwest picks up via `CryptoProvider::get_default()`.
1122        .tls_backend_rustls()
1123        .build()
1124}
1125
1126// ---------------------------------------------------------------------------
1127// Tests
1128// ---------------------------------------------------------------------------
1129
1130#[cfg(test)]
1131#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1132mod tests {
1133    use super::*;
1134    use wiremock::matchers::{method, path};
1135    use wiremock::{Mock, MockServer, ResponseTemplate};
1136
1137    // ---------------------------------------------------------------
1138    // Allowlist matching — pure unit tests, no network.
1139    // ---------------------------------------------------------------
1140
1141    #[test]
1142    fn tier_1_allowlist_includes_crossref() {
1143        let lists = tier_1_allowlist();
1144        let crossref = lists
1145            .iter()
1146            .find(|a| a.source == "crossref")
1147            .expect("crossref entry");
1148        assert!(
1149            crossref
1150                .redirect_hosts
1151                .iter()
1152                .any(|h| h.contains("crossref.org")),
1153            "crossref allowlist must contain a crossref.org pattern; got {:?}",
1154            crossref.redirect_hosts,
1155        );
1156    }
1157
1158    #[test]
1159    fn tier_1_allowlist_includes_unpaywall_and_arxiv() {
1160        let lists = tier_1_allowlist();
1161        assert!(lists.iter().any(|a| a.source == "unpaywall"));
1162        assert!(lists.iter().any(|a| a.source == "arxiv"));
1163    }
1164
1165    #[test]
1166    fn oa_publisher_allowlist_groups_under_one_synthetic_source() {
1167        // The OA-publisher fan-out from Unpaywall's `best_oa_location.url`
1168        // is keyed under a single synthetic `"oa-publisher"` source so the
1169        // orchestrator can pass that one source key to
1170        // `HttpClient::fetch_pdf`. See `docs/REDIRECT_ALLOWLIST.md` §3 (the
1171        // informed-best-effort note) and the function-level docs in
1172        // [`oa_publisher_allowlist`].
1173        let lists = oa_publisher_allowlist();
1174        assert_eq!(lists.len(), 1, "exactly one synthetic source entry");
1175        assert_eq!(lists[0].source, "oa-publisher");
1176    }
1177
1178    #[test]
1179    fn oa_publisher_allowlist_matches_known_oa_hosts() {
1180        let lists = oa_publisher_allowlist();
1181        let oa = lists
1182            .iter()
1183            .find(|a| a.source == "oa-publisher")
1184            .expect("oa-publisher entry");
1185        // Spot-check a representative entry per host family.
1186        assert!(oa.matches("link.springer.com"));
1187        assert!(oa.matches("nature.com"));
1188        assert!(oa.matches("onlinelibrary.wiley.com"));
1189        assert!(oa.matches("www.frontiersin.org"));
1190        assert!(oa.matches("www.mdpi.com"));
1191        assert!(oa.matches("journals.plos.org"));
1192        assert!(oa.matches("www.biorxiv.org"));
1193        assert!(oa.matches("europepmc.org"));
1194        assert!(oa.matches("www.ncbi.nlm.nih.gov"));
1195        assert!(oa.matches("arxiv.org"));
1196        // #193: physics-society / diamond-OA hosts (empirically observed
1197        // as Unpaywall best_oa_location targets in the dogfood run).
1198        assert!(oa.matches("link.aps.org"));
1199        assert!(oa.matches("journals.aps.org"));
1200        assert!(oa.matches("scipost.org"));
1201        assert!(oa.matches("www.scipost.org"));
1202        assert!(oa.matches("iopscience.iop.org"));
1203        // Document intent of the `*.<suffix>` form: per
1204        // `REDIRECT_ALLOWLIST.md` §2.2 rule 3 it matches the bare
1205        // registrable domain AND any subdomain. Unpaywall has not been
1206        // observed returning bare-domain PDF URLs for these publishers,
1207        // but accepting them is consistent with every other `*.` entry in
1208        // this list (e.g. `arxiv.org` matched by `*.arxiv.org`) and is
1209        // what the matching rule already implements.
1210        assert!(oa.matches("aps.org"));
1211        assert!(oa.matches("iop.org"));
1212        // Multi-level subdomains also match (e.g. SciPost's deep paths);
1213        // documents the wildcard scope rather than testing a known URL.
1214        assert!(oa.matches("submissions.scipost.org"));
1215        // Negative: an attacker host is not covered.
1216        assert!(!oa.matches("attacker.test"));
1217        // Negative: dot-boundary safety for the new entries — a different
1218        // suffix that merely ends with the registrable name must NOT match.
1219        assert!(!oa.matches("notaps.org"));
1220        assert!(!oa.matches("evilscipost.org"));
1221        assert!(!oa.matches("notiop.org"));
1222        // Negative: dot-boundary safety — `*.springer.com` must not match
1223        // `notspringer.com`.
1224        assert!(!oa.matches("notspringer.com"));
1225    }
1226
1227    #[test]
1228    fn allowlist_matches_exact_fqdn() {
1229        let a = SourceAllowlist::new("crossref", vec!["api.crossref.org".to_string()]);
1230        assert!(a.matches("api.crossref.org"));
1231        assert!(!a.matches("crossref.org"));
1232        assert!(!a.matches("xapi.crossref.org"));
1233    }
1234
1235    #[test]
1236    fn allowlist_matches_subdomain_glob() {
1237        // Per docs/REDIRECT_ALLOWLIST.md §2.2 rule 3: `*.<suffix>`
1238        // matches both `<suffix>` itself AND any `*.<suffix>` subdomain,
1239        // but never matches a different suffix that happens to end with
1240        // `<suffix>` without a dot boundary.
1241        let a = SourceAllowlist::new("crossref", vec!["*.crossref.org".to_string()]);
1242        assert!(a.matches("doi.crossref.org"));
1243        assert!(a.matches("crossref.org"));
1244        assert!(!a.matches("notcrossref.org"));
1245        assert!(!a.matches("crossref.org.attacker.test"));
1246    }
1247
1248    #[test]
1249    fn allowlist_matches_is_case_insensitive() {
1250        let a = SourceAllowlist::new("crossref", vec!["API.crossref.ORG".to_string()]);
1251        assert!(a.matches("api.crossref.org"));
1252        assert!(a.matches("API.CROSSREF.ORG"));
1253    }
1254
1255    #[test]
1256    fn allowlist_with_no_redirect_hosts_matches_nothing() {
1257        // §2.2 rule 5: an empty `redirect_hosts` means "no redirects
1258        // permitted from this source".
1259        let a = SourceAllowlist::new("ghost", Vec::<String>::new());
1260        assert!(!a.matches("anything.test"));
1261        assert!(!a.matches(""));
1262    }
1263
1264    // ---------------------------------------------------------------
1265    // PDF magic-byte handling — tests on the body-parsing path. We
1266    // exercise the magic-byte branch via the public API against a
1267    // wiremock server so the assertion runs through the full
1268    // streaming codepath.
1269    // ---------------------------------------------------------------
1270
1271    /// Build a test-only `HttpClient` against an `http://` wiremock
1272    /// origin.
1273    ///
1274    /// Slice 5 (PR #84 advisory item A4 refactor): this helper now
1275    /// delegates to the public
1276    /// [`HttpClient::new_for_tests_allow_http`] constructor (defined
1277    /// just above the test module) instead of re-implementing the
1278    /// redirect-policy + `https_only(false)` builder. The two
1279    /// implementations had drifted into duplicates — keeping a private
1280    /// re-implementation only meant a future security tweak to the
1281    /// builder would silently leave the tests on a stale path.
1282    fn build_test_client_for_http(source: &str, allowlist_host: &str) -> HttpClient {
1283        HttpClient::new_for_tests_allow_http(source, allowlist_host)
1284    }
1285
1286    #[tokio::test]
1287    async fn pdf_magic_byte_match_succeeds() {
1288        let server = MockServer::start().await;
1289        let body = b"%PDF-1.7\n...some pdf bytes...".to_vec();
1290        Mock::given(method("GET"))
1291            .and(path("/paper.pdf"))
1292            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
1293            .mount(&server)
1294            .await;
1295        let host = server
1296            .uri()
1297            .parse::<Url>()
1298            .unwrap()
1299            .host_str()
1300            .unwrap()
1301            .to_string();
1302        let client = build_test_client_for_http("crossref", &host);
1303        let url: Url = format!("{}/paper.pdf", server.uri()).parse().unwrap();
1304        let (got_body, _final_url) = client.fetch_pdf("crossref", url).await.expect("ok");
1305        assert_eq!(&got_body[..], &body[..]);
1306    }
1307
1308    #[tokio::test]
1309    async fn pdf_magic_byte_mismatch_rejects() {
1310        let server = MockServer::start().await;
1311        Mock::given(method("GET"))
1312            .and(path("/not_a_pdf"))
1313            .respond_with(
1314                ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
1315            )
1316            .mount(&server)
1317            .await;
1318        let host = server
1319            .uri()
1320            .parse::<Url>()
1321            .unwrap()
1322            .host_str()
1323            .unwrap()
1324            .to_string();
1325        let client = build_test_client_for_http("crossref", &host);
1326        let url: Url = format!("{}/not_a_pdf", server.uri()).parse().unwrap();
1327        let err = client
1328            .fetch_pdf("crossref", url)
1329            .await
1330            .expect_err("not pdf");
1331        match err {
1332            HttpError::NotAPdf { got } => {
1333                assert_eq!(&got, b"<html");
1334            }
1335            other => panic!("expected NotAPdf, got {:?}", other),
1336        }
1337    }
1338
1339    #[tokio::test]
1340    async fn fetch_bytes_does_not_check_pdf_magic() {
1341        // The non-PDF path returns the body unchanged regardless of
1342        // magic bytes. This pins the boundary between the JSON/text
1343        // path and the PDF path.
1344        let server = MockServer::start().await;
1345        Mock::given(method("GET"))
1346            .and(path("/data.json"))
1347            .respond_with(
1348                ResponseTemplate::new(200).set_body_bytes(br#"{"hello":"world"}"#.to_vec()),
1349            )
1350            .mount(&server)
1351            .await;
1352        let host = server
1353            .uri()
1354            .parse::<Url>()
1355            .unwrap()
1356            .host_str()
1357            .unwrap()
1358            .to_string();
1359        let client = build_test_client_for_http("crossref", &host);
1360        let url: Url = format!("{}/data.json", server.uri()).parse().unwrap();
1361        let (body, _final_url) = client.fetch_bytes("crossref", url).await.expect("ok");
1362        assert_eq!(&body[..], br#"{"hello":"world"}"#);
1363    }
1364
1365    #[tokio::test]
1366    async fn oversized_body_via_content_length_short_circuits() {
1367        // Wiremock can advertise a `Content-Length` larger than the body
1368        // it actually serves; hyper accepts the mismatch and our
1369        // fast-path check fires before any body bytes are consumed.
1370        let server = MockServer::start().await;
1371        let oversized = PDF_MAX_BYTES + 1;
1372        Mock::given(method("GET"))
1373            .and(path("/huge"))
1374            .respond_with(
1375                ResponseTemplate::new(200)
1376                    .insert_header("content-length", oversized.to_string().as_str())
1377                    .set_body_bytes(b"%PDF-".to_vec()),
1378            )
1379            .mount(&server)
1380            .await;
1381        let host = server
1382            .uri()
1383            .parse::<Url>()
1384            .unwrap()
1385            .host_str()
1386            .unwrap()
1387            .to_string();
1388        let client = build_test_client_for_http("crossref", &host);
1389        let url: Url = format!("{}/huge", server.uri()).parse().unwrap();
1390        let err = client
1391            .fetch_bytes("crossref", url)
1392            .await
1393            .expect_err("should reject");
1394        match err {
1395            HttpError::OversizedBody { actual, cap } => {
1396                assert!(actual > cap, "actual {} should exceed cap {}", actual, cap);
1397                assert_eq!(cap, PDF_MAX_BYTES);
1398            }
1399            // The mismatched Content-Length may also trip an underlying
1400            // transport error before our fast-path runs. Either outcome
1401            // satisfies the security goal (the transfer was aborted
1402            // without buffering 100 GB), so accept Network here as a
1403            // wiremock idiosyncrasy rather than a contract relaxation.
1404            HttpError::Network(_) => {}
1405            other => panic!("expected OversizedBody or Network, got {:?}", other),
1406        }
1407    }
1408
1409    #[tokio::test]
1410    async fn unknown_source_rejected() {
1411        let client = HttpClient::new(tier_1_allowlist()).expect("client builds");
1412        let url: Url = "https://api.crossref.org/works/10.1234/x".parse().unwrap();
1413        let err = client
1414            .fetch_bytes("not-a-source", url)
1415            .await
1416            .expect_err("unknown source");
1417        match err {
1418            HttpError::UnknownSource { source_key } => {
1419                assert_eq!(source_key, "not-a-source")
1420            }
1421            other => panic!("expected UnknownSource, got {:?}", other),
1422        }
1423    }
1424
1425    #[tokio::test]
1426    async fn http_status_error_surfaces() {
1427        let server = MockServer::start().await;
1428        Mock::given(method("GET"))
1429            .and(path("/missing"))
1430            .respond_with(ResponseTemplate::new(404))
1431            .mount(&server)
1432            .await;
1433        let host = server
1434            .uri()
1435            .parse::<Url>()
1436            .unwrap()
1437            .host_str()
1438            .unwrap()
1439            .to_string();
1440        let client = build_test_client_for_http("crossref", &host);
1441        let url: Url = format!("{}/missing", server.uri()).parse().unwrap();
1442        let err = client.fetch_bytes("crossref", url).await.expect_err("404");
1443        match err {
1444            HttpError::HttpStatus { status, .. } => assert_eq!(status, 404),
1445            other => panic!("expected HttpStatus, got {:?}", other),
1446        }
1447    }
1448
1449    // ---------------------------------------------------------------
1450    // Redirect policy tests — drive the closure via wiremock 30x
1451    // responses pointing at insecure / off-allowlist targets. With
1452    // `https_only(true)` on the production builder the request never
1453    // leaves the initial leg — we run these against the test builder
1454    // (which relaxes `https_only` for the *initial* leg only) so the
1455    // redirect closure is reached and exercised.
1456    // ---------------------------------------------------------------
1457
1458    #[tokio::test]
1459    async fn redirect_to_http_is_rejected_by_closure() {
1460        let server = MockServer::start().await;
1461        Mock::given(method("GET"))
1462            .and(path("/redir"))
1463            .respond_with(
1464                ResponseTemplate::new(302).insert_header("location", "http://attacker.test/file"),
1465            )
1466            .mount(&server)
1467            .await;
1468        let host = server
1469            .uri()
1470            .parse::<Url>()
1471            .unwrap()
1472            .host_str()
1473            .unwrap()
1474            .to_string();
1475        let client = build_test_client_for_http("crossref", &host);
1476        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1477        let err = client
1478            .fetch_bytes("crossref", url)
1479            .await
1480            .expect_err("redirect to http rejected");
1481        match err {
1482            HttpError::Network(e) => {
1483                let msg = format!("{:?}", e);
1484                assert!(
1485                    msg.contains("InsecureRedirect") || msg.contains("non-HTTPS"),
1486                    "expected insecure-redirect signal in error chain, got {}",
1487                    msg
1488                );
1489            }
1490            other => panic!("expected Network(InsecureRedirect), got {:?}", other),
1491        }
1492    }
1493
1494    #[tokio::test]
1495    async fn redirect_outside_allowlist_is_rejected_by_closure() {
1496        let server = MockServer::start().await;
1497        Mock::given(method("GET"))
1498            .and(path("/redir"))
1499            .respond_with(
1500                ResponseTemplate::new(302).insert_header("location", "https://attacker.test/file"),
1501            )
1502            .mount(&server)
1503            .await;
1504        let host = server
1505            .uri()
1506            .parse::<Url>()
1507            .unwrap()
1508            .host_str()
1509            .unwrap()
1510            .to_string();
1511        let client = build_test_client_for_http("crossref", &host);
1512        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1513        let err = client
1514            .fetch_bytes("crossref", url)
1515            .await
1516            .expect_err("redirect to attacker rejected");
1517        match err {
1518            HttpError::Network(e) => {
1519                let msg = format!("{:?}", e);
1520                assert!(
1521                    msg.contains("RedirectDenied") || msg.contains("not in allowlist"),
1522                    "expected redirect-denied signal in error chain, got {}",
1523                    msg
1524                );
1525            }
1526            other => panic!("expected Network(RedirectDenied), got {:?}", other),
1527        }
1528    }
1529
1530    #[tokio::test]
1531    async fn redirect_to_allowlisted_https_host_is_followed_by_closure() {
1532        // 302 to an https host that IS in the allowlist. The redirect
1533        // dispatch will fail (DNS won't resolve `mirror.allowed.test`)
1534        // but the closure must NOT short-circuit — failure mode is a
1535        // transport error, not InsecureRedirect / RedirectDenied.
1536        let server = MockServer::start().await;
1537        Mock::given(method("GET"))
1538            .and(path("/redir"))
1539            .respond_with(
1540                ResponseTemplate::new(302)
1541                    .insert_header("location", "https://mirror.allowed.test/file"),
1542            )
1543            .mount(&server)
1544            .await;
1545        let initial_host = server
1546            .uri()
1547            .parse::<Url>()
1548            .unwrap()
1549            .host_str()
1550            .unwrap()
1551            .to_string();
1552        // Allow the initial host AND the redirect target host.
1553        let allowlist = SourceAllowlist::new(
1554            "crossref",
1555            vec![initial_host.clone(), "*.allowed.test".to_string()],
1556        );
1557        let allowlist_for_closure = allowlist.clone();
1558        let policy = Policy::custom(move |attempt| {
1559            let scheme = attempt.url().scheme().to_string();
1560            let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1561            if scheme != "https" {
1562                return attempt.error(HttpError::InsecureRedirect { scheme });
1563            }
1564            let h = match host_opt {
1565                Some(h) => h,
1566                None => {
1567                    return attempt.error(HttpError::RedirectDenied {
1568                        source_key: allowlist_for_closure.source.clone(),
1569                        host: String::new(),
1570                        expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1571                    });
1572                }
1573            };
1574            if !allowlist_for_closure.matches(&h) {
1575                return attempt.error(HttpError::RedirectDenied {
1576                    source_key: allowlist_for_closure.source.clone(),
1577                    host: h,
1578                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1579                });
1580            }
1581            attempt.follow()
1582        });
1583        ensure_crypto_provider();
1584        let raw_client = ClientBuilder::new()
1585            .https_only(false)
1586            .redirect(policy)
1587            .connect_timeout(CONNECT_TIMEOUT)
1588            .timeout(Duration::from_secs(5))
1589            .user_agent("doiget/test")
1590            .tls_backend_rustls()
1591            .build()
1592            .expect("client builds");
1593        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1594        let err = raw_client.get(url).send().await.expect_err("DNS fails");
1595        // The error should NOT carry our InsecureRedirect / RedirectDenied
1596        // marker — the closure approved the redirect.
1597        let msg = format!("{:?}", err);
1598        assert!(
1599            !msg.contains("RedirectDenied") && !msg.contains("InsecureRedirect"),
1600            "closure short-circuited an allowed redirect: {}",
1601            msg,
1602        );
1603    }
1604
1605    #[test]
1606    fn http_client_clone_is_cheap() {
1607        // Sanity: cloning shares the inner Arc<HashMap<...>>.
1608        let a = HttpClient::new(tier_1_allowlist()).expect("builds");
1609        let b = a.clone();
1610        assert_eq!(a.clients.len(), b.clients.len());
1611        assert!(Arc::ptr_eq(&a.clients, &b.clients));
1612    }
1613
1614    // ---------------------------------------------------------------
1615    // HttpError -> Option<DenialContext>  (ADR-0023 §4 mapping)
1616    // ---------------------------------------------------------------
1617
1618    #[test]
1619    fn denial_from_redirect_denied_carries_attempted_and_expected() {
1620        use crate::{DenialContext, DenialReason};
1621        let e = HttpError::RedirectDenied {
1622            source_key: "crossref".to_string(),
1623            host: "evil.example.com".to_string(),
1624            expected_hosts: vec!["api.crossref.org".to_string(), "*.crossref.org".to_string()],
1625        };
1626        let dc: Option<DenialContext> = (&e).into();
1627        let dc = dc.expect("RedirectDenied -> Some(DenialContext)");
1628        assert_eq!(dc.reason, DenialReason::RedirectNotInAllowlist);
1629        assert_eq!(dc.source.as_deref(), Some("crossref"));
1630        assert_eq!(dc.attempted.as_deref(), Some("evil.example.com"));
1631        assert_eq!(
1632            dc.expected.as_deref(),
1633            Some(&["api.crossref.org".to_string(), "*.crossref.org".to_string()][..])
1634        );
1635        assert!(dc.cap.is_none());
1636        assert!(dc.actual.is_none());
1637        assert!(dc.hop_index.is_none());
1638    }
1639
1640    #[test]
1641    fn denial_from_oversized_body_carries_cap_and_actual() {
1642        use crate::{DenialContext, DenialReason};
1643        let e = HttpError::OversizedBody {
1644            actual: 209_715_200,
1645            cap: PDF_MAX_BYTES,
1646        };
1647        let dc: Option<DenialContext> = (&e).into();
1648        let dc = dc.expect("OversizedBody -> Some(DenialContext)");
1649        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
1650        assert_eq!(dc.cap, Some(PDF_MAX_BYTES));
1651        assert_eq!(dc.actual, Some(209_715_200));
1652        assert!(dc.source.is_none());
1653        assert!(dc.attempted.is_none());
1654        // OversizedBody has no allowlist channel: producer leaves
1655        // `expected` at `None` (NOT `Some(vec![])`). See the field doc on
1656        // `DenialContext::expected` for the disambiguation.
1657        assert!(dc.expected.is_none());
1658    }
1659
1660    #[test]
1661    fn denial_from_not_a_pdf_hex_encodes_got_bytes() {
1662        use crate::{DenialContext, DenialReason};
1663        // First 5 bytes of "<html" — what the magic-byte check sees when
1664        // a publisher returns an HTML interstitial instead of a PDF.
1665        let e = HttpError::NotAPdf {
1666            got: [0x3c, 0x68, 0x74, 0x6d, 0x6c],
1667        };
1668        let dc: Option<DenialContext> = (&e).into();
1669        let dc = dc.expect("NotAPdf -> Some(DenialContext)");
1670        assert_eq!(dc.reason, DenialReason::ContentTypeMismatch);
1671        assert_eq!(dc.attempted.as_deref(), Some("3c68746d6c"));
1672        assert_eq!(dc.expected.as_deref(), Some(&["%PDF-".to_string()][..]));
1673    }
1674
1675    #[test]
1676    fn denial_from_insecure_redirect_marks_insecure_scheme() {
1677        use crate::{DenialContext, DenialReason};
1678        let e = HttpError::InsecureRedirect {
1679            scheme: "http".to_string(),
1680        };
1681        let dc: Option<DenialContext> = (&e).into();
1682        let dc = dc.expect("InsecureRedirect -> Some(DenialContext)");
1683        // ADR-0023 §4 (post-incorporation review): InsecureRedirect maps
1684        // to its own dedicated `InsecureScheme` reason, not the host-
1685        // allowlist reason — they are semantically distinct denials.
1686        assert_eq!(dc.reason, DenialReason::InsecureScheme);
1687        assert_eq!(dc.attempted.as_deref(), Some("http:..."));
1688        assert_eq!(dc.expected.as_deref(), Some(&["https".to_string()][..]));
1689    }
1690
1691    #[test]
1692    fn denial_from_non_denial_variants_returns_none() {
1693        use crate::DenialContext;
1694        // Network / HttpStatus / UnknownSource are not denials; they
1695        // map to None per ADR-0023 §4.
1696        let e = HttpError::HttpStatus {
1697            status: 503,
1698            url: "https://api.crossref.org/works/x".to_string(),
1699        };
1700        let dc: Option<DenialContext> = (&e).into();
1701        assert!(dc.is_none(), "HttpStatus must not produce a DenialContext");
1702
1703        let e = HttpError::UnknownSource {
1704            source_key: "ghost".to_string(),
1705        };
1706        let dc: Option<DenialContext> = (&e).into();
1707        assert!(
1708            dc.is_none(),
1709            "UnknownSource must not produce a DenialContext"
1710        );
1711    }
1712
1713    // ---------------------------------------------------------------
1714    // Issue #117 — transient retry / backoff. Real time: wiremock
1715    // serves over real localhost IO and tokio `start_paused` is
1716    // incompatible with that (it auto-advances past reqwest's
1717    // timeout). Backoff is small enough that the slowest case
1718    // (persistent 503, 3 retries ≈ 3.5s) stays within the suite budget.
1719    // ---------------------------------------------------------------
1720
1721    fn host_of(server: &MockServer) -> String {
1722        server
1723            .uri()
1724            .parse::<Url>()
1725            .unwrap()
1726            .host_str()
1727            .unwrap()
1728            .to_string()
1729    }
1730
1731    #[tokio::test]
1732    async fn transient_503_then_200_succeeds() {
1733        let server = MockServer::start().await;
1734        // Catch-all 200 mounted first (lowest precedence); the
1735        // single-shot 503 mounted last takes precedence for the first
1736        // request only, then falls through to the 200.
1737        Mock::given(method("GET"))
1738            .and(path("/p"))
1739            .respond_with(ResponseTemplate::new(200).set_body_string(r#"{"ok":1}"#))
1740            .mount(&server)
1741            .await;
1742        Mock::given(method("GET"))
1743            .and(path("/p"))
1744            .respond_with(ResponseTemplate::new(503))
1745            .up_to_n_times(1)
1746            .mount(&server)
1747            .await;
1748
1749        let client = build_test_client_for_http("crossref", &host_of(&server));
1750        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
1751        let (body, _) = client
1752            .fetch_bytes("crossref", url)
1753            .await
1754            .expect("503-then-200 must succeed after one retry");
1755        assert_eq!(&body[..], br#"{"ok":1}"#);
1756    }
1757
1758    #[tokio::test]
1759    async fn persistent_503_exhausts_and_returns_httpstatus() {
1760        let server = MockServer::start().await;
1761        Mock::given(method("GET"))
1762            .and(path("/p"))
1763            .respond_with(ResponseTemplate::new(503))
1764            .mount(&server)
1765            .await;
1766
1767        let client = build_test_client_for_http("crossref", &host_of(&server));
1768        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
1769        let err = client
1770            .fetch_bytes("crossref", url)
1771            .await
1772            .expect_err("persistent 503 must exhaust retries");
1773        match err {
1774            HttpError::HttpStatus { status, .. } => assert_eq!(status, 503),
1775            other => panic!("expected HttpStatus 503, got {other:?}"),
1776        }
1777        // First attempt + MAX_FETCH_RETRIES retries.
1778        let reqs = server
1779            .received_requests()
1780            .await
1781            .expect("wiremock records requests");
1782        assert_eq!(reqs.len(), (MAX_FETCH_RETRIES + 1) as usize);
1783    }
1784
1785    #[tokio::test]
1786    async fn retry_after_429_then_200_succeeds() {
1787        let server = MockServer::start().await;
1788        Mock::given(method("GET"))
1789            .and(path("/p"))
1790            .respond_with(ResponseTemplate::new(200).set_body_string("ok"))
1791            .mount(&server)
1792            .await;
1793        Mock::given(method("GET"))
1794            .and(path("/p"))
1795            .respond_with(ResponseTemplate::new(429).insert_header("Retry-After", "1"))
1796            .up_to_n_times(1)
1797            .mount(&server)
1798            .await;
1799
1800        let client = build_test_client_for_http("crossref", &host_of(&server));
1801        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
1802        let (body, _) = client
1803            .fetch_bytes("crossref", url)
1804            .await
1805            .expect("429+Retry-After then 200 must succeed");
1806        assert_eq!(&body[..], b"ok");
1807    }
1808
1809    #[tokio::test]
1810    async fn permanent_404_is_not_retried() {
1811        let server = MockServer::start().await;
1812        Mock::given(method("GET"))
1813            .and(path("/p"))
1814            .respond_with(ResponseTemplate::new(404))
1815            .mount(&server)
1816            .await;
1817
1818        let client = build_test_client_for_http("crossref", &host_of(&server));
1819        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
1820        let _ = client
1821            .fetch_bytes("crossref", url)
1822            .await
1823            .expect_err("404 must fail");
1824        let reqs = server
1825            .received_requests()
1826            .await
1827            .expect("wiremock records requests");
1828        assert_eq!(reqs.len(), 1, "4xx (non-408/429) must NOT be retried");
1829    }
1830}