Skip to main content

doiget_core/
http.rs

1// allow: outbound-network
2//! Centralized HTTP client wrapper. All `Source` impls fetch through here.
3//!
4//! Security defaults per `docs/SECURITY.md`:
5//!   - rustls TLS only (no openssl, no native-tls — enforced by `deny.toml`)
6//!   - HTTPS-only redirect policy (file://, data://, http:// rejected)
7//!   - Per-source redirect host allowlist (`docs/REDIRECT_ALLOWLIST.md`)
8//!   - Body size cap ([`crate::PDF_MAX_BYTES`] = 100 MB)
9//!   - Per-request timeouts (connect 10s, read 60s, total 300s)
10//!   - PDF magic-byte check on the first 5 bytes (`%PDF-`)
11//!   - User-Agent: `doiget/<version> (+https://github.com/sotashimozono/doiget)`
12//!
13//! See `docs/SECURITY.md` §1.2-1.3 / §1.10 and `docs/REDIRECT_ALLOWLIST.md`.
14//!
15//! # Architectural note: per-source `reqwest::Client`
16//!
17//! `reqwest::redirect::Policy::custom` receives only an `Attempt` value, which
18//! exposes the next URL and previous URL chain but **not** the original
19//! request's headers. That makes the "tag the request with `X-Doiget-Source`
20//! and inspect it from inside the redirect closure" approach infeasible on
21//! `reqwest 0.13.x`. Instead, [`HttpClient`] holds one
22//! [`reqwest::Client`] per source — each client's redirect closure captures
23//! that source's [`SourceAllowlist`] so cross-source confusion is impossible
24//! by construction.
25
26use std::collections::HashMap;
27use std::sync::Arc;
28use std::time::Duration;
29
30use bytes::{Bytes, BytesMut};
31use futures_util::StreamExt;
32use reqwest::redirect::Policy;
33use reqwest::{Client, ClientBuilder, Url};
34use thiserror::Error;
35
36use crate::{PDF_MAX_BYTES, VERSION};
37
/// The five bytes every PDF file starts with: the ASCII text `%PDF-`
/// (`0x25 0x50 0x44 0x46 0x2D`), per the PDF 1.7 specification
/// (ISO 32000-1 §7.5.2).
const PDF_MAGIC: [u8; 5] = *b"%PDF-";

/// Upper bound on the length of a redirect chain. Equal to `reqwest`'s
/// own default of 10; spelled out here so it is reviewed together with
/// the other security defaults in this module instead of being inherited
/// silently from upstream.
const MAX_REDIRECTS: usize = 10;

/// Connect timeout (`docs/SECURITY.md` §1.2, Slowloris row).
const CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Read timeout, i.e. the longest tolerated idle gap between bytes
/// (`docs/SECURITY.md` §1.2).
const READ_TIMEOUT: Duration = Duration::from_secs(60);

/// Total wall-clock budget for one request (`docs/SECURITY.md` §1.2).
const TOTAL_TIMEOUT: Duration = Duration::from_secs(300);

/// Number of retries allowed AFTER the initial attempt, for transient
/// failures only (connect / timeout / mid-stream network errors and the
/// transient HTTP status set). 3 retries means at most 4 attempts in
/// total. See issue #117.
const MAX_FETCH_RETRIES: u32 = 3;

/// Base delay of the exponential backoff (`base * 2^attempt`, jittered).
const RETRY_BASE_DELAY: Duration = Duration::from_millis(500);

/// Hard ceiling on any single backoff or `Retry-After` sleep, so the
/// worst-case retry chain stays comfortably inside [`TOTAL_TIMEOUT`].
const RETRY_MAX_DELAY: Duration = Duration::from_secs(30);
67
/// Returns `true` for HTTP status codes that are worth retrying.
///
/// The retryable set is 408 (request timeout), 429 (rate-limited) and
/// the transient 5xx family 500 / 502 / 503 / 504. A plain 500 is
/// included because upstreams (Crossref/Unpaywall) intermittently 500
/// under load. Every other 4xx is a caller/permanent failure and is
/// never retried.
fn is_transient_status(code: u16) -> bool {
    const TRANSIENT_STATUSES: [u16; 6] = [408, 429, 500, 502, 503, 504];
    TRANSIENT_STATUSES.contains(&code)
}
75
76/// A `reqwest::Error` is transient iff it is a connect or timeout
77/// failure or a mid-body transfer error. Redirect-policy aborts
78/// (allowlist denial), builder errors, and decode errors are NOT
79/// transient — retrying them cannot help and would mask a real denial.
80fn reqwest_is_transient(e: &reqwest::Error) -> bool {
81    (e.is_timeout() || e.is_connect() || e.is_body()) && !e.is_redirect()
82}
83
84/// Parse a `Retry-After` header expressed as integer seconds (the
85/// HTTP-date form is accepted by the RFC but rare for these APIs and
86/// deliberately ignored for the MVP — we fall back to exponential
87/// backoff in that case). Capped at [`RETRY_MAX_DELAY`].
88fn parse_retry_after(headers: &reqwest::header::HeaderMap) -> Option<Duration> {
89    let secs: u64 = headers
90        .get(reqwest::header::RETRY_AFTER)?
91        .to_str()
92        .ok()?
93        .trim()
94        .parse()
95        .ok()?;
96    Some(Duration::from_secs(secs).min(RETRY_MAX_DELAY))
97}
98
99/// Exponential backoff with decorrelated jitter. `RETRY_BASE_DELAY *
100/// 2^attempt`, capped at [`RETRY_MAX_DELAY`], plus 0..base jitter so a
101/// fleet of clients does not thunder back in lockstep. Jitter is derived
102/// from the wall-clock subsec nanos rather than pulling in an RNG
103/// dependency — adequate decorrelation for backoff, not a security
104/// primitive.
105fn backoff_delay(attempt: u32) -> Duration {
106    let factor = 1u64 << attempt.min(20);
107    let base_ms = RETRY_BASE_DELAY.as_millis() as u64;
108    let capped_ms = base_ms
109        .saturating_mul(factor)
110        .min(RETRY_MAX_DELAY.as_millis() as u64);
111    let jitter_ms = std::time::SystemTime::now()
112        .duration_since(std::time::UNIX_EPOCH)
113        .map(|d| (d.subsec_nanos() as u64) % base_ms.max(1))
114        .unwrap_or(0);
115    Duration::from_millis(capped_ms.saturating_add(jitter_ms))
116}
117
118// ---------------------------------------------------------------------------
119// SourceAllowlist
120// ---------------------------------------------------------------------------
121
122/// Per-source allowlist entry. Matches the schema in
123/// `docs/REDIRECT_ALLOWLIST.md` §2.
124#[derive(Debug, Clone)]
125#[non_exhaustive]
126pub struct SourceAllowlist {
127    /// Source key. MUST match a `source` value in `docs/SOURCES.md` §1
128    /// (e.g. `crossref`, `unpaywall`, `arxiv`).
129    pub source: String,
130    /// Each pattern is either a literal FQDN or a `*.<suffix>` glob (matches
131    /// the suffix and any subdomain — see `docs/REDIRECT_ALLOWLIST.md` §2.2
132    /// matching rule).
133    pub redirect_hosts: Vec<String>,
134}
135
136impl SourceAllowlist {
137    /// Construct a new allowlist entry.
138    pub fn new(source: impl Into<String>, redirect_hosts: Vec<String>) -> Self {
139        Self {
140            source: source.into(),
141            redirect_hosts,
142        }
143    }
144
145    /// Returns `true` if `host` matches any pattern in this allowlist.
146    ///
147    /// Matching is byte-level on the lowercased ASCII form of the host.
148    /// Callers MUST lowercase upstream; this method also lowercases as a
149    /// defense-in-depth measure but treats the result as ASCII (Punycode
150    /// is the caller's responsibility per `docs/REDIRECT_ALLOWLIST.md`
151    /// §2.2 rule 4).
152    pub fn matches(&self, host: &str) -> bool {
153        let host_lc = host.to_ascii_lowercase();
154        self.redirect_hosts
155            .iter()
156            .any(|pat| host_matches_pattern(&host_lc, pat))
157    }
158}
159
/// Returns `true` if `host` matches `pattern` per
/// `docs/REDIRECT_ALLOWLIST.md` §2.2.
///
/// * A `*.<suffix>` pattern matches `<suffix>` itself and any name that
///   ends in `.<suffix>` (the label boundary is required, so
///   `evilexample.org` does not match `*.example.org`).
/// * Any other pattern is an exact FQDN and must equal `host`.
///
/// Comparison is ASCII-case-insensitive on both sides and is done on the
/// borrowed inputs, so no allocation happens per check. `host` is
/// expected to be lowercased already; for such input the result is the
/// same as a byte-identical comparison against the lowercased pattern.
///
/// A degenerate glob with an empty suffix (`"*."`) never matches. Without
/// this guard it would accept the empty host and every host written with
/// a trailing dot, i.e. a typo in the allowlist would fail open.
fn host_matches_pattern(host: &str, pattern: &str) -> bool {
    let suffix = match pattern.strip_prefix("*.") {
        // Exact-FQDN pattern.
        None => return host.eq_ignore_ascii_case(pattern),
        // Malformed glob: fail closed.
        Some("") => return false,
        Some(suffix) => suffix,
    };

    // The glob covers the bare suffix itself …
    if host.eq_ignore_ascii_case(suffix) {
        return true;
    }

    // … and any `<labels>.<suffix>`. Work on bytes so slicing can never
    // land inside a multi-byte character of a non-ASCII host.
    let host_bytes = host.as_bytes();
    let suffix_bytes = suffix.as_bytes();
    let Some(dot_at) = host_bytes.len().checked_sub(suffix_bytes.len() + 1) else {
        return false;
    };
    match host_bytes[dot_at..].split_first() {
        Some((&b'.', tail)) => tail.eq_ignore_ascii_case(suffix_bytes),
        _ => false,
    }
}
172
173/// Hard-coded Phase 1 allowlist for Tier 1 sources. Sourced from
174/// `docs/REDIRECT_ALLOWLIST.md` §3.
175///
176/// Marked `Phase 1; revisit during real fetches` in the spec — entries
177/// flagged `(unverified)` (e.g. arXiv subdomain redirect behavior) MUST be
178/// confirmed or removed before Phase 1 is closed; see §3.3 of the spec.
179pub fn tier_1_allowlist() -> Vec<SourceAllowlist> {
180    vec![
181        // §3.1 crossref
182        SourceAllowlist::new(
183            "crossref",
184            vec!["api.crossref.org".to_string(), "*.crossref.org".to_string()],
185        ),
186        // §3.2 unpaywall
187        SourceAllowlist::new("unpaywall", vec!["api.unpaywall.org".to_string()]),
188        // §3.3 arxiv
189        SourceAllowlist::new(
190            "arxiv",
191            vec![
192                "arxiv.org".to_string(),
193                "export.arxiv.org".to_string(),
194                "*.arxiv.org".to_string(),
195            ],
196        ),
197    ]
198}
199
200/// Hard-coded Phase 4 allowlist for Tier 2 metadata sources (OpenAlex,
201/// Semantic Scholar, DOAJ). Sourced from `docs/SOURCES.md` §1 (the Tier 2
202/// table) and `docs/REDIRECT_ALLOWLIST.md` §3 (same redirect-allowlist
203/// policy as Tier 1, distinct source keys).
204///
205/// Returned hosts:
206///
207/// - `"openalex"` → `api.openalex.org` (production OpenAlex REST API).
208/// - `"semantic_scholar"` → `api.semanticscholar.org` (S2 Graph API base).
209/// - `"doaj"` → `doaj.org` + `*.doaj.org` (DOAJ public API; wildcard
210///   covers `api.doaj.org` and any v4+ subdomain split).
211///
212/// Per `docs/SOURCES.md` §4 "OpenAlex / Semantic Scholar / DOAJ", these
213/// sources are **metadata-only**: their `Source::fetch` impls MUST
214/// return `pdf_bytes: None`. The redirect closure in [`HttpClient`]
215/// uses this list to deny redirects to off-list hosts under each Tier
216/// 2 source key — identical mechanism to Tier 1, but the per-tool
217/// capability gate (`profile.metadata.openalex` etc.) is layered on
218/// top so the network surface remains capability-aware.
219pub fn tier_2_allowlist() -> Vec<SourceAllowlist> {
220    vec![
221        SourceAllowlist::new("openalex", vec!["api.openalex.org".to_string()]),
222        SourceAllowlist::new(
223            "semantic_scholar",
224            vec!["api.semanticscholar.org".to_string()],
225        ),
226        SourceAllowlist::new(
227            "doaj",
228            vec!["doaj.org".to_string(), "*.doaj.org".to_string()],
229        ),
230    ]
231}
232
/// Built-in Phase 5a redirect allowlist for the Springer Nature OA TDM
/// source. Only compiled when the `tdm-springer` Cargo feature is on, so
/// default release binaries never contain the host pattern (ADR-0002 and
/// `docs/SOURCES.md` §3).
///
/// The single entry returned:
/// - `"tdm-springer"` → `api.springernature.com` (production base) +
///   `*.springernature.com` (covers load-balancing subdomains; the
///   redirect closure denies anything outside the wildcard).
///
/// Per `docs/SOURCES.md` §4 "TDM sources (Phase 5)", fetching under this
/// source key requires ALL THREE gates: the Cargo feature compiled in,
/// the `DOIGET_KEY_SPRINGER` env var present, and
/// `DOIGET_AGREE_TDM_SPRINGER=1`. The `CapabilityProfile` gate enforces
/// the env-var pair; this allowlist is the transport gate.
#[cfg(feature = "tdm-springer")]
pub fn tier_3_springer_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts: Vec<String> = ["api.springernature.com", "*.springernature.com"]
        .into_iter()
        .map(String::from)
        .collect();
    vec![SourceAllowlist::new("tdm-springer", redirect_hosts)]
}
258
/// Built-in Phase 5b redirect allowlist for the APS Harvest TDM source.
/// Only compiled when the `tdm-aps` Cargo feature is on, so default
/// release binaries never contain the host pattern (ADR-0002 and
/// `docs/SOURCES.md` §3).
///
/// The single entry returned:
/// - `"tdm-aps"` → `harvest.aps.org` (production base) + `*.aps.org`
///   (covers load-balancing subdomains; the redirect closure denies
///   anything outside the wildcard).
///
/// Activation needs three gates: the Cargo feature compiled in, the
/// `DOIGET_KEY_APS` env var present, and `DOIGET_AGREE_TDM_APS=1`. The
/// `CapabilityProfile` gate enforces the env-var pair; this allowlist is
/// the transport gate.
#[cfg(feature = "tdm-aps")]
pub fn tier_3_aps_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts: Vec<String> = ["harvest.aps.org", "*.aps.org"]
        .into_iter()
        .map(String::from)
        .collect();
    vec![SourceAllowlist::new("tdm-aps", redirect_hosts)]
}
280
/// Built-in Phase 5c redirect allowlist for the Elsevier ScienceDirect
/// TDM source. Only compiled when the `tdm-elsevier` Cargo feature is
/// on, so default release binaries never contain the host pattern
/// (ADR-0002 and `docs/SOURCES.md` §3).
///
/// The single entry returned:
/// - `"tdm-elsevier"` → `api.elsevier.com` (production base) +
///   `*.elsevier.com` (covers load-balancing subdomains; the redirect
///   closure denies anything outside the wildcard).
///
/// Activation needs three gates: the Cargo feature compiled in, the
/// `DOIGET_KEY_ELSEVIER` env var present, and
/// `DOIGET_AGREE_TDM_ELSEVIER=1`. The `CapabilityProfile` gate enforces
/// the env-var pair; this allowlist is the transport gate.
#[cfg(feature = "tdm-elsevier")]
pub fn tier_3_elsevier_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts: Vec<String> = ["api.elsevier.com", "*.elsevier.com"]
        .into_iter()
        .map(String::from)
        .collect();
    vec![SourceAllowlist::new("tdm-elsevier", redirect_hosts)]
}
302
303/// Hard-coded Phase 1 allowlist for the synthetic `"oa-publisher"` source —
304/// the publisher / preprint / repository hosts to which Unpaywall's
305/// `best_oa_location.url` (or `url_for_pdf`) typically resolves.
306///
307/// **Status: informed-best-effort.** Per `docs/REDIRECT_ALLOWLIST.md` §3,
308/// every entry below is a documented OA-publisher host pulled from the
309/// public DOI / OA discovery surface as of this function's authoring; they
310/// are **not** a substitute for empirical validation. Entries marked
311/// `(unverified)` MUST be confirmed by a real fetch or removed before
312/// Phase 1 is closed.
313///
314/// The orchestrator (`doiget-cli::commands::fetch::fetch_doi`) calls
315/// [`HttpClient::fetch_pdf`] under the `"oa-publisher"` source key when
316/// Unpaywall returns an OA URL. If the OA host is not in this list, the
317/// PDF leg is denied (`HttpError::RedirectDenied`) and the orchestrator
318/// falls back to metadata-only success (the `informed-best-effort`
319/// posture from the spec section above).
320pub fn oa_publisher_allowlist() -> Vec<SourceAllowlist> {
321    vec![SourceAllowlist::new(
322        "oa-publisher",
323        vec![
324            // Springer Nature OA imprints. Springer / SpringerOpen / Nature
325            // OA URLs all resolve under one of these registrable suffixes.
326            // (unverified) — confirm by replaying real Unpaywall responses.
327            "*.springer.com".to_string(),
328            "*.springeropen.com".to_string(),
329            "*.springernature.com".to_string(),
330            "*.nature.com".to_string(),
331            // Wiley OA. (unverified)
332            "*.wiley.com".to_string(),
333            // Elsevier OA route only — the TDM gated path is a separate
334            // source (`tdm-elsevier`, Phase 5c) and is not covered here.
335            // (unverified)
336            "*.elsevier.com".to_string(),
337            "*.sciencedirect.com".to_string(),
338            // Frontiers. (unverified)
339            "*.frontiersin.org".to_string(),
340            // MDPI. (unverified)
341            "*.mdpi.com".to_string(),
342            // PLOS. (unverified)
343            "*.plos.org".to_string(),
344            // Preprint servers — biorxiv / medrxiv. (unverified)
345            "*.biorxiv.org".to_string(),
346            "*.medrxiv.org".to_string(),
347            // Europe PMC + NIH PMC. (unverified)
348            "europepmc.org".to_string(),
349            "*.europepmc.org".to_string(),
350            "*.nih.gov".to_string(),
351            "*.ncbi.nlm.nih.gov".to_string(),
352            // arXiv — already on the `arxiv` tier-1 allowlist, but the
353            // Unpaywall-driven path uses the `oa-publisher` source key,
354            // so we mirror the host list here too. See REDIRECT_ALLOWLIST.md
355            // §3.3 for the underlying entries.
356            "arxiv.org".to_string(),
357            "*.arxiv.org".to_string(),
358        ],
359    )]
360}
361
362// ---------------------------------------------------------------------------
363// HttpError
364// ---------------------------------------------------------------------------
365
/// Errors that can arise during HTTP fetches.
///
/// Four variants — [`HttpError::RedirectDenied`],
/// [`HttpError::InsecureRedirect`], [`HttpError::OversizedBody`] and
/// [`HttpError::NotAPdf`] — are "denials" in the ADR-0023 sense and map
/// to a structured `DenialContext`; the remaining variants are
/// transport, upstream or caller-error signals.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum HttpError {
    /// Transport / DNS / TLS failure or other `reqwest`-level error. Note
    /// that `reqwest` surfaces a redirect-policy abort (via `Attempt::error`)
    /// as a `reqwest::Error` carrying the source error — callers seeing
    /// `Network` for what they believed was a redirect violation should
    /// inspect the inner error chain.
    #[error("network error: {0}")]
    Network(#[from] reqwest::Error),
    /// Redirect target host did not match any pattern in the source's
    /// `redirect_hosts`. See `docs/REDIRECT_ALLOWLIST.md` §2.2.
    ///
    /// Field naming: `source_key` rather than `source` because `thiserror`
    /// auto-treats a field literally named `source` as a `#[source]` error
    /// chain link (which would require the field to implement `std::error::Error`).
    ///
    /// `expected_hosts` carries a snapshot of the source's allowlist
    /// patterns at the time of the denial — populated for the structured
    /// `denial_context.expected` channel introduced by ADR-0023 §4
    /// (NORMATIVE mapping table). Cloning the patterns into the error
    /// keeps the `From<&HttpError> for Option<DenialContext>` impl from
    /// having to re-look-up the allowlist by `source_key`. May be empty
    /// when the rejection happened before any allowlist was matched
    /// (e.g. URL had no host component at all).
    #[error("redirect target {host} not in allowlist for source {source_key}")]
    RedirectDenied {
        /// Source key whose allowlist rejected the redirect.
        source_key: String,
        /// The lowercased host that was rejected.
        host: String,
        /// Snapshot of the source's `redirect_hosts` at denial time.
        /// Surfaces as `denial_context.expected` (ADR-0023 §4).
        expected_hosts: Vec<String>,
    },
    /// Redirect target had a scheme other than `https`. See
    /// `docs/SECURITY.md` §1.3.
    #[error("redirect to non-HTTPS scheme: {scheme}")]
    InsecureRedirect {
        /// The disallowed scheme (e.g. `http`, `file`, `data`).
        scheme: String,
    },
    /// Body would exceed [`PDF_MAX_BYTES`] either by a `Content-Length`
    /// hint or by accumulated streamed bytes. See `docs/SECURITY.md` §1.2.
    #[error("body too large: {actual} bytes (cap = {cap})")]
    OversizedBody {
        /// Observed size (header value or accumulated bytes).
        actual: u64,
        /// Hard upper bound (always [`PDF_MAX_BYTES`]).
        cap: u64,
    },
    /// PDF magic-byte mismatch — the body does not start with `%PDF-`.
    /// We deliberately do NOT use `Content-Type` (publishers misbehave —
    /// the magic byte is the trustworthy signal per `docs/SECURITY.md`
    /// §1.2 "Magic-byte mismatch" row).
    #[error("PDF magic-byte mismatch: got {got:?}")]
    NotAPdf {
        /// First five bytes of the response body (zero-padded if shorter).
        got: [u8; 5],
    },
    /// Server returned a non-2xx status.
    #[error("HTTP {status} from {url}")]
    HttpStatus {
        /// HTTP status code.
        status: u16,
        /// The URL that produced the status.
        url: String,
    },
    /// No allowlist entry exists for this source. The caller asked
    /// [`HttpClient`] to fetch on behalf of a source that wasn't passed to
    /// [`HttpClient::new`].
    ///
    /// Named `source_key` rather than `source` for the same `thiserror`
    /// reason as on [`HttpError::RedirectDenied`]: a field called `source`
    /// would be treated as an error-chain link.
    #[error("no allowlist registered for source {source_key}")]
    UnknownSource {
        /// The unregistered source key.
        source_key: String,
    },
    /// A header name or value passed to
    /// [`HttpClient::fetch_bytes_with_headers`] was not a valid HTTP
    /// header. The header parser only accepts the visible-ASCII subset
    /// per RFC 7230 §3.2; control characters and non-ASCII bytes are
    /// rejected before the request is even built. Surfaces as
    /// `ErrorCode::InternalError` at the public boundary (callers
    /// supplying bad headers are responsible for fixing the call site;
    /// not a denial in the ADR-0023 sense).
    #[error("invalid HTTP header `{name}`: {reason}")]
    InvalidHeader {
        /// The header name as supplied by the caller.
        name: String,
        /// Which side failed parsing: the string `"name"` or `"value"`.
        reason: String,
    },
}
461
462// ---------------------------------------------------------------------------
463// HttpError -> Option<DenialContext>  (ADR-0023 §4 mapping table)
464// ---------------------------------------------------------------------------
465
466/// Map an [`HttpError`] reference to the structured [`crate::DenialContext`]
467/// channel introduced by ADR-0023.
468///
469/// Returns `Some(_)` for the four denial classes named in ADR-0023 §4
470/// (`RedirectDenied`, `OversizedBody`, `NotAPdf`, `InsecureRedirect`) and
471/// `None` for every other variant — `Network`, `HttpStatus`,
472/// `UnknownSource` are not denials in the ADR-0023 sense (they are
473/// transport / upstream / programming-error signals, not allowlist or
474/// cap rejections).
475///
476/// The `&HttpError` borrow form is used (rather than `HttpError`) so the
477/// caller — typically the orchestrator that already needs the original
478/// error for `error.message` and the `From<HttpError> for ErrorCode`
479/// collapse — does not have to clone the error to produce the optional
480/// structured side-channel.
481impl From<&HttpError> for Option<crate::DenialContext> {
482    fn from(e: &HttpError) -> Self {
483        use crate::{DenialContext, DenialReason};
484        match e {
485            HttpError::RedirectDenied {
486                source_key,
487                host,
488                expected_hosts,
489            } => Some(DenialContext {
490                reason: DenialReason::RedirectNotInAllowlist,
491                source: Some(source_key.clone()),
492                attempted: Some(host.clone()),
493                expected: Some(expected_hosts.clone()),
494                hop_index: None,
495                cap: None,
496                actual: None,
497            }),
498            HttpError::OversizedBody { actual, cap } => Some(DenialContext {
499                reason: DenialReason::SizeCapExceeded,
500                source: None,
501                attempted: None,
502                // The size-cap reason has no allowlist channel; use
503                // `None` to signal "field not populated by producer"
504                // rather than `Some(vec![])` (which would mean "explicit
505                // empty allowlist"). See `DenialContext::expected` docs.
506                expected: None,
507                hop_index: None,
508                cap: Some(*cap),
509                actual: Some(*actual),
510            }),
511            HttpError::NotAPdf { got } => Some(DenialContext {
512                reason: DenialReason::ContentTypeMismatch,
513                source: None,
514                // ADR-0023 §4 mapping table: hex-encode the first 5 bytes
515                // for the `attempted` field. `format!("{:02x}...")` is
516                // chosen over `hex::encode` to avoid pulling the
517                // additional dep into this conversion path; the result is
518                // bit-identical (lowercase, zero-padded).
519                attempted: Some(format!(
520                    "{:02x}{:02x}{:02x}{:02x}{:02x}",
521                    got[0], got[1], got[2], got[3], got[4]
522                )),
523                expected: Some(vec!["%PDF-".to_string()]),
524                hop_index: None,
525                cap: None,
526                actual: None,
527            }),
528            HttpError::InsecureRedirect { scheme } => Some(DenialContext {
529                reason: DenialReason::InsecureScheme,
530                source: None,
531                attempted: Some(format!("{}:...", scheme)),
532                expected: Some(vec!["https".to_string()]),
533                hop_index: None,
534                cap: None,
535                actual: None,
536            }),
537            // `reqwest` wraps a custom error returned by the redirect
538            // policy closure (`attempt.error(HttpError::RedirectDenied{..})`
539            // / `attempt.error(HttpError::InsecureRedirect{..})`) inside a
540            // `reqwest::Error`, which surfaces here as `HttpError::Network`.
541            // Without source-chain walking, production redirect denials —
542            // the most operationally important denial class — would never
543            // produce a `DenialContext`, defeating the whole point of
544            // ADR-0023.
545            //
546            // Walk the `std::error::Error::source()` chain on the inner
547            // `reqwest::Error` and downcast each link to `&HttpError`. If
548            // a wrapped `HttpError` is found, recurse via this same `From`
549            // impl. Otherwise the network error is a "real" transport /
550            // DNS / TLS failure with no denial semantics — return `None`.
551            //
552            // `std::error::Error::source(e)` is fully-qualified to
553            // disambiguate against the inherent (and unrelated)
554            // `reqwest::Error::source()`.
555            HttpError::Network(e) => {
556                let mut source: Option<&(dyn std::error::Error + 'static)> =
557                    std::error::Error::source(e);
558                while let Some(s) = source {
559                    if let Some(http_err) = s.downcast_ref::<HttpError>() {
560                        return Option::<crate::DenialContext>::from(http_err);
561                    }
562                    source = s.source();
563                }
564                None
565            }
566            // The remaining variants are not "denials" in the ADR-0023
567            // sense — HttpStatus/UnknownSource are upstream / programming-
568            // error signals; InvalidHeader is a caller-bug signal.
569            HttpError::HttpStatus { .. }
570            | HttpError::UnknownSource { .. }
571            | HttpError::InvalidHeader { .. } => None,
572        }
573    }
574}
575
576// ---------------------------------------------------------------------------
577// HttpClient
578// ---------------------------------------------------------------------------
579
/// Workspace-wide HTTP client with the security defaults applied.
///
/// Internally holds one `reqwest::Client` per source. Construct via
/// [`HttpClient::new`] with the full set of allowlists the calling process
/// will need. Both fields sit behind an `Arc`, so cloning an
/// `HttpClient` shares the maps rather than copying them.
#[derive(Clone, Debug)]
pub struct HttpClient {
    /// One [`reqwest::Client`] per source, keyed by source key. Each
    /// client carries a redirect policy that captures only that source's
    /// allowlist. `Arc` so cloning is cheap.
    clients: Arc<HashMap<String, Client>>,
    /// The exact [`SourceAllowlist`] each per-source client was built from,
    /// keyed by source. The redirect closure inside each `reqwest::Client`
    /// captures its allowlist *by move*, so it cannot be read back from the
    /// client itself. This map keeps the identical `SourceAllowlist`
    /// available to callers that must perform a *pre-fetch* host check on a
    /// metadata-discovered URL (issue #145 / `docs/REDIRECT_ALLOWLIST.md`
    /// §1: the allowlist is consulted "on the OA URL discovered through
    /// metadata sources before the actual PDF fetch is issued", not only on
    /// redirect hops). Storing the same value here — rather than re-deriving
    /// it from [`oa_publisher_allowlist`] at the call site — guarantees the
    /// pre-check and the redirect closure can never drift, and that the
    /// check works under the test constructors too (which register a
    /// wiremock host as the allowlist).
    allowlists: Arc<HashMap<String, SourceAllowlist>>,
}
606
607impl HttpClient {
608    /// Build a client with rustls + redirect-allowlist + size cap +
609    /// timeouts.
610    ///
611    /// `allowlists` MUST cover every source whose URL might be passed in;
612    /// fetches against unregistered sources return
613    /// [`HttpError::UnknownSource`].
614    ///
615    /// # Errors
616    ///
617    /// Returns the underlying `reqwest::Error` if `ClientBuilder::build`
618    /// fails (typically a TLS-backend init failure).
619    pub fn new(allowlists: Vec<SourceAllowlist>) -> Result<Self, reqwest::Error> {
620        let mut clients = HashMap::with_capacity(allowlists.len());
621        let mut allowlist_map = HashMap::with_capacity(allowlists.len());
622        for entry in allowlists {
623            let source = entry.source.clone();
624            // Keep the *same* allowlist value both inside the redirect
625            // closure (via `build_client`) and queryable on the client
626            // (issue #145 pre-fetch check). `build_client` takes the
627            // allowlist by value, so clone once for the side table first.
628            allowlist_map.insert(source.clone(), entry.clone());
629            let client = build_client(entry)?;
630            clients.insert(source, client);
631        }
632        Ok(Self {
633            clients: Arc::new(clients),
634            allowlists: Arc::new(allowlist_map),
635        })
636    }
637
638    /// The [`SourceAllowlist`] this client was built with for `source`, or
639    /// `None` if `source` was not registered.
640    ///
641    /// This is the *identical* value captured by the per-source redirect
642    /// closure (see [`HttpClient`]'s `allowlists` field doc). It exists so
643    /// the orchestrator can apply the `docs/REDIRECT_ALLOWLIST.md` §1
644    /// pre-fetch host check on a metadata-discovered OA URL — the URL that
645    /// is fetched *without* necessarily passing through a redirect hop —
646    /// using the same source of truth the redirect closure uses, so the two
647    /// can never disagree. Callers MUST use this for the `"oa-publisher"`
648    /// leg only; the initial template-constructed URL is exempt per
649    /// `docs/REDIRECT_ALLOWLIST.md` §6.
650    pub fn source_allowlist(&self, source: &str) -> Option<&SourceAllowlist> {
651        self.allowlists.get(source)
652    }
653
654    /// Fetch a URL, treating it as a JSON or text body. Caps at
655    /// [`PDF_MAX_BYTES`].
656    ///
657    /// Returns the response body bytes plus the effective final URL after
658    /// redirects (post-allowlist verification — every hop has already been
659    /// validated by the time this returns).
660    ///
661    /// # Errors
662    ///
663    /// Any [`HttpError`] variant.
664    pub async fn fetch_bytes(&self, source: &str, url: Url) -> Result<(Bytes, Url), HttpError> {
665        self.fetch_inner(source, url, &[], false).await
666    }
667
668    /// Like [`Self::fetch_bytes`] but attaches additional request
669    /// headers to the outgoing GET. The headers are validated up-front
670    /// against the visible-ASCII subset (RFC 7230 §3.2); any failure
671    /// returns [`HttpError::InvalidHeader`] before the request is sent.
672    ///
673    /// Used by Tier-3 TDM sources that authenticate via a header
674    /// (APS Harvest `X-API-Key`, Elsevier ScienceDirect `X-ELS-APIKey`).
675    /// Header values appear on the wire only — they are never logged.
676    ///
677    /// # Errors
678    ///
679    /// Any [`HttpError`] variant including [`HttpError::InvalidHeader`].
680    pub async fn fetch_bytes_with_headers(
681        &self,
682        source: &str,
683        url: Url,
684        headers: &[(&str, &str)],
685    ) -> Result<(Bytes, Url), HttpError> {
686        self.fetch_inner(source, url, headers, false).await
687    }
688
689    /// Fetch a URL expected to be a PDF. Same as [`Self::fetch_bytes`] plus
690    /// the magic-byte check on the first 5 bytes
691    /// (`%PDF-` = `[0x25, 0x50, 0x44, 0x46, 0x2D]`). Mismatch returns
692    /// [`HttpError::NotAPdf`].
693    ///
694    /// # Errors
695    ///
696    /// Any [`HttpError`] variant including [`HttpError::NotAPdf`].
697    pub async fn fetch_pdf(&self, source: &str, url: Url) -> Result<(Bytes, Url), HttpError> {
698        self.fetch_inner(source, url, &[], true).await
699    }
700
701    async fn fetch_inner(
702        &self,
703        source: &str,
704        url: Url,
705        headers: &[(&str, &str)],
706        check_pdf_magic: bool,
707    ) -> Result<(Bytes, Url), HttpError> {
708        let client = self
709            .clients
710            .get(source)
711            .ok_or_else(|| HttpError::UnknownSource {
712                source_key: source.to_string(),
713            })?;
714
715        // Parse headers up-front so an invalid name/value fails BEFORE
716        // we touch the network. `HeaderName::from_bytes` / `HeaderValue::from_str`
717        // accept the visible-ASCII subset only (RFC 7230 §3.2).
718        let mut header_map = reqwest::header::HeaderMap::with_capacity(headers.len());
719        for (name, value) in headers {
720            let hn = reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|_| {
721                HttpError::InvalidHeader {
722                    name: (*name).to_string(),
723                    reason: "name".to_string(),
724                }
725            })?;
726            let hv = reqwest::header::HeaderValue::from_str(value).map_err(|_| {
727                HttpError::InvalidHeader {
728                    name: (*name).to_string(),
729                    reason: "value".to_string(),
730                }
731            })?;
732            header_map.insert(hn, hv);
733        }
734
735        // Bounded retry loop (issue #117). Only transient classes are
736        // retried — connect/timeout/mid-stream network errors and the
737        // transient HTTP status set. Allowlist denials, NotAPdf,
738        // OversizedBody, 4xx (non-408/429) are deterministic and return
739        // on the first occurrence. GET is idempotent so a retried
740        // attempt re-streams the body from scratch.
741        let mut attempt: u32 = 0;
742        loop {
743            let send_result = client
744                .get(url.clone())
745                .headers(header_map.clone())
746                .send()
747                .await;
748            let response = match send_result {
749                Ok(r) => r,
750                Err(e) => {
751                    if attempt < MAX_FETCH_RETRIES && reqwest_is_transient(&e) {
752                        let d = backoff_delay(attempt);
753                        tracing::warn!(
754                            source,
755                            attempt,
756                            delay_ms = d.as_millis() as u64,
757                            error = %e,
758                            "transient send failure; retrying"
759                        );
760                        tokio::time::sleep(d).await;
761                        attempt += 1;
762                        continue;
763                    }
764                    return Err(HttpError::Network(e));
765                }
766            };
767            let final_url = response.url().clone();
768
769            // Status check before body read so we can fail fast.
770            let status = response.status();
771            if !status.is_success() {
772                let code = status.as_u16();
773                if attempt < MAX_FETCH_RETRIES && is_transient_status(code) {
774                    // Prefer the server's `Retry-After` over our backoff
775                    // when present (429/503 commonly carry it).
776                    let d = parse_retry_after(response.headers())
777                        .unwrap_or_else(|| backoff_delay(attempt));
778                    tracing::warn!(
779                        source,
780                        attempt,
781                        status = code,
782                        delay_ms = d.as_millis() as u64,
783                        "transient HTTP status; retrying"
784                    );
785                    tokio::time::sleep(d).await;
786                    attempt += 1;
787                    continue;
788                }
789                return Err(HttpError::HttpStatus {
790                    status: code,
791                    // Issue #146: Springer Nature authenticates via an
792                    // `api_key` URL query parameter (no header path
793                    // upstream). This error string is logged and may
794                    // surface to the user, so strip any `api_key`
795                    // value before it leaves the client. No other
796                    // source puts a secret in the query string, so
797                    // this is a no-op for them.
798                    url: redact_api_key_query(&final_url),
799                });
800            }
801
802            // Content-Length fast-path: if header is present and exceeds
803            // the cap, fail without reading any body (deterministic — not
804            // retried). Per `docs/SECURITY.md` §1.2.
805            if let Some(len) = response.content_length() {
806                if len > PDF_MAX_BYTES {
807                    return Err(HttpError::OversizedBody {
808                        actual: len,
809                        cap: PDF_MAX_BYTES,
810                    });
811                }
812            }
813
814            // Stream body and enforce the cap as bytes accumulate. A
815            // mid-stream transport error is transient (retry); an
816            // oversized body is deterministic (return).
817            let mut buf = BytesMut::new();
818            let mut stream = response.bytes_stream();
819            let mut oversized_at: Option<u64> = None;
820            let mut stream_err: Option<reqwest::Error> = None;
821            while let Some(chunk) = stream.next().await {
822                let chunk = match chunk {
823                    Ok(c) => c,
824                    Err(e) => {
825                        stream_err = Some(e);
826                        break;
827                    }
828                };
829                let projected = (buf.len() as u64).saturating_add(chunk.len() as u64);
830                if projected > PDF_MAX_BYTES {
831                    oversized_at = Some(projected);
832                    break;
833                }
834                buf.extend_from_slice(&chunk);
835            }
836            if let Some(actual) = oversized_at {
837                return Err(HttpError::OversizedBody {
838                    actual,
839                    cap: PDF_MAX_BYTES,
840                });
841            }
842            if let Some(e) = stream_err {
843                if attempt < MAX_FETCH_RETRIES && reqwest_is_transient(&e) {
844                    let d = backoff_delay(attempt);
845                    tracing::warn!(
846                        source,
847                        attempt,
848                        delay_ms = d.as_millis() as u64,
849                        error = %e,
850                        "transient mid-stream failure; retrying"
851                    );
852                    tokio::time::sleep(d).await;
853                    attempt += 1;
854                    continue;
855                }
856                return Err(HttpError::Network(e));
857            }
858            let body = buf.freeze();
859
860            if check_pdf_magic {
861                let mut got = [0u8; 5];
862                let n = body.len().min(5);
863                got[..n].copy_from_slice(&body[..n]);
864                if got != PDF_MAGIC {
865                    return Err(HttpError::NotAPdf { got });
866                }
867            }
868
869            return Ok((body, final_url));
870        }
871    }
872}
873
874/// Return `url` rendered as a string with the value of any `api_key`
875/// query parameter replaced by `REDACTED` (issue #146).
876///
877/// Springer Nature's TDM API authenticates **only** via an `api_key`
878/// query parameter — there is no header-auth path upstream — so the key
879/// is unavoidably in the request URL. This keeps it out of *our* log
880/// and error sinks (the `HttpError::HttpStatus` string in particular,
881/// which is `tracing`-logged and can surface to the user). It is a
882/// structural no-op for every other source, none of which carry a
883/// secret in the query string. Other pairs and their order are
884/// preserved; a URL with no `api_key` pair is rendered unchanged.
885fn redact_api_key_query(url: &url::Url) -> String {
886    const API_KEY_PARAM: &str = "api_key";
887    if url.query_pairs().all(|(k, _)| k != API_KEY_PARAM) {
888        return url.to_string();
889    }
890    let mut redacted = url.clone();
891    let pairs: Vec<(String, String)> = url
892        .query_pairs()
893        .map(|(k, v)| {
894            if k == API_KEY_PARAM {
895                (k.into_owned(), "REDACTED".to_string())
896            } else {
897                (k.into_owned(), v.into_owned())
898            }
899        })
900        .collect();
901    redacted.query_pairs_mut().clear().extend_pairs(pairs);
902    redacted.to_string()
903}
904
905/// Test-oriented [`HttpClient`] constructor. Originally `cfg(test)`; now
906/// also reachable from the `doiget-cli` orchestrator's integration tests
907/// (which live outside this crate and therefore cannot see `cfg(test)`-gated
908/// items). The constructor name retains its `for_tests_allow_http` signal —
909/// production code MUST use [`HttpClient::new`] with [`tier_1_allowlist`].
910#[allow(clippy::expect_used)]
911impl HttpClient {
912    /// Build a test-oriented `HttpClient` against an `http://` wiremock
913    /// origin. The redirect closure still rejects insecure schemes — we only
914    /// relax `https_only` at the connection level so wiremock can serve.
915    /// This is acceptable because the redirect closure (which is the
916    /// security-load-bearing path) is exercised by the
917    /// `redirect_to_http_is_rejected_by_closure` test below.
918    ///
919    /// Production callers MUST use [`HttpClient::new`] with
920    /// [`tier_1_allowlist`] — the `for_tests_allow_http` suffix is the load-
921    /// bearing signal that this constructor lifts the initial-leg HTTPS-only
922    /// requirement.
923    pub fn new_for_tests_allow_http(source: &str, allowlist_host: &str) -> Self {
924        let allowlist = SourceAllowlist::new(source, vec![allowlist_host.to_string()]);
925        let client = build_client_allow_http(allowlist.clone()).expect("test client builds");
926        let mut map = HashMap::new();
927        let mut allowlist_map = HashMap::new();
928        allowlist_map.insert(allowlist.source.clone(), allowlist.clone());
929        map.insert(allowlist.source.clone(), client);
930        Self {
931            clients: Arc::new(map),
932            allowlists: Arc::new(allowlist_map),
933        }
934    }
935
936    /// Multi-source variant of [`HttpClient::new_for_tests_allow_http`].
937    ///
938    /// Builds a relaxed-`https_only` client per `(source, allowlist_host)`
939    /// pair. Used by the `doiget-cli` orchestrator's integration tests when
940    /// more than one upstream needs to be wiremocked simultaneously
941    /// (e.g. Crossref + Unpaywall against two different mock servers).
942    /// Production callers MUST use [`HttpClient::new`] with
943    /// [`tier_1_allowlist`].
944    pub fn new_for_tests_allow_http_multi(entries: &[(&str, &str)]) -> Self {
945        let mut map = HashMap::with_capacity(entries.len());
946        let mut allowlist_map = HashMap::with_capacity(entries.len());
947        for (source, host) in entries {
948            let allowlist = SourceAllowlist::new(*source, vec![host.to_string()]);
949            let client = build_client_allow_http(allowlist.clone()).expect("test client builds");
950            allowlist_map.insert(allowlist.source.clone(), allowlist.clone());
951            map.insert(allowlist.source.clone(), client);
952        }
953        Self {
954            clients: Arc::new(map),
955            allowlists: Arc::new(allowlist_map),
956        }
957    }
958}
959
960fn build_client_allow_http(allowlist: SourceAllowlist) -> Result<Client, reqwest::Error> {
961    let allowlist_for_closure = allowlist.clone();
962    let redirect_policy = Policy::custom(move |attempt| {
963        let scheme = attempt.url().scheme().to_string();
964        let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
965        let prev_count = attempt.previous().len();
966        if scheme != "https" {
967            return attempt.error(HttpError::InsecureRedirect { scheme });
968        }
969        if prev_count >= MAX_REDIRECTS {
970            return attempt.stop();
971        }
972        let host = match host_opt {
973            Some(h) => h,
974            None => {
975                return attempt.error(HttpError::RedirectDenied {
976                    source_key: allowlist_for_closure.source.clone(),
977                    host: String::new(),
978                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
979                });
980            }
981        };
982        if !allowlist_for_closure.matches(&host) {
983            return attempt.error(HttpError::RedirectDenied {
984                source_key: allowlist_for_closure.source.clone(),
985                host,
986                expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
987            });
988        }
989        attempt.follow()
990    });
991    ClientBuilder::new()
992        // `https_only(false)` only at this scope — production builders
993        // (the public `HttpClient::new`) keep it on.
994        .https_only(false)
995        .redirect(redirect_policy)
996        .connect_timeout(CONNECT_TIMEOUT)
997        .timeout(TOTAL_TIMEOUT)
998        .read_timeout(READ_TIMEOUT)
999        .user_agent(format!(
1000            "doiget/{} (+https://github.com/sotashimozono/doiget)",
1001            VERSION
1002        ))
1003        .tls_backend_rustls()
1004        .build()
1005}
1006
1007// ---------------------------------------------------------------------------
1008// ClientBuilder helpers
1009// ---------------------------------------------------------------------------
1010
1011fn build_client(allowlist: SourceAllowlist) -> Result<Client, reqwest::Error> {
1012    let user_agent = format!(
1013        "doiget/{} (+https://github.com/sotashimozono/doiget)",
1014        VERSION
1015    );
1016
1017    // Redirect policy: capture the per-source allowlist by value. The
1018    // closure is called for every redirect hop — there is no global
1019    // fallback, every hop is checked. Hard cap at MAX_REDIRECTS via the
1020    // attempt counter (mirrors reqwest's built-in limit).
1021    let allowlist_for_closure = allowlist.clone();
1022    let redirect_policy = Policy::custom(move |attempt| {
1023        // Inspect the candidate URL via owned copies so we can move
1024        // `attempt` into `error()` / `follow()` / `stop()` later without
1025        // the borrow checker complaining about an outstanding borrow of
1026        // `attempt`.
1027        let scheme = attempt.url().scheme().to_string();
1028        let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1029        let prev_count = attempt.previous().len();
1030
1031        // 1. Reject non-HTTPS up front. The `https_only(true)` builder
1032        //    flag below also catches this, but we want the dedicated
1033        //    `InsecureRedirect` error path (not a generic `https_only`
1034        //    abort) — see `docs/SECURITY.md` §1.3.
1035        if scheme != "https" {
1036            return attempt.error(HttpError::InsecureRedirect { scheme });
1037        }
1038
1039        // 2. Hop limit (`docs/SECURITY.md` §1.3 redirect_limit row).
1040        if prev_count >= MAX_REDIRECTS {
1041            return attempt.stop();
1042        }
1043
1044        // 3. Allowlist check on the candidate target host.
1045        //    `host_str()` is `None` for URLs without a host (e.g. data
1046        //    URIs); treat that as an allowlist miss.
1047        let host = match host_opt {
1048            Some(h) => h,
1049            None => {
1050                return attempt.error(HttpError::RedirectDenied {
1051                    source_key: allowlist_for_closure.source.clone(),
1052                    host: String::new(),
1053                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1054                });
1055            }
1056        };
1057        if !allowlist_for_closure.matches(&host) {
1058            return attempt.error(HttpError::RedirectDenied {
1059                source_key: allowlist_for_closure.source.clone(),
1060                host,
1061                expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1062            });
1063        }
1064
1065        attempt.follow()
1066    });
1067
1068    ClientBuilder::new()
1069        .https_only(true)
1070        .redirect(redirect_policy)
1071        .connect_timeout(CONNECT_TIMEOUT)
1072        .timeout(TOTAL_TIMEOUT)
1073        .read_timeout(READ_TIMEOUT)
1074        .user_agent(user_agent)
1075        // `tls_backend_rustls()` is the non-deprecated equivalent of the
1076        // older `use_rustls_tls()`. The workspace `reqwest` features
1077        // already pin `rustls`, so this is a re-assertion at builder
1078        // level rather than a feature switch.
1079        .tls_backend_rustls()
1080        .build()
1081}
1082
1083// ---------------------------------------------------------------------------
1084// Tests
1085// ---------------------------------------------------------------------------
1086
1087#[cfg(test)]
1088#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1089mod tests {
1090    use super::*;
1091    use wiremock::matchers::{method, path};
1092    use wiremock::{Mock, MockServer, ResponseTemplate};
1093
1094    // ---------------------------------------------------------------
1095    // Allowlist matching — pure unit tests, no network.
1096    // ---------------------------------------------------------------
1097
1098    #[test]
1099    fn tier_1_allowlist_includes_crossref() {
1100        let lists = tier_1_allowlist();
1101        let crossref = lists
1102            .iter()
1103            .find(|a| a.source == "crossref")
1104            .expect("crossref entry");
1105        assert!(
1106            crossref
1107                .redirect_hosts
1108                .iter()
1109                .any(|h| h.contains("crossref.org")),
1110            "crossref allowlist must contain a crossref.org pattern; got {:?}",
1111            crossref.redirect_hosts,
1112        );
1113    }
1114
1115    #[test]
1116    fn tier_1_allowlist_includes_unpaywall_and_arxiv() {
1117        let lists = tier_1_allowlist();
1118        assert!(lists.iter().any(|a| a.source == "unpaywall"));
1119        assert!(lists.iter().any(|a| a.source == "arxiv"));
1120    }
1121
1122    #[test]
1123    fn oa_publisher_allowlist_groups_under_one_synthetic_source() {
1124        // The OA-publisher fan-out from Unpaywall's `best_oa_location.url`
1125        // is keyed under a single synthetic `"oa-publisher"` source so the
1126        // orchestrator can pass that one source key to
1127        // `HttpClient::fetch_pdf`. See `docs/REDIRECT_ALLOWLIST.md` §3 (the
1128        // informed-best-effort note) and the function-level docs in
1129        // [`oa_publisher_allowlist`].
1130        let lists = oa_publisher_allowlist();
1131        assert_eq!(lists.len(), 1, "exactly one synthetic source entry");
1132        assert_eq!(lists[0].source, "oa-publisher");
1133    }
1134
1135    #[test]
1136    fn oa_publisher_allowlist_matches_known_oa_hosts() {
1137        let lists = oa_publisher_allowlist();
1138        let oa = lists
1139            .iter()
1140            .find(|a| a.source == "oa-publisher")
1141            .expect("oa-publisher entry");
1142        // Spot-check a representative entry per host family.
1143        assert!(oa.matches("link.springer.com"));
1144        assert!(oa.matches("nature.com"));
1145        assert!(oa.matches("onlinelibrary.wiley.com"));
1146        assert!(oa.matches("www.frontiersin.org"));
1147        assert!(oa.matches("www.mdpi.com"));
1148        assert!(oa.matches("journals.plos.org"));
1149        assert!(oa.matches("www.biorxiv.org"));
1150        assert!(oa.matches("europepmc.org"));
1151        assert!(oa.matches("www.ncbi.nlm.nih.gov"));
1152        assert!(oa.matches("arxiv.org"));
1153        // Negative: an attacker host is not covered.
1154        assert!(!oa.matches("attacker.test"));
1155        // Negative: dot-boundary safety — `*.springer.com` must not match
1156        // `notspringer.com`.
1157        assert!(!oa.matches("notspringer.com"));
1158    }
1159
1160    #[test]
1161    fn allowlist_matches_exact_fqdn() {
1162        let a = SourceAllowlist::new("crossref", vec!["api.crossref.org".to_string()]);
1163        assert!(a.matches("api.crossref.org"));
1164        assert!(!a.matches("crossref.org"));
1165        assert!(!a.matches("xapi.crossref.org"));
1166    }
1167
1168    #[test]
1169    fn allowlist_matches_subdomain_glob() {
1170        // Per docs/REDIRECT_ALLOWLIST.md §2.2 rule 3: `*.<suffix>`
1171        // matches both `<suffix>` itself AND any `*.<suffix>` subdomain,
1172        // but never matches a different suffix that happens to end with
1173        // `<suffix>` without a dot boundary.
1174        let a = SourceAllowlist::new("crossref", vec!["*.crossref.org".to_string()]);
1175        assert!(a.matches("doi.crossref.org"));
1176        assert!(a.matches("crossref.org"));
1177        assert!(!a.matches("notcrossref.org"));
1178        assert!(!a.matches("crossref.org.attacker.test"));
1179    }
1180
1181    #[test]
1182    fn allowlist_matches_is_case_insensitive() {
1183        let a = SourceAllowlist::new("crossref", vec!["API.crossref.ORG".to_string()]);
1184        assert!(a.matches("api.crossref.org"));
1185        assert!(a.matches("API.CROSSREF.ORG"));
1186    }
1187
1188    #[test]
1189    fn allowlist_with_no_redirect_hosts_matches_nothing() {
1190        // §2.2 rule 5: an empty `redirect_hosts` means "no redirects
1191        // permitted from this source".
1192        let a = SourceAllowlist::new("ghost", Vec::<String>::new());
1193        assert!(!a.matches("anything.test"));
1194        assert!(!a.matches(""));
1195    }
1196
1197    // ---------------------------------------------------------------
1198    // PDF magic-byte handling — tests on the body-parsing path. We
1199    // exercise the magic-byte branch via the public API against a
1200    // wiremock server so the assertion runs through the full
1201    // streaming codepath.
1202    // ---------------------------------------------------------------
1203
1204    /// Build a test-only `HttpClient` against an `http://` wiremock
1205    /// origin.
1206    ///
1207    /// Slice 5 (PR #84 advisory item A4 refactor): this helper now
1208    /// delegates to the public
1209    /// [`HttpClient::new_for_tests_allow_http`] constructor (defined
1210    /// just above the test module) instead of re-implementing the
1211    /// redirect-policy + `https_only(false)` builder. The two
1212    /// implementations had drifted into duplicates — keeping a private
1213    /// re-implementation only meant a future security tweak to the
1214    /// builder would silently leave the tests on a stale path.
1215    fn build_test_client_for_http(source: &str, allowlist_host: &str) -> HttpClient {
1216        HttpClient::new_for_tests_allow_http(source, allowlist_host)
1217    }
1218
1219    #[tokio::test]
1220    async fn pdf_magic_byte_match_succeeds() {
1221        let server = MockServer::start().await;
1222        let body = b"%PDF-1.7\n...some pdf bytes...".to_vec();
1223        Mock::given(method("GET"))
1224            .and(path("/paper.pdf"))
1225            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
1226            .mount(&server)
1227            .await;
1228        let host = server
1229            .uri()
1230            .parse::<Url>()
1231            .unwrap()
1232            .host_str()
1233            .unwrap()
1234            .to_string();
1235        let client = build_test_client_for_http("crossref", &host);
1236        let url: Url = format!("{}/paper.pdf", server.uri()).parse().unwrap();
1237        let (got_body, _final_url) = client.fetch_pdf("crossref", url).await.expect("ok");
1238        assert_eq!(&got_body[..], &body[..]);
1239    }
1240
1241    #[tokio::test]
1242    async fn pdf_magic_byte_mismatch_rejects() {
1243        let server = MockServer::start().await;
1244        Mock::given(method("GET"))
1245            .and(path("/not_a_pdf"))
1246            .respond_with(
1247                ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
1248            )
1249            .mount(&server)
1250            .await;
1251        let host = server
1252            .uri()
1253            .parse::<Url>()
1254            .unwrap()
1255            .host_str()
1256            .unwrap()
1257            .to_string();
1258        let client = build_test_client_for_http("crossref", &host);
1259        let url: Url = format!("{}/not_a_pdf", server.uri()).parse().unwrap();
1260        let err = client
1261            .fetch_pdf("crossref", url)
1262            .await
1263            .expect_err("not pdf");
1264        match err {
1265            HttpError::NotAPdf { got } => {
1266                assert_eq!(&got, b"<html");
1267            }
1268            other => panic!("expected NotAPdf, got {:?}", other),
1269        }
1270    }
1271
1272    #[tokio::test]
1273    async fn fetch_bytes_does_not_check_pdf_magic() {
1274        // The non-PDF path returns the body unchanged regardless of
1275        // magic bytes. This pins the boundary between the JSON/text
1276        // path and the PDF path.
1277        let server = MockServer::start().await;
1278        Mock::given(method("GET"))
1279            .and(path("/data.json"))
1280            .respond_with(
1281                ResponseTemplate::new(200).set_body_bytes(br#"{"hello":"world"}"#.to_vec()),
1282            )
1283            .mount(&server)
1284            .await;
1285        let host = server
1286            .uri()
1287            .parse::<Url>()
1288            .unwrap()
1289            .host_str()
1290            .unwrap()
1291            .to_string();
1292        let client = build_test_client_for_http("crossref", &host);
1293        let url: Url = format!("{}/data.json", server.uri()).parse().unwrap();
1294        let (body, _final_url) = client.fetch_bytes("crossref", url).await.expect("ok");
1295        assert_eq!(&body[..], br#"{"hello":"world"}"#);
1296    }
1297
1298    #[tokio::test]
1299    async fn oversized_body_via_content_length_short_circuits() {
1300        // Wiremock can advertise a `Content-Length` larger than the body
1301        // it actually serves; hyper accepts the mismatch and our
1302        // fast-path check fires before any body bytes are consumed.
1303        let server = MockServer::start().await;
1304        let oversized = PDF_MAX_BYTES + 1;
1305        Mock::given(method("GET"))
1306            .and(path("/huge"))
1307            .respond_with(
1308                ResponseTemplate::new(200)
1309                    .insert_header("content-length", oversized.to_string().as_str())
1310                    .set_body_bytes(b"%PDF-".to_vec()),
1311            )
1312            .mount(&server)
1313            .await;
1314        let host = server
1315            .uri()
1316            .parse::<Url>()
1317            .unwrap()
1318            .host_str()
1319            .unwrap()
1320            .to_string();
1321        let client = build_test_client_for_http("crossref", &host);
1322        let url: Url = format!("{}/huge", server.uri()).parse().unwrap();
1323        let err = client
1324            .fetch_bytes("crossref", url)
1325            .await
1326            .expect_err("should reject");
1327        match err {
1328            HttpError::OversizedBody { actual, cap } => {
1329                assert!(actual > cap, "actual {} should exceed cap {}", actual, cap);
1330                assert_eq!(cap, PDF_MAX_BYTES);
1331            }
1332            // The mismatched Content-Length may also trip an underlying
1333            // transport error before our fast-path runs. Either outcome
1334            // satisfies the security goal (the transfer was aborted
1335            // without buffering 100 GB), so accept Network here as a
1336            // wiremock idiosyncrasy rather than a contract relaxation.
1337            HttpError::Network(_) => {}
1338            other => panic!("expected OversizedBody or Network, got {:?}", other),
1339        }
1340    }
1341
1342    #[tokio::test]
1343    async fn unknown_source_rejected() {
1344        let client = HttpClient::new(tier_1_allowlist()).expect("client builds");
1345        let url: Url = "https://api.crossref.org/works/10.1234/x".parse().unwrap();
1346        let err = client
1347            .fetch_bytes("not-a-source", url)
1348            .await
1349            .expect_err("unknown source");
1350        match err {
1351            HttpError::UnknownSource { source_key } => {
1352                assert_eq!(source_key, "not-a-source")
1353            }
1354            other => panic!("expected UnknownSource, got {:?}", other),
1355        }
1356    }
1357
1358    #[tokio::test]
1359    async fn http_status_error_surfaces() {
1360        let server = MockServer::start().await;
1361        Mock::given(method("GET"))
1362            .and(path("/missing"))
1363            .respond_with(ResponseTemplate::new(404))
1364            .mount(&server)
1365            .await;
1366        let host = server
1367            .uri()
1368            .parse::<Url>()
1369            .unwrap()
1370            .host_str()
1371            .unwrap()
1372            .to_string();
1373        let client = build_test_client_for_http("crossref", &host);
1374        let url: Url = format!("{}/missing", server.uri()).parse().unwrap();
1375        let err = client.fetch_bytes("crossref", url).await.expect_err("404");
1376        match err {
1377            HttpError::HttpStatus { status, .. } => assert_eq!(status, 404),
1378            other => panic!("expected HttpStatus, got {:?}", other),
1379        }
1380    }
1381
1382    // ---------------------------------------------------------------
1383    // Redirect policy tests — drive the closure via wiremock 30x
1384    // responses pointing at insecure / off-allowlist targets. With
1385    // `https_only(true)` on the production builder the request never
1386    // leaves the initial leg — we run these against the test builder
1387    // (which relaxes `https_only` for the *initial* leg only) so the
1388    // redirect closure is reached and exercised.
1389    // ---------------------------------------------------------------
1390
1391    #[tokio::test]
1392    async fn redirect_to_http_is_rejected_by_closure() {
1393        let server = MockServer::start().await;
1394        Mock::given(method("GET"))
1395            .and(path("/redir"))
1396            .respond_with(
1397                ResponseTemplate::new(302).insert_header("location", "http://attacker.test/file"),
1398            )
1399            .mount(&server)
1400            .await;
1401        let host = server
1402            .uri()
1403            .parse::<Url>()
1404            .unwrap()
1405            .host_str()
1406            .unwrap()
1407            .to_string();
1408        let client = build_test_client_for_http("crossref", &host);
1409        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1410        let err = client
1411            .fetch_bytes("crossref", url)
1412            .await
1413            .expect_err("redirect to http rejected");
1414        match err {
1415            HttpError::Network(e) => {
1416                let msg = format!("{:?}", e);
1417                assert!(
1418                    msg.contains("InsecureRedirect") || msg.contains("non-HTTPS"),
1419                    "expected insecure-redirect signal in error chain, got {}",
1420                    msg
1421                );
1422            }
1423            other => panic!("expected Network(InsecureRedirect), got {:?}", other),
1424        }
1425    }
1426
1427    #[tokio::test]
1428    async fn redirect_outside_allowlist_is_rejected_by_closure() {
1429        let server = MockServer::start().await;
1430        Mock::given(method("GET"))
1431            .and(path("/redir"))
1432            .respond_with(
1433                ResponseTemplate::new(302).insert_header("location", "https://attacker.test/file"),
1434            )
1435            .mount(&server)
1436            .await;
1437        let host = server
1438            .uri()
1439            .parse::<Url>()
1440            .unwrap()
1441            .host_str()
1442            .unwrap()
1443            .to_string();
1444        let client = build_test_client_for_http("crossref", &host);
1445        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1446        let err = client
1447            .fetch_bytes("crossref", url)
1448            .await
1449            .expect_err("redirect to attacker rejected");
1450        match err {
1451            HttpError::Network(e) => {
1452                let msg = format!("{:?}", e);
1453                assert!(
1454                    msg.contains("RedirectDenied") || msg.contains("not in allowlist"),
1455                    "expected redirect-denied signal in error chain, got {}",
1456                    msg
1457                );
1458            }
1459            other => panic!("expected Network(RedirectDenied), got {:?}", other),
1460        }
1461    }
1462
1463    #[tokio::test]
1464    async fn redirect_to_allowlisted_https_host_is_followed_by_closure() {
1465        // 302 to an https host that IS in the allowlist. The redirect
1466        // dispatch will fail (DNS won't resolve `mirror.allowed.test`)
1467        // but the closure must NOT short-circuit — failure mode is a
1468        // transport error, not InsecureRedirect / RedirectDenied.
1469        let server = MockServer::start().await;
1470        Mock::given(method("GET"))
1471            .and(path("/redir"))
1472            .respond_with(
1473                ResponseTemplate::new(302)
1474                    .insert_header("location", "https://mirror.allowed.test/file"),
1475            )
1476            .mount(&server)
1477            .await;
1478        let initial_host = server
1479            .uri()
1480            .parse::<Url>()
1481            .unwrap()
1482            .host_str()
1483            .unwrap()
1484            .to_string();
1485        // Allow the initial host AND the redirect target host.
1486        let allowlist = SourceAllowlist::new(
1487            "crossref",
1488            vec![initial_host.clone(), "*.allowed.test".to_string()],
1489        );
1490        let allowlist_for_closure = allowlist.clone();
1491        let policy = Policy::custom(move |attempt| {
1492            let scheme = attempt.url().scheme().to_string();
1493            let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1494            if scheme != "https" {
1495                return attempt.error(HttpError::InsecureRedirect { scheme });
1496            }
1497            let h = match host_opt {
1498                Some(h) => h,
1499                None => {
1500                    return attempt.error(HttpError::RedirectDenied {
1501                        source_key: allowlist_for_closure.source.clone(),
1502                        host: String::new(),
1503                        expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1504                    });
1505                }
1506            };
1507            if !allowlist_for_closure.matches(&h) {
1508                return attempt.error(HttpError::RedirectDenied {
1509                    source_key: allowlist_for_closure.source.clone(),
1510                    host: h,
1511                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1512                });
1513            }
1514            attempt.follow()
1515        });
1516        let raw_client = ClientBuilder::new()
1517            .https_only(false)
1518            .redirect(policy)
1519            .connect_timeout(CONNECT_TIMEOUT)
1520            .timeout(Duration::from_secs(5))
1521            .user_agent("doiget/test")
1522            .tls_backend_rustls()
1523            .build()
1524            .expect("client builds");
1525        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1526        let err = raw_client.get(url).send().await.expect_err("DNS fails");
1527        // The error should NOT carry our InsecureRedirect / RedirectDenied
1528        // marker — the closure approved the redirect.
1529        let msg = format!("{:?}", err);
1530        assert!(
1531            !msg.contains("RedirectDenied") && !msg.contains("InsecureRedirect"),
1532            "closure short-circuited an allowed redirect: {}",
1533            msg,
1534        );
1535    }
1536
1537    #[test]
1538    fn http_client_clone_is_cheap() {
1539        // Sanity: cloning shares the inner Arc<HashMap<...>>.
1540        let a = HttpClient::new(tier_1_allowlist()).expect("builds");
1541        let b = a.clone();
1542        assert_eq!(a.clients.len(), b.clients.len());
1543        assert!(Arc::ptr_eq(&a.clients, &b.clients));
1544    }
1545
1546    // ---------------------------------------------------------------
1547    // HttpError -> Option<DenialContext>  (ADR-0023 §4 mapping)
1548    // ---------------------------------------------------------------
1549
1550    #[test]
1551    fn denial_from_redirect_denied_carries_attempted_and_expected() {
1552        use crate::{DenialContext, DenialReason};
1553        let e = HttpError::RedirectDenied {
1554            source_key: "crossref".to_string(),
1555            host: "evil.example.com".to_string(),
1556            expected_hosts: vec!["api.crossref.org".to_string(), "*.crossref.org".to_string()],
1557        };
1558        let dc: Option<DenialContext> = (&e).into();
1559        let dc = dc.expect("RedirectDenied -> Some(DenialContext)");
1560        assert_eq!(dc.reason, DenialReason::RedirectNotInAllowlist);
1561        assert_eq!(dc.source.as_deref(), Some("crossref"));
1562        assert_eq!(dc.attempted.as_deref(), Some("evil.example.com"));
1563        assert_eq!(
1564            dc.expected.as_deref(),
1565            Some(&["api.crossref.org".to_string(), "*.crossref.org".to_string()][..])
1566        );
1567        assert!(dc.cap.is_none());
1568        assert!(dc.actual.is_none());
1569        assert!(dc.hop_index.is_none());
1570    }
1571
1572    #[test]
1573    fn denial_from_oversized_body_carries_cap_and_actual() {
1574        use crate::{DenialContext, DenialReason};
1575        let e = HttpError::OversizedBody {
1576            actual: 209_715_200,
1577            cap: PDF_MAX_BYTES,
1578        };
1579        let dc: Option<DenialContext> = (&e).into();
1580        let dc = dc.expect("OversizedBody -> Some(DenialContext)");
1581        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
1582        assert_eq!(dc.cap, Some(PDF_MAX_BYTES));
1583        assert_eq!(dc.actual, Some(209_715_200));
1584        assert!(dc.source.is_none());
1585        assert!(dc.attempted.is_none());
1586        // OversizedBody has no allowlist channel: producer leaves
1587        // `expected` at `None` (NOT `Some(vec![])`). See the field doc on
1588        // `DenialContext::expected` for the disambiguation.
1589        assert!(dc.expected.is_none());
1590    }
1591
1592    #[test]
1593    fn denial_from_not_a_pdf_hex_encodes_got_bytes() {
1594        use crate::{DenialContext, DenialReason};
1595        // First 5 bytes of "<html" — what the magic-byte check sees when
1596        // a publisher returns an HTML interstitial instead of a PDF.
1597        let e = HttpError::NotAPdf {
1598            got: [0x3c, 0x68, 0x74, 0x6d, 0x6c],
1599        };
1600        let dc: Option<DenialContext> = (&e).into();
1601        let dc = dc.expect("NotAPdf -> Some(DenialContext)");
1602        assert_eq!(dc.reason, DenialReason::ContentTypeMismatch);
1603        assert_eq!(dc.attempted.as_deref(), Some("3c68746d6c"));
1604        assert_eq!(dc.expected.as_deref(), Some(&["%PDF-".to_string()][..]));
1605    }
1606
1607    #[test]
1608    fn denial_from_insecure_redirect_marks_insecure_scheme() {
1609        use crate::{DenialContext, DenialReason};
1610        let e = HttpError::InsecureRedirect {
1611            scheme: "http".to_string(),
1612        };
1613        let dc: Option<DenialContext> = (&e).into();
1614        let dc = dc.expect("InsecureRedirect -> Some(DenialContext)");
1615        // ADR-0023 §4 (post-incorporation review): InsecureRedirect maps
1616        // to its own dedicated `InsecureScheme` reason, not the host-
1617        // allowlist reason — they are semantically distinct denials.
1618        assert_eq!(dc.reason, DenialReason::InsecureScheme);
1619        assert_eq!(dc.attempted.as_deref(), Some("http:..."));
1620        assert_eq!(dc.expected.as_deref(), Some(&["https".to_string()][..]));
1621    }
1622
1623    #[test]
1624    fn denial_from_non_denial_variants_returns_none() {
1625        use crate::DenialContext;
1626        // Network / HttpStatus / UnknownSource are not denials; they
1627        // map to None per ADR-0023 §4.
1628        let e = HttpError::HttpStatus {
1629            status: 503,
1630            url: "https://api.crossref.org/works/x".to_string(),
1631        };
1632        let dc: Option<DenialContext> = (&e).into();
1633        assert!(dc.is_none(), "HttpStatus must not produce a DenialContext");
1634
1635        let e = HttpError::UnknownSource {
1636            source_key: "ghost".to_string(),
1637        };
1638        let dc: Option<DenialContext> = (&e).into();
1639        assert!(
1640            dc.is_none(),
1641            "UnknownSource must not produce a DenialContext"
1642        );
1643    }
1644
1645    // ---------------------------------------------------------------
1646    // Issue #117 — transient retry / backoff. Real time: wiremock
1647    // serves over real localhost IO and tokio `start_paused` is
1648    // incompatible with that (it auto-advances past reqwest's
1649    // timeout). Backoff is small enough that the slowest case
1650    // (persistent 503, 3 retries ≈ 3.5s) stays within the suite budget.
1651    // ---------------------------------------------------------------
1652
1653    fn host_of(server: &MockServer) -> String {
1654        server
1655            .uri()
1656            .parse::<Url>()
1657            .unwrap()
1658            .host_str()
1659            .unwrap()
1660            .to_string()
1661    }
1662
1663    #[tokio::test]
1664    async fn transient_503_then_200_succeeds() {
1665        let server = MockServer::start().await;
1666        // Catch-all 200 mounted first (lowest precedence); the
1667        // single-shot 503 mounted last takes precedence for the first
1668        // request only, then falls through to the 200.
1669        Mock::given(method("GET"))
1670            .and(path("/p"))
1671            .respond_with(ResponseTemplate::new(200).set_body_string(r#"{"ok":1}"#))
1672            .mount(&server)
1673            .await;
1674        Mock::given(method("GET"))
1675            .and(path("/p"))
1676            .respond_with(ResponseTemplate::new(503))
1677            .up_to_n_times(1)
1678            .mount(&server)
1679            .await;
1680
1681        let client = build_test_client_for_http("crossref", &host_of(&server));
1682        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
1683        let (body, _) = client
1684            .fetch_bytes("crossref", url)
1685            .await
1686            .expect("503-then-200 must succeed after one retry");
1687        assert_eq!(&body[..], br#"{"ok":1}"#);
1688    }
1689
1690    #[tokio::test]
1691    async fn persistent_503_exhausts_and_returns_httpstatus() {
1692        let server = MockServer::start().await;
1693        Mock::given(method("GET"))
1694            .and(path("/p"))
1695            .respond_with(ResponseTemplate::new(503))
1696            .mount(&server)
1697            .await;
1698
1699        let client = build_test_client_for_http("crossref", &host_of(&server));
1700        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
1701        let err = client
1702            .fetch_bytes("crossref", url)
1703            .await
1704            .expect_err("persistent 503 must exhaust retries");
1705        match err {
1706            HttpError::HttpStatus { status, .. } => assert_eq!(status, 503),
1707            other => panic!("expected HttpStatus 503, got {other:?}"),
1708        }
1709        // First attempt + MAX_FETCH_RETRIES retries.
1710        let reqs = server
1711            .received_requests()
1712            .await
1713            .expect("wiremock records requests");
1714        assert_eq!(reqs.len(), (MAX_FETCH_RETRIES + 1) as usize);
1715    }
1716
1717    #[tokio::test]
1718    async fn retry_after_429_then_200_succeeds() {
1719        let server = MockServer::start().await;
1720        Mock::given(method("GET"))
1721            .and(path("/p"))
1722            .respond_with(ResponseTemplate::new(200).set_body_string("ok"))
1723            .mount(&server)
1724            .await;
1725        Mock::given(method("GET"))
1726            .and(path("/p"))
1727            .respond_with(ResponseTemplate::new(429).insert_header("Retry-After", "1"))
1728            .up_to_n_times(1)
1729            .mount(&server)
1730            .await;
1731
1732        let client = build_test_client_for_http("crossref", &host_of(&server));
1733        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
1734        let (body, _) = client
1735            .fetch_bytes("crossref", url)
1736            .await
1737            .expect("429+Retry-After then 200 must succeed");
1738        assert_eq!(&body[..], b"ok");
1739    }
1740
1741    #[tokio::test]
1742    async fn permanent_404_is_not_retried() {
1743        let server = MockServer::start().await;
1744        Mock::given(method("GET"))
1745            .and(path("/p"))
1746            .respond_with(ResponseTemplate::new(404))
1747            .mount(&server)
1748            .await;
1749
1750        let client = build_test_client_for_http("crossref", &host_of(&server));
1751        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
1752        let _ = client
1753            .fetch_bytes("crossref", url)
1754            .await
1755            .expect_err("404 must fail");
1756        let reqs = server
1757            .received_requests()
1758            .await
1759            .expect("wiremock records requests");
1760        assert_eq!(reqs.len(), 1, "4xx (non-408/429) must NOT be retried");
1761    }
1762}