Skip to main content

doiget_core/
http.rs

1// allow: outbound-network
2//! Centralized HTTP client wrapper. All `Source` impls fetch through here.
3//!
4//! Security defaults per `docs/SECURITY.md`:
5//!   - rustls TLS only (no openssl, no native-tls — enforced by `deny.toml`)
6//!   - HTTPS-only redirect policy (file://, data://, http:// rejected)
7//!   - Per-source redirect host allowlist (`docs/REDIRECT_ALLOWLIST.md`)
8//!   - Body size cap ([`crate::PDF_MAX_BYTES`] = 100 MB)
9//!   - Per-request timeouts (connect 10s, read 60s, total 300s)
10//!   - PDF magic-byte check on the first 5 bytes (`%PDF-`)
11//!   - User-Agent: `doiget/<version> (+https://github.com/sotashimozono/doiget)`
12//!
13//! See `docs/SECURITY.md` §1.2-1.3 / §1.10 and `docs/REDIRECT_ALLOWLIST.md`.
14//!
15//! # Architectural note: per-source `reqwest::Client`
16//!
17//! `reqwest::redirect::Policy::custom` receives only an `Attempt` value, which
18//! exposes the next URL and previous URL chain but **not** the original
19//! request's headers. That makes the "tag the request with `X-Doiget-Source`
20//! and inspect it from inside the redirect closure" approach infeasible on
21//! `reqwest 0.13.x`. Instead, [`HttpClient`] holds one
22//! [`reqwest::Client`] per source — each client's redirect closure captures
23//! that source's [`SourceAllowlist`] so cross-source confusion is impossible
24//! by construction.
25
26use std::collections::HashMap;
27use std::sync::Arc;
28use std::sync::Once;
29use std::time::Duration;
30
31use bytes::{Bytes, BytesMut};
32use futures_util::StreamExt;
33use reqwest::redirect::Policy;
34use reqwest::{Client, ClientBuilder, Url};
35use thiserror::Error;
36
37use crate::{PDF_MAX_BYTES, VERSION};
38
/// Leading bytes every PDF must start with, per the PDF 1.7 specification
/// (ISO 32000-1 §7.5.2): the ASCII sequence `%PDF-`
/// (`0x25 0x50 0x44 0x46 0x2D`).
const PDF_MAGIC: [u8; 5] = *b"%PDF-";
42
/// Hard cap on redirect chain length. Matches `reqwest`'s default of 10.
/// Re-asserted here so the value is reviewed alongside the other security
/// defaults in this module rather than inheriting silently from upstream.
const MAX_REDIRECTS: usize = 10;

/// Connect timeout (10 s) per `docs/SECURITY.md` §1.2 (Slowloris row).
const CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Read (idle-between-bytes) timeout (60 s) per `docs/SECURITY.md` §1.2.
const READ_TIMEOUT: Duration = Duration::from_secs(60);

/// Total per-request timeout (300 s) per `docs/SECURITY.md` §1.2.
const TOTAL_TIMEOUT: Duration = Duration::from_secs(300);

/// Max retry attempts AFTER the first try, for transient failures only
/// (connect/timeout/mid-stream network errors and the transient HTTP
/// status set). 3 retries → up to 4 total attempts. See issue #117.
const MAX_FETCH_RETRIES: u32 = 3;

/// Base delay (500 ms) for the exponential backoff (`base * 2^attempt`,
/// jittered). Also the upper bound of the jitter range used by
/// `backoff_delay`.
const RETRY_BASE_DELAY: Duration = Duration::from_millis(500);

/// Hard ceiling (30 s) on any single backoff / `Retry-After` sleep. Keeps
/// the worst-case retry chain comfortably inside [`TOTAL_TIMEOUT`].
const RETRY_MAX_DELAY: Duration = Duration::from_secs(30);
68
/// Reports whether an HTTP status code is worth retrying.
///
/// The retryable set is request timeout (408), rate-limited (429) and the
/// transient 5xx family (500, 502, 503, 504). A plain 500 is included
/// because upstreams (Crossref/Unpaywall) intermittently 500 under load.
/// Every other status — including all 4xx besides 408/429 — is treated as
/// caller/permanent and is never retried.
fn is_transient_status(code: u16) -> bool {
    const TRANSIENT_STATUSES: [u16; 6] = [408, 429, 500, 502, 503, 504];
    TRANSIENT_STATUSES.contains(&code)
}
76
77/// A `reqwest::Error` is transient iff it is a connect or timeout
78/// failure or a mid-body transfer error. Redirect-policy aborts
79/// (allowlist denial), builder errors, and decode errors are NOT
80/// transient — retrying them cannot help and would mask a real denial.
81fn reqwest_is_transient(e: &reqwest::Error) -> bool {
82    (e.is_timeout() || e.is_connect() || e.is_body()) && !e.is_redirect()
83}
84
85/// Parse a `Retry-After` header expressed as integer seconds (the
86/// HTTP-date form is accepted by the RFC but rare for these APIs and
87/// deliberately ignored for the MVP — we fall back to exponential
88/// backoff in that case). Capped at [`RETRY_MAX_DELAY`].
89fn parse_retry_after(headers: &reqwest::header::HeaderMap) -> Option<Duration> {
90    let secs: u64 = headers
91        .get(reqwest::header::RETRY_AFTER)?
92        .to_str()
93        .ok()?
94        .trim()
95        .parse()
96        .ok()?;
97    Some(Duration::from_secs(secs).min(RETRY_MAX_DELAY))
98}
99
100/// Exponential backoff with decorrelated jitter. `RETRY_BASE_DELAY *
101/// 2^attempt`, capped at [`RETRY_MAX_DELAY`], plus 0..base jitter so a
102/// fleet of clients does not thunder back in lockstep. Jitter is derived
103/// from the wall-clock subsec nanos rather than pulling in an RNG
104/// dependency — adequate decorrelation for backoff, not a security
105/// primitive.
106fn backoff_delay(attempt: u32) -> Duration {
107    let factor = 1u64 << attempt.min(20);
108    let base_ms = RETRY_BASE_DELAY.as_millis() as u64;
109    let capped_ms = base_ms
110        .saturating_mul(factor)
111        .min(RETRY_MAX_DELAY.as_millis() as u64);
112    let jitter_ms = std::time::SystemTime::now()
113        .duration_since(std::time::UNIX_EPOCH)
114        .map(|d| (d.subsec_nanos() as u64) % base_ms.max(1))
115        .unwrap_or(0);
116    Duration::from_millis(capped_ms.saturating_add(jitter_ms))
117}
118
119// ---------------------------------------------------------------------------
120// SourceAllowlist
121// ---------------------------------------------------------------------------
122
/// Per-source allowlist entry. Matches the schema in
/// `docs/REDIRECT_ALLOWLIST.md` §2.
///
/// Pairs a source key with the host patterns that requests made on behalf
/// of that source may be redirected to. The type is `#[non_exhaustive]`,
/// so code outside this crate cannot build it with a struct literal and
/// must go through [`SourceAllowlist::new`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct SourceAllowlist {
    /// Source key. MUST match a `source` value in `docs/SOURCES.md` §1
    /// (e.g. `crossref`, `unpaywall`, `arxiv`).
    pub source: String,
    /// Each pattern is either a literal FQDN or a `*.<suffix>` glob (matches
    /// the suffix and any subdomain — see `docs/REDIRECT_ALLOWLIST.md` §2.2
    /// matching rule).
    pub redirect_hosts: Vec<String>,
}
136
137impl SourceAllowlist {
138    /// Construct a new allowlist entry.
139    pub fn new(source: impl Into<String>, redirect_hosts: Vec<String>) -> Self {
140        Self {
141            source: source.into(),
142            redirect_hosts,
143        }
144    }
145
146    /// Returns `true` if `host` matches any pattern in this allowlist.
147    ///
148    /// Matching is byte-level on the lowercased ASCII form of the host.
149    /// Callers MUST lowercase upstream; this method also lowercases as a
150    /// defense-in-depth measure but treats the result as ASCII (Punycode
151    /// is the caller's responsibility per `docs/REDIRECT_ALLOWLIST.md`
152    /// §2.2 rule 4).
153    pub fn matches(&self, host: &str) -> bool {
154        let host_lc = host.to_ascii_lowercase();
155        self.redirect_hosts
156            .iter()
157            .any(|pat| host_matches_pattern(&host_lc, pat))
158    }
159}
160
/// Returns `true` if `host` (already lowercased) matches `pattern` per
/// `docs/REDIRECT_ALLOWLIST.md` §2.2.
///
/// Two pattern forms are recognised (the pattern is ASCII-lowercased
/// before comparison):
///
/// - `*.<suffix>` — matches `<suffix>` itself and any host ending in
///   `.<suffix>`. The match is anchored on a label boundary, so
///   `*.arxiv.org` accepts `export.arxiv.org` but not `evilarxiv.org`.
/// - anything else — an exact FQDN, compared byte-for-byte.
///
/// A degenerate `*.` pattern (empty suffix) never matches. Without that
/// guard it would accept the empty host and every dot-terminated host,
/// turning a typo in an allowlist into an allow-all rule.
fn host_matches_pattern(host: &str, pattern: &str) -> bool {
    let pat_lc = pattern.to_ascii_lowercase();
    match pat_lc.strip_prefix("*.") {
        // Nothing to anchor on: fail closed.
        Some("") => false,
        // Suffix-glob: the bare suffix, or `<labels>.<suffix>`. Checking the
        // stripped remainder for a trailing dot avoids allocating a
        // `.<suffix>` string on every comparison.
        Some(suffix) => {
            host == suffix
                || host
                    .strip_suffix(suffix)
                    .is_some_and(|labels| labels.ends_with('.'))
        }
        // Exact-FQDN: byte-identical (after lowercasing the pattern).
        None => host == pat_lc,
    }
}
173
174/// Hard-coded Phase 1 allowlist for Tier 1 sources. Sourced from
175/// `docs/REDIRECT_ALLOWLIST.md` §3.
176///
177/// Marked `Phase 1; revisit during real fetches` in the spec — entries
178/// flagged `(unverified)` (e.g. arXiv subdomain redirect behavior) MUST be
179/// confirmed or removed before Phase 1 is closed; see §3.3 of the spec.
180pub fn tier_1_allowlist() -> Vec<SourceAllowlist> {
181    vec![
182        // §3.1 crossref
183        SourceAllowlist::new(
184            "crossref",
185            vec!["api.crossref.org".to_string(), "*.crossref.org".to_string()],
186        ),
187        // §3.2 unpaywall
188        SourceAllowlist::new("unpaywall", vec!["api.unpaywall.org".to_string()]),
189        // §3.3 arxiv
190        SourceAllowlist::new(
191            "arxiv",
192            vec![
193                "arxiv.org".to_string(),
194                "export.arxiv.org".to_string(),
195                "*.arxiv.org".to_string(),
196            ],
197        ),
198    ]
199}
200
201/// Hard-coded Phase 4 allowlist for Tier 2 metadata sources (OpenAlex,
202/// Semantic Scholar, DOAJ). Sourced from `docs/SOURCES.md` §1 (the Tier 2
203/// table) and `docs/REDIRECT_ALLOWLIST.md` §3 (same redirect-allowlist
204/// policy as Tier 1, distinct source keys).
205///
206/// Returned hosts:
207///
208/// - `"openalex"` → `api.openalex.org` (production OpenAlex REST API).
209/// - `"semantic_scholar"` → `api.semanticscholar.org` (S2 Graph API base).
210/// - `"doaj"` → `doaj.org` + `*.doaj.org` (DOAJ public API; wildcard
211///   covers `api.doaj.org` and any v4+ subdomain split).
212///
213/// Per `docs/SOURCES.md` §4 "OpenAlex / Semantic Scholar / DOAJ", these
214/// sources are **metadata-only**: their `Source::fetch` impls MUST
215/// return `pdf_bytes: None`. The redirect closure in [`HttpClient`]
216/// uses this list to deny redirects to off-list hosts under each Tier
217/// 2 source key — identical mechanism to Tier 1, but the per-tool
218/// capability gate (`profile.metadata.openalex` etc.) is layered on
219/// top so the network surface remains capability-aware.
220pub fn tier_2_allowlist() -> Vec<SourceAllowlist> {
221    vec![
222        SourceAllowlist::new("openalex", vec!["api.openalex.org".to_string()]),
223        SourceAllowlist::new(
224            "semantic_scholar",
225            vec!["api.semanticscholar.org".to_string()],
226        ),
227        SourceAllowlist::new(
228            "doaj",
229            vec!["doaj.org".to_string(), "*.doaj.org".to_string()],
230        ),
231    ]
232}
233
234/// Always-compiled allowlist for the **discovery search** call path
235/// (ADR-0031).
236///
237/// Registers `api.openalex.org` under the `"openalex"` source key so the
238/// Tier-1 `discovery::paper_search` (`GET /works?search=`) can reach the
239/// endpoint in the **default `oa-only` binary** — unlike
240/// [`tier_2_allowlist`], which the CLI only wires in under
241/// `#[cfg(feature = "citation")]`.
242///
243/// Discovery search is classified as Tier 1 OA metadata (read-only, never
244/// paywalled, never a PDF — same risk class as Crossref/Unpaywall), so its
245/// transport allowlist must exist regardless of the `metadata`/`citation`
246/// features (ADR-0031 D1/D2). The CLI's `build_http_client` extends the
247/// production allowlist with this **unconditionally**; in `citation`
248/// builds [`tier_2_allowlist`] re-registers the identical
249/// `"openalex" → api.openalex.org` entry, which is a harmless idempotent
250/// `HashMap` overwrite in [`HttpClient::new`].
251pub fn discovery_allowlist() -> Vec<SourceAllowlist> {
252    vec![SourceAllowlist::new(
253        "openalex",
254        vec!["api.openalex.org".to_string()],
255    )]
256}
257
258/// Always-compiled allowlist for the **full-text extraction** call path
259/// (ADR-0032).
260///
261/// Registers `ar5iv.labs.arxiv.org` under a dedicated `"ar5iv"` source key
262/// so [`crate::paper_text::paper_text`] (`GET /html/<arxiv-id>`) can reach
263/// the ar5iv LaTeXML-XHTML renderer in the **default `oa-only` binary** —
264/// the same always-on posture as [`discovery_allowlist`].
265///
266/// The host is an arXiv subdomain (`*.arxiv.org` already matches it under
267/// the [`tier_1_allowlist`] `"arxiv"` key), so this adds no new
268/// registrable domain to the network surface — it only registers the host
269/// under a **distinct source key** so the provenance trail records that
270/// extracted text came from the ar5iv HTML renderer, not the arXiv
271/// PDF/Atom API (ADR-0032 D3). Full-text extraction is classified Tier-1
272/// OA metadata (read-only, OA, never a PDF reinterpretation), so its
273/// transport allowlist must exist regardless of any feature gate
274/// (ADR-0032 D2). The CLI's `build_http_client` extends the production
275/// allowlist with this **unconditionally**.
276pub fn fulltext_allowlist() -> Vec<SourceAllowlist> {
277    vec![SourceAllowlist::new(
278        "ar5iv",
279        vec!["ar5iv.labs.arxiv.org".to_string()],
280    )]
281}
282
/// Phase 5a allowlist for the Springer Nature OA TDM source. Compiled only
/// with the `tdm-springer` Cargo feature, so default release binaries
/// never include the host pattern (per ADR-0002 and `docs/SOURCES.md` §3).
///
/// Entry returned:
/// - `"tdm-springer"` → `api.springernature.com` (production base) +
///   `*.springernature.com` (covers load-balancing subdomains; the
///   redirect closure denies anything outside the wildcard).
///
/// Per `docs/SOURCES.md` §4 "TDM sources (Phase 5)", a fetch under this
/// source key requires ALL THREE gates: the Cargo feature compiled in, the
/// `DOIGET_KEY_SPRINGER` env var present, and
/// `DOIGET_AGREE_TDM_SPRINGER=1`. The `CapabilityProfile` gate enforces
/// the env-var pair; this allowlist is the transport gate.
#[cfg(feature = "tdm-springer")]
pub fn tier_3_springer_allowlist() -> Vec<SourceAllowlist> {
    let hosts = vec![
        String::from("api.springernature.com"),
        String::from("*.springernature.com"),
    ];
    vec![SourceAllowlist::new("tdm-springer", hosts)]
}
308
/// Phase 5b allowlist for the APS Harvest TDM source. Compiled only with
/// the `tdm-aps` Cargo feature, so default release binaries never include
/// the host pattern (per ADR-0002 and `docs/SOURCES.md` §3).
///
/// Entry returned:
/// - `"tdm-aps"` → `harvest.aps.org` (production base) + `*.aps.org`
///   (covers load-balancing subdomains; the redirect closure denies
///   anything outside the wildcard).
///
/// Three-gate activation: the Cargo feature compiled in, the
/// `DOIGET_KEY_APS` env var present, and `DOIGET_AGREE_TDM_APS=1`. The
/// `CapabilityProfile` gate enforces the env-var pair; this allowlist is
/// the transport gate.
#[cfg(feature = "tdm-aps")]
pub fn tier_3_aps_allowlist() -> Vec<SourceAllowlist> {
    let hosts = vec![String::from("harvest.aps.org"), String::from("*.aps.org")];
    vec![SourceAllowlist::new("tdm-aps", hosts)]
}
330
/// Phase 5c allowlist for the Elsevier ScienceDirect TDM source. Compiled
/// only with the `tdm-elsevier` Cargo feature, so default release binaries
/// never include the host pattern (per ADR-0002 and `docs/SOURCES.md` §3).
///
/// Entry returned:
/// - `"tdm-elsevier"` → `api.elsevier.com` (production base) +
///   `*.elsevier.com` (covers load-balancing subdomains; the redirect
///   closure denies anything outside the wildcard).
///
/// Three-gate activation: the Cargo feature compiled in, the
/// `DOIGET_KEY_ELSEVIER` env var present, and
/// `DOIGET_AGREE_TDM_ELSEVIER=1`. The `CapabilityProfile` gate enforces
/// the env-var pair; this allowlist is the transport gate.
#[cfg(feature = "tdm-elsevier")]
pub fn tier_3_elsevier_allowlist() -> Vec<SourceAllowlist> {
    let hosts = vec![
        String::from("api.elsevier.com"),
        String::from("*.elsevier.com"),
    ];
    vec![SourceAllowlist::new("tdm-elsevier", hosts)]
}
352
353/// Hard-coded Phase 1 allowlist for the synthetic `"oa-publisher"` source —
354/// the publisher / preprint / repository hosts to which Unpaywall's
355/// `best_oa_location.url` (or `url_for_pdf`) typically resolves.
356///
357/// **Status: informed-best-effort.** Per `docs/REDIRECT_ALLOWLIST.md` §3,
358/// every entry below is a documented OA-publisher host pulled from the
359/// public DOI / OA discovery surface as of this function's authoring; they
360/// are **not** a substitute for empirical validation. Entries marked
361/// `(unverified)` MUST be confirmed by a real fetch or removed before
362/// Phase 1 is closed.
363///
364/// The orchestrator (`doiget-cli::commands::fetch::fetch_doi`) calls
365/// [`HttpClient::fetch_pdf`] under the `"oa-publisher"` source key when
366/// Unpaywall returns an OA URL. If the OA host is not in this list, the
367/// PDF leg is denied (`HttpError::RedirectDenied`) and the orchestrator
368/// falls back to metadata-only success (the `informed-best-effort`
369/// posture from the spec section above).
370pub fn oa_publisher_allowlist() -> Vec<SourceAllowlist> {
371    vec![SourceAllowlist::new(
372        "oa-publisher",
373        vec![
374            // Springer Nature OA imprints. Springer / SpringerOpen / Nature
375            // OA URLs all resolve under one of these registrable suffixes.
376            // (unverified) — confirm by replaying real Unpaywall responses.
377            "*.springer.com".to_string(),
378            "*.springeropen.com".to_string(),
379            "*.springernature.com".to_string(),
380            "*.nature.com".to_string(),
381            // Wiley OA. (unverified)
382            "*.wiley.com".to_string(),
383            // Elsevier OA route only — the TDM gated path is a separate
384            // source (`tdm-elsevier`, Phase 5c) and is not covered here.
385            // (unverified)
386            "*.elsevier.com".to_string(),
387            "*.sciencedirect.com".to_string(),
388            // Frontiers. (unverified)
389            "*.frontiersin.org".to_string(),
390            // MDPI. (unverified)
391            "*.mdpi.com".to_string(),
392            // PLOS. (unverified)
393            "*.plos.org".to_string(),
394            // Preprint servers — biorxiv / medrxiv. (unverified)
395            "*.biorxiv.org".to_string(),
396            "*.medrxiv.org".to_string(),
397            // Europe PMC + NIH PMC. (unverified)
398            "europepmc.org".to_string(),
399            "*.europepmc.org".to_string(),
400            "*.nih.gov".to_string(),
401            "*.ncbi.nlm.nih.gov".to_string(),
402            // Physics-society / diamond-OA hosts. UNLIKE the entries
403            // above, these are EMPIRICALLY VERIFIED: a real `doiget batch`
404            // over 30 OpenAlex-OA finite-temperature-MPS DOIs observed
405            // Unpaywall `best_oa_location` resolving to these hosts and
406            // being denied (#193, REDIRECT_ALLOWLIST.md §3.4, ADR-0027).
407            // APS — journals.aps.org / link.aps.org (green & gold OA;
408            // society host; `*.aps.org` is also trusted under the separate
409            // `tdm-aps` Tier-3 source key WHEN that feature is compiled
410            // in — `tier_3_aps_allowlist` is `#[cfg(feature = "tdm-aps")]`
411            // and absent from default release builds).
412            "*.aps.org".to_string(),
413            // SciPost — diamond OA, community-run physics publisher.
414            "scipost.org".to_string(),
415            "*.scipost.org".to_string(),
416            // IOP Publishing — iopscience.iop.org (New J. Phys. etc.).
417            "*.iop.org".to_string(),
418            // arXiv — already on the `arxiv` tier-1 allowlist, but the
419            // Unpaywall-driven path uses the `oa-publisher` source key,
420            // so we mirror the host list here too. See REDIRECT_ALLOWLIST.md
421            // §3.3 for the underlying entries.
422            "arxiv.org".to_string(),
423            "*.arxiv.org".to_string(),
424        ],
425    )]
426}
427
428// ---------------------------------------------------------------------------
429// HttpError
430// ---------------------------------------------------------------------------
431
/// Errors that can arise during HTTP fetches.
///
/// `RedirectDenied`, `InsecureRedirect`, `OversizedBody` and `NotAPdf` are
/// the denial classes that the `From<&HttpError>` conversion in this
/// module turns into a [`crate::DenialContext`]; the other variants are
/// transport, upstream or caller-error signals.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum HttpError {
    /// Transport / DNS / TLS failure or other `reqwest`-level error. Note
    /// that `reqwest` surfaces a redirect-policy abort (via `Attempt::error`)
    /// as a `reqwest::Error` carrying the source error — callers seeing
    /// `Network` for what they believed was a redirect violation should
    /// inspect the inner error chain.
    #[error("network error: {0}")]
    Network(#[from] reqwest::Error),
    /// Redirect target host did not match any pattern in the source's
    /// `redirect_hosts`. See `docs/REDIRECT_ALLOWLIST.md` §2.2.
    ///
    /// Field naming: `source_key` rather than `source` because `thiserror`
    /// auto-treats a field literally named `source` as a `#[source]` error
    /// chain link (which would require the field to implement `std::error::Error`).
    ///
    /// `expected_hosts` carries a snapshot of the source's allowlist
    /// patterns at the time of the denial — populated for the structured
    /// `denial_context.expected` channel introduced by ADR-0023 §4
    /// (NORMATIVE mapping table). Cloning the patterns into the error
    /// keeps the `From<&HttpError> for Option<DenialContext>` impl from
    /// having to re-look-up the allowlist by `source_key`. May be empty
    /// when the rejection happened before any allowlist was matched
    /// (e.g. URL had no host component at all).
    #[error("redirect target {host} not in allowlist for source {source_key}")]
    RedirectDenied {
        /// Source key whose allowlist rejected the redirect.
        source_key: String,
        /// The lowercased host that was rejected.
        host: String,
        /// Snapshot of the source's `redirect_hosts` at denial time.
        /// Surfaces as `denial_context.expected` (ADR-0023 §4).
        expected_hosts: Vec<String>,
    },
    /// Redirect target had a scheme other than `https`. See
    /// `docs/SECURITY.md` §1.3.
    #[error("redirect to non-HTTPS scheme: {scheme}")]
    InsecureRedirect {
        /// The disallowed scheme (e.g. `http`, `file`, `data`).
        scheme: String,
    },
    /// Body would exceed [`PDF_MAX_BYTES`] either by a `Content-Length`
    /// hint or by accumulated streamed bytes. See `docs/SECURITY.md` §1.2.
    #[error("body too large: {actual} bytes (cap = {cap})")]
    OversizedBody {
        /// Observed size (header value or accumulated bytes).
        actual: u64,
        /// Hard upper bound (always [`PDF_MAX_BYTES`]).
        cap: u64,
    },
    /// PDF magic-byte mismatch — the body does not start with `%PDF-`.
    /// We deliberately do NOT use `Content-Type` (publishers misbehave —
    /// the magic byte is the trustworthy signal per `docs/SECURITY.md`
    /// §1.2 "Magic-byte mismatch" row).
    #[error("PDF magic-byte mismatch: got {got:?}")]
    NotAPdf {
        /// First five bytes of the response body (zero-padded if shorter).
        got: [u8; 5],
    },
    /// Server returned a non-2xx status.
    #[error("HTTP {status} from {url}")]
    HttpStatus {
        /// HTTP status code.
        status: u16,
        /// The URL that produced the status.
        url: String,
    },
    /// No allowlist entry exists for this source. The caller asked
    /// [`HttpClient`] to fetch on behalf of a source that wasn't passed to
    /// [`HttpClient::new`].
    ///
    /// See note on `RedirectDenied` for why the field is `source_key`.
    #[error("no allowlist registered for source {source_key}")]
    UnknownSource {
        /// The unregistered source key.
        source_key: String,
    },
    /// A header name or value passed to
    /// [`HttpClient::fetch_bytes_with_headers`] was not a valid HTTP
    /// header. The header parser only accepts the visible-ASCII subset
    /// per RFC 7230 §3.2; control characters and non-ASCII bytes are
    /// rejected before the request is even built. Surfaces as
    /// `ErrorCode::InternalError` at the public boundary (callers
    /// supplying bad headers are responsible for fixing the call site;
    /// not a denial in the ADR-0023 sense).
    #[error("invalid HTTP header `{name}`: {reason}")]
    InvalidHeader {
        /// The header name as supplied by the caller.
        name: String,
        /// `"name"` or `"value"` — which side failed parsing.
        reason: String,
    },
}
527
528// ---------------------------------------------------------------------------
529// HttpError -> Option<DenialContext>  (ADR-0023 §4 mapping table)
530// ---------------------------------------------------------------------------
531
532/// Map an [`HttpError`] reference to the structured [`crate::DenialContext`]
533/// channel introduced by ADR-0023.
534///
535/// Returns `Some(_)` for the four denial classes named in ADR-0023 §4
536/// (`RedirectDenied`, `OversizedBody`, `NotAPdf`, `InsecureRedirect`) and
537/// `None` for every other variant — `Network`, `HttpStatus`,
538/// `UnknownSource` are not denials in the ADR-0023 sense (they are
539/// transport / upstream / programming-error signals, not allowlist or
540/// cap rejections).
541///
542/// The `&HttpError` borrow form is used (rather than `HttpError`) so the
543/// caller — typically the orchestrator that already needs the original
544/// error for `error.message` and the `From<HttpError> for ErrorCode`
545/// collapse — does not have to clone the error to produce the optional
546/// structured side-channel.
547impl From<&HttpError> for Option<crate::DenialContext> {
548    fn from(e: &HttpError) -> Self {
549        use crate::{DenialContext, DenialReason};
550        match e {
551            HttpError::RedirectDenied {
552                source_key,
553                host,
554                expected_hosts,
555            } => Some(DenialContext {
556                reason: DenialReason::RedirectNotInAllowlist,
557                source: Some(source_key.clone()),
558                attempted: Some(host.clone()),
559                expected: Some(expected_hosts.clone()),
560                hop_index: None,
561                cap: None,
562                actual: None,
563            }),
564            HttpError::OversizedBody { actual, cap } => Some(DenialContext {
565                reason: DenialReason::SizeCapExceeded,
566                source: None,
567                attempted: None,
568                // The size-cap reason has no allowlist channel; use
569                // `None` to signal "field not populated by producer"
570                // rather than `Some(vec![])` (which would mean "explicit
571                // empty allowlist"). See `DenialContext::expected` docs.
572                expected: None,
573                hop_index: None,
574                cap: Some(*cap),
575                actual: Some(*actual),
576            }),
577            HttpError::NotAPdf { got } => Some(DenialContext {
578                reason: DenialReason::ContentTypeMismatch,
579                source: None,
580                // ADR-0023 §4 mapping table: hex-encode the first 5 bytes
581                // for the `attempted` field. `format!("{:02x}...")` is
582                // chosen over `hex::encode` to avoid pulling the
583                // additional dep into this conversion path; the result is
584                // bit-identical (lowercase, zero-padded).
585                attempted: Some(format!(
586                    "{:02x}{:02x}{:02x}{:02x}{:02x}",
587                    got[0], got[1], got[2], got[3], got[4]
588                )),
589                expected: Some(vec!["%PDF-".to_string()]),
590                hop_index: None,
591                cap: None,
592                actual: None,
593            }),
594            HttpError::InsecureRedirect { scheme } => Some(DenialContext {
595                reason: DenialReason::InsecureScheme,
596                source: None,
597                attempted: Some(format!("{}:...", scheme)),
598                expected: Some(vec!["https".to_string()]),
599                hop_index: None,
600                cap: None,
601                actual: None,
602            }),
603            // `reqwest` wraps a custom error returned by the redirect
604            // policy closure (`attempt.error(HttpError::RedirectDenied{..})`
605            // / `attempt.error(HttpError::InsecureRedirect{..})`) inside a
606            // `reqwest::Error`, which surfaces here as `HttpError::Network`.
607            // Without source-chain walking, production redirect denials —
608            // the most operationally important denial class — would never
609            // produce a `DenialContext`, defeating the whole point of
610            // ADR-0023.
611            //
612            // Walk the `std::error::Error::source()` chain on the inner
613            // `reqwest::Error` and downcast each link to `&HttpError`. If
614            // a wrapped `HttpError` is found, recurse via this same `From`
615            // impl. Otherwise the network error is a "real" transport /
616            // DNS / TLS failure with no denial semantics — return `None`.
617            //
618            // `std::error::Error::source(e)` is fully-qualified to
619            // disambiguate against the inherent (and unrelated)
620            // `reqwest::Error::source()`.
621            HttpError::Network(e) => {
622                let mut source: Option<&(dyn std::error::Error + 'static)> =
623                    std::error::Error::source(e);
624                while let Some(s) = source {
625                    if let Some(http_err) = s.downcast_ref::<HttpError>() {
626                        return Option::<crate::DenialContext>::from(http_err);
627                    }
628                    source = s.source();
629                }
630                None
631            }
632            // The remaining variants are not "denials" in the ADR-0023
633            // sense — HttpStatus/UnknownSource are upstream / programming-
634            // error signals; InvalidHeader is a caller-bug signal.
635            HttpError::HttpStatus { .. }
636            | HttpError::UnknownSource { .. }
637            | HttpError::InvalidHeader { .. } => None,
638        }
639    }
640}
641
642// ---------------------------------------------------------------------------
643// HttpClient
644// ---------------------------------------------------------------------------
645
/// Workspace-wide HTTP client with the security defaults applied.
///
/// Internally holds one `reqwest::Client` per source. Construct via
/// [`HttpClient::new`] with the full set of allowlists the calling process
/// will need. Both fields are `Arc`-wrapped maps keyed by source, so a
/// clone shares the same underlying maps.
#[derive(Clone, Debug)]
pub struct HttpClient {
    /// One [`reqwest::Client`] per source. Each client carries a redirect
    /// policy that captures only that source's allowlist. `Arc` so cloning
    /// is cheap.
    clients: Arc<HashMap<String, Client>>,
    /// The exact [`SourceAllowlist`] each per-source client was built from,
    /// keyed by source. The redirect closure inside each `reqwest::Client`
    /// captures its allowlist *by move*, so it cannot be read back from the
    /// client itself. This map keeps the identical `SourceAllowlist`
    /// available to callers that must perform a *pre-fetch* host check on a
    /// metadata-discovered URL (issue #145 / `docs/REDIRECT_ALLOWLIST.md`
    /// §1: the allowlist is consulted "on the OA URL discovered through
    /// metadata sources before the actual PDF fetch is issued", not only on
    /// redirect hops). Storing the same value here — rather than re-deriving
    /// it from [`oa_publisher_allowlist`] at the call site — guarantees the
    /// pre-check and the redirect closure can never drift, and that the
    /// check works under the test constructors too (which register a
    /// wiremock host as the allowlist).
    allowlists: Arc<HashMap<String, SourceAllowlist>>,
}
672
673impl HttpClient {
674    /// Build a client with rustls + redirect-allowlist + size cap +
675    /// timeouts.
676    ///
677    /// `allowlists` MUST cover every source whose URL might be passed in;
678    /// fetches against unregistered sources return
679    /// [`HttpError::UnknownSource`].
680    ///
681    /// # Errors
682    ///
683    /// Returns the underlying `reqwest::Error` if `ClientBuilder::build`
684    /// fails (typically a TLS-backend init failure).
685    pub fn new(allowlists: Vec<SourceAllowlist>) -> Result<Self, reqwest::Error> {
686        let ua = format!(
687            "doiget/{} (+https://github.com/sotashimozono/doiget)",
688            VERSION
689        );
690        Self::new_with_user_agent(allowlists, &ua)
691    }
692
693    /// Build a client with a custom `User-Agent` header.
694    ///
695    /// Used by `doiget batch --user-agent` to override the default UA for
696    /// hosts that classify the default string as a bot.
697    pub fn new_with_user_agent(
698        allowlists: Vec<SourceAllowlist>,
699        user_agent: &str,
700    ) -> Result<Self, reqwest::Error> {
701        let mut clients = HashMap::with_capacity(allowlists.len());
702        let mut allowlist_map = HashMap::with_capacity(allowlists.len());
703        for entry in allowlists {
704            let source = entry.source.clone();
705            allowlist_map.insert(source.clone(), entry.clone());
706            let client = build_client(entry, user_agent)?;
707            clients.insert(source, client);
708        }
709        Ok(Self {
710            clients: Arc::new(clients),
711            allowlists: Arc::new(allowlist_map),
712        })
713    }
714
    /// The [`SourceAllowlist`] this client was built with for `source`, or
    /// `None` if `source` was not registered.
    ///
    /// This is the *identical* value captured by the per-source redirect
    /// closure (see [`HttpClient`]'s `allowlists` field doc). It exists so
    /// the orchestrator can apply the `docs/REDIRECT_ALLOWLIST.md` §1
    /// pre-fetch host check on a metadata-discovered OA URL — the URL that
    /// is fetched *without* necessarily passing through a redirect hop —
    /// using the same source of truth the redirect closure uses, so the two
    /// can never disagree. Callers MUST use this for the `"oa-publisher"`
    /// leg only; the initial template-constructed URL is exempt per
    /// `docs/REDIRECT_ALLOWLIST.md` §6.
    ///
    /// The lookup is a plain map access keyed by the exact `source` string;
    /// it performs no network activity and never fails.
    pub fn source_allowlist(&self, source: &str) -> Option<&SourceAllowlist> {
        self.allowlists.get(source)
    }
730
731    /// Fetch a URL, treating it as a JSON or text body. Caps at
732    /// [`PDF_MAX_BYTES`].
733    ///
734    /// Returns the response body bytes plus the effective final URL after
735    /// redirects (post-allowlist verification — every hop has already been
736    /// validated by the time this returns).
737    ///
738    /// # Errors
739    ///
740    /// Any [`HttpError`] variant.
741    pub async fn fetch_bytes(&self, source: &str, url: Url) -> Result<(Bytes, Url), HttpError> {
742        self.fetch_inner(source, url, &[], false).await
743    }
744
745    /// Like [`Self::fetch_bytes`] but attaches additional request
746    /// headers to the outgoing GET. The headers are validated up-front
747    /// against the visible-ASCII subset (RFC 7230 §3.2); any failure
748    /// returns [`HttpError::InvalidHeader`] before the request is sent.
749    ///
750    /// Used by Tier-3 TDM sources that authenticate via a header
751    /// (APS Harvest `X-API-Key`, Elsevier ScienceDirect `X-ELS-APIKey`).
752    /// Header values appear on the wire only — they are never logged.
753    ///
754    /// # Errors
755    ///
756    /// Any [`HttpError`] variant including [`HttpError::InvalidHeader`].
757    pub async fn fetch_bytes_with_headers(
758        &self,
759        source: &str,
760        url: Url,
761        headers: &[(&str, &str)],
762    ) -> Result<(Bytes, Url), HttpError> {
763        self.fetch_inner(source, url, headers, false).await
764    }
765
766    /// Fetch a URL expected to be a PDF. Same as [`Self::fetch_bytes`] plus
767    /// the magic-byte check on the first 5 bytes
768    /// (`%PDF-` = `[0x25, 0x50, 0x44, 0x46, 0x2D]`). Mismatch returns
769    /// [`HttpError::NotAPdf`].
770    ///
771    /// # Errors
772    ///
773    /// Any [`HttpError`] variant including [`HttpError::NotAPdf`].
774    pub async fn fetch_pdf(&self, source: &str, url: Url) -> Result<(Bytes, Url), HttpError> {
775        self.fetch_inner(source, url, &[], true).await
776    }
777
778    async fn fetch_inner(
779        &self,
780        source: &str,
781        url: Url,
782        headers: &[(&str, &str)],
783        check_pdf_magic: bool,
784    ) -> Result<(Bytes, Url), HttpError> {
785        // Normalise legacy `http://` URLs returned by OpenAlex /
786        // Unpaywall metadata before send. See `upgrade_http_to_https`
787        // for the rationale (TLS posture preserved per ADR-0020) and
788        // the loopback carve-out.
789        let url = upgrade_http_to_https(url);
790
791        let client = self
792            .clients
793            .get(source)
794            .ok_or_else(|| HttpError::UnknownSource {
795                source_key: source.to_string(),
796            })?;
797
798        // Parse headers up-front so an invalid name/value fails BEFORE
799        // we touch the network. `HeaderName::from_bytes` / `HeaderValue::from_str`
800        // accept the visible-ASCII subset only (RFC 7230 §3.2).
801        let mut header_map = reqwest::header::HeaderMap::with_capacity(headers.len());
802        for (name, value) in headers {
803            let hn = reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|_| {
804                HttpError::InvalidHeader {
805                    name: (*name).to_string(),
806                    reason: "name".to_string(),
807                }
808            })?;
809            let hv = reqwest::header::HeaderValue::from_str(value).map_err(|_| {
810                HttpError::InvalidHeader {
811                    name: (*name).to_string(),
812                    reason: "value".to_string(),
813                }
814            })?;
815            header_map.insert(hn, hv);
816        }
817
818        // Bounded retry loop (issue #117). Only transient classes are
819        // retried — connect/timeout/mid-stream network errors and the
820        // transient HTTP status set. Allowlist denials, NotAPdf,
821        // OversizedBody, 4xx (non-408/429) are deterministic and return
822        // on the first occurrence. GET is idempotent so a retried
823        // attempt re-streams the body from scratch.
824        let mut attempt: u32 = 0;
825        loop {
826            let send_result = client
827                .get(url.clone())
828                .headers(header_map.clone())
829                .send()
830                .await;
831            let response = match send_result {
832                Ok(r) => r,
833                Err(e) => {
834                    if attempt < MAX_FETCH_RETRIES && reqwest_is_transient(&e) {
835                        let d = backoff_delay(attempt);
836                        tracing::warn!(
837                            source,
838                            attempt,
839                            delay_ms = d.as_millis() as u64,
840                            error = %e,
841                            "transient send failure; retrying"
842                        );
843                        tokio::time::sleep(d).await;
844                        attempt += 1;
845                        continue;
846                    }
847                    return Err(HttpError::Network(e));
848                }
849            };
850            let final_url = response.url().clone();
851
852            // Status check before body read so we can fail fast.
853            let status = response.status();
854            if !status.is_success() {
855                let code = status.as_u16();
856                if attempt < MAX_FETCH_RETRIES && is_transient_status(code) {
857                    // Prefer the server's `Retry-After` over our backoff
858                    // when present (429/503 commonly carry it).
859                    let d = parse_retry_after(response.headers())
860                        .unwrap_or_else(|| backoff_delay(attempt));
861                    tracing::warn!(
862                        source,
863                        attempt,
864                        status = code,
865                        delay_ms = d.as_millis() as u64,
866                        "transient HTTP status; retrying"
867                    );
868                    tokio::time::sleep(d).await;
869                    attempt += 1;
870                    continue;
871                }
872                return Err(HttpError::HttpStatus {
873                    status: code,
874                    // Issue #146: Springer Nature authenticates via an
875                    // `api_key` URL query parameter (no header path
876                    // upstream). This error string is logged and may
877                    // surface to the user, so strip any `api_key`
878                    // value before it leaves the client. No other
879                    // source puts a secret in the query string, so
880                    // this is a no-op for them.
881                    url: redact_api_key_query(&final_url),
882                });
883            }
884
885            // Content-Length fast-path: if header is present and exceeds
886            // the cap, fail without reading any body (deterministic — not
887            // retried). Per `docs/SECURITY.md` §1.2.
888            if let Some(len) = response.content_length() {
889                if len > PDF_MAX_BYTES {
890                    return Err(HttpError::OversizedBody {
891                        actual: len,
892                        cap: PDF_MAX_BYTES,
893                    });
894                }
895            }
896
897            // Stream body and enforce the cap as bytes accumulate. A
898            // mid-stream transport error is transient (retry); an
899            // oversized body is deterministic (return).
900            let mut buf = BytesMut::new();
901            let mut stream = response.bytes_stream();
902            let mut oversized_at: Option<u64> = None;
903            let mut stream_err: Option<reqwest::Error> = None;
904            while let Some(chunk) = stream.next().await {
905                let chunk = match chunk {
906                    Ok(c) => c,
907                    Err(e) => {
908                        stream_err = Some(e);
909                        break;
910                    }
911                };
912                let projected = (buf.len() as u64).saturating_add(chunk.len() as u64);
913                if projected > PDF_MAX_BYTES {
914                    oversized_at = Some(projected);
915                    break;
916                }
917                buf.extend_from_slice(&chunk);
918            }
919            if let Some(actual) = oversized_at {
920                return Err(HttpError::OversizedBody {
921                    actual,
922                    cap: PDF_MAX_BYTES,
923                });
924            }
925            if let Some(e) = stream_err {
926                if attempt < MAX_FETCH_RETRIES && reqwest_is_transient(&e) {
927                    let d = backoff_delay(attempt);
928                    tracing::warn!(
929                        source,
930                        attempt,
931                        delay_ms = d.as_millis() as u64,
932                        error = %e,
933                        "transient mid-stream failure; retrying"
934                    );
935                    tokio::time::sleep(d).await;
936                    attempt += 1;
937                    continue;
938                }
939                return Err(HttpError::Network(e));
940            }
941            let body = buf.freeze();
942
943            if check_pdf_magic {
944                let mut got = [0u8; 5];
945                let n = body.len().min(5);
946                got[..n].copy_from_slice(&body[..n]);
947                if got != PDF_MAGIC {
948                    return Err(HttpError::NotAPdf { got });
949                }
950            }
951
952            return Ok((body, final_url));
953        }
954    }
955}
956
957/// Return `url` rendered as a string with the value of any `api_key`
958/// query parameter replaced by `REDACTED` (issue #146).
959///
960/// Springer Nature's TDM API authenticates **only** via an `api_key`
961/// query parameter — there is no header-auth path upstream — so the key
962/// is unavoidably in the request URL. This keeps it out of *our* log
963/// and error sinks (the `HttpError::HttpStatus` string in particular,
964/// which is `tracing`-logged and can surface to the user). It is a
965/// structural no-op for every other source, none of which carry a
966/// secret in the query string. Other pairs and their order are
967/// preserved; a URL with no `api_key` pair is rendered unchanged.
968fn redact_api_key_query(url: &url::Url) -> String {
969    const API_KEY_PARAM: &str = "api_key";
970    if url.query_pairs().all(|(k, _)| k != API_KEY_PARAM) {
971        return url.to_string();
972    }
973    let mut redacted = url.clone();
974    let pairs: Vec<(String, String)> = url
975        .query_pairs()
976        .map(|(k, v)| {
977            if k == API_KEY_PARAM {
978                (k.into_owned(), "REDACTED".to_string())
979            } else {
980                (k.into_owned(), v.into_owned())
981            }
982        })
983        .collect();
984    redacted.query_pairs_mut().clear().extend_pairs(pairs);
985    redacted.to_string()
986}
987
988/// Test-oriented [`HttpClient`] constructor. Originally `cfg(test)`; now
989/// also reachable from the `doiget-cli` orchestrator's integration tests
990/// (which live outside this crate and therefore cannot see `cfg(test)`-gated
991/// items). The constructor name retains its `for_tests_allow_http` signal —
992/// production code MUST use [`HttpClient::new`] with [`tier_1_allowlist`].
993#[allow(clippy::expect_used)]
994impl HttpClient {
995    /// Build a test-oriented `HttpClient` against an `http://` wiremock
996    /// origin. The redirect closure still rejects insecure schemes — we only
997    /// relax `https_only` at the connection level so wiremock can serve.
998    /// This is acceptable because the redirect closure (which is the
999    /// security-load-bearing path) is exercised by the
1000    /// `redirect_to_http_is_rejected_by_closure` test below.
1001    ///
1002    /// Production callers MUST use [`HttpClient::new`] with
1003    /// [`tier_1_allowlist`] — the `for_tests_allow_http` suffix is the load-
1004    /// bearing signal that this constructor lifts the initial-leg HTTPS-only
1005    /// requirement.
1006    pub fn new_for_tests_allow_http(source: &str, allowlist_host: &str) -> Self {
1007        let allowlist = SourceAllowlist::new(source, vec![allowlist_host.to_string()]);
1008        let client = build_client_allow_http(allowlist.clone()).expect("test client builds");
1009        let mut map = HashMap::new();
1010        let mut allowlist_map = HashMap::new();
1011        allowlist_map.insert(allowlist.source.clone(), allowlist.clone());
1012        map.insert(allowlist.source.clone(), client);
1013        Self {
1014            clients: Arc::new(map),
1015            allowlists: Arc::new(allowlist_map),
1016        }
1017    }
1018
1019    /// Multi-source variant of [`HttpClient::new_for_tests_allow_http`].
1020    ///
1021    /// Builds a relaxed-`https_only` client per `(source, allowlist_host)`
1022    /// pair. Used by the `doiget-cli` orchestrator's integration tests when
1023    /// more than one upstream needs to be wiremocked simultaneously
1024    /// (e.g. Crossref + Unpaywall against two different mock servers).
1025    /// Production callers MUST use [`HttpClient::new`] with
1026    /// [`tier_1_allowlist`].
1027    pub fn new_for_tests_allow_http_multi(entries: &[(&str, &str)]) -> Self {
1028        let mut map = HashMap::with_capacity(entries.len());
1029        let mut allowlist_map = HashMap::with_capacity(entries.len());
1030        for (source, host) in entries {
1031            let allowlist = SourceAllowlist::new(*source, vec![host.to_string()]);
1032            let client = build_client_allow_http(allowlist.clone()).expect("test client builds");
1033            allowlist_map.insert(allowlist.source.clone(), allowlist.clone());
1034            map.insert(allowlist.source.clone(), client);
1035        }
1036        Self {
1037            clients: Arc::new(map),
1038            allowlists: Arc::new(allowlist_map),
1039        }
1040    }
1041}
1042
1043fn build_client_allow_http(allowlist: SourceAllowlist) -> Result<Client, reqwest::Error> {
1044    ensure_crypto_provider();
1045    let allowlist_for_closure = allowlist.clone();
1046    let redirect_policy = Policy::custom(move |attempt| {
1047        let scheme = attempt.url().scheme().to_string();
1048        let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1049        let prev_count = attempt.previous().len();
1050        if scheme != "https" {
1051            return attempt.error(HttpError::InsecureRedirect { scheme });
1052        }
1053        if prev_count >= MAX_REDIRECTS {
1054            return attempt.stop();
1055        }
1056        let host = match host_opt {
1057            Some(h) => h,
1058            None => {
1059                return attempt.error(HttpError::RedirectDenied {
1060                    source_key: allowlist_for_closure.source.clone(),
1061                    host: String::new(),
1062                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1063                });
1064            }
1065        };
1066        if !allowlist_for_closure.matches(&host) {
1067            return attempt.error(HttpError::RedirectDenied {
1068                source_key: allowlist_for_closure.source.clone(),
1069                host,
1070                expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1071            });
1072        }
1073        attempt.follow()
1074    });
1075    ClientBuilder::new()
1076        // `https_only(false)` only at this scope — production builders
1077        // (the public `HttpClient::new`) keep it on.
1078        .https_only(false)
1079        .redirect(redirect_policy)
1080        .connect_timeout(CONNECT_TIMEOUT)
1081        .timeout(TOTAL_TIMEOUT)
1082        .read_timeout(READ_TIMEOUT)
1083        .user_agent(format!(
1084            "doiget/{} (+https://github.com/sotashimozono/doiget)",
1085            VERSION
1086        ))
1087        .tls_backend_rustls()
1088        .build()
1089}
1090
1091// ---------------------------------------------------------------------------
1092// ClientBuilder helpers
1093// ---------------------------------------------------------------------------
1094
1095/// Install the `ring` `rustls` crypto provider as the process default,
1096/// exactly once.
1097///
1098/// reqwest is built with the `rustls-no-provider` feature (ADR-0020
1099/// Amendment 1: drop aws-lc-rs so `cargo install` needs no cmake/C
1100/// toolchain and musl-static builds cleanly). With no bundled provider,
1101/// `reqwest::ClientBuilder::build` calls
1102/// `rustls::crypto::CryptoProvider::get_default()` and **panics**
1103/// (`"No provider set"`) unless a process-default provider was installed
1104/// first. Every client constructor below calls this; the `Once` makes it
1105/// safe to invoke from many sites and from concurrent tests.
1106fn ensure_crypto_provider() {
1107    static INIT: Once = Once::new();
1108    INIT.call_once(|| {
1109        // `install_default` errors only if a provider is already set;
1110        // under `Once` that is unreachable, but ignore it rather than
1111        // panic (another linked crate could have installed one first).
1112        let _ = rustls::crypto::ring::default_provider().install_default();
1113    });
1114}
1115
/// Public entry point for callers that build their own `reqwest::Client`
/// outside of [`HttpClient`] and need the process-default TLS provider
/// installed first (ADR-0020 Amendment 1).
///
/// Safe to call multiple times; the underlying `Once` makes it idempotent.
/// Callers that only go through the [`HttpClient`] constructors do not need
/// this: the client builders in this file perform the same installation
/// themselves.
pub fn init_tls() {
    // Thin public wrapper so the private helper keeps a single definition.
    ensure_crypto_provider();
}
1124
1125/// Upgrade an `http://` URL to `https://` for legacy publisher
1126/// metadata. Loopback hosts (`localhost`, any RFC 6761 `.localhost`
1127/// TLD subdomain, `127.0.0.0/8`, `::1`, IPv4-mapped IPv6 loopback)
1128/// are returned unchanged so the `new_for_tests_allow_http*` wiremock
1129/// path continues to talk plain HTTP to the local fixture server.
1130///
1131/// Non-`http` schemes (`https`, `file`, anything else) and cannot-be-
1132/// base URLs are returned unchanged. The function is total: it never
1133/// panics and never returns an error.
1134///
1135/// # Audit / posture
1136///
1137/// On a successful upgrade the function emits a `tracing::info!` event
1138/// so the rewrite appears in the operator's default-level structured
1139/// log. On the (in-practice unreachable) `set_scheme` failure path a
1140/// `tracing::warn!` event is emitted before returning the original
1141/// URL; the production client's `https_only(true)` then rejects the
1142/// send with a clear network error, preserving the TLS posture
1143/// established by ADR-0020.
1144///
1145/// # `Domain("localhost")` arm subtlety
1146///
1147/// The url crate resolves the bare host `localhost` to `127.0.0.1`
1148/// (Ipv4 variant) when parsing an `http://` URL, so the `Domain` arm
1149/// does NOT fire for that case (the `Ipv4` arm catches it). The arm
1150/// IS load-bearing for the RFC 6761 `.localhost` TLD (e.g.
1151/// `myservice.localhost`, `api.localhost`), which the url crate does
1152/// NOT auto-resolve to an IP and keeps as `Host::Domain`.
1153fn upgrade_http_to_https(url: Url) -> Url {
1154    if url.scheme() != "http" {
1155        return url;
1156    }
1157    match url.host() {
1158        None => {
1159            // Cannot-be-base URL (e.g. `http:foo`) — `set_scheme`
1160            // would reject the conversion.
1161            return url;
1162        }
1163        Some(url::Host::Domain(d)) if is_localhost_domain(d) => return url,
1164        Some(url::Host::Ipv4(ip)) if ip.is_loopback() => return url,
1165        Some(url::Host::Ipv6(ip)) if is_ipv6_loopback(ip) => return url,
1166        Some(_) => {}
1167    }
1168    let mut upgraded = url.clone();
1169    if upgraded.set_scheme("https").is_err() {
1170        // url-crate `set_scheme` is documented to fail only for
1171        // cannot-be-base URLs and a few cross-family transitions;
1172        // `http -> https` is supported because both are "special"
1173        // schemes. The fallback below is defence-in-depth.
1174        tracing::warn!(
1175            url = %url,
1176            "set_scheme(http -> https) failed unexpectedly; \
1177             sending original URL — https_only(true) will reject",
1178        );
1179        return url;
1180    }
1181    tracing::info!(
1182        original = %url,
1183        upgraded = %upgraded,
1184        "upgraded http -> https for legacy publisher metadata"
1185    );
1186    upgraded
1187}
1188
/// `true` for the `localhost` literal and any RFC 6761 `.localhost`
/// TLD subdomain (`myservice.localhost`, `api.localhost`, etc.).
///
/// Matching is ASCII-case-insensitive per host-name conventions. A single
/// trailing root dot is ignored, so the absolute forms `localhost.` and
/// `api.localhost.` are recognised as well — they denote the same names
/// and would otherwise be treated as public hosts.
///
/// Returns `false` for everything else, including the empty string, a bare
/// `.localhost` with no label in front of it, and names that merely end in
/// the letters `localhost` without a separating dot (`notlocalhost`).
fn is_localhost_domain(d: &str) -> bool {
    const DOT_LOCALHOST: &[u8] = b".localhost";

    // Drop at most one trailing dot (the DNS root label).
    let name = d.strip_suffix('.').unwrap_or(d);
    if name.eq_ignore_ascii_case("localhost") {
        return true;
    }

    // Compare on bytes so that slicing can never land inside a multi-byte
    // UTF-8 sequence. Strictly longer than the suffix: at least one byte
    // must precede `.localhost`.
    let bytes = name.as_bytes();
    if bytes.len() <= DOT_LOCALHOST.len() {
        return false;
    }
    let tail = &bytes[bytes.len() - DOT_LOCALHOST.len()..];
    tail.eq_ignore_ascii_case(DOT_LOCALHOST)
}
1205
/// Loopback test for IPv6 hosts that also recognises IPv4-mapped loopback.
///
/// Returns `true` for `::1` and for any address in `::ffff:127.0.0.0/8`.
/// `Ipv6Addr::is_loopback()` on its own only covers `::1`, so without the
/// mapped-address check a dual-stack caller hitting `[::ffff:127.0.0.1]`
/// would be silently upgraded to https.
fn is_ipv6_loopback(ip: std::net::Ipv6Addr) -> bool {
    let mapped_v4_is_loopback = match ip.to_ipv4_mapped() {
        Some(v4) => v4.is_loopback(),
        None => false,
    };
    ip.is_loopback() || mapped_v4_is_loopback
}
1216
1217fn build_client(allowlist: SourceAllowlist, ua: &str) -> Result<Client, reqwest::Error> {
1218    ensure_crypto_provider();
1219
1220    let user_agent = ua.to_string();
1221
1222    // Redirect policy: capture the per-source allowlist by value. The
1223    // closure is called for every redirect hop — there is no global
1224    // fallback, every hop is checked. Hard cap at MAX_REDIRECTS via the
1225    // attempt counter (mirrors reqwest's built-in limit).
1226    let allowlist_for_closure = allowlist.clone();
1227    let redirect_policy = Policy::custom(move |attempt| {
1228        // Inspect the candidate URL via owned copies so we can move
1229        // `attempt` into `error()` / `follow()` / `stop()` later without
1230        // the borrow checker complaining about an outstanding borrow of
1231        // `attempt`.
1232        let scheme = attempt.url().scheme().to_string();
1233        let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1234        let prev_count = attempt.previous().len();
1235
1236        // 1. Reject non-HTTPS up front. The `https_only(true)` builder
1237        //    flag below also catches this, but we want the dedicated
1238        //    `InsecureRedirect` error path (not a generic `https_only`
1239        //    abort) — see `docs/SECURITY.md` §1.3.
1240        if scheme != "https" {
1241            return attempt.error(HttpError::InsecureRedirect { scheme });
1242        }
1243
1244        // 2. Hop limit (`docs/SECURITY.md` §1.3 redirect_limit row).
1245        if prev_count >= MAX_REDIRECTS {
1246            return attempt.stop();
1247        }
1248
1249        // 3. Allowlist check on the candidate target host.
1250        //    `host_str()` is `None` for URLs without a host (e.g. data
1251        //    URIs); treat that as an allowlist miss.
1252        let host = match host_opt {
1253            Some(h) => h,
1254            None => {
1255                return attempt.error(HttpError::RedirectDenied {
1256                    source_key: allowlist_for_closure.source.clone(),
1257                    host: String::new(),
1258                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1259                });
1260            }
1261        };
1262        if !allowlist_for_closure.matches(&host) {
1263            return attempt.error(HttpError::RedirectDenied {
1264                source_key: allowlist_for_closure.source.clone(),
1265                host,
1266                expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1267            });
1268        }
1269
1270        attempt.follow()
1271    });
1272
1273    ClientBuilder::new()
1274        .https_only(true)
1275        .redirect(redirect_policy)
1276        .connect_timeout(CONNECT_TIMEOUT)
1277        .timeout(TOTAL_TIMEOUT)
1278        .read_timeout(READ_TIMEOUT)
1279        .user_agent(user_agent)
1280        // `tls_backend_rustls()` is the non-deprecated equivalent of the
1281        // older `use_rustls_tls()`. The workspace pins reqwest with
1282        // `rustls-no-provider` (ADR-0020 Amendment 1), so this is a
1283        // re-assertion at builder level rather than a feature switch; the
1284        // `ring` provider installed by `ensure_crypto_provider()` above
1285        // is what reqwest picks up via `CryptoProvider::get_default()`.
1286        .tls_backend_rustls()
1287        .build()
1288}
1289
1290// ---------------------------------------------------------------------------
1291// Tests
1292// ---------------------------------------------------------------------------
1293
1294#[cfg(test)]
1295#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1296mod tests {
1297    use super::*;
1298    use wiremock::matchers::{method, path};
1299    use wiremock::{Mock, MockServer, ResponseTemplate};
1300
1301    // ---------------------------------------------------------------
1302    // http -> https scheme upgrade (#220) — pure unit tests, no network.
1303    // ---------------------------------------------------------------
1304
1305    #[test]
1306    fn upgrade_http_to_https_rewrites_public_http_url() {
1307        let input = Url::parse("http://link.aps.org/pdf/10.1103/PhysRev.123.456").unwrap();
1308        let out = upgrade_http_to_https(input.clone());
1309        assert_eq!(out.scheme(), "https");
1310        assert_eq!(out.host_str(), Some("link.aps.org"));
1311        assert_eq!(out.path(), "/pdf/10.1103/PhysRev.123.456");
1312    }
1313
1314    #[test]
1315    fn upgrade_http_to_https_preserves_port_path_query_fragment() {
1316        let input = Url::parse("http://example.org:8080/a/b?q=1#frag").unwrap();
1317        let out = upgrade_http_to_https(input);
1318        assert_eq!(out.as_str(), "https://example.org:8080/a/b?q=1#frag");
1319    }
1320
1321    #[test]
1322    fn upgrade_http_to_https_is_idempotent_on_https() {
1323        let input = Url::parse("https://api.crossref.org/works/10.1234/foo").unwrap();
1324        let out = upgrade_http_to_https(input.clone());
1325        assert_eq!(out, input);
1326    }
1327
1328    #[test]
1329    fn upgrade_http_to_https_skips_localhost() {
1330        // wiremock binds to `127.0.0.1:PORT`; the loopback exception
1331        // is the load-bearing rule that keeps `new_for_tests_allow_http*`
1332        // working alongside the production fetch path.
1333        let input = Url::parse("http://localhost:7878/pdf").unwrap();
1334        let out = upgrade_http_to_https(input.clone());
1335        assert_eq!(out, input, "localhost MUST NOT be upgraded");
1336    }
1337
1338    #[test]
1339    fn upgrade_http_to_https_skips_127_loopback_block() {
1340        for host in ["127.0.0.1", "127.0.0.42", "127.255.255.254"] {
1341            let raw = format!("http://{host}:1234/x");
1342            let input = Url::parse(&raw).unwrap();
1343            let out = upgrade_http_to_https(input.clone());
1344            assert_eq!(out, input, "host `{host}` MUST NOT be upgraded");
1345        }
1346    }
1347
1348    #[test]
1349    fn upgrade_http_to_https_skips_ipv6_loopback() {
1350        let input = Url::parse("http://[::1]:9000/path").unwrap();
1351        let out = upgrade_http_to_https(input.clone());
1352        assert_eq!(out, input, "IPv6 loopback MUST NOT be upgraded");
1353    }
1354
1355    #[test]
1356    fn upgrade_http_to_https_preserves_case_in_path() {
1357        // Some publishers (e.g. APS legacy redirects) use mixed-case
1358        // path segments; upgrade must NOT lowercase or canonicalise.
1359        let input = Url::parse("http://link.aps.org/PDF/10.1103/PhysRevB.109.045136").unwrap();
1360        let out = upgrade_http_to_https(input);
1361        assert_eq!(out.path(), "/PDF/10.1103/PhysRevB.109.045136");
1362    }
1363
1364    // ---- Review-pass extensions ------------------------------------
1365
1366    #[test]
1367    fn upgrade_http_to_https_skips_dot_localhost_tld() {
1368        // RFC 6761 reserves the entire `.localhost` TLD for loopback.
1369        // A developer running `http://myservice.localhost:8080/` MUST
1370        // NOT see their URL silently upgraded to https.
1371        for raw in [
1372            "http://myservice.localhost/",
1373            "http://api.localhost:8080/x",
1374            "http://a.b.LOCALHOST/y",
1375        ] {
1376            let input = Url::parse(raw).unwrap();
1377            let out = upgrade_http_to_https(input.clone());
1378            assert_eq!(out, input, "{raw} MUST NOT be upgraded");
1379        }
1380    }
1381
1382    #[test]
1383    fn upgrade_http_to_https_skips_ipv4_mapped_ipv6_loopback() {
1384        // `::ffff:127.0.0.1` is the IPv4-mapped IPv6 form of 127.0.0.1.
1385        // `Ipv6Addr::is_loopback()` alone returns false for this form,
1386        // so dual-stack callers binding wiremock to it would be
1387        // silently upgraded without the `to_ipv4_mapped()` check.
1388        for raw in [
1389            "http://[::ffff:127.0.0.1]:9000/x",
1390            "http://[::ffff:127.0.0.42]/y",
1391        ] {
1392            let input = Url::parse(raw).unwrap();
1393            let out = upgrade_http_to_https(input.clone());
1394            assert_eq!(out, input, "{raw} MUST NOT be upgraded");
1395        }
1396    }
1397
1398    #[test]
1399    fn upgrade_http_to_https_is_noop_on_non_http_schemes() {
1400        // The first guard (`url.scheme() != "http"`) covers everything
1401        // that isn't http: https (idempotent), file, data, ftp...
1402        for raw in [
1403            "https://api.crossref.org/works/10.1234/foo",
1404            "file:///etc/passwd",
1405            "data:text/plain,hello",
1406            "ftp://ftp.example.org/papers/",
1407        ] {
1408            let input = Url::parse(raw).unwrap();
1409            let out = upgrade_http_to_https(input.clone());
1410            assert_eq!(
1411                out, input,
1412                "{raw} non-http scheme MUST be returned unchanged"
1413            );
1414        }
1415    }
1416
1417    #[test]
1418    fn upgrade_http_to_https_http_url_always_has_host() {
1419        // The url crate's parser enforces authority for "special"
1420        // schemes (`http`, `https`, `ws`, `wss`, `ftp`, `file`).
1421        // `Url::parse("http:foo")` synthesises a Domain("foo")
1422        // authority, so an http URL with `host() == None` is
1423        // unreachable from `Url::parse`. The `None` arm in
1424        // `upgrade_http_to_https` is defence-in-depth only — pinned
1425        // here so a future url-crate behavior change is caught.
1426        let url = Url::parse("http:foo").expect("parse");
1427        assert!(
1428            url.host().is_some(),
1429            "http URLs always carry a host per WHATWG URL spec"
1430        );
1431        // The fn still produces a sensible result (upgrade applies).
1432        let out = upgrade_http_to_https(url.clone());
1433        assert_eq!(out.scheme(), "https");
1434    }
1435
    #[test]
    fn upgrade_http_to_https_skips_localhost_case_insensitive() {
        // Upper- and mixed-case spellings of `localhost` must be exempt
        // from the upgrade exactly like the lowercase literal.
        //
        // NOTE(review): an earlier version of this comment said the url
        // crate resolves `localhost` to `127.0.0.1` at parse time so the
        // Ipv4 arm handles it. That does not match the sibling tests
        // (`is_localhost_domain` is asserted to accept the literal
        // `localhost`, i.e. it is treated as a domain name). The likely
        // mechanism is that the parser lowercases the host and the
        // Domain arm matches it — confirm against the url crate docs.
        // Either way this test pins the observable behavior, so a change
        // in the crate's host normalisation cannot silently start
        // upgrading these URLs.
        for raw in ["http://LOCALHOST/", "http://Localhost:8080/x"] {
            let input = Url::parse(raw).unwrap();
            let out = upgrade_http_to_https(input.clone());
            assert_eq!(out, input, "{raw} MUST NOT be upgraded");
        }
    }
1450
1451    #[test]
1452    fn is_localhost_domain_matches_literal_and_tld_suffix() {
1453        assert!(is_localhost_domain("localhost"));
1454        assert!(is_localhost_domain("LOCALHOST"));
1455        assert!(is_localhost_domain("api.localhost"));
1456        assert!(is_localhost_domain("nested.api.localhost"));
1457        assert!(is_localhost_domain("X.LocalHost"));
1458        assert!(!is_localhost_domain("localhost.example.org"));
1459        assert!(!is_localhost_domain("notlocalhost"));
1460        assert!(!is_localhost_domain(""));
1461        assert!(!is_localhost_domain(".localhost")); // empty label not valid
1462    }
1463
1464    #[test]
1465    fn is_ipv6_loopback_covers_both_pure_and_mapped() {
1466        use std::net::Ipv6Addr;
1467        assert!(is_ipv6_loopback(Ipv6Addr::LOCALHOST)); // ::1
1468        assert!(is_ipv6_loopback("::ffff:127.0.0.1".parse().unwrap()));
1469        assert!(is_ipv6_loopback("::ffff:127.0.0.42".parse().unwrap()));
1470        assert!(!is_ipv6_loopback("::".parse().unwrap()));
1471        assert!(!is_ipv6_loopback("2001:db8::1".parse().unwrap()));
1472        // IPv4-mapped non-loopback must NOT be considered loopback.
1473        assert!(!is_ipv6_loopback("::ffff:1.2.3.4".parse().unwrap()));
1474    }
1475
1476    // ---------------------------------------------------------------
1477    // Allowlist matching — pure unit tests, no network.
1478    // ---------------------------------------------------------------
1479
1480    #[test]
1481    fn tier_1_allowlist_includes_crossref() {
1482        let lists = tier_1_allowlist();
1483        let crossref = lists
1484            .iter()
1485            .find(|a| a.source == "crossref")
1486            .expect("crossref entry");
1487        assert!(
1488            crossref
1489                .redirect_hosts
1490                .iter()
1491                .any(|h| h.contains("crossref.org")),
1492            "crossref allowlist must contain a crossref.org pattern; got {:?}",
1493            crossref.redirect_hosts,
1494        );
1495    }
1496
1497    #[test]
1498    fn tier_1_allowlist_includes_unpaywall_and_arxiv() {
1499        let lists = tier_1_allowlist();
1500        assert!(lists.iter().any(|a| a.source == "unpaywall"));
1501        assert!(lists.iter().any(|a| a.source == "arxiv"));
1502    }
1503
1504    #[test]
1505    fn fulltext_allowlist_registers_ar5iv_host_under_distinct_key() {
1506        // ADR-0032 D3: the ar5iv renderer is registered under its own
1507        // `"ar5iv"` source key (not `"arxiv"`) so provenance distinguishes
1508        // full-text HTML from the arXiv PDF/Atom API.
1509        let lists = fulltext_allowlist();
1510        assert_eq!(lists.len(), 1, "exactly one full-text source entry");
1511        let ar5iv = &lists[0];
1512        assert_eq!(ar5iv.source, "ar5iv");
1513        assert!(ar5iv.matches("ar5iv.labs.arxiv.org"));
1514        // It is also an arXiv subdomain — the existing `*.arxiv.org` glob
1515        // already covers the host, so no new registrable domain is added.
1516        let arxiv = tier_1_allowlist()
1517            .into_iter()
1518            .find(|a| a.source == "arxiv")
1519            .expect("arxiv entry");
1520        assert!(
1521            arxiv.matches("ar5iv.labs.arxiv.org"),
1522            "ar5iv host must fall under the existing *.arxiv.org surface"
1523        );
1524    }
1525
1526    #[test]
1527    fn oa_publisher_allowlist_groups_under_one_synthetic_source() {
1528        // The OA-publisher fan-out from Unpaywall's `best_oa_location.url`
1529        // is keyed under a single synthetic `"oa-publisher"` source so the
1530        // orchestrator can pass that one source key to
1531        // `HttpClient::fetch_pdf`. See `docs/REDIRECT_ALLOWLIST.md` §3 (the
1532        // informed-best-effort note) and the function-level docs in
1533        // [`oa_publisher_allowlist`].
1534        let lists = oa_publisher_allowlist();
1535        assert_eq!(lists.len(), 1, "exactly one synthetic source entry");
1536        assert_eq!(lists[0].source, "oa-publisher");
1537    }
1538
1539    #[test]
1540    fn oa_publisher_allowlist_matches_known_oa_hosts() {
1541        let lists = oa_publisher_allowlist();
1542        let oa = lists
1543            .iter()
1544            .find(|a| a.source == "oa-publisher")
1545            .expect("oa-publisher entry");
1546        // Spot-check a representative entry per host family.
1547        assert!(oa.matches("link.springer.com"));
1548        assert!(oa.matches("nature.com"));
1549        assert!(oa.matches("onlinelibrary.wiley.com"));
1550        assert!(oa.matches("www.frontiersin.org"));
1551        assert!(oa.matches("www.mdpi.com"));
1552        assert!(oa.matches("journals.plos.org"));
1553        assert!(oa.matches("www.biorxiv.org"));
1554        assert!(oa.matches("europepmc.org"));
1555        assert!(oa.matches("www.ncbi.nlm.nih.gov"));
1556        assert!(oa.matches("arxiv.org"));
1557        // #193: physics-society / diamond-OA hosts (empirically observed
1558        // as Unpaywall best_oa_location targets in the dogfood run).
1559        assert!(oa.matches("link.aps.org"));
1560        assert!(oa.matches("journals.aps.org"));
1561        assert!(oa.matches("scipost.org"));
1562        assert!(oa.matches("www.scipost.org"));
1563        assert!(oa.matches("iopscience.iop.org"));
1564        // Document intent of the `*.<suffix>` form: per
1565        // `REDIRECT_ALLOWLIST.md` §2.2 rule 3 it matches the bare
1566        // registrable domain AND any subdomain. Unpaywall has not been
1567        // observed returning bare-domain PDF URLs for these publishers,
1568        // but accepting them is consistent with every other `*.` entry in
1569        // this list (e.g. `arxiv.org` matched by `*.arxiv.org`) and is
1570        // what the matching rule already implements.
1571        assert!(oa.matches("aps.org"));
1572        assert!(oa.matches("iop.org"));
1573        // Multi-level subdomains also match (e.g. SciPost's deep paths);
1574        // documents the wildcard scope rather than testing a known URL.
1575        assert!(oa.matches("submissions.scipost.org"));
1576        // Negative: an attacker host is not covered.
1577        assert!(!oa.matches("attacker.test"));
1578        // Negative: dot-boundary safety for the new entries — a different
1579        // suffix that merely ends with the registrable name must NOT match.
1580        assert!(!oa.matches("notaps.org"));
1581        assert!(!oa.matches("evilscipost.org"));
1582        assert!(!oa.matches("notiop.org"));
1583        // Negative: dot-boundary safety — `*.springer.com` must not match
1584        // `notspringer.com`.
1585        assert!(!oa.matches("notspringer.com"));
1586    }
1587
1588    #[test]
1589    fn allowlist_matches_exact_fqdn() {
1590        let a = SourceAllowlist::new("crossref", vec!["api.crossref.org".to_string()]);
1591        assert!(a.matches("api.crossref.org"));
1592        assert!(!a.matches("crossref.org"));
1593        assert!(!a.matches("xapi.crossref.org"));
1594    }
1595
1596    #[test]
1597    fn allowlist_matches_subdomain_glob() {
1598        // Per docs/REDIRECT_ALLOWLIST.md §2.2 rule 3: `*.<suffix>`
1599        // matches both `<suffix>` itself AND any `*.<suffix>` subdomain,
1600        // but never matches a different suffix that happens to end with
1601        // `<suffix>` without a dot boundary.
1602        let a = SourceAllowlist::new("crossref", vec!["*.crossref.org".to_string()]);
1603        assert!(a.matches("doi.crossref.org"));
1604        assert!(a.matches("crossref.org"));
1605        assert!(!a.matches("notcrossref.org"));
1606        assert!(!a.matches("crossref.org.attacker.test"));
1607    }
1608
1609    #[test]
1610    fn allowlist_matches_is_case_insensitive() {
1611        let a = SourceAllowlist::new("crossref", vec!["API.crossref.ORG".to_string()]);
1612        assert!(a.matches("api.crossref.org"));
1613        assert!(a.matches("API.CROSSREF.ORG"));
1614    }
1615
1616    #[test]
1617    fn allowlist_with_no_redirect_hosts_matches_nothing() {
1618        // §2.2 rule 5: an empty `redirect_hosts` means "no redirects
1619        // permitted from this source".
1620        let a = SourceAllowlist::new("ghost", Vec::<String>::new());
1621        assert!(!a.matches("anything.test"));
1622        assert!(!a.matches(""));
1623    }
1624
1625    // ---------------------------------------------------------------
1626    // PDF magic-byte handling — tests on the body-parsing path. We
1627    // exercise the magic-byte branch via the public API against a
1628    // wiremock server so the assertion runs through the full
1629    // streaming codepath.
1630    // ---------------------------------------------------------------
1631
    /// Build a test-only `HttpClient` against an `http://` wiremock
    /// origin.
    ///
    /// `source` and `allowlist_host` are forwarded unchanged to
    /// [`HttpClient::new_for_tests_allow_http`]; the tests in this module
    /// pass the source key they later fetch with and the mock server's
    /// host.
    ///
    /// Slice 5 (PR #84 advisory item A4 refactor): this helper now
    /// delegates to the public
    /// [`HttpClient::new_for_tests_allow_http`] constructor (defined
    /// just above the test module) instead of re-implementing the
    /// redirect-policy + `https_only(false)` builder. The two
    /// implementations had drifted into duplicates — keeping a private
    /// re-implementation only meant a future security tweak to the
    /// builder would silently leave the tests on a stale path.
    fn build_test_client_for_http(source: &str, allowlist_host: &str) -> HttpClient {
        HttpClient::new_for_tests_allow_http(source, allowlist_host)
    }
1646
1647    #[tokio::test]
1648    async fn pdf_magic_byte_match_succeeds() {
1649        let server = MockServer::start().await;
1650        let body = b"%PDF-1.7\n...some pdf bytes...".to_vec();
1651        Mock::given(method("GET"))
1652            .and(path("/paper.pdf"))
1653            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
1654            .mount(&server)
1655            .await;
1656        let host = server
1657            .uri()
1658            .parse::<Url>()
1659            .unwrap()
1660            .host_str()
1661            .unwrap()
1662            .to_string();
1663        let client = build_test_client_for_http("crossref", &host);
1664        let url: Url = format!("{}/paper.pdf", server.uri()).parse().unwrap();
1665        let (got_body, _final_url) = client.fetch_pdf("crossref", url).await.expect("ok");
1666        assert_eq!(&got_body[..], &body[..]);
1667    }
1668
1669    #[tokio::test]
1670    async fn pdf_magic_byte_mismatch_rejects() {
1671        let server = MockServer::start().await;
1672        Mock::given(method("GET"))
1673            .and(path("/not_a_pdf"))
1674            .respond_with(
1675                ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
1676            )
1677            .mount(&server)
1678            .await;
1679        let host = server
1680            .uri()
1681            .parse::<Url>()
1682            .unwrap()
1683            .host_str()
1684            .unwrap()
1685            .to_string();
1686        let client = build_test_client_for_http("crossref", &host);
1687        let url: Url = format!("{}/not_a_pdf", server.uri()).parse().unwrap();
1688        let err = client
1689            .fetch_pdf("crossref", url)
1690            .await
1691            .expect_err("not pdf");
1692        match err {
1693            HttpError::NotAPdf { got } => {
1694                assert_eq!(&got, b"<html");
1695            }
1696            other => panic!("expected NotAPdf, got {:?}", other),
1697        }
1698    }
1699
1700    #[tokio::test]
1701    async fn fetch_bytes_does_not_check_pdf_magic() {
1702        // The non-PDF path returns the body unchanged regardless of
1703        // magic bytes. This pins the boundary between the JSON/text
1704        // path and the PDF path.
1705        let server = MockServer::start().await;
1706        Mock::given(method("GET"))
1707            .and(path("/data.json"))
1708            .respond_with(
1709                ResponseTemplate::new(200).set_body_bytes(br#"{"hello":"world"}"#.to_vec()),
1710            )
1711            .mount(&server)
1712            .await;
1713        let host = server
1714            .uri()
1715            .parse::<Url>()
1716            .unwrap()
1717            .host_str()
1718            .unwrap()
1719            .to_string();
1720        let client = build_test_client_for_http("crossref", &host);
1721        let url: Url = format!("{}/data.json", server.uri()).parse().unwrap();
1722        let (body, _final_url) = client.fetch_bytes("crossref", url).await.expect("ok");
1723        assert_eq!(&body[..], br#"{"hello":"world"}"#);
1724    }
1725
1726    #[tokio::test]
1727    async fn oversized_body_via_content_length_short_circuits() {
1728        // Wiremock can advertise a `Content-Length` larger than the body
1729        // it actually serves; hyper accepts the mismatch and our
1730        // fast-path check fires before any body bytes are consumed.
1731        let server = MockServer::start().await;
1732        let oversized = PDF_MAX_BYTES + 1;
1733        Mock::given(method("GET"))
1734            .and(path("/huge"))
1735            .respond_with(
1736                ResponseTemplate::new(200)
1737                    .insert_header("content-length", oversized.to_string().as_str())
1738                    .set_body_bytes(b"%PDF-".to_vec()),
1739            )
1740            .mount(&server)
1741            .await;
1742        let host = server
1743            .uri()
1744            .parse::<Url>()
1745            .unwrap()
1746            .host_str()
1747            .unwrap()
1748            .to_string();
1749        let client = build_test_client_for_http("crossref", &host);
1750        let url: Url = format!("{}/huge", server.uri()).parse().unwrap();
1751        let err = client
1752            .fetch_bytes("crossref", url)
1753            .await
1754            .expect_err("should reject");
1755        match err {
1756            HttpError::OversizedBody { actual, cap } => {
1757                assert!(actual > cap, "actual {} should exceed cap {}", actual, cap);
1758                assert_eq!(cap, PDF_MAX_BYTES);
1759            }
1760            // The mismatched Content-Length may also trip an underlying
1761            // transport error before our fast-path runs. Either outcome
1762            // satisfies the security goal (the transfer was aborted
1763            // without buffering 100 GB), so accept Network here as a
1764            // wiremock idiosyncrasy rather than a contract relaxation.
1765            HttpError::Network(_) => {}
1766            other => panic!("expected OversizedBody or Network, got {:?}", other),
1767        }
1768    }
1769
1770    #[tokio::test]
1771    async fn unknown_source_rejected() {
1772        let client = HttpClient::new(tier_1_allowlist()).expect("client builds");
1773        let url: Url = "https://api.crossref.org/works/10.1234/x".parse().unwrap();
1774        let err = client
1775            .fetch_bytes("not-a-source", url)
1776            .await
1777            .expect_err("unknown source");
1778        match err {
1779            HttpError::UnknownSource { source_key } => {
1780                assert_eq!(source_key, "not-a-source")
1781            }
1782            other => panic!("expected UnknownSource, got {:?}", other),
1783        }
1784    }
1785
1786    #[tokio::test]
1787    async fn http_status_error_surfaces() {
1788        let server = MockServer::start().await;
1789        Mock::given(method("GET"))
1790            .and(path("/missing"))
1791            .respond_with(ResponseTemplate::new(404))
1792            .mount(&server)
1793            .await;
1794        let host = server
1795            .uri()
1796            .parse::<Url>()
1797            .unwrap()
1798            .host_str()
1799            .unwrap()
1800            .to_string();
1801        let client = build_test_client_for_http("crossref", &host);
1802        let url: Url = format!("{}/missing", server.uri()).parse().unwrap();
1803        let err = client.fetch_bytes("crossref", url).await.expect_err("404");
1804        match err {
1805            HttpError::HttpStatus { status, .. } => assert_eq!(status, 404),
1806            other => panic!("expected HttpStatus, got {:?}", other),
1807        }
1808    }
1809
1810    // ---------------------------------------------------------------
1811    // Redirect policy tests — drive the closure via wiremock 30x
1812    // responses pointing at insecure / off-allowlist targets. With
1813    // `https_only(true)` on the production builder the request never
1814    // leaves the initial leg — we run these against the test builder
1815    // (which relaxes `https_only` for the *initial* leg only) so the
1816    // redirect closure is reached and exercised.
1817    // ---------------------------------------------------------------
1818
1819    #[tokio::test]
1820    async fn redirect_to_http_is_rejected_by_closure() {
1821        let server = MockServer::start().await;
1822        Mock::given(method("GET"))
1823            .and(path("/redir"))
1824            .respond_with(
1825                ResponseTemplate::new(302).insert_header("location", "http://attacker.test/file"),
1826            )
1827            .mount(&server)
1828            .await;
1829        let host = server
1830            .uri()
1831            .parse::<Url>()
1832            .unwrap()
1833            .host_str()
1834            .unwrap()
1835            .to_string();
1836        let client = build_test_client_for_http("crossref", &host);
1837        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1838        let err = client
1839            .fetch_bytes("crossref", url)
1840            .await
1841            .expect_err("redirect to http rejected");
1842        match err {
1843            HttpError::Network(e) => {
1844                let msg = format!("{:?}", e);
1845                assert!(
1846                    msg.contains("InsecureRedirect") || msg.contains("non-HTTPS"),
1847                    "expected insecure-redirect signal in error chain, got {}",
1848                    msg
1849                );
1850            }
1851            other => panic!("expected Network(InsecureRedirect), got {:?}", other),
1852        }
1853    }
1854
1855    #[tokio::test]
1856    async fn redirect_outside_allowlist_is_rejected_by_closure() {
1857        let server = MockServer::start().await;
1858        Mock::given(method("GET"))
1859            .and(path("/redir"))
1860            .respond_with(
1861                ResponseTemplate::new(302).insert_header("location", "https://attacker.test/file"),
1862            )
1863            .mount(&server)
1864            .await;
1865        let host = server
1866            .uri()
1867            .parse::<Url>()
1868            .unwrap()
1869            .host_str()
1870            .unwrap()
1871            .to_string();
1872        let client = build_test_client_for_http("crossref", &host);
1873        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1874        let err = client
1875            .fetch_bytes("crossref", url)
1876            .await
1877            .expect_err("redirect to attacker rejected");
1878        match err {
1879            HttpError::Network(e) => {
1880                let msg = format!("{:?}", e);
1881                assert!(
1882                    msg.contains("RedirectDenied") || msg.contains("not in allowlist"),
1883                    "expected redirect-denied signal in error chain, got {}",
1884                    msg
1885                );
1886            }
1887            other => panic!("expected Network(RedirectDenied), got {:?}", other),
1888        }
1889    }
1890
1891    #[tokio::test]
1892    async fn redirect_to_allowlisted_https_host_is_followed_by_closure() {
1893        // 302 to an https host that IS in the allowlist. The redirect
1894        // dispatch will fail (DNS won't resolve `mirror.allowed.test`)
1895        // but the closure must NOT short-circuit — failure mode is a
1896        // transport error, not InsecureRedirect / RedirectDenied.
1897        let server = MockServer::start().await;
1898        Mock::given(method("GET"))
1899            .and(path("/redir"))
1900            .respond_with(
1901                ResponseTemplate::new(302)
1902                    .insert_header("location", "https://mirror.allowed.test/file"),
1903            )
1904            .mount(&server)
1905            .await;
1906        let initial_host = server
1907            .uri()
1908            .parse::<Url>()
1909            .unwrap()
1910            .host_str()
1911            .unwrap()
1912            .to_string();
1913        // Allow the initial host AND the redirect target host.
1914        let allowlist = SourceAllowlist::new(
1915            "crossref",
1916            vec![initial_host.clone(), "*.allowed.test".to_string()],
1917        );
1918        let allowlist_for_closure = allowlist.clone();
1919        let policy = Policy::custom(move |attempt| {
1920            let scheme = attempt.url().scheme().to_string();
1921            let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1922            if scheme != "https" {
1923                return attempt.error(HttpError::InsecureRedirect { scheme });
1924            }
1925            let h = match host_opt {
1926                Some(h) => h,
1927                None => {
1928                    return attempt.error(HttpError::RedirectDenied {
1929                        source_key: allowlist_for_closure.source.clone(),
1930                        host: String::new(),
1931                        expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1932                    });
1933                }
1934            };
1935            if !allowlist_for_closure.matches(&h) {
1936                return attempt.error(HttpError::RedirectDenied {
1937                    source_key: allowlist_for_closure.source.clone(),
1938                    host: h,
1939                    expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1940                });
1941            }
1942            attempt.follow()
1943        });
1944        ensure_crypto_provider();
1945        let raw_client = ClientBuilder::new()
1946            .https_only(false)
1947            .redirect(policy)
1948            .connect_timeout(CONNECT_TIMEOUT)
1949            .timeout(Duration::from_secs(5))
1950            .user_agent("doiget/test")
1951            .tls_backend_rustls()
1952            .build()
1953            .expect("client builds");
1954        let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1955        let err = raw_client.get(url).send().await.expect_err("DNS fails");
1956        // The error should NOT carry our InsecureRedirect / RedirectDenied
1957        // marker — the closure approved the redirect.
1958        let msg = format!("{:?}", err);
1959        assert!(
1960            !msg.contains("RedirectDenied") && !msg.contains("InsecureRedirect"),
1961            "closure short-circuited an allowed redirect: {}",
1962            msg,
1963        );
1964    }
1965
1966    #[test]
1967    fn http_client_clone_is_cheap() {
1968        // Sanity: cloning shares the inner Arc<HashMap<...>>.
1969        let a = HttpClient::new(tier_1_allowlist()).expect("builds");
1970        let b = a.clone();
1971        assert_eq!(a.clients.len(), b.clients.len());
1972        assert!(Arc::ptr_eq(&a.clients, &b.clients));
1973    }
1974
1975    // ---------------------------------------------------------------
1976    // HttpError -> Option<DenialContext>  (ADR-0023 §4 mapping)
1977    // ---------------------------------------------------------------
1978
1979    #[test]
1980    fn denial_from_redirect_denied_carries_attempted_and_expected() {
1981        use crate::{DenialContext, DenialReason};
1982        let e = HttpError::RedirectDenied {
1983            source_key: "crossref".to_string(),
1984            host: "evil.example.com".to_string(),
1985            expected_hosts: vec!["api.crossref.org".to_string(), "*.crossref.org".to_string()],
1986        };
1987        let dc: Option<DenialContext> = (&e).into();
1988        let dc = dc.expect("RedirectDenied -> Some(DenialContext)");
1989        assert_eq!(dc.reason, DenialReason::RedirectNotInAllowlist);
1990        assert_eq!(dc.source.as_deref(), Some("crossref"));
1991        assert_eq!(dc.attempted.as_deref(), Some("evil.example.com"));
1992        assert_eq!(
1993            dc.expected.as_deref(),
1994            Some(&["api.crossref.org".to_string(), "*.crossref.org".to_string()][..])
1995        );
1996        assert!(dc.cap.is_none());
1997        assert!(dc.actual.is_none());
1998        assert!(dc.hop_index.is_none());
1999    }
2000
2001    #[test]
2002    fn denial_from_oversized_body_carries_cap_and_actual() {
2003        use crate::{DenialContext, DenialReason};
2004        let e = HttpError::OversizedBody {
2005            actual: 209_715_200,
2006            cap: PDF_MAX_BYTES,
2007        };
2008        let dc: Option<DenialContext> = (&e).into();
2009        let dc = dc.expect("OversizedBody -> Some(DenialContext)");
2010        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
2011        assert_eq!(dc.cap, Some(PDF_MAX_BYTES));
2012        assert_eq!(dc.actual, Some(209_715_200));
2013        assert!(dc.source.is_none());
2014        assert!(dc.attempted.is_none());
2015        // OversizedBody has no allowlist channel: producer leaves
2016        // `expected` at `None` (NOT `Some(vec![])`). See the field doc on
2017        // `DenialContext::expected` for the disambiguation.
2018        assert!(dc.expected.is_none());
2019    }
2020
2021    #[test]
2022    fn denial_from_not_a_pdf_hex_encodes_got_bytes() {
2023        use crate::{DenialContext, DenialReason};
2024        // First 5 bytes of "<html" — what the magic-byte check sees when
2025        // a publisher returns an HTML interstitial instead of a PDF.
2026        let e = HttpError::NotAPdf {
2027            got: [0x3c, 0x68, 0x74, 0x6d, 0x6c],
2028        };
2029        let dc: Option<DenialContext> = (&e).into();
2030        let dc = dc.expect("NotAPdf -> Some(DenialContext)");
2031        assert_eq!(dc.reason, DenialReason::ContentTypeMismatch);
2032        assert_eq!(dc.attempted.as_deref(), Some("3c68746d6c"));
2033        assert_eq!(dc.expected.as_deref(), Some(&["%PDF-".to_string()][..]));
2034    }
2035
2036    #[test]
2037    fn denial_from_insecure_redirect_marks_insecure_scheme() {
2038        use crate::{DenialContext, DenialReason};
2039        let e = HttpError::InsecureRedirect {
2040            scheme: "http".to_string(),
2041        };
2042        let dc: Option<DenialContext> = (&e).into();
2043        let dc = dc.expect("InsecureRedirect -> Some(DenialContext)");
2044        // ADR-0023 §4 (post-incorporation review): InsecureRedirect maps
2045        // to its own dedicated `InsecureScheme` reason, not the host-
2046        // allowlist reason — they are semantically distinct denials.
2047        assert_eq!(dc.reason, DenialReason::InsecureScheme);
2048        assert_eq!(dc.attempted.as_deref(), Some("http:..."));
2049        assert_eq!(dc.expected.as_deref(), Some(&["https".to_string()][..]));
2050    }
2051
2052    #[test]
2053    fn denial_from_non_denial_variants_returns_none() {
2054        use crate::DenialContext;
2055        // Network / HttpStatus / UnknownSource are not denials; they
2056        // map to None per ADR-0023 §4.
2057        let e = HttpError::HttpStatus {
2058            status: 503,
2059            url: "https://api.crossref.org/works/x".to_string(),
2060        };
2061        let dc: Option<DenialContext> = (&e).into();
2062        assert!(dc.is_none(), "HttpStatus must not produce a DenialContext");
2063
2064        let e = HttpError::UnknownSource {
2065            source_key: "ghost".to_string(),
2066        };
2067        let dc: Option<DenialContext> = (&e).into();
2068        assert!(
2069            dc.is_none(),
2070            "UnknownSource must not produce a DenialContext"
2071        );
2072    }
2073
2074    // ---------------------------------------------------------------
2075    // Issue #117 — transient retry / backoff. Real time: wiremock
2076    // serves over real localhost IO and tokio `start_paused` is
2077    // incompatible with that (it auto-advances past reqwest's
2078    // timeout). Backoff is small enough that the slowest case
2079    // (persistent 503, 3 retries ≈ 3.5s) stays within the suite budget.
2080    // ---------------------------------------------------------------
2081
2082    fn host_of(server: &MockServer) -> String {
2083        server
2084            .uri()
2085            .parse::<Url>()
2086            .unwrap()
2087            .host_str()
2088            .unwrap()
2089            .to_string()
2090    }
2091
2092    #[tokio::test]
2093    async fn transient_503_then_200_succeeds() {
2094        let server = MockServer::start().await;
2095        // Catch-all 200 mounted first (lowest precedence); the
2096        // single-shot 503 mounted last takes precedence for the first
2097        // request only, then falls through to the 200.
2098        Mock::given(method("GET"))
2099            .and(path("/p"))
2100            .respond_with(ResponseTemplate::new(200).set_body_string(r#"{"ok":1}"#))
2101            .mount(&server)
2102            .await;
2103        Mock::given(method("GET"))
2104            .and(path("/p"))
2105            .respond_with(ResponseTemplate::new(503))
2106            .up_to_n_times(1)
2107            .mount(&server)
2108            .await;
2109
2110        let client = build_test_client_for_http("crossref", &host_of(&server));
2111        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
2112        let (body, _) = client
2113            .fetch_bytes("crossref", url)
2114            .await
2115            .expect("503-then-200 must succeed after one retry");
2116        assert_eq!(&body[..], br#"{"ok":1}"#);
2117    }
2118
2119    #[tokio::test]
2120    async fn persistent_503_exhausts_and_returns_httpstatus() {
2121        let server = MockServer::start().await;
2122        Mock::given(method("GET"))
2123            .and(path("/p"))
2124            .respond_with(ResponseTemplate::new(503))
2125            .mount(&server)
2126            .await;
2127
2128        let client = build_test_client_for_http("crossref", &host_of(&server));
2129        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
2130        let err = client
2131            .fetch_bytes("crossref", url)
2132            .await
2133            .expect_err("persistent 503 must exhaust retries");
2134        match err {
2135            HttpError::HttpStatus { status, .. } => assert_eq!(status, 503),
2136            other => panic!("expected HttpStatus 503, got {other:?}"),
2137        }
2138        // First attempt + MAX_FETCH_RETRIES retries.
2139        let reqs = server
2140            .received_requests()
2141            .await
2142            .expect("wiremock records requests");
2143        assert_eq!(reqs.len(), (MAX_FETCH_RETRIES + 1) as usize);
2144    }
2145
2146    #[tokio::test]
2147    async fn retry_after_429_then_200_succeeds() {
2148        let server = MockServer::start().await;
2149        Mock::given(method("GET"))
2150            .and(path("/p"))
2151            .respond_with(ResponseTemplate::new(200).set_body_string("ok"))
2152            .mount(&server)
2153            .await;
2154        Mock::given(method("GET"))
2155            .and(path("/p"))
2156            .respond_with(ResponseTemplate::new(429).insert_header("Retry-After", "1"))
2157            .up_to_n_times(1)
2158            .mount(&server)
2159            .await;
2160
2161        let client = build_test_client_for_http("crossref", &host_of(&server));
2162        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
2163        let (body, _) = client
2164            .fetch_bytes("crossref", url)
2165            .await
2166            .expect("429+Retry-After then 200 must succeed");
2167        assert_eq!(&body[..], b"ok");
2168    }
2169
2170    #[tokio::test]
2171    async fn permanent_404_is_not_retried() {
2172        let server = MockServer::start().await;
2173        Mock::given(method("GET"))
2174            .and(path("/p"))
2175            .respond_with(ResponseTemplate::new(404))
2176            .mount(&server)
2177            .await;
2178
2179        let client = build_test_client_for_http("crossref", &host_of(&server));
2180        let url: Url = format!("{}/p", server.uri()).parse().unwrap();
2181        let _ = client
2182            .fetch_bytes("crossref", url)
2183            .await
2184            .expect_err("404 must fail");
2185        let reqs = server
2186            .received_requests()
2187            .await
2188            .expect("wiremock records requests");
2189        assert_eq!(reqs.len(), 1, "4xx (non-408/429) must NOT be retried");
2190    }
2191}