doiget_core/http.rs
1// allow: outbound-network
2//! Centralized HTTP client wrapper. All `Source` impls fetch through here.
3//!
4//! Security defaults per `docs/SECURITY.md`:
5//! - rustls TLS only (no openssl, no native-tls — enforced by `deny.toml`)
6//! - HTTPS-only redirect policy (file://, data://, http:// rejected)
7//! - Per-source redirect host allowlist (`docs/REDIRECT_ALLOWLIST.md`)
8//! - Body size cap ([`crate::PDF_MAX_BYTES`] = 100 MB)
9//! - Per-request timeouts (connect 10s, read 60s, total 300s)
10//! - PDF magic-byte check on the first 5 bytes (`%PDF-`)
11//! - User-Agent: `doiget/<version> (+https://github.com/sotashimozono/doiget)`
12//!
13//! See `docs/SECURITY.md` §1.2-1.3 / §1.10 and `docs/REDIRECT_ALLOWLIST.md`.
14//!
15//! # Architectural note: per-source `reqwest::Client`
16//!
17//! `reqwest::redirect::Policy::custom` receives only an `Attempt` value, which
18//! exposes the next URL and previous URL chain but **not** the original
19//! request's headers. That makes the "tag the request with `X-Doiget-Source`
20//! and inspect it from inside the redirect closure" approach infeasible on
21//! `reqwest 0.13.x`. Instead, [`HttpClient`] holds one
22//! [`reqwest::Client`] per source — each client's redirect closure captures
23//! that source's [`SourceAllowlist`] so cross-source confusion is impossible
24//! by construction.
25
26use std::collections::HashMap;
27use std::sync::Arc;
28use std::sync::Once;
29use std::time::Duration;
30
31use bytes::{Bytes, BytesMut};
32use futures_util::StreamExt;
33use reqwest::redirect::Policy;
34use reqwest::{Client, ClientBuilder, Url};
35use thiserror::Error;
36
37use crate::{PDF_MAX_BYTES, VERSION};
38
/// Magic prefix every PDF body must start with: the ASCII bytes `%PDF-`
/// (`[0x25, 0x50, 0x44, 0x46, 0x2D]`), per the PDF 1.7 specification
/// (ISO 32000-1 §7.5.2).
const PDF_MAGIC: [u8; 5] = *b"%PDF-";

/// Upper bound on the number of hops in a single redirect chain. Equal to
/// `reqwest`'s default of 10, but spelled out here so the value is reviewed
/// together with the other security defaults in this module instead of
/// being inherited silently from upstream.
const MAX_REDIRECTS: usize = 10;

/// Connect timeout — see `docs/SECURITY.md` §1.2 (Slowloris row).
const CONNECT_TIMEOUT: Duration = Duration::from_secs(10);

/// Read timeout, i.e. the longest tolerated idle gap between bytes — see
/// `docs/SECURITY.md` §1.2.
const READ_TIMEOUT: Duration = Duration::from_secs(60);

/// Overall budget for one request — see `docs/SECURITY.md` §1.2.
const TOTAL_TIMEOUT: Duration = Duration::from_secs(300);

/// How many times a fetch may be retried AFTER its first try. Retries
/// apply to transient failures only (connect / timeout / mid-stream
/// network errors and the transient HTTP status set), so 3 retries mean
/// at most 4 attempts in total. See issue #117.
const MAX_FETCH_RETRIES: u32 = 3;

/// Starting delay of the exponential backoff (`base * 2^attempt`, with
/// jitter on top).
const RETRY_BASE_DELAY: Duration = Duration::from_millis(500);

/// Hard ceiling on any single backoff / `Retry-After` sleep. Keeps the
/// worst-case retry chain comfortably inside [`TOTAL_TIMEOUT`].
const RETRY_MAX_DELAY: Duration = Duration::from_secs(30);
68
/// Decide whether an HTTP status code is worth retrying.
///
/// The retryable set is request-timeout (408), rate-limited (429) and the
/// transient 5xx family (500, 502, 503, 504). A plain 500 is included
/// because upstreams (Crossref/Unpaywall) intermittently 500 under load.
/// Every other 4xx is a caller / permanent error and is never retried.
fn is_transient_status(code: u16) -> bool {
    const RETRYABLE: [u16; 6] = [408, 429, 500, 502, 503, 504];
    RETRYABLE.contains(&code)
}
76
77/// A `reqwest::Error` is transient iff it is a connect or timeout
78/// failure or a mid-body transfer error. Redirect-policy aborts
79/// (allowlist denial), builder errors, and decode errors are NOT
80/// transient — retrying them cannot help and would mask a real denial.
81fn reqwest_is_transient(e: &reqwest::Error) -> bool {
82 (e.is_timeout() || e.is_connect() || e.is_body()) && !e.is_redirect()
83}
84
85/// Parse a `Retry-After` header expressed as integer seconds (the
86/// HTTP-date form is accepted by the RFC but rare for these APIs and
87/// deliberately ignored for the MVP — we fall back to exponential
88/// backoff in that case). Capped at [`RETRY_MAX_DELAY`].
89fn parse_retry_after(headers: &reqwest::header::HeaderMap) -> Option<Duration> {
90 let secs: u64 = headers
91 .get(reqwest::header::RETRY_AFTER)?
92 .to_str()
93 .ok()?
94 .trim()
95 .parse()
96 .ok()?;
97 Some(Duration::from_secs(secs).min(RETRY_MAX_DELAY))
98}
99
100/// Exponential backoff with decorrelated jitter. `RETRY_BASE_DELAY *
101/// 2^attempt`, capped at [`RETRY_MAX_DELAY`], plus 0..base jitter so a
102/// fleet of clients does not thunder back in lockstep. Jitter is derived
103/// from the wall-clock subsec nanos rather than pulling in an RNG
104/// dependency — adequate decorrelation for backoff, not a security
105/// primitive.
106fn backoff_delay(attempt: u32) -> Duration {
107 let factor = 1u64 << attempt.min(20);
108 let base_ms = RETRY_BASE_DELAY.as_millis() as u64;
109 let capped_ms = base_ms
110 .saturating_mul(factor)
111 .min(RETRY_MAX_DELAY.as_millis() as u64);
112 let jitter_ms = std::time::SystemTime::now()
113 .duration_since(std::time::UNIX_EPOCH)
114 .map(|d| (d.subsec_nanos() as u64) % base_ms.max(1))
115 .unwrap_or(0);
116 Duration::from_millis(capped_ms.saturating_add(jitter_ms))
117}
118
119// ---------------------------------------------------------------------------
120// SourceAllowlist
121// ---------------------------------------------------------------------------
122
123/// Per-source allowlist entry. Matches the schema in
124/// `docs/REDIRECT_ALLOWLIST.md` §2.
125#[derive(Debug, Clone)]
126#[non_exhaustive]
127pub struct SourceAllowlist {
128 /// Source key. MUST match a `source` value in `docs/SOURCES.md` §1
129 /// (e.g. `crossref`, `unpaywall`, `arxiv`).
130 pub source: String,
131 /// Each pattern is either a literal FQDN or a `*.<suffix>` glob (matches
132 /// the suffix and any subdomain — see `docs/REDIRECT_ALLOWLIST.md` §2.2
133 /// matching rule).
134 pub redirect_hosts: Vec<String>,
135}
136
137impl SourceAllowlist {
138 /// Construct a new allowlist entry.
139 pub fn new(source: impl Into<String>, redirect_hosts: Vec<String>) -> Self {
140 Self {
141 source: source.into(),
142 redirect_hosts,
143 }
144 }
145
146 /// Returns `true` if `host` matches any pattern in this allowlist.
147 ///
148 /// Matching is byte-level on the lowercased ASCII form of the host.
149 /// Callers MUST lowercase upstream; this method also lowercases as a
150 /// defense-in-depth measure but treats the result as ASCII (Punycode
151 /// is the caller's responsibility per `docs/REDIRECT_ALLOWLIST.md`
152 /// §2.2 rule 4).
153 pub fn matches(&self, host: &str) -> bool {
154 let host_lc = host.to_ascii_lowercase();
155 self.redirect_hosts
156 .iter()
157 .any(|pat| host_matches_pattern(&host_lc, pat))
158 }
159}
160
/// Match one `host` (expected to be lowercased already) against one
/// allowlist `pattern`, per `docs/REDIRECT_ALLOWLIST.md` §2.2.
///
/// The pattern is ASCII-lowercased before comparing. Two forms exist:
///
/// - `*.<suffix>` — accepts `<suffix>` itself and anything ending in
///   `.<suffix>`; the dot boundary is required, so `evilexample.org`
///   does not match `*.example.org`.
/// - anything else — accepts only a host identical to the lowercased
///   pattern.
fn host_matches_pattern(host: &str, pattern: &str) -> bool {
    let lowered = pattern.to_ascii_lowercase();
    match lowered.strip_prefix("*.") {
        // Suffix glob: whatever precedes the suffix must be empty (exact
        // match) or end on a label boundary (a subdomain).
        Some(suffix) => host
            .strip_suffix(suffix)
            .is_some_and(|prefix| prefix.is_empty() || prefix.ends_with('.')),
        // Literal FQDN.
        None => host == lowered,
    }
}
173
174/// Hard-coded Phase 1 allowlist for Tier 1 sources. Sourced from
175/// `docs/REDIRECT_ALLOWLIST.md` §3.
176///
177/// Marked `Phase 1; revisit during real fetches` in the spec — entries
178/// flagged `(unverified)` (e.g. arXiv subdomain redirect behavior) MUST be
179/// confirmed or removed before Phase 1 is closed; see §3.3 of the spec.
180pub fn tier_1_allowlist() -> Vec<SourceAllowlist> {
181 vec![
182 // §3.1 crossref
183 SourceAllowlist::new(
184 "crossref",
185 vec!["api.crossref.org".to_string(), "*.crossref.org".to_string()],
186 ),
187 // §3.2 unpaywall
188 SourceAllowlist::new("unpaywall", vec!["api.unpaywall.org".to_string()]),
189 // §3.3 arxiv
190 SourceAllowlist::new(
191 "arxiv",
192 vec![
193 "arxiv.org".to_string(),
194 "export.arxiv.org".to_string(),
195 "*.arxiv.org".to_string(),
196 ],
197 ),
198 ]
199}
200
201/// Hard-coded Phase 4 allowlist for Tier 2 metadata sources (OpenAlex,
202/// Semantic Scholar, DOAJ). Sourced from `docs/SOURCES.md` §1 (the Tier 2
203/// table) and `docs/REDIRECT_ALLOWLIST.md` §3 (same redirect-allowlist
204/// policy as Tier 1, distinct source keys).
205///
206/// Returned hosts:
207///
208/// - `"openalex"` → `api.openalex.org` (production OpenAlex REST API).
209/// - `"semantic_scholar"` → `api.semanticscholar.org` (S2 Graph API base).
210/// - `"doaj"` → `doaj.org` + `*.doaj.org` (DOAJ public API; wildcard
211/// covers `api.doaj.org` and any v4+ subdomain split).
212///
213/// Per `docs/SOURCES.md` §4 "OpenAlex / Semantic Scholar / DOAJ", these
214/// sources are **metadata-only**: their `Source::fetch` impls MUST
215/// return `pdf_bytes: None`. The redirect closure in [`HttpClient`]
216/// uses this list to deny redirects to off-list hosts under each Tier
217/// 2 source key — identical mechanism to Tier 1, but the per-tool
218/// capability gate (`profile.metadata.openalex` etc.) is layered on
219/// top so the network surface remains capability-aware.
220pub fn tier_2_allowlist() -> Vec<SourceAllowlist> {
221 vec![
222 SourceAllowlist::new("openalex", vec!["api.openalex.org".to_string()]),
223 SourceAllowlist::new(
224 "semantic_scholar",
225 vec!["api.semanticscholar.org".to_string()],
226 ),
227 SourceAllowlist::new(
228 "doaj",
229 vec!["doaj.org".to_string(), "*.doaj.org".to_string()],
230 ),
231 ]
232}
233
/// Built-in Phase 5a redirect allowlist for the Springer Nature OA TDM
/// source. Only compiled when the `tdm-springer` Cargo feature is on, so
/// default release binaries never include the host pattern (per ADR-0002
/// and `docs/SOURCES.md` §3).
///
/// Single entry:
/// - `"tdm-springer"` → `api.springernature.com` (production base) +
///   `*.springernature.com` (covers load-balancing subdomains; the
///   redirect closure denies anything outside the wildcard).
///
/// Per `docs/SOURCES.md` §4 "TDM sources (Phase 5)", a fetch under this
/// source key requires ALL THREE gates: the Cargo feature compiled in,
/// the `DOIGET_KEY_SPRINGER` env var present, and
/// `DOIGET_AGREE_TDM_SPRINGER=1`. The `CapabilityProfile` gate enforces
/// the env-var pair; this allowlist is the transport gate.
#[cfg(feature = "tdm-springer")]
pub fn tier_3_springer_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts: Vec<String> = ["api.springernature.com", "*.springernature.com"]
        .iter()
        .map(|pattern| (*pattern).to_owned())
        .collect();
    vec![SourceAllowlist::new("tdm-springer", redirect_hosts)]
}
259
/// Built-in Phase 5b redirect allowlist for the APS Harvest TDM source.
/// Only compiled when the `tdm-aps` Cargo feature is on, so default
/// release binaries never include the host pattern (per ADR-0002 and
/// `docs/SOURCES.md` §3).
///
/// Single entry:
/// - `"tdm-aps"` → `harvest.aps.org` (production base) + `*.aps.org`
///   (covers load-balancing subdomains; the redirect closure denies
///   anything outside the wildcard).
///
/// Three-gate activation: the Cargo feature compiled in, the
/// `DOIGET_KEY_APS` env var present, and `DOIGET_AGREE_TDM_APS=1`. The
/// `CapabilityProfile` gate enforces the env-var pair; this allowlist is
/// the transport gate.
#[cfg(feature = "tdm-aps")]
pub fn tier_3_aps_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts: Vec<String> = ["harvest.aps.org", "*.aps.org"]
        .iter()
        .map(|pattern| (*pattern).to_owned())
        .collect();
    vec![SourceAllowlist::new("tdm-aps", redirect_hosts)]
}
281
/// Built-in Phase 5c redirect allowlist for the Elsevier ScienceDirect
/// TDM source. Only compiled when the `tdm-elsevier` Cargo feature is on,
/// so default release binaries never include the host pattern (per
/// ADR-0002 and `docs/SOURCES.md` §3).
///
/// Single entry:
/// - `"tdm-elsevier"` → `api.elsevier.com` (production base) +
///   `*.elsevier.com` (covers load-balancing subdomains; the redirect
///   closure denies anything outside the wildcard).
///
/// Three-gate activation: the Cargo feature compiled in, the
/// `DOIGET_KEY_ELSEVIER` env var present, and
/// `DOIGET_AGREE_TDM_ELSEVIER=1`. The `CapabilityProfile` gate enforces
/// the env-var pair; this allowlist is the transport gate.
#[cfg(feature = "tdm-elsevier")]
pub fn tier_3_elsevier_allowlist() -> Vec<SourceAllowlist> {
    let redirect_hosts: Vec<String> = ["api.elsevier.com", "*.elsevier.com"]
        .iter()
        .map(|pattern| (*pattern).to_owned())
        .collect();
    vec![SourceAllowlist::new("tdm-elsevier", redirect_hosts)]
}
303
304/// Hard-coded Phase 1 allowlist for the synthetic `"oa-publisher"` source —
305/// the publisher / preprint / repository hosts to which Unpaywall's
306/// `best_oa_location.url` (or `url_for_pdf`) typically resolves.
307///
308/// **Status: informed-best-effort.** Per `docs/REDIRECT_ALLOWLIST.md` §3,
309/// every entry below is a documented OA-publisher host pulled from the
310/// public DOI / OA discovery surface as of this function's authoring; they
311/// are **not** a substitute for empirical validation. Entries marked
312/// `(unverified)` MUST be confirmed by a real fetch or removed before
313/// Phase 1 is closed.
314///
315/// The orchestrator (`doiget-cli::commands::fetch::fetch_doi`) calls
316/// [`HttpClient::fetch_pdf`] under the `"oa-publisher"` source key when
317/// Unpaywall returns an OA URL. If the OA host is not in this list, the
318/// PDF leg is denied (`HttpError::RedirectDenied`) and the orchestrator
319/// falls back to metadata-only success (the `informed-best-effort`
320/// posture from the spec section above).
321pub fn oa_publisher_allowlist() -> Vec<SourceAllowlist> {
322 vec![SourceAllowlist::new(
323 "oa-publisher",
324 vec![
325 // Springer Nature OA imprints. Springer / SpringerOpen / Nature
326 // OA URLs all resolve under one of these registrable suffixes.
327 // (unverified) — confirm by replaying real Unpaywall responses.
328 "*.springer.com".to_string(),
329 "*.springeropen.com".to_string(),
330 "*.springernature.com".to_string(),
331 "*.nature.com".to_string(),
332 // Wiley OA. (unverified)
333 "*.wiley.com".to_string(),
334 // Elsevier OA route only — the TDM gated path is a separate
335 // source (`tdm-elsevier`, Phase 5c) and is not covered here.
336 // (unverified)
337 "*.elsevier.com".to_string(),
338 "*.sciencedirect.com".to_string(),
339 // Frontiers. (unverified)
340 "*.frontiersin.org".to_string(),
341 // MDPI. (unverified)
342 "*.mdpi.com".to_string(),
343 // PLOS. (unverified)
344 "*.plos.org".to_string(),
345 // Preprint servers — biorxiv / medrxiv. (unverified)
346 "*.biorxiv.org".to_string(),
347 "*.medrxiv.org".to_string(),
348 // Europe PMC + NIH PMC. (unverified)
349 "europepmc.org".to_string(),
350 "*.europepmc.org".to_string(),
351 "*.nih.gov".to_string(),
352 "*.ncbi.nlm.nih.gov".to_string(),
353 // Physics-society / diamond-OA hosts. UNLIKE the entries
354 // above, these are EMPIRICALLY VERIFIED: a real `doiget batch`
355 // over 30 OpenAlex-OA finite-temperature-MPS DOIs observed
356 // Unpaywall `best_oa_location` resolving to these hosts and
357 // being denied (#193, REDIRECT_ALLOWLIST.md §3.4, ADR-0027).
358 // APS — journals.aps.org / link.aps.org (green & gold OA;
359 // society host; `*.aps.org` is also trusted under the separate
360 // `tdm-aps` Tier-3 source key WHEN that feature is compiled
361 // in — `tier_3_aps_allowlist` is `#[cfg(feature = "tdm-aps")]`
362 // and absent from default release builds).
363 "*.aps.org".to_string(),
364 // SciPost — diamond OA, community-run physics publisher.
365 "scipost.org".to_string(),
366 "*.scipost.org".to_string(),
367 // IOP Publishing — iopscience.iop.org (New J. Phys. etc.).
368 "*.iop.org".to_string(),
369 // arXiv — already on the `arxiv` tier-1 allowlist, but the
370 // Unpaywall-driven path uses the `oa-publisher` source key,
371 // so we mirror the host list here too. See REDIRECT_ALLOWLIST.md
372 // §3.3 for the underlying entries.
373 "arxiv.org".to_string(),
374 "*.arxiv.org".to_string(),
375 ],
376 )]
377}
378
379// ---------------------------------------------------------------------------
380// HttpError
381// ---------------------------------------------------------------------------
382
/// Errors that can arise during HTTP fetches.
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
/// wildcard arm. The `From<&HttpError> for Option<crate::DenialContext>`
/// impl in this module decides which variants count as ADR-0023 denials.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum HttpError {
    /// Transport / DNS / TLS failure or other `reqwest`-level error. Note
    /// that `reqwest` surfaces a redirect-policy abort (via `Attempt::error`)
    /// as a `reqwest::Error` carrying the source error — callers seeing
    /// `Network` for what they believed was a redirect violation should
    /// inspect the inner error chain.
    #[error("network error: {0}")]
    Network(#[from] reqwest::Error),
    /// Redirect target host did not match any pattern in the source's
    /// `redirect_hosts`. See `docs/REDIRECT_ALLOWLIST.md` §2.2.
    ///
    /// Field naming: `source_key` rather than `source` because `thiserror`
    /// auto-treats a field literally named `source` as a `#[source]` error
    /// chain link (which would require the field to implement `std::error::Error`).
    ///
    /// `expected_hosts` carries a snapshot of the source's allowlist
    /// patterns at the time of the denial — populated for the structured
    /// `denial_context.expected` channel introduced by ADR-0023 §4
    /// (NORMATIVE mapping table). Cloning the patterns into the error
    /// keeps the `From<&HttpError> for Option<DenialContext>` impl from
    /// having to re-look-up the allowlist by `source_key`. May be empty
    /// when the rejection happened before any allowlist was matched
    /// (e.g. URL had no host component at all).
    #[error("redirect target {host} not in allowlist for source {source_key}")]
    RedirectDenied {
        /// Source key whose allowlist rejected the redirect.
        source_key: String,
        /// The lowercased host that was rejected.
        host: String,
        /// Snapshot of the source's `redirect_hosts` at denial time.
        /// Surfaces as `denial_context.expected` (ADR-0023 §4).
        expected_hosts: Vec<String>,
    },
    /// Redirect target had a scheme other than `https`. See
    /// `docs/SECURITY.md` §1.3.
    #[error("redirect to non-HTTPS scheme: {scheme}")]
    InsecureRedirect {
        /// The disallowed scheme (e.g. `http`, `file`, `data`).
        scheme: String,
    },
    /// Body would exceed [`PDF_MAX_BYTES`] either by a `Content-Length`
    /// hint or by accumulated streamed bytes. See `docs/SECURITY.md` §1.2.
    #[error("body too large: {actual} bytes (cap = {cap})")]
    OversizedBody {
        /// Observed size (header value or accumulated bytes).
        actual: u64,
        /// Hard upper bound (always [`PDF_MAX_BYTES`]).
        cap: u64,
    },
    /// PDF magic-byte mismatch — the body does not start with `%PDF-`.
    /// We deliberately do NOT use `Content-Type` (publishers misbehave —
    /// the magic byte is the trustworthy signal per `docs/SECURITY.md`
    /// §1.2 "Magic-byte mismatch" row).
    #[error("PDF magic-byte mismatch: got {got:?}")]
    NotAPdf {
        /// First five bytes of the response body (zero-padded if shorter).
        got: [u8; 5],
    },
    /// Server returned a non-2xx status.
    ///
    /// For the retryable status set (see `is_transient_status`) the fetch
    /// loop retries first; this variant is returned once the status is not
    /// retryable or the retry budget ([`MAX_FETCH_RETRIES`]) is used up.
    #[error("HTTP {status} from {url}")]
    HttpStatus {
        /// HTTP status code.
        status: u16,
        /// The URL that produced the status.
        url: String,
    },
    /// No allowlist entry exists for this source. The caller asked
    /// [`HttpClient`] to fetch on behalf of a source that wasn't passed to
    /// [`HttpClient::new`].
    ///
    /// See note on `RedirectDenied` for why the field is `source_key`.
    #[error("no allowlist registered for source {source_key}")]
    UnknownSource {
        /// The unregistered source key.
        source_key: String,
    },
    /// A header name or value passed to
    /// [`HttpClient::fetch_bytes_with_headers`] was not a valid HTTP
    /// header. The header parser only accepts the visible-ASCII subset
    /// per RFC 7230 §3.2; control characters and non-ASCII bytes are
    /// rejected before the request is even built. Surfaces as
    /// `ErrorCode::InternalError` at the public boundary (callers
    /// supplying bad headers are responsible for fixing the call site;
    /// not a denial in the ADR-0023 sense).
    #[error("invalid HTTP header `{name}`: {reason}")]
    InvalidHeader {
        /// The header name as supplied by the caller.
        name: String,
        /// `"name"` or `"value"` — which side failed parsing.
        reason: String,
    },
}
478
479// ---------------------------------------------------------------------------
480// HttpError -> Option<DenialContext> (ADR-0023 §4 mapping table)
481// ---------------------------------------------------------------------------
482
483/// Map an [`HttpError`] reference to the structured [`crate::DenialContext`]
484/// channel introduced by ADR-0023.
485///
486/// Returns `Some(_)` for the four denial classes named in ADR-0023 §4
487/// (`RedirectDenied`, `OversizedBody`, `NotAPdf`, `InsecureRedirect`) and
488/// `None` for every other variant — `Network`, `HttpStatus`,
489/// `UnknownSource` are not denials in the ADR-0023 sense (they are
490/// transport / upstream / programming-error signals, not allowlist or
491/// cap rejections).
492///
493/// The `&HttpError` borrow form is used (rather than `HttpError`) so the
494/// caller — typically the orchestrator that already needs the original
495/// error for `error.message` and the `From<HttpError> for ErrorCode`
496/// collapse — does not have to clone the error to produce the optional
497/// structured side-channel.
498impl From<&HttpError> for Option<crate::DenialContext> {
499 fn from(e: &HttpError) -> Self {
500 use crate::{DenialContext, DenialReason};
501 match e {
502 HttpError::RedirectDenied {
503 source_key,
504 host,
505 expected_hosts,
506 } => Some(DenialContext {
507 reason: DenialReason::RedirectNotInAllowlist,
508 source: Some(source_key.clone()),
509 attempted: Some(host.clone()),
510 expected: Some(expected_hosts.clone()),
511 hop_index: None,
512 cap: None,
513 actual: None,
514 }),
515 HttpError::OversizedBody { actual, cap } => Some(DenialContext {
516 reason: DenialReason::SizeCapExceeded,
517 source: None,
518 attempted: None,
519 // The size-cap reason has no allowlist channel; use
520 // `None` to signal "field not populated by producer"
521 // rather than `Some(vec![])` (which would mean "explicit
522 // empty allowlist"). See `DenialContext::expected` docs.
523 expected: None,
524 hop_index: None,
525 cap: Some(*cap),
526 actual: Some(*actual),
527 }),
528 HttpError::NotAPdf { got } => Some(DenialContext {
529 reason: DenialReason::ContentTypeMismatch,
530 source: None,
531 // ADR-0023 §4 mapping table: hex-encode the first 5 bytes
532 // for the `attempted` field. `format!("{:02x}...")` is
533 // chosen over `hex::encode` to avoid pulling the
534 // additional dep into this conversion path; the result is
535 // bit-identical (lowercase, zero-padded).
536 attempted: Some(format!(
537 "{:02x}{:02x}{:02x}{:02x}{:02x}",
538 got[0], got[1], got[2], got[3], got[4]
539 )),
540 expected: Some(vec!["%PDF-".to_string()]),
541 hop_index: None,
542 cap: None,
543 actual: None,
544 }),
545 HttpError::InsecureRedirect { scheme } => Some(DenialContext {
546 reason: DenialReason::InsecureScheme,
547 source: None,
548 attempted: Some(format!("{}:...", scheme)),
549 expected: Some(vec!["https".to_string()]),
550 hop_index: None,
551 cap: None,
552 actual: None,
553 }),
554 // `reqwest` wraps a custom error returned by the redirect
555 // policy closure (`attempt.error(HttpError::RedirectDenied{..})`
556 // / `attempt.error(HttpError::InsecureRedirect{..})`) inside a
557 // `reqwest::Error`, which surfaces here as `HttpError::Network`.
558 // Without source-chain walking, production redirect denials —
559 // the most operationally important denial class — would never
560 // produce a `DenialContext`, defeating the whole point of
561 // ADR-0023.
562 //
563 // Walk the `std::error::Error::source()` chain on the inner
564 // `reqwest::Error` and downcast each link to `&HttpError`. If
565 // a wrapped `HttpError` is found, recurse via this same `From`
566 // impl. Otherwise the network error is a "real" transport /
567 // DNS / TLS failure with no denial semantics — return `None`.
568 //
569 // `std::error::Error::source(e)` is fully-qualified to
570 // disambiguate against the inherent (and unrelated)
571 // `reqwest::Error::source()`.
572 HttpError::Network(e) => {
573 let mut source: Option<&(dyn std::error::Error + 'static)> =
574 std::error::Error::source(e);
575 while let Some(s) = source {
576 if let Some(http_err) = s.downcast_ref::<HttpError>() {
577 return Option::<crate::DenialContext>::from(http_err);
578 }
579 source = s.source();
580 }
581 None
582 }
583 // The remaining variants are not "denials" in the ADR-0023
584 // sense — HttpStatus/UnknownSource are upstream / programming-
585 // error signals; InvalidHeader is a caller-bug signal.
586 HttpError::HttpStatus { .. }
587 | HttpError::UnknownSource { .. }
588 | HttpError::InvalidHeader { .. } => None,
589 }
590 }
591}
592
593// ---------------------------------------------------------------------------
594// HttpClient
595// ---------------------------------------------------------------------------
596
/// Workspace-wide HTTP client with the security defaults applied.
///
/// Internally holds one `reqwest::Client` per source. Construct via
/// [`HttpClient::new`] with the full set of allowlists the calling process
/// will need.
///
/// Both fields sit behind an `Arc`, so a clone shares the same per-source
/// maps instead of copying them.
#[derive(Clone, Debug)]
pub struct HttpClient {
    /// One [`reqwest::Client`] per source. Each client carries a redirect
    /// policy that captures only that source's allowlist. `Arc` so cloning
    /// is cheap.
    clients: Arc<HashMap<String, Client>>,
    /// The exact [`SourceAllowlist`] each per-source client was built from,
    /// keyed by source. The redirect closure inside each `reqwest::Client`
    /// captures its allowlist *by move*, so it cannot be read back from the
    /// client itself. This map keeps the identical `SourceAllowlist`
    /// available to callers that must perform a *pre-fetch* host check on a
    /// metadata-discovered URL (issue #145 / `docs/REDIRECT_ALLOWLIST.md`
    /// §1: the allowlist is consulted "on the OA URL discovered through
    /// metadata sources before the actual PDF fetch is issued", not only on
    /// redirect hops). Storing the same value here — rather than re-deriving
    /// it from [`oa_publisher_allowlist`] at the call site — guarantees the
    /// pre-check and the redirect closure can never drift, and that the
    /// check works under the test constructors too (which register a
    /// wiremock host as the allowlist).
    allowlists: Arc<HashMap<String, SourceAllowlist>>,
}
623
624impl HttpClient {
625 /// Build a client with rustls + redirect-allowlist + size cap +
626 /// timeouts.
627 ///
628 /// `allowlists` MUST cover every source whose URL might be passed in;
629 /// fetches against unregistered sources return
630 /// [`HttpError::UnknownSource`].
631 ///
632 /// # Errors
633 ///
634 /// Returns the underlying `reqwest::Error` if `ClientBuilder::build`
635 /// fails (typically a TLS-backend init failure).
636 pub fn new(allowlists: Vec<SourceAllowlist>) -> Result<Self, reqwest::Error> {
637 let mut clients = HashMap::with_capacity(allowlists.len());
638 let mut allowlist_map = HashMap::with_capacity(allowlists.len());
639 for entry in allowlists {
640 let source = entry.source.clone();
641 // Keep the *same* allowlist value both inside the redirect
642 // closure (via `build_client`) and queryable on the client
643 // (issue #145 pre-fetch check). `build_client` takes the
644 // allowlist by value, so clone once for the side table first.
645 allowlist_map.insert(source.clone(), entry.clone());
646 let client = build_client(entry)?;
647 clients.insert(source, client);
648 }
649 Ok(Self {
650 clients: Arc::new(clients),
651 allowlists: Arc::new(allowlist_map),
652 })
653 }
654
655 /// The [`SourceAllowlist`] this client was built with for `source`, or
656 /// `None` if `source` was not registered.
657 ///
658 /// This is the *identical* value captured by the per-source redirect
659 /// closure (see [`HttpClient`]'s `allowlists` field doc). It exists so
660 /// the orchestrator can apply the `docs/REDIRECT_ALLOWLIST.md` §1
661 /// pre-fetch host check on a metadata-discovered OA URL — the URL that
662 /// is fetched *without* necessarily passing through a redirect hop —
663 /// using the same source of truth the redirect closure uses, so the two
664 /// can never disagree. Callers MUST use this for the `"oa-publisher"`
665 /// leg only; the initial template-constructed URL is exempt per
666 /// `docs/REDIRECT_ALLOWLIST.md` §6.
667 pub fn source_allowlist(&self, source: &str) -> Option<&SourceAllowlist> {
668 self.allowlists.get(source)
669 }
670
671 /// Fetch a URL, treating it as a JSON or text body. Caps at
672 /// [`PDF_MAX_BYTES`].
673 ///
674 /// Returns the response body bytes plus the effective final URL after
675 /// redirects (post-allowlist verification — every hop has already been
676 /// validated by the time this returns).
677 ///
678 /// # Errors
679 ///
680 /// Any [`HttpError`] variant.
681 pub async fn fetch_bytes(&self, source: &str, url: Url) -> Result<(Bytes, Url), HttpError> {
682 self.fetch_inner(source, url, &[], false).await
683 }
684
685 /// Like [`Self::fetch_bytes`] but attaches additional request
686 /// headers to the outgoing GET. The headers are validated up-front
687 /// against the visible-ASCII subset (RFC 7230 §3.2); any failure
688 /// returns [`HttpError::InvalidHeader`] before the request is sent.
689 ///
690 /// Used by Tier-3 TDM sources that authenticate via a header
691 /// (APS Harvest `X-API-Key`, Elsevier ScienceDirect `X-ELS-APIKey`).
692 /// Header values appear on the wire only — they are never logged.
693 ///
694 /// # Errors
695 ///
696 /// Any [`HttpError`] variant including [`HttpError::InvalidHeader`].
697 pub async fn fetch_bytes_with_headers(
698 &self,
699 source: &str,
700 url: Url,
701 headers: &[(&str, &str)],
702 ) -> Result<(Bytes, Url), HttpError> {
703 self.fetch_inner(source, url, headers, false).await
704 }
705
706 /// Fetch a URL expected to be a PDF. Same as [`Self::fetch_bytes`] plus
707 /// the magic-byte check on the first 5 bytes
708 /// (`%PDF-` = `[0x25, 0x50, 0x44, 0x46, 0x2D]`). Mismatch returns
709 /// [`HttpError::NotAPdf`].
710 ///
711 /// # Errors
712 ///
713 /// Any [`HttpError`] variant including [`HttpError::NotAPdf`].
714 pub async fn fetch_pdf(&self, source: &str, url: Url) -> Result<(Bytes, Url), HttpError> {
715 self.fetch_inner(source, url, &[], true).await
716 }
717
718 async fn fetch_inner(
719 &self,
720 source: &str,
721 url: Url,
722 headers: &[(&str, &str)],
723 check_pdf_magic: bool,
724 ) -> Result<(Bytes, Url), HttpError> {
725 // Normalise legacy `http://` URLs returned by OpenAlex /
726 // Unpaywall metadata before send. See `upgrade_http_to_https`
727 // for the rationale (TLS posture preserved per ADR-0020) and
728 // the loopback carve-out.
729 let url = upgrade_http_to_https(url);
730
731 let client = self
732 .clients
733 .get(source)
734 .ok_or_else(|| HttpError::UnknownSource {
735 source_key: source.to_string(),
736 })?;
737
738 // Parse headers up-front so an invalid name/value fails BEFORE
739 // we touch the network. `HeaderName::from_bytes` / `HeaderValue::from_str`
740 // accept the visible-ASCII subset only (RFC 7230 §3.2).
741 let mut header_map = reqwest::header::HeaderMap::with_capacity(headers.len());
742 for (name, value) in headers {
743 let hn = reqwest::header::HeaderName::from_bytes(name.as_bytes()).map_err(|_| {
744 HttpError::InvalidHeader {
745 name: (*name).to_string(),
746 reason: "name".to_string(),
747 }
748 })?;
749 let hv = reqwest::header::HeaderValue::from_str(value).map_err(|_| {
750 HttpError::InvalidHeader {
751 name: (*name).to_string(),
752 reason: "value".to_string(),
753 }
754 })?;
755 header_map.insert(hn, hv);
756 }
757
758 // Bounded retry loop (issue #117). Only transient classes are
759 // retried — connect/timeout/mid-stream network errors and the
760 // transient HTTP status set. Allowlist denials, NotAPdf,
761 // OversizedBody, 4xx (non-408/429) are deterministic and return
762 // on the first occurrence. GET is idempotent so a retried
763 // attempt re-streams the body from scratch.
764 let mut attempt: u32 = 0;
765 loop {
766 let send_result = client
767 .get(url.clone())
768 .headers(header_map.clone())
769 .send()
770 .await;
771 let response = match send_result {
772 Ok(r) => r,
773 Err(e) => {
774 if attempt < MAX_FETCH_RETRIES && reqwest_is_transient(&e) {
775 let d = backoff_delay(attempt);
776 tracing::warn!(
777 source,
778 attempt,
779 delay_ms = d.as_millis() as u64,
780 error = %e,
781 "transient send failure; retrying"
782 );
783 tokio::time::sleep(d).await;
784 attempt += 1;
785 continue;
786 }
787 return Err(HttpError::Network(e));
788 }
789 };
790 let final_url = response.url().clone();
791
792 // Status check before body read so we can fail fast.
793 let status = response.status();
794 if !status.is_success() {
795 let code = status.as_u16();
796 if attempt < MAX_FETCH_RETRIES && is_transient_status(code) {
797 // Prefer the server's `Retry-After` over our backoff
798 // when present (429/503 commonly carry it).
799 let d = parse_retry_after(response.headers())
800 .unwrap_or_else(|| backoff_delay(attempt));
801 tracing::warn!(
802 source,
803 attempt,
804 status = code,
805 delay_ms = d.as_millis() as u64,
806 "transient HTTP status; retrying"
807 );
808 tokio::time::sleep(d).await;
809 attempt += 1;
810 continue;
811 }
812 return Err(HttpError::HttpStatus {
813 status: code,
814 // Issue #146: Springer Nature authenticates via an
815 // `api_key` URL query parameter (no header path
816 // upstream). This error string is logged and may
817 // surface to the user, so strip any `api_key`
818 // value before it leaves the client. No other
819 // source puts a secret in the query string, so
820 // this is a no-op for them.
821 url: redact_api_key_query(&final_url),
822 });
823 }
824
825 // Content-Length fast-path: if header is present and exceeds
826 // the cap, fail without reading any body (deterministic — not
827 // retried). Per `docs/SECURITY.md` §1.2.
828 if let Some(len) = response.content_length() {
829 if len > PDF_MAX_BYTES {
830 return Err(HttpError::OversizedBody {
831 actual: len,
832 cap: PDF_MAX_BYTES,
833 });
834 }
835 }
836
837 // Stream body and enforce the cap as bytes accumulate. A
838 // mid-stream transport error is transient (retry); an
839 // oversized body is deterministic (return).
840 let mut buf = BytesMut::new();
841 let mut stream = response.bytes_stream();
842 let mut oversized_at: Option<u64> = None;
843 let mut stream_err: Option<reqwest::Error> = None;
844 while let Some(chunk) = stream.next().await {
845 let chunk = match chunk {
846 Ok(c) => c,
847 Err(e) => {
848 stream_err = Some(e);
849 break;
850 }
851 };
852 let projected = (buf.len() as u64).saturating_add(chunk.len() as u64);
853 if projected > PDF_MAX_BYTES {
854 oversized_at = Some(projected);
855 break;
856 }
857 buf.extend_from_slice(&chunk);
858 }
859 if let Some(actual) = oversized_at {
860 return Err(HttpError::OversizedBody {
861 actual,
862 cap: PDF_MAX_BYTES,
863 });
864 }
865 if let Some(e) = stream_err {
866 if attempt < MAX_FETCH_RETRIES && reqwest_is_transient(&e) {
867 let d = backoff_delay(attempt);
868 tracing::warn!(
869 source,
870 attempt,
871 delay_ms = d.as_millis() as u64,
872 error = %e,
873 "transient mid-stream failure; retrying"
874 );
875 tokio::time::sleep(d).await;
876 attempt += 1;
877 continue;
878 }
879 return Err(HttpError::Network(e));
880 }
881 let body = buf.freeze();
882
883 if check_pdf_magic {
884 let mut got = [0u8; 5];
885 let n = body.len().min(5);
886 got[..n].copy_from_slice(&body[..n]);
887 if got != PDF_MAGIC {
888 return Err(HttpError::NotAPdf { got });
889 }
890 }
891
892 return Ok((body, final_url));
893 }
894 }
895}
896
897/// Return `url` rendered as a string with the value of any `api_key`
898/// query parameter replaced by `REDACTED` (issue #146).
899///
900/// Springer Nature's TDM API authenticates **only** via an `api_key`
901/// query parameter — there is no header-auth path upstream — so the key
902/// is unavoidably in the request URL. This keeps it out of *our* log
903/// and error sinks (the `HttpError::HttpStatus` string in particular,
904/// which is `tracing`-logged and can surface to the user). It is a
905/// structural no-op for every other source, none of which carry a
906/// secret in the query string. Other pairs and their order are
907/// preserved; a URL with no `api_key` pair is rendered unchanged.
908fn redact_api_key_query(url: &url::Url) -> String {
909 const API_KEY_PARAM: &str = "api_key";
910 if url.query_pairs().all(|(k, _)| k != API_KEY_PARAM) {
911 return url.to_string();
912 }
913 let mut redacted = url.clone();
914 let pairs: Vec<(String, String)> = url
915 .query_pairs()
916 .map(|(k, v)| {
917 if k == API_KEY_PARAM {
918 (k.into_owned(), "REDACTED".to_string())
919 } else {
920 (k.into_owned(), v.into_owned())
921 }
922 })
923 .collect();
924 redacted.query_pairs_mut().clear().extend_pairs(pairs);
925 redacted.to_string()
926}
927
928/// Test-oriented [`HttpClient`] constructor. Originally `cfg(test)`; now
929/// also reachable from the `doiget-cli` orchestrator's integration tests
930/// (which live outside this crate and therefore cannot see `cfg(test)`-gated
931/// items). The constructor name retains its `for_tests_allow_http` signal —
932/// production code MUST use [`HttpClient::new`] with [`tier_1_allowlist`].
933#[allow(clippy::expect_used)]
934impl HttpClient {
935 /// Build a test-oriented `HttpClient` against an `http://` wiremock
936 /// origin. The redirect closure still rejects insecure schemes — we only
937 /// relax `https_only` at the connection level so wiremock can serve.
938 /// This is acceptable because the redirect closure (which is the
939 /// security-load-bearing path) is exercised by the
940 /// `redirect_to_http_is_rejected_by_closure` test below.
941 ///
942 /// Production callers MUST use [`HttpClient::new`] with
943 /// [`tier_1_allowlist`] — the `for_tests_allow_http` suffix is the load-
944 /// bearing signal that this constructor lifts the initial-leg HTTPS-only
945 /// requirement.
946 pub fn new_for_tests_allow_http(source: &str, allowlist_host: &str) -> Self {
947 let allowlist = SourceAllowlist::new(source, vec![allowlist_host.to_string()]);
948 let client = build_client_allow_http(allowlist.clone()).expect("test client builds");
949 let mut map = HashMap::new();
950 let mut allowlist_map = HashMap::new();
951 allowlist_map.insert(allowlist.source.clone(), allowlist.clone());
952 map.insert(allowlist.source.clone(), client);
953 Self {
954 clients: Arc::new(map),
955 allowlists: Arc::new(allowlist_map),
956 }
957 }
958
959 /// Multi-source variant of [`HttpClient::new_for_tests_allow_http`].
960 ///
961 /// Builds a relaxed-`https_only` client per `(source, allowlist_host)`
962 /// pair. Used by the `doiget-cli` orchestrator's integration tests when
963 /// more than one upstream needs to be wiremocked simultaneously
964 /// (e.g. Crossref + Unpaywall against two different mock servers).
965 /// Production callers MUST use [`HttpClient::new`] with
966 /// [`tier_1_allowlist`].
967 pub fn new_for_tests_allow_http_multi(entries: &[(&str, &str)]) -> Self {
968 let mut map = HashMap::with_capacity(entries.len());
969 let mut allowlist_map = HashMap::with_capacity(entries.len());
970 for (source, host) in entries {
971 let allowlist = SourceAllowlist::new(*source, vec![host.to_string()]);
972 let client = build_client_allow_http(allowlist.clone()).expect("test client builds");
973 allowlist_map.insert(allowlist.source.clone(), allowlist.clone());
974 map.insert(allowlist.source.clone(), client);
975 }
976 Self {
977 clients: Arc::new(map),
978 allowlists: Arc::new(allowlist_map),
979 }
980 }
981}
982
983fn build_client_allow_http(allowlist: SourceAllowlist) -> Result<Client, reqwest::Error> {
984 ensure_crypto_provider();
985 let allowlist_for_closure = allowlist.clone();
986 let redirect_policy = Policy::custom(move |attempt| {
987 let scheme = attempt.url().scheme().to_string();
988 let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
989 let prev_count = attempt.previous().len();
990 if scheme != "https" {
991 return attempt.error(HttpError::InsecureRedirect { scheme });
992 }
993 if prev_count >= MAX_REDIRECTS {
994 return attempt.stop();
995 }
996 let host = match host_opt {
997 Some(h) => h,
998 None => {
999 return attempt.error(HttpError::RedirectDenied {
1000 source_key: allowlist_for_closure.source.clone(),
1001 host: String::new(),
1002 expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1003 });
1004 }
1005 };
1006 if !allowlist_for_closure.matches(&host) {
1007 return attempt.error(HttpError::RedirectDenied {
1008 source_key: allowlist_for_closure.source.clone(),
1009 host,
1010 expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1011 });
1012 }
1013 attempt.follow()
1014 });
1015 ClientBuilder::new()
1016 // `https_only(false)` only at this scope — production builders
1017 // (the public `HttpClient::new`) keep it on.
1018 .https_only(false)
1019 .redirect(redirect_policy)
1020 .connect_timeout(CONNECT_TIMEOUT)
1021 .timeout(TOTAL_TIMEOUT)
1022 .read_timeout(READ_TIMEOUT)
1023 .user_agent(format!(
1024 "doiget/{} (+https://github.com/sotashimozono/doiget)",
1025 VERSION
1026 ))
1027 .tls_backend_rustls()
1028 .build()
1029}
1030
1031// ---------------------------------------------------------------------------
1032// ClientBuilder helpers
1033// ---------------------------------------------------------------------------
1034
1035/// Install the `ring` `rustls` crypto provider as the process default,
1036/// exactly once.
1037///
1038/// reqwest is built with the `rustls-no-provider` feature (ADR-0020
1039/// Amendment 1: drop aws-lc-rs so `cargo install` needs no cmake/C
1040/// toolchain and musl-static builds cleanly). With no bundled provider,
1041/// `reqwest::ClientBuilder::build` calls
1042/// `rustls::crypto::CryptoProvider::get_default()` and **panics**
1043/// (`"No provider set"`) unless a process-default provider was installed
1044/// first. Every client constructor below calls this; the `Once` makes it
1045/// safe to invoke from many sites and from concurrent tests.
1046fn ensure_crypto_provider() {
1047 static INIT: Once = Once::new();
1048 INIT.call_once(|| {
1049 // `install_default` errors only if a provider is already set;
1050 // under `Once` that is unreachable, but ignore it rather than
1051 // panic (another linked crate could have installed one first).
1052 let _ = rustls::crypto::ring::default_provider().install_default();
1053 });
1054}
1055
1056/// Upgrade an `http://` URL to `https://` for legacy publisher
1057/// metadata. Loopback hosts (`localhost`, any RFC 6761 `.localhost`
1058/// TLD subdomain, `127.0.0.0/8`, `::1`, IPv4-mapped IPv6 loopback)
1059/// are returned unchanged so the `new_for_tests_allow_http*` wiremock
1060/// path continues to talk plain HTTP to the local fixture server.
1061///
1062/// Non-`http` schemes (`https`, `file`, anything else) and cannot-be-
1063/// base URLs are returned unchanged. The function is total: it never
1064/// panics and never returns an error.
1065///
1066/// # Audit / posture
1067///
1068/// On a successful upgrade the function emits a `tracing::info!` event
1069/// so the rewrite appears in the operator's default-level structured
1070/// log. On the (in-practice unreachable) `set_scheme` failure path a
1071/// `tracing::warn!` event is emitted before returning the original
1072/// URL; the production client's `https_only(true)` then rejects the
1073/// send with a clear network error, preserving the TLS posture
1074/// established by ADR-0020.
1075///
1076/// # `Domain("localhost")` arm subtlety
1077///
1078/// The url crate resolves the bare host `localhost` to `127.0.0.1`
1079/// (Ipv4 variant) when parsing an `http://` URL, so the `Domain` arm
1080/// does NOT fire for that case (the `Ipv4` arm catches it). The arm
1081/// IS load-bearing for the RFC 6761 `.localhost` TLD (e.g.
1082/// `myservice.localhost`, `api.localhost`), which the url crate does
1083/// NOT auto-resolve to an IP and keeps as `Host::Domain`.
1084fn upgrade_http_to_https(url: Url) -> Url {
1085 if url.scheme() != "http" {
1086 return url;
1087 }
1088 match url.host() {
1089 None => {
1090 // Cannot-be-base URL (e.g. `http:foo`) — `set_scheme`
1091 // would reject the conversion.
1092 return url;
1093 }
1094 Some(url::Host::Domain(d)) if is_localhost_domain(d) => return url,
1095 Some(url::Host::Ipv4(ip)) if ip.is_loopback() => return url,
1096 Some(url::Host::Ipv6(ip)) if is_ipv6_loopback(ip) => return url,
1097 Some(_) => {}
1098 }
1099 let mut upgraded = url.clone();
1100 if upgraded.set_scheme("https").is_err() {
1101 // url-crate `set_scheme` is documented to fail only for
1102 // cannot-be-base URLs and a few cross-family transitions;
1103 // `http -> https` is supported because both are "special"
1104 // schemes. The fallback below is defence-in-depth.
1105 tracing::warn!(
1106 url = %url,
1107 "set_scheme(http -> https) failed unexpectedly; \
1108 sending original URL — https_only(true) will reject",
1109 );
1110 return url;
1111 }
1112 tracing::info!(
1113 original = %url,
1114 upgraded = %upgraded,
1115 "upgraded http -> https for legacy publisher metadata"
1116 );
1117 upgraded
1118}
1119
/// `true` for the `localhost` literal and any RFC 6761 `.localhost`
/// TLD subdomain (`myservice.localhost`, `api.localhost`, etc.).
///
/// Matching is ASCII-case-insensitive. A single trailing root dot (the
/// fully-qualified spelling, e.g. `localhost.` or `api.localhost.`) is
/// ignored, so both spellings of the same name are classified the same
/// way. A name with an empty label in front of the suffix (`.localhost`)
/// is rejected, as is the empty string.
fn is_localhost_domain(d: &str) -> bool {
    const SUFFIX: &[u8] = b".localhost";

    // `name.` and `name` denote the same host; compare without the root dot.
    let name = d.strip_suffix('.').unwrap_or(d);
    if name.eq_ignore_ascii_case("localhost") {
        return true;
    }

    // Compare on bytes so a non-ASCII input can never trigger a
    // char-boundary panic. The strict `>` demands at least one byte
    // before the `.localhost` suffix.
    let bytes = name.as_bytes();
    bytes.len() > SUFFIX.len()
        && bytes[bytes.len() - SUFFIX.len()..].eq_ignore_ascii_case(SUFFIX)
}
1136
/// Reports whether `ip` is an IPv6 loopback address: either `::1` or an
/// IPv4-mapped address whose embedded IPv4 part is loopback
/// (`::ffff:127.0.0.0/8`).
///
/// `Ipv6Addr::is_loopback()` on its own recognises only `::1`, so
/// without the mapped-address branch a dual-stack caller using
/// `[::ffff:127.0.0.1]` would be silently upgraded.
fn is_ipv6_loopback(ip: std::net::Ipv6Addr) -> bool {
    match ip.to_ipv4_mapped() {
        // IPv4-mapped form: the verdict comes from the embedded IPv4 address.
        Some(embedded_v4) => embedded_v4.is_loopback(),
        // Native IPv6 form: only `::1` is loopback.
        None => ip.is_loopback(),
    }
}
1147
1148fn build_client(allowlist: SourceAllowlist) -> Result<Client, reqwest::Error> {
1149 ensure_crypto_provider();
1150
1151 let user_agent = format!(
1152 "doiget/{} (+https://github.com/sotashimozono/doiget)",
1153 VERSION
1154 );
1155
1156 // Redirect policy: capture the per-source allowlist by value. The
1157 // closure is called for every redirect hop — there is no global
1158 // fallback, every hop is checked. Hard cap at MAX_REDIRECTS via the
1159 // attempt counter (mirrors reqwest's built-in limit).
1160 let allowlist_for_closure = allowlist.clone();
1161 let redirect_policy = Policy::custom(move |attempt| {
1162 // Inspect the candidate URL via owned copies so we can move
1163 // `attempt` into `error()` / `follow()` / `stop()` later without
1164 // the borrow checker complaining about an outstanding borrow of
1165 // `attempt`.
1166 let scheme = attempt.url().scheme().to_string();
1167 let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1168 let prev_count = attempt.previous().len();
1169
1170 // 1. Reject non-HTTPS up front. The `https_only(true)` builder
1171 // flag below also catches this, but we want the dedicated
1172 // `InsecureRedirect` error path (not a generic `https_only`
1173 // abort) — see `docs/SECURITY.md` §1.3.
1174 if scheme != "https" {
1175 return attempt.error(HttpError::InsecureRedirect { scheme });
1176 }
1177
1178 // 2. Hop limit (`docs/SECURITY.md` §1.3 redirect_limit row).
1179 if prev_count >= MAX_REDIRECTS {
1180 return attempt.stop();
1181 }
1182
1183 // 3. Allowlist check on the candidate target host.
1184 // `host_str()` is `None` for URLs without a host (e.g. data
1185 // URIs); treat that as an allowlist miss.
1186 let host = match host_opt {
1187 Some(h) => h,
1188 None => {
1189 return attempt.error(HttpError::RedirectDenied {
1190 source_key: allowlist_for_closure.source.clone(),
1191 host: String::new(),
1192 expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1193 });
1194 }
1195 };
1196 if !allowlist_for_closure.matches(&host) {
1197 return attempt.error(HttpError::RedirectDenied {
1198 source_key: allowlist_for_closure.source.clone(),
1199 host,
1200 expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1201 });
1202 }
1203
1204 attempt.follow()
1205 });
1206
1207 ClientBuilder::new()
1208 .https_only(true)
1209 .redirect(redirect_policy)
1210 .connect_timeout(CONNECT_TIMEOUT)
1211 .timeout(TOTAL_TIMEOUT)
1212 .read_timeout(READ_TIMEOUT)
1213 .user_agent(user_agent)
1214 // `tls_backend_rustls()` is the non-deprecated equivalent of the
1215 // older `use_rustls_tls()`. The workspace pins reqwest with
1216 // `rustls-no-provider` (ADR-0020 Amendment 1), so this is a
1217 // re-assertion at builder level rather than a feature switch; the
1218 // `ring` provider installed by `ensure_crypto_provider()` above
1219 // is what reqwest picks up via `CryptoProvider::get_default()`.
1220 .tls_backend_rustls()
1221 .build()
1222}
1223
1224// ---------------------------------------------------------------------------
1225// Tests
1226// ---------------------------------------------------------------------------
1227
1228#[cfg(test)]
1229#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1230mod tests {
1231 use super::*;
1232 use wiremock::matchers::{method, path};
1233 use wiremock::{Mock, MockServer, ResponseTemplate};
1234
1235 // ---------------------------------------------------------------
1236 // http -> https scheme upgrade (#220) — pure unit tests, no network.
1237 // ---------------------------------------------------------------
1238
1239 #[test]
1240 fn upgrade_http_to_https_rewrites_public_http_url() {
1241 let input = Url::parse("http://link.aps.org/pdf/10.1103/PhysRev.123.456").unwrap();
1242 let out = upgrade_http_to_https(input.clone());
1243 assert_eq!(out.scheme(), "https");
1244 assert_eq!(out.host_str(), Some("link.aps.org"));
1245 assert_eq!(out.path(), "/pdf/10.1103/PhysRev.123.456");
1246 }
1247
1248 #[test]
1249 fn upgrade_http_to_https_preserves_port_path_query_fragment() {
1250 let input = Url::parse("http://example.org:8080/a/b?q=1#frag").unwrap();
1251 let out = upgrade_http_to_https(input);
1252 assert_eq!(out.as_str(), "https://example.org:8080/a/b?q=1#frag");
1253 }
1254
1255 #[test]
1256 fn upgrade_http_to_https_is_idempotent_on_https() {
1257 let input = Url::parse("https://api.crossref.org/works/10.1234/foo").unwrap();
1258 let out = upgrade_http_to_https(input.clone());
1259 assert_eq!(out, input);
1260 }
1261
1262 #[test]
1263 fn upgrade_http_to_https_skips_localhost() {
1264 // wiremock binds to `127.0.0.1:PORT`; the loopback exception
1265 // is the load-bearing rule that keeps `new_for_tests_allow_http*`
1266 // working alongside the production fetch path.
1267 let input = Url::parse("http://localhost:7878/pdf").unwrap();
1268 let out = upgrade_http_to_https(input.clone());
1269 assert_eq!(out, input, "localhost MUST NOT be upgraded");
1270 }
1271
1272 #[test]
1273 fn upgrade_http_to_https_skips_127_loopback_block() {
1274 for host in ["127.0.0.1", "127.0.0.42", "127.255.255.254"] {
1275 let raw = format!("http://{host}:1234/x");
1276 let input = Url::parse(&raw).unwrap();
1277 let out = upgrade_http_to_https(input.clone());
1278 assert_eq!(out, input, "host `{host}` MUST NOT be upgraded");
1279 }
1280 }
1281
1282 #[test]
1283 fn upgrade_http_to_https_skips_ipv6_loopback() {
1284 let input = Url::parse("http://[::1]:9000/path").unwrap();
1285 let out = upgrade_http_to_https(input.clone());
1286 assert_eq!(out, input, "IPv6 loopback MUST NOT be upgraded");
1287 }
1288
1289 #[test]
1290 fn upgrade_http_to_https_preserves_case_in_path() {
1291 // Some publishers (e.g. APS legacy redirects) use mixed-case
1292 // path segments; upgrade must NOT lowercase or canonicalise.
1293 let input = Url::parse("http://link.aps.org/PDF/10.1103/PhysRevB.109.045136").unwrap();
1294 let out = upgrade_http_to_https(input);
1295 assert_eq!(out.path(), "/PDF/10.1103/PhysRevB.109.045136");
1296 }
1297
1298 // ---- Review-pass extensions ------------------------------------
1299
1300 #[test]
1301 fn upgrade_http_to_https_skips_dot_localhost_tld() {
1302 // RFC 6761 reserves the entire `.localhost` TLD for loopback.
1303 // A developer running `http://myservice.localhost:8080/` MUST
1304 // NOT see their URL silently upgraded to https.
1305 for raw in [
1306 "http://myservice.localhost/",
1307 "http://api.localhost:8080/x",
1308 "http://a.b.LOCALHOST/y",
1309 ] {
1310 let input = Url::parse(raw).unwrap();
1311 let out = upgrade_http_to_https(input.clone());
1312 assert_eq!(out, input, "{raw} MUST NOT be upgraded");
1313 }
1314 }
1315
1316 #[test]
1317 fn upgrade_http_to_https_skips_ipv4_mapped_ipv6_loopback() {
1318 // `::ffff:127.0.0.1` is the IPv4-mapped IPv6 form of 127.0.0.1.
1319 // `Ipv6Addr::is_loopback()` alone returns false for this form,
1320 // so dual-stack callers binding wiremock to it would be
1321 // silently upgraded without the `to_ipv4_mapped()` check.
1322 for raw in [
1323 "http://[::ffff:127.0.0.1]:9000/x",
1324 "http://[::ffff:127.0.0.42]/y",
1325 ] {
1326 let input = Url::parse(raw).unwrap();
1327 let out = upgrade_http_to_https(input.clone());
1328 assert_eq!(out, input, "{raw} MUST NOT be upgraded");
1329 }
1330 }
1331
1332 #[test]
1333 fn upgrade_http_to_https_is_noop_on_non_http_schemes() {
1334 // The first guard (`url.scheme() != "http"`) covers everything
1335 // that isn't http: https (idempotent), file, data, ftp...
1336 for raw in [
1337 "https://api.crossref.org/works/10.1234/foo",
1338 "file:///etc/passwd",
1339 "data:text/plain,hello",
1340 "ftp://ftp.example.org/papers/",
1341 ] {
1342 let input = Url::parse(raw).unwrap();
1343 let out = upgrade_http_to_https(input.clone());
1344 assert_eq!(
1345 out, input,
1346 "{raw} non-http scheme MUST be returned unchanged"
1347 );
1348 }
1349 }
1350
1351 #[test]
1352 fn upgrade_http_to_https_http_url_always_has_host() {
1353 // The url crate's parser enforces authority for "special"
1354 // schemes (`http`, `https`, `ws`, `wss`, `ftp`, `file`).
1355 // `Url::parse("http:foo")` synthesises a Domain("foo")
1356 // authority, so an http URL with `host() == None` is
1357 // unreachable from `Url::parse`. The `None` arm in
1358 // `upgrade_http_to_https` is defence-in-depth only — pinned
1359 // here so a future url-crate behavior change is caught.
1360 let url = Url::parse("http:foo").expect("parse");
1361 assert!(
1362 url.host().is_some(),
1363 "http URLs always carry a host per WHATWG URL spec"
1364 );
1365 // The fn still produces a sensible result (upgrade applies).
1366 let out = upgrade_http_to_https(url.clone());
1367 assert_eq!(out.scheme(), "https");
1368 }
1369
#[test]
fn upgrade_http_to_https_skips_localhost_case_insensitive() {
    // Pins case-insensitive handling of the bare `localhost` host:
    // regardless of how the url crate normalises or classifies these
    // inputs at parse time, a mixed-case or upper-case `localhost`
    // URL must come back from `upgrade_http_to_https` unchanged.
    // NOTE(review): which `Host` variant the parser yields for bare
    // `localhost` is not visible from this file — confirm against the
    // url crate before relying on a specific match arm.
    for raw in ["http://LOCALHOST/", "http://Localhost:8080/x"] {
        let input = Url::parse(raw).unwrap();
        let out = upgrade_http_to_https(input.clone());
        assert_eq!(out, input, "{raw} MUST NOT be upgraded");
    }
}
1384
1385 #[test]
1386 fn is_localhost_domain_matches_literal_and_tld_suffix() {
1387 assert!(is_localhost_domain("localhost"));
1388 assert!(is_localhost_domain("LOCALHOST"));
1389 assert!(is_localhost_domain("api.localhost"));
1390 assert!(is_localhost_domain("nested.api.localhost"));
1391 assert!(is_localhost_domain("X.LocalHost"));
1392 assert!(!is_localhost_domain("localhost.example.org"));
1393 assert!(!is_localhost_domain("notlocalhost"));
1394 assert!(!is_localhost_domain(""));
1395 assert!(!is_localhost_domain(".localhost")); // empty label not valid
1396 }
1397
1398 #[test]
1399 fn is_ipv6_loopback_covers_both_pure_and_mapped() {
1400 use std::net::Ipv6Addr;
1401 assert!(is_ipv6_loopback(Ipv6Addr::LOCALHOST)); // ::1
1402 assert!(is_ipv6_loopback("::ffff:127.0.0.1".parse().unwrap()));
1403 assert!(is_ipv6_loopback("::ffff:127.0.0.42".parse().unwrap()));
1404 assert!(!is_ipv6_loopback("::".parse().unwrap()));
1405 assert!(!is_ipv6_loopback("2001:db8::1".parse().unwrap()));
1406 // IPv4-mapped non-loopback must NOT be considered loopback.
1407 assert!(!is_ipv6_loopback("::ffff:1.2.3.4".parse().unwrap()));
1408 }
1409
1410 // ---------------------------------------------------------------
1411 // Allowlist matching — pure unit tests, no network.
1412 // ---------------------------------------------------------------
1413
1414 #[test]
1415 fn tier_1_allowlist_includes_crossref() {
1416 let lists = tier_1_allowlist();
1417 let crossref = lists
1418 .iter()
1419 .find(|a| a.source == "crossref")
1420 .expect("crossref entry");
1421 assert!(
1422 crossref
1423 .redirect_hosts
1424 .iter()
1425 .any(|h| h.contains("crossref.org")),
1426 "crossref allowlist must contain a crossref.org pattern; got {:?}",
1427 crossref.redirect_hosts,
1428 );
1429 }
1430
1431 #[test]
1432 fn tier_1_allowlist_includes_unpaywall_and_arxiv() {
1433 let lists = tier_1_allowlist();
1434 assert!(lists.iter().any(|a| a.source == "unpaywall"));
1435 assert!(lists.iter().any(|a| a.source == "arxiv"));
1436 }
1437
1438 #[test]
1439 fn oa_publisher_allowlist_groups_under_one_synthetic_source() {
1440 // The OA-publisher fan-out from Unpaywall's `best_oa_location.url`
1441 // is keyed under a single synthetic `"oa-publisher"` source so the
1442 // orchestrator can pass that one source key to
1443 // `HttpClient::fetch_pdf`. See `docs/REDIRECT_ALLOWLIST.md` §3 (the
1444 // informed-best-effort note) and the function-level docs in
1445 // [`oa_publisher_allowlist`].
1446 let lists = oa_publisher_allowlist();
1447 assert_eq!(lists.len(), 1, "exactly one synthetic source entry");
1448 assert_eq!(lists[0].source, "oa-publisher");
1449 }
1450
1451 #[test]
1452 fn oa_publisher_allowlist_matches_known_oa_hosts() {
1453 let lists = oa_publisher_allowlist();
1454 let oa = lists
1455 .iter()
1456 .find(|a| a.source == "oa-publisher")
1457 .expect("oa-publisher entry");
1458 // Spot-check a representative entry per host family.
1459 assert!(oa.matches("link.springer.com"));
1460 assert!(oa.matches("nature.com"));
1461 assert!(oa.matches("onlinelibrary.wiley.com"));
1462 assert!(oa.matches("www.frontiersin.org"));
1463 assert!(oa.matches("www.mdpi.com"));
1464 assert!(oa.matches("journals.plos.org"));
1465 assert!(oa.matches("www.biorxiv.org"));
1466 assert!(oa.matches("europepmc.org"));
1467 assert!(oa.matches("www.ncbi.nlm.nih.gov"));
1468 assert!(oa.matches("arxiv.org"));
1469 // #193: physics-society / diamond-OA hosts (empirically observed
1470 // as Unpaywall best_oa_location targets in the dogfood run).
1471 assert!(oa.matches("link.aps.org"));
1472 assert!(oa.matches("journals.aps.org"));
1473 assert!(oa.matches("scipost.org"));
1474 assert!(oa.matches("www.scipost.org"));
1475 assert!(oa.matches("iopscience.iop.org"));
1476 // Document intent of the `*.<suffix>` form: per
1477 // `REDIRECT_ALLOWLIST.md` §2.2 rule 3 it matches the bare
1478 // registrable domain AND any subdomain. Unpaywall has not been
1479 // observed returning bare-domain PDF URLs for these publishers,
1480 // but accepting them is consistent with every other `*.` entry in
1481 // this list (e.g. `arxiv.org` matched by `*.arxiv.org`) and is
1482 // what the matching rule already implements.
1483 assert!(oa.matches("aps.org"));
1484 assert!(oa.matches("iop.org"));
1485 // Multi-level subdomains also match (e.g. SciPost's deep paths);
1486 // documents the wildcard scope rather than testing a known URL.
1487 assert!(oa.matches("submissions.scipost.org"));
1488 // Negative: an attacker host is not covered.
1489 assert!(!oa.matches("attacker.test"));
1490 // Negative: dot-boundary safety for the new entries — a different
1491 // suffix that merely ends with the registrable name must NOT match.
1492 assert!(!oa.matches("notaps.org"));
1493 assert!(!oa.matches("evilscipost.org"));
1494 assert!(!oa.matches("notiop.org"));
1495 // Negative: dot-boundary safety — `*.springer.com` must not match
1496 // `notspringer.com`.
1497 assert!(!oa.matches("notspringer.com"));
1498 }
1499
1500 #[test]
1501 fn allowlist_matches_exact_fqdn() {
1502 let a = SourceAllowlist::new("crossref", vec!["api.crossref.org".to_string()]);
1503 assert!(a.matches("api.crossref.org"));
1504 assert!(!a.matches("crossref.org"));
1505 assert!(!a.matches("xapi.crossref.org"));
1506 }
1507
1508 #[test]
1509 fn allowlist_matches_subdomain_glob() {
1510 // Per docs/REDIRECT_ALLOWLIST.md §2.2 rule 3: `*.<suffix>`
1511 // matches both `<suffix>` itself AND any `*.<suffix>` subdomain,
1512 // but never matches a different suffix that happens to end with
1513 // `<suffix>` without a dot boundary.
1514 let a = SourceAllowlist::new("crossref", vec!["*.crossref.org".to_string()]);
1515 assert!(a.matches("doi.crossref.org"));
1516 assert!(a.matches("crossref.org"));
1517 assert!(!a.matches("notcrossref.org"));
1518 assert!(!a.matches("crossref.org.attacker.test"));
1519 }
1520
1521 #[test]
1522 fn allowlist_matches_is_case_insensitive() {
1523 let a = SourceAllowlist::new("crossref", vec!["API.crossref.ORG".to_string()]);
1524 assert!(a.matches("api.crossref.org"));
1525 assert!(a.matches("API.CROSSREF.ORG"));
1526 }
1527
1528 #[test]
1529 fn allowlist_with_no_redirect_hosts_matches_nothing() {
1530 // §2.2 rule 5: an empty `redirect_hosts` means "no redirects
1531 // permitted from this source".
1532 let a = SourceAllowlist::new("ghost", Vec::<String>::new());
1533 assert!(!a.matches("anything.test"));
1534 assert!(!a.matches(""));
1535 }
1536
1537 // ---------------------------------------------------------------
1538 // PDF magic-byte handling — tests on the body-parsing path. We
1539 // exercise the magic-byte branch via the public API against a
1540 // wiremock server so the assertion runs through the full
1541 // streaming codepath.
1542 // ---------------------------------------------------------------
1543
1544 /// Build a test-only `HttpClient` against an `http://` wiremock
1545 /// origin.
1546 ///
1547 /// Slice 5 (PR #84 advisory item A4 refactor): this helper now
1548 /// delegates to the public
1549 /// [`HttpClient::new_for_tests_allow_http`] constructor (defined
1550 /// just above the test module) instead of re-implementing the
1551 /// redirect-policy + `https_only(false)` builder. The two
1552 /// implementations had drifted into duplicates — keeping a private
1553 /// re-implementation only meant a future security tweak to the
1554 /// builder would silently leave the tests on a stale path.
1555 fn build_test_client_for_http(source: &str, allowlist_host: &str) -> HttpClient {
1556 HttpClient::new_for_tests_allow_http(source, allowlist_host)
1557 }
1558
1559 #[tokio::test]
1560 async fn pdf_magic_byte_match_succeeds() {
1561 let server = MockServer::start().await;
1562 let body = b"%PDF-1.7\n...some pdf bytes...".to_vec();
1563 Mock::given(method("GET"))
1564 .and(path("/paper.pdf"))
1565 .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
1566 .mount(&server)
1567 .await;
1568 let host = server
1569 .uri()
1570 .parse::<Url>()
1571 .unwrap()
1572 .host_str()
1573 .unwrap()
1574 .to_string();
1575 let client = build_test_client_for_http("crossref", &host);
1576 let url: Url = format!("{}/paper.pdf", server.uri()).parse().unwrap();
1577 let (got_body, _final_url) = client.fetch_pdf("crossref", url).await.expect("ok");
1578 assert_eq!(&got_body[..], &body[..]);
1579 }
1580
1581 #[tokio::test]
1582 async fn pdf_magic_byte_mismatch_rejects() {
1583 let server = MockServer::start().await;
1584 Mock::given(method("GET"))
1585 .and(path("/not_a_pdf"))
1586 .respond_with(
1587 ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
1588 )
1589 .mount(&server)
1590 .await;
1591 let host = server
1592 .uri()
1593 .parse::<Url>()
1594 .unwrap()
1595 .host_str()
1596 .unwrap()
1597 .to_string();
1598 let client = build_test_client_for_http("crossref", &host);
1599 let url: Url = format!("{}/not_a_pdf", server.uri()).parse().unwrap();
1600 let err = client
1601 .fetch_pdf("crossref", url)
1602 .await
1603 .expect_err("not pdf");
1604 match err {
1605 HttpError::NotAPdf { got } => {
1606 assert_eq!(&got, b"<html");
1607 }
1608 other => panic!("expected NotAPdf, got {:?}", other),
1609 }
1610 }
1611
1612 #[tokio::test]
1613 async fn fetch_bytes_does_not_check_pdf_magic() {
1614 // The non-PDF path returns the body unchanged regardless of
1615 // magic bytes. This pins the boundary between the JSON/text
1616 // path and the PDF path.
1617 let server = MockServer::start().await;
1618 Mock::given(method("GET"))
1619 .and(path("/data.json"))
1620 .respond_with(
1621 ResponseTemplate::new(200).set_body_bytes(br#"{"hello":"world"}"#.to_vec()),
1622 )
1623 .mount(&server)
1624 .await;
1625 let host = server
1626 .uri()
1627 .parse::<Url>()
1628 .unwrap()
1629 .host_str()
1630 .unwrap()
1631 .to_string();
1632 let client = build_test_client_for_http("crossref", &host);
1633 let url: Url = format!("{}/data.json", server.uri()).parse().unwrap();
1634 let (body, _final_url) = client.fetch_bytes("crossref", url).await.expect("ok");
1635 assert_eq!(&body[..], br#"{"hello":"world"}"#);
1636 }
1637
1638 #[tokio::test]
1639 async fn oversized_body_via_content_length_short_circuits() {
1640 // Wiremock can advertise a `Content-Length` larger than the body
1641 // it actually serves; hyper accepts the mismatch and our
1642 // fast-path check fires before any body bytes are consumed.
1643 let server = MockServer::start().await;
1644 let oversized = PDF_MAX_BYTES + 1;
1645 Mock::given(method("GET"))
1646 .and(path("/huge"))
1647 .respond_with(
1648 ResponseTemplate::new(200)
1649 .insert_header("content-length", oversized.to_string().as_str())
1650 .set_body_bytes(b"%PDF-".to_vec()),
1651 )
1652 .mount(&server)
1653 .await;
1654 let host = server
1655 .uri()
1656 .parse::<Url>()
1657 .unwrap()
1658 .host_str()
1659 .unwrap()
1660 .to_string();
1661 let client = build_test_client_for_http("crossref", &host);
1662 let url: Url = format!("{}/huge", server.uri()).parse().unwrap();
1663 let err = client
1664 .fetch_bytes("crossref", url)
1665 .await
1666 .expect_err("should reject");
1667 match err {
1668 HttpError::OversizedBody { actual, cap } => {
1669 assert!(actual > cap, "actual {} should exceed cap {}", actual, cap);
1670 assert_eq!(cap, PDF_MAX_BYTES);
1671 }
1672 // The mismatched Content-Length may also trip an underlying
1673 // transport error before our fast-path runs. Either outcome
1674 // satisfies the security goal (the transfer was aborted
1675 // without buffering 100 GB), so accept Network here as a
1676 // wiremock idiosyncrasy rather than a contract relaxation.
1677 HttpError::Network(_) => {}
1678 other => panic!("expected OversizedBody or Network, got {:?}", other),
1679 }
1680 }
1681
1682 #[tokio::test]
1683 async fn unknown_source_rejected() {
1684 let client = HttpClient::new(tier_1_allowlist()).expect("client builds");
1685 let url: Url = "https://api.crossref.org/works/10.1234/x".parse().unwrap();
1686 let err = client
1687 .fetch_bytes("not-a-source", url)
1688 .await
1689 .expect_err("unknown source");
1690 match err {
1691 HttpError::UnknownSource { source_key } => {
1692 assert_eq!(source_key, "not-a-source")
1693 }
1694 other => panic!("expected UnknownSource, got {:?}", other),
1695 }
1696 }
1697
1698 #[tokio::test]
1699 async fn http_status_error_surfaces() {
1700 let server = MockServer::start().await;
1701 Mock::given(method("GET"))
1702 .and(path("/missing"))
1703 .respond_with(ResponseTemplate::new(404))
1704 .mount(&server)
1705 .await;
1706 let host = server
1707 .uri()
1708 .parse::<Url>()
1709 .unwrap()
1710 .host_str()
1711 .unwrap()
1712 .to_string();
1713 let client = build_test_client_for_http("crossref", &host);
1714 let url: Url = format!("{}/missing", server.uri()).parse().unwrap();
1715 let err = client.fetch_bytes("crossref", url).await.expect_err("404");
1716 match err {
1717 HttpError::HttpStatus { status, .. } => assert_eq!(status, 404),
1718 other => panic!("expected HttpStatus, got {:?}", other),
1719 }
1720 }
1721
1722 // ---------------------------------------------------------------
1723 // Redirect policy tests — drive the closure via wiremock 30x
1724 // responses pointing at insecure / off-allowlist targets. With
1725 // `https_only(true)` on the production builder the request never
1726 // leaves the initial leg — we run these against the test builder
1727 // (which relaxes `https_only` for the *initial* leg only) so the
1728 // redirect closure is reached and exercised.
1729 // ---------------------------------------------------------------
1730
1731 #[tokio::test]
1732 async fn redirect_to_http_is_rejected_by_closure() {
1733 let server = MockServer::start().await;
1734 Mock::given(method("GET"))
1735 .and(path("/redir"))
1736 .respond_with(
1737 ResponseTemplate::new(302).insert_header("location", "http://attacker.test/file"),
1738 )
1739 .mount(&server)
1740 .await;
1741 let host = server
1742 .uri()
1743 .parse::<Url>()
1744 .unwrap()
1745 .host_str()
1746 .unwrap()
1747 .to_string();
1748 let client = build_test_client_for_http("crossref", &host);
1749 let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1750 let err = client
1751 .fetch_bytes("crossref", url)
1752 .await
1753 .expect_err("redirect to http rejected");
1754 match err {
1755 HttpError::Network(e) => {
1756 let msg = format!("{:?}", e);
1757 assert!(
1758 msg.contains("InsecureRedirect") || msg.contains("non-HTTPS"),
1759 "expected insecure-redirect signal in error chain, got {}",
1760 msg
1761 );
1762 }
1763 other => panic!("expected Network(InsecureRedirect), got {:?}", other),
1764 }
1765 }
1766
1767 #[tokio::test]
1768 async fn redirect_outside_allowlist_is_rejected_by_closure() {
1769 let server = MockServer::start().await;
1770 Mock::given(method("GET"))
1771 .and(path("/redir"))
1772 .respond_with(
1773 ResponseTemplate::new(302).insert_header("location", "https://attacker.test/file"),
1774 )
1775 .mount(&server)
1776 .await;
1777 let host = server
1778 .uri()
1779 .parse::<Url>()
1780 .unwrap()
1781 .host_str()
1782 .unwrap()
1783 .to_string();
1784 let client = build_test_client_for_http("crossref", &host);
1785 let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1786 let err = client
1787 .fetch_bytes("crossref", url)
1788 .await
1789 .expect_err("redirect to attacker rejected");
1790 match err {
1791 HttpError::Network(e) => {
1792 let msg = format!("{:?}", e);
1793 assert!(
1794 msg.contains("RedirectDenied") || msg.contains("not in allowlist"),
1795 "expected redirect-denied signal in error chain, got {}",
1796 msg
1797 );
1798 }
1799 other => panic!("expected Network(RedirectDenied), got {:?}", other),
1800 }
1801 }
1802
1803 #[tokio::test]
1804 async fn redirect_to_allowlisted_https_host_is_followed_by_closure() {
1805 // 302 to an https host that IS in the allowlist. The redirect
1806 // dispatch will fail (DNS won't resolve `mirror.allowed.test`)
1807 // but the closure must NOT short-circuit — failure mode is a
1808 // transport error, not InsecureRedirect / RedirectDenied.
1809 let server = MockServer::start().await;
1810 Mock::given(method("GET"))
1811 .and(path("/redir"))
1812 .respond_with(
1813 ResponseTemplate::new(302)
1814 .insert_header("location", "https://mirror.allowed.test/file"),
1815 )
1816 .mount(&server)
1817 .await;
1818 let initial_host = server
1819 .uri()
1820 .parse::<Url>()
1821 .unwrap()
1822 .host_str()
1823 .unwrap()
1824 .to_string();
1825 // Allow the initial host AND the redirect target host.
1826 let allowlist = SourceAllowlist::new(
1827 "crossref",
1828 vec![initial_host.clone(), "*.allowed.test".to_string()],
1829 );
1830 let allowlist_for_closure = allowlist.clone();
1831 let policy = Policy::custom(move |attempt| {
1832 let scheme = attempt.url().scheme().to_string();
1833 let host_opt = attempt.url().host_str().map(|h| h.to_ascii_lowercase());
1834 if scheme != "https" {
1835 return attempt.error(HttpError::InsecureRedirect { scheme });
1836 }
1837 let h = match host_opt {
1838 Some(h) => h,
1839 None => {
1840 return attempt.error(HttpError::RedirectDenied {
1841 source_key: allowlist_for_closure.source.clone(),
1842 host: String::new(),
1843 expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1844 });
1845 }
1846 };
1847 if !allowlist_for_closure.matches(&h) {
1848 return attempt.error(HttpError::RedirectDenied {
1849 source_key: allowlist_for_closure.source.clone(),
1850 host: h,
1851 expected_hosts: allowlist_for_closure.redirect_hosts.clone(),
1852 });
1853 }
1854 attempt.follow()
1855 });
1856 ensure_crypto_provider();
1857 let raw_client = ClientBuilder::new()
1858 .https_only(false)
1859 .redirect(policy)
1860 .connect_timeout(CONNECT_TIMEOUT)
1861 .timeout(Duration::from_secs(5))
1862 .user_agent("doiget/test")
1863 .tls_backend_rustls()
1864 .build()
1865 .expect("client builds");
1866 let url: Url = format!("{}/redir", server.uri()).parse().unwrap();
1867 let err = raw_client.get(url).send().await.expect_err("DNS fails");
1868 // The error should NOT carry our InsecureRedirect / RedirectDenied
1869 // marker — the closure approved the redirect.
1870 let msg = format!("{:?}", err);
1871 assert!(
1872 !msg.contains("RedirectDenied") && !msg.contains("InsecureRedirect"),
1873 "closure short-circuited an allowed redirect: {}",
1874 msg,
1875 );
1876 }
1877
/// Cloning an `HttpClient` must share the per-source client map rather
/// than copy it.
#[test]
fn http_client_clone_is_cheap() {
    let original = HttpClient::new(tier_1_allowlist()).expect("builds");
    let duplicate = original.clone();
    // Same number of entries ...
    assert_eq!(original.clients.len(), duplicate.clients.len());
    // ... and, more importantly, the very same allocation behind the Arc.
    assert!(Arc::ptr_eq(&original.clients, &duplicate.clients));
}
1886
1887 // ---------------------------------------------------------------
1888 // HttpError -> Option<DenialContext> (ADR-0023 §4 mapping)
1889 // ---------------------------------------------------------------
1890
/// `RedirectDenied` converts to a `DenialContext` that carries the source,
/// the attempted host and the expected host list, and leaves the
/// size-related and hop fields unset.
#[test]
fn denial_from_redirect_denied_carries_attempted_and_expected() {
    use crate::{DenialContext, DenialReason};
    let allowed_hosts = vec!["api.crossref.org".to_string(), "*.crossref.org".to_string()];
    let err = HttpError::RedirectDenied {
        source_key: "crossref".to_string(),
        host: "evil.example.com".to_string(),
        expected_hosts: allowed_hosts.clone(),
    };
    let converted: Option<DenialContext> = (&err).into();
    let denial = converted.expect("RedirectDenied -> Some(DenialContext)");
    assert_eq!(denial.reason, DenialReason::RedirectNotInAllowlist);
    assert_eq!(denial.source.as_deref(), Some("crossref"));
    assert_eq!(denial.attempted.as_deref(), Some("evil.example.com"));
    assert_eq!(denial.expected.as_deref(), Some(allowed_hosts.as_slice()));
    assert!(denial.cap.is_none());
    assert!(denial.actual.is_none());
    assert!(denial.hop_index.is_none());
}
1912
/// `OversizedBody` converts to a `DenialContext` that carries the cap and
/// the actual size and leaves the host-related fields unset.
#[test]
fn denial_from_oversized_body_carries_cap_and_actual() {
    use crate::{DenialContext, DenialReason};
    let err = HttpError::OversizedBody {
        actual: 209_715_200,
        cap: PDF_MAX_BYTES,
    };
    let converted: Option<DenialContext> = (&err).into();
    let denial = converted.expect("OversizedBody -> Some(DenialContext)");
    assert_eq!(denial.reason, DenialReason::SizeCapExceeded);
    assert_eq!(denial.cap, Some(PDF_MAX_BYTES));
    assert_eq!(denial.actual, Some(209_715_200));
    assert!(denial.source.is_none());
    assert!(denial.attempted.is_none());
    // OversizedBody has no allowlist channel: producer leaves
    // `expected` at `None` (NOT `Some(vec![])`). See the field doc on
    // `DenialContext::expected` for the disambiguation.
    assert!(denial.expected.is_none());
}
1932
/// `NotAPdf` converts to a `DenialContext` whose `attempted` field is the
/// lowercase hex of the received bytes and whose `expected` field is the
/// PDF magic prefix.
#[test]
fn denial_from_not_a_pdf_hex_encodes_got_bytes() {
    use crate::{DenialContext, DenialReason};
    // First 5 bytes of "<html" — what the magic-byte check sees when
    // a publisher returns an HTML interstitial instead of a PDF.
    let err = HttpError::NotAPdf { got: *b"<html" };
    let converted: Option<DenialContext> = (&err).into();
    let denial = converted.expect("NotAPdf -> Some(DenialContext)");
    assert_eq!(denial.reason, DenialReason::ContentTypeMismatch);
    assert_eq!(denial.attempted.as_deref(), Some("3c68746d6c"));
    let wanted = ["%PDF-".to_string()];
    assert_eq!(denial.expected.as_deref(), Some(&wanted[..]));
}
1947
/// `InsecureRedirect` converts to a `DenialContext` with the dedicated
/// `InsecureScheme` reason.
#[test]
fn denial_from_insecure_redirect_marks_insecure_scheme() {
    use crate::{DenialContext, DenialReason};
    let err = HttpError::InsecureRedirect {
        scheme: String::from("http"),
    };
    let converted: Option<DenialContext> = (&err).into();
    let denial = converted.expect("InsecureRedirect -> Some(DenialContext)");
    // ADR-0023 §4 (post-incorporation review): InsecureRedirect maps
    // to its own dedicated `InsecureScheme` reason, not the host-
    // allowlist reason — they are semantically distinct denials.
    assert_eq!(denial.reason, DenialReason::InsecureScheme);
    assert_eq!(denial.attempted.as_deref(), Some("http:..."));
    let wanted = ["https".to_string()];
    assert_eq!(denial.expected.as_deref(), Some(&wanted[..]));
}
1963
/// Error variants that are not policy denials must convert to `None`.
#[test]
fn denial_from_non_denial_variants_returns_none() {
    use crate::DenialContext;
    // Network / HttpStatus / UnknownSource are not denials; they
    // map to None per ADR-0023 §4.
    let status_err = HttpError::HttpStatus {
        status: 503,
        url: String::from("https://api.crossref.org/works/x"),
    };
    let from_status: Option<DenialContext> = (&status_err).into();
    assert!(
        from_status.is_none(),
        "HttpStatus must not produce a DenialContext"
    );

    let unknown_err = HttpError::UnknownSource {
        source_key: String::from("ghost"),
    };
    let from_unknown: Option<DenialContext> = (&unknown_err).into();
    assert!(
        from_unknown.is_none(),
        "UnknownSource must not produce a DenialContext"
    );
}
1985
1986 // ---------------------------------------------------------------
1987 // Issue #117 — transient retry / backoff. Real time: wiremock
1988 // serves over real localhost IO and tokio `start_paused` is
1989 // incompatible with that (it auto-advances past reqwest's
1990 // timeout). Backoff is small enough that the slowest case
1991 // (persistent 503, 3 retries ≈ 3.5s) stays within the suite budget.
1992 // ---------------------------------------------------------------
1993
1994 fn host_of(server: &MockServer) -> String {
1995 server
1996 .uri()
1997 .parse::<Url>()
1998 .unwrap()
1999 .host_str()
2000 .unwrap()
2001 .to_string()
2002 }
2003
2004 #[tokio::test]
2005 async fn transient_503_then_200_succeeds() {
2006 let server = MockServer::start().await;
2007 // Catch-all 200 mounted first (lowest precedence); the
2008 // single-shot 503 mounted last takes precedence for the first
2009 // request only, then falls through to the 200.
2010 Mock::given(method("GET"))
2011 .and(path("/p"))
2012 .respond_with(ResponseTemplate::new(200).set_body_string(r#"{"ok":1}"#))
2013 .mount(&server)
2014 .await;
2015 Mock::given(method("GET"))
2016 .and(path("/p"))
2017 .respond_with(ResponseTemplate::new(503))
2018 .up_to_n_times(1)
2019 .mount(&server)
2020 .await;
2021
2022 let client = build_test_client_for_http("crossref", &host_of(&server));
2023 let url: Url = format!("{}/p", server.uri()).parse().unwrap();
2024 let (body, _) = client
2025 .fetch_bytes("crossref", url)
2026 .await
2027 .expect("503-then-200 must succeed after one retry");
2028 assert_eq!(&body[..], br#"{"ok":1}"#);
2029 }
2030
2031 #[tokio::test]
2032 async fn persistent_503_exhausts_and_returns_httpstatus() {
2033 let server = MockServer::start().await;
2034 Mock::given(method("GET"))
2035 .and(path("/p"))
2036 .respond_with(ResponseTemplate::new(503))
2037 .mount(&server)
2038 .await;
2039
2040 let client = build_test_client_for_http("crossref", &host_of(&server));
2041 let url: Url = format!("{}/p", server.uri()).parse().unwrap();
2042 let err = client
2043 .fetch_bytes("crossref", url)
2044 .await
2045 .expect_err("persistent 503 must exhaust retries");
2046 match err {
2047 HttpError::HttpStatus { status, .. } => assert_eq!(status, 503),
2048 other => panic!("expected HttpStatus 503, got {other:?}"),
2049 }
2050 // First attempt + MAX_FETCH_RETRIES retries.
2051 let reqs = server
2052 .received_requests()
2053 .await
2054 .expect("wiremock records requests");
2055 assert_eq!(reqs.len(), (MAX_FETCH_RETRIES + 1) as usize);
2056 }
2057
2058 #[tokio::test]
2059 async fn retry_after_429_then_200_succeeds() {
2060 let server = MockServer::start().await;
2061 Mock::given(method("GET"))
2062 .and(path("/p"))
2063 .respond_with(ResponseTemplate::new(200).set_body_string("ok"))
2064 .mount(&server)
2065 .await;
2066 Mock::given(method("GET"))
2067 .and(path("/p"))
2068 .respond_with(ResponseTemplate::new(429).insert_header("Retry-After", "1"))
2069 .up_to_n_times(1)
2070 .mount(&server)
2071 .await;
2072
2073 let client = build_test_client_for_http("crossref", &host_of(&server));
2074 let url: Url = format!("{}/p", server.uri()).parse().unwrap();
2075 let (body, _) = client
2076 .fetch_bytes("crossref", url)
2077 .await
2078 .expect("429+Retry-After then 200 must succeed");
2079 assert_eq!(&body[..], b"ok");
2080 }
2081
2082 #[tokio::test]
2083 async fn permanent_404_is_not_retried() {
2084 let server = MockServer::start().await;
2085 Mock::given(method("GET"))
2086 .and(path("/p"))
2087 .respond_with(ResponseTemplate::new(404))
2088 .mount(&server)
2089 .await;
2090
2091 let client = build_test_client_for_http("crossref", &host_of(&server));
2092 let url: Url = format!("{}/p", server.uri()).parse().unwrap();
2093 let _ = client
2094 .fetch_bytes("crossref", url)
2095 .await
2096 .expect_err("404 must fail");
2097 let reqs = server
2098 .received_requests()
2099 .await
2100 .expect("wiremock records requests");
2101 assert_eq!(reqs.len(), 1, "4xx (non-408/429) must NOT be retried");
2102 }
2103}