Skip to main content

doiget_core/
source.rs

1//! Source abstraction. Each Tier 1/2/3 fetcher implements this trait.
2//!
3//! Binding spec: `docs/PUBLIC_API.md` §2 (trait surface),
4//! `docs/ARCHITECTURE.md` §6 (per-fetch data flow), and
5//! `docs/PROVENANCE_LOG.md` §3 (the `Fetch` row source impls emit).
6//!
7//! Phase 1 ships the trait + supporting types; concrete impls (Crossref,
8//! Unpaywall, arXiv) land in follow-up PRs (see `docs/SOURCES.md` for the
9//! source matrix and tiering).
10
11use std::sync::Arc;
12
13use async_trait::async_trait;
14use bytes::Bytes;
15use thiserror::Error;
16
17use crate::http::{HttpClient, HttpError};
18use crate::provenance::{LogError, ProvenanceLog};
19use crate::rate_limiter::RateLimiter;
20use crate::{CapabilityProfile, Ref, RefParseError};
21
22/// What a successful fetch returns to the caller.
23///
24/// Whether `pdf_bytes` is `None` depends on the source: metadata-only
25/// sources (Phase 4) leave it unset; OA sources (Phase 1) return PDF bytes
26/// when an OA URL was discovered.
27#[derive(Debug, Clone)]
28#[non_exhaustive]
29pub struct FetchResult {
30    /// Source's name (matches `Source::name()`); set for the audit trail.
31    pub source: String,
32    /// OA license string (`"CC-BY-4.0"`, `"unknown"`, etc.).
33    pub license: String,
34    /// PDF bytes; `None` for metadata-only sources.
35    pub pdf_bytes: Option<Bytes>,
36    /// Final URL after redirect resolution; useful for the metadata
37    /// `[doiget].url` field.
38    pub final_url: Option<url::Url>,
39    /// Source-side metadata payload as a serde_json value. The Source impl
40    /// is responsible for the shape; the caller (Phase 1+ orchestrator)
41    /// maps it into `Metadata` when one exists (Phase 1+).
42    pub metadata_json: Option<serde_json::Value>,
43}
44
45/// Per-fetch context shared by all `Source` impls.
46///
47/// Held by the orchestrator (CLI / MCP server) and passed by reference into
48/// each [`Source::fetch`]. Sources MUST NOT construct their own
49/// [`HttpClient`] / [`RateLimiter`] / [`ProvenanceLog`] — they go through
50/// this context for uniform politeness, redirect allowlisting, and audit
51/// logging.
52#[derive(Clone)]
53pub struct FetchContext {
54    /// Shared, allowlist-aware HTTP client. See [`HttpClient`].
55    pub http: Arc<HttpClient>,
56    /// Process-wide async rate limiter. See [`RateLimiter`].
57    pub rate_limiter: Arc<RateLimiter>,
58    /// Append-only, hash-chained provenance log. Source impls MUST emit
59    /// one `LogEvent::Fetch` row per attempt via `log.append`. See
60    /// [`ProvenanceLog`].
61    pub log: Arc<ProvenanceLog>,
62    /// 26-char ULID identifying this process invocation. Mirrors the
63    /// `session_id` stamped into every provenance row by the writer; held
64    /// here so source impls can include it in their own structured logs
65    /// without re-reading the env.
66    pub session_id: String,
67    /// Resolver cache root (`<cache_root>/resolver/<safekey>.toml`, see
68    /// `docs/CACHE.md` and [`crate::resolver_cache`]). `Some` enables the
69    /// metadata-only resolve cache (repeat resolves served from disk,
70    /// avoiding upstream rate limits); `None` disables it (tests, or a
71    /// caller that opts out). Only `metadata_only` consults it — per-PDF
72    /// fetches are never cached.
73    pub cache_root: Option<camino::Utf8PathBuf>,
74}
75
76impl std::fmt::Debug for FetchContext {
77    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
78        // Avoid printing the full HTTP / rate-limiter / log internals; only
79        // the session_id is human-meaningful for log breadcrumbs.
80        f.debug_struct("FetchContext")
81            .field("session_id", &self.session_id)
82            .finish_non_exhaustive()
83    }
84}
85
86/// Errors returned by [`Source::fetch`].
87///
88/// At the public CLI / MCP boundary, every variant collapses to an
89/// [`crate::ErrorCode`] via the `From<FetchError>` impl below — mirroring
90/// the [`RefParseError`] → [`crate::ErrorCode::InvalidRef`] collapse from
91/// PR #55.
92#[derive(Debug, Error)]
93#[non_exhaustive]
94pub enum FetchError {
95    /// The source does not handle the given ref under the runtime
96    /// capability profile (covers both `can_serve = false` outcomes and
97    /// runtime denials raised inside `fetch`).
98    #[error("source {source_key} cannot serve this ref")]
99    NotEligible {
100        /// The source key that declined.
101        source_key: String,
102    },
103    /// Tier 1 sources reported no OA URL for this ref.
104    #[error("Tier 1 sources reported no OA URL for this ref")]
105    NoOaAvailable,
106    /// Underlying HTTP / network failure. See [`HttpError`].
107    #[error("network error: {0}")]
108    Http(#[from] HttpError),
109    /// Provenance log write failed. Per `docs/SECURITY.md` §1.8 this is a
110    /// fail-closed signal; the surrounding fetch MUST be aborted.
111    #[error("provenance log error: {0}")]
112    Log(#[from] LogError),
113    /// Ref re-parse / validation failed inside the source (e.g. when a
114    /// source receives a borrowed string from upstream and re-validates).
115    #[error("invalid ref: {0}")]
116    InvalidRef(#[from] RefParseError),
117    /// Source-side schema mismatch (unexpected JSON shape, missing
118    /// required field). Surfaces to [`crate::ErrorCode::InternalError`]
119    /// at the public boundary.
120    #[error("source-side schema error: {hint}")]
121    SourceSchema {
122        /// Human-readable hint at the offending field/path; not parsed.
123        hint: String,
124    },
125    /// Batch orchestrator received more refs than
126    /// [`crate::MAX_BATCH_REFS`]. Surfaced to the MCP `doiget_batch_fetch`
127    /// tool as `ErrorCode::InvalidRef` (closest closed-set fit — the
128    /// request shape itself is invalid; no `denial_context` channel
129    /// applies). Slice 2 / `docs/MCP_TOOLS.md` §1.
130    #[error("too many refs: got {got}, max {max}")]
131    TooManyRefs {
132        /// Number of refs the batch orchestrator was handed.
133        got: usize,
134        /// The hard cap ([`crate::MAX_BATCH_REFS`]).
135        max: usize,
136    },
137}
138
139/// Map [`FetchError`] to the closed [`crate::ErrorCode`] set surfaced at
140/// the public CLI / MCP boundary. Mirrors the
141/// `From<RefParseError> for ErrorCode` collapse from PR #55.
142impl From<FetchError> for crate::ErrorCode {
143    fn from(e: FetchError) -> crate::ErrorCode {
144        crate::ErrorCode::from(&e)
145    }
146}
147
148/// Borrow-form of the collapse above, so a caller that still needs the
149/// error for its `Display` message / `denial_context` side-channel
150/// (notably the CLI human-persona renderer, issue #119) can obtain the
151/// closed code without consuming it. The owned impl delegates here so
152/// the mapping table lives in exactly one place.
153impl From<&FetchError> for crate::ErrorCode {
154    fn from(e: &FetchError) -> crate::ErrorCode {
155        match e {
156            FetchError::NotEligible { .. } => crate::ErrorCode::CapabilityDenied,
157            FetchError::NoOaAvailable => crate::ErrorCode::NoOaAvailable,
158            FetchError::Http(_) => crate::ErrorCode::NetworkError,
159            FetchError::Log(_) => crate::ErrorCode::LogError,
160            FetchError::InvalidRef(_) => crate::ErrorCode::InvalidRef,
161            FetchError::SourceSchema { .. } => crate::ErrorCode::InternalError,
162            // Slice 2: a too-large batch is a request-shape failure, so
163            // collapse to `INVALID_REF` (closest closed-set fit). The
164            // `#[non_exhaustive]` wildcard below would otherwise route
165            // it to `INTERNAL_ERROR`, which would mislead agents.
166            FetchError::TooManyRefs { .. } => crate::ErrorCode::InvalidRef,
167        }
168    }
169}
170
171/// Map a [`FetchError`] reference to the structured [`crate::DenialContext`]
172/// channel introduced by ADR-0023 §4.
173///
174/// `&FetchError` (rather than `FetchError`) so the orchestrator can
175/// produce the structured side-channel without consuming the error it
176/// still needs for `error.message` and the `From<FetchError> for
177/// ErrorCode` collapse above. The `Http` arm delegates to the
178/// `From<&HttpError> for Option<DenialContext>` impl in [`crate::http`].
179impl From<&FetchError> for Option<crate::DenialContext> {
180    fn from(e: &FetchError) -> Self {
181        use crate::{DenialContext, DenialReason};
182        match e {
183            FetchError::NotEligible { source_key } => Some(DenialContext {
184                reason: DenialReason::CapabilityNotGranted,
185                source: Some(source_key.clone()),
186                attempted: None,
187                // CapabilityNotGranted has no allowlist channel: the
188                // producer leaves `expected` at `None` (NOT `Some(vec![])`).
189                // See `DenialContext::expected` for the disambiguation.
190                expected: None,
191                hop_index: None,
192                cap: None,
193                actual: None,
194            }),
195            // Delegate to the HttpError mapping (ADR-0023 §4 mapping table).
196            FetchError::Http(http_err) => http_err.into(),
197            // Non-denial variants map to None per ADR-0023 §4. (Slice 2:
198            // `TooManyRefs` is a request-shape failure, not a denial —
199            // adding it to the None arm keeps the mapping table consistent.)
200            FetchError::NoOaAvailable
201            | FetchError::Log(_)
202            | FetchError::InvalidRef(_)
203            | FetchError::SourceSchema { .. }
204            | FetchError::TooManyRefs { .. } => None,
205        }
206    }
207}
208
209/// The trait implemented by every Tier 1 / 2 / 3 fetcher.
210///
211/// Binding signature: `docs/PUBLIC_API.md` §2 (NORMATIVE — the wire shape
212/// of these three methods is semver-locked).
213#[async_trait]
214pub trait Source: Send + Sync {
215    /// Stable name used in metadata (`[doiget].source`) and provenance
216    /// rows. Conventional values: `"crossref"`, `"unpaywall"`, `"arxiv"`,
217    /// `"openalex"`, `"semantic-scholar"`, `"doaj"`, `"tdm-elsevier"`,
218    /// etc. (see `docs/SOURCES.md`).
219    fn name(&self) -> &str;
220
221    /// True if this source can plausibly serve the given ref under the
222    /// runtime capability profile. Implementations MUST be fast and
223    /// non-blocking; the orchestrator calls `can_serve` to decide whether
224    /// to invoke `fetch` at all.
225    fn can_serve(&self, profile: &CapabilityProfile, ref_: &Ref) -> bool;
226
227    /// Perform the source-specific fetch.
228    ///
229    /// Implementations:
230    ///   1. acquire `ctx.rate_limiter.acquire(self.name()).await`,
231    ///   2. fetch via `ctx.http.fetch_bytes` / `ctx.http.fetch_pdf`,
232    ///   3. emit one `LogEvent::Fetch` row via `ctx.log.append`,
233    ///   4. return a [`FetchResult`].
234    ///
235    /// The trait does NOT enforce these steps; it documents the protocol
236    /// so concrete impls produce uniform audit trails (per
237    /// `docs/ARCHITECTURE.md` §6 and `docs/PROVENANCE_LOG.md` §3).
238    async fn fetch(
239        &self,
240        ref_: &Ref,
241        profile: &CapabilityProfile,
242        ctx: &FetchContext,
243    ) -> Result<FetchResult, FetchError>;
244}
245
246// ---------------------------------------------------------------------------
247// Tests
248// ---------------------------------------------------------------------------
249
250#[cfg(test)]
251#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
252mod tests {
253    use super::*;
254
255    use camino::Utf8PathBuf;
256    use tempfile::TempDir;
257
258    use crate::http::{tier_1_allowlist, HttpClient};
259    use crate::provenance::ProvenanceLog;
260    use crate::rate_limiter::RateLimiter;
261    use crate::{CapabilityProfile, Doi, ErrorCode, RateLimits, Ref};
262
263    /// Minimal `Source` impl exercised purely to pin the trait shape and
264    /// verify dispatch through `Box<dyn Source>`. Concrete sources land in
265    /// follow-up PRs (Crossref / Unpaywall / arXiv).
266    struct MockSource;
267
268    #[async_trait]
269    impl Source for MockSource {
270        fn name(&self) -> &str {
271            "mock"
272        }
273        fn can_serve(&self, _: &CapabilityProfile, _: &Ref) -> bool {
274            true
275        }
276        async fn fetch(
277            &self,
278            _: &Ref,
279            _: &CapabilityProfile,
280            _: &FetchContext,
281        ) -> Result<FetchResult, FetchError> {
282            Ok(FetchResult {
283                source: "mock".into(),
284                license: "unknown".into(),
285                pdf_bytes: None,
286                final_url: None,
287                metadata_json: None,
288            })
289        }
290    }
291
292    /// Build a `FetchContext` backed by real (but inert) Round-A
293    /// foundation modules: a `HttpClient` over the Tier-1 allowlist, a
294    /// `RateLimiter` at hard-coded politeness, and a `ProvenanceLog` in
295    /// a tempdir. Returns the dir as well so the caller keeps it alive
296    /// for the duration of the test.
297    fn build_test_context() -> (TempDir, FetchContext) {
298        let td = TempDir::new().expect("tempdir");
299        // Workspace lints ban `std::path::PathBuf` for log paths; convert
300        // via camino's `Utf8PathBuf::try_from`.
301        let log_dir =
302            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
303        let log_path = log_dir.join("test.jsonl");
304
305        let http = Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client builds"));
306        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
307        let session_id = "01J0000000000000000000TEST".to_string();
308        let log = Arc::new(
309            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
310        );
311
312        (
313            td,
314            FetchContext {
315                http,
316                rate_limiter,
317                log,
318                session_id,
319                cache_root: None,
320            },
321        )
322    }
323
324    #[tokio::test]
325    async fn mock_source_compiles_as_trait_object() {
326        // Trait-shape pin: a `Source` impl is dyn-safe and can be boxed.
327        let s: Box<dyn Source> = Box::new(MockSource);
328        assert_eq!(s.name(), "mock");
329        let profile = CapabilityProfile::from_env().expect("Phase 0 stub");
330        let r = Ref::Doi(Doi("10.1234/example".to_string()));
331        assert!(s.can_serve(&profile, &r));
332
333        let (_td, ctx) = build_test_context();
334        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
335        assert_eq!(res.source, "mock");
336    }
337
338    #[tokio::test]
339    async fn mock_source_fetch_returns_result() {
340        // Direct dispatch (not through `dyn`) to exercise the async fn
341        // body and assert the populated FetchResult fields.
342        let s = MockSource;
343        let profile = CapabilityProfile::from_env().expect("Phase 0 stub");
344        let r = Ref::Doi(Doi("10.1234/example".to_string()));
345        let (_td, ctx) = build_test_context();
346
347        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
348        assert_eq!(res.source, "mock");
349        assert_eq!(res.license, "unknown");
350        assert!(res.pdf_bytes.is_none());
351        assert!(res.final_url.is_none());
352        assert!(res.metadata_json.is_none());
353    }
354
355    #[test]
356    fn fetch_error_collapses_to_error_code() {
357        // Mirrors `docs/PUBLIC_API.md` §4 / PR #55 boundary collapse.
358        // Each variant must map to its documented code.
359        let e: ErrorCode = FetchError::NotEligible {
360            source_key: "mock".into(),
361        }
362        .into();
363        assert_eq!(e, ErrorCode::CapabilityDenied);
364
365        let e: ErrorCode = FetchError::NoOaAvailable.into();
366        assert_eq!(e, ErrorCode::NoOaAvailable);
367
368        let e: ErrorCode = FetchError::Http(HttpError::UnknownSource {
369            source_key: "mock".into(),
370        })
371        .into();
372        assert_eq!(e, ErrorCode::NetworkError);
373
374        let e: ErrorCode = FetchError::Log(LogError::Io(std::io::Error::other("synthetic"))).into();
375        assert_eq!(e, ErrorCode::LogError);
376
377        let e: ErrorCode = FetchError::InvalidRef(RefParseError::Empty).into();
378        assert_eq!(e, ErrorCode::InvalidRef);
379
380        let e: ErrorCode = FetchError::SourceSchema {
381            hint: "missing field 'license'".into(),
382        }
383        .into();
384        assert_eq!(e, ErrorCode::InternalError);
385
386        // Slice 2 — TooManyRefs collapses to INVALID_REF, NOT
387        // InternalError (the `#[non_exhaustive]` wildcard would
388        // otherwise misroute this to InternalError).
389        let e: ErrorCode = FetchError::TooManyRefs { got: 101, max: 100 }.into();
390        assert_eq!(e, ErrorCode::InvalidRef);
391    }
392
393    #[test]
394    fn fetch_context_debug_redacts_internals() {
395        // Pin the Debug shape — only `session_id` is printed, the rest is
396        // elided. Prevents accidental log leakage when a context is
397        // included in a `tracing::debug!` event.
398        let (_td, ctx) = build_test_context();
399        let s = format!("{:?}", ctx);
400        assert!(
401            s.contains("session_id"),
402            "session_id must be in Debug: {}",
403            s
404        );
405        assert!(s.contains("01J0000000000000000000TEST"));
406        assert!(
407            !s.contains("HttpClient") && !s.contains("RateLimiter") && !s.contains("ProvenanceLog"),
408            "FetchContext Debug must not dump foundation internals: {}",
409            s,
410        );
411    }
412
413    // ---------------------------------------------------------------
414    // FetchError -> Option<DenialContext>  (ADR-0023 §4)
415    // ---------------------------------------------------------------
416
417    #[test]
418    fn denial_from_not_eligible_carries_source_key() {
419        use crate::{DenialContext, DenialReason};
420        let e = FetchError::NotEligible {
421            source_key: "tdm-elsevier".to_string(),
422        };
423        let dc: Option<DenialContext> = (&e).into();
424        let dc = dc.expect("NotEligible -> Some(DenialContext)");
425        assert_eq!(dc.reason, DenialReason::CapabilityNotGranted);
426        assert_eq!(dc.source.as_deref(), Some("tdm-elsevier"));
427        assert!(dc.attempted.is_none());
428        // Post-refinement: `expected: None` ("producer did not populate")
429        // rather than `Some(vec![])` ("explicit empty allowlist"). See
430        // `DenialContext::expected` field doc for the disambiguation.
431        assert!(dc.expected.is_none());
432    }
433
434    #[test]
435    fn denial_from_http_delegates_to_http_mapping() {
436        use crate::http::HttpError;
437        use crate::{DenialContext, DenialReason, PDF_MAX_BYTES};
438        // The Http arm must delegate to the HttpError mapping rather than
439        // reinventing it, so an OversizedBody surfaces with cap/actual
440        // populated and the SizeCapExceeded reason — proving delegation
441        // works without per-variant duplication.
442        let e = FetchError::Http(HttpError::OversizedBody {
443            actual: 209_715_200,
444            cap: PDF_MAX_BYTES,
445        });
446        let dc: Option<DenialContext> = (&e).into();
447        let dc = dc.expect("Http(OversizedBody) -> Some(DenialContext)");
448        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
449        assert_eq!(dc.cap, Some(PDF_MAX_BYTES));
450        assert_eq!(dc.actual, Some(209_715_200));
451    }
452
453    #[test]
454    fn denial_from_non_denial_variants_returns_none() {
455        use crate::DenialContext;
456        // Each of the four non-denial FetchError arms maps to None per
457        // ADR-0023 §4.
458        let e = FetchError::NoOaAvailable;
459        let dc: Option<DenialContext> = (&e).into();
460        assert!(dc.is_none(), "NoOaAvailable must not produce DenialContext");
461
462        let e = FetchError::Log(LogError::Io(std::io::Error::other("synthetic")));
463        let dc: Option<DenialContext> = (&e).into();
464        assert!(dc.is_none(), "Log must not produce DenialContext");
465
466        let e = FetchError::InvalidRef(RefParseError::Empty);
467        let dc: Option<DenialContext> = (&e).into();
468        assert!(dc.is_none(), "InvalidRef must not produce DenialContext");
469
470        let e = FetchError::SourceSchema {
471            hint: "missing field 'license'".into(),
472        };
473        let dc: Option<DenialContext> = (&e).into();
474        assert!(dc.is_none(), "SourceSchema must not produce DenialContext");
475    }
476}