Skip to main content

doiget_core/
source.rs

1//! Source abstraction. Each Tier 1/2/3 fetcher implements this trait.
2//!
3//! Binding spec: `docs/PUBLIC_API.md` §2 (trait surface),
4//! `docs/ARCHITECTURE.md` §6 (per-fetch data flow), and
5//! `docs/PROVENANCE_LOG.md` §3 (the `Fetch` row source impls emit).
6//!
7//! Phase 1 ships the trait + supporting types; concrete impls (Crossref,
8//! Unpaywall, arXiv) land in follow-up PRs (see `docs/SOURCES.md` for the
9//! source matrix and tiering).
10
11use std::sync::Arc;
12
13use async_trait::async_trait;
14use bytes::Bytes;
15use thiserror::Error;
16
17use crate::http::{HttpClient, HttpError};
18use crate::provenance::{LogError, ProvenanceLog};
19use crate::rate_limiter::RateLimiter;
20use crate::{CapabilityProfile, Ref, RefParseError};
21
/// What a successful fetch returns to the caller.
///
/// Whether `pdf_bytes` is `None` depends on the source: metadata-only
/// sources (Phase 4) leave it unset; OA sources (Phase 1) return PDF bytes
/// when an OA URL was discovered.
///
/// The struct is `#[non_exhaustive]`: code outside this crate cannot build
/// it with a struct literal or destructure it exhaustively, so fields can
/// be added later without a breaking change.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct FetchResult {
    /// Source's name (matches `Source::name()`); set for the audit trail.
    pub source: String,
    /// OA license string (`"CC-BY-4.0"`, `"unknown"`, etc.).
    pub license: String,
    /// PDF bytes; `None` for metadata-only sources.
    pub pdf_bytes: Option<Bytes>,
    /// Final URL after redirect resolution; useful for the metadata
    /// `[doiget].url` field. `None` when the source did not report one.
    pub final_url: Option<url::Url>,
    /// Source-side metadata payload as a serde_json value. The Source impl
    /// is responsible for the shape; the caller (Phase 1+ orchestrator)
    /// maps it into `Metadata` when one exists (Phase 1+).
    pub metadata_json: Option<serde_json::Value>,
}
44
/// Per-fetch context shared by all `Source` impls.
///
/// Held by the orchestrator (CLI / MCP server) and passed by reference into
/// each [`Source::fetch`]. Sources MUST NOT construct their own
/// [`HttpClient`] / [`RateLimiter`] / [`ProvenanceLog`] — they go through
/// this context for uniform politeness, redirect allowlisting, and audit
/// logging.
///
/// `Clone` is derived and the three handles are `Arc`s, so a cloned context
/// refers to the same client, limiter, and log as the original; only the
/// `session_id` string is copied.
#[derive(Clone)]
pub struct FetchContext {
    /// Shared, allowlist-aware HTTP client. See [`HttpClient`].
    pub http: Arc<HttpClient>,
    /// Process-wide async rate limiter. See [`RateLimiter`].
    pub rate_limiter: Arc<RateLimiter>,
    /// Append-only, hash-chained provenance log. Source impls MUST emit
    /// one `LogEvent::Fetch` row per attempt via `log.append`. See
    /// [`ProvenanceLog`].
    pub log: Arc<ProvenanceLog>,
    /// 26-char ULID identifying this process invocation. Mirrors the
    /// `session_id` stamped into every provenance row by the writer; held
    /// here so source impls can include it in their own structured logs
    /// without re-reading the env.
    pub session_id: String,
}
68
69impl std::fmt::Debug for FetchContext {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71        // Avoid printing the full HTTP / rate-limiter / log internals; only
72        // the session_id is human-meaningful for log breadcrumbs.
73        f.debug_struct("FetchContext")
74            .field("session_id", &self.session_id)
75            .finish_non_exhaustive()
76    }
77}
78
/// Errors returned by [`Source::fetch`].
///
/// At the public CLI / MCP boundary, every variant collapses to an
/// [`crate::ErrorCode`] via the `From<FetchError>` impl below — mirroring
/// the [`RefParseError`] → [`crate::ErrorCode::InvalidRef`] collapse from
/// PR #55.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum FetchError {
    /// The source does not handle the given ref under the runtime
    /// capability profile (covers both `can_serve = false` outcomes and
    /// runtime denials raised inside `fetch`).
    // NOTE(review): the field is presumably called `source_key` rather than
    // `source` because thiserror treats a field named `source` as the
    // underlying error cause — confirm before renaming.
    #[error("source {source_key} cannot serve this ref")]
    NotEligible {
        /// The source key that declined.
        source_key: String,
    },
    /// Tier 1 sources reported no OA URL for this ref.
    #[error("Tier 1 sources reported no OA URL for this ref")]
    NoOaAvailable,
    /// Underlying HTTP / network failure. See [`HttpError`].
    /// `#[from]` provides the `From<HttpError>` conversion used by `?`.
    #[error("network error: {0}")]
    Http(#[from] HttpError),
    /// Provenance log write failed. Per `docs/SECURITY.md` §1.8 this is a
    /// fail-closed signal; the surrounding fetch MUST be aborted.
    #[error("provenance log error: {0}")]
    Log(#[from] LogError),
    /// Ref re-parse / validation failed inside the source (e.g. when a
    /// source receives a borrowed string from upstream and re-validates).
    #[error("invalid ref: {0}")]
    InvalidRef(#[from] RefParseError),
    /// Source-side schema mismatch (unexpected JSON shape, missing
    /// required field). Surfaces to [`crate::ErrorCode::InternalError`]
    /// at the public boundary.
    #[error("source-side schema error: {hint}")]
    SourceSchema {
        /// Human-readable hint at the offending field/path; not parsed.
        hint: String,
    },
    /// Batch orchestrator received more refs than
    /// [`crate::MAX_BATCH_REFS`]. Surfaced to the MCP `doiget_batch_fetch`
    /// tool as `ErrorCode::InvalidRef` (closest closed-set fit — the
    /// request shape itself is invalid; no `denial_context` channel
    /// applies). Slice 2 / `docs/MCP_TOOLS.md` §1.
    #[error("too many refs: got {got}, max {max}")]
    TooManyRefs {
        /// Number of refs the batch orchestrator was handed.
        got: usize,
        /// The hard cap ([`crate::MAX_BATCH_REFS`]).
        max: usize,
    },
}
131
132/// Map [`FetchError`] to the closed [`crate::ErrorCode`] set surfaced at
133/// the public CLI / MCP boundary. Mirrors the
134/// `From<RefParseError> for ErrorCode` collapse from PR #55.
135impl From<FetchError> for crate::ErrorCode {
136    fn from(e: FetchError) -> crate::ErrorCode {
137        crate::ErrorCode::from(&e)
138    }
139}
140
141/// Borrow-form of the collapse above, so a caller that still needs the
142/// error for its `Display` message / `denial_context` side-channel
143/// (notably the CLI human-persona renderer, issue #119) can obtain the
144/// closed code without consuming it. The owned impl delegates here so
145/// the mapping table lives in exactly one place.
146impl From<&FetchError> for crate::ErrorCode {
147    fn from(e: &FetchError) -> crate::ErrorCode {
148        match e {
149            FetchError::NotEligible { .. } => crate::ErrorCode::CapabilityDenied,
150            FetchError::NoOaAvailable => crate::ErrorCode::NoOaAvailable,
151            FetchError::Http(_) => crate::ErrorCode::NetworkError,
152            FetchError::Log(_) => crate::ErrorCode::LogError,
153            FetchError::InvalidRef(_) => crate::ErrorCode::InvalidRef,
154            FetchError::SourceSchema { .. } => crate::ErrorCode::InternalError,
155            // Slice 2: a too-large batch is a request-shape failure, so
156            // collapse to `INVALID_REF` (closest closed-set fit). The
157            // `#[non_exhaustive]` wildcard below would otherwise route
158            // it to `INTERNAL_ERROR`, which would mislead agents.
159            FetchError::TooManyRefs { .. } => crate::ErrorCode::InvalidRef,
160        }
161    }
162}
163
164/// Map a [`FetchError`] reference to the structured [`crate::DenialContext`]
165/// channel introduced by ADR-0023 §4.
166///
167/// `&FetchError` (rather than `FetchError`) so the orchestrator can
168/// produce the structured side-channel without consuming the error it
169/// still needs for `error.message` and the `From<FetchError> for
170/// ErrorCode` collapse above. The `Http` arm delegates to the
171/// `From<&HttpError> for Option<DenialContext>` impl in [`crate::http`].
172impl From<&FetchError> for Option<crate::DenialContext> {
173    fn from(e: &FetchError) -> Self {
174        use crate::{DenialContext, DenialReason};
175        match e {
176            FetchError::NotEligible { source_key } => Some(DenialContext {
177                reason: DenialReason::CapabilityNotGranted,
178                source: Some(source_key.clone()),
179                attempted: None,
180                // CapabilityNotGranted has no allowlist channel: the
181                // producer leaves `expected` at `None` (NOT `Some(vec![])`).
182                // See `DenialContext::expected` for the disambiguation.
183                expected: None,
184                hop_index: None,
185                cap: None,
186                actual: None,
187            }),
188            // Delegate to the HttpError mapping (ADR-0023 §4 mapping table).
189            FetchError::Http(http_err) => http_err.into(),
190            // Non-denial variants map to None per ADR-0023 §4. (Slice 2:
191            // `TooManyRefs` is a request-shape failure, not a denial —
192            // adding it to the None arm keeps the mapping table consistent.)
193            FetchError::NoOaAvailable
194            | FetchError::Log(_)
195            | FetchError::InvalidRef(_)
196            | FetchError::SourceSchema { .. }
197            | FetchError::TooManyRefs { .. } => None,
198        }
199    }
200}
201
/// The trait implemented by every Tier 1 / 2 / 3 fetcher.
///
/// Binding signature: `docs/PUBLIC_API.md` §2 (NORMATIVE — the wire shape
/// of these three methods is semver-locked).
///
/// Implementors must be `Send + Sync`. `#[async_trait]` is what allows the
/// `async fn fetch` below to appear in the trait.
#[async_trait]
pub trait Source: Send + Sync {
    /// Stable name used in metadata (`[doiget].source`) and provenance
    /// rows. Conventional values: `"crossref"`, `"unpaywall"`, `"arxiv"`,
    /// `"openalex"`, `"semantic-scholar"`, `"doaj"`, `"tdm-elsevier"`,
    /// etc. (see `docs/SOURCES.md`).
    fn name(&self) -> &str;

    /// True if this source can plausibly serve the given ref under the
    /// runtime capability profile. Implementations MUST be fast and
    /// non-blocking; the orchestrator calls `can_serve` to decide whether
    /// to invoke `fetch` at all.
    fn can_serve(&self, profile: &CapabilityProfile, ref_: &Ref) -> bool;

    /// Perform the source-specific fetch.
    ///
    /// Implementations:
    ///   1. acquire `ctx.rate_limiter.acquire(self.name()).await`,
    ///   2. fetch via `ctx.http.fetch_bytes` / `ctx.http.fetch_pdf`,
    ///   3. emit one `LogEvent::Fetch` row via `ctx.log.append`,
    ///   4. return a [`FetchResult`].
    ///
    /// The trait does NOT enforce these steps; it documents the protocol
    /// so concrete impls produce uniform audit trails (per
    /// `docs/ARCHITECTURE.md` §6 and `docs/PROVENANCE_LOG.md` §3).
    ///
    /// # Errors
    ///
    /// Returns a [`FetchError`] when the fetch does not produce a
    /// [`FetchResult`]; see that enum for the individual failure cases.
    async fn fetch(
        &self,
        ref_: &Ref,
        profile: &CapabilityProfile,
        ctx: &FetchContext,
    ) -> Result<FetchResult, FetchError>;
}
238
239// ---------------------------------------------------------------------------
240// Tests
241// ---------------------------------------------------------------------------
242
243#[cfg(test)]
244#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
245mod tests {
246    use super::*;
247
248    use camino::Utf8PathBuf;
249    use tempfile::TempDir;
250
251    use crate::http::{tier_1_allowlist, HttpClient};
252    use crate::provenance::ProvenanceLog;
253    use crate::rate_limiter::RateLimiter;
254    use crate::{CapabilityProfile, Doi, ErrorCode, RateLimits, Ref};
255
256    /// Minimal `Source` impl exercised purely to pin the trait shape and
257    /// verify dispatch through `Box<dyn Source>`. Concrete sources land in
258    /// follow-up PRs (Crossref / Unpaywall / arXiv).
259    struct MockSource;
260
261    #[async_trait]
262    impl Source for MockSource {
263        fn name(&self) -> &str {
264            "mock"
265        }
266        fn can_serve(&self, _: &CapabilityProfile, _: &Ref) -> bool {
267            true
268        }
269        async fn fetch(
270            &self,
271            _: &Ref,
272            _: &CapabilityProfile,
273            _: &FetchContext,
274        ) -> Result<FetchResult, FetchError> {
275            Ok(FetchResult {
276                source: "mock".into(),
277                license: "unknown".into(),
278                pdf_bytes: None,
279                final_url: None,
280                metadata_json: None,
281            })
282        }
283    }
284
285    /// Build a `FetchContext` backed by real (but inert) Round-A
286    /// foundation modules: a `HttpClient` over the Tier-1 allowlist, a
287    /// `RateLimiter` at hard-coded politeness, and a `ProvenanceLog` in
288    /// a tempdir. Returns the dir as well so the caller keeps it alive
289    /// for the duration of the test.
290    fn build_test_context() -> (TempDir, FetchContext) {
291        let td = TempDir::new().expect("tempdir");
292        // Workspace lints ban `std::path::PathBuf` for log paths; convert
293        // via camino's `Utf8PathBuf::try_from`.
294        let log_dir =
295            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
296        let log_path = log_dir.join("test.jsonl");
297
298        let http = Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client builds"));
299        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
300        let session_id = "01J0000000000000000000TEST".to_string();
301        let log = Arc::new(
302            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
303        );
304
305        (
306            td,
307            FetchContext {
308                http,
309                rate_limiter,
310                log,
311                session_id,
312            },
313        )
314    }
315
316    #[tokio::test]
317    async fn mock_source_compiles_as_trait_object() {
318        // Trait-shape pin: a `Source` impl is dyn-safe and can be boxed.
319        let s: Box<dyn Source> = Box::new(MockSource);
320        assert_eq!(s.name(), "mock");
321        let profile = CapabilityProfile::from_env().expect("Phase 0 stub");
322        let r = Ref::Doi(Doi("10.1234/example".to_string()));
323        assert!(s.can_serve(&profile, &r));
324
325        let (_td, ctx) = build_test_context();
326        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
327        assert_eq!(res.source, "mock");
328    }
329
330    #[tokio::test]
331    async fn mock_source_fetch_returns_result() {
332        // Direct dispatch (not through `dyn`) to exercise the async fn
333        // body and assert the populated FetchResult fields.
334        let s = MockSource;
335        let profile = CapabilityProfile::from_env().expect("Phase 0 stub");
336        let r = Ref::Doi(Doi("10.1234/example".to_string()));
337        let (_td, ctx) = build_test_context();
338
339        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
340        assert_eq!(res.source, "mock");
341        assert_eq!(res.license, "unknown");
342        assert!(res.pdf_bytes.is_none());
343        assert!(res.final_url.is_none());
344        assert!(res.metadata_json.is_none());
345    }
346
347    #[test]
348    fn fetch_error_collapses_to_error_code() {
349        // Mirrors `docs/PUBLIC_API.md` §4 / PR #55 boundary collapse.
350        // Each variant must map to its documented code.
351        let e: ErrorCode = FetchError::NotEligible {
352            source_key: "mock".into(),
353        }
354        .into();
355        assert_eq!(e, ErrorCode::CapabilityDenied);
356
357        let e: ErrorCode = FetchError::NoOaAvailable.into();
358        assert_eq!(e, ErrorCode::NoOaAvailable);
359
360        let e: ErrorCode = FetchError::Http(HttpError::UnknownSource {
361            source_key: "mock".into(),
362        })
363        .into();
364        assert_eq!(e, ErrorCode::NetworkError);
365
366        let e: ErrorCode = FetchError::Log(LogError::Io(std::io::Error::other("synthetic"))).into();
367        assert_eq!(e, ErrorCode::LogError);
368
369        let e: ErrorCode = FetchError::InvalidRef(RefParseError::Empty).into();
370        assert_eq!(e, ErrorCode::InvalidRef);
371
372        let e: ErrorCode = FetchError::SourceSchema {
373            hint: "missing field 'license'".into(),
374        }
375        .into();
376        assert_eq!(e, ErrorCode::InternalError);
377
378        // Slice 2 — TooManyRefs collapses to INVALID_REF, NOT
379        // InternalError (the `#[non_exhaustive]` wildcard would
380        // otherwise misroute this to InternalError).
381        let e: ErrorCode = FetchError::TooManyRefs { got: 101, max: 100 }.into();
382        assert_eq!(e, ErrorCode::InvalidRef);
383    }
384
385    #[test]
386    fn fetch_context_debug_redacts_internals() {
387        // Pin the Debug shape — only `session_id` is printed, the rest is
388        // elided. Prevents accidental log leakage when a context is
389        // included in a `tracing::debug!` event.
390        let (_td, ctx) = build_test_context();
391        let s = format!("{:?}", ctx);
392        assert!(
393            s.contains("session_id"),
394            "session_id must be in Debug: {}",
395            s
396        );
397        assert!(s.contains("01J0000000000000000000TEST"));
398        assert!(
399            !s.contains("HttpClient") && !s.contains("RateLimiter") && !s.contains("ProvenanceLog"),
400            "FetchContext Debug must not dump foundation internals: {}",
401            s,
402        );
403    }
404
405    // ---------------------------------------------------------------
406    // FetchError -> Option<DenialContext>  (ADR-0023 §4)
407    // ---------------------------------------------------------------
408
409    #[test]
410    fn denial_from_not_eligible_carries_source_key() {
411        use crate::{DenialContext, DenialReason};
412        let e = FetchError::NotEligible {
413            source_key: "tdm-elsevier".to_string(),
414        };
415        let dc: Option<DenialContext> = (&e).into();
416        let dc = dc.expect("NotEligible -> Some(DenialContext)");
417        assert_eq!(dc.reason, DenialReason::CapabilityNotGranted);
418        assert_eq!(dc.source.as_deref(), Some("tdm-elsevier"));
419        assert!(dc.attempted.is_none());
420        // Post-refinement: `expected: None` ("producer did not populate")
421        // rather than `Some(vec![])` ("explicit empty allowlist"). See
422        // `DenialContext::expected` field doc for the disambiguation.
423        assert!(dc.expected.is_none());
424    }
425
426    #[test]
427    fn denial_from_http_delegates_to_http_mapping() {
428        use crate::http::HttpError;
429        use crate::{DenialContext, DenialReason, PDF_MAX_BYTES};
430        // The Http arm must delegate to the HttpError mapping rather than
431        // reinventing it, so an OversizedBody surfaces with cap/actual
432        // populated and the SizeCapExceeded reason — proving delegation
433        // works without per-variant duplication.
434        let e = FetchError::Http(HttpError::OversizedBody {
435            actual: 209_715_200,
436            cap: PDF_MAX_BYTES,
437        });
438        let dc: Option<DenialContext> = (&e).into();
439        let dc = dc.expect("Http(OversizedBody) -> Some(DenialContext)");
440        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
441        assert_eq!(dc.cap, Some(PDF_MAX_BYTES));
442        assert_eq!(dc.actual, Some(209_715_200));
443    }
444
445    #[test]
446    fn denial_from_non_denial_variants_returns_none() {
447        use crate::DenialContext;
448        // Each of the four non-denial FetchError arms maps to None per
449        // ADR-0023 §4.
450        let e = FetchError::NoOaAvailable;
451        let dc: Option<DenialContext> = (&e).into();
452        assert!(dc.is_none(), "NoOaAvailable must not produce DenialContext");
453
454        let e = FetchError::Log(LogError::Io(std::io::Error::other("synthetic")));
455        let dc: Option<DenialContext> = (&e).into();
456        assert!(dc.is_none(), "Log must not produce DenialContext");
457
458        let e = FetchError::InvalidRef(RefParseError::Empty);
459        let dc: Option<DenialContext> = (&e).into();
460        assert!(dc.is_none(), "InvalidRef must not produce DenialContext");
461
462        let e = FetchError::SourceSchema {
463            hint: "missing field 'license'".into(),
464        };
465        let dc: Option<DenialContext> = (&e).into();
466        assert!(dc.is_none(), "SourceSchema must not produce DenialContext");
467    }
468}