doiget_core/source.rs
//! Source abstraction. Each Tier 1/2/3 fetcher implements this trait.
//!
//! Binding spec: `docs/PUBLIC_API.md` §2 (trait surface),
//! `docs/ARCHITECTURE.md` §6 (per-fetch data flow), and
//! `docs/PROVENANCE_LOG.md` §3 (the `Fetch` row source impls emit).
//!
//! Phase 1 ships the trait + supporting types; concrete impls (Crossref,
//! Unpaywall, arXiv) land in follow-up PRs (see `docs/SOURCES.md` for the
//! source matrix and tiering).
10
11use std::sync::Arc;
12
13use async_trait::async_trait;
14use bytes::Bytes;
15use thiserror::Error;
16
17use crate::http::{HttpClient, HttpError};
18use crate::provenance::{LogError, ProvenanceLog};
19use crate::rate_limiter::RateLimiter;
20use crate::{CapabilityProfile, Ref, RefParseError};
21
22/// What a successful fetch returns to the caller.
23///
24/// Whether `pdf_bytes` is `None` depends on the source: metadata-only
25/// sources (Phase 4) leave it unset; OA sources (Phase 1) return PDF bytes
26/// when an OA URL was discovered.
27#[derive(Debug, Clone)]
28#[non_exhaustive]
29pub struct FetchResult {
30 /// Source's name (matches `Source::name()`); set for the audit trail.
31 pub source: String,
32 /// OA license string (`"CC-BY-4.0"`, `"unknown"`, etc.).
33 pub license: String,
34 /// PDF bytes; `None` for metadata-only sources.
35 pub pdf_bytes: Option<Bytes>,
36 /// Final URL after redirect resolution; useful for the metadata
37 /// `[doiget].url` field.
38 pub final_url: Option<url::Url>,
39 /// Source-side metadata payload as a serde_json value. The Source impl
40 /// is responsible for the shape; the caller (Phase 1+ orchestrator)
41 /// maps it into `Metadata` when one exists (Phase 1+).
42 pub metadata_json: Option<serde_json::Value>,
43}
44
45/// Per-fetch context shared by all `Source` impls.
46///
47/// Held by the orchestrator (CLI / MCP server) and passed by reference into
48/// each [`Source::fetch`]. Sources MUST NOT construct their own
49/// [`HttpClient`] / [`RateLimiter`] / [`ProvenanceLog`] — they go through
50/// this context for uniform politeness, redirect allowlisting, and audit
51/// logging.
52#[derive(Clone)]
53pub struct FetchContext {
54 /// Shared, allowlist-aware HTTP client. See [`HttpClient`].
55 pub http: Arc<HttpClient>,
56 /// Process-wide async rate limiter. See [`RateLimiter`].
57 pub rate_limiter: Arc<RateLimiter>,
58 /// Append-only, hash-chained provenance log. Source impls MUST emit
59 /// one `LogEvent::Fetch` row per attempt via `log.append`. See
60 /// [`ProvenanceLog`].
61 pub log: Arc<ProvenanceLog>,
62 /// 26-char ULID identifying this process invocation. Mirrors the
63 /// `session_id` stamped into every provenance row by the writer; held
64 /// here so source impls can include it in their own structured logs
65 /// without re-reading the env.
66 pub session_id: String,
67}
68
69impl std::fmt::Debug for FetchContext {
70 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71 // Avoid printing the full HTTP / rate-limiter / log internals; only
72 // the session_id is human-meaningful for log breadcrumbs.
73 f.debug_struct("FetchContext")
74 .field("session_id", &self.session_id)
75 .finish_non_exhaustive()
76 }
77}
78
79/// Errors returned by [`Source::fetch`].
80///
81/// At the public CLI / MCP boundary, every variant collapses to an
82/// [`crate::ErrorCode`] via the `From<FetchError>` impl below — mirroring
83/// the [`RefParseError`] → [`crate::ErrorCode::InvalidRef`] collapse from
84/// PR #55.
85#[derive(Debug, Error)]
86#[non_exhaustive]
87pub enum FetchError {
88 /// The source does not handle the given ref under the runtime
89 /// capability profile (covers both `can_serve = false` outcomes and
90 /// runtime denials raised inside `fetch`).
91 #[error("source {source_key} cannot serve this ref")]
92 NotEligible {
93 /// The source key that declined.
94 source_key: String,
95 },
96 /// Tier 1 sources reported no OA URL for this ref.
97 #[error("Tier 1 sources reported no OA URL for this ref")]
98 NoOaAvailable,
99 /// Underlying HTTP / network failure. See [`HttpError`].
100 #[error("network error: {0}")]
101 Http(#[from] HttpError),
102 /// Provenance log write failed. Per `docs/SECURITY.md` §1.8 this is a
103 /// fail-closed signal; the surrounding fetch MUST be aborted.
104 #[error("provenance log error: {0}")]
105 Log(#[from] LogError),
106 /// Ref re-parse / validation failed inside the source (e.g. when a
107 /// source receives a borrowed string from upstream and re-validates).
108 #[error("invalid ref: {0}")]
109 InvalidRef(#[from] RefParseError),
110 /// Source-side schema mismatch (unexpected JSON shape, missing
111 /// required field). Surfaces to [`crate::ErrorCode::InternalError`]
112 /// at the public boundary.
113 #[error("source-side schema error: {hint}")]
114 SourceSchema {
115 /// Human-readable hint at the offending field/path; not parsed.
116 hint: String,
117 },
118 /// Batch orchestrator received more refs than
119 /// [`crate::MAX_BATCH_REFS`]. Surfaced to the MCP `doiget_batch_fetch`
120 /// tool as `ErrorCode::InvalidRef` (closest closed-set fit — the
121 /// request shape itself is invalid; no `denial_context` channel
122 /// applies). Slice 2 / `docs/MCP_TOOLS.md` §1.
123 #[error("too many refs: got {got}, max {max}")]
124 TooManyRefs {
125 /// Number of refs the batch orchestrator was handed.
126 got: usize,
127 /// The hard cap ([`crate::MAX_BATCH_REFS`]).
128 max: usize,
129 },
130}
131
132/// Map [`FetchError`] to the closed [`crate::ErrorCode`] set surfaced at
133/// the public CLI / MCP boundary. Mirrors the
134/// `From<RefParseError> for ErrorCode` collapse from PR #55.
135impl From<FetchError> for crate::ErrorCode {
136 fn from(e: FetchError) -> crate::ErrorCode {
137 crate::ErrorCode::from(&e)
138 }
139}
140
141/// Borrow-form of the collapse above, so a caller that still needs the
142/// error for its `Display` message / `denial_context` side-channel
143/// (notably the CLI human-persona renderer, issue #119) can obtain the
144/// closed code without consuming it. The owned impl delegates here so
145/// the mapping table lives in exactly one place.
146impl From<&FetchError> for crate::ErrorCode {
147 fn from(e: &FetchError) -> crate::ErrorCode {
148 match e {
149 FetchError::NotEligible { .. } => crate::ErrorCode::CapabilityDenied,
150 FetchError::NoOaAvailable => crate::ErrorCode::NoOaAvailable,
151 FetchError::Http(_) => crate::ErrorCode::NetworkError,
152 FetchError::Log(_) => crate::ErrorCode::LogError,
153 FetchError::InvalidRef(_) => crate::ErrorCode::InvalidRef,
154 FetchError::SourceSchema { .. } => crate::ErrorCode::InternalError,
155 // Slice 2: a too-large batch is a request-shape failure, so
156 // collapse to `INVALID_REF` (closest closed-set fit). The
157 // `#[non_exhaustive]` wildcard below would otherwise route
158 // it to `INTERNAL_ERROR`, which would mislead agents.
159 FetchError::TooManyRefs { .. } => crate::ErrorCode::InvalidRef,
160 }
161 }
162}
163
164/// Map a [`FetchError`] reference to the structured [`crate::DenialContext`]
165/// channel introduced by ADR-0023 §4.
166///
167/// `&FetchError` (rather than `FetchError`) so the orchestrator can
168/// produce the structured side-channel without consuming the error it
169/// still needs for `error.message` and the `From<FetchError> for
170/// ErrorCode` collapse above. The `Http` arm delegates to the
171/// `From<&HttpError> for Option<DenialContext>` impl in [`crate::http`].
172impl From<&FetchError> for Option<crate::DenialContext> {
173 fn from(e: &FetchError) -> Self {
174 use crate::{DenialContext, DenialReason};
175 match e {
176 FetchError::NotEligible { source_key } => Some(DenialContext {
177 reason: DenialReason::CapabilityNotGranted,
178 source: Some(source_key.clone()),
179 attempted: None,
180 // CapabilityNotGranted has no allowlist channel: the
181 // producer leaves `expected` at `None` (NOT `Some(vec![])`).
182 // See `DenialContext::expected` for the disambiguation.
183 expected: None,
184 hop_index: None,
185 cap: None,
186 actual: None,
187 }),
188 // Delegate to the HttpError mapping (ADR-0023 §4 mapping table).
189 FetchError::Http(http_err) => http_err.into(),
190 // Non-denial variants map to None per ADR-0023 §4. (Slice 2:
191 // `TooManyRefs` is a request-shape failure, not a denial —
192 // adding it to the None arm keeps the mapping table consistent.)
193 FetchError::NoOaAvailable
194 | FetchError::Log(_)
195 | FetchError::InvalidRef(_)
196 | FetchError::SourceSchema { .. }
197 | FetchError::TooManyRefs { .. } => None,
198 }
199 }
200}
201
202/// The trait implemented by every Tier 1 / 2 / 3 fetcher.
203///
204/// Binding signature: `docs/PUBLIC_API.md` §2 (NORMATIVE — the wire shape
205/// of these three methods is semver-locked).
206#[async_trait]
207pub trait Source: Send + Sync {
208 /// Stable name used in metadata (`[doiget].source`) and provenance
209 /// rows. Conventional values: `"crossref"`, `"unpaywall"`, `"arxiv"`,
210 /// `"openalex"`, `"semantic-scholar"`, `"doaj"`, `"tdm-elsevier"`,
211 /// etc. (see `docs/SOURCES.md`).
212 fn name(&self) -> &str;
213
214 /// True if this source can plausibly serve the given ref under the
215 /// runtime capability profile. Implementations MUST be fast and
216 /// non-blocking; the orchestrator calls `can_serve` to decide whether
217 /// to invoke `fetch` at all.
218 fn can_serve(&self, profile: &CapabilityProfile, ref_: &Ref) -> bool;
219
220 /// Perform the source-specific fetch.
221 ///
222 /// Implementations:
223 /// 1. acquire `ctx.rate_limiter.acquire(self.name()).await`,
224 /// 2. fetch via `ctx.http.fetch_bytes` / `ctx.http.fetch_pdf`,
225 /// 3. emit one `LogEvent::Fetch` row via `ctx.log.append`,
226 /// 4. return a [`FetchResult`].
227 ///
228 /// The trait does NOT enforce these steps; it documents the protocol
229 /// so concrete impls produce uniform audit trails (per
230 /// `docs/ARCHITECTURE.md` §6 and `docs/PROVENANCE_LOG.md` §3).
231 async fn fetch(
232 &self,
233 ref_: &Ref,
234 profile: &CapabilityProfile,
235 ctx: &FetchContext,
236 ) -> Result<FetchResult, FetchError>;
237}
238
239// ---------------------------------------------------------------------------
240// Tests
241// ---------------------------------------------------------------------------
242
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    //! Unit tests pinning the `Source` trait shape, the `FetchError` →
    //! `ErrorCode` collapse (owned and borrowed forms), the redacted `Debug`
    //! output of `FetchContext`, and the `FetchError` →
    //! `Option<DenialContext>` mapping (ADR-0023 §4).

    use super::*;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;

    use crate::http::{tier_1_allowlist, HttpClient};
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{CapabilityProfile, Doi, ErrorCode, RateLimits, Ref};

    /// Minimal `Source` impl exercised purely to pin the trait shape and
    /// verify dispatch through `Box<dyn Source>`. Concrete sources land in
    /// follow-up PRs (Crossref / Unpaywall / arXiv).
    struct MockSource;

    #[async_trait]
    impl Source for MockSource {
        fn name(&self) -> &str {
            "mock"
        }
        fn can_serve(&self, _: &CapabilityProfile, _: &Ref) -> bool {
            true
        }
        async fn fetch(
            &self,
            _: &Ref,
            _: &CapabilityProfile,
            _: &FetchContext,
        ) -> Result<FetchResult, FetchError> {
            Ok(FetchResult {
                source: "mock".into(),
                license: "unknown".into(),
                pdf_bytes: None,
                final_url: None,
                metadata_json: None,
            })
        }
    }

    /// Build a `FetchContext` backed by real (but inert) Round-A
    /// foundation modules: a `HttpClient` over the Tier-1 allowlist, a
    /// `RateLimiter` at hard-coded politeness, and a `ProvenanceLog` in
    /// a tempdir. Returns the dir as well so the caller keeps it alive
    /// for the duration of the test.
    fn build_test_context() -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        // Workspace lints ban `std::path::PathBuf` for log paths; convert
        // via camino's `Utf8PathBuf::try_from`.
        let log_dir =
            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
        let log_path = log_dir.join("test.jsonl");

        let http = Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client builds"));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let session_id = "01J0000000000000000000TEST".to_string();
        let log = Arc::new(
            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
        );

        (
            td,
            FetchContext {
                http,
                rate_limiter,
                log,
                session_id,
            },
        )
    }

    #[tokio::test]
    async fn mock_source_compiles_as_trait_object() {
        // Trait-shape pin: a `Source` impl is dyn-safe and can be boxed.
        let s: Box<dyn Source> = Box::new(MockSource);
        assert_eq!(s.name(), "mock");
        let profile = CapabilityProfile::from_env().expect("Phase 0 stub");
        let r = Ref::Doi(Doi("10.1234/example".to_string()));
        assert!(s.can_serve(&profile, &r));

        let (_td, ctx) = build_test_context();
        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
        assert_eq!(res.source, "mock");
    }

    #[tokio::test]
    async fn mock_source_fetch_returns_result() {
        // Direct dispatch (not through `dyn`) to exercise the async fn
        // body and assert the populated FetchResult fields.
        let s = MockSource;
        let profile = CapabilityProfile::from_env().expect("Phase 0 stub");
        let r = Ref::Doi(Doi("10.1234/example".to_string()));
        let (_td, ctx) = build_test_context();

        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
        assert_eq!(res.source, "mock");
        assert_eq!(res.license, "unknown");
        assert!(res.pdf_bytes.is_none());
        assert!(res.final_url.is_none());
        assert!(res.metadata_json.is_none());
    }

    #[test]
    fn fetch_error_collapses_to_error_code() {
        // Mirrors `docs/PUBLIC_API.md` §4 / PR #55 boundary collapse.
        // Each variant must map to its documented code.
        let e: ErrorCode = FetchError::NotEligible {
            source_key: "mock".into(),
        }
        .into();
        assert_eq!(e, ErrorCode::CapabilityDenied);

        let e: ErrorCode = FetchError::NoOaAvailable.into();
        assert_eq!(e, ErrorCode::NoOaAvailable);

        let e: ErrorCode = FetchError::Http(HttpError::UnknownSource {
            source_key: "mock".into(),
        })
        .into();
        assert_eq!(e, ErrorCode::NetworkError);

        let e: ErrorCode = FetchError::Log(LogError::Io(std::io::Error::other("synthetic"))).into();
        assert_eq!(e, ErrorCode::LogError);

        let e: ErrorCode = FetchError::InvalidRef(RefParseError::Empty).into();
        assert_eq!(e, ErrorCode::InvalidRef);

        let e: ErrorCode = FetchError::SourceSchema {
            hint: "missing field 'license'".into(),
        }
        .into();
        assert_eq!(e, ErrorCode::InternalError);

        // Slice 2 — TooManyRefs is a request-shape failure and must
        // collapse to INVALID_REF, NOT InternalError (which would mislead
        // agents into treating it as a server-side fault).
        let e: ErrorCode = FetchError::TooManyRefs { got: 101, max: 100 }.into();
        assert_eq!(e, ErrorCode::InvalidRef);
    }

    #[test]
    fn fetch_error_borrow_form_does_not_consume_the_error() {
        // The `From<&FetchError>` impl exists so callers can obtain the
        // closed code and still use the error for its Display message.
        let e = FetchError::TooManyRefs { got: 101, max: 100 };
        assert_eq!(ErrorCode::from(&e), ErrorCode::InvalidRef);
        assert_eq!(e.to_string(), "too many refs: got 101, max 100");
    }

    #[test]
    fn fetch_context_debug_redacts_internals() {
        // Pin the Debug shape — only `session_id` is printed, the rest is
        // elided. Prevents accidental log leakage when a context is
        // included in a `tracing::debug!` event.
        let (_td, ctx) = build_test_context();
        let s = format!("{:?}", ctx);
        assert!(
            s.contains("session_id"),
            "session_id must be in Debug: {}",
            s
        );
        assert!(s.contains("01J0000000000000000000TEST"));
        assert!(
            !s.contains("HttpClient") && !s.contains("RateLimiter") && !s.contains("ProvenanceLog"),
            "FetchContext Debug must not dump foundation internals: {}",
            s,
        );
    }

    // ---------------------------------------------------------------
    // FetchError -> Option<DenialContext> (ADR-0023 §4)
    // ---------------------------------------------------------------

    #[test]
    fn denial_from_not_eligible_carries_source_key() {
        use crate::{DenialContext, DenialReason};
        let e = FetchError::NotEligible {
            source_key: "tdm-elsevier".to_string(),
        };
        let dc: Option<DenialContext> = (&e).into();
        let dc = dc.expect("NotEligible -> Some(DenialContext)");
        assert_eq!(dc.reason, DenialReason::CapabilityNotGranted);
        assert_eq!(dc.source.as_deref(), Some("tdm-elsevier"));
        assert!(dc.attempted.is_none());
        // Post-refinement: `expected: None` ("producer did not populate")
        // rather than `Some(vec![])` ("explicit empty allowlist"). See
        // `DenialContext::expected` field doc for the disambiguation.
        assert!(dc.expected.is_none());
    }

    #[test]
    fn denial_from_http_delegates_to_http_mapping() {
        use crate::http::HttpError;
        use crate::{DenialContext, DenialReason, PDF_MAX_BYTES};
        // The Http arm must delegate to the HttpError mapping rather than
        // reinventing it, so an OversizedBody surfaces with cap/actual
        // populated and the SizeCapExceeded reason — proving delegation
        // works without per-variant duplication.
        let e = FetchError::Http(HttpError::OversizedBody {
            actual: 209_715_200,
            cap: PDF_MAX_BYTES,
        });
        let dc: Option<DenialContext> = (&e).into();
        let dc = dc.expect("Http(OversizedBody) -> Some(DenialContext)");
        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
        assert_eq!(dc.cap, Some(PDF_MAX_BYTES));
        assert_eq!(dc.actual, Some(209_715_200));
    }

    #[test]
    fn denial_from_non_denial_variants_returns_none() {
        use crate::DenialContext;
        // Each of the five non-denial FetchError arms maps to None per
        // ADR-0023 §4.
        let e = FetchError::NoOaAvailable;
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "NoOaAvailable must not produce DenialContext");

        let e = FetchError::Log(LogError::Io(std::io::Error::other("synthetic")));
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "Log must not produce DenialContext");

        let e = FetchError::InvalidRef(RefParseError::Empty);
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "InvalidRef must not produce DenialContext");

        let e = FetchError::SourceSchema {
            hint: "missing field 'license'".into(),
        };
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "SourceSchema must not produce DenialContext");

        // Slice 2: a too-large batch is a request-shape failure, not a
        // denial, so it must not open the denial_context channel either.
        let e = FetchError::TooManyRefs { got: 101, max: 100 };
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "TooManyRefs must not produce DenialContext");
    }
}
468}