doiget_core/source.rs
1//! Source abstraction. Each Tier 1/2/3 fetcher implements this trait.
2//!
3//! Binding spec: `docs/PUBLIC_API.md` §2 (trait surface),
4//! `docs/ARCHITECTURE.md` §6 (per-fetch data flow), and
5//! `docs/PROVENANCE_LOG.md` §3 (the `Fetch` row source impls emit).
6//!
7//! Phase 1 ships the trait + supporting types; concrete impls (Crossref,
8//! Unpaywall, arXiv) land in follow-up PRs (see `docs/SOURCES.md` for the
9//! source matrix and tiering).
10
11use std::sync::Arc;
12
13use async_trait::async_trait;
14use bytes::Bytes;
15use thiserror::Error;
16
17use crate::http::{HttpClient, HttpError};
18use crate::provenance::{LogError, ProvenanceLog};
19use crate::rate_limiter::RateLimiter;
20use crate::{CapabilityProfile, Ref, RefParseError};
21
22/// What a successful fetch returns to the caller.
23///
24/// Whether `pdf_bytes` is `None` depends on the source: metadata-only
25/// sources (Phase 4) leave it unset; OA sources (Phase 1) return PDF bytes
26/// when an OA URL was discovered.
27#[derive(Debug, Clone)]
28#[non_exhaustive]
29pub struct FetchResult {
30 /// Source's name (matches `Source::name()`); set for the audit trail.
31 pub source: String,
32 /// OA license string (`"CC-BY-4.0"`, `"unknown"`, etc.).
33 pub license: String,
34 /// PDF bytes; `None` for metadata-only sources.
35 pub pdf_bytes: Option<Bytes>,
36 /// Final URL after redirect resolution; useful for the metadata
37 /// `[doiget].url` field.
38 pub final_url: Option<url::Url>,
39 /// Source-side metadata payload as a serde_json value. The Source impl
40 /// is responsible for the shape; the caller (Phase 1+ orchestrator)
41 /// maps it into `Metadata` when one exists (Phase 1+).
42 pub metadata_json: Option<serde_json::Value>,
43}
44
45/// Per-fetch context shared by all `Source` impls.
46///
47/// Held by the orchestrator (CLI / MCP server) and passed by reference into
48/// each [`Source::fetch`]. Sources MUST NOT construct their own
49/// [`HttpClient`] / [`RateLimiter`] / [`ProvenanceLog`] — they go through
50/// this context for uniform politeness, redirect allowlisting, and audit
51/// logging.
52#[derive(Clone)]
53pub struct FetchContext {
54 /// Shared, allowlist-aware HTTP client. See [`HttpClient`].
55 pub http: Arc<HttpClient>,
56 /// Process-wide async rate limiter. See [`RateLimiter`].
57 pub rate_limiter: Arc<RateLimiter>,
58 /// Append-only, hash-chained provenance log. Source impls MUST emit
59 /// one `LogEvent::Fetch` row per attempt via `log.append`. See
60 /// [`ProvenanceLog`].
61 pub log: Arc<ProvenanceLog>,
62 /// 26-char ULID identifying this process invocation. Mirrors the
63 /// `session_id` stamped into every provenance row by the writer; held
64 /// here so source impls can include it in their own structured logs
65 /// without re-reading the env.
66 pub session_id: String,
67 /// Resolver cache root (`<cache_root>/resolver/<safekey>.toml`, see
68 /// `docs/CACHE.md` and [`crate::resolver_cache`]). `Some` enables the
69 /// metadata-only resolve cache (repeat resolves served from disk,
70 /// avoiding upstream rate limits); `None` disables it (tests, or a
71 /// caller that opts out). Only `metadata_only` consults it — per-PDF
72 /// fetches are never cached.
73 pub cache_root: Option<camino::Utf8PathBuf>,
74}
75
76impl std::fmt::Debug for FetchContext {
77 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
78 // Avoid printing the full HTTP / rate-limiter / log internals; only
79 // the session_id is human-meaningful for log breadcrumbs.
80 f.debug_struct("FetchContext")
81 .field("session_id", &self.session_id)
82 .finish_non_exhaustive()
83 }
84}
85
86/// Errors returned by [`Source::fetch`].
87///
88/// At the public CLI / MCP boundary, every variant collapses to an
89/// [`crate::ErrorCode`] via the `From<FetchError>` impl below — mirroring
90/// the [`RefParseError`] → [`crate::ErrorCode::InvalidRef`] collapse from
91/// PR #55.
92#[derive(Debug, Error)]
93#[non_exhaustive]
94pub enum FetchError {
95 /// The source does not handle the given ref under the runtime
96 /// capability profile (covers both `can_serve = false` outcomes and
97 /// runtime denials raised inside `fetch`).
98 #[error("source {source_key} cannot serve this ref")]
99 NotEligible {
100 /// The source key that declined.
101 source_key: String,
102 },
103 /// Tier 1 sources reported no OA URL for this ref.
104 #[error("Tier 1 sources reported no OA URL for this ref")]
105 NoOaAvailable,
106 /// Underlying HTTP / network failure. See [`HttpError`].
107 #[error("network error: {0}")]
108 Http(#[from] HttpError),
109 /// Provenance log write failed. Per `docs/SECURITY.md` §1.8 this is a
110 /// fail-closed signal; the surrounding fetch MUST be aborted.
111 #[error("provenance log error: {0}")]
112 Log(#[from] LogError),
113 /// Ref re-parse / validation failed inside the source (e.g. when a
114 /// source receives a borrowed string from upstream and re-validates).
115 #[error("invalid ref: {0}")]
116 InvalidRef(#[from] RefParseError),
117 /// Source-side schema mismatch (unexpected JSON shape, missing
118 /// required field). Surfaces to [`crate::ErrorCode::InternalError`]
119 /// at the public boundary.
120 #[error("source-side schema error: {hint}")]
121 SourceSchema {
122 /// Human-readable hint at the offending field/path; not parsed.
123 hint: String,
124 },
125 /// Batch orchestrator received more refs than
126 /// [`crate::MAX_BATCH_REFS`]. Surfaced to the MCP `doiget_batch_fetch`
127 /// tool as `ErrorCode::InvalidRef` (closest closed-set fit — the
128 /// request shape itself is invalid; no `denial_context` channel
129 /// applies). Slice 2 / `docs/MCP_TOOLS.md` §1.
130 #[error("too many refs: got {got}, max {max}")]
131 TooManyRefs {
132 /// Number of refs the batch orchestrator was handed.
133 got: usize,
134 /// The hard cap ([`crate::MAX_BATCH_REFS`]).
135 max: usize,
136 },
137}
138
139/// Map [`FetchError`] to the closed [`crate::ErrorCode`] set surfaced at
140/// the public CLI / MCP boundary. Mirrors the
141/// `From<RefParseError> for ErrorCode` collapse from PR #55.
142impl From<FetchError> for crate::ErrorCode {
143 fn from(e: FetchError) -> crate::ErrorCode {
144 crate::ErrorCode::from(&e)
145 }
146}
147
148/// Borrow-form of the collapse above, so a caller that still needs the
149/// error for its `Display` message / `denial_context` side-channel
150/// (notably the CLI human-persona renderer, issue #119) can obtain the
151/// closed code without consuming it. The owned impl delegates here so
152/// the mapping table lives in exactly one place.
153impl From<&FetchError> for crate::ErrorCode {
154 fn from(e: &FetchError) -> crate::ErrorCode {
155 match e {
156 FetchError::NotEligible { .. } => crate::ErrorCode::CapabilityDenied,
157 FetchError::NoOaAvailable => crate::ErrorCode::NoOaAvailable,
158 FetchError::Http(_) => crate::ErrorCode::NetworkError,
159 FetchError::Log(_) => crate::ErrorCode::LogError,
160 FetchError::InvalidRef(_) => crate::ErrorCode::InvalidRef,
161 FetchError::SourceSchema { .. } => crate::ErrorCode::InternalError,
162 // Slice 2: a too-large batch is a request-shape failure, so
163 // collapse to `INVALID_REF` (closest closed-set fit). The
164 // `#[non_exhaustive]` wildcard below would otherwise route
165 // it to `INTERNAL_ERROR`, which would mislead agents.
166 FetchError::TooManyRefs { .. } => crate::ErrorCode::InvalidRef,
167 }
168 }
169}
170
171/// Map a [`FetchError`] reference to the structured [`crate::DenialContext`]
172/// channel introduced by ADR-0023 §4.
173///
174/// `&FetchError` (rather than `FetchError`) so the orchestrator can
175/// produce the structured side-channel without consuming the error it
176/// still needs for `error.message` and the `From<FetchError> for
177/// ErrorCode` collapse above. The `Http` arm delegates to the
178/// `From<&HttpError> for Option<DenialContext>` impl in [`crate::http`].
179impl From<&FetchError> for Option<crate::DenialContext> {
180 fn from(e: &FetchError) -> Self {
181 use crate::{DenialContext, DenialReason};
182 match e {
183 FetchError::NotEligible { source_key } => Some(DenialContext {
184 reason: DenialReason::CapabilityNotGranted,
185 source: Some(source_key.clone()),
186 attempted: None,
187 // CapabilityNotGranted has no allowlist channel: the
188 // producer leaves `expected` at `None` (NOT `Some(vec![])`).
189 // See `DenialContext::expected` for the disambiguation.
190 expected: None,
191 hop_index: None,
192 cap: None,
193 actual: None,
194 }),
195 // Delegate to the HttpError mapping (ADR-0023 §4 mapping table).
196 FetchError::Http(http_err) => http_err.into(),
197 // Non-denial variants map to None per ADR-0023 §4. (Slice 2:
198 // `TooManyRefs` is a request-shape failure, not a denial —
199 // adding it to the None arm keeps the mapping table consistent.)
200 FetchError::NoOaAvailable
201 | FetchError::Log(_)
202 | FetchError::InvalidRef(_)
203 | FetchError::SourceSchema { .. }
204 | FetchError::TooManyRefs { .. } => None,
205 }
206 }
207}
208
209/// The trait implemented by every Tier 1 / 2 / 3 fetcher.
210///
211/// Binding signature: `docs/PUBLIC_API.md` §2 (NORMATIVE — the wire shape
212/// of these three methods is semver-locked).
213#[async_trait]
214pub trait Source: Send + Sync {
215 /// Stable name used in metadata (`[doiget].source`) and provenance
216 /// rows. Conventional values: `"crossref"`, `"unpaywall"`, `"arxiv"`,
217 /// `"openalex"`, `"semantic-scholar"`, `"doaj"`, `"tdm-elsevier"`,
218 /// etc. (see `docs/SOURCES.md`).
219 fn name(&self) -> &str;
220
221 /// True if this source can plausibly serve the given ref under the
222 /// runtime capability profile. Implementations MUST be fast and
223 /// non-blocking; the orchestrator calls `can_serve` to decide whether
224 /// to invoke `fetch` at all.
225 fn can_serve(&self, profile: &CapabilityProfile, ref_: &Ref) -> bool;
226
227 /// Perform the source-specific fetch.
228 ///
229 /// Implementations:
230 /// 1. acquire `ctx.rate_limiter.acquire(self.name()).await`,
231 /// 2. fetch via `ctx.http.fetch_bytes` / `ctx.http.fetch_pdf`,
232 /// 3. emit one `LogEvent::Fetch` row via `ctx.log.append`,
233 /// 4. return a [`FetchResult`].
234 ///
235 /// The trait does NOT enforce these steps; it documents the protocol
236 /// so concrete impls produce uniform audit trails (per
237 /// `docs/ARCHITECTURE.md` §6 and `docs/PROVENANCE_LOG.md` §3).
238 async fn fetch(
239 &self,
240 ref_: &Ref,
241 profile: &CapabilityProfile,
242 ctx: &FetchContext,
243 ) -> Result<FetchResult, FetchError>;
244}
245
246// ---------------------------------------------------------------------------
247// Tests
248// ---------------------------------------------------------------------------
249
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;

    use crate::http::{tier_1_allowlist, HttpClient};
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{CapabilityProfile, Doi, ErrorCode, RateLimits, Ref};

    /// Minimal `Source` impl exercised purely to pin the trait shape and
    /// verify dispatch through `Box<dyn Source>`. Concrete sources land in
    /// follow-up PRs (Crossref / Unpaywall / arXiv).
    struct MockSource;

    #[async_trait]
    impl Source for MockSource {
        fn name(&self) -> &str {
            "mock"
        }
        fn can_serve(&self, _: &CapabilityProfile, _: &Ref) -> bool {
            true
        }
        async fn fetch(
            &self,
            _: &Ref,
            _: &CapabilityProfile,
            _: &FetchContext,
        ) -> Result<FetchResult, FetchError> {
            Ok(FetchResult {
                source: "mock".into(),
                license: "unknown".into(),
                pdf_bytes: None,
                final_url: None,
                metadata_json: None,
            })
        }
    }

    /// Build a `FetchContext` backed by real (but inert) Round-A
    /// foundation modules: a `HttpClient` over the Tier-1 allowlist, a
    /// `RateLimiter` at hard-coded politeness, and a `ProvenanceLog` in
    /// a tempdir. Returns the dir as well so the caller keeps it alive
    /// for the duration of the test.
    fn build_test_context() -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        // Workspace lints ban `std::path::PathBuf` for log paths; convert
        // via camino's `Utf8PathBuf::try_from`.
        let log_dir =
            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
        let log_path = log_dir.join("test.jsonl");

        let http = Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client builds"));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let session_id = "01J0000000000000000000TEST".to_string();
        let log = Arc::new(
            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
        );

        (
            td,
            FetchContext {
                http,
                rate_limiter,
                log,
                session_id,
                cache_root: None,
            },
        )
    }

    #[tokio::test]
    async fn mock_source_compiles_as_trait_object() {
        // Trait-shape pin: a `Source` impl is dyn-safe and can be boxed.
        let s: Box<dyn Source> = Box::new(MockSource);
        assert_eq!(s.name(), "mock");
        let profile = CapabilityProfile::from_env().expect("Phase 0 stub");
        let r = Ref::Doi(Doi("10.1234/example".to_string()));
        assert!(s.can_serve(&profile, &r));

        let (_td, ctx) = build_test_context();
        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
        assert_eq!(res.source, "mock");
    }

    #[tokio::test]
    async fn mock_source_fetch_returns_result() {
        // Direct dispatch (not through `dyn`) to exercise the async fn
        // body and assert the populated FetchResult fields.
        let s = MockSource;
        let profile = CapabilityProfile::from_env().expect("Phase 0 stub");
        let r = Ref::Doi(Doi("10.1234/example".to_string()));
        let (_td, ctx) = build_test_context();

        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
        assert_eq!(res.source, "mock");
        assert_eq!(res.license, "unknown");
        assert!(res.pdf_bytes.is_none());
        assert!(res.final_url.is_none());
        assert!(res.metadata_json.is_none());
    }

    #[test]
    fn fetch_error_collapses_to_error_code() {
        // Mirrors `docs/PUBLIC_API.md` §4 / PR #55 boundary collapse.
        // Each variant must map to its documented code.
        let e: ErrorCode = FetchError::NotEligible {
            source_key: "mock".into(),
        }
        .into();
        assert_eq!(e, ErrorCode::CapabilityDenied);

        let e: ErrorCode = FetchError::NoOaAvailable.into();
        assert_eq!(e, ErrorCode::NoOaAvailable);

        let e: ErrorCode = FetchError::Http(HttpError::UnknownSource {
            source_key: "mock".into(),
        })
        .into();
        assert_eq!(e, ErrorCode::NetworkError);

        let e: ErrorCode = FetchError::Log(LogError::Io(std::io::Error::other("synthetic"))).into();
        assert_eq!(e, ErrorCode::LogError);

        let e: ErrorCode = FetchError::InvalidRef(RefParseError::Empty).into();
        assert_eq!(e, ErrorCode::InvalidRef);

        let e: ErrorCode = FetchError::SourceSchema {
            hint: "missing field 'license'".into(),
        }
        .into();
        assert_eq!(e, ErrorCode::InternalError);

        // Slice 2 — TooManyRefs is a request-shape failure and must
        // collapse to INVALID_REF, NOT InternalError.
        let e: ErrorCode = FetchError::TooManyRefs { got: 101, max: 100 }.into();
        assert_eq!(e, ErrorCode::InvalidRef);
    }

    #[test]
    fn fetch_error_collapses_by_reference_without_consuming() {
        // The borrow-form collapse exists so a caller can obtain the code
        // and still render the error afterwards (issue #119). Pin both
        // halves: the code is correct and the error remains usable.
        let e = FetchError::NoOaAvailable;
        let code = ErrorCode::from(&e);
        assert_eq!(code, ErrorCode::NoOaAvailable);
        assert_eq!(
            e.to_string(),
            "Tier 1 sources reported no OA URL for this ref"
        );

        let e = FetchError::TooManyRefs { got: 101, max: 100 };
        let code = ErrorCode::from(&e);
        assert_eq!(code, ErrorCode::InvalidRef);
        assert_eq!(e.to_string(), "too many refs: got 101, max 100");
    }

    #[test]
    fn fetch_context_debug_redacts_internals() {
        // Pin the Debug shape — only `session_id` is printed, the rest is
        // elided. Prevents accidental log leakage when a context is
        // included in a `tracing::debug!` event.
        let (_td, ctx) = build_test_context();
        let s = format!("{:?}", ctx);
        assert!(
            s.contains("session_id"),
            "session_id must be in Debug: {}",
            s
        );
        assert!(s.contains("01J0000000000000000000TEST"));
        assert!(
            !s.contains("HttpClient") && !s.contains("RateLimiter") && !s.contains("ProvenanceLog"),
            "FetchContext Debug must not dump foundation internals: {}",
            s,
        );
    }

    // ---------------------------------------------------------------
    // FetchError -> Option<DenialContext> (ADR-0023 §4)
    // ---------------------------------------------------------------

    #[test]
    fn denial_from_not_eligible_carries_source_key() {
        use crate::{DenialContext, DenialReason};
        let e = FetchError::NotEligible {
            source_key: "tdm-elsevier".to_string(),
        };
        let dc: Option<DenialContext> = (&e).into();
        let dc = dc.expect("NotEligible -> Some(DenialContext)");
        assert_eq!(dc.reason, DenialReason::CapabilityNotGranted);
        assert_eq!(dc.source.as_deref(), Some("tdm-elsevier"));
        assert!(dc.attempted.is_none());
        // Post-refinement: `expected: None` ("producer did not populate")
        // rather than `Some(vec![])` ("explicit empty allowlist"). See
        // `DenialContext::expected` field doc for the disambiguation.
        assert!(dc.expected.is_none());
    }

    #[test]
    fn denial_from_http_delegates_to_http_mapping() {
        use crate::http::HttpError;
        use crate::{DenialContext, DenialReason, PDF_MAX_BYTES};
        // The Http arm must delegate to the HttpError mapping rather than
        // reinventing it, so an OversizedBody surfaces with cap/actual
        // populated and the SizeCapExceeded reason — proving delegation
        // works without per-variant duplication.
        let e = FetchError::Http(HttpError::OversizedBody {
            actual: 209_715_200,
            cap: PDF_MAX_BYTES,
        });
        let dc: Option<DenialContext> = (&e).into();
        let dc = dc.expect("Http(OversizedBody) -> Some(DenialContext)");
        assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
        assert_eq!(dc.cap, Some(PDF_MAX_BYTES));
        assert_eq!(dc.actual, Some(209_715_200));
    }

    #[test]
    fn denial_from_non_denial_variants_returns_none() {
        use crate::DenialContext;
        // Every non-denial FetchError arm maps to None per ADR-0023 §4.
        let e = FetchError::NoOaAvailable;
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "NoOaAvailable must not produce DenialContext");

        let e = FetchError::Log(LogError::Io(std::io::Error::other("synthetic")));
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "Log must not produce DenialContext");

        let e = FetchError::InvalidRef(RefParseError::Empty);
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "InvalidRef must not produce DenialContext");

        let e = FetchError::SourceSchema {
            hint: "missing field 'license'".into(),
        };
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "SourceSchema must not produce DenialContext");

        // Slice 2 — an over-sized batch is a request-shape failure, not a
        // denial, so it must not produce a DenialContext either.
        let e = FetchError::TooManyRefs { got: 101, max: 100 };
        let dc: Option<DenialContext> = (&e).into();
        assert!(dc.is_none(), "TooManyRefs must not produce DenialContext");
    }
}