doiget_core/orchestrator.rs
1//! Cross-source orchestrators that compose multiple [`Source`] impls into
2//! a single user-facing operation.
3//!
4//! Slice 2 of the doiget roadmap promotes [`fetch_paper`] and
5//! [`batch_fetch`] from `doiget-cli` into this module so the MCP server
6//! (`doiget-mcp`) and the CLI share one source of truth for the per-ref
7//! orchestration. The CLI's `commands::fetch::fetch_one` is now a thin
8//! wrapper that delegates here and adds the human-facing stderr print
9//! line. Dry-run preview helpers live as [`fetch_paper_plan`] and
10//! [`batch_fetch_plans`].
11//!
12//! [`Source`]: crate::source::Source
13
14use std::collections::BTreeMap;
15
16use camino::{Utf8Path, Utf8PathBuf};
17use chrono::Utc;
18use serde_json::Value;
19
20use crate::dry_run::{build_fetch_plan, try_build_fetch_plan, FetchPlan};
21use crate::http::HttpError;
22use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
23use crate::source::{FetchContext, FetchError, FetchResult, Source};
24use crate::sources::arxiv::ArxivSource;
25use crate::sources::crossref::CrossrefSource;
26use crate::sources::unpaywall::UnpaywallSource;
27use crate::store::{DoigetExtension, Metadata, Store};
28use crate::{ArxivId, CapabilityProfile, Doi, Ref, Safekey, MAX_BATCH_REFS, SCHEMA_VERSION};
29
/// Outcome of a successful [`metadata_only`] call.
///
/// Mirrors the wire shape documented in `docs/MCP_TOOLS.md` §11: the
/// `source` identifies which resolver produced the metadata, `license`
/// is the license string when the resolver supplied one (the Unpaywall
/// fallback's value, or the fixed `"arxiv-default"` token for arXiv
/// refs), `oa_url` is the discovered OA URL **(never followed by this
/// orchestrator)**, and `metadata` is the source's native JSON payload
/// (Crossref `message`, Unpaywall work record, or the parsed arXiv
/// Atom-feed object).
///
/// `metadata` is serialized as-is by the MCP envelope builder
/// (`crates/doiget-mcp/src/lib.rs`); we deliberately do NOT normalize
/// here so the agent can see exactly what the source returned.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct MetadataOnlyOutcome {
    /// Resolver key that produced the metadata payload. One of
    /// `"crossref"`, `"unpaywall"`, `"arxiv"` (the closed set named in
    /// `docs/MCP_TOOLS.md` §11 type alias).
    pub source: String,
    /// Resolver profile under which the canonical-digest (ADR-0021 §1)
    /// was minted for this call. In Slice 4 this equals
    /// [`Self::source`] verbatim (the metadata-only path emits one row
    /// per consulted resolver); future slices that introduce overlapping
    /// resolvers MAY have `resolver_profile != source`. Surfaced through
    /// the `doiget_metadata_only` MCP envelope per ADR-0021 §4.
    pub resolver_profile: String,
    /// License string when the resolver could supply one:
    ///
    /// - Crossref (DOI primary path): always `None`.
    /// - Unpaywall (DOI fallback path): the source's license, with the
    ///   `"unknown"` sentinel mapped to `None`.
    /// - arXiv: always `Some("arxiv-default")`.
    pub license: Option<String>,
    /// Discovered OA URL — surfaced to the caller for separate action,
    /// **never followed by this orchestrator**. The Crossref path mines
    /// the response's `message.link[]` array; the Unpaywall fallback
    /// path uses `best_oa_location.url_for_pdf` (or `url`). Always
    /// `None` for arXiv refs.
    pub oa_url: Option<String>,
    /// Source's native metadata payload. For Crossref this is the
    /// `message` object; for Unpaywall the work record; for arXiv the
    /// parsed Atom-feed JSON (see
    /// `crate::sources::arxiv::parse_atom_feed`). On the DOI paths this
    /// is `Value::Null` when the source returned no JSON payload.
    pub metadata: Value,
}
71
72/// Resolve a [`Ref`] to metadata WITHOUT triggering a publisher PDF
73/// fetch.
74///
75/// Binding spec: `docs/MCP_TOOLS.md` §11 (NORMATIVE — this function
76/// MUST NOT call [`crate::http::HttpClient::fetch_pdf`] under any code
77/// path). The posture-lint workflow greps for that pattern; the test
78/// suite additionally exercises the DOI and arXiv branches end-to-end
79/// against wiremock to assert the OA URL is reported, not followed.
80///
81/// # Dispatch
82///
83/// - `Ref::Doi(_)` → Crossref first (bibliographic metadata + OA URL
84/// via `message.link[]`). If Crossref returns a usable payload the
85/// call returns immediately; Unpaywall is consulted only as a fallback
86/// when Crossref fails. The Unpaywall fallback surfaces a license
87/// string and may overwrite `oa_url` with the `best_oa_location`
88/// channel.
89/// - `Ref::Arxiv(_)` → [`ArxivSource::fetch_metadata_only`]: ONLY the
90/// Atom feed (`https://export.arxiv.org/api/query?id_list=<id>`) is
91/// consulted; the PDF endpoint is NOT touched. `license` is set to
92/// the platform-wide `"arxiv-default"` token, `oa_url` is `None`
93/// (the arXiv abstract page is not a PDF URL).
94///
95/// # Side effects
96///
97/// Each consulted source appends ONE `LogEvent::Fetch` row to
98/// `ctx.log` (arXiv emits its row under `Capability::Metadata`; the
99/// DOI sources emit under `Capability::Oa` — they pre-date this
100/// distinction and a follow-up slice may unify them). The orchestrator
101/// itself does NOT bracket the call with `SessionStart` / `SessionEnd`
102/// rows — that is the MCP server's responsibility (it owns the
103/// per-tool-call session boundary).
104///
105/// This function is the **pure resolver**: it consults the source(s)
106/// and emits provenance rows, but it does NOT write to the store.
107/// The `docs/MCP_TOOLS.md` §11 store-write SIDE EFFECT is provided by
108/// [`metadata_only_to_store`], which wraps this and persists the
109/// metadata TOML to `<root>/.metadata/<safekey>.toml`. Keeping the
110/// store-write in a *separate* entry point is exactly what lets
111/// [`resolve_only`] safely delegate here — its contract forbids any
112/// store write, and a pure `metadata_only` can never regress that
113/// invariant (#139).
114///
115/// # Errors
116///
117/// Returns [`FetchError`] from the underlying [`Source`] dispatch. The
118/// MCP boundary converts these to the closed [`crate::ErrorCode`] set
119/// via the existing `From<FetchError> for ErrorCode` impl.
120// Stays `pub` (a `pub(crate)` compile-time guard was considered and
121// rejected): `crates/doiget-core/tests/` integration tests
122// (`real_world_fixtures_e2e`) legitimately drive the PURE resolver
123// directly and assert its outcome, and `tests/` compiles as a separate
124// crate. The #139 pre-fix bug (an MCP caller
125// picking the pure variant when it needed persistence) is instead
126// prevented *structurally*: the MCP layer imports only
127// `metadata_only_to_store`, and `resolve_only` delegates to this pure
128// fn — neither can acquire or skip the store-write by mistake.
129pub async fn metadata_only(
130 ref_: &Ref,
131 profile: &CapabilityProfile,
132 ctx: &FetchContext,
133) -> Result<MetadataOnlyOutcome, FetchError> {
134 match ref_ {
135 Ref::Doi(doi) => metadata_only_doi(doi, ref_, profile, ctx).await,
136 Ref::Arxiv(id) => {
137 let arxiv = arxiv_source_from_env();
138 let metadata = arxiv.fetch_metadata_only(id, ctx).await?;
139 // Pure resolver — no store write here (see fn doc); the
140 // store-write side effect lives in `metadata_only_to_store`.
141 Ok(MetadataOnlyOutcome {
142 source: arxiv.name().to_string(),
143 resolver_profile: arxiv.name().to_string(),
144 license: Some("arxiv-default".to_string()),
145 oa_url: None,
146 metadata,
147 })
148 }
149 }
150}
151
152/// Resolve a [`Ref`] to metadata with **no local persistence**.
153///
154/// This is the audit-trail-preserving sibling of [`metadata_only`]: each
155/// consulted [`Source`] still emits its own `LogEvent::Fetch` row
156/// through `ctx.log` (so the provenance hash chain remains continuous,
157/// per `docs/PROVENANCE_LOG.md`), but the orchestrator MUST NOT write
158/// the metadata TOML to the store under any code path — present or
159/// future.
160///
161/// Binding spec: `docs/MCP_TOOLS.md` §1 (the `doiget_resolve_paper`
162/// tool — Slice 7).
163///
164/// # Why this exists as a distinct orchestrator
165///
166/// [`metadata_only`] is the **pure resolver** and never writes to the
167/// store; the store-write SIDE EFFECT lives only in the separate
168/// [`metadata_only_to_store`] wrapper. Because the write is in a
169/// *different* entry point that this function does not call,
170/// delegating to [`metadata_only`] is permanently safe — there is no
171/// code path by which `resolve_only` can acquire a store write, now or
172/// in future (#139). This structural separation is the entire reason
173/// `metadata_only` was split into a pure core + a persisting wrapper
174/// rather than gaining a `write: bool` parameter.
175///
176/// # Dispatch
177///
178/// Identical to [`metadata_only`] (DOI → Crossref-first with Unpaywall
179/// fallback; arXiv → Atom feed only). The `oa_url` and `license`
180/// outputs follow the same rules.
181///
182/// # Side effects
183///
184/// One `LogEvent::Fetch` row per consulted resolver, written by the
185/// underlying [`Source`] impls. No metadata TOML write. No PDF fetch.
186/// No store mutation.
187///
188/// # Errors
189///
190/// Returns [`FetchError`] from the underlying [`Source`] dispatch,
191/// identical to [`metadata_only`].
192pub async fn resolve_only(
193 ref_: &Ref,
194 profile: &CapabilityProfile,
195 ctx: &FetchContext,
196) -> Result<MetadataOnlyOutcome, FetchError> {
197 // Delegating to the PURE `metadata_only` is the contract-correct
198 // implementation, not a placeholder: `metadata_only` never writes
199 // to the store (the persisting path is the separate
200 // `metadata_only_to_store`, which this function does not call), so
201 // `resolve_only`'s "no store mutation" guarantee holds structurally
202 // and cannot regress (#139).
203 metadata_only(ref_, profile, ctx).await
204}
205
206/// Resolve a [`Ref`] to metadata **and persist the metadata TOML to the
207/// store** — the `docs/MCP_TOOLS.md` §11 `doiget_metadata_only` SIDE
208/// EFFECT (#139).
209///
210/// Wraps the pure [`metadata_only`]: it runs the same resolver dispatch
211/// (so the provenance hash chain is identical), then writes
212/// `<root>/.metadata/<safekey>.toml` via the same
213/// `write_metadata_and_pdf` path `fetch_paper` uses for its
214/// metadata-only fallback, emitting one `StoreWrite` provenance row.
215///
216/// [`resolve_only`] MUST NOT call this — its contract forbids any store
217/// write. The split (pure core vs. persisting wrapper) makes that
218/// invariant structural rather than a convention.
219///
220/// # Errors
221///
222/// [`FetchError`] from the underlying resolver dispatch, or — if the
223/// store write fails — [`FetchError::SourceSchema`] (the closest
224/// closed-set arm; there is no dedicated `FetchError::StoreError`, so
225/// the MCP boundary maps it to `INTERNAL_ERROR` — see the inline note
226/// in `write_metadata_and_pdf`). On store-write failure
227/// `write_metadata_and_pdf` makes a **best-effort** attempt to
228/// append a `StoreWrite`/`Err` provenance row before the error
229/// propagates (that append's own failure is not separately surfaced —
230/// this matches the pre-existing `fetch_paper` metadata-only fallback
231/// path and is out of scope for #139).
232pub async fn metadata_only_to_store(
233 ref_: &Ref,
234 profile: &CapabilityProfile,
235 ctx: &FetchContext,
236 store: &dyn Store,
237) -> Result<MetadataOnlyOutcome, FetchError> {
238 let outcome = metadata_only(ref_, profile, ctx).await?;
239 let safekey = ref_.safekey();
240 let metadata = build_metadata_only_metadata(ref_, &outcome);
241 // `pdf_src = None` => writes `<root>/.metadata/<safekey>.toml` and
242 // appends the `StoreWrite` row (the exact path `fetch_paper` uses
243 // for its DOI metadata-only fallback).
244 write_metadata_and_pdf(store, &safekey, &metadata, None, ctx)?;
245 Ok(outcome)
246}
247
248/// Build the [`Metadata`] persisted by [`metadata_only_to_store`].
249///
250/// Minimal but valid: enough that a subsequent `doiget_info` returns a
251/// non-null `metadata` object (the #139 acceptance criterion). Title is
252/// best-effort from the resolver payload (`title` as a string, or the
253/// first element if it is an array — Crossref's `message.title` is
254/// typically an array, arXiv/Unpaywall typically a string; the
255/// extractor tolerates either regardless of source); it falls back to
256/// the ref id so the required `title` field is never empty.
257/// Bibliographic enrichment
258/// (year, venue, …) is intentionally out of scope here — the
259/// metadata-only contract is "persist what the resolver returned", and
260/// the raw payload is preserved verbatim in `MetadataOnlyOutcome`.
261fn build_metadata_only_metadata(ref_: &Ref, outcome: &MetadataOnlyOutcome) -> Metadata {
262 let (doi, arxiv_id) = match ref_ {
263 Ref::Doi(d) => (Some(d.clone()), None),
264 Ref::Arxiv(a) => (None, Some(a.clone())),
265 };
266 let ref_id = ref_.as_input_str().to_string();
267 let title = match extract_metadata_title(&outcome.metadata) {
268 Some(t) => t,
269 None => {
270 // The resolver returned a payload with no usable title.
271 // Persisting the ref id keeps the entry valid (#139), but
272 // emit a diagnostic so a broken/partial resolver response is
273 // not silently indistinguishable from a genuine title.
274 tracing::warn!(
275 ref_id = %ref_id,
276 source = %outcome.source,
277 "metadata-only: no usable title in resolver payload; \
278 persisting the ref id as the title placeholder"
279 );
280 ref_id
281 }
282 };
283 Metadata {
284 schema_version: SCHEMA_VERSION.to_string(),
285 title,
286 authors: extract_metadata_authors(&outcome.metadata),
287 year: None,
288 doi,
289 arxiv_id,
290 abstract_: None,
291 venue: None,
292 publisher: None,
293 issn: None,
294 isbn: None,
295 type_: None,
296 keywords: Vec::new(),
297 url: outcome.oa_url.clone(),
298 pdf_path: None,
299 doiget: Some(DoigetExtension {
300 fetched_at: Utc::now(),
301 source: outcome.source.clone(),
302 license: outcome
303 .license
304 .clone()
305 .unwrap_or_else(|| "unknown".to_string()),
306 size_bytes: 0,
307 mcp_call_id: None,
308 }),
309 other: BTreeMap::new(),
310 }
311}
312
313/// `title` from a resolver payload: a bare string, or the first
314/// **non-blank** element of an array (Crossref `message.title` is
315/// `[String]`; a leading empty/whitespace element is skipped rather
316/// than masking the real title). Trimmed. `None` if absent/blank.
317fn extract_metadata_title(meta: &Value) -> Option<String> {
318 let t = meta.get("title")?;
319 let s = match t.as_str() {
320 Some(s) => s.trim().to_string(),
321 None => t
322 .as_array()?
323 .iter()
324 .filter_map(Value::as_str)
325 .map(str::trim)
326 .find(|s| !s.is_empty())?
327 .to_string(),
328 };
329 if s.is_empty() {
330 None
331 } else {
332 Some(s)
333 }
334}
335
336/// Best-effort author list, tolerant of the resolver shapes we may see:
337/// Crossref `author: [{given,family}]`, arXiv `authors: [String]`, and
338/// a `z_authors: [{given,family}]` fallback. NOTE: doiget's Unpaywall
339/// source deserializes a *partial* `UnpaywallWork` that does not capture
340/// `z_authors`, so the `z_authors` branch is currently inert for the
341/// Unpaywall path (kept as forward-compat for if/when that struct
342/// captures it) — Unpaywall-sourced metadata-only entries get an empty
343/// author list. Returns `Vec::new()` when nothing is parseable (a valid
344/// metadata TOML — #139 only requires the entry to exist and be
345/// readable).
346fn extract_metadata_authors(meta: &Value) -> Vec<String> {
347 if let Some(arr) = meta.get("authors").and_then(Value::as_array) {
348 let v: Vec<String> = arr
349 .iter()
350 .filter_map(|a| a.as_str().map(str::to_string))
351 .collect();
352 if !v.is_empty() {
353 return v;
354 }
355 }
356 for key in ["author", "z_authors"] {
357 if let Some(arr) = meta.get(key).and_then(Value::as_array) {
358 let v: Vec<String> = arr
359 .iter()
360 .filter_map(|a| {
361 let given = a.get("given").and_then(Value::as_str).unwrap_or("");
362 let family = a.get("family").and_then(Value::as_str).unwrap_or("");
363 let name = format!("{given} {family}");
364 let name = name.trim();
365 if name.is_empty() {
366 a.get("name").and_then(Value::as_str).map(str::to_string)
367 } else {
368 Some(name.to_string())
369 }
370 })
371 .collect();
372 if !v.is_empty() {
373 return v;
374 }
375 }
376 }
377 Vec::new()
378}
379
380// ---------------------------------------------------------------------------
381// Env-aware source constructors (mirrors doiget-cli::commands::fetch::build_*)
382//
383// These let MCP integration tests redirect the orchestrator at a
384// wiremock origin via `DOIGET_*_BASE` env vars, without inverting the
385// `doiget-mcp -> doiget-core` wiring by depending on `doiget-cli`. The
386// override surface is identical to the CLI's `fetch.rs::build_*_source`
387// helpers so a single test fixture can drive both crates.
388// ---------------------------------------------------------------------------
389
/// Contact address used when `DOIGET_CONTACT_EMAIL` is not usable; the
/// same `doiget@localhost` default the CLI uses
/// (`crates/doiget-cli/src/commands/fetch.rs::OrchestratorConfig`).
const FALLBACK_CONTACT_EMAIL: &str = "doiget@localhost";

/// Read the contact e-mail from `DOIGET_CONTACT_EMAIL`.
///
/// The value is trimmed. The fallback [`FALLBACK_CONTACT_EMAIL`] is
/// returned when the variable is unset, is not valid Unicode, or is
/// blank after trimming — a set-but-empty variable must not produce an
/// empty contact string for the sources built from it.
fn contact_email_from_env() -> String {
    std::env::var("DOIGET_CONTACT_EMAIL")
        .ok()
        .map(|raw| raw.trim().to_string())
        .filter(|email| !email.is_empty())
        .unwrap_or_else(|| FALLBACK_CONTACT_EMAIL.to_string())
}
397
398fn arxiv_source_from_env() -> ArxivSource {
399 if let Ok(s) = std::env::var("DOIGET_ARXIV_BASE") {
400 if let Ok(url) = url::Url::parse(&s) {
401 return ArxivSource::with_base(url);
402 }
403 }
404 ArxivSource::new()
405}
406
407fn crossref_source_from_env(contact: &str) -> CrossrefSource {
408 if let Ok(s) = std::env::var("DOIGET_CROSSREF_BASE") {
409 if let Ok(url) = url::Url::parse(&s) {
410 return CrossrefSource::with_base(url, contact.to_string());
411 }
412 }
413 CrossrefSource::new(contact.to_string())
414}
415
416fn unpaywall_source_from_env(contact: &str) -> UnpaywallSource {
417 if let Ok(s) = std::env::var("DOIGET_UNPAYWALL_BASE") {
418 if let Ok(url) = url::Url::parse(&s) {
419 return UnpaywallSource::with_base(url, contact.to_string());
420 }
421 }
422 UnpaywallSource::new(contact.to_string())
423}
424
425/// DOI branch — Crossref first, with Unpaywall as a fallback when
426/// Crossref fails. Crossref's `message.link[]` array (when present)
427/// supplies the OA URL hint without making a publisher request.
428async fn metadata_only_doi(
429 _doi: &Doi,
430 ref_: &Ref,
431 profile: &CapabilityProfile,
432 ctx: &FetchContext,
433) -> Result<MetadataOnlyOutcome, FetchError> {
434 let contact = contact_email_from_env();
435 let crossref = crossref_source_from_env(&contact);
436 match crossref.fetch(ref_, profile, ctx).await {
437 Ok(res) => {
438 let metadata = res.metadata_json.unwrap_or(Value::Null);
439 let oa_url = extract_crossref_oa_url(&metadata);
440 // Pure resolver — no store write here (see `metadata_only`
441 // doc); persistence is `metadata_only_to_store`'s job.
442 Ok(MetadataOnlyOutcome {
443 source: crossref.name().to_string(),
444 resolver_profile: crossref.name().to_string(),
445 // Crossref does not surface a license directly; the
446 // license channel for DOI metadata is Unpaywall's
447 // `best_oa_location.license`. Leave `None` here; the
448 // agent can call `unpaywall` (or a follow-up slice's
449 // chained orchestrator) if it needs a license string.
450 license: None,
451 oa_url,
452 metadata,
453 })
454 }
455 Err(crossref_err) => {
456 // Crossref failed. Try Unpaywall as a fallback before
457 // surfacing the original error.
458 let unpaywall = unpaywall_source_from_env(&contact);
459 match unpaywall.fetch(ref_, profile, ctx).await {
460 Ok(res) => {
461 let metadata = res.metadata_json.unwrap_or(Value::Null);
462 let oa_url = extract_unpaywall_oa_url(&metadata);
463 let license = if res.license == "unknown" {
464 None
465 } else {
466 Some(res.license)
467 };
468 Ok(MetadataOnlyOutcome {
469 source: unpaywall.name().to_string(),
470 resolver_profile: unpaywall.name().to_string(),
471 license,
472 oa_url,
473 metadata,
474 })
475 }
476 Err(_unpaywall_err) => {
477 // Both sources failed; surface the Crossref error
478 // (the primary path) for diagnosability.
479 Err(crossref_err)
480 }
481 }
482 }
483 }
484}
485
486/// Defensively pull a Crossref OA URL out of a `message.link[]` entry.
487///
488/// The Crossref `Link` model documents `link[].URL` as the OA URL string
489/// when the work has one (see
490/// `<https://api.crossref.org/swagger-ui/index.html>`). Multiple entries
491/// may be present; we return the first non-empty `URL` field
492/// encountered. Returns `None` if the array is missing, empty, or
493/// contains no usable URL string.
494fn extract_crossref_oa_url(msg: &Value) -> Option<String> {
495 let arr = msg.get("link")?.as_array()?;
496 arr.iter()
497 .filter_map(|entry| entry.get("URL").and_then(Value::as_str))
498 .find(|s| !s.is_empty())
499 .map(|s| s.to_string())
500}
501
502/// Defensively pull Unpaywall's preferred OA URL
503/// (`best_oa_location.url_for_pdf`, falling back to `.url`) out of a
504/// metadata payload.
505fn extract_unpaywall_oa_url(meta: &Value) -> Option<String> {
506 let loc = meta.get("best_oa_location")?;
507 loc.get("url_for_pdf")
508 .and_then(Value::as_str)
509 .or_else(|| loc.get("url").and_then(Value::as_str))
510 .map(|s| s.to_string())
511}
512
513// ---------------------------------------------------------------------------
514// fetch_paper — single-ref orchestrator (Slice 2)
515// ---------------------------------------------------------------------------
516
/// Outcome of the DOI OA-PDF leg, carried on [`FetchPaperOutcome`] so a
/// caller can NEVER silently report a blocked PDF as a plain
/// "metadata-only" success (issue #118). The product promise is
/// "immediately explain WHY a paper can't be fetched" — the distinction
/// between "there was no OA PDF to fetch" and "an OA PDF existed but we
/// were blocked, and here is the reason" is exactly that explanation.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum PdfLegStatus {
    /// A PDF was fetched and written to disk (arXiv always; DOI when
    /// the OA-publisher leg succeeded).
    Fetched,
    /// No OA URL was discovered (Unpaywall reported no
    /// `best_oa_location`). Metadata-only is the correct, expected
    /// result here — not a failure.
    NoOaUrl,
    /// An OA URL *was* discovered but the PDF could not be retrieved
    /// (host outside the oa-publisher allowlist, not-a-PDF body,
    /// transport failure, …). Metadata was still written, but the
    /// caller MUST surface this reason rather than pretending the
    /// fetch was a clean metadata-only success.
    Blocked {
        /// Closed-set code, mapped from the underlying transport error
        /// via the canonical `From<FetchError> for ErrorCode`.
        code: crate::ErrorCode,
        /// Human-readable one-line reason (the `FetchError` display).
        message: String,
        /// Structured denial side-channel (ADR-0023) when the failure
        /// was an allowlist / scheme denial; `None` otherwise.
        denial: Option<crate::DenialContext>,
    },
}

/// Outcome of a successful [`fetch_paper`] call: what was written to
/// disk and how.
///
/// Wire shape mirrors `docs/MCP_TOOLS.md` §5 `FetchResult` minus the
/// envelope chrome the MCP server wraps it in (`ok: true`, `ref`,
/// optional `error`).
///
/// `path` is the absolute path of the resource the orchestrator wrote to
/// the store. For arXiv refs and successful DOI OA-PDF fetches this is
/// `<root>/<safekey>.pdf`; for the DOI metadata-only fallback (OA URL
/// host off the `oa-publisher` allowlist, or PDF leg failed for another
/// transport reason — `docs/REDIRECT_ALLOWLIST.md` §3 informed-best-
/// effort posture) this is `<root>/.metadata/<safekey>.toml`.
/// [`Self::pdf_leg`] disambiguates *why* there is no PDF (genuinely
/// none available vs. available-but-blocked) so callers never report a
/// blocked PDF as a silent success (issue #118).
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct FetchPaperOutcome {
    /// `Source::name()` of the resolver whose payload landed on disk:
    /// `"arxiv"` for an arXiv ref, `"oa-publisher"` when the DOI OA PDF
    /// leg succeeded, or `"crossref"` / `"unpaywall"` when the DOI path
    /// fell back to metadata-only. Mirrors the value written to
    /// `[doiget].source` in the metadata TOML.
    pub source: String,
    /// Resolver profile under which the canonical-digest (ADR-0021 §1)
    /// was minted for the final artifact. For an arXiv fetch this is
    /// `"arxiv"`; for a successful DOI OA PDF leg this is
    /// `"oa-publisher"`; for the DOI metadata-only fallback this is the
    /// metadata source key (`"crossref"` / `"unpaywall"`). Equal to
    /// [`Self::source`] verbatim in Slice 4 but kept distinct so future
    /// slices can decouple "which resolver wrote to disk" from "which
    /// resolver is the audit identity". Surfaced through the
    /// `doiget_fetch_paper` MCP envelope per ADR-0021 §4.
    pub resolver_profile: String,
    /// OA license string (`"CC-BY-4.0"`, `"cc-by"`, `"arxiv-default"`,
    /// `"unknown"`). Mirrors `[doiget].license`.
    pub license: String,
    /// Absolute path of the artifact actually written
    /// (`<root>/<safekey>.pdf` on success, `<root>/.metadata/<safekey>.toml`
    /// on metadata-only fallback).
    pub path: Utf8PathBuf,
    /// Stored PDF size in bytes; `0` on the metadata-only fallback
    /// (`docs/REDIRECT_ALLOWLIST.md` §3.5).
    pub size_bytes: u64,
    /// The schema version of the metadata TOML written
    /// (always [`crate::SCHEMA_VERSION`] for this build).
    pub schema_version: String,
    /// What happened on the PDF leg (issue #118). `Fetched` /
    /// `NoOaUrl` are clean outcomes; `Blocked` carries the structured
    /// reason an OA PDF existed but could not be retrieved, so the
    /// CLI / MCP surface it instead of a silent metadata-only success.
    pub pdf_leg: PdfLegStatus,
    /// Per-ref [`crate::Safekey`] stringified (`Ref::safekey().as_str()`).
    /// Exposed on the outcome so JSON-mode CLI / MCP callers can
    /// emit a structured success body without re-parsing the input
    /// ref (#210 / `docs/ERRORS.md` §3). Always populated.
    pub safekey: String,
    /// ADR-0021 §1 canonical-digest as 64-char lowercase hex for the
    /// resolver_profile that produced this outcome's audit identity.
    /// For an arXiv fetch this is the digest under `"arxiv"`; for a
    /// DOI OA PDF leg this is under `"oa-publisher"`; for the DOI
    /// metadata-only fallback this is under the metadata source key
    /// (`"crossref"` / `"unpaywall"`). Always populated.
    pub canonical_digest: String,
}
620
621impl FetchPaperOutcome {
622 /// Test-only constructor for downstream crates (`doiget-cli`,
623 /// `doiget-mcp`) that need to drive classification / rendering
624 /// logic without running the full orchestrator. Produces a
625 /// minimal but structurally-valid outcome — all required fields
626 /// populated with defensible stubs — so unit tests can assert
627 /// the surrounding behavior (JSONL shape, exit-code mapping,
628 /// PDF-leg branching) in isolation.
629 ///
630 /// `#[doc(hidden)]` because this is not a stable public API; the
631 /// signature may change to fit test needs without a CHANGELOG
632 /// `[BREAKING]` callout.
633 #[doc(hidden)]
634 pub fn for_test_synthetic(
635 safekey: impl Into<String>,
636 source: impl Into<String>,
637 pdf_leg: PdfLegStatus,
638 ) -> Self {
639 let safekey: String = safekey.into();
640 let source: String = source.into();
641 Self {
642 source: source.clone(),
643 resolver_profile: source.clone(),
644 license: "unknown".to_string(),
645 path: Utf8PathBuf::from(format!("/tmp/{safekey}.pdf")),
646 size_bytes: 0,
647 schema_version: SCHEMA_VERSION.to_string(),
648 pdf_leg,
649 safekey: safekey.clone(),
650 // 32 bytes of `0x00` → a stable, non-secret digest stub
651 // that's still 64 chars of lowercase hex.
652 canonical_digest: "00".repeat(32),
653 }
654 }
655}
656
657/// Resolve a [`Ref`] to a PDF (or metadata-only fallback) and write it
658/// through `store`.
659///
660/// Binding spec: `docs/MCP_TOOLS.md` §4 (`doiget_fetch_paper`),
661/// `docs/REDIRECT_ALLOWLIST.md` §3 (informed-best-effort posture for the
662/// DOI OA PDF leg), `docs/PROVENANCE_LOG.md` §3 (per-attempt `Fetch` rows
663/// emitted by the source impls; `StoreWrite` row emitted by this
664/// orchestrator).
665///
666/// # Dispatch
667///
668/// - `Ref::Arxiv(_)` → [`ArxivSource::fetch`]; the source returns PDF
669/// bytes + Atom-feed metadata. The orchestrator writes both the PDF
670/// and the metadata TOML.
671/// - `Ref::Doi(_)` → Crossref metadata + Unpaywall license/OA-URL
672/// enrichment + (when the OA URL host is on the `oa-publisher`
673/// allowlist) a publisher PDF leg. A failure on the PDF leg is
674/// non-fatal: the metadata is still written and the orchestrator
675/// returns `Ok(...)` with `source` set to the metadata source.
676///
677/// # Side effects
678///
679/// Each consulted source emits one `LogEvent::Fetch` row via
680/// `ctx.log.append`. The orchestrator additionally emits one
681/// `LogEvent::StoreWrite` row on the successful write. Session bookend
682/// rows are the caller's responsibility (the CLI's
683/// `commands::fetch::run_with_options` wraps the call; the MCP server's
684/// `doiget_fetch_paper` tool method wraps it too).
685///
686/// # Errors
687///
688/// Returns [`FetchError`] from the underlying [`Source`] dispatch. The
689/// MCP boundary converts these to the closed [`crate::ErrorCode`] set
690/// via the existing `From<FetchError> for ErrorCode` impl.
691pub async fn fetch_paper(
692 ref_: &Ref,
693 profile: &CapabilityProfile,
694 ctx: &FetchContext,
695 store: &dyn Store,
696 store_root: &Utf8Path,
697) -> Result<FetchPaperOutcome, FetchError> {
698 let safekey = ref_.safekey();
699 match ref_ {
700 Ref::Arxiv(id) => {
701 fetch_paper_arxiv(id, ref_, profile, ctx, store, store_root, &safekey).await
702 }
703 Ref::Doi(doi) => {
704 fetch_paper_doi(doi, ref_, profile, ctx, store, store_root, &safekey).await
705 }
706 }
707}
708
/// Build the dry-run preview ([`FetchPlan`]) for a single ref without
/// touching the network, store, or provenance log.
///
/// Thin re-export of [`crate::dry_run::build_fetch_plan`] — this
/// function only forwards `ref_` and `store_root` — under the slice-2
/// naming the MCP tool surfaces use. Kept here so the MCP
/// `doiget_fetch_paper` tool method does not have to reach across two
/// modules. See [`try_fetch_paper_plan`] for the fallible sibling.
pub fn fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
    build_fetch_plan(ref_, store_root)
}
717
/// Fallible sibling of [`fetch_paper_plan`] — propagates an internal
/// allowlist-contract drift as a typed [`FetchError::SourceSchema`]
/// instead of degrading to an empty `candidate_hosts` list (issue
/// #156 ②).
///
/// Thin re-export of [`crate::dry_run::try_build_fetch_plan`]: this
/// function only forwards `ref_` and `store_root`. It was added
/// alongside the infallible [`fetch_paper_plan`] rather than by
/// changing that function's signature, because `fetch_paper_plan` is
/// `pub` and called from `doiget-mcp`, which was out of scope for the
/// batch that introduced this function.
///
/// # Errors
///
/// See [`crate::dry_run::try_build_fetch_plan`].
pub fn try_fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> Result<FetchPlan, FetchError> {
    try_build_fetch_plan(ref_, store_root)
}
732
733/// arXiv branch of [`fetch_paper`]. Internal — public callers go
734/// through `fetch_paper`.
735async fn fetch_paper_arxiv(
736 id: &ArxivId,
737 ref_: &Ref,
738 profile: &CapabilityProfile,
739 ctx: &FetchContext,
740 store: &dyn Store,
741 store_root: &Utf8Path,
742 safekey: &Safekey,
743) -> Result<FetchPaperOutcome, FetchError> {
744 let source = arxiv_source_from_env();
745 if !source.can_serve(profile, ref_) {
746 return Err(FetchError::NotEligible {
747 source_key: source.name().to_string(),
748 });
749 }
750
751 let FetchResult {
752 license,
753 pdf_bytes,
754 final_url,
755 ..
756 } = source.fetch(ref_, profile, ctx).await?;
757 let pdf = pdf_bytes.ok_or_else(|| FetchError::SourceSchema {
758 hint: "arxiv source returned no PDF bytes".to_string(),
759 })?;
760 let size_bytes = pdf.len() as u64;
761
762 // Phase 1 minimal metadata. Full Atom-feed extraction (title /
763 // authors) lives in `ArxivSource::fetch_metadata_only` and the
764 // metadata-only orchestrator; the fetch path keeps the placeholder
765 // for now (a follow-up slice may chain in Atom-parse here).
766 let metadata = Metadata {
767 schema_version: SCHEMA_VERSION.to_string(),
768 title: format!("arxiv:{}", id.as_str()),
769 authors: Vec::new(),
770 year: None,
771 doi: None,
772 arxiv_id: Some(id.clone()),
773 abstract_: None,
774 venue: None,
775 publisher: None,
776 issn: None,
777 isbn: None,
778 type_: None,
779 keywords: Vec::new(),
780 url: final_url.as_ref().map(|u| u.to_string()),
781 pdf_path: Some(format!("{}.pdf", safekey.as_str())),
782 doiget: Some(DoigetExtension {
783 fetched_at: Utc::now(),
784 source: "arxiv".to_string(),
785 license: license.clone(),
786 size_bytes,
787 mcp_call_id: None,
788 }),
789 other: BTreeMap::new(),
790 };
791
792 let tmp = stage_pdf_to_tempfile(&pdf)?;
793 let pdf_src = Utf8Path::from_path(tmp.path())
794 .ok_or_else(|| FetchError::SourceSchema {
795 hint: "staging tempfile path is not UTF-8".to_string(),
796 })?
797 .to_path_buf();
798 write_metadata_and_pdf(store, safekey, &metadata, Some(&pdf_src), ctx)?;
799 drop(tmp);
800
801 let path = store_root.join(format!("{}.pdf", safekey.as_str()));
802 let canonical_digest =
803 crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), "arxiv", None).digest_hex();
804 Ok(FetchPaperOutcome {
805 source: "arxiv".to_string(),
806 resolver_profile: "arxiv".to_string(),
807 license,
808 path,
809 size_bytes,
810 schema_version: SCHEMA_VERSION.to_string(),
811 // arXiv always delivers the PDF (or the whole fn already
812 // returned Err above) — there is no metadata-only fallback.
813 pdf_leg: PdfLegStatus::Fetched,
814 safekey: safekey.as_str().to_string(),
815 canonical_digest,
816 })
817}
818
819/// DOI branch of [`fetch_paper`] — Crossref + Unpaywall + (when allowed)
820/// OA-publisher PDF leg. Mirrors the CLI's `fetch_doi` implementation
821/// (`crates/doiget-cli/src/commands/fetch.rs`) — the CLI now delegates
822/// here so both surfaces share one source of truth.
823async fn fetch_paper_doi(
824 doi: &Doi,
825 ref_: &Ref,
826 profile: &CapabilityProfile,
827 ctx: &FetchContext,
828 store: &dyn Store,
829 store_root: &Utf8Path,
830 safekey: &Safekey,
831) -> Result<FetchPaperOutcome, FetchError> {
832 let contact = contact_email_from_env();
833 let unpaywall_contact = unpaywall_email_from_env(&contact);
834 let crossref = crossref_source_from_env(&contact);
835 // Issue #120: Crossref is NON-fatal. A transient Crossref failure
836 // must not abort the whole DOI fetch when Unpaywall alone can
837 // still deliver the OA PDF. We keep the error and only surface it
838 // if nothing usable comes back (see the both-failed guard below).
839 let (cross, crossref_err) = match crossref.fetch(ref_, profile, ctx).await {
840 Ok(r) => (Some(r), None),
841 Err(e) => {
842 tracing::warn!(
843 error = %e,
844 "crossref fetch failed; continuing with unpaywall-only metadata + OA leg"
845 );
846 (None, Some(e))
847 }
848 };
849 let crossref_meta = cross
850 .as_ref()
851 .and_then(|c| c.metadata_json.clone())
852 .unwrap_or(Value::Null);
853 let extracted = extract_crossref_fields(&crossref_meta);
854
855 // Unpaywall second — license enrichment + OA URL chain discovery.
856 // A failure here is non-fatal: we still write the Crossref-
857 // derived metadata.
858 let unpaywall = unpaywall_source_from_env(&unpaywall_contact);
859 let upw_result = unpaywall.fetch(ref_, profile, ctx).await;
860 let (license, source_label, oa_chain) = match upw_result {
861 Ok(r) => {
862 let chain = extract_oa_url_chain(r.metadata_json.as_ref());
863 let label = if r.license != "unknown" {
864 "unpaywall".to_string()
865 } else {
866 "crossref".to_string()
867 };
868 (r.license, label, chain)
869 }
870 Err(e) => {
871 // Unpaywall unreachable / errored. We continue with the
872 // Crossref-only metadata, but the resulting empty OA
873 // chain will be reported downstream as
874 // `PdfLegStatus::NoOaUrl` — semantically distinct from
875 // "Unpaywall confirmed no OA URL". The provenance log
876 // already carries an Unpaywall Fetch err row (the
877 // Unpaywall source impl logged its own attempt before
878 // returning), so the audit trail captures the cause; the
879 // tracing line below makes the orchestrator-level signal
880 // loud as well. Surfacing the distinction at the
881 // `PdfLegStatus` level (a new variant like
882 // `MetadataSourceUnavailable`) is a deliberate
883 // follow-up — see CHANGELOG `[0.4.0]` Notes.
884 tracing::warn!(
885 error = %e,
886 doi = %doi.as_str(),
887 "unpaywall fetch failed; OA chain will be empty (downstream PdfLegStatus::NoOaUrl \
888 is conservative — Unpaywall was unreachable, not authoritatively oa-free)"
889 );
890 ("unknown".to_string(), "crossref".to_string(), Vec::new())
891 }
892 };
893
894 // OA PDF leg — ADR-0029 fetch chain. Walk the candidate URL list
895 // in order; first successful PDF wins, all-failed surfaces as
896 // `PdfLegStatus::Blocked` with the LAST attempt's error (the most
897 // informative for the operator — typically the network /
898 // allowlist reason the chain could not be exhausted). Each
899 // `try_fetch_oa_pdf` call already emits its own per-attempt
900 // provenance row (`oa-publisher` Fetch ok / err), so the audit
901 // trail captures every external request without orchestrator-
902 // side bookkeeping.
903 //
904 // Issue #118: a failure here is NEVER silently turned into a
905 // clean metadata-only success — the structured reason is carried
906 // out on `PdfLegStatus::Blocked`.
907 let (pdf_leg, pdf_bytes) = if oa_chain.is_empty() {
908 (PdfLegStatus::NoOaUrl, None)
909 } else {
910 let mut succeeded: Option<Vec<u8>> = None;
911 let mut last_err: Option<HttpError> = None;
912 let total = oa_chain.len();
913 for (idx, candidate) in oa_chain.iter().enumerate() {
914 let attempt = idx + 1;
915 tracing::debug!(
916 attempt,
917 total,
918 url = %candidate,
919 "trying OA PDF candidate (ADR-0029 chain)"
920 );
921 match try_fetch_oa_pdf(doi, candidate, ctx).await {
922 Ok((bytes, _final_url)) => {
923 if attempt > 1 {
924 tracing::info!(
925 attempt,
926 total,
927 url = %candidate,
928 "OA PDF chain succeeded on fallback candidate (ADR-0029)"
929 );
930 }
931 succeeded = Some(bytes);
932 break;
933 }
934 Err(e) => {
935 tracing::warn!(
936 attempt,
937 total,
938 url = %candidate,
939 error = %e,
940 "OA PDF candidate failed; advancing to next (ADR-0029 chain)"
941 );
942 last_err = Some(e);
943 }
944 }
945 }
946 match (succeeded, last_err) {
947 (Some(bytes), _) => (PdfLegStatus::Fetched, Some(bytes)),
948 (None, Some(e)) => {
949 let fe = FetchError::Http(e);
950 let denial: Option<crate::DenialContext> = (&fe).into();
951 let message = fe.to_string();
952 let code: crate::ErrorCode = fe.into();
953 (
954 PdfLegStatus::Blocked {
955 code,
956 message,
957 denial,
958 },
959 None,
960 )
961 }
962 // Defensive fallback. `oa_chain` is non-empty in this
963 // branch, so structurally at least one iteration must set
964 // either `succeeded` or `last_err`. If a future refactor
965 // breaks the invariant we fail CLOSED — surface a
966 // `Blocked` outcome with a self-describing message
967 // rather than `NoOaUrl` (which would falsely tell the
968 // caller no candidate URL was ever discovered). Routes
969 // to `INTERNAL_ERROR` so the CLI's exit-code mapping
970 // signals a doiget bug, not a remote failure.
971 (None, None) => {
972 tracing::error!(
973 total = oa_chain.len(),
974 "OA PDF chain walker exhausted without recording success or error \
975 (defensive fallback — should be unreachable)"
976 );
977 (
978 PdfLegStatus::Blocked {
979 code: crate::ErrorCode::InternalError,
980 message:
981 "OA PDF chain walker exhausted without recording success or error \
982 (orchestrator bug — please report)"
983 .to_string(),
984 denial: None,
985 },
986 None,
987 )
988 }
989 }
990 };
991
992 // Issue #120: Crossref is non-fatal, but if it failed AND the OA
993 // PDF leg produced nothing, writing a DOI-only stub entry would
994 // mask a total failure and violate the "explain why" promise.
995 // Surface the Crossref error so the caller reports a real reason.
996 if let Some(e) = crossref_err {
997 if pdf_bytes.is_none() {
998 return Err(e);
999 }
1000 }
1001
1002 let (final_source_label, size_bytes, pdf_path_relative, pdf_staged) = match &pdf_bytes {
1003 Some(bytes) => {
1004 let staged = stage_pdf_to_tempfile(bytes)?;
1005 (
1006 "oa-publisher".to_string(),
1007 bytes.len() as u64,
1008 Some(format!("{}.pdf", safekey.as_str())),
1009 Some(staged),
1010 )
1011 }
1012 None => (source_label, 0u64, None, None),
1013 };
1014
1015 let metadata = Metadata {
1016 schema_version: SCHEMA_VERSION.to_string(),
1017 title: extracted.title.unwrap_or_else(|| doi.as_str().to_string()),
1018 authors: extracted.authors,
1019 year: extracted.year,
1020 doi: Some(doi.clone()),
1021 arxiv_id: None,
1022 abstract_: None,
1023 venue: extracted.venue,
1024 publisher: None,
1025 issn: None,
1026 isbn: None,
1027 type_: extracted.type_,
1028 keywords: Vec::new(),
1029 url: cross
1030 .as_ref()
1031 .and_then(|c| c.final_url.as_ref())
1032 .map(|u| u.to_string()),
1033 pdf_path: pdf_path_relative,
1034 doiget: Some(DoigetExtension {
1035 fetched_at: Utc::now(),
1036 source: final_source_label.clone(),
1037 license: license.clone(),
1038 size_bytes,
1039 mcp_call_id: None,
1040 }),
1041 other: BTreeMap::new(),
1042 };
1043
1044 let pdf_src_path = pdf_staged
1045 .as_ref()
1046 .and_then(|tmp| Utf8Path::from_path(tmp.path()).map(|p| p.to_path_buf()));
1047 write_metadata_and_pdf(store, safekey, &metadata, pdf_src_path.as_deref(), ctx)?;
1048 drop(pdf_staged);
1049
1050 let path = if pdf_bytes.is_some() {
1051 store_root.join(format!("{}.pdf", safekey.as_str()))
1052 } else {
1053 store_root
1054 .join(".metadata")
1055 .join(format!("{}.toml", safekey.as_str()))
1056 };
1057 let canonical_digest = crate::CanonicalRef::new(
1058 crate::SourceType::Doi,
1059 doi.as_str(),
1060 &final_source_label,
1061 None,
1062 )
1063 .digest_hex();
1064 Ok(FetchPaperOutcome {
1065 source: final_source_label.clone(),
1066 resolver_profile: final_source_label,
1067 license,
1068 path,
1069 size_bytes,
1070 schema_version: SCHEMA_VERSION.to_string(),
1071 pdf_leg,
1072 safekey: safekey.as_str().to_string(),
1073 canonical_digest,
1074 })
1075}
1076
1077/// Stage PDF bytes to a tempfile so the existing `Store::write` atomic-
1078/// rename code path applies (the store takes a path, not bytes).
1079fn stage_pdf_to_tempfile(bytes: &[u8]) -> Result<tempfile::NamedTempFile, FetchError> {
1080 let tmp = tempfile::NamedTempFile::new().map_err(|e| FetchError::SourceSchema {
1081 hint: format!("creating PDF staging tempfile: {e}"),
1082 })?;
1083 std::fs::write(tmp.path(), bytes).map_err(|e| FetchError::SourceSchema {
1084 hint: format!("staging PDF bytes: {e}"),
1085 })?;
1086 Ok(tmp)
1087}
1088
1089/// Persist `metadata` (and optionally a PDF at `pdf_src`) through the
1090/// trait-object [`Store`] and emit a `StoreWrite` provenance row.
1091fn write_metadata_and_pdf(
1092 store: &dyn Store,
1093 safekey: &Safekey,
1094 metadata: &Metadata,
1095 pdf_src: Option<&Utf8Path>,
1096 ctx: &FetchContext,
1097) -> Result<(), FetchError> {
1098 let store_path_relative = if pdf_src.is_some() {
1099 format!("{}.pdf", safekey.as_str())
1100 } else {
1101 format!(".metadata/{}.toml", safekey.as_str())
1102 };
1103 let size_bytes = metadata.doiget.as_ref().map(|d| d.size_bytes).unwrap_or(0);
1104 let license = metadata.doiget.as_ref().map(|d| d.license.as_str());
1105 let source_name = metadata.doiget.as_ref().map(|d| d.source.as_str());
1106
1107 // ADR-0021 §1 canonical-digest for the StoreWrite row. The store
1108 // entry is keyed on the ref + the resolver that produced its
1109 // metadata (already captured in `metadata.doiget.source`). Build a
1110 // CanonicalRef from whichever id slot is populated.
1111 let canonical_digest: Option<String> = match (metadata.doi.as_ref(), metadata.arxiv_id.as_ref())
1112 {
1113 (Some(d), _) => source_name.map(|s| {
1114 crate::CanonicalRef::new(crate::SourceType::Doi, d.as_str(), s, None).digest_hex()
1115 }),
1116 (None, Some(a)) => source_name.map(|s| {
1117 crate::CanonicalRef::new(crate::SourceType::Arxiv, a.as_str(), s, None).digest_hex()
1118 }),
1119 (None, None) => None,
1120 };
1121
1122 match store.write(safekey, metadata, pdf_src) {
1123 Ok(()) => {
1124 ctx.log.append(RowInput {
1125 event: LogEvent::StoreWrite,
1126 result: LogResult::Ok,
1127 capability: Capability::Oa,
1128 ref_: metadata
1129 .doi
1130 .as_ref()
1131 .map(|d| d.as_str())
1132 .or_else(|| metadata.arxiv_id.as_ref().map(|a| a.as_str())),
1133 source: source_name,
1134 error_code: None,
1135 size_bytes: Some(size_bytes),
1136 license,
1137 store_path: Some(&store_path_relative),
1138 canonical_digest: canonical_digest.as_deref(),
1139 })?;
1140 Ok(())
1141 }
1142 Err(e) => {
1143 // Best-effort: record the StoreWrite failure before
1144 // propagating the store.write error. We do NOT
1145 // propagate the log-append error itself here — we're
1146 // already in an error state from the store, and the
1147 // primary failure is what the caller needs to act on.
1148 // But the log-append failure is observable via tracing
1149 // so an operator can spot a broken hash chain when
1150 // both fail. Surface as `SourceSchema` so the
1151 // FetchError -> ErrorCode collapse routes it to
1152 // `INTERNAL_ERROR` (closest closed-set fit; `StoreError`
1153 // does not have a direct closed-set arm).
1154 if let Err(log_err) = ctx.log.append(RowInput {
1155 event: LogEvent::StoreWrite,
1156 result: LogResult::Err,
1157 capability: Capability::Oa,
1158 ref_: metadata
1159 .doi
1160 .as_ref()
1161 .map(|d| d.as_str())
1162 .or_else(|| metadata.arxiv_id.as_ref().map(|a| a.as_str())),
1163 source: source_name,
1164 error_code: Some("STORE_ERROR"),
1165 size_bytes: None,
1166 license: None,
1167 store_path: Some(&store_path_relative),
1168 canonical_digest: canonical_digest.as_deref(),
1169 }) {
1170 tracing::error!(
1171 store_err = %e,
1172 log_err = %log_err,
1173 "BOTH store.write AND provenance log append failed; \
1174 audit trail is broken for this attempt"
1175 );
1176 }
1177 Err(FetchError::SourceSchema {
1178 hint: format!("store write failed: {e}"),
1179 })
1180 }
1181 }
1182}
1183
1184/// Attempt the OA PDF fetch under the `"oa-publisher"` source key.
1185async fn try_fetch_oa_pdf(
1186 doi: &Doi,
1187 url: &url::Url,
1188 ctx: &FetchContext,
1189) -> Result<(Vec<u8>, url::Url), HttpError> {
1190 const SOURCE: &str = "oa-publisher";
1191 let _permit = ctx.rate_limiter.acquire(SOURCE).await;
1192 // ADR-0021 §1: the oa-publisher PDF leg is a DISTINCT audit
1193 // identity from the Crossref/Unpaywall metadata legs even though
1194 // the ref is the same DOI — that's the whole point of carrying
1195 // `resolver_profile` into the digest. Compute once and re-use for
1196 // both the ok and err row variants below.
1197 let canonical =
1198 crate::CanonicalRef::new(crate::SourceType::Doi, doi.as_str(), SOURCE, None).digest_hex();
1199
1200 // Pre-fetch host allowlist check on the metadata-discovered OA URL
1201 // (issue #145; `docs/REDIRECT_ALLOWLIST.md` §1 — NORMATIVE). The
1202 // per-source `redirect_hosts` allowlist is, by §1, consulted "on the
1203 // OA URL discovered through metadata sources before the actual PDF
1204 // fetch is issued", not only on redirect hops. The redirect closure in
1205 // `crate::http` only fires when an *actual redirect* occurs; an OA URL
1206 // whose host is off the `oa-publisher` allowlist that resolves WITHOUT
1207 // a redirect would otherwise reach connect and be misclassified as a
1208 // transport error, violating §1. This is scoped strictly to the
1209 // `"oa-publisher"` PDF leg — §6 explicitly exempts the initial
1210 // template-constructed URL, and `fetch_bytes`/metadata-only/resolve-
1211 // only paths (which never follow the OA URL) are deliberately NOT
1212 // touched. On a host MISS we return the *same* `HttpError::RedirectDenied`
1213 // value the redirect closure produces (same `source_key`, lowercased
1214 // `host`, and `expected_hosts` snapshot), reusing the identical
1215 // allowlist the closure captured (queried via `source_allowlist`, not
1216 // re-derived) so the single source of truth cannot drift. Returning
1217 // that exact variant means the existing `Err(e)` arm below, the
1218 // `From<&HttpError> for Option<DenialContext>` mapping
1219 // (`DenialReason::RedirectNotInAllowlist`), the `PdfLegStatus::Blocked`
1220 // construction in the caller, and PR #162's CLI classification all see
1221 // a byte-identical downstream shape with no new code path.
1222 if let Some(allowlist) = ctx.http.source_allowlist(SOURCE) {
1223 // `Url::host_str()` is `None` for hostless URLs (e.g. `data:`);
1224 // treat that exactly as the redirect closure does (an allowlist
1225 // miss with an empty host string).
1226 let host = url
1227 .host_str()
1228 .map(|h| h.to_ascii_lowercase())
1229 .unwrap_or_default();
1230 if !allowlist.matches(&host) {
1231 let e = HttpError::RedirectDenied {
1232 source_key: SOURCE.to_string(),
1233 host: host.clone(),
1234 expected_hosts: allowlist.redirect_hosts.clone(),
1235 };
1236 tracing::info!(
1237 oa_url = %url,
1238 denied_host = %host,
1239 "OA URL host outside oa-publisher allowlist (pre-fetch check, \
1240 docs/REDIRECT_ALLOWLIST.md §1 / issue #145)"
1241 );
1242 // Emit the SAME provenance row the post-fetch redirect-denied
1243 // path emits: a `Fetch` `Err` row under the `oa-publisher`
1244 // source key with the closed-set `NETWORK_ERROR` code and the
1245 // same canonical digest. Mirrors the `Err(e)` arm below so the
1246 // audit trail is indistinguishable from a redirect-time denial.
1247 let _ = ctx.log.append(RowInput {
1248 event: LogEvent::Fetch,
1249 result: LogResult::Err,
1250 capability: Capability::Oa,
1251 ref_: Some(doi.as_str()),
1252 source: Some(SOURCE),
1253 error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
1254 size_bytes: None,
1255 license: None,
1256 store_path: None,
1257 canonical_digest: Some(&canonical),
1258 });
1259 return Err(e);
1260 }
1261 }
1262
1263 match ctx.http.fetch_pdf(SOURCE, url.clone()).await {
1264 Ok((body, final_url)) => {
1265 let size_bytes = body.len() as u64;
1266 if let Err(e) = ctx.log.append(RowInput {
1267 event: LogEvent::Fetch,
1268 result: LogResult::Ok,
1269 capability: Capability::Oa,
1270 ref_: Some(doi.as_str()),
1271 source: Some(SOURCE),
1272 error_code: None,
1273 size_bytes: Some(size_bytes),
1274 license: None,
1275 store_path: None,
1276 canonical_digest: Some(&canonical),
1277 }) {
1278 tracing::warn!(error = %e, "appending oa-publisher Fetch ok row failed");
1279 }
1280 Ok((body.to_vec(), final_url))
1281 }
1282 Err(e) => {
1283 match &e {
1284 HttpError::RedirectDenied { host, .. } => {
1285 tracing::info!(
1286 oa_url = %url,
1287 denied_host = %host,
1288 "OA URL host outside oa-publisher allowlist"
1289 );
1290 }
1291 HttpError::NotAPdf { .. } => {
1292 tracing::info!(
1293 oa_url = %url,
1294 "OA URL did not return a PDF magic byte"
1295 );
1296 }
1297 other => {
1298 tracing::warn!(
1299 oa_url = %url,
1300 error = %other,
1301 "OA PDF fetch failed"
1302 );
1303 }
1304 }
1305 // Provenance `error_code` is the CLOSED-set code. Every
1306 // `HttpError` collapses to `NETWORK_ERROR` through the
1307 // canonical `From<FetchError> for ErrorCode` (the closed
1308 // set has no finer transport code by design) — so this is
1309 // the correct mapped value, not the misattribution the
1310 // previous hardcode implied. The *fine* reason
1311 // (RedirectDenied vs NotAPdf vs …) is preserved for the
1312 // user via `PdfLegStatus::Blocked.denial` / `.message`
1313 // built by the caller from the returned `HttpError`
1314 // (issue #118). Rendered via `ErrorCode::as_wire` so the
1315 // token can never drift from the enum.
1316 let _ = ctx.log.append(RowInput {
1317 event: LogEvent::Fetch,
1318 result: LogResult::Err,
1319 capability: Capability::Oa,
1320 ref_: Some(doi.as_str()),
1321 source: Some(SOURCE),
1322 error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
1323 size_bytes: None,
1324 license: None,
1325 store_path: None,
1326 canonical_digest: Some(&canonical),
1327 });
1328 Err(e)
1329 }
1330 }
1331}
1332
/// Subset of Crossref `message` fields populated into the on-disk metadata.
///
/// Built by `extract_crossref_fields`; a field is `None` / empty when
/// the corresponding JSON key is missing or has an unexpected shape.
struct CrossrefFields {
    /// First entry of the `title` array, when it is a string.
    title: Option<String>,
    /// Author display strings built from the `author` array
    /// (`"family, given"` when both name parts are present).
    authors: Vec<String>,
    /// First number of the first `issued.date-parts` entry, when it
    /// fits in an `i32`.
    year: Option<i32>,
    /// First entry of the `container-title` array, when it is a string.
    venue: Option<String>,
    /// The `type` string as reported by Crossref.
    type_: Option<String>,
}
1341
1342/// Defensively pull bibliographic fields out of a Crossref envelope's
1343/// `message` object. Every field is optional; malformed shapes degrade
1344/// to `None` rather than panicking.
1345fn extract_crossref_fields(msg: &Value) -> CrossrefFields {
1346 let title = msg
1347 .get("title")
1348 .and_then(|v| v.as_array())
1349 .and_then(|arr| arr.first())
1350 .and_then(|v| v.as_str())
1351 .map(|s| s.to_string());
1352
1353 let authors = msg
1354 .get("author")
1355 .and_then(|v| v.as_array())
1356 .map(|arr| {
1357 arr.iter()
1358 .filter_map(|a| {
1359 let family = a.get("family").and_then(|v| v.as_str());
1360 let given = a.get("given").and_then(|v| v.as_str());
1361 match (family, given) {
1362 (Some(f), Some(g)) => Some(format!("{f}, {g}")),
1363 (Some(f), None) => Some(f.to_string()),
1364 (None, Some(g)) => Some(g.to_string()),
1365 _ => None,
1366 }
1367 })
1368 .collect()
1369 })
1370 .unwrap_or_default();
1371
1372 let year = msg
1373 .get("issued")
1374 .and_then(|v| v.get("date-parts"))
1375 .and_then(|v| v.as_array())
1376 .and_then(|arr| arr.first())
1377 .and_then(|v| v.as_array())
1378 .and_then(|arr| arr.first())
1379 .and_then(|v| v.as_i64())
1380 .and_then(|n| i32::try_from(n).ok());
1381
1382 let venue = msg
1383 .get("container-title")
1384 .and_then(|v| v.as_array())
1385 .and_then(|arr| arr.first())
1386 .and_then(|v| v.as_str())
1387 .map(|s| s.to_string());
1388
1389 let type_ = msg
1390 .get("type")
1391 .and_then(|v| v.as_str())
1392 .map(|s| s.to_string());
1393
1394 CrossrefFields {
1395 title,
1396 authors,
1397 year,
1398 venue,
1399 type_,
1400 }
1401}
1402
1403/// Pull the ordered chain of candidate OA URLs out of an Unpaywall
1404/// `metadata_json` envelope per ADR-0029 D2.
1405///
1406/// Order is `best_oa_location` first (when present), then every
1407/// distinct entry in `oa_locations[]`. Duplicate URLs are deduped by
1408/// exact string match so a candidate that appears as both the "best"
1409/// entry and an array element is fetched at most once.
1410///
1411/// Each location's URL is resolved via the same `url_for_pdf` →
1412/// `url` fallback the single-URL extractor uses.
1413///
1414/// Returns `Vec::new()` when no OA location was reported (the chain
1415/// is empty and the caller surfaces [`PdfLegStatus::NoOaUrl`]).
1416fn extract_oa_url_chain(meta: Option<&Value>) -> Vec<url::Url> {
1417 let meta = match meta {
1418 Some(m) => m,
1419 None => return Vec::new(),
1420 };
1421 let mut out: Vec<url::Url> = Vec::new();
1422 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
1423 let mut push_unique = |u: url::Url| {
1424 let key = u.as_str().to_string();
1425 if seen.insert(key) {
1426 out.push(u);
1427 }
1428 };
1429
1430 // Priority 1: best_oa_location (Unpaywall's own quality-ordered
1431 // pick — ADR-0029 D2 NORMATIVE: defer to the metadata source's
1432 // ordering).
1433 if let Some(best) = meta.get("best_oa_location") {
1434 if let Some(u) = pull_oa_url_from_location(best) {
1435 push_unique(u);
1436 }
1437 }
1438 // Priority 2: every entry in oa_locations[] after the best one.
1439 // The fallback target this ADR exists to enable is precisely the
1440 // arXiv preprint that lives here when `best_oa_location` is a
1441 // WAF-blocked publisher URL.
1442 if let Some(arr) = meta.get("oa_locations").and_then(|v| v.as_array()) {
1443 for loc in arr {
1444 if let Some(u) = pull_oa_url_from_location(loc) {
1445 push_unique(u);
1446 }
1447 }
1448 }
1449 out
1450}
1451
1452/// Resolve a single OA location object to a `url::Url`. Tries
1453/// `url_for_pdf` first (the direct PDF link Unpaywall annotates when
1454/// it knows one), falling back to `url` (the landing page). Returns
1455/// `None` if neither field is present or parses.
1456fn pull_oa_url_from_location(loc: &Value) -> Option<url::Url> {
1457 let candidate = loc
1458 .get("url_for_pdf")
1459 .and_then(|v| v.as_str())
1460 .or_else(|| loc.get("url").and_then(|v| v.as_str()))?;
1461 url::Url::parse(candidate).ok()
1462}
1463
/// Contact email to send to Unpaywall.
///
/// Reads the `DOIGET_UNPAYWALL_EMAIL` environment variable and returns
/// it unchanged when it holds a non-blank value. When the variable is
/// unset, not valid Unicode, empty, or whitespace-only, the general
/// `fallback_contact` is returned instead — a blank override can never
/// be a usable contact address.
fn unpaywall_email_from_env(fallback_contact: &str) -> String {
    match std::env::var("DOIGET_UNPAYWALL_EMAIL") {
        Ok(email) if !email.trim().is_empty() => email,
        _ => fallback_contact.to_string(),
    }
}
1467
1468// ---------------------------------------------------------------------------
1469// batch_fetch — multi-ref orchestrator (Slice 2)
1470// ---------------------------------------------------------------------------
1471
/// Per-ref outcome carried inside [`BatchOutcome::results`].
///
/// Each entry's `outcome` is independent — a single `Err(...)` does not
/// abort sibling refs. The MCP `doiget_batch_fetch` tool method
/// serializes the success-or-error per row inside `results[]`.
///
/// [`batch_fetch`] creates one entry per input ref, pairing a clone of
/// the ref with whatever [`fetch_paper`] returned for it.
#[derive(Debug)]
pub struct BatchResultEntry {
    /// The parsed ref this entry describes (a clone of the caller's
    /// input ref).
    pub ref_: Ref,
    /// `Ok(...)` on a successful fetch through [`fetch_paper`];
    /// `Err(...)` on a per-ref failure (the outer call still returned
    /// `Ok(BatchOutcome)`).
    pub outcome: Result<FetchPaperOutcome, FetchError>,
}
1486
/// Outcome of a successful [`batch_fetch`] call.
///
/// The outer call returns `Err(_)` only on whole-call failures (the
/// only such variant in Slice 2 is [`FetchError::TooManyRefs`]). Each
/// per-ref result lives inside `results[]` so the agent can see every
/// outcome without losing sibling successes.
///
/// Marked `#[non_exhaustive]`, so code outside this crate cannot build
/// the struct with a literal or match it exhaustively; fields may be
/// added later without a breaking change.
#[derive(Debug)]
#[non_exhaustive]
pub struct BatchOutcome {
    /// One entry per supplied ref, in input order.
    pub results: Vec<BatchResultEntry>,
}
1499
1500/// Iterate over `refs` through [`fetch_paper`], collecting one
1501/// [`BatchResultEntry`] per ref.
1502///
1503/// **Cap**: caller must supply at most [`MAX_BATCH_REFS`] refs; otherwise
1504/// the function returns `Err(FetchError::TooManyRefs { got, max })`
1505/// before any fetch is attempted. The cap mirrors the CLI's
1506/// `commands::batch` enforcement (`MCP_BATCH_MAX_SIZE`).
1507///
1508/// **Concurrency**: Slice 2 dispatches refs serially through
1509/// [`fetch_paper`]. The CLI's existing `commands::batch::run_with_options`
1510/// keeps its bounded-concurrency `JoinSet`+semaphore path for backward
1511/// compatibility; the MCP server uses this serial loop because the MCP
1512/// tool boundary already serializes calls per session.
1513///
1514/// **Session bookkeeping**: this function does NOT emit `SessionStart`
1515/// / `SessionEnd` rows — that is the caller's responsibility.
1516pub async fn batch_fetch(
1517 refs: &[Ref],
1518 profile: &CapabilityProfile,
1519 ctx: &FetchContext,
1520 store: &dyn Store,
1521 store_root: &Utf8Path,
1522) -> Result<BatchOutcome, FetchError> {
1523 if refs.len() > MAX_BATCH_REFS {
1524 return Err(FetchError::TooManyRefs {
1525 got: refs.len(),
1526 max: MAX_BATCH_REFS,
1527 });
1528 }
1529 let mut results = Vec::with_capacity(refs.len());
1530 for ref_ in refs {
1531 let outcome = fetch_paper(ref_, profile, ctx, store, store_root).await;
1532 results.push(BatchResultEntry {
1533 ref_: ref_.clone(),
1534 outcome,
1535 });
1536 }
1537 Ok(BatchOutcome { results })
1538}
1539
1540/// Dry-run preview for a batch — one [`FetchPlan`] per ref. Enforces
1541/// the same [`MAX_BATCH_REFS`] cap [`batch_fetch`] does.
1542///
1543/// Returns `Err(FetchError::TooManyRefs)` when over the cap, or
1544/// `Err(FetchError::SourceSchema)` if the dry-run allowlist invariant
1545/// has drifted (issue #156 ②: this now propagates as a typed error via
1546/// [`try_build_fetch_plan`] rather than silently emitting an empty
1547/// `candidate_hosts` list — the signature already returned `Result`, so
1548/// this is an in-crate behavior tightening with no caller-visible type
1549/// change). Otherwise `Ok(Vec<(Ref, FetchPlan)>)` parallel to the input
1550/// order.
1551pub fn batch_fetch_plans(
1552 refs: &[Ref],
1553 store_root: &Utf8Path,
1554) -> Result<Vec<(Ref, FetchPlan)>, FetchError> {
1555 if refs.len() > MAX_BATCH_REFS {
1556 return Err(FetchError::TooManyRefs {
1557 got: refs.len(),
1558 max: MAX_BATCH_REFS,
1559 });
1560 }
1561 refs.iter()
1562 .map(|r| try_build_fetch_plan(r, store_root).map(|p| (r.clone(), p)))
1563 .collect()
1564}
1565
1566// ---------------------------------------------------------------------------
1567// Tests
1568// ---------------------------------------------------------------------------
1569
1570#[cfg(test)]
1571#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1572mod tests {
1573 use super::*;
1574
1575 #[test]
1576 fn extract_crossref_oa_url_finds_first_url() {
1577 let msg = serde_json::json!({
1578 "link": [
1579 {"URL": "https://example.org/free.pdf"},
1580 {"URL": "https://example.org/alt.pdf"}
1581 ]
1582 });
1583 assert_eq!(
1584 extract_crossref_oa_url(&msg),
1585 Some("https://example.org/free.pdf".to_string())
1586 );
1587 }
1588
1589 #[test]
1590 fn extract_crossref_oa_url_returns_none_when_absent() {
1591 let msg = serde_json::json!({});
1592 assert!(extract_crossref_oa_url(&msg).is_none());
1593 }
1594
1595 #[test]
1596 fn extract_crossref_oa_url_skips_empty_url_strings() {
1597 let msg = serde_json::json!({
1598 "link": [
1599 {"URL": ""},
1600 {"URL": "https://example.org/real.pdf"}
1601 ]
1602 });
1603 assert_eq!(
1604 extract_crossref_oa_url(&msg),
1605 Some("https://example.org/real.pdf".to_string())
1606 );
1607 }
1608
1609 #[test]
1610 fn extract_unpaywall_oa_url_prefers_url_for_pdf() {
1611 let meta = serde_json::json!({
1612 "best_oa_location": {
1613 "url_for_pdf": "https://example.org/pdf",
1614 "url": "https://example.org/landing"
1615 }
1616 });
1617 assert_eq!(
1618 extract_unpaywall_oa_url(&meta),
1619 Some("https://example.org/pdf".to_string())
1620 );
1621 }
1622
1623 #[test]
1624 fn extract_unpaywall_oa_url_falls_back_to_url() {
1625 let meta = serde_json::json!({
1626 "best_oa_location": {
1627 "url": "https://example.org/landing"
1628 }
1629 });
1630 assert_eq!(
1631 extract_unpaywall_oa_url(&meta),
1632 Some("https://example.org/landing".to_string())
1633 );
1634 }
1635
1636 #[test]
1637 fn extract_unpaywall_oa_url_returns_none_when_absent() {
1638 let meta = serde_json::json!({});
1639 assert!(extract_unpaywall_oa_url(&meta).is_none());
1640 }
1641
1642 // ---------------------------------------------------------------
1643 // Slice 2: fetch_paper / batch_fetch coverage. The wiremock-driven
1644 // happy-path tests live in `crates/doiget-mcp/tests/...` (they need
1645 // a real `Store` impl and an HTTP client wired to `FetchContext`,
1646 // both of which the MCP integration tests already stand up). The
1647 // unit tests here pin the pure-function pieces (extractors, cap
1648 // enforcement, plan-shape preservation).
1649 // ---------------------------------------------------------------
1650
#[test]
fn extract_crossref_fields_parses_minimal_shape() {
    // One value for each field the extractor surfaces.
    let fields = extract_crossref_fields(&serde_json::json!({
        "title": ["Example Title"],
        "author": [{ "family": "Smith", "given": "Alice" }],
        "issued": { "date-parts": [[2024, 1, 15]] },
        "container-title": ["Phys. Rev. X"],
        "type": "journal-article"
    }));

    assert_eq!(fields.title.as_deref(), Some("Example Title"));
    // Authors are rendered as "Family, Given".
    assert_eq!(fields.authors, vec!["Smith, Alice".to_string()]);
    // The year is the first element of the first `date-parts` entry.
    assert_eq!(fields.year, Some(2024));
    assert_eq!(fields.venue.as_deref(), Some("Phys. Rev. X"));
    assert_eq!(fields.type_.as_deref(), Some("journal-article"));
}
1667
#[test]
fn extract_crossref_fields_tolerates_missing() {
    // An empty `message` must yield an all-empty result, not a panic.
    let empty_message = serde_json::json!({});
    let fields = extract_crossref_fields(&empty_message);

    assert!(fields.title.is_none(), "title should be absent");
    assert!(fields.authors.is_empty(), "authors should be empty");
    assert!(fields.year.is_none(), "year should be absent");
    assert!(fields.venue.is_none(), "venue should be absent");
    assert!(fields.type_.is_none(), "type should be absent");
}
1677
#[test]
fn extract_oa_url_chain_prefers_best_url_for_pdf() {
    // ADR-0029 D2 — defer to the metadata source's ordering:
    // `best_oa_location.url_for_pdf` is the top-priority candidate, and
    // `best_oa_location.url` is only used when no PDF link is annotated.
    let meta = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": "https://example.org/pdf",
            "url": "https://example.org/landing"
        }
    });
    let chain = extract_oa_url_chain(Some(&meta));
    let urls: Vec<&str> = chain.iter().map(|candidate| candidate.as_str()).collect();
    assert_eq!(urls, ["https://example.org/pdf"]);
}
1694
#[test]
fn extract_oa_url_chain_falls_back_to_url_when_url_for_pdf_absent() {
    // Only the landing-page `url` is present, so it is the sole candidate.
    let meta = serde_json::json!({
        "best_oa_location": {
            "url": "https://example.org/landing"
        }
    });
    let chain = extract_oa_url_chain(Some(&meta));
    let urls: Vec<&str> = chain.iter().map(|candidate| candidate.as_str()).collect();
    assert_eq!(urls, ["https://example.org/landing"]);
}
1706
#[test]
fn extract_oa_url_chain_is_empty_when_no_locations() {
    // No metadata at all yields no candidates...
    assert!(extract_oa_url_chain(None).is_empty());

    // ...and neither does metadata without any OA location fields.
    let empty_record = serde_json::json!({});
    assert!(extract_oa_url_chain(Some(&empty_record)).is_empty());
}
1713
#[test]
fn extract_oa_url_chain_appends_oa_locations_after_best() {
    // ADR-0029 D2: best_oa_location leads, then the remaining
    // oa_locations entries in metadata-source order. This is the
    // load-bearing test: an arXiv preprint listed *after* a WAF-blocked
    // publisher in oa_locations[] must end up as a fallback candidate
    // the chain walker can reach.
    let ordering_rule =
        "chain ordering MUST be best_oa_location first, oa_locations[] verbatim after";
    let expected = [
        "https://publisher.example.org/pdf",
        "https://arxiv.org/pdf/2401.12345",
        "https://repo.example.edu/handle/123",
    ];
    let meta = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": "https://publisher.example.org/pdf"
        },
        "oa_locations": [
            {"url_for_pdf": "https://publisher.example.org/pdf"},
            {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"},
            {"url": "https://repo.example.edu/handle/123"}
        ]
    });

    let chain = extract_oa_url_chain(Some(&meta));
    assert_eq!(chain.len(), expected.len(), "{ordering_rule}");
    for (candidate, want) in chain.iter().zip(expected) {
        assert_eq!(candidate.as_str(), want, "{ordering_rule}");
    }
}
1743
#[test]
fn extract_oa_url_chain_dedupes_repeated_urls() {
    // The same URL shows up as `best_oa_location` and twice more in
    // `oa_locations[]`; it must be attempted at most once. Otherwise a
    // record that repeats a URL would cost extra HTTP requests and
    // rate-limit ticks.
    let meta = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": "https://example.org/pdf"
        },
        "oa_locations": [
            {"url_for_pdf": "https://example.org/pdf"},
            {"url_for_pdf": "https://example.org/pdf"},
            {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"}
        ]
    });
    let chain = extract_oa_url_chain(Some(&meta));
    let urls: Vec<&str> = chain.iter().map(|candidate| candidate.as_str()).collect();
    assert_eq!(
        urls,
        ["https://example.org/pdf", "https://arxiv.org/pdf/2401.12345"]
    );
}
1765
#[test]
fn extract_oa_url_chain_skips_unparsable_urls() {
    // A malformed entry in oa_locations[] is silently dropped instead of
    // aborting the chain, so one stray value from the metadata source
    // cannot poison the whole fetch.
    let meta = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": "https://good.example.org/pdf"
        },
        "oa_locations": [
            {"url_for_pdf": "not a url"},
            {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"}
        ]
    });
    let chain = extract_oa_url_chain(Some(&meta));
    let urls: Vec<&str> = chain.iter().map(|candidate| candidate.as_str()).collect();
    assert_eq!(
        urls,
        [
            "https://good.example.org/pdf",
            "https://arxiv.org/pdf/2401.12345"
        ]
    );
}
1785
#[test]
fn fetch_paper_plan_matches_build_fetch_plan() {
    // `fetch_paper_plan` is the slice-2 name for what should be a thin
    // pass-through to `dry_run::build_fetch_plan`. Pinning equivalence
    // here makes any future divergence between the two show up as a
    // test failure.
    use crate::{ArxivId, Doi};
    let root = Utf8PathBuf::from("/tmp/doiget-test");

    // DOI ref: metadata sources and both target paths must agree.
    let doi_ref = Ref::Doi(Doi("10.1234/example".to_string()));
    let via_alias = fetch_paper_plan(&doi_ref, &root);
    let via_dry_run = build_fetch_plan(&doi_ref, &root);
    assert_eq!(via_alias.metadata_sources, via_dry_run.metadata_sources);
    assert_eq!(via_alias.target_pdf_path, via_dry_run.target_pdf_path);
    assert_eq!(
        via_alias.target_metadata_path,
        via_dry_run.target_metadata_path
    );

    // arXiv ref: the leading PDF source key must agree.
    let arxiv_ref = Ref::Arxiv(ArxivId("2401.12345".to_string()));
    let arxiv_via_alias = fetch_paper_plan(&arxiv_ref, &root);
    let arxiv_via_dry_run = build_fetch_plan(&arxiv_ref, &root);
    assert_eq!(
        arxiv_via_alias.pdf_sources[0].key,
        arxiv_via_dry_run.pdf_sources[0].key
    );
}
1805
#[test]
fn batch_fetch_plans_returns_plan_per_ref_in_order() {
    use crate::{ArxivId, Doi};
    let root = Utf8PathBuf::from("/tmp/doiget-batch-test");
    let refs = vec![
        Ref::Doi(Doi("10.1234/alpha".to_string())),
        Ref::Arxiv(ArxivId("2401.12345".to_string())),
    ];

    let plans = batch_fetch_plans(&refs, &root).expect("under cap returns Ok");
    assert_eq!(plans.len(), 2);

    // Input order is preserved: the DOI entry comes back first...
    let doi_entry = &plans[0];
    assert!(matches!(doi_entry.0, Ref::Doi(_)));
    // ...and its plan lists the crossref + unpaywall metadata sources.
    assert_eq!(doi_entry.1.metadata_sources, vec!["crossref", "unpaywall"]);

    // The arXiv entry is second, with the arxiv PDF source key up front.
    let arxiv_entry = &plans[1];
    assert!(matches!(arxiv_entry.0, Ref::Arxiv(_)));
    assert_eq!(arxiv_entry.1.pdf_sources[0].key, "arxiv");
}
1824
#[test]
fn batch_fetch_plans_too_many_refs_returns_err() {
    use crate::Doi;
    // One ref past the cap — the smallest input that must be rejected.
    let over_cap = MAX_BATCH_REFS + 1;
    let mut refs: Vec<Ref> = Vec::with_capacity(over_cap);
    for i in 0..over_cap {
        refs.push(Ref::Doi(Doi(format!("10.1234/n{i}"))));
    }
    let root = Utf8PathBuf::from("/tmp/doiget-toomany");

    let err = batch_fetch_plans(&refs, &root).expect_err("over cap returns Err");
    if let FetchError::TooManyRefs { got, max } = &err {
        // The error reports both the offending count and the cap.
        assert_eq!(*got, over_cap);
        assert_eq!(*max, MAX_BATCH_REFS);
    } else {
        panic!("expected TooManyRefs, got: {err:?}");
    }
}
1843
1844 #[tokio::test]
1845 async fn batch_fetch_too_many_refs_returns_err_before_any_fetch() {
1846 // The cap is enforced before any per-ref work, so we don't need
1847 // a working store/network here — pass a sentinel store_root and
1848 // a dummy FetchContext that would panic on use.
1849 use crate::http::{tier_1_allowlist, HttpClient};
1850 use crate::provenance::ProvenanceLog;
1851 use crate::rate_limiter::RateLimiter;
1852 use crate::store::FsStore;
1853 use crate::{Doi, RateLimits};
1854 use std::sync::Arc;
1855
1856 let td = tempfile::TempDir::new().expect("tempdir");
1857 let log_path = Utf8Path::from_path(td.path())
1858 .expect("utf-8")
1859 .join("log.jsonl");
1860 let store_root = Utf8Path::from_path(td.path())
1861 .expect("utf-8")
1862 .join("papers");
1863
1864 let ctx = FetchContext {
1865 http: Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client")),
1866 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
1867 log: Arc::new(
1868 ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
1869 .expect("provenance log"),
1870 ),
1871 session_id: "01J0000000000000000000TEST".into(),
1872 };
1873 let profile = CapabilityProfile::from_env().expect("clean env");
1874 let store = FsStore::new(store_root.clone()).expect("fs store");
1875
1876 let n = MAX_BATCH_REFS + 1;
1877 let refs: Vec<Ref> = (0..n)
1878 .map(|i| Ref::Doi(Doi(format!("10.1234/n{}", i))))
1879 .collect();
1880
1881 let err = batch_fetch(&refs, &profile, &ctx, &store, &store_root)
1882 .await
1883 .expect_err("over cap returns Err");
1884 match err {
1885 FetchError::TooManyRefs { got, max } => {
1886 assert_eq!(got, n);
1887 assert_eq!(max, MAX_BATCH_REFS);
1888 }
1889 other => panic!("expected TooManyRefs, got: {other:?}"),
1890 }
1891 }
1892
1893 // Issue #118: a non-PDF OA body must surface as `Err(HttpError)`
1894 // from `try_fetch_oa_pdf` (previously silently flattened to
1895 // `None`, which `fetch_paper_doi` then reported as a clean
1896 // metadata-only success). The compiler-checked `Err(e) =>
1897 // PdfLegStatus::Blocked` arm in `fetch_paper_doi` does the rest.
1898 #[tokio::test]
1899 async fn try_fetch_oa_pdf_non_pdf_body_is_err_not_silent_none() {
1900 use crate::http::HttpClient;
1901 use crate::provenance::ProvenanceLog;
1902 use crate::rate_limiter::RateLimiter;
1903 use crate::{Doi, RateLimits};
1904 use std::sync::Arc;
1905 use wiremock::matchers::method;
1906 use wiremock::{Mock, MockServer, ResponseTemplate};
1907
1908 let server = MockServer::start().await;
1909 Mock::given(method("GET"))
1910 .respond_with(
1911 ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
1912 )
1913 .mount(&server)
1914 .await;
1915 let host = server
1916 .uri()
1917 .parse::<url::Url>()
1918 .expect("uri")
1919 .host_str()
1920 .expect("host")
1921 .to_string();
1922
1923 let td = tempfile::TempDir::new().expect("tempdir");
1924 let log_path = Utf8Path::from_path(td.path())
1925 .expect("utf-8")
1926 .join("log.jsonl");
1927 let ctx = FetchContext {
1928 http: Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host)),
1929 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
1930 log: Arc::new(
1931 ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
1932 .expect("provenance log"),
1933 ),
1934 session_id: "01J0000000000000000000TEST".into(),
1935 };
1936
1937 let doi = Doi("10.1234/example".to_string());
1938 let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
1939 let res = try_fetch_oa_pdf(&doi, &url, &ctx).await;
1940 match res {
1941 Err(HttpError::NotAPdf { .. }) => {}
1942 other => panic!("expected Err(NotAPdf), got: {other:?}"),
1943 }
1944 }
1945
1946 // Issue #145 / `docs/REDIRECT_ALLOWLIST.md` §1: the `oa-publisher`
1947 // host allowlist MUST be consulted on the metadata-discovered OA URL
1948 // *before the actual PDF fetch is issued*, not only on redirect hops.
1949 // An OA URL whose host is OFF the allowlist and that resolves WITHOUT
1950 // a redirect previously slipped past the redirect closure entirely and
1951 // was misclassified as a transport error. This test pins the fix: the
1952 // pre-fetch check rejects it with the SAME `HttpError::RedirectDenied`
1953 // the redirect closure produces, the OA fetch is NEVER issued (the
1954 // wiremock origin records ZERO requests, proving no PDF bytes were
1955 // requested / written), and the provenance trail is the byte-identical
1956 // `Fetch`/`err`/`oa-publisher`/`NETWORK_ERROR` row the redirect-denied
1957 // path emits.
1958 #[tokio::test]
1959 async fn try_fetch_oa_pdf_off_allowlist_host_no_redirect_is_redirect_denied_145() {
1960 use crate::http::HttpClient;
1961 use crate::provenance::ProvenanceLog;
1962 use crate::rate_limiter::RateLimiter;
1963 use crate::{DenialContext, DenialReason, Doi, RateLimits};
1964 use std::sync::Arc;
1965 use wiremock::matchers::method;
1966 use wiremock::{Mock, MockServer, ResponseTemplate};
1967
1968 // The wiremock origin would serve a valid PDF with NO redirect —
1969 // if the pre-check were absent the fetch would *succeed* against
1970 // an off-allowlist host, which is exactly the §1 violation.
1971 let server = MockServer::start().await;
1972 Mock::given(method("GET"))
1973 .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7 real pdf".to_vec()))
1974 .mount(&server)
1975 .await;
1976
1977 // Register a DIFFERENT host as the `oa-publisher` allowlist so the
1978 // wiremock origin (127.0.0.1) is OFF it. `evil.example.com` is a
1979 // valid host string the allowlist will not match.
1980 let td = tempfile::TempDir::new().expect("tempdir");
1981 let log_path = Utf8Path::from_path(td.path())
1982 .expect("utf-8")
1983 .join("log.jsonl");
1984 let ctx = FetchContext {
1985 http: Arc::new(HttpClient::new_for_tests_allow_http(
1986 "oa-publisher",
1987 "allowed-publisher.example.com",
1988 )),
1989 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
1990 log: Arc::new(
1991 ProvenanceLog::open(log_path.clone(), "01J0000000000000000000TEST".into())
1992 .expect("provenance log"),
1993 ),
1994 session_id: "01J0000000000000000000TEST".into(),
1995 };
1996
1997 let doi = Doi("10.1234/example".to_string());
1998 // The OA URL Unpaywall handed back resolves to the wiremock host,
1999 // which is OFF the `oa-publisher` allowlist.
2000 let off_host_url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2001 let res = try_fetch_oa_pdf(&doi, &off_host_url, &ctx).await;
2002
2003 // 1. Same error variant the redirect closure produces.
2004 let err = match res {
2005 Err(e @ HttpError::RedirectDenied { .. }) => e,
2006 other => {
2007 panic!("expected Err(RedirectDenied) from the pre-fetch check, got: {other:?}")
2008 }
2009 };
2010 match &err {
2011 HttpError::RedirectDenied {
2012 source_key,
2013 host,
2014 expected_hosts,
2015 } => {
2016 assert_eq!(source_key, "oa-publisher");
2017 // The host is lowercased, exactly as the redirect closure
2018 // would record it.
2019 assert_eq!(
2020 host,
2021 off_host_url
2022 .host_str()
2023 .expect("wiremock host")
2024 .to_ascii_lowercase()
2025 .as_str()
2026 );
2027 assert_eq!(
2028 expected_hosts,
2029 &vec!["allowed-publisher.example.com".to_string()]
2030 );
2031 }
2032 _ => unreachable!(),
2033 }
2034
2035 // 2. The OA fetch was NEVER issued — the wiremock origin saw zero
2036 // requests, so no PDF bytes were requested or written.
2037 assert!(
2038 server
2039 .received_requests()
2040 .await
2041 .unwrap_or_default()
2042 .is_empty(),
2043 "the off-allowlist OA URL must NOT be fetched: the pre-check \
2044 (REDIRECT_ALLOWLIST.md §1) rejects it before any request is \
2045 issued; wiremock recorded request(s)",
2046 );
2047
2048 // 3. The structured denial side-channel is byte-identical to the
2049 // redirect-closure path: `RedirectNotInAllowlist`, source key,
2050 // attempted host, expected allowlist snapshot.
2051 let dc: Option<DenialContext> = (&err).into();
2052 let dc = dc.expect("pre-fetch RedirectDenied -> Some(DenialContext)");
2053 assert_eq!(dc.reason, DenialReason::RedirectNotInAllowlist);
2054 assert_eq!(dc.source.as_deref(), Some("oa-publisher"));
2055 assert_eq!(
2056 dc.attempted,
2057 Some(off_host_url.host_str().expect("host").to_ascii_lowercase()),
2058 "attempted host must be the rejected OA URL host, lowercased — \
2059 identical to what the redirect closure records",
2060 );
2061 assert_eq!(
2062 dc.expected,
2063 Some(vec!["allowed-publisher.example.com".to_string()]),
2064 );
2065
2066 // 4. Provenance: exactly the `Fetch`/`err`/`oa-publisher`/
2067 // `NETWORK_ERROR` row the post-fetch redirect-denied arm emits
2068 // (same row kind + source key + closed-set code).
2069 let log_txt = std::fs::read_to_string(&log_path).expect("read provenance log");
2070 let fetch_err_row = log_txt
2071 .lines()
2072 .filter_map(|l| serde_json::from_str::<serde_json::Value>(l).ok())
2073 .find(|v| {
2074 v.get("event").and_then(|e| e.as_str()) == Some("fetch")
2075 && v.get("result").and_then(|r| r.as_str()) == Some("err")
2076 })
2077 .expect("a Fetch/err provenance row was written");
2078 assert_eq!(
2079 fetch_err_row.get("source").and_then(|s| s.as_str()),
2080 Some("oa-publisher"),
2081 );
2082 assert_eq!(
2083 fetch_err_row.get("error_code").and_then(|c| c.as_str()),
2084 Some("NETWORK_ERROR"),
2085 );
2086 assert_eq!(
2087 fetch_err_row.get("ref").and_then(|r| r.as_str()),
2088 Some("10.1234/example"),
2089 );
2090 }
2091
2092 // Issue #145 positive / no-regression: an ON-allowlist OA URL still
2093 // fetches the PDF normally. The pre-fetch check must be a pure gate —
2094 // it must not perturb the happy path.
2095 #[tokio::test]
2096 async fn try_fetch_oa_pdf_on_allowlist_host_still_fetches_pdf_no_regression_145() {
2097 use crate::http::HttpClient;
2098 use crate::provenance::ProvenanceLog;
2099 use crate::rate_limiter::RateLimiter;
2100 use crate::{Doi, RateLimits};
2101 use std::sync::Arc;
2102 use wiremock::matchers::method;
2103 use wiremock::{Mock, MockServer, ResponseTemplate};
2104
2105 let server = MockServer::start().await;
2106 let body = b"%PDF-1.7\nhello pdf".to_vec();
2107 Mock::given(method("GET"))
2108 .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
2109 .mount(&server)
2110 .await;
2111 // The wiremock host IS the registered `oa-publisher` allowlist, so
2112 // the pre-check passes and the fetch proceeds as before.
2113 let host = server
2114 .uri()
2115 .parse::<url::Url>()
2116 .expect("uri")
2117 .host_str()
2118 .expect("host")
2119 .to_string();
2120
2121 let td = tempfile::TempDir::new().expect("tempdir");
2122 let log_path = Utf8Path::from_path(td.path())
2123 .expect("utf-8")
2124 .join("log.jsonl");
2125 let ctx = FetchContext {
2126 http: Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host)),
2127 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2128 log: Arc::new(
2129 ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2130 .expect("provenance log"),
2131 ),
2132 session_id: "01J0000000000000000000TEST".into(),
2133 };
2134
2135 let doi = Doi("10.1234/example".to_string());
2136 let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2137 let (bytes, _final_url) = try_fetch_oa_pdf(&doi, &url, &ctx)
2138 .await
2139 .expect("on-allowlist OA URL still fetches the PDF");
2140 assert_eq!(bytes, body, "PDF bytes must be returned unchanged");
2141 }
2142
2143 // Issue #145: the pre-fetch denial and the redirect-closure denial
2144 // MUST produce a byte-identical `DenialContext` so PR #162's CLI
2145 // classification (CAPABILITY_DENIED / exit 3) handles both unchanged.
2146 // This pins the equivalence at the value level: the same source key +
2147 // host + allowlist snapshot map through the SAME
2148 // `From<&HttpError> for Option<DenialContext>` impl to equal structs.
#[test]
fn pre_fetch_denial_produces_byte_identical_denial_context_as_redirect_denied_145() {
    use crate::{DenialContext, DenialReason};

    // Both denial sites are modelled with the same variant and inputs:
    // source key, rejected host, allowlist snapshot.
    let denied = || HttpError::RedirectDenied {
        source_key: "oa-publisher".to_string(),
        host: "attacker.test".to_string(),
        expected_hosts: vec!["*.springer.com".to_string(), "*.plos.org".to_string()],
    };
    // Shape produced by the pre-fetch check in `try_fetch_oa_pdf`.
    let pre_fetch = denied();
    // Shape produced by the redirect closure in `crate::http` for the
    // identical inputs.
    let redirect_closure = denied();

    let dc_pre = Option::<DenialContext>::from(&pre_fetch).expect("pre-fetch -> Some");
    let dc_red = Option::<DenialContext>::from(&redirect_closure).expect("redirect -> Some");

    // The two contexts must compare equal as whole values...
    assert_eq!(dc_pre, dc_red);

    // ...and carry the expected reason, source, attempted host and
    // allowlist snapshot, with every auxiliary channel left as None.
    assert_eq!(dc_pre.reason, DenialReason::RedirectNotInAllowlist);
    assert_eq!(dc_pre.source.as_deref(), Some("oa-publisher"));
    assert_eq!(dc_pre.attempted.as_deref(), Some("attacker.test"));
    assert_eq!(
        dc_pre.expected,
        Some(vec!["*.springer.com".to_string(), "*.plos.org".to_string()]),
    );
    assert_eq!(dc_pre.hop_index, None);
    assert_eq!(dc_pre.cap, None);
    assert_eq!(dc_pre.actual, None);
}
2186
2187 // -----------------------------------------------------------------
2188 // #139 — metadata_only_to_store writes the metadata TOML;
2189 // resolve_only / pure metadata_only write NOTHING.
2190 // -----------------------------------------------------------------
2191
2192 /// Build a ctx + FsStore under a fresh tempdir and point Crossref at
2193 /// a wiremock origin that returns one minimal `message`. Returns
2194 /// `(server, ctx, store, store_root, _td)` — `_td` keeps the tempdir
2195 /// alive for the test body.
2196 async fn md139_harness() -> (
2197 wiremock::MockServer,
2198 FetchContext,
2199 crate::store::FsStore,
2200 Utf8PathBuf,
2201 tempfile::TempDir,
2202 ) {
2203 use crate::http::HttpClient;
2204 use crate::provenance::ProvenanceLog;
2205 use crate::rate_limiter::RateLimiter;
2206 use crate::store::FsStore;
2207 use crate::RateLimits;
2208 use std::sync::Arc;
2209 use wiremock::matchers::method;
2210 use wiremock::{Mock, MockServer, ResponseTemplate};
2211
2212 let server = MockServer::start().await;
2213 Mock::given(method("GET"))
2214 .respond_with(ResponseTemplate::new(200).set_body_string(
2215 r#"{"status":"ok","message":{"title":["Example Paper"],"author":[{"given":"Ada","family":"Lovelace"}]}}"#,
2216 ))
2217 .mount(&server)
2218 .await;
2219 std::env::set_var("DOIGET_CROSSREF_BASE", server.uri());
2220
2221 // wiremock serves http://127.0.0.1:PORT; the production client is
2222 // https_only, so the test ctx uses the allow-http test client
2223 // scoped to the crossref/unpaywall source keys + the wiremock host.
2224 let host = server
2225 .uri()
2226 .parse::<url::Url>()
2227 .expect("uri")
2228 .host_str()
2229 .expect("host")
2230 .to_string();
2231
2232 let td = tempfile::TempDir::new().expect("tempdir");
2233 let base = Utf8Path::from_path(td.path()).expect("utf-8");
2234 let log_path = base.join("log.jsonl");
2235 let store_root = base.join("papers");
2236 let ctx = FetchContext {
2237 http: Arc::new(HttpClient::new_for_tests_allow_http_multi(&[
2238 ("crossref", &host),
2239 ("unpaywall", &host),
2240 ])),
2241 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2242 log: Arc::new(
2243 ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2244 .expect("provenance log"),
2245 ),
2246 session_id: "01J0000000000000000000TEST".into(),
2247 };
2248 let store = FsStore::new(store_root.clone()).expect("fs store");
2249 (server, ctx, store, store_root, td)
2250 }
2251
2252 fn metadata_dir_tomls(store_root: &Utf8Path) -> Vec<Utf8PathBuf> {
2253 let md = store_root.join(".metadata");
2254 match std::fs::read_dir(md.as_std_path()) {
2255 Ok(rd) => rd
2256 .filter_map(|e| e.ok())
2257 .filter_map(|e| Utf8PathBuf::from_path_buf(e.path()).ok())
2258 .filter(|p| p.extension() == Some("toml"))
2259 .collect(),
2260 Err(_) => Vec::new(),
2261 }
2262 }
2263
2264 #[tokio::test]
2265 #[serial_test::serial]
2266 async fn metadata_only_to_store_writes_metadata_toml_139() {
2267 let (_server, ctx, store, store_root, _td) = md139_harness().await;
2268 let profile = CapabilityProfile::from_env().expect("clean env");
2269 let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));
2270
2271 let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
2272 .await
2273 .expect("metadata_only_to_store ok");
2274 assert_eq!(outcome.source, "crossref");
2275
2276 let tomls = metadata_dir_tomls(&store_root);
2277 assert_eq!(
2278 tomls.len(),
2279 1,
2280 "exactly one .metadata/*.toml must be written (MCP_TOOLS.md §11 SIDE EFFECT, #139); got {tomls:?}"
2281 );
2282 let body = std::fs::read_to_string(&tomls[0]).expect("read metadata toml");
2283 let meta: crate::store::Metadata = toml::from_str(&body).expect("parse metadata toml");
2284 assert_eq!(meta.title, "Example Paper");
2285 assert_eq!(
2286 meta.doi.as_ref().map(|d| d.as_str()),
2287 Some("10.1234/example")
2288 );
2289 let ext = meta.doiget.expect("[doiget] table present");
2290 assert_eq!(ext.source, "crossref");
2291 assert_eq!(ext.size_bytes, 0, "metadata-only entry has no PDF");
2292
2293 std::env::remove_var("DOIGET_CROSSREF_BASE");
2294 }
2295
2296 #[tokio::test]
2297 #[serial_test::serial]
2298 async fn resolve_only_and_pure_metadata_only_write_nothing_139() {
2299 let (_server, ctx, _store, store_root, _td) = md139_harness().await;
2300 let profile = CapabilityProfile::from_env().expect("clean env");
2301 let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));
2302
2303 // resolve_only: contractually MUST NOT touch the store.
2304 let r = resolve_only(&ref_, &profile, &ctx)
2305 .await
2306 .expect("resolve_only ok");
2307 assert_eq!(r.source, "crossref");
2308 assert!(
2309 metadata_dir_tomls(&store_root).is_empty(),
2310 "resolve_only MUST NOT write a metadata TOML (docs/MCP_TOOLS.md §1; #139)"
2311 );
2312
2313 // The pure metadata_only is also write-free (the store-write
2314 // lives only in metadata_only_to_store).
2315 let m = metadata_only(&ref_, &profile, &ctx)
2316 .await
2317 .expect("metadata_only ok");
2318 assert_eq!(m.source, "crossref");
2319 assert!(
2320 metadata_dir_tomls(&store_root).is_empty(),
2321 "pure metadata_only MUST NOT write to the store (#139)"
2322 );
2323
2324 std::env::remove_var("DOIGET_CROSSREF_BASE");
2325 }
2326
2327 /// #139 — the arXiv branch of `metadata_only_to_store` must also
2328 /// write the metadata TOML (different code path: Atom feed,
2329 /// source="arxiv", license="arxiv-default", doi=None). Review I3/C1.
2330 #[tokio::test]
2331 #[serial_test::serial]
2332 async fn metadata_only_to_store_arxiv_writes_metadata_toml_139() {
2333 use crate::http::HttpClient;
2334 use crate::provenance::ProvenanceLog;
2335 use crate::rate_limiter::RateLimiter;
2336 use crate::store::FsStore;
2337 use crate::RateLimits;
2338 use std::sync::Arc;
2339 use wiremock::matchers::method;
2340 use wiremock::{Mock, MockServer, ResponseTemplate};
2341
2342 let atom = r#"<?xml version="1.0" encoding="UTF-8"?>
2343<feed xmlns="http://www.w3.org/2005/Atom">
2344 <entry>
2345 <id>http://arxiv.org/abs/2401.12345v1</id>
2346 <published>2024-01-15T00:00:00Z</published>
2347 <title>Example arXiv Paper Title</title>
2348 <summary>Example abstract.</summary>
2349 <author><name>Jane Doe</name></author>
2350 <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
2351 </entry>
2352</feed>"#;
2353 let server = MockServer::start().await;
2354 Mock::given(method("GET"))
2355 .respond_with(ResponseTemplate::new(200).set_body_string(atom))
2356 .mount(&server)
2357 .await;
2358 std::env::set_var("DOIGET_ARXIV_BASE", server.uri());
2359 let host = server
2360 .uri()
2361 .parse::<url::Url>()
2362 .expect("uri")
2363 .host_str()
2364 .expect("host")
2365 .to_string();
2366
2367 let td = tempfile::TempDir::new().expect("tempdir");
2368 let base = Utf8Path::from_path(td.path()).expect("utf-8");
2369 let store_root = base.join("papers");
2370 let ctx = FetchContext {
2371 http: Arc::new(HttpClient::new_for_tests_allow_http("arxiv", &host)),
2372 rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2373 log: Arc::new(
2374 ProvenanceLog::open(base.join("log.jsonl"), "01J0000000000000000000TEST".into())
2375 .expect("provenance log"),
2376 ),
2377 session_id: "01J0000000000000000000TEST".into(),
2378 };
2379 let store = FsStore::new(store_root.clone()).expect("fs store");
2380 let profile = CapabilityProfile::from_env().expect("clean env");
2381 let ref_ = Ref::Arxiv(crate::ArxivId::parse("2401.12345").expect("arxiv id"));
2382
2383 let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
2384 .await
2385 .expect("metadata_only_to_store (arxiv) ok");
2386 assert_eq!(outcome.source, "arxiv");
2387
2388 let tomls = metadata_dir_tomls(&store_root);
2389 assert_eq!(
2390 tomls.len(),
2391 1,
2392 "arXiv metadata-only must write one TOML; got {tomls:?}"
2393 );
2394 let meta: crate::store::Metadata =
2395 toml::from_str(&std::fs::read_to_string(&tomls[0]).expect("read")).expect("parse");
2396 assert_eq!(meta.title, "Example arXiv Paper Title");
2397 assert_eq!(
2398 meta.arxiv_id.as_ref().map(|a| a.as_str()),
2399 Some("2401.12345")
2400 );
2401 assert!(meta.doi.is_none(), "arXiv entry has no DOI");
2402 let ext = meta.doiget.expect("[doiget] table");
2403 assert_eq!(ext.source, "arxiv");
2404 assert_eq!(ext.license, "arxiv-default");
2405
2406 std::env::remove_var("DOIGET_ARXIV_BASE");
2407 }
2408
2409 // ----- pure-function unit tests for the #139 extraction helpers ----
2410
#[test]
fn extract_metadata_title_handles_string_array_missing_blank() {
    use serde_json::json;

    // (input metadata, expected title)
    let cases = [
        // bare string (arXiv/Unpaywall shape)
        (json!({"title": "Hello"}), Some("Hello")),
        // single-element array (Crossref `message.title` in practice)
        (json!({"title": ["Real Title"]}), Some("Real Title")),
        // missing key -> None (caller falls back to ref id)
        (json!({"x": 1}), None),
        // blank string -> None (must not persist an empty title)
        (json!({"title": " "}), None),
        // empty array -> None
        (json!({"title": []}), None),
        // A leading blank element is skipped and the first non-blank
        // element is taken, so a stray empty entry cannot mask the real
        // Crossref title.
        (json!({"title": [" ", "Real Title"]}), Some("Real Title")),
        // all-blank array -> None (caller falls back to ref id)
        (json!({"title": [" ", ""]}), None),
    ];

    for (input, expected) in cases {
        assert_eq!(
            extract_metadata_title(&input).as_deref(),
            expected,
            "input: {input}"
        );
    }
}
2440
#[test]
fn extract_metadata_authors_handles_each_resolver_shape() {
    use serde_json::json;

    // (input metadata, expected author list)
    let cases: Vec<(serde_json::Value, Vec<&str>)> = vec![
        // arXiv: authors: [String]
        (
            json!({"authors": ["Jane Doe", "John Roe"]}),
            vec!["Jane Doe", "John Roe"],
        ),
        // Crossref: author: [{given,family}]
        (
            json!({"author": [{"given": "Ada", "family": "Lovelace"}]}),
            vec!["Ada Lovelace"],
        ),
        // family-only (given absent) -> trimmed, no leading space
        (json!({"author": [{"family": "Onsager"}]}), vec!["Onsager"]),
        // `name` fallback when given+family both absent
        (json!({"author": [{"name": "K. Wilson"}]}), vec!["K. Wilson"]),
        // z_authors fallback shape (forward-compat branch)
        (
            json!({"z_authors": [{"given": "L", "family": "Kadanoff"}]}),
            vec!["L Kadanoff"],
        ),
        // nothing parseable -> empty (still a valid TOML)
        (json!({"x": 1}), vec![]),
        (json!({"authors": []}), vec![]),
    ];

    for (input, expected) in cases {
        assert_eq!(
            extract_metadata_authors(&input),
            expected,
            "input: {input}"
        );
    }
}
2473}