Skip to main content

doiget_core/
orchestrator.rs

1//! Cross-source orchestrators that compose multiple [`Source`] impls into
2//! a single user-facing operation.
3//!
4//! Slice 2 of the doiget roadmap promotes [`fetch_paper`] and
5//! [`batch_fetch`] from `doiget-cli` into this module so the MCP server
6//! (`doiget-mcp`) and the CLI share one source of truth for the per-ref
7//! orchestration. The CLI's `commands::fetch::fetch_one` is now a thin
8//! wrapper that delegates here and adds the human-facing stderr print
9//! line. Dry-run preview helpers live as [`fetch_paper_plan`] and
10//! [`batch_fetch_plans`].
11//!
12//! [`Source`]: crate::source::Source
13
14use std::collections::BTreeMap;
15
16use camino::{Utf8Path, Utf8PathBuf};
17use chrono::Utc;
18use serde_json::Value;
19
20use crate::dry_run::{build_fetch_plan, try_build_fetch_plan, FetchPlan};
21use crate::http::HttpError;
22use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
23use crate::source::{FetchContext, FetchError, FetchResult, Source};
24use crate::sources::arxiv::ArxivSource;
25use crate::sources::crossref::CrossrefSource;
26use crate::sources::unpaywall::UnpaywallSource;
27use crate::store::{DoigetExtension, Metadata, Store};
28use crate::{ArxivId, CapabilityProfile, Doi, Ref, Safekey, MAX_BATCH_REFS, SCHEMA_VERSION};
29
30/// Outcome of a successful [`metadata_only`] call.
31///
32/// Mirrors the wire shape documented in `docs/MCP_TOOLS.md` §11: the
33/// `source` identifies which resolver produced the metadata, `license`
34/// is the OA license string when known (Unpaywall channel), `oa_url` is
35/// the discovered OA URL **(never followed by this orchestrator)**, and
36/// `metadata` is the source's native JSON payload (Crossref `message`,
37/// Unpaywall work record, or the parsed arXiv Atom-feed object).
38///
39/// `metadata` is serialized as-is by the MCP envelope builder
40/// (`crates/doiget-mcp/src/lib.rs`); we deliberately do NOT normalize
41/// here so the agent can see exactly what the source returned.
42#[derive(Debug, Clone)]
43#[non_exhaustive]
44pub struct MetadataOnlyOutcome {
45    /// Resolver key that produced the metadata payload. One of
46    /// `"crossref"`, `"unpaywall"`, `"arxiv"` (the closed set named in
47    /// `docs/MCP_TOOLS.md` §11 type alias).
48    pub source: String,
49    /// Resolver profile under which the canonical-digest (ADR-0021 §1)
50    /// was minted for this call. In Slice 4 this equals
51    /// [`Self::source`] verbatim (the metadata-only path emits one row
52    /// per consulted resolver); future slices that introduce overlapping
53    /// resolvers MAY have `resolver_profile != source`. Surfaced through
54    /// the `doiget_metadata_only` MCP envelope per ADR-0021 §4.
55    pub resolver_profile: String,
56    /// OA license string when the resolver could supply one (today only
57    /// the Unpaywall fallback path populates this). `None` when the
58    /// primary source did not surface a license.
59    pub license: Option<String>,
60    /// Discovered OA URL — surfaced to the caller for separate action,
61    /// **never followed by this orchestrator**. The Crossref response's
62    /// `message.link[]` array is mined first; the Unpaywall fallback
63    /// path uses `best_oa_location.url_for_pdf` (or `url`).
64    pub oa_url: Option<String>,
65    /// Source's native metadata payload. For Crossref this is the
66    /// `message` object; for Unpaywall the work record; for arXiv the
67    /// parsed Atom-feed JSON (see
68    /// `crate::sources::arxiv::parse_atom_feed`).
69    pub metadata: Value,
70}
71
72/// Resolve a [`Ref`] to metadata WITHOUT triggering a publisher PDF
73/// fetch.
74///
75/// Binding spec: `docs/MCP_TOOLS.md` §11 (NORMATIVE — this function
76/// MUST NOT call [`crate::http::HttpClient::fetch_pdf`] under any code
77/// path). The posture-lint workflow greps for that pattern; the test
78/// suite additionally exercises the DOI and arXiv branches end-to-end
79/// against wiremock to assert the OA URL is reported, not followed.
80///
81/// # Dispatch
82///
83/// - `Ref::Doi(_)` → Crossref first (bibliographic metadata + OA URL
84///   via `message.link[]`). If Crossref returns a usable payload the
85///   call returns immediately; Unpaywall is consulted only as a fallback
86///   when Crossref fails. The Unpaywall fallback surfaces a license
87///   string and may overwrite `oa_url` with the `best_oa_location`
88///   channel.
89/// - `Ref::Arxiv(_)` → [`ArxivSource::fetch_metadata_only`]: ONLY the
90///   Atom feed (`https://export.arxiv.org/api/query?id_list=<id>`) is
91///   consulted; the PDF endpoint is NOT touched. `license` is set to
92///   the platform-wide `"arxiv-default"` token, `oa_url` is `None`
93///   (the arXiv abstract page is not a PDF URL).
94///
95/// # Side effects
96///
97/// Each consulted source appends ONE `LogEvent::Fetch` row to
98/// `ctx.log` (arXiv emits its row under `Capability::Metadata`; the
99/// DOI sources emit under `Capability::Oa` — they pre-date this
100/// distinction and a follow-up slice may unify them). The orchestrator
101/// itself does NOT bracket the call with `SessionStart` / `SessionEnd`
102/// rows — that is the MCP server's responsibility (it owns the
103/// per-tool-call session boundary).
104///
105/// This function is the **pure resolver**: it consults the source(s)
106/// and emits provenance rows, but it does NOT write to the store.
107/// The `docs/MCP_TOOLS.md` §11 store-write SIDE EFFECT is provided by
108/// [`metadata_only_to_store`], which wraps this and persists the
109/// metadata TOML to `<root>/.metadata/<safekey>.toml`. Keeping the
110/// store-write in a *separate* entry point is exactly what lets
111/// [`resolve_only`] safely delegate here — its contract forbids any
112/// store write, and a pure `metadata_only` can never regress that
113/// invariant (#139).
114///
115/// # Errors
116///
117/// Returns [`FetchError`] from the underlying [`Source`] dispatch. The
118/// MCP boundary converts these to the closed [`crate::ErrorCode`] set
119/// via the existing `From<FetchError> for ErrorCode` impl.
120// Stays `pub` (a `pub(crate)` compile-time guard was considered and
121// rejected): `crates/doiget-core/tests/` integration tests
122// (`real_world_fixtures_e2e`) legitimately drive the PURE resolver
123// directly and assert its outcome, and `tests/` compiles as a separate
124// crate. The #139 pre-fix bug (an MCP caller
125// picking the pure variant when it needed persistence) is instead
126// prevented *structurally*: the MCP layer imports only
127// `metadata_only_to_store`, and `resolve_only` delegates to this pure
128// fn — neither can acquire or skip the store-write by mistake.
129pub async fn metadata_only(
130    ref_: &Ref,
131    profile: &CapabilityProfile,
132    ctx: &FetchContext,
133) -> Result<MetadataOnlyOutcome, FetchError> {
134    match ref_ {
135        Ref::Doi(doi) => metadata_only_doi(doi, ref_, profile, ctx).await,
136        Ref::Arxiv(id) => {
137            let arxiv = arxiv_source_from_env();
138            let metadata = arxiv.fetch_metadata_only(id, ctx).await?;
139            // Pure resolver — no store write here (see fn doc); the
140            // store-write side effect lives in `metadata_only_to_store`.
141            Ok(MetadataOnlyOutcome {
142                source: arxiv.name().to_string(),
143                resolver_profile: arxiv.name().to_string(),
144                license: Some("arxiv-default".to_string()),
145                oa_url: None,
146                metadata,
147            })
148        }
149    }
150}
151
152/// Resolve a [`Ref`] to metadata with **no local persistence**.
153///
154/// This is the audit-trail-preserving sibling of [`metadata_only`]: each
155/// consulted [`Source`] still emits its own `LogEvent::Fetch` row
156/// through `ctx.log` (so the provenance hash chain remains continuous,
157/// per `docs/PROVENANCE_LOG.md`), but the orchestrator MUST NOT write
158/// the metadata TOML to the store under any code path — present or
159/// future.
160///
161/// Binding spec: `docs/MCP_TOOLS.md` §1 (the `doiget_resolve_paper`
162/// tool — Slice 7).
163///
164/// # Why this exists as a distinct orchestrator
165///
166/// [`metadata_only`] is the **pure resolver** and never writes to the
167/// store; the store-write SIDE EFFECT lives only in the separate
168/// [`metadata_only_to_store`] wrapper. Because the write is in a
169/// *different* entry point that this function does not call,
170/// delegating to [`metadata_only`] is permanently safe — there is no
171/// code path by which `resolve_only` can acquire a store write, now or
172/// in future (#139). This structural separation is the entire reason
173/// `metadata_only` was split into a pure core + a persisting wrapper
174/// rather than gaining a `write: bool` parameter.
175///
176/// # Dispatch
177///
178/// Identical to [`metadata_only`] (DOI → Crossref-first with Unpaywall
179/// fallback; arXiv → Atom feed only). The `oa_url` and `license`
180/// outputs follow the same rules.
181///
182/// # Side effects
183///
184/// One `LogEvent::Fetch` row per consulted resolver, written by the
185/// underlying [`Source`] impls. No metadata TOML write. No PDF fetch.
186/// No store mutation.
187///
188/// # Errors
189///
190/// Returns [`FetchError`] from the underlying [`Source`] dispatch,
191/// identical to [`metadata_only`].
192pub async fn resolve_only(
193    ref_: &Ref,
194    profile: &CapabilityProfile,
195    ctx: &FetchContext,
196) -> Result<MetadataOnlyOutcome, FetchError> {
197    // Delegating to the PURE `metadata_only` is the contract-correct
198    // implementation, not a placeholder: `metadata_only` never writes
199    // to the store (the persisting path is the separate
200    // `metadata_only_to_store`, which this function does not call), so
201    // `resolve_only`'s "no store mutation" guarantee holds structurally
202    // and cannot regress (#139).
203    metadata_only(ref_, profile, ctx).await
204}
205
206/// Resolve a [`Ref`] to metadata **and persist the metadata TOML to the
207/// store** — the `docs/MCP_TOOLS.md` §11 `doiget_metadata_only` SIDE
208/// EFFECT (#139).
209///
210/// Wraps the pure [`metadata_only`]: it runs the same resolver dispatch
211/// (so the provenance hash chain is identical), then writes
212/// `<root>/.metadata/<safekey>.toml` via the same
213/// `write_metadata_and_pdf` path `fetch_paper` uses for its
214/// metadata-only fallback, emitting one `StoreWrite` provenance row.
215///
216/// [`resolve_only`] MUST NOT call this — its contract forbids any store
217/// write. The split (pure core vs. persisting wrapper) makes that
218/// invariant structural rather than a convention.
219///
220/// # Errors
221///
222/// [`FetchError`] from the underlying resolver dispatch, or — if the
223/// store write fails — [`FetchError::SourceSchema`] (the closest
224/// closed-set arm; there is no dedicated `FetchError::StoreError`, so
225/// the MCP boundary maps it to `INTERNAL_ERROR` — see the inline note
226/// in `write_metadata_and_pdf`). On store-write failure
227/// `write_metadata_and_pdf` makes a **best-effort** attempt to
228/// append a `StoreWrite`/`Err` provenance row before the error
229/// propagates (that append's own failure is not separately surfaced —
230/// this matches the pre-existing `fetch_paper` metadata-only fallback
231/// path and is out of scope for #139).
232pub async fn metadata_only_to_store(
233    ref_: &Ref,
234    profile: &CapabilityProfile,
235    ctx: &FetchContext,
236    store: &dyn Store,
237) -> Result<MetadataOnlyOutcome, FetchError> {
238    let outcome = metadata_only(ref_, profile, ctx).await?;
239    let safekey = ref_.safekey();
240    let metadata = build_metadata_only_metadata(ref_, &outcome);
241    // `pdf_src = None` => writes `<root>/.metadata/<safekey>.toml` and
242    // appends the `StoreWrite` row (the exact path `fetch_paper` uses
243    // for its DOI metadata-only fallback).
244    write_metadata_and_pdf(store, &safekey, &metadata, None, ctx)?;
245    Ok(outcome)
246}
247
248/// Build the [`Metadata`] persisted by [`metadata_only_to_store`].
249///
250/// Minimal but valid: enough that a subsequent `doiget_info` returns a
251/// non-null `metadata` object (the #139 acceptance criterion). Title is
252/// best-effort from the resolver payload (`title` as a string, or the
253/// first element if it is an array — Crossref's `message.title` is
254/// typically an array, arXiv/Unpaywall typically a string; the
255/// extractor tolerates either regardless of source); it falls back to
256/// the ref id so the required `title` field is never empty.
257/// Bibliographic enrichment
258/// (year, venue, …) is intentionally out of scope here — the
259/// metadata-only contract is "persist what the resolver returned", and
260/// the raw payload is preserved verbatim in `MetadataOnlyOutcome`.
261fn build_metadata_only_metadata(ref_: &Ref, outcome: &MetadataOnlyOutcome) -> Metadata {
262    let (doi, arxiv_id) = match ref_ {
263        Ref::Doi(d) => (Some(d.clone()), None),
264        Ref::Arxiv(a) => (None, Some(a.clone())),
265    };
266    let ref_id = ref_.as_input_str().to_string();
267    let title = match extract_metadata_title(&outcome.metadata) {
268        Some(t) => t,
269        None => {
270            // The resolver returned a payload with no usable title.
271            // Persisting the ref id keeps the entry valid (#139), but
272            // emit a diagnostic so a broken/partial resolver response is
273            // not silently indistinguishable from a genuine title.
274            tracing::warn!(
275                ref_id = %ref_id,
276                source = %outcome.source,
277                "metadata-only: no usable title in resolver payload; \
278                 persisting the ref id as the title placeholder"
279            );
280            ref_id
281        }
282    };
283    Metadata {
284        schema_version: SCHEMA_VERSION.to_string(),
285        title,
286        authors: extract_metadata_authors(&outcome.metadata),
287        year: None,
288        doi,
289        arxiv_id,
290        abstract_: None,
291        venue: None,
292        publisher: None,
293        issn: None,
294        isbn: None,
295        type_: None,
296        keywords: Vec::new(),
297        url: outcome.oa_url.clone(),
298        pdf_path: None,
299        doiget: Some(DoigetExtension {
300            fetched_at: Utc::now(),
301            source: outcome.source.clone(),
302            license: outcome
303                .license
304                .clone()
305                .unwrap_or_else(|| "unknown".to_string()),
306            size_bytes: 0,
307            mcp_call_id: None,
308        }),
309        other: BTreeMap::new(),
310    }
311}
312
313/// `title` from a resolver payload: a bare string, or the first
314/// **non-blank** element of an array (Crossref `message.title` is
315/// `[String]`; a leading empty/whitespace element is skipped rather
316/// than masking the real title). Trimmed. `None` if absent/blank.
317fn extract_metadata_title(meta: &Value) -> Option<String> {
318    let t = meta.get("title")?;
319    let s = match t.as_str() {
320        Some(s) => s.trim().to_string(),
321        None => t
322            .as_array()?
323            .iter()
324            .filter_map(Value::as_str)
325            .map(str::trim)
326            .find(|s| !s.is_empty())?
327            .to_string(),
328    };
329    if s.is_empty() {
330        None
331    } else {
332        Some(s)
333    }
334}
335
336/// Best-effort author list, tolerant of the resolver shapes we may see:
337/// Crossref `author: [{given,family}]`, arXiv `authors: [String]`, and
338/// a `z_authors: [{given,family}]` fallback. NOTE: doiget's Unpaywall
339/// source deserializes a *partial* `UnpaywallWork` that does not capture
340/// `z_authors`, so the `z_authors` branch is currently inert for the
341/// Unpaywall path (kept as forward-compat for if/when that struct
342/// captures it) — Unpaywall-sourced metadata-only entries get an empty
343/// author list. Returns `Vec::new()` when nothing is parseable (a valid
344/// metadata TOML — #139 only requires the entry to exist and be
345/// readable).
346fn extract_metadata_authors(meta: &Value) -> Vec<String> {
347    if let Some(arr) = meta.get("authors").and_then(Value::as_array) {
348        let v: Vec<String> = arr
349            .iter()
350            .filter_map(|a| a.as_str().map(str::to_string))
351            .collect();
352        if !v.is_empty() {
353            return v;
354        }
355    }
356    for key in ["author", "z_authors"] {
357        if let Some(arr) = meta.get(key).and_then(Value::as_array) {
358            let v: Vec<String> = arr
359                .iter()
360                .filter_map(|a| {
361                    let given = a.get("given").and_then(Value::as_str).unwrap_or("");
362                    let family = a.get("family").and_then(Value::as_str).unwrap_or("");
363                    let name = format!("{given} {family}");
364                    let name = name.trim();
365                    if name.is_empty() {
366                        a.get("name").and_then(Value::as_str).map(str::to_string)
367                    } else {
368                        Some(name.to_string())
369                    }
370                })
371                .collect();
372            if !v.is_empty() {
373                return v;
374            }
375        }
376    }
377    Vec::new()
378}
379
380// ---------------------------------------------------------------------------
381// Env-aware source constructors (mirrors doiget-cli::commands::fetch::build_*)
382//
383// These let MCP integration tests redirect the orchestrator at a
384// wiremock origin via `DOIGET_*_BASE` env vars, without inverting the
385// `doiget-mcp -> doiget-core` wiring by depending on `doiget-cli`. The
386// override surface is identical to the CLI's `fetch.rs::build_*_source`
387// helpers so a single test fixture can drive both crates.
388// ---------------------------------------------------------------------------
389
/// Contact address used when `DOIGET_CONTACT_EMAIL` is not usable —
/// the same `doiget@localhost` the CLI uses
/// (`crates/doiget-cli/src/commands/fetch.rs::OrchestratorConfig`).
const FALLBACK_CONTACT_EMAIL: &str = "doiget@localhost";

/// Contact e-mail handed to the Crossref and Unpaywall sources.
///
/// Reads `DOIGET_CONTACT_EMAIL` and returns it trimmed. The fallback
/// [`FALLBACK_CONTACT_EMAIL`] is used when the variable is unset, is
/// not valid Unicode, or is empty / whitespace-only — a blank value
/// would otherwise be forwarded verbatim as the contact address.
fn contact_email_from_env() -> String {
    std::env::var("DOIGET_CONTACT_EMAIL")
        .ok()
        .map(|raw| raw.trim().to_string())
        .filter(|email| !email.is_empty())
        .unwrap_or_else(|| FALLBACK_CONTACT_EMAIL.to_string())
}
397
398fn arxiv_source_from_env() -> ArxivSource {
399    if let Ok(s) = std::env::var("DOIGET_ARXIV_BASE") {
400        if let Ok(url) = url::Url::parse(&s) {
401            return ArxivSource::with_base(url);
402        }
403    }
404    ArxivSource::new()
405}
406
407fn crossref_source_from_env(contact: &str) -> CrossrefSource {
408    if let Ok(s) = std::env::var("DOIGET_CROSSREF_BASE") {
409        if let Ok(url) = url::Url::parse(&s) {
410            return CrossrefSource::with_base(url, contact.to_string());
411        }
412    }
413    CrossrefSource::new(contact.to_string())
414}
415
416fn unpaywall_source_from_env(contact: &str) -> UnpaywallSource {
417    if let Ok(s) = std::env::var("DOIGET_UNPAYWALL_BASE") {
418        if let Ok(url) = url::Url::parse(&s) {
419            return UnpaywallSource::with_base(url, contact.to_string());
420        }
421    }
422    UnpaywallSource::new(contact.to_string())
423}
424
425/// DOI branch — Crossref first, with Unpaywall as a fallback when
426/// Crossref fails. Crossref's `message.link[]` array (when present)
427/// supplies the OA URL hint without making a publisher request.
428async fn metadata_only_doi(
429    _doi: &Doi,
430    ref_: &Ref,
431    profile: &CapabilityProfile,
432    ctx: &FetchContext,
433) -> Result<MetadataOnlyOutcome, FetchError> {
434    let contact = contact_email_from_env();
435    let crossref = crossref_source_from_env(&contact);
436    match crossref.fetch(ref_, profile, ctx).await {
437        Ok(res) => {
438            let metadata = res.metadata_json.unwrap_or(Value::Null);
439            let oa_url = extract_crossref_oa_url(&metadata);
440            // Pure resolver — no store write here (see `metadata_only`
441            // doc); persistence is `metadata_only_to_store`'s job.
442            Ok(MetadataOnlyOutcome {
443                source: crossref.name().to_string(),
444                resolver_profile: crossref.name().to_string(),
445                // Crossref does not surface a license directly; the
446                // license channel for DOI metadata is Unpaywall's
447                // `best_oa_location.license`. Leave `None` here; the
448                // agent can call `unpaywall` (or a follow-up slice's
449                // chained orchestrator) if it needs a license string.
450                license: None,
451                oa_url,
452                metadata,
453            })
454        }
455        Err(crossref_err) => {
456            // Crossref failed. Try Unpaywall as a fallback before
457            // surfacing the original error.
458            let unpaywall = unpaywall_source_from_env(&contact);
459            match unpaywall.fetch(ref_, profile, ctx).await {
460                Ok(res) => {
461                    let metadata = res.metadata_json.unwrap_or(Value::Null);
462                    let oa_url = extract_unpaywall_oa_url(&metadata);
463                    let license = if res.license == "unknown" {
464                        None
465                    } else {
466                        Some(res.license)
467                    };
468                    Ok(MetadataOnlyOutcome {
469                        source: unpaywall.name().to_string(),
470                        resolver_profile: unpaywall.name().to_string(),
471                        license,
472                        oa_url,
473                        metadata,
474                    })
475                }
476                Err(_unpaywall_err) => {
477                    // Both sources failed; surface the Crossref error
478                    // (the primary path) for diagnosability.
479                    Err(crossref_err)
480                }
481            }
482        }
483    }
484}
485
486/// Defensively pull a Crossref OA URL out of a `message.link[]` entry.
487///
488/// The Crossref `Link` model documents `link[].URL` as the OA URL string
489/// when the work has one (see
490/// `<https://api.crossref.org/swagger-ui/index.html>`). Multiple entries
491/// may be present; we return the first non-empty `URL` field
492/// encountered. Returns `None` if the array is missing, empty, or
493/// contains no usable URL string.
494fn extract_crossref_oa_url(msg: &Value) -> Option<String> {
495    let arr = msg.get("link")?.as_array()?;
496    arr.iter()
497        .filter_map(|entry| entry.get("URL").and_then(Value::as_str))
498        .find(|s| !s.is_empty())
499        .map(|s| s.to_string())
500}
501
502/// Defensively pull Unpaywall's preferred OA URL
503/// (`best_oa_location.url_for_pdf`, falling back to `.url`) out of a
504/// metadata payload.
505fn extract_unpaywall_oa_url(meta: &Value) -> Option<String> {
506    let loc = meta.get("best_oa_location")?;
507    loc.get("url_for_pdf")
508        .and_then(Value::as_str)
509        .or_else(|| loc.get("url").and_then(Value::as_str))
510        .map(|s| s.to_string())
511}
512
513// ---------------------------------------------------------------------------
514// fetch_paper — single-ref orchestrator (Slice 2)
515// ---------------------------------------------------------------------------
516
517/// Outcome of a successful [`fetch_paper`] call.
518///
519/// Wire shape mirrors `docs/MCP_TOOLS.md` §5 `FetchResult` minus the
520/// envelope chrome the MCP server wraps it in (`ok: true`, `ref`,
521/// optional `error`).
522///
523/// `path` is the absolute path of the resource the orchestrator wrote to
524/// the store. For arXiv refs and successful DOI OA-PDF fetches this is
525/// `<root>/<safekey>.pdf`; for the DOI metadata-only fallback (OA URL
526/// host off the `oa-publisher` allowlist, or PDF leg failed for another
527/// transport reason — `docs/REDIRECT_ALLOWLIST.md` §3 informed-best-
528/// effort posture) this is `<root>/.metadata/<safekey>.toml`.
529/// Outcome of the DOI OA-PDF leg, carried on [`FetchPaperOutcome`] so a
530/// caller can NEVER silently report a blocked PDF as a plain
531/// "metadata-only" success (issue #118). The product promise is
532/// "immediately explain WHY a paper can't be fetched" — the distinction
533/// between "there was no OA PDF to fetch" and "an OA PDF existed but we
534/// were blocked, and here is the reason" is exactly that explanation.
535#[derive(Debug, Clone)]
536#[non_exhaustive]
537pub enum PdfLegStatus {
538    /// A PDF was fetched and written to disk (arXiv always; DOI when
539    /// the OA-publisher leg succeeded).
540    Fetched,
541    /// No OA URL was discovered (Unpaywall reported no
542    /// `best_oa_location`). Metadata-only is the correct, expected
543    /// result here — not a failure.
544    NoOaUrl,
545    /// An OA URL *was* discovered but the PDF could not be retrieved
546    /// (host outside the oa-publisher allowlist, not-a-PDF body,
547    /// transport failure, …). Metadata was still written, but the
548    /// caller MUST surface this reason rather than pretending the
549    /// fetch was a clean metadata-only success.
550    Blocked {
551        /// Closed-set code, mapped from the underlying transport error
552        /// via the canonical `From<FetchError> for ErrorCode`.
553        code: crate::ErrorCode,
554        /// Human-readable one-line reason (the `FetchError` display).
555        message: String,
556        /// Structured denial side-channel (ADR-0023) when the failure
557        /// was an allowlist / scheme denial; `None` otherwise.
558        denial: Option<crate::DenialContext>,
559    },
560}
561
562/// What `fetch_paper` wrote to disk and how.
563///
564/// `path` is the PDF (`<root>/<safekey>.pdf`) on a successful PDF
565/// fetch, or the metadata TOML (`<root>/.metadata/<safekey>.toml`)
566/// when the DOI path fell back to metadata-only. [`Self::pdf_leg`]
567/// disambiguates *why* there is no PDF (genuinely none available vs.
568/// available-but-blocked) so callers never report a blocked PDF as a
569/// silent success (issue #118).
570#[derive(Debug, Clone)]
571#[non_exhaustive]
572pub struct FetchPaperOutcome {
573    /// `Source::name()` of the resolver whose payload landed on disk:
574    /// `"arxiv"` for an arXiv ref, `"oa-publisher"` when the DOI OA PDF
575    /// leg succeeded, or `"crossref"` / `"unpaywall"` when the DOI path
576    /// fell back to metadata-only. Mirrors the value written to
577    /// `[doiget].source` in the metadata TOML.
578    pub source: String,
579    /// Resolver profile under which the canonical-digest (ADR-0021 §1)
580    /// was minted for the final artifact. For an arXiv fetch this is
581    /// `"arxiv"`; for a successful DOI OA PDF leg this is
582    /// `"oa-publisher"`; for the DOI metadata-only fallback this is the
583    /// metadata source key (`"crossref"` / `"unpaywall"`). Equal to
584    /// [`Self::source`] verbatim in Slice 4 but kept distinct so future
585    /// slices can decouple "which resolver wrote to disk" from "which
586    /// resolver is the audit identity". Surfaced through the
587    /// `doiget_fetch_paper` MCP envelope per ADR-0021 §4.
588    pub resolver_profile: String,
589    /// OA license string (`"CC-BY-4.0"`, `"cc-by"`, `"arxiv-default"`,
590    /// `"unknown"`). Mirrors `[doiget].license`.
591    pub license: String,
592    /// Absolute path of the artifact actually written
593    /// (`<root>/<safekey>.pdf` on success, `<root>/.metadata/<safekey>.toml`
594    /// on metadata-only fallback).
595    pub path: Utf8PathBuf,
596    /// Stored PDF size in bytes; `0` on the metadata-only fallback
597    /// (`docs/REDIRECT_ALLOWLIST.md` §3.5).
598    pub size_bytes: u64,
599    /// The schema version of the metadata TOML written
600    /// (always [`crate::SCHEMA_VERSION`] for this build).
601    pub schema_version: String,
602    /// What happened on the PDF leg (issue #118). `Fetched` /
603    /// `NoOaUrl` are clean outcomes; `Blocked` carries the structured
604    /// reason an OA PDF existed but could not be retrieved, so the
605    /// CLI / MCP surface it instead of a silent metadata-only success.
606    pub pdf_leg: PdfLegStatus,
607    /// Per-ref [`crate::Safekey`] stringified (`Ref::safekey().as_str()`).
608    /// Exposed on the outcome so JSON-mode CLI / MCP callers can
609    /// emit a structured success body without re-parsing the input
610    /// ref (#210 / `docs/ERRORS.md` §3). Always populated.
611    pub safekey: String,
612    /// ADR-0021 §1 canonical-digest as 64-char lowercase hex for the
613    /// resolver_profile that produced this outcome's audit identity.
614    /// For an arXiv fetch this is the digest under `"arxiv"`; for a
615    /// DOI OA PDF leg this is under `"oa-publisher"`; for the DOI
616    /// metadata-only fallback this is under the metadata source key
617    /// (`"crossref"` / `"unpaywall"`). Always populated.
618    pub canonical_digest: String,
619}
620
621impl FetchPaperOutcome {
622    /// Test-only constructor for downstream crates (`doiget-cli`,
623    /// `doiget-mcp`) that need to drive classification / rendering
624    /// logic without running the full orchestrator. Produces a
625    /// minimal but structurally-valid outcome — all required fields
626    /// populated with defensible stubs — so unit tests can assert
627    /// the surrounding behavior (JSONL shape, exit-code mapping,
628    /// PDF-leg branching) in isolation.
629    ///
630    /// `#[doc(hidden)]` because this is not a stable public API; the
631    /// signature may change to fit test needs without a CHANGELOG
632    /// `[BREAKING]` callout.
633    #[doc(hidden)]
634    pub fn for_test_synthetic(
635        safekey: impl Into<String>,
636        source: impl Into<String>,
637        pdf_leg: PdfLegStatus,
638    ) -> Self {
639        let safekey: String = safekey.into();
640        let source: String = source.into();
641        Self {
642            source: source.clone(),
643            resolver_profile: source.clone(),
644            license: "unknown".to_string(),
645            path: Utf8PathBuf::from(format!("/tmp/{safekey}.pdf")),
646            size_bytes: 0,
647            schema_version: SCHEMA_VERSION.to_string(),
648            pdf_leg,
649            safekey: safekey.clone(),
650            // 32 bytes of `0x00` → a stable, non-secret digest stub
651            // that's still 64 chars of lowercase hex.
652            canonical_digest: "00".repeat(32),
653        }
654    }
655}
656
657/// Resolve a [`Ref`] to a PDF (or metadata-only fallback) and write it
658/// through `store`.
659///
660/// Binding spec: `docs/MCP_TOOLS.md` §4 (`doiget_fetch_paper`),
661/// `docs/REDIRECT_ALLOWLIST.md` §3 (informed-best-effort posture for the
662/// DOI OA PDF leg), `docs/PROVENANCE_LOG.md` §3 (per-attempt `Fetch` rows
663/// emitted by the source impls; `StoreWrite` row emitted by this
664/// orchestrator).
665///
666/// # Dispatch
667///
668/// - `Ref::Arxiv(_)` → [`ArxivSource::fetch`]; the source returns PDF
669///   bytes + Atom-feed metadata. The orchestrator writes both the PDF
670///   and the metadata TOML.
671/// - `Ref::Doi(_)` → Crossref metadata + Unpaywall license/OA-URL
672///   enrichment + (when the OA URL host is on the `oa-publisher`
673///   allowlist) a publisher PDF leg. A failure on the PDF leg is
674///   non-fatal: the metadata is still written and the orchestrator
675///   returns `Ok(...)` with `source` set to the metadata source.
676///
677/// # Side effects
678///
679/// Each consulted source emits one `LogEvent::Fetch` row via
680/// `ctx.log.append`. The orchestrator additionally emits one
681/// `LogEvent::StoreWrite` row on the successful write. Session bookend
682/// rows are the caller's responsibility (the CLI's
683/// `commands::fetch::run_with_options` wraps the call; the MCP server's
684/// `doiget_fetch_paper` tool method wraps it too).
685///
686/// # Errors
687///
688/// Returns [`FetchError`] from the underlying [`Source`] dispatch. The
689/// MCP boundary converts these to the closed [`crate::ErrorCode`] set
690/// via the existing `From<FetchError> for ErrorCode` impl.
691pub async fn fetch_paper(
692    ref_: &Ref,
693    profile: &CapabilityProfile,
694    ctx: &FetchContext,
695    store: &dyn Store,
696    store_root: &Utf8Path,
697) -> Result<FetchPaperOutcome, FetchError> {
698    let safekey = ref_.safekey();
699    match ref_ {
700        Ref::Arxiv(id) => {
701            fetch_paper_arxiv(id, ref_, profile, ctx, store, store_root, &safekey).await
702        }
703        Ref::Doi(doi) => {
704            fetch_paper_doi(doi, ref_, profile, ctx, store, store_root, &safekey).await
705        }
706    }
707}
708
709/// Build the dry-run preview ([`FetchPlan`]) for a single ref without
710/// touching the network, store, or provenance log. Thin re-export of
711/// [`crate::dry_run::build_fetch_plan`] under the slice-2 naming the
712/// MCP tool surfaces use; kept here so the MCP `doiget_fetch_paper`
713/// tool method does not have to reach across two modules.
714pub fn fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
715    build_fetch_plan(ref_, store_root)
716}
717
718/// Fallible sibling of [`fetch_paper_plan`] — propagates an internal
719/// allowlist-contract drift as a typed [`FetchError::SourceSchema`]
720/// instead of degrading to an empty `candidate_hosts` list (issue
721/// #156 ②). Thin re-export of [`crate::dry_run::try_build_fetch_plan`].
722/// Added alongside the infallible [`fetch_paper_plan`] rather than
723/// changing its signature, because `fetch_paper_plan` is `pub` and
724/// called from `doiget-mcp`, which is out of scope for this batch.
725///
726/// # Errors
727///
728/// See [`crate::dry_run::try_build_fetch_plan`].
729pub fn try_fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> Result<FetchPlan, FetchError> {
730    try_build_fetch_plan(ref_, store_root)
731}
732
733/// arXiv branch of [`fetch_paper`]. Internal — public callers go
734/// through `fetch_paper`.
735async fn fetch_paper_arxiv(
736    id: &ArxivId,
737    ref_: &Ref,
738    profile: &CapabilityProfile,
739    ctx: &FetchContext,
740    store: &dyn Store,
741    store_root: &Utf8Path,
742    safekey: &Safekey,
743) -> Result<FetchPaperOutcome, FetchError> {
744    let source = arxiv_source_from_env();
745    if !source.can_serve(profile, ref_) {
746        return Err(FetchError::NotEligible {
747            source_key: source.name().to_string(),
748        });
749    }
750
751    let FetchResult {
752        license,
753        pdf_bytes,
754        final_url,
755        ..
756    } = source.fetch(ref_, profile, ctx).await?;
757    let pdf = pdf_bytes.ok_or_else(|| FetchError::SourceSchema {
758        hint: "arxiv source returned no PDF bytes".to_string(),
759    })?;
760    let size_bytes = pdf.len() as u64;
761
762    // Phase 1 minimal metadata. Full Atom-feed extraction (title /
763    // authors) lives in `ArxivSource::fetch_metadata_only` and the
764    // metadata-only orchestrator; the fetch path keeps the placeholder
765    // for now (a follow-up slice may chain in Atom-parse here).
766    let metadata = Metadata {
767        schema_version: SCHEMA_VERSION.to_string(),
768        title: format!("arxiv:{}", id.as_str()),
769        authors: Vec::new(),
770        year: None,
771        doi: None,
772        arxiv_id: Some(id.clone()),
773        abstract_: None,
774        venue: None,
775        publisher: None,
776        issn: None,
777        isbn: None,
778        type_: None,
779        keywords: Vec::new(),
780        url: final_url.as_ref().map(|u| u.to_string()),
781        pdf_path: Some(format!("{}.pdf", safekey.as_str())),
782        doiget: Some(DoigetExtension {
783            fetched_at: Utc::now(),
784            source: "arxiv".to_string(),
785            license: license.clone(),
786            size_bytes,
787            mcp_call_id: None,
788        }),
789        other: BTreeMap::new(),
790    };
791
792    let tmp = stage_pdf_to_tempfile(&pdf)?;
793    let pdf_src = Utf8Path::from_path(tmp.path())
794        .ok_or_else(|| FetchError::SourceSchema {
795            hint: "staging tempfile path is not UTF-8".to_string(),
796        })?
797        .to_path_buf();
798    write_metadata_and_pdf(store, safekey, &metadata, Some(&pdf_src), ctx)?;
799    drop(tmp);
800
801    let path = store_root.join(format!("{}.pdf", safekey.as_str()));
802    let canonical_digest =
803        crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), "arxiv", None).digest_hex();
804    Ok(FetchPaperOutcome {
805        source: "arxiv".to_string(),
806        resolver_profile: "arxiv".to_string(),
807        license,
808        path,
809        size_bytes,
810        schema_version: SCHEMA_VERSION.to_string(),
811        // arXiv always delivers the PDF (or the whole fn already
812        // returned Err above) — there is no metadata-only fallback.
813        pdf_leg: PdfLegStatus::Fetched,
814        safekey: safekey.as_str().to_string(),
815        canonical_digest,
816    })
817}
818
819/// DOI branch of [`fetch_paper`] — Crossref + Unpaywall + (when allowed)
820/// OA-publisher PDF leg. Mirrors the CLI's `fetch_doi` implementation
821/// (`crates/doiget-cli/src/commands/fetch.rs`) — the CLI now delegates
822/// here so both surfaces share one source of truth.
823async fn fetch_paper_doi(
824    doi: &Doi,
825    ref_: &Ref,
826    profile: &CapabilityProfile,
827    ctx: &FetchContext,
828    store: &dyn Store,
829    store_root: &Utf8Path,
830    safekey: &Safekey,
831) -> Result<FetchPaperOutcome, FetchError> {
832    let contact = contact_email_from_env();
833    let unpaywall_contact = unpaywall_email_from_env(&contact);
834    let crossref = crossref_source_from_env(&contact);
835    // Issue #120: Crossref is NON-fatal. A transient Crossref failure
836    // must not abort the whole DOI fetch when Unpaywall alone can
837    // still deliver the OA PDF. We keep the error and only surface it
838    // if nothing usable comes back (see the both-failed guard below).
839    let (cross, crossref_err) = match crossref.fetch(ref_, profile, ctx).await {
840        Ok(r) => (Some(r), None),
841        Err(e) => {
842            tracing::warn!(
843                error = %e,
844                "crossref fetch failed; continuing with unpaywall-only metadata + OA leg"
845            );
846            (None, Some(e))
847        }
848    };
849    let crossref_meta = cross
850        .as_ref()
851        .and_then(|c| c.metadata_json.clone())
852        .unwrap_or(Value::Null);
853    let extracted = extract_crossref_fields(&crossref_meta);
854
855    // Unpaywall second — license enrichment + OA URL chain discovery.
856    // A failure here is non-fatal: we still write the Crossref-
857    // derived metadata.
858    let unpaywall = unpaywall_source_from_env(&unpaywall_contact);
859    let upw_result = unpaywall.fetch(ref_, profile, ctx).await;
860    let (license, source_label, oa_chain) = match upw_result {
861        Ok(r) => {
862            let chain = extract_oa_url_chain(r.metadata_json.as_ref());
863            let label = if r.license != "unknown" {
864                "unpaywall".to_string()
865            } else {
866                "crossref".to_string()
867            };
868            (r.license, label, chain)
869        }
870        Err(e) => {
871            // Unpaywall unreachable / errored. We continue with the
872            // Crossref-only metadata, but the resulting empty OA
873            // chain will be reported downstream as
874            // `PdfLegStatus::NoOaUrl` — semantically distinct from
875            // "Unpaywall confirmed no OA URL". The provenance log
876            // already carries an Unpaywall Fetch err row (the
877            // Unpaywall source impl logged its own attempt before
878            // returning), so the audit trail captures the cause; the
879            // tracing line below makes the orchestrator-level signal
880            // loud as well. Surfacing the distinction at the
881            // `PdfLegStatus` level (a new variant like
882            // `MetadataSourceUnavailable`) is a deliberate
883            // follow-up — see CHANGELOG `[0.4.0]` Notes.
884            tracing::warn!(
885                error = %e,
886                doi = %doi.as_str(),
887                "unpaywall fetch failed; OA chain will be empty (downstream PdfLegStatus::NoOaUrl \
888                 is conservative — Unpaywall was unreachable, not authoritatively oa-free)"
889            );
890            ("unknown".to_string(), "crossref".to_string(), Vec::new())
891        }
892    };
893
894    // OA PDF leg — ADR-0029 fetch chain. Walk the candidate URL list
895    // in order; first successful PDF wins, all-failed surfaces as
896    // `PdfLegStatus::Blocked` with the LAST attempt's error (the most
897    // informative for the operator — typically the network /
898    // allowlist reason the chain could not be exhausted). Each
899    // `try_fetch_oa_pdf` call already emits its own per-attempt
900    // provenance row (`oa-publisher` Fetch ok / err), so the audit
901    // trail captures every external request without orchestrator-
902    // side bookkeeping.
903    //
904    // Issue #118: a failure here is NEVER silently turned into a
905    // clean metadata-only success — the structured reason is carried
906    // out on `PdfLegStatus::Blocked`.
907    let (pdf_leg, pdf_bytes) = if oa_chain.is_empty() {
908        (PdfLegStatus::NoOaUrl, None)
909    } else {
910        let mut succeeded: Option<Vec<u8>> = None;
911        let mut last_err: Option<HttpError> = None;
912        let total = oa_chain.len();
913        for (idx, candidate) in oa_chain.iter().enumerate() {
914            let attempt = idx + 1;
915            tracing::debug!(
916                attempt,
917                total,
918                url = %candidate,
919                "trying OA PDF candidate (ADR-0029 chain)"
920            );
921            match try_fetch_oa_pdf(doi, candidate, ctx).await {
922                Ok((bytes, _final_url)) => {
923                    if attempt > 1 {
924                        tracing::info!(
925                            attempt,
926                            total,
927                            url = %candidate,
928                            "OA PDF chain succeeded on fallback candidate (ADR-0029)"
929                        );
930                    }
931                    succeeded = Some(bytes);
932                    break;
933                }
934                Err(e) => {
935                    tracing::warn!(
936                        attempt,
937                        total,
938                        url = %candidate,
939                        error = %e,
940                        "OA PDF candidate failed; advancing to next (ADR-0029 chain)"
941                    );
942                    last_err = Some(e);
943                }
944            }
945        }
946        match (succeeded, last_err) {
947            (Some(bytes), _) => (PdfLegStatus::Fetched, Some(bytes)),
948            (None, Some(e)) => {
949                let fe = FetchError::Http(e);
950                let denial: Option<crate::DenialContext> = (&fe).into();
951                let message = fe.to_string();
952                let code: crate::ErrorCode = fe.into();
953                (
954                    PdfLegStatus::Blocked {
955                        code,
956                        message,
957                        denial,
958                    },
959                    None,
960                )
961            }
962            // Defensive fallback. `oa_chain` is non-empty in this
963            // branch, so structurally at least one iteration must set
964            // either `succeeded` or `last_err`. If a future refactor
965            // breaks the invariant we fail CLOSED — surface a
966            // `Blocked` outcome with a self-describing message
967            // rather than `NoOaUrl` (which would falsely tell the
968            // caller no candidate URL was ever discovered). Routes
969            // to `INTERNAL_ERROR` so the CLI's exit-code mapping
970            // signals a doiget bug, not a remote failure.
971            (None, None) => {
972                tracing::error!(
973                    total = oa_chain.len(),
974                    "OA PDF chain walker exhausted without recording success or error \
975                     (defensive fallback — should be unreachable)"
976                );
977                (
978                    PdfLegStatus::Blocked {
979                        code: crate::ErrorCode::InternalError,
980                        message:
981                            "OA PDF chain walker exhausted without recording success or error \
982                             (orchestrator bug — please report)"
983                                .to_string(),
984                        denial: None,
985                    },
986                    None,
987                )
988            }
989        }
990    };
991
992    // Issue #120: Crossref is non-fatal, but if it failed AND the OA
993    // PDF leg produced nothing, writing a DOI-only stub entry would
994    // mask a total failure and violate the "explain why" promise.
995    // Surface the Crossref error so the caller reports a real reason.
996    if let Some(e) = crossref_err {
997        if pdf_bytes.is_none() {
998            return Err(e);
999        }
1000    }
1001
1002    let (final_source_label, size_bytes, pdf_path_relative, pdf_staged) = match &pdf_bytes {
1003        Some(bytes) => {
1004            let staged = stage_pdf_to_tempfile(bytes)?;
1005            (
1006                "oa-publisher".to_string(),
1007                bytes.len() as u64,
1008                Some(format!("{}.pdf", safekey.as_str())),
1009                Some(staged),
1010            )
1011        }
1012        None => (source_label, 0u64, None, None),
1013    };
1014
1015    let metadata = Metadata {
1016        schema_version: SCHEMA_VERSION.to_string(),
1017        title: extracted.title.unwrap_or_else(|| doi.as_str().to_string()),
1018        authors: extracted.authors,
1019        year: extracted.year,
1020        doi: Some(doi.clone()),
1021        arxiv_id: None,
1022        abstract_: None,
1023        venue: extracted.venue,
1024        publisher: None,
1025        issn: None,
1026        isbn: None,
1027        type_: extracted.type_,
1028        keywords: Vec::new(),
1029        url: cross
1030            .as_ref()
1031            .and_then(|c| c.final_url.as_ref())
1032            .map(|u| u.to_string()),
1033        pdf_path: pdf_path_relative,
1034        doiget: Some(DoigetExtension {
1035            fetched_at: Utc::now(),
1036            source: final_source_label.clone(),
1037            license: license.clone(),
1038            size_bytes,
1039            mcp_call_id: None,
1040        }),
1041        other: BTreeMap::new(),
1042    };
1043
1044    let pdf_src_path = pdf_staged
1045        .as_ref()
1046        .and_then(|tmp| Utf8Path::from_path(tmp.path()).map(|p| p.to_path_buf()));
1047    write_metadata_and_pdf(store, safekey, &metadata, pdf_src_path.as_deref(), ctx)?;
1048    drop(pdf_staged);
1049
1050    let path = if pdf_bytes.is_some() {
1051        store_root.join(format!("{}.pdf", safekey.as_str()))
1052    } else {
1053        store_root
1054            .join(".metadata")
1055            .join(format!("{}.toml", safekey.as_str()))
1056    };
1057    let canonical_digest = crate::CanonicalRef::new(
1058        crate::SourceType::Doi,
1059        doi.as_str(),
1060        &final_source_label,
1061        None,
1062    )
1063    .digest_hex();
1064    Ok(FetchPaperOutcome {
1065        source: final_source_label.clone(),
1066        resolver_profile: final_source_label,
1067        license,
1068        path,
1069        size_bytes,
1070        schema_version: SCHEMA_VERSION.to_string(),
1071        pdf_leg,
1072        safekey: safekey.as_str().to_string(),
1073        canonical_digest,
1074    })
1075}
1076
1077/// Stage PDF bytes to a tempfile so the existing `Store::write` atomic-
1078/// rename code path applies (the store takes a path, not bytes).
1079fn stage_pdf_to_tempfile(bytes: &[u8]) -> Result<tempfile::NamedTempFile, FetchError> {
1080    let tmp = tempfile::NamedTempFile::new().map_err(|e| FetchError::SourceSchema {
1081        hint: format!("creating PDF staging tempfile: {e}"),
1082    })?;
1083    std::fs::write(tmp.path(), bytes).map_err(|e| FetchError::SourceSchema {
1084        hint: format!("staging PDF bytes: {e}"),
1085    })?;
1086    Ok(tmp)
1087}
1088
1089/// Persist `metadata` (and optionally a PDF at `pdf_src`) through the
1090/// trait-object [`Store`] and emit a `StoreWrite` provenance row.
1091fn write_metadata_and_pdf(
1092    store: &dyn Store,
1093    safekey: &Safekey,
1094    metadata: &Metadata,
1095    pdf_src: Option<&Utf8Path>,
1096    ctx: &FetchContext,
1097) -> Result<(), FetchError> {
1098    let store_path_relative = if pdf_src.is_some() {
1099        format!("{}.pdf", safekey.as_str())
1100    } else {
1101        format!(".metadata/{}.toml", safekey.as_str())
1102    };
1103    let size_bytes = metadata.doiget.as_ref().map(|d| d.size_bytes).unwrap_or(0);
1104    let license = metadata.doiget.as_ref().map(|d| d.license.as_str());
1105    let source_name = metadata.doiget.as_ref().map(|d| d.source.as_str());
1106
1107    // ADR-0021 §1 canonical-digest for the StoreWrite row. The store
1108    // entry is keyed on the ref + the resolver that produced its
1109    // metadata (already captured in `metadata.doiget.source`). Build a
1110    // CanonicalRef from whichever id slot is populated.
1111    let canonical_digest: Option<String> = match (metadata.doi.as_ref(), metadata.arxiv_id.as_ref())
1112    {
1113        (Some(d), _) => source_name.map(|s| {
1114            crate::CanonicalRef::new(crate::SourceType::Doi, d.as_str(), s, None).digest_hex()
1115        }),
1116        (None, Some(a)) => source_name.map(|s| {
1117            crate::CanonicalRef::new(crate::SourceType::Arxiv, a.as_str(), s, None).digest_hex()
1118        }),
1119        (None, None) => None,
1120    };
1121
1122    match store.write(safekey, metadata, pdf_src) {
1123        Ok(()) => {
1124            ctx.log.append(RowInput {
1125                event: LogEvent::StoreWrite,
1126                result: LogResult::Ok,
1127                capability: Capability::Oa,
1128                ref_: metadata
1129                    .doi
1130                    .as_ref()
1131                    .map(|d| d.as_str())
1132                    .or_else(|| metadata.arxiv_id.as_ref().map(|a| a.as_str())),
1133                source: source_name,
1134                error_code: None,
1135                size_bytes: Some(size_bytes),
1136                license,
1137                store_path: Some(&store_path_relative),
1138                canonical_digest: canonical_digest.as_deref(),
1139            })?;
1140            Ok(())
1141        }
1142        Err(e) => {
1143            // Best-effort: record the StoreWrite failure before
1144            // propagating the store.write error. We do NOT
1145            // propagate the log-append error itself here — we're
1146            // already in an error state from the store, and the
1147            // primary failure is what the caller needs to act on.
1148            // But the log-append failure is observable via tracing
1149            // so an operator can spot a broken hash chain when
1150            // both fail. Surface as `SourceSchema` so the
1151            // FetchError -> ErrorCode collapse routes it to
1152            // `INTERNAL_ERROR` (closest closed-set fit; `StoreError`
1153            // does not have a direct closed-set arm).
1154            if let Err(log_err) = ctx.log.append(RowInput {
1155                event: LogEvent::StoreWrite,
1156                result: LogResult::Err,
1157                capability: Capability::Oa,
1158                ref_: metadata
1159                    .doi
1160                    .as_ref()
1161                    .map(|d| d.as_str())
1162                    .or_else(|| metadata.arxiv_id.as_ref().map(|a| a.as_str())),
1163                source: source_name,
1164                error_code: Some("STORE_ERROR"),
1165                size_bytes: None,
1166                license: None,
1167                store_path: Some(&store_path_relative),
1168                canonical_digest: canonical_digest.as_deref(),
1169            }) {
1170                tracing::error!(
1171                    store_err = %e,
1172                    log_err = %log_err,
1173                    "BOTH store.write AND provenance log append failed; \
1174                     audit trail is broken for this attempt"
1175                );
1176            }
1177            Err(FetchError::SourceSchema {
1178                hint: format!("store write failed: {e}"),
1179            })
1180        }
1181    }
1182}
1183
1184/// Attempt the OA PDF fetch under the `"oa-publisher"` source key.
1185async fn try_fetch_oa_pdf(
1186    doi: &Doi,
1187    url: &url::Url,
1188    ctx: &FetchContext,
1189) -> Result<(Vec<u8>, url::Url), HttpError> {
1190    const SOURCE: &str = "oa-publisher";
1191    let _permit = ctx.rate_limiter.acquire(SOURCE).await;
1192    // ADR-0021 §1: the oa-publisher PDF leg is a DISTINCT audit
1193    // identity from the Crossref/Unpaywall metadata legs even though
1194    // the ref is the same DOI — that's the whole point of carrying
1195    // `resolver_profile` into the digest. Compute once and re-use for
1196    // both the ok and err row variants below.
1197    let canonical =
1198        crate::CanonicalRef::new(crate::SourceType::Doi, doi.as_str(), SOURCE, None).digest_hex();
1199
1200    // Pre-fetch host allowlist check on the metadata-discovered OA URL
1201    // (issue #145; `docs/REDIRECT_ALLOWLIST.md` §1 — NORMATIVE). The
1202    // per-source `redirect_hosts` allowlist is, by §1, consulted "on the
1203    // OA URL discovered through metadata sources before the actual PDF
1204    // fetch is issued", not only on redirect hops. The redirect closure in
1205    // `crate::http` only fires when an *actual redirect* occurs; an OA URL
1206    // whose host is off the `oa-publisher` allowlist that resolves WITHOUT
1207    // a redirect would otherwise reach connect and be misclassified as a
1208    // transport error, violating §1. This is scoped strictly to the
1209    // `"oa-publisher"` PDF leg — §6 explicitly exempts the initial
1210    // template-constructed URL, and `fetch_bytes`/metadata-only/resolve-
1211    // only paths (which never follow the OA URL) are deliberately NOT
1212    // touched. On a host MISS we return the *same* `HttpError::RedirectDenied`
1213    // value the redirect closure produces (same `source_key`, lowercased
1214    // `host`, and `expected_hosts` snapshot), reusing the identical
1215    // allowlist the closure captured (queried via `source_allowlist`, not
1216    // re-derived) so the single source of truth cannot drift. Returning
1217    // that exact variant means the existing `Err(e)` arm below, the
1218    // `From<&HttpError> for Option<DenialContext>` mapping
1219    // (`DenialReason::RedirectNotInAllowlist`), the `PdfLegStatus::Blocked`
1220    // construction in the caller, and PR #162's CLI classification all see
1221    // a byte-identical downstream shape with no new code path.
1222    if let Some(allowlist) = ctx.http.source_allowlist(SOURCE) {
1223        // `Url::host_str()` is `None` for hostless URLs (e.g. `data:`);
1224        // treat that exactly as the redirect closure does (an allowlist
1225        // miss with an empty host string).
1226        let host = url
1227            .host_str()
1228            .map(|h| h.to_ascii_lowercase())
1229            .unwrap_or_default();
1230        if !allowlist.matches(&host) {
1231            let e = HttpError::RedirectDenied {
1232                source_key: SOURCE.to_string(),
1233                host: host.clone(),
1234                expected_hosts: allowlist.redirect_hosts.clone(),
1235            };
1236            tracing::info!(
1237                oa_url = %url,
1238                denied_host = %host,
1239                "OA URL host outside oa-publisher allowlist (pre-fetch check, \
1240                 docs/REDIRECT_ALLOWLIST.md §1 / issue #145)"
1241            );
1242            // Emit the SAME provenance row the post-fetch redirect-denied
1243            // path emits: a `Fetch` `Err` row under the `oa-publisher`
1244            // source key with the closed-set `NETWORK_ERROR` code and the
1245            // same canonical digest. Mirrors the `Err(e)` arm below so the
1246            // audit trail is indistinguishable from a redirect-time denial.
1247            let _ = ctx.log.append(RowInput {
1248                event: LogEvent::Fetch,
1249                result: LogResult::Err,
1250                capability: Capability::Oa,
1251                ref_: Some(doi.as_str()),
1252                source: Some(SOURCE),
1253                error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
1254                size_bytes: None,
1255                license: None,
1256                store_path: None,
1257                canonical_digest: Some(&canonical),
1258            });
1259            return Err(e);
1260        }
1261    }
1262
1263    match ctx.http.fetch_pdf(SOURCE, url.clone()).await {
1264        Ok((body, final_url)) => {
1265            let size_bytes = body.len() as u64;
1266            if let Err(e) = ctx.log.append(RowInput {
1267                event: LogEvent::Fetch,
1268                result: LogResult::Ok,
1269                capability: Capability::Oa,
1270                ref_: Some(doi.as_str()),
1271                source: Some(SOURCE),
1272                error_code: None,
1273                size_bytes: Some(size_bytes),
1274                license: None,
1275                store_path: None,
1276                canonical_digest: Some(&canonical),
1277            }) {
1278                tracing::warn!(error = %e, "appending oa-publisher Fetch ok row failed");
1279            }
1280            Ok((body.to_vec(), final_url))
1281        }
1282        Err(e) => {
1283            match &e {
1284                HttpError::RedirectDenied { host, .. } => {
1285                    tracing::info!(
1286                        oa_url = %url,
1287                        denied_host = %host,
1288                        "OA URL host outside oa-publisher allowlist"
1289                    );
1290                }
1291                HttpError::NotAPdf { .. } => {
1292                    tracing::info!(
1293                        oa_url = %url,
1294                        "OA URL did not return a PDF magic byte"
1295                    );
1296                }
1297                other => {
1298                    tracing::warn!(
1299                        oa_url = %url,
1300                        error = %other,
1301                        "OA PDF fetch failed"
1302                    );
1303                }
1304            }
1305            // Provenance `error_code` is the CLOSED-set code. Every
1306            // `HttpError` collapses to `NETWORK_ERROR` through the
1307            // canonical `From<FetchError> for ErrorCode` (the closed
1308            // set has no finer transport code by design) — so this is
1309            // the correct mapped value, not the misattribution the
1310            // previous hardcode implied. The *fine* reason
1311            // (RedirectDenied vs NotAPdf vs …) is preserved for the
1312            // user via `PdfLegStatus::Blocked.denial` / `.message`
1313            // built by the caller from the returned `HttpError`
1314            // (issue #118). Rendered via `ErrorCode::as_wire` so the
1315            // token can never drift from the enum.
1316            let _ = ctx.log.append(RowInput {
1317                event: LogEvent::Fetch,
1318                result: LogResult::Err,
1319                capability: Capability::Oa,
1320                ref_: Some(doi.as_str()),
1321                source: Some(SOURCE),
1322                error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
1323                size_bytes: None,
1324                license: None,
1325                store_path: None,
1326                canonical_digest: Some(&canonical),
1327            });
1328            Err(e)
1329        }
1330    }
1331}
1332
/// The handful of Crossref `message` fields that are copied into the
/// on-disk metadata. Every field is optional or may be empty.
struct CrossrefFields {
    /// First entry of the `title` array, when present.
    title: Option<String>,
    /// One display string per usable `author` entry; empty when none.
    authors: Vec<String>,
    /// Leading component of `issued.date-parts[0]`, when it fits `i32`.
    year: Option<i32>,
    /// First entry of the `container-title` array, when present.
    venue: Option<String>,
    /// The `type` string, when present.
    type_: Option<String>,
}
1341
1342/// Defensively pull bibliographic fields out of a Crossref envelope's
1343/// `message` object. Every field is optional; malformed shapes degrade
1344/// to `None` rather than panicking.
1345fn extract_crossref_fields(msg: &Value) -> CrossrefFields {
1346    let title = msg
1347        .get("title")
1348        .and_then(|v| v.as_array())
1349        .and_then(|arr| arr.first())
1350        .and_then(|v| v.as_str())
1351        .map(|s| s.to_string());
1352
1353    let authors = msg
1354        .get("author")
1355        .and_then(|v| v.as_array())
1356        .map(|arr| {
1357            arr.iter()
1358                .filter_map(|a| {
1359                    let family = a.get("family").and_then(|v| v.as_str());
1360                    let given = a.get("given").and_then(|v| v.as_str());
1361                    match (family, given) {
1362                        (Some(f), Some(g)) => Some(format!("{f}, {g}")),
1363                        (Some(f), None) => Some(f.to_string()),
1364                        (None, Some(g)) => Some(g.to_string()),
1365                        _ => None,
1366                    }
1367                })
1368                .collect()
1369        })
1370        .unwrap_or_default();
1371
1372    let year = msg
1373        .get("issued")
1374        .and_then(|v| v.get("date-parts"))
1375        .and_then(|v| v.as_array())
1376        .and_then(|arr| arr.first())
1377        .and_then(|v| v.as_array())
1378        .and_then(|arr| arr.first())
1379        .and_then(|v| v.as_i64())
1380        .and_then(|n| i32::try_from(n).ok());
1381
1382    let venue = msg
1383        .get("container-title")
1384        .and_then(|v| v.as_array())
1385        .and_then(|arr| arr.first())
1386        .and_then(|v| v.as_str())
1387        .map(|s| s.to_string());
1388
1389    let type_ = msg
1390        .get("type")
1391        .and_then(|v| v.as_str())
1392        .map(|s| s.to_string());
1393
1394    CrossrefFields {
1395        title,
1396        authors,
1397        year,
1398        venue,
1399        type_,
1400    }
1401}
1402
1403/// Pull the ordered chain of candidate OA URLs out of an Unpaywall
1404/// `metadata_json` envelope per ADR-0029 D2.
1405///
1406/// Order is `best_oa_location` first (when present), then every
1407/// distinct entry in `oa_locations[]`. Duplicate URLs are deduped by
1408/// exact string match so a candidate that appears as both the "best"
1409/// entry and an array element is fetched at most once.
1410///
1411/// Each location's URL is resolved via the same `url_for_pdf` →
1412/// `url` fallback the single-URL extractor uses.
1413///
1414/// Returns `Vec::new()` when no OA location was reported (the chain
1415/// is empty and the caller surfaces [`PdfLegStatus::NoOaUrl`]).
1416fn extract_oa_url_chain(meta: Option<&Value>) -> Vec<url::Url> {
1417    let meta = match meta {
1418        Some(m) => m,
1419        None => return Vec::new(),
1420    };
1421    let mut out: Vec<url::Url> = Vec::new();
1422    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
1423    let mut push_unique = |u: url::Url| {
1424        let key = u.as_str().to_string();
1425        if seen.insert(key) {
1426            out.push(u);
1427        }
1428    };
1429
1430    // Priority 1: best_oa_location (Unpaywall's own quality-ordered
1431    // pick — ADR-0029 D2 NORMATIVE: defer to the metadata source's
1432    // ordering).
1433    if let Some(best) = meta.get("best_oa_location") {
1434        if let Some(u) = pull_oa_url_from_location(best) {
1435            push_unique(u);
1436        }
1437    }
1438    // Priority 2: every entry in oa_locations[] after the best one.
1439    // The fallback target this ADR exists to enable is precisely the
1440    // arXiv preprint that lives here when `best_oa_location` is a
1441    // WAF-blocked publisher URL.
1442    if let Some(arr) = meta.get("oa_locations").and_then(|v| v.as_array()) {
1443        for loc in arr {
1444            if let Some(u) = pull_oa_url_from_location(loc) {
1445                push_unique(u);
1446            }
1447        }
1448    }
1449    out
1450}
1451
1452/// Resolve a single OA location object to a `url::Url`. Tries
1453/// `url_for_pdf` first (the direct PDF link Unpaywall annotates when
1454/// it knows one), falling back to `url` (the landing page). Returns
1455/// `None` if neither field is present or parses.
1456fn pull_oa_url_from_location(loc: &Value) -> Option<url::Url> {
1457    let candidate = loc
1458        .get("url_for_pdf")
1459        .and_then(|v| v.as_str())
1460        .or_else(|| loc.get("url").and_then(|v| v.as_str()))?;
1461    url::Url::parse(candidate).ok()
1462}
1463
/// Return the `DOIGET_UNPAYWALL_EMAIL` environment variable, or
/// `fallback_contact` when the variable cannot be used.
///
/// The fallback is taken when the variable is unset, is not valid
/// Unicode, or is empty / whitespace-only (e.g. an `export
/// DOIGET_UNPAYWALL_EMAIL=` line left blank) — a blank value would
/// otherwise be handed on as an empty contact address. A non-blank value
/// is returned exactly as set.
fn unpaywall_email_from_env(fallback_contact: &str) -> String {
    match std::env::var("DOIGET_UNPAYWALL_EMAIL") {
        Ok(email) if !email.trim().is_empty() => email,
        _ => fallback_contact.to_string(),
    }
}
1467
1468// ---------------------------------------------------------------------------
1469// batch_fetch — multi-ref orchestrator (Slice 2)
1470// ---------------------------------------------------------------------------
1471
1472/// Per-ref outcome carried inside [`BatchOutcome::results`].
1473///
1474/// Each entry's `outcome` is independent — a single `Err(...)` does not
1475/// abort sibling refs. The MCP `doiget_batch_fetch` tool method
1476/// serializes the success-or-error per row inside `results[]`.
1477#[derive(Debug)]
1478pub struct BatchResultEntry {
1479    /// The parsed ref this entry describes.
1480    pub ref_: Ref,
1481    /// `Ok(...)` on a successful fetch through [`fetch_paper`];
1482    /// `Err(...)` on a per-ref failure (the outer call still returned
1483    /// `Ok(BatchOutcome)`).
1484    pub outcome: Result<FetchPaperOutcome, FetchError>,
1485}
1486
1487/// Outcome of a successful [`batch_fetch`] call.
1488///
1489/// The outer call returns `Err(_)` only on whole-call failures (the
1490/// only such variant in Slice 2 is [`FetchError::TooManyRefs`]). Each
1491/// per-ref result lives inside `results[]` so the agent can see every
1492/// outcome without losing sibling successes.
1493#[derive(Debug)]
1494#[non_exhaustive]
1495pub struct BatchOutcome {
1496    /// One entry per supplied ref, in input order.
1497    pub results: Vec<BatchResultEntry>,
1498}
1499
1500/// Iterate over `refs` through [`fetch_paper`], collecting one
1501/// [`BatchResultEntry`] per ref.
1502///
1503/// **Cap**: caller must supply at most [`MAX_BATCH_REFS`] refs; otherwise
1504/// the function returns `Err(FetchError::TooManyRefs { got, max })`
1505/// before any fetch is attempted. The cap mirrors the CLI's
1506/// `commands::batch` enforcement (`MCP_BATCH_MAX_SIZE`).
1507///
1508/// **Concurrency**: Slice 2 dispatches refs serially through
1509/// [`fetch_paper`]. The CLI's existing `commands::batch::run_with_options`
1510/// keeps its bounded-concurrency `JoinSet`+semaphore path for backward
1511/// compatibility; the MCP server uses this serial loop because the MCP
1512/// tool boundary already serializes calls per session.
1513///
1514/// **Session bookkeeping**: this function does NOT emit `SessionStart`
1515/// / `SessionEnd` rows — that is the caller's responsibility.
1516pub async fn batch_fetch(
1517    refs: &[Ref],
1518    profile: &CapabilityProfile,
1519    ctx: &FetchContext,
1520    store: &dyn Store,
1521    store_root: &Utf8Path,
1522) -> Result<BatchOutcome, FetchError> {
1523    if refs.len() > MAX_BATCH_REFS {
1524        return Err(FetchError::TooManyRefs {
1525            got: refs.len(),
1526            max: MAX_BATCH_REFS,
1527        });
1528    }
1529    let mut results = Vec::with_capacity(refs.len());
1530    for ref_ in refs {
1531        let outcome = fetch_paper(ref_, profile, ctx, store, store_root).await;
1532        results.push(BatchResultEntry {
1533            ref_: ref_.clone(),
1534            outcome,
1535        });
1536    }
1537    Ok(BatchOutcome { results })
1538}
1539
1540/// Dry-run preview for a batch — one [`FetchPlan`] per ref. Enforces
1541/// the same [`MAX_BATCH_REFS`] cap [`batch_fetch`] does.
1542///
1543/// Returns `Err(FetchError::TooManyRefs)` when over the cap, or
1544/// `Err(FetchError::SourceSchema)` if the dry-run allowlist invariant
1545/// has drifted (issue #156 ②: this now propagates as a typed error via
1546/// [`try_build_fetch_plan`] rather than silently emitting an empty
1547/// `candidate_hosts` list — the signature already returned `Result`, so
1548/// this is an in-crate behavior tightening with no caller-visible type
1549/// change). Otherwise `Ok(Vec<(Ref, FetchPlan)>)` parallel to the input
1550/// order.
1551pub fn batch_fetch_plans(
1552    refs: &[Ref],
1553    store_root: &Utf8Path,
1554) -> Result<Vec<(Ref, FetchPlan)>, FetchError> {
1555    if refs.len() > MAX_BATCH_REFS {
1556        return Err(FetchError::TooManyRefs {
1557            got: refs.len(),
1558            max: MAX_BATCH_REFS,
1559        });
1560    }
1561    refs.iter()
1562        .map(|r| try_build_fetch_plan(r, store_root).map(|p| (r.clone(), p)))
1563        .collect()
1564}
1565
1566// ---------------------------------------------------------------------------
1567// Tests
1568// ---------------------------------------------------------------------------
1569
1570#[cfg(test)]
1571#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
1572mod tests {
1573    use super::*;
1574
1575    #[test]
1576    fn extract_crossref_oa_url_finds_first_url() {
1577        let msg = serde_json::json!({
1578            "link": [
1579                {"URL": "https://example.org/free.pdf"},
1580                {"URL": "https://example.org/alt.pdf"}
1581            ]
1582        });
1583        assert_eq!(
1584            extract_crossref_oa_url(&msg),
1585            Some("https://example.org/free.pdf".to_string())
1586        );
1587    }
1588
1589    #[test]
1590    fn extract_crossref_oa_url_returns_none_when_absent() {
1591        let msg = serde_json::json!({});
1592        assert!(extract_crossref_oa_url(&msg).is_none());
1593    }
1594
1595    #[test]
1596    fn extract_crossref_oa_url_skips_empty_url_strings() {
1597        let msg = serde_json::json!({
1598            "link": [
1599                {"URL": ""},
1600                {"URL": "https://example.org/real.pdf"}
1601            ]
1602        });
1603        assert_eq!(
1604            extract_crossref_oa_url(&msg),
1605            Some("https://example.org/real.pdf".to_string())
1606        );
1607    }
1608
1609    #[test]
1610    fn extract_unpaywall_oa_url_prefers_url_for_pdf() {
1611        let meta = serde_json::json!({
1612            "best_oa_location": {
1613                "url_for_pdf": "https://example.org/pdf",
1614                "url": "https://example.org/landing"
1615            }
1616        });
1617        assert_eq!(
1618            extract_unpaywall_oa_url(&meta),
1619            Some("https://example.org/pdf".to_string())
1620        );
1621    }
1622
1623    #[test]
1624    fn extract_unpaywall_oa_url_falls_back_to_url() {
1625        let meta = serde_json::json!({
1626            "best_oa_location": {
1627                "url": "https://example.org/landing"
1628            }
1629        });
1630        assert_eq!(
1631            extract_unpaywall_oa_url(&meta),
1632            Some("https://example.org/landing".to_string())
1633        );
1634    }
1635
1636    #[test]
1637    fn extract_unpaywall_oa_url_returns_none_when_absent() {
1638        let meta = serde_json::json!({});
1639        assert!(extract_unpaywall_oa_url(&meta).is_none());
1640    }
1641
1642    // ---------------------------------------------------------------
1643    // Slice 2: fetch_paper / batch_fetch coverage. The wiremock-driven
1644    // happy-path tests live in `crates/doiget-mcp/tests/...` (they need
1645    // a real `Store` impl and an HTTP client wired to `FetchContext`,
1646    // both of which the MCP integration tests already stand up). The
1647    // unit tests here pin the pure-function pieces (extractors, cap
1648    // enforcement, plan-shape preservation).
1649    // ---------------------------------------------------------------
1650
1651    #[test]
1652    fn extract_crossref_fields_parses_minimal_shape() {
1653        let msg = serde_json::json!({
1654            "title": ["Example Title"],
1655            "author": [{ "family": "Smith", "given": "Alice" }],
1656            "issued": { "date-parts": [[2024, 1, 15]] },
1657            "container-title": ["Phys. Rev. X"],
1658            "type": "journal-article"
1659        });
1660        let f = extract_crossref_fields(&msg);
1661        assert_eq!(f.title.as_deref(), Some("Example Title"));
1662        assert_eq!(f.authors, vec!["Smith, Alice".to_string()]);
1663        assert_eq!(f.year, Some(2024));
1664        assert_eq!(f.venue.as_deref(), Some("Phys. Rev. X"));
1665        assert_eq!(f.type_.as_deref(), Some("journal-article"));
1666    }
1667
1668    #[test]
1669    fn extract_crossref_fields_tolerates_missing() {
1670        let f = extract_crossref_fields(&serde_json::json!({}));
1671        assert!(f.title.is_none());
1672        assert!(f.authors.is_empty());
1673        assert!(f.year.is_none());
1674        assert!(f.venue.is_none());
1675        assert!(f.type_.is_none());
1676    }
1677
1678    #[test]
1679    fn extract_oa_url_chain_prefers_best_url_for_pdf() {
1680        // `best_oa_location.url_for_pdf` is the highest-priority
1681        // candidate (ADR-0029 D2 — defer to the metadata source's
1682        // ordering). Falls back to `best_oa_location.url` only when
1683        // no PDF link is annotated.
1684        let meta = serde_json::json!({
1685            "best_oa_location": {
1686                "url_for_pdf": "https://example.org/pdf",
1687                "url": "https://example.org/landing"
1688            }
1689        });
1690        let chain = extract_oa_url_chain(Some(&meta));
1691        assert_eq!(chain.len(), 1);
1692        assert_eq!(chain[0].as_str(), "https://example.org/pdf");
1693    }
1694
1695    #[test]
1696    fn extract_oa_url_chain_falls_back_to_url_when_url_for_pdf_absent() {
1697        let meta = serde_json::json!({
1698            "best_oa_location": {
1699                "url": "https://example.org/landing"
1700            }
1701        });
1702        let chain = extract_oa_url_chain(Some(&meta));
1703        assert_eq!(chain.len(), 1);
1704        assert_eq!(chain[0].as_str(), "https://example.org/landing");
1705    }
1706
1707    #[test]
1708    fn extract_oa_url_chain_is_empty_when_no_locations() {
1709        let meta = serde_json::json!({});
1710        assert!(extract_oa_url_chain(Some(&meta)).is_empty());
1711        assert!(extract_oa_url_chain(None).is_empty());
1712    }
1713
1714    #[test]
1715    fn extract_oa_url_chain_appends_oa_locations_after_best() {
1716        // ADR-0029 D2: best_oa_location first, then the rest of
1717        // oa_locations in metadata-source order. This is the load-
1718        // bearing test: it pins the fact that an arXiv preprint
1719        // listed *after* a WAF-blocked publisher in oa_locations[]
1720        // becomes a fallback candidate the chain walker can reach.
1721        let meta = serde_json::json!({
1722            "best_oa_location": {
1723                "url_for_pdf": "https://publisher.example.org/pdf"
1724            },
1725            "oa_locations": [
1726                {"url_for_pdf": "https://publisher.example.org/pdf"},
1727                {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"},
1728                {"url": "https://repo.example.edu/handle/123"}
1729            ]
1730        });
1731        let chain = extract_oa_url_chain(Some(&meta));
1732        let strs: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();
1733        assert_eq!(
1734            strs,
1735            vec![
1736                "https://publisher.example.org/pdf",
1737                "https://arxiv.org/pdf/2401.12345",
1738                "https://repo.example.edu/handle/123",
1739            ],
1740            "chain ordering MUST be best_oa_location first, oa_locations[] verbatim after"
1741        );
1742    }
1743
1744    #[test]
1745    fn extract_oa_url_chain_dedupes_repeated_urls() {
1746        // A URL that appears as both `best_oa_location` and an entry
1747        // in `oa_locations[]` is fetched at most once. Without this,
1748        // a publisher whose record has the same URL in both slots
1749        // would consume two HTTP requests + two rate-limit ticks.
1750        let meta = serde_json::json!({
1751            "best_oa_location": {
1752                "url_for_pdf": "https://example.org/pdf"
1753            },
1754            "oa_locations": [
1755                {"url_for_pdf": "https://example.org/pdf"},
1756                {"url_for_pdf": "https://example.org/pdf"},
1757                {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"}
1758            ]
1759        });
1760        let chain = extract_oa_url_chain(Some(&meta));
1761        assert_eq!(chain.len(), 2);
1762        assert_eq!(chain[0].as_str(), "https://example.org/pdf");
1763        assert_eq!(chain[1].as_str(), "https://arxiv.org/pdf/2401.12345");
1764    }
1765
1766    #[test]
1767    fn extract_oa_url_chain_skips_unparsable_urls() {
1768        // A malformed URL in oa_locations[] is dropped silently
1769        // rather than aborting the chain — the metadata source can
1770        // emit a stray entry without poisoning the whole fetch.
1771        let meta = serde_json::json!({
1772            "best_oa_location": {
1773                "url_for_pdf": "https://good.example.org/pdf"
1774            },
1775            "oa_locations": [
1776                {"url_for_pdf": "not a url"},
1777                {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"}
1778            ]
1779        });
1780        let chain = extract_oa_url_chain(Some(&meta));
1781        assert_eq!(chain.len(), 2);
1782        assert_eq!(chain[0].as_str(), "https://good.example.org/pdf");
1783        assert_eq!(chain[1].as_str(), "https://arxiv.org/pdf/2401.12345");
1784    }
1785
1786    #[test]
1787    fn fetch_paper_plan_matches_build_fetch_plan() {
1788        // The slice-2-named alias is a thin pass-through to
1789        // `dry_run::build_fetch_plan`. Pin behavioral equivalence so
1790        // a future refactor that diverges them surfaces here.
1791        use crate::{ArxivId, Doi};
1792        let r = Ref::Doi(Doi("10.1234/example".to_string()));
1793        let root = Utf8PathBuf::from("/tmp/doiget-test");
1794        let plan_a = fetch_paper_plan(&r, &root);
1795        let plan_b = build_fetch_plan(&r, &root);
1796        assert_eq!(plan_a.metadata_sources, plan_b.metadata_sources);
1797        assert_eq!(plan_a.target_pdf_path, plan_b.target_pdf_path);
1798        assert_eq!(plan_a.target_metadata_path, plan_b.target_metadata_path);
1799
1800        let r2 = Ref::Arxiv(ArxivId("2401.12345".to_string()));
1801        let plan_c = fetch_paper_plan(&r2, &root);
1802        let plan_d = build_fetch_plan(&r2, &root);
1803        assert_eq!(plan_c.pdf_sources[0].key, plan_d.pdf_sources[0].key);
1804    }
1805
1806    #[test]
1807    fn batch_fetch_plans_returns_plan_per_ref_in_order() {
1808        use crate::{ArxivId, Doi};
1809        let refs = vec![
1810            Ref::Doi(Doi("10.1234/alpha".to_string())),
1811            Ref::Arxiv(ArxivId("2401.12345".to_string())),
1812        ];
1813        let root = Utf8PathBuf::from("/tmp/doiget-batch-test");
1814        let plans = batch_fetch_plans(&refs, &root).expect("under cap returns Ok");
1815        assert_eq!(plans.len(), 2);
1816        // Order preserved.
1817        assert!(matches!(plans[0].0, Ref::Doi(_)));
1818        assert!(matches!(plans[1].0, Ref::Arxiv(_)));
1819        // DOI plan carries the crossref + unpaywall metadata sources.
1820        assert_eq!(plans[0].1.metadata_sources, vec!["crossref", "unpaywall"]);
1821        // arXiv plan has the arxiv PDF source key.
1822        assert_eq!(plans[1].1.pdf_sources[0].key, "arxiv");
1823    }
1824
1825    #[test]
1826    fn batch_fetch_plans_too_many_refs_returns_err() {
1827        use crate::Doi;
1828        // Build MAX_BATCH_REFS + 1 entries — boundary case.
1829        let n = MAX_BATCH_REFS + 1;
1830        let refs: Vec<Ref> = (0..n)
1831            .map(|i| Ref::Doi(Doi(format!("10.1234/n{}", i))))
1832            .collect();
1833        let root = Utf8PathBuf::from("/tmp/doiget-toomany");
1834        let err = batch_fetch_plans(&refs, &root).expect_err("over cap returns Err");
1835        match err {
1836            FetchError::TooManyRefs { got, max } => {
1837                assert_eq!(got, n);
1838                assert_eq!(max, MAX_BATCH_REFS);
1839            }
1840            other => panic!("expected TooManyRefs, got: {other:?}"),
1841        }
1842    }
1843
1844    #[tokio::test]
1845    async fn batch_fetch_too_many_refs_returns_err_before_any_fetch() {
1846        // The cap is enforced before any per-ref work, so we don't need
1847        // a working store/network here — pass a sentinel store_root and
1848        // a dummy FetchContext that would panic on use.
1849        use crate::http::{tier_1_allowlist, HttpClient};
1850        use crate::provenance::ProvenanceLog;
1851        use crate::rate_limiter::RateLimiter;
1852        use crate::store::FsStore;
1853        use crate::{Doi, RateLimits};
1854        use std::sync::Arc;
1855
1856        let td = tempfile::TempDir::new().expect("tempdir");
1857        let log_path = Utf8Path::from_path(td.path())
1858            .expect("utf-8")
1859            .join("log.jsonl");
1860        let store_root = Utf8Path::from_path(td.path())
1861            .expect("utf-8")
1862            .join("papers");
1863
1864        let ctx = FetchContext {
1865            http: Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client")),
1866            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
1867            log: Arc::new(
1868                ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
1869                    .expect("provenance log"),
1870            ),
1871            session_id: "01J0000000000000000000TEST".into(),
1872        };
1873        let profile = CapabilityProfile::from_env().expect("clean env");
1874        let store = FsStore::new(store_root.clone()).expect("fs store");
1875
1876        let n = MAX_BATCH_REFS + 1;
1877        let refs: Vec<Ref> = (0..n)
1878            .map(|i| Ref::Doi(Doi(format!("10.1234/n{}", i))))
1879            .collect();
1880
1881        let err = batch_fetch(&refs, &profile, &ctx, &store, &store_root)
1882            .await
1883            .expect_err("over cap returns Err");
1884        match err {
1885            FetchError::TooManyRefs { got, max } => {
1886                assert_eq!(got, n);
1887                assert_eq!(max, MAX_BATCH_REFS);
1888            }
1889            other => panic!("expected TooManyRefs, got: {other:?}"),
1890        }
1891    }
1892
1893    // Issue #118: a non-PDF OA body must surface as `Err(HttpError)`
1894    // from `try_fetch_oa_pdf` (previously silently flattened to
1895    // `None`, which `fetch_paper_doi` then reported as a clean
1896    // metadata-only success). The compiler-checked `Err(e) =>
1897    // PdfLegStatus::Blocked` arm in `fetch_paper_doi` does the rest.
1898    #[tokio::test]
1899    async fn try_fetch_oa_pdf_non_pdf_body_is_err_not_silent_none() {
1900        use crate::http::HttpClient;
1901        use crate::provenance::ProvenanceLog;
1902        use crate::rate_limiter::RateLimiter;
1903        use crate::{Doi, RateLimits};
1904        use std::sync::Arc;
1905        use wiremock::matchers::method;
1906        use wiremock::{Mock, MockServer, ResponseTemplate};
1907
1908        let server = MockServer::start().await;
1909        Mock::given(method("GET"))
1910            .respond_with(
1911                ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
1912            )
1913            .mount(&server)
1914            .await;
1915        let host = server
1916            .uri()
1917            .parse::<url::Url>()
1918            .expect("uri")
1919            .host_str()
1920            .expect("host")
1921            .to_string();
1922
1923        let td = tempfile::TempDir::new().expect("tempdir");
1924        let log_path = Utf8Path::from_path(td.path())
1925            .expect("utf-8")
1926            .join("log.jsonl");
1927        let ctx = FetchContext {
1928            http: Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host)),
1929            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
1930            log: Arc::new(
1931                ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
1932                    .expect("provenance log"),
1933            ),
1934            session_id: "01J0000000000000000000TEST".into(),
1935        };
1936
1937        let doi = Doi("10.1234/example".to_string());
1938        let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
1939        let res = try_fetch_oa_pdf(&doi, &url, &ctx).await;
1940        match res {
1941            Err(HttpError::NotAPdf { .. }) => {}
1942            other => panic!("expected Err(NotAPdf), got: {other:?}"),
1943        }
1944    }
1945
1946    // Issue #145 / `docs/REDIRECT_ALLOWLIST.md` §1: the `oa-publisher`
1947    // host allowlist MUST be consulted on the metadata-discovered OA URL
1948    // *before the actual PDF fetch is issued*, not only on redirect hops.
1949    // An OA URL whose host is OFF the allowlist and that resolves WITHOUT
1950    // a redirect previously slipped past the redirect closure entirely and
1951    // was misclassified as a transport error. This test pins the fix: the
1952    // pre-fetch check rejects it with the SAME `HttpError::RedirectDenied`
1953    // the redirect closure produces, the OA fetch is NEVER issued (the
1954    // wiremock origin records ZERO requests, proving no PDF bytes were
1955    // requested / written), and the provenance trail is the byte-identical
1956    // `Fetch`/`err`/`oa-publisher`/`NETWORK_ERROR` row the redirect-denied
1957    // path emits.
1958    #[tokio::test]
1959    async fn try_fetch_oa_pdf_off_allowlist_host_no_redirect_is_redirect_denied_145() {
1960        use crate::http::HttpClient;
1961        use crate::provenance::ProvenanceLog;
1962        use crate::rate_limiter::RateLimiter;
1963        use crate::{DenialContext, DenialReason, Doi, RateLimits};
1964        use std::sync::Arc;
1965        use wiremock::matchers::method;
1966        use wiremock::{Mock, MockServer, ResponseTemplate};
1967
1968        // The wiremock origin would serve a valid PDF with NO redirect —
1969        // if the pre-check were absent the fetch would *succeed* against
1970        // an off-allowlist host, which is exactly the §1 violation.
1971        let server = MockServer::start().await;
1972        Mock::given(method("GET"))
1973            .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7 real pdf".to_vec()))
1974            .mount(&server)
1975            .await;
1976
1977        // Register a DIFFERENT host as the `oa-publisher` allowlist so the
1978        // wiremock origin (127.0.0.1) is OFF it. `evil.example.com` is a
1979        // valid host string the allowlist will not match.
1980        let td = tempfile::TempDir::new().expect("tempdir");
1981        let log_path = Utf8Path::from_path(td.path())
1982            .expect("utf-8")
1983            .join("log.jsonl");
1984        let ctx = FetchContext {
1985            http: Arc::new(HttpClient::new_for_tests_allow_http(
1986                "oa-publisher",
1987                "allowed-publisher.example.com",
1988            )),
1989            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
1990            log: Arc::new(
1991                ProvenanceLog::open(log_path.clone(), "01J0000000000000000000TEST".into())
1992                    .expect("provenance log"),
1993            ),
1994            session_id: "01J0000000000000000000TEST".into(),
1995        };
1996
1997        let doi = Doi("10.1234/example".to_string());
1998        // The OA URL Unpaywall handed back resolves to the wiremock host,
1999        // which is OFF the `oa-publisher` allowlist.
2000        let off_host_url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2001        let res = try_fetch_oa_pdf(&doi, &off_host_url, &ctx).await;
2002
2003        // 1. Same error variant the redirect closure produces.
2004        let err = match res {
2005            Err(e @ HttpError::RedirectDenied { .. }) => e,
2006            other => {
2007                panic!("expected Err(RedirectDenied) from the pre-fetch check, got: {other:?}")
2008            }
2009        };
2010        match &err {
2011            HttpError::RedirectDenied {
2012                source_key,
2013                host,
2014                expected_hosts,
2015            } => {
2016                assert_eq!(source_key, "oa-publisher");
2017                // The host is lowercased, exactly as the redirect closure
2018                // would record it.
2019                assert_eq!(
2020                    host,
2021                    off_host_url
2022                        .host_str()
2023                        .expect("wiremock host")
2024                        .to_ascii_lowercase()
2025                        .as_str()
2026                );
2027                assert_eq!(
2028                    expected_hosts,
2029                    &vec!["allowed-publisher.example.com".to_string()]
2030                );
2031            }
2032            _ => unreachable!(),
2033        }
2034
2035        // 2. The OA fetch was NEVER issued — the wiremock origin saw zero
2036        //    requests, so no PDF bytes were requested or written.
2037        assert!(
2038            server
2039                .received_requests()
2040                .await
2041                .unwrap_or_default()
2042                .is_empty(),
2043            "the off-allowlist OA URL must NOT be fetched: the pre-check \
2044             (REDIRECT_ALLOWLIST.md §1) rejects it before any request is \
2045             issued; wiremock recorded request(s)",
2046        );
2047
2048        // 3. The structured denial side-channel is byte-identical to the
2049        //    redirect-closure path: `RedirectNotInAllowlist`, source key,
2050        //    attempted host, expected allowlist snapshot.
2051        let dc: Option<DenialContext> = (&err).into();
2052        let dc = dc.expect("pre-fetch RedirectDenied -> Some(DenialContext)");
2053        assert_eq!(dc.reason, DenialReason::RedirectNotInAllowlist);
2054        assert_eq!(dc.source.as_deref(), Some("oa-publisher"));
2055        assert_eq!(
2056            dc.attempted,
2057            Some(off_host_url.host_str().expect("host").to_ascii_lowercase()),
2058            "attempted host must be the rejected OA URL host, lowercased — \
2059             identical to what the redirect closure records",
2060        );
2061        assert_eq!(
2062            dc.expected,
2063            Some(vec!["allowed-publisher.example.com".to_string()]),
2064        );
2065
2066        // 4. Provenance: exactly the `Fetch`/`err`/`oa-publisher`/
2067        //    `NETWORK_ERROR` row the post-fetch redirect-denied arm emits
2068        //    (same row kind + source key + closed-set code).
2069        let log_txt = std::fs::read_to_string(&log_path).expect("read provenance log");
2070        let fetch_err_row = log_txt
2071            .lines()
2072            .filter_map(|l| serde_json::from_str::<serde_json::Value>(l).ok())
2073            .find(|v| {
2074                v.get("event").and_then(|e| e.as_str()) == Some("fetch")
2075                    && v.get("result").and_then(|r| r.as_str()) == Some("err")
2076            })
2077            .expect("a Fetch/err provenance row was written");
2078        assert_eq!(
2079            fetch_err_row.get("source").and_then(|s| s.as_str()),
2080            Some("oa-publisher"),
2081        );
2082        assert_eq!(
2083            fetch_err_row.get("error_code").and_then(|c| c.as_str()),
2084            Some("NETWORK_ERROR"),
2085        );
2086        assert_eq!(
2087            fetch_err_row.get("ref").and_then(|r| r.as_str()),
2088            Some("10.1234/example"),
2089        );
2090    }
2091
2092    // Issue #145 positive / no-regression: an ON-allowlist OA URL still
2093    // fetches the PDF normally. The pre-fetch check must be a pure gate —
2094    // it must not perturb the happy path.
2095    #[tokio::test]
2096    async fn try_fetch_oa_pdf_on_allowlist_host_still_fetches_pdf_no_regression_145() {
2097        use crate::http::HttpClient;
2098        use crate::provenance::ProvenanceLog;
2099        use crate::rate_limiter::RateLimiter;
2100        use crate::{Doi, RateLimits};
2101        use std::sync::Arc;
2102        use wiremock::matchers::method;
2103        use wiremock::{Mock, MockServer, ResponseTemplate};
2104
2105        let server = MockServer::start().await;
2106        let body = b"%PDF-1.7\nhello pdf".to_vec();
2107        Mock::given(method("GET"))
2108            .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
2109            .mount(&server)
2110            .await;
2111        // The wiremock host IS the registered `oa-publisher` allowlist, so
2112        // the pre-check passes and the fetch proceeds as before.
2113        let host = server
2114            .uri()
2115            .parse::<url::Url>()
2116            .expect("uri")
2117            .host_str()
2118            .expect("host")
2119            .to_string();
2120
2121        let td = tempfile::TempDir::new().expect("tempdir");
2122        let log_path = Utf8Path::from_path(td.path())
2123            .expect("utf-8")
2124            .join("log.jsonl");
2125        let ctx = FetchContext {
2126            http: Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host)),
2127            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2128            log: Arc::new(
2129                ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2130                    .expect("provenance log"),
2131            ),
2132            session_id: "01J0000000000000000000TEST".into(),
2133        };
2134
2135        let doi = Doi("10.1234/example".to_string());
2136        let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
2137        let (bytes, _final_url) = try_fetch_oa_pdf(&doi, &url, &ctx)
2138            .await
2139            .expect("on-allowlist OA URL still fetches the PDF");
2140        assert_eq!(bytes, body, "PDF bytes must be returned unchanged");
2141    }
2142
2143    // Issue #145: the pre-fetch denial and the redirect-closure denial
2144    // MUST produce a byte-identical `DenialContext` so PR #162's CLI
2145    // classification (CAPABILITY_DENIED / exit 3) handles both unchanged.
2146    // This pins the equivalence at the value level: the same source key +
2147    // host + allowlist snapshot map through the SAME
2148    // `From<&HttpError> for Option<DenialContext>` impl to equal structs.
2149    #[test]
2150    fn pre_fetch_denial_produces_byte_identical_denial_context_as_redirect_denied_145() {
2151        use crate::{DenialContext, DenialReason};
2152
2153        // Shape produced by the pre-fetch check in `try_fetch_oa_pdf`.
2154        let pre_fetch = HttpError::RedirectDenied {
2155            source_key: "oa-publisher".to_string(),
2156            host: "attacker.test".to_string(),
2157            expected_hosts: vec!["*.springer.com".to_string(), "*.plos.org".to_string()],
2158        };
2159        // Shape produced by the redirect closure in `crate::http` for the
2160        // identical inputs.
2161        let redirect_closure = HttpError::RedirectDenied {
2162            source_key: "oa-publisher".to_string(),
2163            host: "attacker.test".to_string(),
2164            expected_hosts: vec!["*.springer.com".to_string(), "*.plos.org".to_string()],
2165        };
2166
2167        let dc_pre: Option<DenialContext> = (&pre_fetch).into();
2168        let dc_red: Option<DenialContext> = (&redirect_closure).into();
2169        let dc_pre = dc_pre.expect("pre-fetch -> Some");
2170        let dc_red = dc_red.expect("redirect -> Some");
2171
2172        // Byte-identical: same reason, same source, same attempted host,
2173        // same expected snapshot, all auxiliary channels None.
2174        assert_eq!(dc_pre, dc_red);
2175        assert_eq!(dc_pre.reason, DenialReason::RedirectNotInAllowlist);
2176        assert_eq!(dc_pre.source.as_deref(), Some("oa-publisher"));
2177        assert_eq!(dc_pre.attempted.as_deref(), Some("attacker.test"));
2178        assert_eq!(
2179            dc_pre.expected,
2180            Some(vec!["*.springer.com".to_string(), "*.plos.org".to_string()]),
2181        );
2182        assert_eq!(dc_pre.hop_index, None);
2183        assert_eq!(dc_pre.cap, None);
2184        assert_eq!(dc_pre.actual, None);
2185    }
2186
2187    // -----------------------------------------------------------------
2188    // #139 — metadata_only_to_store writes the metadata TOML;
2189    //        resolve_only / pure metadata_only write NOTHING.
2190    // -----------------------------------------------------------------
2191
2192    /// Build a ctx + FsStore under a fresh tempdir and point Crossref at
2193    /// a wiremock origin that returns one minimal `message`. Returns
2194    /// `(server, ctx, store, store_root, _td)` — `_td` keeps the tempdir
2195    /// alive for the test body.
2196    async fn md139_harness() -> (
2197        wiremock::MockServer,
2198        FetchContext,
2199        crate::store::FsStore,
2200        Utf8PathBuf,
2201        tempfile::TempDir,
2202    ) {
2203        use crate::http::HttpClient;
2204        use crate::provenance::ProvenanceLog;
2205        use crate::rate_limiter::RateLimiter;
2206        use crate::store::FsStore;
2207        use crate::RateLimits;
2208        use std::sync::Arc;
2209        use wiremock::matchers::method;
2210        use wiremock::{Mock, MockServer, ResponseTemplate};
2211
2212        let server = MockServer::start().await;
2213        Mock::given(method("GET"))
2214            .respond_with(ResponseTemplate::new(200).set_body_string(
2215                r#"{"status":"ok","message":{"title":["Example Paper"],"author":[{"given":"Ada","family":"Lovelace"}]}}"#,
2216            ))
2217            .mount(&server)
2218            .await;
2219        std::env::set_var("DOIGET_CROSSREF_BASE", server.uri());
2220
2221        // wiremock serves http://127.0.0.1:PORT; the production client is
2222        // https_only, so the test ctx uses the allow-http test client
2223        // scoped to the crossref/unpaywall source keys + the wiremock host.
2224        let host = server
2225            .uri()
2226            .parse::<url::Url>()
2227            .expect("uri")
2228            .host_str()
2229            .expect("host")
2230            .to_string();
2231
2232        let td = tempfile::TempDir::new().expect("tempdir");
2233        let base = Utf8Path::from_path(td.path()).expect("utf-8");
2234        let log_path = base.join("log.jsonl");
2235        let store_root = base.join("papers");
2236        let ctx = FetchContext {
2237            http: Arc::new(HttpClient::new_for_tests_allow_http_multi(&[
2238                ("crossref", &host),
2239                ("unpaywall", &host),
2240            ])),
2241            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2242            log: Arc::new(
2243                ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
2244                    .expect("provenance log"),
2245            ),
2246            session_id: "01J0000000000000000000TEST".into(),
2247        };
2248        let store = FsStore::new(store_root.clone()).expect("fs store");
2249        (server, ctx, store, store_root, td)
2250    }
2251
2252    fn metadata_dir_tomls(store_root: &Utf8Path) -> Vec<Utf8PathBuf> {
2253        let md = store_root.join(".metadata");
2254        match std::fs::read_dir(md.as_std_path()) {
2255            Ok(rd) => rd
2256                .filter_map(|e| e.ok())
2257                .filter_map(|e| Utf8PathBuf::from_path_buf(e.path()).ok())
2258                .filter(|p| p.extension() == Some("toml"))
2259                .collect(),
2260            Err(_) => Vec::new(),
2261        }
2262    }
2263
2264    #[tokio::test]
2265    #[serial_test::serial]
2266    async fn metadata_only_to_store_writes_metadata_toml_139() {
2267        let (_server, ctx, store, store_root, _td) = md139_harness().await;
2268        let profile = CapabilityProfile::from_env().expect("clean env");
2269        let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));
2270
2271        let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
2272            .await
2273            .expect("metadata_only_to_store ok");
2274        assert_eq!(outcome.source, "crossref");
2275
2276        let tomls = metadata_dir_tomls(&store_root);
2277        assert_eq!(
2278            tomls.len(),
2279            1,
2280            "exactly one .metadata/*.toml must be written (MCP_TOOLS.md §11 SIDE EFFECT, #139); got {tomls:?}"
2281        );
2282        let body = std::fs::read_to_string(&tomls[0]).expect("read metadata toml");
2283        let meta: crate::store::Metadata = toml::from_str(&body).expect("parse metadata toml");
2284        assert_eq!(meta.title, "Example Paper");
2285        assert_eq!(
2286            meta.doi.as_ref().map(|d| d.as_str()),
2287            Some("10.1234/example")
2288        );
2289        let ext = meta.doiget.expect("[doiget] table present");
2290        assert_eq!(ext.source, "crossref");
2291        assert_eq!(ext.size_bytes, 0, "metadata-only entry has no PDF");
2292
2293        std::env::remove_var("DOIGET_CROSSREF_BASE");
2294    }
2295
2296    #[tokio::test]
2297    #[serial_test::serial]
2298    async fn resolve_only_and_pure_metadata_only_write_nothing_139() {
2299        let (_server, ctx, _store, store_root, _td) = md139_harness().await;
2300        let profile = CapabilityProfile::from_env().expect("clean env");
2301        let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));
2302
2303        // resolve_only: contractually MUST NOT touch the store.
2304        let r = resolve_only(&ref_, &profile, &ctx)
2305            .await
2306            .expect("resolve_only ok");
2307        assert_eq!(r.source, "crossref");
2308        assert!(
2309            metadata_dir_tomls(&store_root).is_empty(),
2310            "resolve_only MUST NOT write a metadata TOML (docs/MCP_TOOLS.md §1; #139)"
2311        );
2312
2313        // The pure metadata_only is also write-free (the store-write
2314        // lives only in metadata_only_to_store).
2315        let m = metadata_only(&ref_, &profile, &ctx)
2316            .await
2317            .expect("metadata_only ok");
2318        assert_eq!(m.source, "crossref");
2319        assert!(
2320            metadata_dir_tomls(&store_root).is_empty(),
2321            "pure metadata_only MUST NOT write to the store (#139)"
2322        );
2323
2324        std::env::remove_var("DOIGET_CROSSREF_BASE");
2325    }
2326
2327    /// #139 — the arXiv branch of `metadata_only_to_store` must also
2328    /// write the metadata TOML (different code path: Atom feed,
2329    /// source="arxiv", license="arxiv-default", doi=None). Review I3/C1.
2330    #[tokio::test]
2331    #[serial_test::serial]
2332    async fn metadata_only_to_store_arxiv_writes_metadata_toml_139() {
2333        use crate::http::HttpClient;
2334        use crate::provenance::ProvenanceLog;
2335        use crate::rate_limiter::RateLimiter;
2336        use crate::store::FsStore;
2337        use crate::RateLimits;
2338        use std::sync::Arc;
2339        use wiremock::matchers::method;
2340        use wiremock::{Mock, MockServer, ResponseTemplate};
2341
2342        let atom = r#"<?xml version="1.0" encoding="UTF-8"?>
2343<feed xmlns="http://www.w3.org/2005/Atom">
2344  <entry>
2345    <id>http://arxiv.org/abs/2401.12345v1</id>
2346    <published>2024-01-15T00:00:00Z</published>
2347    <title>Example arXiv Paper Title</title>
2348    <summary>Example abstract.</summary>
2349    <author><name>Jane Doe</name></author>
2350    <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
2351  </entry>
2352</feed>"#;
2353        let server = MockServer::start().await;
2354        Mock::given(method("GET"))
2355            .respond_with(ResponseTemplate::new(200).set_body_string(atom))
2356            .mount(&server)
2357            .await;
2358        std::env::set_var("DOIGET_ARXIV_BASE", server.uri());
2359        let host = server
2360            .uri()
2361            .parse::<url::Url>()
2362            .expect("uri")
2363            .host_str()
2364            .expect("host")
2365            .to_string();
2366
2367        let td = tempfile::TempDir::new().expect("tempdir");
2368        let base = Utf8Path::from_path(td.path()).expect("utf-8");
2369        let store_root = base.join("papers");
2370        let ctx = FetchContext {
2371            http: Arc::new(HttpClient::new_for_tests_allow_http("arxiv", &host)),
2372            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
2373            log: Arc::new(
2374                ProvenanceLog::open(base.join("log.jsonl"), "01J0000000000000000000TEST".into())
2375                    .expect("provenance log"),
2376            ),
2377            session_id: "01J0000000000000000000TEST".into(),
2378        };
2379        let store = FsStore::new(store_root.clone()).expect("fs store");
2380        let profile = CapabilityProfile::from_env().expect("clean env");
2381        let ref_ = Ref::Arxiv(crate::ArxivId::parse("2401.12345").expect("arxiv id"));
2382
2383        let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
2384            .await
2385            .expect("metadata_only_to_store (arxiv) ok");
2386        assert_eq!(outcome.source, "arxiv");
2387
2388        let tomls = metadata_dir_tomls(&store_root);
2389        assert_eq!(
2390            tomls.len(),
2391            1,
2392            "arXiv metadata-only must write one TOML; got {tomls:?}"
2393        );
2394        let meta: crate::store::Metadata =
2395            toml::from_str(&std::fs::read_to_string(&tomls[0]).expect("read")).expect("parse");
2396        assert_eq!(meta.title, "Example arXiv Paper Title");
2397        assert_eq!(
2398            meta.arxiv_id.as_ref().map(|a| a.as_str()),
2399            Some("2401.12345")
2400        );
2401        assert!(meta.doi.is_none(), "arXiv entry has no DOI");
2402        let ext = meta.doiget.expect("[doiget] table");
2403        assert_eq!(ext.source, "arxiv");
2404        assert_eq!(ext.license, "arxiv-default");
2405
2406        std::env::remove_var("DOIGET_ARXIV_BASE");
2407    }
2408
2409    // ----- pure-function unit tests for the #139 extraction helpers ----
2410
2411    #[test]
2412    fn extract_metadata_title_handles_string_array_missing_blank() {
2413        use serde_json::json;
2414        // bare string (arXiv/Unpaywall shape)
2415        assert_eq!(
2416            extract_metadata_title(&json!({"title": "Hello"})),
2417            Some("Hello".to_string())
2418        );
2419        // single-element array (Crossref `message.title` in practice)
2420        assert_eq!(
2421            extract_metadata_title(&json!({"title": ["Real Title"]})),
2422            Some("Real Title".to_string())
2423        );
2424        // missing key -> None (caller falls back to ref id)
2425        assert_eq!(extract_metadata_title(&json!({"x": 1})), None);
2426        // blank string -> None (must not persist an empty title)
2427        assert_eq!(extract_metadata_title(&json!({"title": "   "})), None);
2428        // empty array -> None
2429        assert_eq!(extract_metadata_title(&json!({"title": []})), None);
2430        // A leading blank/whitespace array element is SKIPPED — the first
2431        // non-blank element is taken (a stray leading empty element must
2432        // not mask the real Crossref title).
2433        assert_eq!(
2434            extract_metadata_title(&json!({"title": ["  ", "Real Title"]})),
2435            Some("Real Title".to_string())
2436        );
2437        // all-blank array -> None (caller falls back to ref id)
2438        assert_eq!(extract_metadata_title(&json!({"title": ["  ", ""]})), None);
2439    }
2440
2441    #[test]
2442    fn extract_metadata_authors_handles_each_resolver_shape() {
2443        use serde_json::json;
2444        // arXiv: authors: [String]
2445        assert_eq!(
2446            extract_metadata_authors(&json!({"authors": ["Jane Doe", "John Roe"]})),
2447            vec!["Jane Doe".to_string(), "John Roe".to_string()]
2448        );
2449        // Crossref: author: [{given,family}]
2450        assert_eq!(
2451            extract_metadata_authors(&json!({"author": [{"given": "Ada", "family": "Lovelace"}]})),
2452            vec!["Ada Lovelace".to_string()]
2453        );
2454        // family-only (given absent) -> trimmed, no leading space
2455        assert_eq!(
2456            extract_metadata_authors(&json!({"author": [{"family": "Onsager"}]})),
2457            vec!["Onsager".to_string()]
2458        );
2459        // `name` fallback when given+family both absent
2460        assert_eq!(
2461            extract_metadata_authors(&json!({"author": [{"name": "K. Wilson"}]})),
2462            vec!["K. Wilson".to_string()]
2463        );
2464        // z_authors fallback shape (forward-compat branch)
2465        assert_eq!(
2466            extract_metadata_authors(&json!({"z_authors": [{"given": "L", "family": "Kadanoff"}]})),
2467            vec!["L Kadanoff".to_string()]
2468        );
2469        // nothing parseable -> empty (still a valid TOML)
2470        assert!(extract_metadata_authors(&json!({"x": 1})).is_empty());
2471        assert!(extract_metadata_authors(&json!({"authors": []})).is_empty());
2472    }
2473}