Skip to main content

doiget_cli/commands/
fetch.rs

1//! `doiget fetch <ref>` subcommand.
2//!
3//! Phase 1 scope:
4//!
5//! - **arXiv refs** — full end-to-end: PDF bytes are fetched via the
6//!   `doiget_core::sources::arxiv::ArxivSource`, the `[doiget]`
7//!   extension table is populated with the resolved license, source,
8//!   size, and `fetched_at`, and the result is written to the on-disk
9//!   store with both the metadata TOML and the PDF.
10//! - **DOI refs** — Crossref metadata + Unpaywall license enrichment + an
11//!   OA PDF fetch when Unpaywall's `best_oa_location.url_for_pdf` (or
12//!   `best_oa_location.url`) resolves to a host on the synthetic
13//!   `"oa-publisher"` allowlist (`docs/REDIRECT_ALLOWLIST.md` §3). The OA
14//!   URL host check is informed-best-effort; if the host is not on the
15//!   allowlist or the body fails the magic-byte check, the orchestrator
16//!   logs a `Fetch err` row under `source = "oa-publisher"` and falls back
17//!   to metadata-only success — the metadata is still useful.
18//!
19//! ## Provenance contract
20//!
21//! Per `docs/PROVENANCE_LOG.md` §3, every invocation emits at least one
22//! `SessionStart`, one or more `Fetch` rows (one per source consulted), one
23//! `StoreWrite` row on success, and one `SessionEnd`. Each `Fetch` row is
24//! appended by the underlying `Source` impl; the orchestrator owns the
25//! session-bookend rows and the `StoreWrite` row.
26//!
27//! ## Configuration surface
28//!
29//! Hard-coded paths with env-var overrides; full `config.toml` plumbing
30//! arrives in a follow-up. See `docs/CONFIG.md` for the eventual surface.
31//!
32//! | Env var | Default | Purpose |
33//! |---|---|---|
34//! | `DOIGET_STORE_ROOT` | `$HOME/papers` (or `%USERPROFILE%\papers` on Windows) | Filesystem store root |
35//! | `DOIGET_LOG_PATH` | `<config>/doiget/access.jsonl` | Provenance log file |
36//! | `DOIGET_CONTACT_EMAIL` | `doiget@localhost` | Polite-pool contact email (User-Agent and Crossref) |
37//! | `DOIGET_UNPAYWALL_EMAIL` | (= contact email) | Unpaywall query-string email |
38//! | `DOIGET_ARXIV_BASE` | `https://arxiv.org` | arXiv source base (test override) |
39//! | `DOIGET_CROSSREF_BASE` | `https://api.crossref.org` | Crossref source base (test override) |
40//! | `DOIGET_UNPAYWALL_BASE` | `https://api.unpaywall.org/v2` | Unpaywall source base (test override) |
41//! | `DOIGET_OA_PUBLISHER_BASE` | (production allowlist) | OA publisher host allowlist override (test override) |
42
43use std::sync::Arc;
44
45use anyhow::{anyhow, Context, Result};
46use camino::Utf8PathBuf;
47
48#[cfg(feature = "citation")]
49use doiget_core::http::tier_2_allowlist;
50use doiget_core::http::{oa_publisher_allowlist, tier_1_allowlist, HttpClient};
51use doiget_core::orchestrator::{fetch_paper as core_fetch_paper, FetchPaperOutcome, PdfLegStatus};
52use doiget_core::provenance::{Capability, LogEvent, LogResult, ProvenanceLog, RowInput};
53use doiget_core::rate_limiter::RateLimiter;
54use doiget_core::source::{FetchContext, FetchError};
55use doiget_core::store::FsStore;
56use doiget_core::{CapabilityProfile, DenialContext, DenialReason, ErrorCode, RateLimits, Ref};
57
58/// Defer to docs/PROVENANCE_LOG.md §3: 26-char ULID per process invocation.
59fn new_session_id() -> String {
60    ulid::Ulid::new().to_string()
61}
62
63// ---------------------------------------------------------------------------
64// Dry-run plan / preview (ADR-0022)
65// ---------------------------------------------------------------------------
66
67// The structured `FetchPlan` shape, the `build_fetch_plan` builder, and
68// the `build_dry_run_envelope` JSON-shape helper live in `doiget-core`
69// so the MCP server can produce a bit-identical envelope without
70// depending on `doiget-cli`. The CLI re-exports them here for callers
71// that already `use doiget_cli::commands::fetch`.
72pub use doiget_core::dry_run::{
73    build_dry_run_envelope, build_fetch_plan, FetchPlan, PdfSourcePlan, RateLimitBudget,
74};
75
76/// Serialize the dry-run envelope and write it to stdout. Used by the
77/// `--dry-run` flag on `doiget fetch` and `doiget batch`. The envelope
78/// shape matches ADR-0022 §1 / `docs/MCP_TOOLS.md` §10.
79///
80/// `pub` so `commands::batch` (multi-ref dry-run) can reuse it. The
81/// function lives in `doiget-cli` (not `doiget-core`) because `println!`
82/// is a CLI concern; the MCP server uses [`build_dry_run_envelope`]
83/// directly and routes the bytes via JSON-RPC.
84///
85/// `print_stdout` is workspace-deny for MCP stdio safety (ADR-0001 /
86/// `docs/SECURITY.md` §3); `--dry-run` is a CLI-only path that never
87/// runs under the MCP server, so the localized `#[allow]` is the
88/// minimal intervention — same pattern used by `commands::config`,
89/// `commands::info`, etc.
90#[allow(clippy::print_stdout)]
91pub fn emit_dry_run_plan_to_stdout(ref_: &Ref, plan: &FetchPlan) -> Result<()> {
92    let envelope = build_dry_run_envelope(ref_, plan);
93    let s = serde_json::to_string(&envelope).context("serializing dry-run envelope to JSON")?;
94    println!("{s}");
95    Ok(())
96}
97
98/// Resolve the provenance log path. `DOIGET_LOG_PATH` wins; otherwise
99/// fall back to `<config>/doiget/access.jsonl` per `docs/PROVENANCE_LOG.md`
100/// §1.
101fn resolve_log_path() -> Result<Utf8PathBuf> {
102    if let Some(s) = read_env_utf8("DOIGET_LOG_PATH")? {
103        return Ok(Utf8PathBuf::from(s));
104    }
105    let cfg = config_dir_utf8()?;
106    Ok(cfg.join("doiget").join("access.jsonl"))
107}
108
109/// Read an env var and assert it is valid UTF-8. Returns `Ok(None)` if
110/// unset; `Ok(Some(s))` if set and UTF-8; `Err(...)` if set but non-UTF-8.
111/// `std::env::var` already requires UTF-8 (returns `VarError::NotUnicode`
112/// otherwise); we wrap it to surface a friendlier error and avoid the
113/// banned `std::path::PathBuf` round-trip.
114fn read_env_utf8(key: &str) -> Result<Option<String>> {
115    match std::env::var(key) {
116        Ok(s) => Ok(Some(s)),
117        Err(std::env::VarError::NotPresent) => Ok(None),
118        Err(std::env::VarError::NotUnicode(_)) => Err(anyhow!("{key} is not valid UTF-8")),
119    }
120}
121
122/// Best-effort home-dir resolution without depending on the `dirs` crate
123/// (every new dep adds cargo-vet exemption churn). Honors `HOME` first
124/// (POSIX + most CI), then `USERPROFILE` (Windows).
125fn home_dir_utf8() -> Result<Utf8PathBuf> {
126    if let Some(s) = read_env_utf8("HOME")? {
127        return Ok(Utf8PathBuf::from(s));
128    }
129    if let Some(s) = read_env_utf8("USERPROFILE")? {
130        return Ok(Utf8PathBuf::from(s));
131    }
132    Err(anyhow!("neither HOME nor USERPROFILE is set"))
133}
134
135/// Best-effort config-dir resolution. Honors `XDG_CONFIG_HOME` first
136/// (POSIX), then `APPDATA` (Windows), then falls back to `$HOME/.config`.
137///
138/// Crate-visible so sibling modules (`commands::capabilities`,
139/// `commands::config`) can resolve the same `<config_dir>/doiget/`
140/// path the production HTTP-client builder reads from. Keep the
141/// signature stable: any divergence between this and the MCP-side
142/// copy (`crates/doiget-mcp/src/lib.rs::config_dir_utf8`) would
143/// silently desync the user-extension allowlist surfaces.
144pub(crate) fn config_dir_utf8() -> Result<Utf8PathBuf> {
145    if let Some(s) = read_env_utf8("XDG_CONFIG_HOME")? {
146        return Ok(Utf8PathBuf::from(s));
147    }
148    if let Some(s) = read_env_utf8("APPDATA")? {
149        return Ok(Utf8PathBuf::from(s));
150    }
151    let home = home_dir_utf8()?;
152    Ok(home.join(".config"))
153}
154
155/// Construct the workspace-wide [`HttpClient`].
156///
157/// Production path: `HttpClient::new(tier_1_allowlist() ∪ oa_publisher_allowlist())` —
158/// strict HTTPS-only with the canonical Tier-1 redirect allowlist (Crossref,
159/// Unpaywall, arXiv) plus the synthetic `"oa-publisher"` allowlist used for
160/// the OA PDF leg of the DOI fetch path (`fetch_doi` issues
161/// `HttpClient::fetch_pdf("oa-publisher", url)` against the URL Unpaywall
162/// returned in `best_oa_location`). The OA-publisher list is
163/// informed-best-effort per `docs/REDIRECT_ALLOWLIST.md` §3.
164///
165/// Test path: when any of the three `DOIGET_*_BASE` env vars is set, build a
166/// multi-source relaxed-`https_only` client whose per-source allowlist is
167/// derived from the corresponding env-var hosts. The `oa-publisher` source
168/// key is registered against the same host (typically the wiremock origin)
169/// when `DOIGET_OA_PUBLISHER_BASE` is set — this lets the integration tests
170/// under `tests/fetch_doi_oa_pdf_e2e.rs` exercise the full PDF leg without
171/// touching the real network.
172fn build_http_client() -> Result<HttpClient> {
173    let arxiv = std::env::var("DOIGET_ARXIV_BASE").ok();
174    let crossref = std::env::var("DOIGET_CROSSREF_BASE").ok();
175    let unpaywall = std::env::var("DOIGET_UNPAYWALL_BASE").ok();
176    let oa_publisher = std::env::var("DOIGET_OA_PUBLISHER_BASE").ok();
177    // Slice 16: `DOIGET_OPENALEX_BASE` selects a wiremock host for the
178    // citation-graph BFS. Only meaningful with `--features citation`,
179    // but reading the env unconditionally keeps the branch logic
180    // simple and is harmless for default builds.
181    let openalex_base = std::env::var("DOIGET_OPENALEX_BASE").ok();
182
183    if arxiv.is_none()
184        && crossref.is_none()
185        && unpaywall.is_none()
186        && oa_publisher.is_none()
187        && openalex_base.is_none()
188    {
189        let mut allowlists = tier_1_allowlist();
190        allowlists.extend(oa_publisher_allowlist());
191        // Slice 16: when the `citation` feature is compiled in, the
192        // graph subcommand walks OpenAlex Work IDs via
193        // `ctx.http.fetch_bytes("openalex", ...)`. The Tier 2
194        // allowlist registers the `api.openalex.org` host under
195        // that source key. CapabilityProfile.metadata.openalex is
196        // the runtime gate; the allowlist is the transport gate.
197        #[cfg(feature = "citation")]
198        allowlists.extend(tier_2_allowlist());
199
200        // ADR-0028 D2: merge user-extension hosts from
201        // `<config_dir>/doiget/config.toml`. See
202        // `doiget_core::user_extension` for the wire contract and
203        // the (deferred) S3b provenance / doctor / capabilities
204        // surfaces.
205        //
206        // Failure handling is opt-in-convenience: a missing config
207        // is silent (Ok-empty), a malformed config emits
208        // `tracing::warn!` and continues with the curated allowlist,
209        // and an unresolvable config dir emits `tracing::debug!`
210        // (only happens in stripped envs with no HOME / XDG /
211        // APPDATA — review pass I3 / A1).
212        match config_dir_utf8() {
213            Ok(cfg_dir) => {
214                let path = cfg_dir.join("doiget").join("config.toml");
215                match doiget_core::user_extension::load(&path) {
216                    Ok(user_hosts) if !user_hosts.is_empty() => {
217                        tracing::info!(
218                            count = user_hosts.len(),
219                            path = %path,
220                            "merging user-extension allowlist hosts (ADR-0028 D2)"
221                        );
222                        doiget_core::user_extension::merge_into_allowlists(
223                            &mut allowlists,
224                            &user_hosts,
225                        );
226                    }
227                    Ok(_) => {}
228                    Err(e) => {
229                        tracing::warn!(
230                            error = %e,
231                            path = %path,
232                            "failed to load user-extension allowlist; \
233                             falling back to curated set only"
234                        );
235                    }
236                }
237            }
238            Err(e) => {
239                tracing::debug!(
240                    error = %e,
241                    "config dir unresolvable; \
242                     user-extension allowlist disabled (curated set only)"
243                );
244            }
245        }
246
247        return HttpClient::new(allowlists).context("building HTTP client");
248    }
249
250    // Test-base mode: build a relaxed client per overridden source.
251    let mut owned: Vec<(String, String)> = Vec::new();
252    for (source, base) in [
253        ("arxiv", arxiv.as_deref()),
254        ("crossref", crossref.as_deref()),
255        ("unpaywall", unpaywall.as_deref()),
256        ("oa-publisher", oa_publisher.as_deref()),
257        ("openalex", openalex_base.as_deref()),
258    ] {
259        if let Some(b) = base {
260            let url = url::Url::parse(b)
261                .with_context(|| format!("DOIGET_*_BASE for {source} is not a URL: {b}"))?;
262            let host = url
263                .host_str()
264                .ok_or_else(|| anyhow!("base URL has no host: {b}"))?;
265            owned.push((source.to_string(), host.to_string()));
266        }
267    }
268    let entries: Vec<(&str, &str)> = owned
269        .iter()
270        .map(|(s, h)| (s.as_str(), h.as_str()))
271        .collect();
272    Ok(HttpClient::new_for_tests_allow_http_multi(&entries))
273}
274
275// Slice 2: the per-source env-aware constructors that used to live here
276// (`build_arxiv_source`, `build_crossref_source`, `build_unpaywall_source`)
277// moved into `doiget-core::orchestrator` so the core `fetch_paper`
278// orchestrator and the MCP server both honor the same `DOIGET_*_BASE`
279// test-override surface. The CLI no longer constructs sources directly —
280// it builds the `FetchContext` + `FsStore` and hands them to the core
281// orchestrator.
282
283/// Resolved configuration derived from the environment.
284///
285/// Slice 2: `contact_email` / `unpaywall_email` are now read by the
286/// `doiget-core::orchestrator::fetch_paper` orchestrator directly from
287/// the env (`contact_email_from_env` / `unpaywall_email_from_env` in
288/// that module), so the CLI no longer threads them through. The fields
289/// stay here so a future slice that adds CLI-flag overrides has a
290/// natural attachment point — the `#[allow(dead_code)]` is the minimal
291/// intervention until that slice lands.
292#[allow(dead_code)]
293pub(crate) struct OrchestratorConfig {
294    pub(crate) store_root: Utf8PathBuf,
295    pub(crate) log_path: Utf8PathBuf,
296    pub(crate) contact_email: String,
297    pub(crate) unpaywall_email: String,
298}
299
300impl OrchestratorConfig {
301    fn from_env() -> Result<Self> {
302        let store_root = super::resolve_store_root()?;
303        let log_path = resolve_log_path()?;
304        let contact_email =
305            std::env::var("DOIGET_CONTACT_EMAIL").unwrap_or_else(|_| "doiget@localhost".into());
306        let unpaywall_email =
307            std::env::var("DOIGET_UNPAYWALL_EMAIL").unwrap_or_else(|_| contact_email.clone());
308        Ok(Self {
309            store_root,
310            log_path,
311            contact_email,
312            unpaywall_email,
313        })
314    }
315}
316
/// Reusable fetch harness shared by `doiget fetch <ref>` (single ref) and
/// `doiget batch <path>` (many refs). Owns the shared foundation modules
/// (`HttpClient` / `RateLimiter` / `ProvenanceLog`), the on-disk store, and
/// the resolved capability profile, plus the session bookkeeping required by
/// `docs/PROVENANCE_LOG.md` §3 (the 26-char ULID `session_id`).
///
/// Construction is performed once via [`FetchHarness::from_env`]. Per-ref
/// orchestration runs through [`FetchHarness::fetch_one`]; bookend rows go
/// via [`FetchHarness::log_session_start`] / [`FetchHarness::log_session_end`]
/// so the orchestrator can frame either one fetch or many.
pub(crate) struct FetchHarness {
    /// HTTP client produced by `build_http_client`; shared with every
    /// `FetchContext` handed out by `fetch_context`.
    pub(crate) http: Arc<HttpClient>,
    /// Rate limiter constructed from `RateLimits::HARD_CODED`.
    pub(crate) rate_limiter: Arc<RateLimiter>,
    /// Provenance log opened at `cfg.log_path` for this session.
    pub(crate) log: Arc<ProvenanceLog>,
    /// On-disk store rooted at `cfg.store_root`.
    pub(crate) store: FsStore,
    /// Capability profile resolved via `CapabilityProfile::from_env`.
    pub(crate) profile: CapabilityProfile,
    /// Session identifier allocated in `from_env`; the same value is given
    /// to the provenance log and to each `FetchContext`.
    pub(crate) session_id: String,
    /// Resolved config; Slice 2 keeps this on the harness for the
    /// CLI-only env diagnostics path (`commands::config::doctor`), even
    /// though `fetch_one` no longer needs it (the core orchestrator
    /// re-reads contact email from env directly).
    #[allow(dead_code)]
    pub(crate) cfg: OrchestratorConfig,
}
341
342impl FetchHarness {
343    /// Build a harness from the same env-var surface documented at the top
344    /// of this module. Creates the log parent directory if missing, opens
345    /// the provenance log (allocating a fresh `session_id`), and constructs
346    /// the HTTP client honoring `DOIGET_*_BASE` overrides for tests.
347    pub(crate) fn from_env() -> Result<Self> {
348        let cfg = OrchestratorConfig::from_env()?;
349        if let Some(parent) = cfg.log_path.parent() {
350            if !parent.as_str().is_empty() {
351                std::fs::create_dir_all(parent.as_std_path())
352                    .with_context(|| format!("creating log dir {parent}"))?;
353            }
354        }
355        let session_id = new_session_id();
356        let log = Arc::new(
357            ProvenanceLog::open(cfg.log_path.clone(), session_id.clone())
358                .context("opening provenance log")?,
359        );
360        let http = Arc::new(build_http_client()?);
361        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
362        let store = FsStore::new(cfg.store_root.clone()).context("opening store")?;
363        let profile = CapabilityProfile::from_env().context("resolving capability profile")?;
364
365        Ok(Self {
366            http,
367            rate_limiter,
368            log,
369            store,
370            profile,
371            session_id,
372            cfg,
373        })
374    }
375
376    /// Build a [`FetchContext`] view over this harness's foundation modules.
377    /// Creating one is cheap (cloning three `Arc`s + a `String`); per-ref
378    /// orchestration constructs one on demand.
379    pub(crate) fn fetch_context(&self) -> FetchContext {
380        FetchContext {
381            http: self.http.clone(),
382            rate_limiter: self.rate_limiter.clone(),
383            log: self.log.clone(),
384            session_id: self.session_id.clone(),
385        }
386    }
387
388    /// Append a `SessionStart` row. `ref_input` is the raw user-supplied ref
389    /// string (single-fetch path); pass `None` for batch sessions where no
390    /// single ref attributes the session.
391    pub(crate) fn log_session_start(&self, ref_input: Option<&str>) -> Result<()> {
392        self.log
393            .append(RowInput {
394                event: LogEvent::SessionStart,
395                result: LogResult::Ok,
396                capability: Capability::Oa,
397                ref_: ref_input,
398                source: None,
399                error_code: None,
400                size_bytes: None,
401                license: None,
402                store_path: None,
403                // Session bookend — no audit identity (ADR-0021 §1).
404                canonical_digest: None,
405            })
406            .context("appending SessionStart row")?;
407        Ok(())
408    }
409
410    /// Append a `SessionEnd` row. `ref_input` mirrors the `log_session_start`
411    /// argument; pass `None` for batch sessions. The result is best-effort —
412    /// if this append fails, the caller already has the underlying fetch
413    /// error (if any) and we don't override it.
414    pub(crate) fn log_session_end(&self, ok: bool, ref_input: Option<&str>) {
415        let result = if ok { LogResult::Ok } else { LogResult::Err };
416        let _ = self.log.append(RowInput {
417            event: LogEvent::SessionEnd,
418            result,
419            capability: Capability::Oa,
420            ref_: ref_input,
421            source: None,
422            error_code: None,
423            size_bytes: None,
424            license: None,
425            store_path: None,
426            // Session bookend — no audit identity (ADR-0021 §1).
427            canonical_digest: None,
428        });
429    }
430
431    /// Run a single ref through the per-kind orchestration (arxiv → PDF +
432    /// metadata; doi → metadata-only via Crossref + Unpaywall, with an
433    /// informed-best-effort OA PDF leg). Errors here are scoped to this
434    /// one ref — the caller decides whether to abort the surrounding
435    /// session.
436    ///
437    /// Slice 2: delegates to
438    /// [`doiget_core::orchestrator::fetch_paper`] for the actual work
439    /// (which both CLI and MCP now share). This function keeps the
440    /// CLI-only stderr success-line print.
441    pub(crate) async fn fetch_one(&self, ref_: &Ref) -> Result<FetchPaperOutcome, FetchError> {
442        // Pure data path: return the typed outcome (or typed error)
443        // without any CLI-only rendering or exit-code synthesis. The
444        // single-fetch caller (`run_with_options`) and the batch
445        // caller (`commands::batch::classify_joined`) each render the
446        // human / JSON surface and map to `CliExit` themselves — see
447        // #210 for the rationale (batch's `--json` JSONL needs the
448        // structured `FetchPaperOutcome` to emit `result.{safekey,
449        // store_path, canonical_digest}` on success and
450        // `denial_context` on a `PdfLegStatus::Blocked` outcome, which
451        // was unreachable through the previous `Result<()>`
452        // signature).
453        let ctx = self.fetch_context();
454        core_fetch_paper(ref_, &self.profile, &ctx, &self.store, self.store.root()).await
455    }
456}
457
458/// `true` iff the outcome represents a clean fetch: `Fetched` (full
459/// PDF) or `NoOaUrl` (metadata-only by design). A `Blocked` PDF leg
460/// is a failure for SessionEnd / exit-code purposes — an OA PDF was
461/// discovered but could not be retrieved — even though the metadata
462/// TOML did land on disk. Pulled out so both `run_with_options` and
463/// `commands::batch` agree on the failure boundary.
464pub(crate) fn outcome_is_clean_success(outcome: &FetchPaperOutcome) -> bool {
465    !matches!(outcome.pdf_leg, PdfLegStatus::Blocked { .. })
466}
467
468/// CLI-only one-line success message on stderr (ADR-0001 stdio
469/// convention). Renders the [`FetchPaperOutcome`] in the same form the
470/// pre-Slice-2 CLI emitted: a full-PDF success names the PDF path; a
471/// metadata-only DOI fallback (size_bytes == 0) names the metadata TOML
472/// path the orchestrator wrote.
473fn emit_success_line(ref_: &Ref, outcome: &FetchPaperOutcome) {
474    let label = match ref_ {
475        Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
476        Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
477    };
478    match &outcome.pdf_leg {
479        PdfLegStatus::Fetched => {
480            print_success(format_args!(
481                "fetched {} ({} bytes) -> {}",
482                label, outcome.size_bytes, outcome.path
483            ));
484        }
485        PdfLegStatus::NoOaUrl => {
486            print_success(format_args!(
487                "fetched {} (metadata-only: no OA PDF available) -> {}",
488                label, outcome.path
489            ));
490        }
491        // Issue #145: `Blocked` is NO LONGER a success outcome. It is
492        // intercepted in `fetch_one` BEFORE `emit_success_line` is
493        // called and rendered via `render_blocked_error` with a
494        // non-zero exit (`docs/ERRORS.md` §3/§6 — no silent failures).
495        // Reaching this arm would mean the interception regressed, so we
496        // fail closed: surface the `error[CODE]:` line here too rather
497        // than printing a misleading success line.
498        PdfLegStatus::Blocked {
499            code,
500            message,
501            denial,
502        } => {
503            // Same #145 reclassification as the primary interception in
504            // `fetch_one`, so this fail-closed fallback stays consistent.
505            let effective = effective_blocked_code(*code, denial.as_ref());
506            render_blocked_error(ref_, outcome, effective, message, denial.as_ref());
507        }
508        // `PdfLegStatus` is `#[non_exhaustive]`; a future variant
509        // degrades to the size-based wording rather than failing the
510        // downstream-crate build.
511        _ => {
512            if outcome.size_bytes == 0 {
513                print_success(format_args!(
514                    "fetched {} (metadata-only) -> {}",
515                    label, outcome.path
516                ));
517            } else {
518                print_success(format_args!(
519                    "fetched {} ({} bytes) -> {}",
520                    label, outcome.size_bytes, outcome.path
521                ));
522            }
523        }
524    }
525}
526
527/// Run the `doiget fetch <ref>` subcommand.
528///
529/// `dry_run` (ADR-0022 §1): when `true`, build a [`FetchPlan`] from the
530/// parsed [`Ref`] and the configured store root, serialize it as JSON to
531/// stdout, and return `Ok(())` immediately, **without** building a
532/// `FetchHarness` (no provenance log open), without contacting the
533/// network, without writing to the store, and without appending a
534/// provenance row.
535///
536/// When `dry_run` is `false`, the function runs the normal end-to-end
537/// orchestration path: open the provenance log, dispatch the per-kind
538/// orchestrator, emit a `SessionStart` / `SessionEnd` bookend pair.
539///
540/// On success returns `Ok(())` and writes a one-line success message to
541/// stderr (per ADR-0001 stdio convention — no stdout writes from `fetch`
542/// on the normal path). On failure, returns an `anyhow::Error` and emits
543/// a `SessionEnd` row with `result=err` to the provenance log before
544/// returning.
545///
546/// # History
547///
548/// Slice 5 (PR #84 advisory item A2/A3 refactor): the previous
549/// `FetchOptions { dry_run: bool }` single-field option bundle plus the
550/// thin `run(input)` backwards-compat wrapper were collapsed into this
551/// single `dry_run: bool` parameter — the option bundle's single-bool
552/// shape was YAGNI, and the wrapper only existed to spare integration
553/// tests a `FetchOptions::default()` literal.
554pub async fn run_with_options(
555    input: String,
556    dry_run: bool,
557    _mode: super::output::OutputMode,
558) -> Result<()> {
559    // `_mode` is threaded per ADR-0017 / #144. Quiet-suppression of the
560    // success line is tracked in #203. The dry-run plan envelope is
561    // product output (the requested artifact) and is unaffected by
562    // mode.
563    // Step 1: parse + safekey. Issue #119: render the cargo-style
564    // `error[INVALID_REF]:` line + carry the exit code, rather than
565    // letting the granular `RefParseError` fall out as an opaque
566    // anyhow `{:?}` dump.
567    let ref_ = match Ref::parse(&input) {
568        Ok(r) => r,
569        Err(e) => {
570            print_err(format_args!(
571                "error[{}]: invalid ref: {e}",
572                ErrorCode::InvalidRef.as_wire()
573            ));
574            return Err(anyhow::Error::new(CliExit(cli_exit_code(
575                ErrorCode::InvalidRef,
576            ))));
577        }
578    };
579
580    // Dry-run branch: build the plan and emit it. NO harness, NO network,
581    // NO store write, NO provenance row. Posture-lint ADR-0022 §5 will
582    // verify this branch never reaches `HttpClient::fetch_*`,
583    // `FsStore::write_*`, or `ProvenanceLog::append`.
584    if dry_run {
585        // Resolve store root for path projections. Failures here surface
586        // as a normal CLI error (not as a denial) — same behaviour the
587        // non-dry-run path would exhibit on a misconfigured environment.
588        let store_root = super::resolve_store_root()?;
589        let plan = build_fetch_plan(&ref_, &store_root);
590        emit_dry_run_plan_to_stdout(&ref_, &plan)?;
591        return Ok(());
592    }
593
594    // Step 2: build harness (foundation modules + provenance log).
595    let harness = FetchHarness::from_env()?;
596
597    // Step 3: emit SessionStart. Fail-closed if the log write fails — the
598    // surrounding fetch MUST NOT proceed (`docs/PROVENANCE_LOG.md` §5).
599    harness.log_session_start(Some(ref_.as_input_str()))?;
600
601    // Step 4: dispatch on ref kind. `fetch_one` now returns the
602    // typed `FetchPaperOutcome` / `FetchError` per #210; the
603    // single-fetch caller (this fn) owns rendering + exit code.
604    let result = harness.fetch_one(&ref_).await;
605
606    // Step 5: emit SessionEnd regardless of outcome. A `Blocked` PDF
607    // leg is NOT a clean success even though the typed `Result` is
608    // `Ok` — `outcome_is_clean_success` collapses both halves so the
609    // SessionEnd `is_ok` field matches the user-facing exit code.
610    let session_ok = match &result {
611        Ok(o) => outcome_is_clean_success(o),
612        Err(_) => false,
613    };
614    harness.log_session_end(session_ok, Some(ref_.as_input_str()));
615
616    // Step 6: render the user-facing surface and map to `CliExit`.
617    // The Blocked-PDF reclassification logic that used to live inside
618    // `fetch_one` was lifted here verbatim so the batch caller can
619    // share the same `effective_blocked_code` / `render_blocked_error`
620    // helpers (issue #210 / #145).
621    match result {
622        Ok(outcome) => {
623            if let PdfLegStatus::Blocked {
624                code,
625                message,
626                denial,
627            } = &outcome.pdf_leg
628            {
629                let effective = effective_blocked_code(*code, denial.as_ref());
630                render_blocked_error(&ref_, &outcome, effective, message, denial.as_ref());
631                return Err(anyhow::Error::new(CliExit(cli_exit_code(effective))));
632            }
633            emit_success_line(&ref_, &outcome);
634            Ok(())
635        }
636        Err(e) => {
637            render_fetch_error(&e);
638            let code: ErrorCode = (&e).into();
639            Err(anyhow::Error::new(CliExit(cli_exit_code(code))))
640        }
641    }
642}
643
/// Single-line user-visible success message, written to stderr per ADR-0001
/// (stdio convention — the CLI never writes a success line to stdout).
///
/// Together with [`print_err`], this is one of the two places in this
/// module where `eprintln!` is intentional; the workspace
/// `clippy::print_stderr` lint is `warn`, so the localized `#[allow]` is
/// the minimal intervention.
///
/// `args` is the pre-built message (callers pass `format_args!(..)`); a
/// trailing newline is appended by `eprintln!`.
#[allow(clippy::print_stderr)]
fn print_success(args: std::fmt::Arguments<'_>) {
    eprintln!("{args}");
}
653
/// Stderr sink for the `docs/ERRORS.md` §3 human-error lines. Mirrors
/// [`print_success`]; the localized `#[allow]` is the minimal
/// intervention for the workspace `clippy::print_stderr` lint.
///
/// `args` is the pre-built `error[CODE]: …` message (callers pass
/// `format_args!(..)`); a trailing newline is appended by `eprintln!`.
#[allow(clippy::print_stderr)]
fn print_err(args: std::fmt::Arguments<'_>) {
    eprintln!("{args}");
}
661
/// Carries a `docs/ERRORS.md` §4 process exit code out of a CLI command
/// to `main`, which owns the actual `std::process::exit` (calling it
/// inside `run_with_options` would kill in-process integration tests).
///
/// By the time a `CliExit` is constructed, the human-readable
/// `error[CODE]: …` line has ALREADY been written to stderr (by
/// `render_fetch_error`, `render_blocked_error`, or the invalid-ref
/// branch of `run_with_options`), so `main` must NOT print it again.
/// Issue #119.
#[derive(Debug)]
pub struct CliExit(pub i32);

impl std::fmt::Display for CliExit {
    /// Formats as `exiting with status <code>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("exiting with status {}", self.0))
    }
}

// Marker impl: lets `CliExit` be wrapped in `anyhow::Error` and
// downcast again by `main`.
impl std::error::Error for CliExit {}
678
679/// Reclassify a `PdfLegStatus::Blocked` code at the CLI layer (issue
680/// #145 / `docs/ERRORS.md` §2 "NETWORK_ERROR" vs §3.1 / §6).
681///
682/// The core maps *every* `FetchError::Http(_)` to
683/// [`ErrorCode::NetworkError`] (`doiget_core::source`'s
684/// `From<&FetchError> for ErrorCode`). `docs/ERRORS.md` §2 defines
685/// `NETWORK_ERROR` as a transport / DNS / TLS fault where "retry usually
686/// fine" — true for a real network blip, but **false** for a deliberate
687/// supply-chain policy block (off-allowlist redirect, insecure-scheme
688/// redirect, host-blocklist hit): retrying such a block never helps, so
689/// surfacing it as `NETWORK_ERROR` (generic exit 1) misrepresents a flaky
690/// network to humans and agents.
691///
692/// The orchestrator already preserves the true reason on the
693/// [`DenialContext`] side-channel (the `From<&HttpError> for
694/// Option<DenialContext>` impl walks reqwest's `source()` chain, so even
695/// a redirect denial wrapped as `HttpError::Network` still yields
696/// [`DenialReason::RedirectNotInAllowlist`]). When that reason is one of
697/// the closed-set *policy* denials, promote the surface code to
698/// [`ErrorCode::CapabilityDenied`] so the CLI renders
699/// `error[CAPABILITY_DENIED]:` and [`cli_exit_code`] returns exit 3 —
700/// the same code `fetch` / `graph` already use for capability denials.
701/// Non-policy blocks (no `denial`, or a non-policy reason such as
702/// `SizeCapExceeded` / `ContentTypeMismatch`) keep the core's code so a
703/// genuine transport failure still reads as `NETWORK_ERROR`.
704pub(crate) fn effective_blocked_code(code: ErrorCode, denial: Option<&DenialContext>) -> ErrorCode {
705    match denial.map(|d| d.reason) {
706        Some(
707            DenialReason::RedirectNotInAllowlist
708            | DenialReason::InsecureScheme
709            | DenialReason::HostInBlockList,
710        ) => ErrorCode::CapabilityDenied,
711        _ => code,
712    }
713}
714
715/// Snake-case wire token for a [`DenialReason`], matching the
716/// `#[serde(rename_all = "snake_case")]` JSON/MCP surface (ADR-0023 §2)
717/// so the CLI human line uses the SAME vocabulary as the machine
718/// envelope (`docs/ERRORS.md` §3.1). Only the policy-denial reasons the
719/// CLI inlines are enumerated; everything else degrades to a generic
720/// token rather than drifting from the serde form.
721fn denial_reason_wire(reason: DenialReason) -> &'static str {
722    match reason {
723        DenialReason::RedirectNotInAllowlist => "redirect_not_in_allowlist",
724        DenialReason::InsecureScheme => "insecure_scheme",
725        DenialReason::HostInBlockList => "host_in_block_list",
726        _ => "policy_denied",
727    }
728}
729
730/// `docs/ERRORS.md` §4 closed-code → process exit code. Anything not
731/// individually listed falls under "at least one fetch failed" (1).
732///
733/// `pub(crate)` so sibling subcommands (`commands::graph`, …) route
734/// their typed denials through the SAME centralized mapping instead of
735/// open-coding magic exit numbers — keeps the `ErrorCode`→exit contract
736/// single-sourced (issue #149).
737pub(crate) fn cli_exit_code(code: ErrorCode) -> i32 {
738    match code {
739        ErrorCode::CapabilityDenied => 3,
740        ErrorCode::StoreError | ErrorCode::LogError => 4,
741        ErrorCode::FetchTimeout => 124,
742        _ => 1,
743    }
744}
745
746/// Render a terminal [`FetchError`] in the `docs/ERRORS.md` §3
747/// "Researcher (CLI human)" form: `error[CODE]: message` on stderr,
748/// plus an actionable `= note:` line carrying the ADR-0023
749/// `denial_context` (attempted / expected hosts) when the failure was
750/// a denial class. stdout stays clean (ADR-0001).
751fn render_fetch_error(e: &FetchError) {
752    let code: ErrorCode = e.into();
753    print_err(format_args!("error[{}]: {}", code.as_wire(), e));
754    if let Some(dc) = Option::<DenialContext>::from(e) {
755        let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
756        match &dc.expected {
757            Some(exp) if !exp.is_empty() => {
758                print_err(format_args!(
759                    "  = note: attempted {attempted}; allowed: {}",
760                    exp.join(", ")
761                ));
762            }
763            _ => {
764                print_err(format_args!("  = note: attempted {attempted}"));
765            }
766        }
767    }
768}
769
770/// Render a `PdfLegStatus::Blocked` outcome in the `docs/ERRORS.md` §3
771/// "Researcher (CLI human)" form. Issue #145: an OA PDF was discovered
772/// but could not be retrieved — the metadata WAS written, but this is a
773/// denial, not a clean success. We emit the same `error[CODE]:` stderr
774/// shape as [`render_fetch_error`] (so pipelines and humans see an
775/// unambiguous failure), name the metadata path that DID land so the
776/// partial result is still discoverable, and surface the ADR-0023
777/// `denial_context` note when present. stdout stays clean (ADR-0001).
778fn render_blocked_error(
779    ref_: &Ref,
780    outcome: &FetchPaperOutcome,
781    code: ErrorCode,
782    message: &str,
783    denial: Option<&DenialContext>,
784) {
785    let label = match ref_ {
786        Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
787        Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
788    };
789    // Issue #145: when the block is a deliberate policy denial, name the
790    // closed-set reason inline so a human/agent reading the
791    // `error[CAPABILITY_DENIED]:` line immediately sees this is a
792    // supply-chain policy block (retrying is futile), not a flaky network.
793    match denial.map(|d| d.reason) {
794        Some(
795            reason @ (DenialReason::RedirectNotInAllowlist
796            | DenialReason::InsecureScheme
797            | DenialReason::HostInBlockList),
798        ) => {
799            print_err(format_args!(
800                "error[{}]: {label}: an OA PDF was found but its host is blocked by \
801                 supply-chain policy ({}): {message}",
802                code.as_wire(),
803                denial_reason_wire(reason)
804            ));
805        }
806        _ => {
807            print_err(format_args!(
808                "error[{}]: {label}: an OA PDF was found but could not be retrieved: {message}",
809                code.as_wire()
810            ));
811        }
812    }
813    if let Some(dc) = denial {
814        let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
815        match &dc.expected {
816            Some(exp) if !exp.is_empty() => {
817                print_err(format_args!(
818                    "  = note: attempted {attempted}; allowed: {}",
819                    exp.join(", ")
820                ));
821            }
822            _ => {
823                print_err(format_args!("  = note: attempted {attempted}"));
824            }
825        }
826    }
827    // The metadata TOML still landed; point the user at it so the
828    // partial result is not lost (it is still useful), without
829    // pretending the fetch succeeded.
830    print_err(format_args!(
831        "  = note: metadata-only record written to {}",
832        outcome.path
833    ));
834}
835
836// ---------------------------------------------------------------------------
837// Tests
838// ---------------------------------------------------------------------------
839
840#[cfg(test)]
841#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
842mod tests {
843    use super::*;
844    use serial_test::serial;
845
846    #[test]
847    fn new_session_id_is_26_chars() {
848        // ULID textual form is fixed-width 26 chars (Crockford base32).
849        // `docs/PROVENANCE_LOG.md` §3 requires this exact length.
850        let id = new_session_id();
851        assert_eq!(id.len(), 26, "session id must be 26 chars: {:?}", id);
852        // Crockford base32 uses uppercase letters and digits; specifically
853        // I, L, O, U are excluded. Every char must be ASCII alphanumeric.
854        assert!(
855            id.chars().all(|c| c.is_ascii_alphanumeric()),
856            "ulid must be ASCII alphanumeric: {:?}",
857            id
858        );
859    }
860
861    /// Review pass C2: end-to-end coverage of the user-extension
862    /// merge inside `build_http_client`. Without this test the
863    /// production path that turns a `config.toml`
864    /// `[[network.additional_hosts]]` entry into a passing
865    /// allowlist match is unexercised — every existing e2e sets
866    /// `DOIGET_*_BASE` and short-circuits into the test-mode
867    /// builder above.
868    #[test]
869    #[serial]
870    fn build_http_client_merges_user_extension_into_oa_publisher_allowlist() {
871        use std::io::Write;
872
873        // Construct a tempdir + minimal config.toml under it.
874        let td = tempfile::TempDir::new().expect("tempdir");
875        let cfg_dir = td.path().join("doiget");
876        std::fs::create_dir_all(&cfg_dir).expect("mkdir doiget/");
877        let cfg_path = cfg_dir.join("config.toml");
878        let mut f = std::fs::File::create(&cfg_path).expect("create config.toml");
879        f.write_all(
880            br#"
881[[network.additional_hosts]]
882host = "ruj.uj.edu.pl"
883note = "Jagiellonian"
884
885[[network.additional_hosts]]
886host = "*.uj.edu.pl"
887"#,
888        )
889        .expect("write config.toml");
890        drop(f);
891
892        // Save + override env so `config_dir_utf8()` lands on the
893        // tempdir. Restored on Drop by EnvGuard. We also clear the
894        // five `DOIGET_*_BASE` env vars to force the production
895        // branch of `build_http_client`.
896        struct EnvGuard {
897            key: &'static str,
898            prev: Option<String>,
899        }
900        impl EnvGuard {
901            fn save(key: &'static str) -> Self {
902                Self {
903                    key,
904                    prev: std::env::var(key).ok(),
905                }
906            }
907        }
908        impl Drop for EnvGuard {
909            fn drop(&mut self) {
910                match &self.prev {
911                    Some(v) => std::env::set_var(self.key, v),
912                    None => std::env::remove_var(self.key),
913                }
914            }
915        }
916        let _g0 = EnvGuard::save("XDG_CONFIG_HOME");
917        let _g1 = EnvGuard::save("APPDATA");
918        let _g2 = EnvGuard::save("HOME");
919        let _g3 = EnvGuard::save("USERPROFILE");
920        let _g4 = EnvGuard::save("DOIGET_ARXIV_BASE");
921        let _g5 = EnvGuard::save("DOIGET_CROSSREF_BASE");
922        let _g6 = EnvGuard::save("DOIGET_UNPAYWALL_BASE");
923        let _g7 = EnvGuard::save("DOIGET_OA_PUBLISHER_BASE");
924        let _g8 = EnvGuard::save("DOIGET_OPENALEX_BASE");
925        std::env::set_var("XDG_CONFIG_HOME", td.path());
926        std::env::set_var("APPDATA", td.path());
927        std::env::set_var("HOME", td.path());
928        std::env::set_var("USERPROFILE", td.path());
929        std::env::remove_var("DOIGET_ARXIV_BASE");
930        std::env::remove_var("DOIGET_CROSSREF_BASE");
931        std::env::remove_var("DOIGET_UNPAYWALL_BASE");
932        std::env::remove_var("DOIGET_OA_PUBLISHER_BASE");
933        std::env::remove_var("DOIGET_OPENALEX_BASE");
934
935        let client = build_http_client().expect("HttpClient builds");
936        let oa = client
937            .source_allowlist("oa-publisher")
938            .expect("oa-publisher source registered");
939
940        // Pre-existing curated allowlist still effective.
941        assert!(
942            oa.redirect_hosts.iter().any(|p| p == "*.aps.org"),
943            "curated *.aps.org MUST still be present after merge; got {:?}",
944            oa.redirect_hosts
945        );
946        // User-added literal host passes match.
947        assert!(
948            oa.matches("ruj.uj.edu.pl"),
949            "literal `ruj.uj.edu.pl` from user config MUST match"
950        );
951        // User-added wildcard passes match for a subdomain.
952        assert!(
953            oa.matches("alpha.uj.edu.pl"),
954            "wildcard `*.uj.edu.pl` from user config MUST match alpha.uj.edu.pl"
955        );
956        // Unrelated host MUST still fail.
957        assert!(
958            !oa.matches("ruj.uj.edu.ru"),
959            "host outside the suffix MUST NOT match"
960        );
961    }
962
963    // Slice 2: the `extract_crossref_fields_*` unit tests moved to
964    // `doiget_core::orchestrator::tests` along with the function they
965    // covered. The CLI no longer owns those helpers; the marker test
966    // below keeps the CLI's `fetch::tests` non-empty after the helper
967    // migration so a future regression that nukes the delegation path
968    // surfaces as a build failure (the `FetchPaperOutcome` re-import
969    // would stop resolving).
970    #[test]
971    fn fetch_paper_outcome_is_reachable_from_cli() {
972        let _ = std::any::type_name::<doiget_core::orchestrator::FetchPaperOutcome>();
973    }
974
975    /// Minimal `DenialContext` carrying only `reason`; every other field
976    /// is optional (ADR-0023 §3) so `None`/empty is a valid producer
977    /// shape for the reclassification decision under test.
978    fn denial(reason: DenialReason) -> DenialContext {
979        DenialContext {
980            reason,
981            source: None,
982            attempted: None,
983            expected: None,
984            hop_index: None,
985            cap: None,
986            actual: None,
987        }
988    }
989
990    /// Issue #145 / `docs/ERRORS.md` §6.1: a policy-class denial reason
991    /// on a `Blocked` OA-PDF leg must be reclassified from the core's
992    /// blanket `NetworkError` to `CapabilityDenied` at the CLI layer, so
993    /// the user-facing exit becomes 3 (not the generic 1) and a flaky
994    /// network is not implied for a deliberate supply-chain block.
995    #[test]
996    fn policy_denials_reclassify_network_error_to_capability_denied() {
997        for r in [
998            DenialReason::RedirectNotInAllowlist,
999            DenialReason::InsecureScheme,
1000            DenialReason::HostInBlockList,
1001        ] {
1002            let d = denial(r);
1003            assert_eq!(
1004                effective_blocked_code(ErrorCode::NetworkError, Some(&d)),
1005                ErrorCode::CapabilityDenied,
1006                "policy reason {r:?} must promote NetworkError -> CapabilityDenied"
1007            );
1008            assert_eq!(
1009                cli_exit_code(effective_blocked_code(ErrorCode::NetworkError, Some(&d))),
1010                3,
1011                "policy reason {r:?} must map to exit 3 (docs/ERRORS.md §4/§6.1)"
1012            );
1013        }
1014    }
1015
1016    /// A genuine transport fault carries NO `DenialContext`; it must stay
1017    /// `NetworkError` / exit 1 — `docs/ERRORS.md` §2 "retry usually fine"
1018    /// is the correct signal there. (This is exactly the e2e
1019    /// `..._host_off_allowlist` path: first-leg connect failure, no
1020    /// redirect hop, so no allowlist denial is produced.)
1021    #[test]
1022    fn absent_denial_context_keeps_network_error() {
1023        assert_eq!(
1024            effective_blocked_code(ErrorCode::NetworkError, None),
1025            ErrorCode::NetworkError
1026        );
1027        assert_eq!(
1028            cli_exit_code(effective_blocked_code(ErrorCode::NetworkError, None)),
1029            1
1030        );
1031    }
1032
1033    /// Non-policy denial reasons (size cap, content-type mismatch) are
1034    /// NOT supply-chain policy blocks; they keep the core's code so a
1035    /// genuine cap/transport class is not masked as a capability denial.
1036    #[test]
1037    fn non_policy_denials_keep_core_code() {
1038        for r in [
1039            DenialReason::SizeCapExceeded,
1040            DenialReason::ContentTypeMismatch,
1041        ] {
1042            let d = denial(r);
1043            assert_eq!(
1044                effective_blocked_code(ErrorCode::NetworkError, Some(&d)),
1045                ErrorCode::NetworkError,
1046                "non-policy reason {r:?} must NOT be reclassified"
1047            );
1048        }
1049    }
1050
1051    /// The closed-set wire token used in the human `error[...]:` line
1052    /// must match the serde `snake_case` form so the CLI vocabulary does
1053    /// not drift from the JSON/MCP envelope (`docs/ERRORS.md` §3.1).
1054    #[test]
1055    fn denial_reason_wire_matches_serde_snake_case() {
1056        for r in [
1057            DenialReason::RedirectNotInAllowlist,
1058            DenialReason::InsecureScheme,
1059            DenialReason::HostInBlockList,
1060        ] {
1061            let serde_form = serde_json::to_string(&r).expect("serialize DenialReason");
1062            // serde_json wraps the enum unit variant in quotes.
1063            let serde_token = serde_form.trim_matches('"');
1064            assert_eq!(
1065                denial_reason_wire(r),
1066                serde_token,
1067                "CLI wire token for {r:?} must equal the serde snake_case form"
1068            );
1069        }
1070    }
1071}