Skip to main content

doiget_cli/commands/
fetch.rs

1//! `doiget fetch <ref>` subcommand.
2//!
3//! Phase 1 scope:
4//!
5//! - **arXiv refs** — full end-to-end: PDF bytes are fetched via the
6//!   `doiget_core::sources::arxiv::ArxivSource`, the `[doiget]`
7//!   extension table is populated with the resolved license, source,
8//!   size, and `fetched_at`, and the result is written to the on-disk
9//!   store with both the metadata TOML and the PDF.
10//! - **DOI refs** — Crossref metadata + Unpaywall license enrichment + an
11//!   OA PDF fetch when Unpaywall's `best_oa_location.url_for_pdf` (or
12//!   `best_oa_location.url`) resolves to a host on the synthetic
13//!   `"oa-publisher"` allowlist (`docs/REDIRECT_ALLOWLIST.md` §3). The OA
14//!   URL host check is informed-best-effort; if the host is not on the
15//!   allowlist or the body fails the magic-byte check, the orchestrator
16//!   logs a `Fetch err` row under `source = "oa-publisher"` and falls back
17//!   to metadata-only success — the metadata is still useful.
18//!
19//! ## Provenance contract
20//!
21//! Per `docs/PROVENANCE_LOG.md` §3, every invocation emits at least one
22//! `SessionStart`, one or more `Fetch` rows (one per source consulted), one
23//! `StoreWrite` row on success, and one `SessionEnd`. Each `Fetch` row is
24//! appended by the underlying `Source` impl; the orchestrator owns the
25//! session-bookend rows and the `StoreWrite` row.
26//!
27//! ## Configuration surface
28//!
29//! Hard-coded paths with env-var overrides; full `config.toml` plumbing
30//! arrives in a follow-up. See `docs/CONFIG.md` for the eventual surface.
31//!
32//! | Env var | Default | Purpose |
33//! |---|---|---|
34//! | `DOIGET_STORE_ROOT` | `$HOME/papers` (or `%USERPROFILE%\papers` on Windows) | Filesystem store root |
35//! | `DOIGET_LOG_PATH` | `<config>/doiget/access.jsonl` | Provenance log file |
36//! | `DOIGET_CONTACT_EMAIL` | `doiget@localhost` | Polite-pool contact email (User-Agent and Crossref) |
37//! | `DOIGET_UNPAYWALL_EMAIL` | (= contact email) | Unpaywall query-string email |
38//! | `DOIGET_ARXIV_BASE` | `https://arxiv.org` | arXiv source base (test override) |
39//! | `DOIGET_CROSSREF_BASE` | `https://api.crossref.org` | Crossref source base (test override) |
40//! | `DOIGET_UNPAYWALL_BASE` | `https://api.unpaywall.org/v2` | Unpaywall source base (test override) |
41//! | `DOIGET_OA_PUBLISHER_BASE` | (production allowlist) | OA publisher host allowlist override (test override) |
42
43use std::sync::Arc;
44
45use anyhow::{anyhow, Context, Result};
46use camino::Utf8PathBuf;
47
48#[cfg(feature = "citation")]
49use doiget_core::http::tier_2_allowlist;
50use doiget_core::http::{
51    discovery_allowlist, fulltext_allowlist, oa_publisher_allowlist, tier_1_allowlist, HttpClient,
52};
53use doiget_core::orchestrator::{fetch_paper as core_fetch_paper, FetchPaperOutcome, PdfLegStatus};
54use doiget_core::provenance::{Capability, LogEvent, LogResult, ProvenanceLog, RowInput};
55use doiget_core::rate_limiter::RateLimiter;
56use doiget_core::source::{FetchContext, FetchError};
57use doiget_core::store::FsStore;
58use doiget_core::{CapabilityProfile, DenialContext, DenialReason, ErrorCode, RateLimits, Ref};
59
60/// Defer to docs/PROVENANCE_LOG.md §3: 26-char ULID per process invocation.
61pub(crate) fn new_session_id() -> String {
62    ulid::Ulid::new().to_string()
63}
64
65// ---------------------------------------------------------------------------
66// Dry-run plan / preview (ADR-0022)
67// ---------------------------------------------------------------------------
68
69// The structured `FetchPlan` shape, the `build_fetch_plan` builder, and
70// the `build_dry_run_envelope` JSON-shape helper live in `doiget-core`
71// so the MCP server can produce a bit-identical envelope without
72// depending on `doiget-cli`. The CLI re-exports them here for callers
73// that already `use doiget_cli::commands::fetch`.
74pub use doiget_core::dry_run::{
75    build_dry_run_envelope, build_fetch_plan, FetchPlan, PdfSourcePlan, RateLimitBudget,
76};
77
78/// Serialize the dry-run envelope and write it to stdout. Used by the
79/// `--dry-run` flag on `doiget fetch` and `doiget batch`. The envelope
80/// shape matches ADR-0022 §1 / `docs/MCP_TOOLS.md` §10.
81///
82/// `pub` so `commands::batch` (multi-ref dry-run) can reuse it. The
83/// function lives in `doiget-cli` (not `doiget-core`) because `println!`
84/// is a CLI concern; the MCP server uses [`build_dry_run_envelope`]
85/// directly and routes the bytes via JSON-RPC.
86///
87/// `print_stdout` is workspace-deny for MCP stdio safety (ADR-0001 /
88/// `docs/SECURITY.md` §3); `--dry-run` is a CLI-only path that never
89/// runs under the MCP server, so the localized `#[allow]` is the
90/// minimal intervention — same pattern used by `commands::config`,
91/// `commands::info`, etc.
92#[allow(clippy::print_stdout)]
93pub fn emit_dry_run_plan_to_stdout(ref_: &Ref, plan: &FetchPlan) -> Result<()> {
94    let envelope = build_dry_run_envelope(ref_, plan);
95    let s = serde_json::to_string(&envelope).context("serializing dry-run envelope to JSON")?;
96    println!("{s}");
97    Ok(())
98}
99
100/// Resolve the provenance log path. `DOIGET_LOG_PATH` wins; otherwise
101/// fall back to `<config>/doiget/access.jsonl` per `docs/PROVENANCE_LOG.md`
102/// §1.
103pub(crate) fn resolve_log_path() -> Result<Utf8PathBuf> {
104    if let Some(s) = read_env_utf8("DOIGET_LOG_PATH")? {
105        return Ok(Utf8PathBuf::from(s));
106    }
107    let cfg = config_dir_utf8()?;
108    Ok(cfg.join("doiget").join("access.jsonl"))
109}
110
111/// Read an env var and assert it is valid UTF-8. Returns `Ok(None)` if
112/// unset; `Ok(Some(s))` if set and UTF-8; `Err(...)` if set but non-UTF-8.
113/// `std::env::var` already requires UTF-8 (returns `VarError::NotUnicode`
114/// otherwise); we wrap it to surface a friendlier error and avoid the
115/// banned `std::path::PathBuf` round-trip.
116fn read_env_utf8(key: &str) -> Result<Option<String>> {
117    match std::env::var(key) {
118        Ok(s) => Ok(Some(s)),
119        Err(std::env::VarError::NotPresent) => Ok(None),
120        Err(std::env::VarError::NotUnicode(_)) => Err(anyhow!("{key} is not valid UTF-8")),
121    }
122}
123
124/// Best-effort home-dir resolution without depending on the `dirs` crate
125/// (every new dep adds cargo-vet exemption churn). Honors `HOME` first
126/// (POSIX + most CI), then `USERPROFILE` (Windows).
127fn home_dir_utf8() -> Result<Utf8PathBuf> {
128    if let Some(s) = read_env_utf8("HOME")? {
129        return Ok(Utf8PathBuf::from(s));
130    }
131    if let Some(s) = read_env_utf8("USERPROFILE")? {
132        return Ok(Utf8PathBuf::from(s));
133    }
134    Err(anyhow!("neither HOME nor USERPROFILE is set"))
135}
136
137/// Best-effort config-dir resolution. Honors `XDG_CONFIG_HOME` first
138/// (POSIX), then `APPDATA` (Windows), then falls back to `$HOME/.config`.
139///
140/// Crate-visible so sibling modules (`commands::capabilities`,
141/// `commands::config`) can resolve the same `<config_dir>/doiget/`
142/// path the production HTTP-client builder reads from. Keep the
143/// signature stable: any divergence between this and the MCP-side
144/// copy (`crates/doiget-mcp/src/lib.rs::config_dir_utf8`) would
145/// silently desync the user-extension allowlist surfaces.
146pub(crate) fn config_dir_utf8() -> Result<Utf8PathBuf> {
147    if let Some(s) = read_env_utf8("XDG_CONFIG_HOME")? {
148        return Ok(Utf8PathBuf::from(s));
149    }
150    if let Some(s) = read_env_utf8("APPDATA")? {
151        return Ok(Utf8PathBuf::from(s));
152    }
153    let home = home_dir_utf8()?;
154    Ok(home.join(".config"))
155}
156
157/// Best-effort resolver-cache root (`docs/CACHE.md`). Honors
158/// `DOIGET_CACHE_ROOT` first, then `XDG_CACHE_HOME/doiget` (POSIX), then
159/// `LOCALAPPDATA\doiget\cache` (Windows), then `$HOME/.cache/doiget`.
160/// Crate-visible so the `verify` command can enable the resolve cache.
161pub(crate) fn cache_dir_utf8() -> Result<Utf8PathBuf> {
162    if let Some(s) = read_env_utf8("DOIGET_CACHE_ROOT")? {
163        return Ok(Utf8PathBuf::from(s));
164    }
165    if let Some(s) = read_env_utf8("XDG_CACHE_HOME")? {
166        return Ok(Utf8PathBuf::from(s).join("doiget"));
167    }
168    if let Some(s) = read_env_utf8("LOCALAPPDATA")? {
169        return Ok(Utf8PathBuf::from(s).join("doiget").join("cache"));
170    }
171    let home = home_dir_utf8()?;
172    Ok(home.join(".cache").join("doiget"))
173}
174
175/// Build a metadata-resolution [`FetchContext`]: HTTP client, rate
176/// limiter, and provenance log resolved from the environment, with the
177/// resolver cache (`docs/CACHE.md`) enabled best-effort.
178///
179/// This is the shared context for the read-only resolve commands
180/// (`verify`, `cite`) — neither persists to the store, so no store
181/// handle is constructed. Enabling `cache_root` means repeat resolves of
182/// the same ref are served from disk, avoiding upstream rate limits; if
183/// the cache dir can't be resolved the run simply proceeds without it.
184pub(crate) fn build_resolve_context() -> Result<FetchContext> {
185    let session_id = new_session_id();
186    let log_path = resolve_log_path()?;
187    let http = Arc::new(build_http_client()?);
188    let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
189    let log = Arc::new(
190        ProvenanceLog::open(log_path, session_id.clone())
191            .context("failed to open provenance log")?,
192    );
193    let cache_root = cache_dir_utf8().ok();
194    Ok(FetchContext {
195        http,
196        rate_limiter,
197        log,
198        session_id,
199        cache_root,
200    })
201}
202
203/// Construct the workspace-wide [`HttpClient`].
204///
205/// Production path: `HttpClient::new(tier_1_allowlist() ∪ oa_publisher_allowlist())` —
206/// strict HTTPS-only with the canonical Tier-1 redirect allowlist (Crossref,
207/// Unpaywall, arXiv) plus the synthetic `"oa-publisher"` allowlist used for
208/// the OA PDF leg of the DOI fetch path (`fetch_doi` issues
209/// `HttpClient::fetch_pdf("oa-publisher", url)` against the URL Unpaywall
210/// returned in `best_oa_location`). The OA-publisher list is
211/// informed-best-effort per `docs/REDIRECT_ALLOWLIST.md` §3.
212///
213/// Test path: when any of the three `DOIGET_*_BASE` env vars is set, build a
214/// multi-source relaxed-`https_only` client whose per-source allowlist is
215/// derived from the corresponding env-var hosts. The `oa-publisher` source
216/// key is registered against the same host (typically the wiremock origin)
217/// when `DOIGET_OA_PUBLISHER_BASE` is set — this lets the integration tests
218/// under `tests/fetch_doi_oa_pdf_e2e.rs` exercise the full PDF leg without
219/// touching the real network.
220pub(crate) fn build_http_client() -> Result<HttpClient> {
221    let arxiv = std::env::var("DOIGET_ARXIV_BASE").ok();
222    let crossref = std::env::var("DOIGET_CROSSREF_BASE").ok();
223    let unpaywall = std::env::var("DOIGET_UNPAYWALL_BASE").ok();
224    let oa_publisher = std::env::var("DOIGET_OA_PUBLISHER_BASE").ok();
225    // Slice 16: `DOIGET_OPENALEX_BASE` selects a wiremock host for the
226    // citation-graph BFS. Only meaningful with `--features citation`,
227    // but reading the env unconditionally keeps the branch logic
228    // simple and is harmless for default builds.
229    let openalex_base = std::env::var("DOIGET_OPENALEX_BASE").ok();
230    // ADR-0032: `DOIGET_AR5IV_BASE` selects a wiremock host for the
231    // full-text extraction path (`doiget text`). Test-only override,
232    // mirroring `DOIGET_ARXIV_BASE`.
233    let ar5iv_base = std::env::var("DOIGET_AR5IV_BASE").ok();
234
235    if arxiv.is_none()
236        && crossref.is_none()
237        && unpaywall.is_none()
238        && oa_publisher.is_none()
239        && openalex_base.is_none()
240        && ar5iv_base.is_none()
241    {
242        let mut allowlists = tier_1_allowlist();
243        allowlists.extend(oa_publisher_allowlist());
244        // ADR-0031: discovery search (`doiget search`) is Tier-1 OA
245        // metadata, always-on, and ships in the default `oa-only` binary.
246        // Register `api.openalex.org` under the `"openalex"` source key
247        // UNCONDITIONALLY so `discovery::paper_search` can reach the
248        // `/works?search=` endpoint without `--features citation`. In
249        // citation builds the Tier-2 extend below re-registers the same
250        // host under the same key (idempotent HashMap overwrite).
251        allowlists.extend(discovery_allowlist());
252        // ADR-0032: full-text extraction (`doiget text`) is Tier-1 OA
253        // metadata, always-on. Register `ar5iv.labs.arxiv.org` under the
254        // `"ar5iv"` source key unconditionally so `paper_text::paper_text`
255        // can reach ar5iv in `oa-only` builds.
256        allowlists.extend(fulltext_allowlist());
257        // Slice 16: when the `citation` feature is compiled in, the
258        // graph subcommand walks OpenAlex Work IDs via
259        // `ctx.http.fetch_bytes("openalex", ...)`. The Tier 2
260        // allowlist registers the `api.openalex.org` host under
261        // that source key. CapabilityProfile.metadata.openalex is
262        // the runtime gate; the allowlist is the transport gate.
263        #[cfg(feature = "citation")]
264        allowlists.extend(tier_2_allowlist());
265
266        // ADR-0028 D2: merge user-extension hosts from
267        // `<config_dir>/doiget/config.toml`. See
268        // `doiget_core::user_extension` for the wire contract and
269        // the (deferred) S3b provenance / doctor / capabilities
270        // surfaces.
271        //
272        // Failure handling is opt-in-convenience: a missing config
273        // is silent (Ok-empty), a malformed config emits
274        // `tracing::warn!` and continues with the curated allowlist,
275        // and an unresolvable config dir emits `tracing::debug!`
276        // (only happens in stripped envs with no HOME / XDG /
277        // APPDATA — review pass I3 / A1).
278        match config_dir_utf8() {
279            Ok(cfg_dir) => {
280                let path = cfg_dir.join("doiget").join("config.toml");
281                match doiget_core::user_extension::load(&path) {
282                    Ok(user_hosts) if !user_hosts.is_empty() => {
283                        tracing::info!(
284                            count = user_hosts.len(),
285                            path = %path,
286                            "merging user-extension allowlist hosts (ADR-0028 D2)"
287                        );
288                        doiget_core::user_extension::merge_into_allowlists(
289                            &mut allowlists,
290                            &user_hosts,
291                        );
292                    }
293                    Ok(_) => {}
294                    Err(e) => {
295                        tracing::warn!(
296                            error = %e,
297                            path = %path,
298                            "failed to load user-extension allowlist; \
299                             falling back to curated set only"
300                        );
301                    }
302                }
303            }
304            Err(e) => {
305                tracing::debug!(
306                    error = %e,
307                    "config dir unresolvable; \
308                     user-extension allowlist disabled (curated set only)"
309                );
310            }
311        }
312
313        return HttpClient::new(allowlists).context("building HTTP client");
314    }
315
316    // Test-base mode: build a relaxed client per overridden source.
317    let mut owned: Vec<(String, String)> = Vec::new();
318    for (source, base) in [
319        ("arxiv", arxiv.as_deref()),
320        ("crossref", crossref.as_deref()),
321        ("unpaywall", unpaywall.as_deref()),
322        ("oa-publisher", oa_publisher.as_deref()),
323        ("openalex", openalex_base.as_deref()),
324        ("ar5iv", ar5iv_base.as_deref()),
325    ] {
326        if let Some(b) = base {
327            let url = url::Url::parse(b)
328                .with_context(|| format!("DOIGET_*_BASE for {source} is not a URL: {b}"))?;
329            let host = url
330                .host_str()
331                .ok_or_else(|| anyhow!("base URL has no host: {b}"))?;
332            owned.push((source.to_string(), host.to_string()));
333        }
334    }
335    let entries: Vec<(&str, &str)> = owned
336        .iter()
337        .map(|(s, h)| (s.as_str(), h.as_str()))
338        .collect();
339    Ok(HttpClient::new_for_tests_allow_http_multi(&entries))
340}
341
342// Slice 2: the per-source env-aware constructors that used to live here
343// (`build_arxiv_source`, `build_crossref_source`, `build_unpaywall_source`)
344// moved into `doiget-core::orchestrator` so the core `fetch_paper`
345// orchestrator and the MCP server both honor the same `DOIGET_*_BASE`
346// test-override surface. The CLI no longer constructs sources directly —
347// it builds the `FetchContext` + `FsStore` and hands them to the core
348// orchestrator.
349
350/// Resolved configuration derived from the environment.
351///
352/// Slice 2: `contact_email` / `unpaywall_email` are now read by the
353/// `doiget-core::orchestrator::fetch_paper` orchestrator directly from
354/// the env (`contact_email_from_env` / `unpaywall_email_from_env` in
355/// that module), so the CLI no longer threads them through. The fields
356/// stay here so a future slice that adds CLI-flag overrides has a
357/// natural attachment point — the `#[allow(dead_code)]` is the minimal
358/// intervention until that slice lands.
359#[allow(dead_code)]
360pub(crate) struct OrchestratorConfig {
361    pub(crate) store_root: Utf8PathBuf,
362    pub(crate) log_path: Utf8PathBuf,
363    pub(crate) contact_email: String,
364    pub(crate) unpaywall_email: String,
365}
366
367impl OrchestratorConfig {
368    fn from_env() -> Result<Self> {
369        let store_root = super::resolve_store_root()?;
370        let log_path = resolve_log_path()?;
371        let contact_email =
372            std::env::var("DOIGET_CONTACT_EMAIL").unwrap_or_else(|_| "doiget@localhost".into());
373        let unpaywall_email =
374            std::env::var("DOIGET_UNPAYWALL_EMAIL").unwrap_or_else(|_| contact_email.clone());
375        Ok(Self {
376            store_root,
377            log_path,
378            contact_email,
379            unpaywall_email,
380        })
381    }
382}
383
384/// Reusable fetch harness shared by `doiget fetch <ref>` (single ref) and
385/// `doiget batch <path>` (many refs). Owns the shared foundation modules
386/// (`HttpClient` / `RateLimiter` / `ProvenanceLog`), the on-disk store, and
387/// the resolved capability profile, plus the session bookkeeping required by
388/// `docs/PROVENANCE_LOG.md` §3 (the 26-char ULID `session_id`).
389///
390/// Construction is performed once via [`FetchHarness::from_env`]. Per-ref
391/// orchestration runs through [`FetchHarness::fetch_one`]; bookend rows go
392/// via [`FetchHarness::log_session_start`] / [`FetchHarness::log_session_end`]
393/// so the orchestrator can frame either one fetch or many.
394pub(crate) struct FetchHarness {
395    pub(crate) http: Arc<HttpClient>,
396    pub(crate) rate_limiter: Arc<RateLimiter>,
397    pub(crate) log: Arc<ProvenanceLog>,
398    pub(crate) store: FsStore,
399    pub(crate) profile: CapabilityProfile,
400    pub(crate) session_id: String,
401    /// Resolved config; Slice 2 keeps this on the harness for the
402    /// CLI-only env diagnostics path (`commands::config::doctor`), even
403    /// though `fetch_one` no longer needs it (the core orchestrator
404    /// re-reads contact email from env directly).
405    #[allow(dead_code)]
406    pub(crate) cfg: OrchestratorConfig,
407}
408
409impl FetchHarness {
410    /// Build a harness from the same env-var surface documented at the top
411    /// of this module. Creates the log parent directory if missing, opens
412    /// the provenance log (allocating a fresh `session_id`), and constructs
413    /// the HTTP client honoring `DOIGET_*_BASE` overrides for tests.
414    pub(crate) fn from_env() -> Result<Self> {
415        let cfg = OrchestratorConfig::from_env()?;
416        if let Some(parent) = cfg.log_path.parent() {
417            if !parent.as_str().is_empty() {
418                std::fs::create_dir_all(parent.as_std_path())
419                    .with_context(|| format!("creating log dir {parent}"))?;
420            }
421        }
422        let session_id = new_session_id();
423        let log = Arc::new(
424            ProvenanceLog::open(cfg.log_path.clone(), session_id.clone())
425                .context("opening provenance log")?,
426        );
427        let http = Arc::new(build_http_client()?);
428        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
429        let store = FsStore::new(cfg.store_root.clone()).context("opening store")?;
430        let profile = CapabilityProfile::from_env().context("resolving capability profile")?;
431
432        Ok(Self {
433            http,
434            rate_limiter,
435            log,
436            store,
437            profile,
438            session_id,
439            cfg,
440        })
441    }
442
443    /// Build a [`FetchContext`] view over this harness's foundation modules.
444    /// Creating one is cheap (cloning three `Arc`s + a `String`); per-ref
445    /// orchestration constructs one on demand.
446    pub(crate) fn fetch_context(&self) -> FetchContext {
447        FetchContext {
448            http: self.http.clone(),
449            rate_limiter: self.rate_limiter.clone(),
450            log: self.log.clone(),
451            session_id: self.session_id.clone(),
452            cache_root: None,
453        }
454    }
455
456    /// Append a `SessionStart` row. `ref_input` is the raw user-supplied ref
457    /// string (single-fetch path); pass `None` for batch sessions where no
458    /// single ref attributes the session.
459    pub(crate) fn log_session_start(&self, ref_input: Option<&str>) -> Result<()> {
460        self.log
461            .append(RowInput {
462                event: LogEvent::SessionStart,
463                result: LogResult::Ok,
464                capability: Capability::Oa,
465                ref_: ref_input,
466                source: None,
467                error_code: None,
468                size_bytes: None,
469                license: None,
470                store_path: None,
471                // Session bookend — no audit identity (ADR-0021 §1).
472                canonical_digest: None,
473            })
474            .context("appending SessionStart row")?;
475        Ok(())
476    }
477
478    /// Append a `SessionEnd` row. `ref_input` mirrors the `log_session_start`
479    /// argument; pass `None` for batch sessions. The result is best-effort —
480    /// if this append fails, the caller already has the underlying fetch
481    /// error (if any) and we don't override it.
482    pub(crate) fn log_session_end(&self, ok: bool, ref_input: Option<&str>) {
483        let result = if ok { LogResult::Ok } else { LogResult::Err };
484        let _ = self.log.append(RowInput {
485            event: LogEvent::SessionEnd,
486            result,
487            capability: Capability::Oa,
488            ref_: ref_input,
489            source: None,
490            error_code: None,
491            size_bytes: None,
492            license: None,
493            store_path: None,
494            // Session bookend — no audit identity (ADR-0021 §1).
495            canonical_digest: None,
496        });
497    }
498
499    /// Run a single ref through the per-kind orchestration (arxiv → PDF +
500    /// metadata; doi → metadata-only via Crossref + Unpaywall, with an
501    /// informed-best-effort OA PDF leg). Errors here are scoped to this
502    /// one ref — the caller decides whether to abort the surrounding
503    /// session.
504    ///
505    /// Slice 2: delegates to
506    /// [`doiget_core::orchestrator::fetch_paper`] for the actual work
507    /// (which both CLI and MCP now share). This function keeps the
508    /// CLI-only stderr success-line print.
509    pub(crate) async fn fetch_one(&self, ref_: &Ref) -> Result<FetchPaperOutcome, FetchError> {
510        // Pure data path: return the typed outcome (or typed error)
511        // without any CLI-only rendering or exit-code synthesis. The
512        // single-fetch caller (`run_with_options`) and the batch
513        // caller (`commands::batch::classify_joined`) each render the
514        // human / JSON surface and map to `CliExit` themselves — see
515        // #210 for the rationale (batch's `--json` JSONL needs the
516        // structured `FetchPaperOutcome` to emit `result.{safekey,
517        // store_path, canonical_digest}` on success and
518        // `denial_context` on a `PdfLegStatus::Blocked` outcome, which
519        // was unreachable through the previous `Result<()>`
520        // signature).
521        let ctx = self.fetch_context();
522        core_fetch_paper(ref_, &self.profile, &ctx, &self.store, self.store.root()).await
523    }
524}
525
526/// `true` iff the outcome represents a clean fetch: `Fetched` (full
527/// PDF) or `NoOaUrl` (metadata-only by design). A `Blocked` PDF leg
528/// is a failure for SessionEnd / exit-code purposes — an OA PDF was
529/// discovered but could not be retrieved — even though the metadata
530/// TOML did land on disk. Pulled out so both `run_with_options` and
531/// `commands::batch` agree on the failure boundary.
532pub(crate) fn outcome_is_clean_success(outcome: &FetchPaperOutcome) -> bool {
533    !matches!(outcome.pdf_leg, PdfLegStatus::Blocked { .. })
534}
535
536/// CLI-only one-line success message on stderr (ADR-0001 stdio
537/// convention). Renders the [`FetchPaperOutcome`] in the same form the
538/// pre-Slice-2 CLI emitted: a full-PDF success names the PDF path; a
539/// metadata-only DOI fallback (size_bytes == 0) names the metadata TOML
540/// path the orchestrator wrote.
541fn emit_success_line(ref_: &Ref, outcome: &FetchPaperOutcome) {
542    let label = match ref_ {
543        Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
544        Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
545    };
546    match &outcome.pdf_leg {
547        PdfLegStatus::Fetched => {
548            print_success(format_args!(
549                "fetched {} ({} bytes) -> {}",
550                label, outcome.size_bytes, outcome.path
551            ));
552        }
553        PdfLegStatus::NoOaUrl => {
554            print_success(format_args!(
555                "fetched {} (metadata-only: no OA PDF available) -> {}",
556                label, outcome.path
557            ));
558        }
559        // Issue #145: `Blocked` is NO LONGER a success outcome. It is
560        // intercepted in `fetch_one` BEFORE `emit_success_line` is
561        // called and rendered via `render_blocked_error` with a
562        // non-zero exit (`docs/ERRORS.md` §3/§6 — no silent failures).
563        // Reaching this arm would mean the interception regressed, so we
564        // fail closed: surface the `error[CODE]:` line here too rather
565        // than printing a misleading success line.
566        PdfLegStatus::Blocked {
567            code,
568            message,
569            denial,
570            suggested_arxiv_id,
571        } => {
572            // Same #145 reclassification as the primary interception in
573            // `fetch_one`, so this fail-closed fallback stays consistent.
574            let effective = effective_blocked_code(*code, denial.as_ref());
575            render_blocked_error(
576                ref_,
577                outcome,
578                effective,
579                message,
580                denial.as_ref(),
581                suggested_arxiv_id.as_deref(),
582            );
583        }
584        // `PdfLegStatus` is `#[non_exhaustive]`; a future variant
585        // degrades to the size-based wording rather than failing the
586        // downstream-crate build.
587        _ => {
588            if outcome.size_bytes == 0 {
589                print_success(format_args!(
590                    "fetched {} (metadata-only) -> {}",
591                    label, outcome.path
592                ));
593            } else {
594                print_success(format_args!(
595                    "fetched {} ({} bytes) -> {}",
596                    label, outcome.size_bytes, outcome.path
597                ));
598            }
599        }
600    }
601}
602
603/// Run the `doiget fetch <ref>` subcommand.
604///
605/// `dry_run` (ADR-0022 §1): when `true`, build a [`FetchPlan`] from the
606/// parsed [`Ref`] and the configured store root, serialize it as JSON to
607/// stdout, and return `Ok(())` immediately, **without** building a
608/// `FetchHarness` (no provenance log open), without contacting the
609/// network, without writing to the store, and without appending a
610/// provenance row.
611///
612/// When `dry_run` is `false`, the function runs the normal end-to-end
613/// orchestration path: open the provenance log, dispatch the per-kind
614/// orchestrator, emit a `SessionStart` / `SessionEnd` bookend pair.
615///
616/// On success returns `Ok(())` and writes a one-line success message to
617/// stderr (per ADR-0001 stdio convention — no stdout writes from `fetch`
618/// on the normal path). On failure, returns an `anyhow::Error` and emits
619/// a `SessionEnd` row with `result=err` to the provenance log before
620/// returning.
621///
622/// # History
623///
624/// Slice 5 (PR #84 advisory item A2/A3 refactor): the previous
625/// `FetchOptions { dry_run: bool }` single-field option bundle plus the
626/// thin `run(input)` backwards-compat wrapper were collapsed into this
627/// single `dry_run: bool` parameter — the option bundle's single-bool
628/// shape was YAGNI, and the wrapper only existed to spare integration
629/// tests a `FetchOptions::default()` literal.
630pub async fn run_with_options(
631    input: String,
632    dry_run: bool,
633    _mode: super::output::OutputMode,
634) -> Result<()> {
635    // `_mode` is threaded per ADR-0017 / #144. Quiet-suppression of the
636    // success line is tracked in #203. The dry-run plan envelope is
637    // product output (the requested artifact) and is unaffected by
638    // mode.
639    // Step 1: parse + safekey. Issue #119: render the cargo-style
640    // `error[INVALID_REF]:` line + carry the exit code, rather than
641    // letting the granular `RefParseError` fall out as an opaque
642    // anyhow `{:?}` dump.
643    let ref_ = match Ref::parse(&input) {
644        Ok(r) => r,
645        Err(e) => {
646            print_err(format_args!(
647                "error[{}]: invalid ref: {e}",
648                ErrorCode::InvalidRef.as_wire()
649            ));
650            return Err(anyhow::Error::new(CliExit(cli_exit_code(
651                ErrorCode::InvalidRef,
652            ))));
653        }
654    };
655
656    // Dry-run branch: build the plan and emit it. NO harness, NO network,
657    // NO store write, NO provenance row. Posture-lint ADR-0022 §5 will
658    // verify this branch never reaches `HttpClient::fetch_*`,
659    // `FsStore::write_*`, or `ProvenanceLog::append`.
660    if dry_run {
661        // Resolve store root for path projections. Failures here surface
662        // as a normal CLI error (not as a denial) — same behaviour the
663        // non-dry-run path would exhibit on a misconfigured environment.
664        let store_root = super::resolve_store_root()?;
665        let plan = build_fetch_plan(&ref_, &store_root);
666        emit_dry_run_plan_to_stdout(&ref_, &plan)?;
667        return Ok(());
668    }
669
670    // Step 2: build harness (foundation modules + provenance log).
671    let harness = FetchHarness::from_env()?;
672
673    // Step 3: emit SessionStart. Fail-closed if the log write fails — the
674    // surrounding fetch MUST NOT proceed (`docs/PROVENANCE_LOG.md` §5).
675    harness.log_session_start(Some(ref_.as_input_str()))?;
676
677    // Step 4: dispatch on ref kind. `fetch_one` now returns the
678    // typed `FetchPaperOutcome` / `FetchError` per #210; the
679    // single-fetch caller (this fn) owns rendering + exit code.
680    let result = harness.fetch_one(&ref_).await;
681
682    // Step 5: emit SessionEnd regardless of outcome. A `Blocked` PDF
683    // leg is NOT a clean success even though the typed `Result` is
684    // `Ok` — `outcome_is_clean_success` collapses both halves so the
685    // SessionEnd `is_ok` field matches the user-facing exit code.
686    let session_ok = match &result {
687        Ok(o) => outcome_is_clean_success(o),
688        Err(_) => false,
689    };
690    harness.log_session_end(session_ok, Some(ref_.as_input_str()));
691
692    // Step 6: render the user-facing surface and map to `CliExit`.
693    // The Blocked-PDF reclassification logic that used to live inside
694    // `fetch_one` was lifted here verbatim so the batch caller can
695    // share the same `effective_blocked_code` / `render_blocked_error`
696    // helpers (issue #210 / #145).
697    match result {
698        Ok(outcome) => {
699            if let PdfLegStatus::Blocked {
700                code,
701                message,
702                denial,
703                suggested_arxiv_id,
704            } = &outcome.pdf_leg
705            {
706                let effective = effective_blocked_code(*code, denial.as_ref());
707                render_blocked_error(
708                    &ref_,
709                    &outcome,
710                    effective,
711                    message,
712                    denial.as_ref(),
713                    suggested_arxiv_id.as_deref(),
714                );
715                return Err(anyhow::Error::new(CliExit(cli_exit_code(effective))));
716            }
717            emit_success_line(&ref_, &outcome);
718            Ok(())
719        }
720        Err(e) => {
721            render_fetch_error(&e);
722            let code: ErrorCode = (&e).into();
723            Err(anyhow::Error::new(CliExit(cli_exit_code(code))))
724        }
725    }
726}
727
/// Writes one user-visible success line to stderr.
///
/// ADR-0001 (stdio convention) keeps success lines off stdout, so this is
/// the single spot where `eprintln!` is used on purpose. The workspace
/// `clippy::print_stderr` lint is at `warn`; the `#[allow]` is scoped to
/// this function as the minimal intervention.
#[allow(clippy::print_stderr)]
fn print_success(line: std::fmt::Arguments<'_>) {
    eprintln!("{line}");
}
737
/// Writes one `docs/ERRORS.md` §3 human-error line to stderr.
///
/// Counterpart of [`print_success`]. The `#[allow]` is scoped to this
/// function as the minimal intervention for the workspace
/// `clippy::print_stderr` lint.
#[allow(clippy::print_stderr)]
fn print_err(line: std::fmt::Arguments<'_>) {
    eprintln!("{line}");
}
745
/// A `docs/ERRORS.md` §4 process exit code travelling from a CLI command
/// up to `main`.
///
/// `main` owns the real `std::process::exit`; calling it inside
/// `run_with_options` would kill in-process integration tests. By the
/// time a `CliExit` is built, the human-readable `error[CODE]: …` line
/// has ALREADY been written to stderr by `render_fetch_error`, so `main`
/// must NOT print it again. Issue #119.
#[derive(Debug)]
pub struct CliExit(pub i32);

impl std::fmt::Display for CliExit {
    /// Formats as `exiting with status <code>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let status = self.0;
        f.write_fmt(format_args!("exiting with status {}", status))
    }
}

// Marker impl only: lets `CliExit` be wrapped by `anyhow::Error::new`.
impl std::error::Error for CliExit {}
762
763/// Reclassify a `PdfLegStatus::Blocked` code at the CLI layer (issue
764/// #145 / `docs/ERRORS.md` §2 "NETWORK_ERROR" vs §3.1 / §6).
765///
766/// The core maps *every* `FetchError::Http(_)` to
767/// [`ErrorCode::NetworkError`] (`doiget_core::source`'s
768/// `From<&FetchError> for ErrorCode`). `docs/ERRORS.md` §2 defines
769/// `NETWORK_ERROR` as a transport / DNS / TLS fault where "retry usually
770/// fine" — true for a real network blip, but **false** for a deliberate
771/// supply-chain policy block (off-allowlist redirect, insecure-scheme
772/// redirect, host-blocklist hit): retrying such a block never helps, so
773/// surfacing it as `NETWORK_ERROR` (generic exit 1) misrepresents a flaky
774/// network to humans and agents.
775///
776/// The orchestrator already preserves the true reason on the
777/// [`DenialContext`] side-channel (the `From<&HttpError> for
778/// Option<DenialContext>` impl walks reqwest's `source()` chain, so even
779/// a redirect denial wrapped as `HttpError::Network` still yields
780/// [`DenialReason::RedirectNotInAllowlist`]). When that reason is one of
781/// the closed-set *policy* denials, promote the surface code to
782/// [`ErrorCode::CapabilityDenied`] so the CLI renders
783/// `error[CAPABILITY_DENIED]:` and [`cli_exit_code`] returns exit 3 —
784/// the same code `fetch` / `graph` already use for capability denials.
785/// Non-policy blocks (no `denial`, or a non-policy reason such as
786/// `SizeCapExceeded` / `ContentTypeMismatch`) keep the core's code so a
787/// genuine transport failure still reads as `NETWORK_ERROR`.
788pub(crate) fn effective_blocked_code(code: ErrorCode, denial: Option<&DenialContext>) -> ErrorCode {
789    match denial.map(|d| d.reason) {
790        Some(
791            DenialReason::RedirectNotInAllowlist
792            | DenialReason::InsecureScheme
793            | DenialReason::HostInBlockList,
794        ) => ErrorCode::CapabilityDenied,
795        _ => code,
796    }
797}
798
799/// Snake-case wire token for a [`DenialReason`], matching the
800/// `#[serde(rename_all = "snake_case")]` JSON/MCP surface (ADR-0023 §2)
801/// so the CLI human line uses the SAME vocabulary as the machine
802/// envelope (`docs/ERRORS.md` §3.1). Only the policy-denial reasons the
803/// CLI inlines are enumerated; everything else degrades to a generic
804/// token rather than drifting from the serde form.
805fn denial_reason_wire(reason: DenialReason) -> &'static str {
806    match reason {
807        DenialReason::RedirectNotInAllowlist => "redirect_not_in_allowlist",
808        DenialReason::InsecureScheme => "insecure_scheme",
809        DenialReason::HostInBlockList => "host_in_block_list",
810        _ => "policy_denied",
811    }
812}
813
814/// `docs/ERRORS.md` §4 closed-code → process exit code. Anything not
815/// individually listed falls under "at least one fetch failed" (1).
816///
817/// `pub(crate)` so sibling subcommands (`commands::graph`, …) route
818/// their typed denials through the SAME centralized mapping instead of
819/// open-coding magic exit numbers — keeps the `ErrorCode`→exit contract
820/// single-sourced (issue #149).
821pub(crate) fn cli_exit_code(code: ErrorCode) -> i32 {
822    match code {
823        ErrorCode::CapabilityDenied => 3,
824        ErrorCode::StoreError | ErrorCode::LogError => 4,
825        ErrorCode::FetchTimeout => 124,
826        // A name filter that matched several entities is user-fixable by
827        // narrowing the query → `docs/ERRORS.md` §4 exit 2 ("misuse").
828        ErrorCode::Ambiguous => 2,
829        _ => 1,
830    }
831}
832
833/// Render a terminal [`FetchError`] in the `docs/ERRORS.md` §3
834/// "Researcher (CLI human)" form: `error[CODE]: message` on stderr,
835/// plus an actionable `= note:` line carrying the ADR-0023
836/// `denial_context` (attempted / expected hosts) when the failure was
837/// a denial class. stdout stays clean (ADR-0001).
838///
839/// `pub(crate)` so sibling resolve commands (`commands::link`, …) render
840/// typed failures — including the actionable denial note — through the
841/// SAME path instead of open-coding `error[CODE]: msg` and dropping the
842/// `denial_context` note (review #287).
843pub(crate) fn render_fetch_error(e: &FetchError) {
844    let code: ErrorCode = e.into();
845    print_err(format_args!("error[{}]: {}", code.as_wire(), e));
846    if let Some(dc) = Option::<DenialContext>::from(e) {
847        let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
848        match &dc.expected {
849            Some(exp) if !exp.is_empty() => {
850                print_err(format_args!(
851                    "  = note: attempted {attempted}; allowed: {}",
852                    exp.join(", ")
853                ));
854            }
855            _ => {
856                print_err(format_args!("  = note: attempted {attempted}"));
857            }
858        }
859    }
860}
861
862/// Render a `PdfLegStatus::Blocked` outcome in the `docs/ERRORS.md` §3
863/// "Researcher (CLI human)" form. Issue #145: an OA PDF was discovered
864/// but could not be retrieved — the metadata WAS written, but this is a
865/// denial, not a clean success. We emit the same `error[CODE]:` stderr
866/// shape as [`render_fetch_error`] (so pipelines and humans see an
867/// unambiguous failure), name the metadata path that DID land so the
868/// partial result is still discoverable, and surface the ADR-0023
869/// `denial_context` note when present. stdout stays clean (ADR-0001).
870fn render_blocked_error(
871    ref_: &Ref,
872    outcome: &FetchPaperOutcome,
873    code: ErrorCode,
874    message: &str,
875    denial: Option<&DenialContext>,
876    suggested_arxiv_id: Option<&str>,
877) {
878    let label = match ref_ {
879        Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
880        Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
881    };
882    // Issue #145: when the block is a deliberate policy denial, name the
883    // closed-set reason inline so a human/agent reading the
884    // `error[CAPABILITY_DENIED]:` line immediately sees this is a
885    // supply-chain policy block (retrying is futile), not a flaky network.
886    match denial.map(|d| d.reason) {
887        Some(
888            reason @ (DenialReason::RedirectNotInAllowlist
889            | DenialReason::InsecureScheme
890            | DenialReason::HostInBlockList),
891        ) => {
892            print_err(format_args!(
893                "error[{}]: {label}: an OA PDF was found but its host is blocked by \
894                 supply-chain policy ({}): {message}",
895                code.as_wire(),
896                denial_reason_wire(reason)
897            ));
898        }
899        _ => {
900            print_err(format_args!(
901                "error[{}]: {label}: an OA PDF was found but could not be retrieved: {message}",
902                code.as_wire()
903            ));
904        }
905    }
906    if let Some(dc) = denial {
907        let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
908        match &dc.expected {
909            Some(exp) if !exp.is_empty() => {
910                print_err(format_args!(
911                    "  = note: attempted {attempted}; allowed: {}",
912                    exp.join(", ")
913                ));
914            }
915            _ => {
916                print_err(format_args!("  = note: attempted {attempted}"));
917            }
918        }
919    }
920    // The metadata TOML still landed; point the user at it so the
921    // partial result is not lost (it is still useful), without
922    // pretending the fetch succeeded.
923    print_err(format_args!(
924        "  = note: metadata-only record written to {}",
925        outcome.path
926    ));
927    if let Some(arxiv_id) = suggested_arxiv_id {
928        print_err(format_args!(
929            "  = suggest: Try fetching the arXiv version: doiget fetch arxiv:{}",
930            arxiv_id
931        ));
932    }
933}
934
935// ---------------------------------------------------------------------------
936// Tests
937// ---------------------------------------------------------------------------
938
939#[cfg(test)]
940#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
941mod tests {
942    use super::*;
943    use serial_test::serial;
944
945    #[test]
946    fn new_session_id_is_26_chars() {
947        // ULID textual form is fixed-width 26 chars (Crockford base32).
948        // `docs/PROVENANCE_LOG.md` §3 requires this exact length.
949        let id = new_session_id();
950        assert_eq!(id.len(), 26, "session id must be 26 chars: {:?}", id);
951        // Crockford base32 uses uppercase letters and digits; specifically
952        // I, L, O, U are excluded. Every char must be ASCII alphanumeric.
953        assert!(
954            id.chars().all(|c| c.is_ascii_alphanumeric()),
955            "ulid must be ASCII alphanumeric: {:?}",
956            id
957        );
958    }
959
960    /// Review pass C2: end-to-end coverage of the user-extension
961    /// merge inside `build_http_client`. Without this test the
962    /// production path that turns a `config.toml`
963    /// `[[network.additional_hosts]]` entry into a passing
964    /// allowlist match is unexercised — every existing e2e sets
965    /// `DOIGET_*_BASE` and short-circuits into the test-mode
966    /// builder above.
967    #[test]
968    #[serial]
969    fn build_http_client_merges_user_extension_into_oa_publisher_allowlist() {
970        use std::io::Write;
971
972        // Construct a tempdir + minimal config.toml under it.
973        let td = tempfile::TempDir::new().expect("tempdir");
974        let cfg_dir = td.path().join("doiget");
975        std::fs::create_dir_all(&cfg_dir).expect("mkdir doiget/");
976        let cfg_path = cfg_dir.join("config.toml");
977        let mut f = std::fs::File::create(&cfg_path).expect("create config.toml");
978        f.write_all(
979            br#"
980[[network.additional_hosts]]
981host = "ruj.uj.edu.pl"
982note = "Jagiellonian"
983
984[[network.additional_hosts]]
985host = "*.uj.edu.pl"
986"#,
987        )
988        .expect("write config.toml");
989        drop(f);
990
991        // Save + override env so `config_dir_utf8()` lands on the
992        // tempdir. Restored on Drop by EnvGuard. We also clear the
993        // five `DOIGET_*_BASE` env vars to force the production
994        // branch of `build_http_client`.
995        struct EnvGuard {
996            key: &'static str,
997            prev: Option<String>,
998        }
999        impl EnvGuard {
1000            fn save(key: &'static str) -> Self {
1001                Self {
1002                    key,
1003                    prev: std::env::var(key).ok(),
1004                }
1005            }
1006        }
1007        impl Drop for EnvGuard {
1008            fn drop(&mut self) {
1009                match &self.prev {
1010                    Some(v) => std::env::set_var(self.key, v),
1011                    None => std::env::remove_var(self.key),
1012                }
1013            }
1014        }
1015        let _g0 = EnvGuard::save("XDG_CONFIG_HOME");
1016        let _g1 = EnvGuard::save("APPDATA");
1017        let _g2 = EnvGuard::save("HOME");
1018        let _g3 = EnvGuard::save("USERPROFILE");
1019        let _g4 = EnvGuard::save("DOIGET_ARXIV_BASE");
1020        let _g5 = EnvGuard::save("DOIGET_CROSSREF_BASE");
1021        let _g6 = EnvGuard::save("DOIGET_UNPAYWALL_BASE");
1022        let _g7 = EnvGuard::save("DOIGET_OA_PUBLISHER_BASE");
1023        let _g8 = EnvGuard::save("DOIGET_OPENALEX_BASE");
1024        std::env::set_var("XDG_CONFIG_HOME", td.path());
1025        std::env::set_var("APPDATA", td.path());
1026        std::env::set_var("HOME", td.path());
1027        std::env::set_var("USERPROFILE", td.path());
1028        std::env::remove_var("DOIGET_ARXIV_BASE");
1029        std::env::remove_var("DOIGET_CROSSREF_BASE");
1030        std::env::remove_var("DOIGET_UNPAYWALL_BASE");
1031        std::env::remove_var("DOIGET_OA_PUBLISHER_BASE");
1032        std::env::remove_var("DOIGET_OPENALEX_BASE");
1033
1034        let client = build_http_client().expect("HttpClient builds");
1035        let oa = client
1036            .source_allowlist("oa-publisher")
1037            .expect("oa-publisher source registered");
1038
1039        // Pre-existing curated allowlist still effective.
1040        assert!(
1041            oa.redirect_hosts.iter().any(|p| p == "*.aps.org"),
1042            "curated *.aps.org MUST still be present after merge; got {:?}",
1043            oa.redirect_hosts
1044        );
1045        // User-added literal host passes match.
1046        assert!(
1047            oa.matches("ruj.uj.edu.pl"),
1048            "literal `ruj.uj.edu.pl` from user config MUST match"
1049        );
1050        // User-added wildcard passes match for a subdomain.
1051        assert!(
1052            oa.matches("alpha.uj.edu.pl"),
1053            "wildcard `*.uj.edu.pl` from user config MUST match alpha.uj.edu.pl"
1054        );
1055        // Unrelated host MUST still fail.
1056        assert!(
1057            !oa.matches("ruj.uj.edu.ru"),
1058            "host outside the suffix MUST NOT match"
1059        );
1060    }
1061
1062    /// ADR-0031 D2: discovery search (`doiget search`) ships in the default
1063    /// `oa-only` binary, so `api.openalex.org` MUST be on the production
1064    /// allowlist under the `"openalex"` source key WITHOUT `--features
1065    /// citation`. The Tier-2 `tier_2_allowlist()` extend is
1066    /// `#[cfg(feature = "citation")]`; this test proves
1067    /// `discovery_allowlist()` covers that gap in the shipped build.
1068    #[test]
1069    #[serial]
1070    fn build_http_client_registers_openalex_for_discovery() {
1071        struct EnvGuard {
1072            key: &'static str,
1073            prev: Option<String>,
1074        }
1075        impl EnvGuard {
1076            fn save(key: &'static str) -> Self {
1077                Self {
1078                    key,
1079                    prev: std::env::var(key).ok(),
1080                }
1081            }
1082        }
1083        impl Drop for EnvGuard {
1084            fn drop(&mut self) {
1085                match &self.prev {
1086                    Some(v) => std::env::set_var(self.key, v),
1087                    None => std::env::remove_var(self.key),
1088                }
1089            }
1090        }
1091
1092        // Point config resolution at an empty tempdir and clear every
1093        // `DOIGET_*_BASE` so `build_http_client` takes the PRODUCTION
1094        // branch (not the test-base builder, which would register
1095        // "openalex" itself and mask the gap this test guards).
1096        let td = tempfile::TempDir::new().expect("tempdir");
1097        let _g0 = EnvGuard::save("XDG_CONFIG_HOME");
1098        let _g1 = EnvGuard::save("APPDATA");
1099        let _g2 = EnvGuard::save("HOME");
1100        let _g3 = EnvGuard::save("USERPROFILE");
1101        let _g4 = EnvGuard::save("DOIGET_ARXIV_BASE");
1102        let _g5 = EnvGuard::save("DOIGET_CROSSREF_BASE");
1103        let _g6 = EnvGuard::save("DOIGET_UNPAYWALL_BASE");
1104        let _g7 = EnvGuard::save("DOIGET_OA_PUBLISHER_BASE");
1105        let _g8 = EnvGuard::save("DOIGET_OPENALEX_BASE");
1106        std::env::set_var("XDG_CONFIG_HOME", td.path());
1107        std::env::set_var("APPDATA", td.path());
1108        std::env::set_var("HOME", td.path());
1109        std::env::set_var("USERPROFILE", td.path());
1110        std::env::remove_var("DOIGET_ARXIV_BASE");
1111        std::env::remove_var("DOIGET_CROSSREF_BASE");
1112        std::env::remove_var("DOIGET_UNPAYWALL_BASE");
1113        std::env::remove_var("DOIGET_OA_PUBLISHER_BASE");
1114        std::env::remove_var("DOIGET_OPENALEX_BASE");
1115
1116        let client = build_http_client().expect("HttpClient builds");
1117        let oa = client
1118            .source_allowlist("openalex")
1119            .expect("openalex source registered for discovery (ADR-0031 D2)");
1120        assert!(
1121            oa.matches("api.openalex.org"),
1122            "api.openalex.org MUST be on the discovery allowlist; got {:?}",
1123            oa.redirect_hosts
1124        );
1125    }
1126
1127    // Slice 2: the `extract_crossref_fields_*` unit tests moved to
1128    // `doiget_core::orchestrator::tests` along with the function they
1129    // covered. The CLI no longer owns those helpers; the marker test
1130    // below keeps the CLI's `fetch::tests` non-empty after the helper
1131    // migration so a future regression that nukes the delegation path
1132    // surfaces as a build failure (the `FetchPaperOutcome` re-import
1133    // would stop resolving).
1134    #[test]
1135    fn fetch_paper_outcome_is_reachable_from_cli() {
1136        let _ = std::any::type_name::<doiget_core::orchestrator::FetchPaperOutcome>();
1137    }
1138
1139    #[test]
1140    fn ambiguous_maps_to_exit_code_2() {
1141        // ADR-0031 D5: a name-filter ambiguity is user-fixable → exit 2,
1142        // distinct from the generic exit 1.
1143        assert_eq!(cli_exit_code(ErrorCode::Ambiguous), 2);
1144    }
1145
1146    /// Minimal `DenialContext` carrying only `reason`; every other field
1147    /// is optional (ADR-0023 §3) so `None`/empty is a valid producer
1148    /// shape for the reclassification decision under test.
1149    fn denial(reason: DenialReason) -> DenialContext {
1150        DenialContext {
1151            reason,
1152            source: None,
1153            attempted: None,
1154            expected: None,
1155            hop_index: None,
1156            cap: None,
1157            actual: None,
1158        }
1159    }
1160
1161    /// Issue #145 / `docs/ERRORS.md` §6.1: a policy-class denial reason
1162    /// on a `Blocked` OA-PDF leg must be reclassified from the core's
1163    /// blanket `NetworkError` to `CapabilityDenied` at the CLI layer, so
1164    /// the user-facing exit becomes 3 (not the generic 1) and a flaky
1165    /// network is not implied for a deliberate supply-chain block.
1166    #[test]
1167    fn policy_denials_reclassify_network_error_to_capability_denied() {
1168        for r in [
1169            DenialReason::RedirectNotInAllowlist,
1170            DenialReason::InsecureScheme,
1171            DenialReason::HostInBlockList,
1172        ] {
1173            let d = denial(r);
1174            assert_eq!(
1175                effective_blocked_code(ErrorCode::NetworkError, Some(&d)),
1176                ErrorCode::CapabilityDenied,
1177                "policy reason {r:?} must promote NetworkError -> CapabilityDenied"
1178            );
1179            assert_eq!(
1180                cli_exit_code(effective_blocked_code(ErrorCode::NetworkError, Some(&d))),
1181                3,
1182                "policy reason {r:?} must map to exit 3 (docs/ERRORS.md §4/§6.1)"
1183            );
1184        }
1185    }
1186
1187    /// A genuine transport fault carries NO `DenialContext`; it must stay
1188    /// `NetworkError` / exit 1 — `docs/ERRORS.md` §2 "retry usually fine"
1189    /// is the correct signal there. (This is exactly the e2e
1190    /// `..._host_off_allowlist` path: first-leg connect failure, no
1191    /// redirect hop, so no allowlist denial is produced.)
1192    #[test]
1193    fn absent_denial_context_keeps_network_error() {
1194        assert_eq!(
1195            effective_blocked_code(ErrorCode::NetworkError, None),
1196            ErrorCode::NetworkError
1197        );
1198        assert_eq!(
1199            cli_exit_code(effective_blocked_code(ErrorCode::NetworkError, None)),
1200            1
1201        );
1202    }
1203
1204    /// Non-policy denial reasons (size cap, content-type mismatch) are
1205    /// NOT supply-chain policy blocks; they keep the core's code so a
1206    /// genuine cap/transport class is not masked as a capability denial.
1207    #[test]
1208    fn non_policy_denials_keep_core_code() {
1209        for r in [
1210            DenialReason::SizeCapExceeded,
1211            DenialReason::ContentTypeMismatch,
1212        ] {
1213            let d = denial(r);
1214            assert_eq!(
1215                effective_blocked_code(ErrorCode::NetworkError, Some(&d)),
1216                ErrorCode::NetworkError,
1217                "non-policy reason {r:?} must NOT be reclassified"
1218            );
1219        }
1220    }
1221
1222    /// The closed-set wire token used in the human `error[...]:` line
1223    /// must match the serde `snake_case` form so the CLI vocabulary does
1224    /// not drift from the JSON/MCP envelope (`docs/ERRORS.md` §3.1).
1225    #[test]
1226    fn denial_reason_wire_matches_serde_snake_case() {
1227        for r in [
1228            DenialReason::RedirectNotInAllowlist,
1229            DenialReason::InsecureScheme,
1230            DenialReason::HostInBlockList,
1231        ] {
1232            let serde_form = serde_json::to_string(&r).expect("serialize DenialReason");
1233            // serde_json wraps the enum unit variant in quotes.
1234            let serde_token = serde_form.trim_matches('"');
1235            assert_eq!(
1236                denial_reason_wire(r),
1237                serde_token,
1238                "CLI wire token for {r:?} must equal the serde snake_case form"
1239            );
1240        }
1241    }
1242}