doiget_cli/commands/fetch.rs
1//! `doiget fetch <ref>` subcommand.
2//!
3//! Phase 1 scope:
4//!
5//! - **arXiv refs** — full end-to-end: PDF bytes are fetched via the
6//! `doiget_core::sources::arxiv::ArxivSource`, the `[doiget]`
7//! extension table is populated with the resolved license, source,
8//! size, and `fetched_at`, and the result is written to the on-disk
9//! store with both the metadata TOML and the PDF.
10//! - **DOI refs** — Crossref metadata + Unpaywall license enrichment + an
11//! OA PDF fetch when Unpaywall's `best_oa_location.url_for_pdf` (or
12//! `best_oa_location.url`) resolves to a host on the synthetic
13//! `"oa-publisher"` allowlist (`docs/REDIRECT_ALLOWLIST.md` §3). The OA
14//! URL host check is informed-best-effort; if the host is not on the
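//! allowlist or the body fails the magic-byte check, the orchestrator
//! logs a `Fetch err` row under `source = "oa-publisher"` and still writes
//! the metadata TOML. The CLI reports that blocked PDF leg
//! (`PdfLegStatus::Blocked`) as a failure with a non-zero exit rather than
//! as a silent metadata-only success.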
18//!
19//! ## Provenance contract
20//!
21//! Per `docs/PROVENANCE_LOG.md` §3, every invocation emits at least one
22//! `SessionStart`, one or more `Fetch` rows (one per source consulted), one
23//! `StoreWrite` row on success, and one `SessionEnd`. Each `Fetch` row is
24//! appended by the underlying `Source` impl; the orchestrator owns the
25//! session-bookend rows and the `StoreWrite` row.
26//!
27//! ## Configuration surface
28//!
29//! Hard-coded paths with env-var overrides; full `config.toml` plumbing
30//! arrives in a follow-up. See `docs/CONFIG.md` for the eventual surface.
31//!
32//! | Env var | Default | Purpose |
33//! |---|---|---|
34//! | `DOIGET_STORE_ROOT` | `$HOME/papers` (or `%USERPROFILE%\papers` on Windows) | Filesystem store root |
35//! | `DOIGET_LOG_PATH` | `<config>/doiget/access.jsonl` | Provenance log file |
36//! | `DOIGET_CONTACT_EMAIL` | `doiget@localhost` | Polite-pool contact email (User-Agent and Crossref) |
37//! | `DOIGET_UNPAYWALL_EMAIL` | (= contact email) | Unpaywall query-string email |
38//! | `DOIGET_ARXIV_BASE` | `https://arxiv.org` | arXiv source base (test override) |
39//! | `DOIGET_CROSSREF_BASE` | `https://api.crossref.org` | Crossref source base (test override) |
40//! | `DOIGET_UNPAYWALL_BASE` | `https://api.unpaywall.org/v2` | Unpaywall source base (test override) |
41//! | `DOIGET_OA_PUBLISHER_BASE` | (production allowlist) | OA publisher host allowlist override (test override) |
42
43use std::sync::Arc;
44
45use anyhow::{anyhow, Context, Result};
46use camino::Utf8PathBuf;
47
48#[cfg(feature = "citation")]
49use doiget_core::http::tier_2_allowlist;
50use doiget_core::http::{
51 discovery_allowlist, fulltext_allowlist, oa_publisher_allowlist, tier_1_allowlist, HttpClient,
52};
53use doiget_core::orchestrator::{fetch_paper as core_fetch_paper, FetchPaperOutcome, PdfLegStatus};
54use doiget_core::provenance::{Capability, LogEvent, LogResult, ProvenanceLog, RowInput};
55use doiget_core::rate_limiter::RateLimiter;
56use doiget_core::source::{FetchContext, FetchError};
57use doiget_core::store::FsStore;
58use doiget_core::{CapabilityProfile, DenialContext, DenialReason, ErrorCode, RateLimits, Ref};
59
60/// Defer to docs/PROVENANCE_LOG.md §3: 26-char ULID per process invocation.
61pub(crate) fn new_session_id() -> String {
62 ulid::Ulid::new().to_string()
63}
64
65// ---------------------------------------------------------------------------
66// Dry-run plan / preview (ADR-0022)
67// ---------------------------------------------------------------------------
68
69// The structured `FetchPlan` shape, the `build_fetch_plan` builder, and
70// the `build_dry_run_envelope` JSON-shape helper live in `doiget-core`
71// so the MCP server can produce a bit-identical envelope without
72// depending on `doiget-cli`. The CLI re-exports them here for callers
73// that already `use doiget_cli::commands::fetch`.
74pub use doiget_core::dry_run::{
75 build_dry_run_envelope, build_fetch_plan, FetchPlan, PdfSourcePlan, RateLimitBudget,
76};
77
78/// Serialize the dry-run envelope and write it to stdout. Used by the
79/// `--dry-run` flag on `doiget fetch` and `doiget batch`. The envelope
80/// shape matches ADR-0022 §1 / `docs/MCP_TOOLS.md` §10.
81///
82/// `pub` so `commands::batch` (multi-ref dry-run) can reuse it. The
83/// function lives in `doiget-cli` (not `doiget-core`) because `println!`
84/// is a CLI concern; the MCP server uses [`build_dry_run_envelope`]
85/// directly and routes the bytes via JSON-RPC.
86///
87/// `print_stdout` is workspace-deny for MCP stdio safety (ADR-0001 /
88/// `docs/SECURITY.md` §3); `--dry-run` is a CLI-only path that never
89/// runs under the MCP server, so the localized `#[allow]` is the
90/// minimal intervention — same pattern used by `commands::config`,
91/// `commands::info`, etc.
92#[allow(clippy::print_stdout)]
93pub fn emit_dry_run_plan_to_stdout(ref_: &Ref, plan: &FetchPlan) -> Result<()> {
94 let envelope = build_dry_run_envelope(ref_, plan);
95 let s = serde_json::to_string(&envelope).context("serializing dry-run envelope to JSON")?;
96 println!("{s}");
97 Ok(())
98}
99
100/// Resolve the provenance log path. `DOIGET_LOG_PATH` wins; otherwise
101/// fall back to `<config>/doiget/access.jsonl` per `docs/PROVENANCE_LOG.md`
102/// §1.
103pub(crate) fn resolve_log_path() -> Result<Utf8PathBuf> {
104 if let Some(s) = read_env_utf8("DOIGET_LOG_PATH")? {
105 return Ok(Utf8PathBuf::from(s));
106 }
107 let cfg = config_dir_utf8()?;
108 Ok(cfg.join("doiget").join("access.jsonl"))
109}
110
111/// Read an env var and assert it is valid UTF-8. Returns `Ok(None)` if
112/// unset; `Ok(Some(s))` if set and UTF-8; `Err(...)` if set but non-UTF-8.
113/// `std::env::var` already requires UTF-8 (returns `VarError::NotUnicode`
114/// otherwise); we wrap it to surface a friendlier error and avoid the
115/// banned `std::path::PathBuf` round-trip.
116fn read_env_utf8(key: &str) -> Result<Option<String>> {
117 match std::env::var(key) {
118 Ok(s) => Ok(Some(s)),
119 Err(std::env::VarError::NotPresent) => Ok(None),
120 Err(std::env::VarError::NotUnicode(_)) => Err(anyhow!("{key} is not valid UTF-8")),
121 }
122}
123
124/// Best-effort home-dir resolution without depending on the `dirs` crate
125/// (every new dep adds cargo-vet exemption churn). Honors `HOME` first
126/// (POSIX + most CI), then `USERPROFILE` (Windows).
127fn home_dir_utf8() -> Result<Utf8PathBuf> {
128 if let Some(s) = read_env_utf8("HOME")? {
129 return Ok(Utf8PathBuf::from(s));
130 }
131 if let Some(s) = read_env_utf8("USERPROFILE")? {
132 return Ok(Utf8PathBuf::from(s));
133 }
134 Err(anyhow!("neither HOME nor USERPROFILE is set"))
135}
136
137/// Best-effort config-dir resolution. Honors `XDG_CONFIG_HOME` first
138/// (POSIX), then `APPDATA` (Windows), then falls back to `$HOME/.config`.
139///
140/// Crate-visible so sibling modules (`commands::capabilities`,
141/// `commands::config`) can resolve the same `<config_dir>/doiget/`
142/// path the production HTTP-client builder reads from. Keep the
143/// signature stable: any divergence between this and the MCP-side
144/// copy (`crates/doiget-mcp/src/lib.rs::config_dir_utf8`) would
145/// silently desync the user-extension allowlist surfaces.
146pub(crate) fn config_dir_utf8() -> Result<Utf8PathBuf> {
147 if let Some(s) = read_env_utf8("XDG_CONFIG_HOME")? {
148 return Ok(Utf8PathBuf::from(s));
149 }
150 if let Some(s) = read_env_utf8("APPDATA")? {
151 return Ok(Utf8PathBuf::from(s));
152 }
153 let home = home_dir_utf8()?;
154 Ok(home.join(".config"))
155}
156
157/// Best-effort resolver-cache root (`docs/CACHE.md`). Honors
158/// `DOIGET_CACHE_ROOT` first, then `XDG_CACHE_HOME/doiget` (POSIX), then
159/// `LOCALAPPDATA\doiget\cache` (Windows), then `$HOME/.cache/doiget`.
160/// Crate-visible so the `verify` command can enable the resolve cache.
161pub(crate) fn cache_dir_utf8() -> Result<Utf8PathBuf> {
162 if let Some(s) = read_env_utf8("DOIGET_CACHE_ROOT")? {
163 return Ok(Utf8PathBuf::from(s));
164 }
165 if let Some(s) = read_env_utf8("XDG_CACHE_HOME")? {
166 return Ok(Utf8PathBuf::from(s).join("doiget"));
167 }
168 if let Some(s) = read_env_utf8("LOCALAPPDATA")? {
169 return Ok(Utf8PathBuf::from(s).join("doiget").join("cache"));
170 }
171 let home = home_dir_utf8()?;
172 Ok(home.join(".cache").join("doiget"))
173}
174
175/// Build a metadata-resolution [`FetchContext`]: HTTP client, rate
176/// limiter, and provenance log resolved from the environment, with the
177/// resolver cache (`docs/CACHE.md`) enabled best-effort.
178///
179/// This is the shared context for the read-only resolve commands
180/// (`verify`, `cite`) — neither persists to the store, so no store
181/// handle is constructed. Enabling `cache_root` means repeat resolves of
182/// the same ref are served from disk, avoiding upstream rate limits; if
183/// the cache dir can't be resolved the run simply proceeds without it.
184pub(crate) fn build_resolve_context() -> Result<FetchContext> {
185 let session_id = new_session_id();
186 let log_path = resolve_log_path()?;
187 let http = Arc::new(build_http_client()?);
188 let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
189 let log = Arc::new(
190 ProvenanceLog::open(log_path, session_id.clone())
191 .context("failed to open provenance log")?,
192 );
193 let cache_root = cache_dir_utf8().ok();
194 Ok(FetchContext {
195 http,
196 rate_limiter,
197 log,
198 session_id,
199 cache_root,
200 })
201}
202
203/// Construct the workspace-wide [`HttpClient`].
204///
205/// Production path: `HttpClient::new(tier_1_allowlist() ∪ oa_publisher_allowlist())` —
206/// strict HTTPS-only with the canonical Tier-1 redirect allowlist (Crossref,
207/// Unpaywall, arXiv) plus the synthetic `"oa-publisher"` allowlist used for
208/// the OA PDF leg of the DOI fetch path (`fetch_doi` issues
209/// `HttpClient::fetch_pdf("oa-publisher", url)` against the URL Unpaywall
210/// returned in `best_oa_location`). The OA-publisher list is
211/// informed-best-effort per `docs/REDIRECT_ALLOWLIST.md` §3.
212///
213/// Test path: when any of the three `DOIGET_*_BASE` env vars is set, build a
214/// multi-source relaxed-`https_only` client whose per-source allowlist is
215/// derived from the corresponding env-var hosts. The `oa-publisher` source
216/// key is registered against the same host (typically the wiremock origin)
217/// when `DOIGET_OA_PUBLISHER_BASE` is set — this lets the integration tests
218/// under `tests/fetch_doi_oa_pdf_e2e.rs` exercise the full PDF leg without
219/// touching the real network.
220pub(crate) fn build_http_client() -> Result<HttpClient> {
221 let arxiv = std::env::var("DOIGET_ARXIV_BASE").ok();
222 let crossref = std::env::var("DOIGET_CROSSREF_BASE").ok();
223 let unpaywall = std::env::var("DOIGET_UNPAYWALL_BASE").ok();
224 let oa_publisher = std::env::var("DOIGET_OA_PUBLISHER_BASE").ok();
225 // Slice 16: `DOIGET_OPENALEX_BASE` selects a wiremock host for the
226 // citation-graph BFS. Only meaningful with `--features citation`,
227 // but reading the env unconditionally keeps the branch logic
228 // simple and is harmless for default builds.
229 let openalex_base = std::env::var("DOIGET_OPENALEX_BASE").ok();
230 // ADR-0032: `DOIGET_AR5IV_BASE` selects a wiremock host for the
231 // full-text extraction path (`doiget text`). Test-only override,
232 // mirroring `DOIGET_ARXIV_BASE`.
233 let ar5iv_base = std::env::var("DOIGET_AR5IV_BASE").ok();
234
235 if arxiv.is_none()
236 && crossref.is_none()
237 && unpaywall.is_none()
238 && oa_publisher.is_none()
239 && openalex_base.is_none()
240 && ar5iv_base.is_none()
241 {
242 let mut allowlists = tier_1_allowlist();
243 allowlists.extend(oa_publisher_allowlist());
244 // ADR-0031: discovery search (`doiget search`) is Tier-1 OA
245 // metadata, always-on, and ships in the default `oa-only` binary.
246 // Register `api.openalex.org` under the `"openalex"` source key
247 // UNCONDITIONALLY so `discovery::paper_search` can reach the
248 // `/works?search=` endpoint without `--features citation`. In
249 // citation builds the Tier-2 extend below re-registers the same
250 // host under the same key (idempotent HashMap overwrite).
251 allowlists.extend(discovery_allowlist());
252 // ADR-0032: full-text extraction (`doiget text`) is Tier-1 OA
253 // metadata, always-on. Register `ar5iv.labs.arxiv.org` under the
254 // `"ar5iv"` source key unconditionally so `paper_text::paper_text`
255 // can reach ar5iv in `oa-only` builds.
256 allowlists.extend(fulltext_allowlist());
257 // Slice 16: when the `citation` feature is compiled in, the
258 // graph subcommand walks OpenAlex Work IDs via
259 // `ctx.http.fetch_bytes("openalex", ...)`. The Tier 2
260 // allowlist registers the `api.openalex.org` host under
261 // that source key. CapabilityProfile.metadata.openalex is
262 // the runtime gate; the allowlist is the transport gate.
263 #[cfg(feature = "citation")]
264 allowlists.extend(tier_2_allowlist());
265
266 // ADR-0028 D2: merge user-extension hosts from
267 // `<config_dir>/doiget/config.toml`. See
268 // `doiget_core::user_extension` for the wire contract and
269 // the (deferred) S3b provenance / doctor / capabilities
270 // surfaces.
271 //
272 // Failure handling is opt-in-convenience: a missing config
273 // is silent (Ok-empty), a malformed config emits
274 // `tracing::warn!` and continues with the curated allowlist,
275 // and an unresolvable config dir emits `tracing::debug!`
276 // (only happens in stripped envs with no HOME / XDG /
277 // APPDATA — review pass I3 / A1).
278 match config_dir_utf8() {
279 Ok(cfg_dir) => {
280 let path = cfg_dir.join("doiget").join("config.toml");
281 match doiget_core::user_extension::load(&path) {
282 Ok(user_hosts) if !user_hosts.is_empty() => {
283 tracing::info!(
284 count = user_hosts.len(),
285 path = %path,
286 "merging user-extension allowlist hosts (ADR-0028 D2)"
287 );
288 doiget_core::user_extension::merge_into_allowlists(
289 &mut allowlists,
290 &user_hosts,
291 );
292 }
293 Ok(_) => {}
294 Err(e) => {
295 tracing::warn!(
296 error = %e,
297 path = %path,
298 "failed to load user-extension allowlist; \
299 falling back to curated set only"
300 );
301 }
302 }
303 }
304 Err(e) => {
305 tracing::debug!(
306 error = %e,
307 "config dir unresolvable; \
308 user-extension allowlist disabled (curated set only)"
309 );
310 }
311 }
312
313 return HttpClient::new(allowlists).context("building HTTP client");
314 }
315
316 // Test-base mode: build a relaxed client per overridden source.
317 let mut owned: Vec<(String, String)> = Vec::new();
318 for (source, base) in [
319 ("arxiv", arxiv.as_deref()),
320 ("crossref", crossref.as_deref()),
321 ("unpaywall", unpaywall.as_deref()),
322 ("oa-publisher", oa_publisher.as_deref()),
323 ("openalex", openalex_base.as_deref()),
324 ("ar5iv", ar5iv_base.as_deref()),
325 ] {
326 if let Some(b) = base {
327 let url = url::Url::parse(b)
328 .with_context(|| format!("DOIGET_*_BASE for {source} is not a URL: {b}"))?;
329 let host = url
330 .host_str()
331 .ok_or_else(|| anyhow!("base URL has no host: {b}"))?;
332 owned.push((source.to_string(), host.to_string()));
333 }
334 }
335 let entries: Vec<(&str, &str)> = owned
336 .iter()
337 .map(|(s, h)| (s.as_str(), h.as_str()))
338 .collect();
339 Ok(HttpClient::new_for_tests_allow_http_multi(&entries))
340}
341
342// Slice 2: the per-source env-aware constructors that used to live here
343// (`build_arxiv_source`, `build_crossref_source`, `build_unpaywall_source`)
344// moved into `doiget-core::orchestrator` so the core `fetch_paper`
345// orchestrator and the MCP server both honor the same `DOIGET_*_BASE`
346// test-override surface. The CLI no longer constructs sources directly —
347// it builds the `FetchContext` + `FsStore` and hands them to the core
348// orchestrator.
349
350/// Resolved configuration derived from the environment.
351///
352/// Slice 2: `contact_email` / `unpaywall_email` are now read by the
353/// `doiget-core::orchestrator::fetch_paper` orchestrator directly from
354/// the env (`contact_email_from_env` / `unpaywall_email_from_env` in
355/// that module), so the CLI no longer threads them through. The fields
356/// stay here so a future slice that adds CLI-flag overrides has a
357/// natural attachment point — the `#[allow(dead_code)]` is the minimal
358/// intervention until that slice lands.
359#[allow(dead_code)]
360pub(crate) struct OrchestratorConfig {
361 pub(crate) store_root: Utf8PathBuf,
362 pub(crate) log_path: Utf8PathBuf,
363 pub(crate) contact_email: String,
364 pub(crate) unpaywall_email: String,
365}
366
367impl OrchestratorConfig {
368 fn from_env() -> Result<Self> {
369 let store_root = super::resolve_store_root()?;
370 let log_path = resolve_log_path()?;
371 let contact_email =
372 std::env::var("DOIGET_CONTACT_EMAIL").unwrap_or_else(|_| "doiget@localhost".into());
373 let unpaywall_email =
374 std::env::var("DOIGET_UNPAYWALL_EMAIL").unwrap_or_else(|_| contact_email.clone());
375 Ok(Self {
376 store_root,
377 log_path,
378 contact_email,
379 unpaywall_email,
380 })
381 }
382}
383
384/// Reusable fetch harness shared by `doiget fetch <ref>` (single ref) and
385/// `doiget batch <path>` (many refs). Owns the shared foundation modules
386/// (`HttpClient` / `RateLimiter` / `ProvenanceLog`), the on-disk store, and
387/// the resolved capability profile, plus the session bookkeeping required by
388/// `docs/PROVENANCE_LOG.md` §3 (the 26-char ULID `session_id`).
389///
390/// Construction is performed once via [`FetchHarness::from_env`]. Per-ref
391/// orchestration runs through [`FetchHarness::fetch_one`]; bookend rows go
392/// via [`FetchHarness::log_session_start`] / [`FetchHarness::log_session_end`]
393/// so the orchestrator can frame either one fetch or many.
394pub(crate) struct FetchHarness {
395 pub(crate) http: Arc<HttpClient>,
396 pub(crate) rate_limiter: Arc<RateLimiter>,
397 pub(crate) log: Arc<ProvenanceLog>,
398 pub(crate) store: FsStore,
399 pub(crate) profile: CapabilityProfile,
400 pub(crate) session_id: String,
401 /// Resolved config; Slice 2 keeps this on the harness for the
402 /// CLI-only env diagnostics path (`commands::config::doctor`), even
403 /// though `fetch_one` no longer needs it (the core orchestrator
404 /// re-reads contact email from env directly).
405 #[allow(dead_code)]
406 pub(crate) cfg: OrchestratorConfig,
407}
408
409impl FetchHarness {
410 /// Build a harness from the same env-var surface documented at the top
411 /// of this module. Creates the log parent directory if missing, opens
412 /// the provenance log (allocating a fresh `session_id`), and constructs
413 /// the HTTP client honoring `DOIGET_*_BASE` overrides for tests.
414 pub(crate) fn from_env() -> Result<Self> {
415 let cfg = OrchestratorConfig::from_env()?;
416 if let Some(parent) = cfg.log_path.parent() {
417 if !parent.as_str().is_empty() {
418 std::fs::create_dir_all(parent.as_std_path())
419 .with_context(|| format!("creating log dir {parent}"))?;
420 }
421 }
422 let session_id = new_session_id();
423 let log = Arc::new(
424 ProvenanceLog::open(cfg.log_path.clone(), session_id.clone())
425 .context("opening provenance log")?,
426 );
427 let http = Arc::new(build_http_client()?);
428 let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
429 let store = FsStore::new(cfg.store_root.clone()).context("opening store")?;
430 let profile = CapabilityProfile::from_env().context("resolving capability profile")?;
431
432 Ok(Self {
433 http,
434 rate_limiter,
435 log,
436 store,
437 profile,
438 session_id,
439 cfg,
440 })
441 }
442
443 /// Build a [`FetchContext`] view over this harness's foundation modules.
444 /// Creating one is cheap (cloning three `Arc`s + a `String`); per-ref
445 /// orchestration constructs one on demand.
446 pub(crate) fn fetch_context(&self) -> FetchContext {
447 FetchContext {
448 http: self.http.clone(),
449 rate_limiter: self.rate_limiter.clone(),
450 log: self.log.clone(),
451 session_id: self.session_id.clone(),
452 cache_root: None,
453 }
454 }
455
456 /// Append a `SessionStart` row. `ref_input` is the raw user-supplied ref
457 /// string (single-fetch path); pass `None` for batch sessions where no
458 /// single ref attributes the session.
459 pub(crate) fn log_session_start(&self, ref_input: Option<&str>) -> Result<()> {
460 self.log
461 .append(RowInput {
462 event: LogEvent::SessionStart,
463 result: LogResult::Ok,
464 capability: Capability::Oa,
465 ref_: ref_input,
466 source: None,
467 error_code: None,
468 size_bytes: None,
469 license: None,
470 store_path: None,
471 // Session bookend — no audit identity (ADR-0021 §1).
472 canonical_digest: None,
473 })
474 .context("appending SessionStart row")?;
475 Ok(())
476 }
477
478 /// Append a `SessionEnd` row. `ref_input` mirrors the `log_session_start`
479 /// argument; pass `None` for batch sessions. The result is best-effort —
480 /// if this append fails, the caller already has the underlying fetch
481 /// error (if any) and we don't override it.
482 pub(crate) fn log_session_end(&self, ok: bool, ref_input: Option<&str>) {
483 let result = if ok { LogResult::Ok } else { LogResult::Err };
484 let _ = self.log.append(RowInput {
485 event: LogEvent::SessionEnd,
486 result,
487 capability: Capability::Oa,
488 ref_: ref_input,
489 source: None,
490 error_code: None,
491 size_bytes: None,
492 license: None,
493 store_path: None,
494 // Session bookend — no audit identity (ADR-0021 §1).
495 canonical_digest: None,
496 });
497 }
498
499 /// Run a single ref through the per-kind orchestration (arxiv → PDF +
500 /// metadata; doi → metadata-only via Crossref + Unpaywall, with an
501 /// informed-best-effort OA PDF leg). Errors here are scoped to this
502 /// one ref — the caller decides whether to abort the surrounding
503 /// session.
504 ///
505 /// Slice 2: delegates to
506 /// [`doiget_core::orchestrator::fetch_paper`] for the actual work
507 /// (which both CLI and MCP now share). This function keeps the
508 /// CLI-only stderr success-line print.
509 pub(crate) async fn fetch_one(&self, ref_: &Ref) -> Result<FetchPaperOutcome, FetchError> {
510 // Pure data path: return the typed outcome (or typed error)
511 // without any CLI-only rendering or exit-code synthesis. The
512 // single-fetch caller (`run_with_options`) and the batch
513 // caller (`commands::batch::classify_joined`) each render the
514 // human / JSON surface and map to `CliExit` themselves — see
515 // #210 for the rationale (batch's `--json` JSONL needs the
516 // structured `FetchPaperOutcome` to emit `result.{safekey,
517 // store_path, canonical_digest}` on success and
518 // `denial_context` on a `PdfLegStatus::Blocked` outcome, which
519 // was unreachable through the previous `Result<()>`
520 // signature).
521 let ctx = self.fetch_context();
522 core_fetch_paper(ref_, &self.profile, &ctx, &self.store, self.store.root()).await
523 }
524}
525
526/// `true` iff the outcome represents a clean fetch: `Fetched` (full
527/// PDF) or `NoOaUrl` (metadata-only by design). A `Blocked` PDF leg
528/// is a failure for SessionEnd / exit-code purposes — an OA PDF was
529/// discovered but could not be retrieved — even though the metadata
530/// TOML did land on disk. Pulled out so both `run_with_options` and
531/// `commands::batch` agree on the failure boundary.
532pub(crate) fn outcome_is_clean_success(outcome: &FetchPaperOutcome) -> bool {
533 !matches!(outcome.pdf_leg, PdfLegStatus::Blocked { .. })
534}
535
536/// CLI-only one-line success message on stderr (ADR-0001 stdio
537/// convention). Renders the [`FetchPaperOutcome`] in the same form the
538/// pre-Slice-2 CLI emitted: a full-PDF success names the PDF path; a
539/// metadata-only DOI fallback (size_bytes == 0) names the metadata TOML
540/// path the orchestrator wrote.
541fn emit_success_line(ref_: &Ref, outcome: &FetchPaperOutcome) {
542 let label = match ref_ {
543 Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
544 Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
545 };
546 match &outcome.pdf_leg {
547 PdfLegStatus::Fetched => {
548 print_success(format_args!(
549 "fetched {} ({} bytes) -> {}",
550 label, outcome.size_bytes, outcome.path
551 ));
552 }
553 PdfLegStatus::NoOaUrl => {
554 print_success(format_args!(
555 "fetched {} (metadata-only: no OA PDF available) -> {}",
556 label, outcome.path
557 ));
558 }
559 // Issue #145: `Blocked` is NO LONGER a success outcome. It is
560 // intercepted in `fetch_one` BEFORE `emit_success_line` is
561 // called and rendered via `render_blocked_error` with a
562 // non-zero exit (`docs/ERRORS.md` §3/§6 — no silent failures).
563 // Reaching this arm would mean the interception regressed, so we
564 // fail closed: surface the `error[CODE]:` line here too rather
565 // than printing a misleading success line.
566 PdfLegStatus::Blocked {
567 code,
568 message,
569 denial,
570 suggested_arxiv_id,
571 } => {
572 // Same #145 reclassification as the primary interception in
573 // `fetch_one`, so this fail-closed fallback stays consistent.
574 let effective = effective_blocked_code(*code, denial.as_ref());
575 render_blocked_error(
576 ref_,
577 outcome,
578 effective,
579 message,
580 denial.as_ref(),
581 suggested_arxiv_id.as_deref(),
582 );
583 }
584 // `PdfLegStatus` is `#[non_exhaustive]`; a future variant
585 // degrades to the size-based wording rather than failing the
586 // downstream-crate build.
587 _ => {
588 if outcome.size_bytes == 0 {
589 print_success(format_args!(
590 "fetched {} (metadata-only) -> {}",
591 label, outcome.path
592 ));
593 } else {
594 print_success(format_args!(
595 "fetched {} ({} bytes) -> {}",
596 label, outcome.size_bytes, outcome.path
597 ));
598 }
599 }
600 }
601}
602
603/// Run the `doiget fetch <ref>` subcommand.
604///
605/// `dry_run` (ADR-0022 §1): when `true`, build a [`FetchPlan`] from the
606/// parsed [`Ref`] and the configured store root, serialize it as JSON to
607/// stdout, and return `Ok(())` immediately, **without** building a
608/// `FetchHarness` (no provenance log open), without contacting the
609/// network, without writing to the store, and without appending a
610/// provenance row.
611///
612/// When `dry_run` is `false`, the function runs the normal end-to-end
613/// orchestration path: open the provenance log, dispatch the per-kind
614/// orchestrator, emit a `SessionStart` / `SessionEnd` bookend pair.
615///
616/// On success returns `Ok(())` and writes a one-line success message to
617/// stderr (per ADR-0001 stdio convention — no stdout writes from `fetch`
618/// on the normal path). On failure, returns an `anyhow::Error` and emits
619/// a `SessionEnd` row with `result=err` to the provenance log before
620/// returning.
621///
622/// # History
623///
624/// Slice 5 (PR #84 advisory item A2/A3 refactor): the previous
625/// `FetchOptions { dry_run: bool }` single-field option bundle plus the
626/// thin `run(input)` backwards-compat wrapper were collapsed into this
627/// single `dry_run: bool` parameter — the option bundle's single-bool
628/// shape was YAGNI, and the wrapper only existed to spare integration
629/// tests a `FetchOptions::default()` literal.
630pub async fn run_with_options(
631 input: String,
632 dry_run: bool,
633 _mode: super::output::OutputMode,
634) -> Result<()> {
635 // `_mode` is threaded per ADR-0017 / #144. Quiet-suppression of the
636 // success line is tracked in #203. The dry-run plan envelope is
637 // product output (the requested artifact) and is unaffected by
638 // mode.
639 // Step 1: parse + safekey. Issue #119: render the cargo-style
640 // `error[INVALID_REF]:` line + carry the exit code, rather than
641 // letting the granular `RefParseError` fall out as an opaque
642 // anyhow `{:?}` dump.
643 let ref_ = match Ref::parse(&input) {
644 Ok(r) => r,
645 Err(e) => {
646 print_err(format_args!(
647 "error[{}]: invalid ref: {e}",
648 ErrorCode::InvalidRef.as_wire()
649 ));
650 return Err(anyhow::Error::new(CliExit(cli_exit_code(
651 ErrorCode::InvalidRef,
652 ))));
653 }
654 };
655
656 // Dry-run branch: build the plan and emit it. NO harness, NO network,
657 // NO store write, NO provenance row. Posture-lint ADR-0022 §5 will
658 // verify this branch never reaches `HttpClient::fetch_*`,
659 // `FsStore::write_*`, or `ProvenanceLog::append`.
660 if dry_run {
661 // Resolve store root for path projections. Failures here surface
662 // as a normal CLI error (not as a denial) — same behaviour the
663 // non-dry-run path would exhibit on a misconfigured environment.
664 let store_root = super::resolve_store_root()?;
665 let plan = build_fetch_plan(&ref_, &store_root);
666 emit_dry_run_plan_to_stdout(&ref_, &plan)?;
667 return Ok(());
668 }
669
670 // Step 2: build harness (foundation modules + provenance log).
671 let harness = FetchHarness::from_env()?;
672
673 // Step 3: emit SessionStart. Fail-closed if the log write fails — the
674 // surrounding fetch MUST NOT proceed (`docs/PROVENANCE_LOG.md` §5).
675 harness.log_session_start(Some(ref_.as_input_str()))?;
676
677 // Step 4: dispatch on ref kind. `fetch_one` now returns the
678 // typed `FetchPaperOutcome` / `FetchError` per #210; the
679 // single-fetch caller (this fn) owns rendering + exit code.
680 let result = harness.fetch_one(&ref_).await;
681
682 // Step 5: emit SessionEnd regardless of outcome. A `Blocked` PDF
683 // leg is NOT a clean success even though the typed `Result` is
684 // `Ok` — `outcome_is_clean_success` collapses both halves so the
685 // SessionEnd `is_ok` field matches the user-facing exit code.
686 let session_ok = match &result {
687 Ok(o) => outcome_is_clean_success(o),
688 Err(_) => false,
689 };
690 harness.log_session_end(session_ok, Some(ref_.as_input_str()));
691
692 // Step 6: render the user-facing surface and map to `CliExit`.
693 // The Blocked-PDF reclassification logic that used to live inside
694 // `fetch_one` was lifted here verbatim so the batch caller can
695 // share the same `effective_blocked_code` / `render_blocked_error`
696 // helpers (issue #210 / #145).
697 match result {
698 Ok(outcome) => {
699 if let PdfLegStatus::Blocked {
700 code,
701 message,
702 denial,
703 suggested_arxiv_id,
704 } = &outcome.pdf_leg
705 {
706 let effective = effective_blocked_code(*code, denial.as_ref());
707 render_blocked_error(
708 &ref_,
709 &outcome,
710 effective,
711 message,
712 denial.as_ref(),
713 suggested_arxiv_id.as_deref(),
714 );
715 return Err(anyhow::Error::new(CliExit(cli_exit_code(effective))));
716 }
717 emit_success_line(&ref_, &outcome);
718 Ok(())
719 }
720 Err(e) => {
721 render_fetch_error(&e);
722 let code: ErrorCode = (&e).into();
723 Err(anyhow::Error::new(CliExit(cli_exit_code(code))))
724 }
725 }
726}
727
/// Emit the one-line, user-visible success message on stderr. Per the
/// ADR-0001 stdio convention the CLI never writes a success line to
/// stdout, so `eprintln!` is intentional here. The workspace
/// `clippy::print_stderr` lint is `warn`, which makes the localized
/// `#[allow]` the smallest possible exception.
#[allow(clippy::print_stderr)]
fn print_success(args: std::fmt::Arguments<'_>) {
    eprintln!("{args}");
}
737
/// Write one `docs/ERRORS.md` §3 human-readable error line to stderr.
/// The counterpart of [`print_success`]; the localized `#[allow]`
/// scopes the exception to the workspace `clippy::print_stderr` lint to
/// this single function.
#[allow(clippy::print_stderr)]
fn print_err(args: std::fmt::Arguments<'_>) {
    eprintln!("{args}");
}
745
/// A `docs/ERRORS.md` §4 process exit code travelling from a CLI command
/// up to `main`. `main` owns the real `std::process::exit`; calling it
/// inside `run_with_options` would kill in-process integration tests.
///
/// By the time this value is constructed, the command has ALREADY
/// written the human-readable `error[CODE]: …` line to stderr (for
/// example via `render_fetch_error`), so `main` must NOT print it again.
/// Issue #119.
#[derive(Debug)]
pub struct CliExit(pub i32);

impl std::fmt::Display for CliExit {
    /// Formats as `exiting with status <code>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(status) = self;
        write!(f, "exiting with status {status}")
    }
}

// Marker impl: lets `CliExit` be wrapped by `anyhow::Error::new`.
impl std::error::Error for CliExit {}
762
763/// Reclassify a `PdfLegStatus::Blocked` code at the CLI layer (issue
764/// #145 / `docs/ERRORS.md` §2 "NETWORK_ERROR" vs §3.1 / §6).
765///
766/// The core maps *every* `FetchError::Http(_)` to
767/// [`ErrorCode::NetworkError`] (`doiget_core::source`'s
768/// `From<&FetchError> for ErrorCode`). `docs/ERRORS.md` §2 defines
769/// `NETWORK_ERROR` as a transport / DNS / TLS fault where "retry usually
770/// fine" — true for a real network blip, but **false** for a deliberate
771/// supply-chain policy block (off-allowlist redirect, insecure-scheme
772/// redirect, host-blocklist hit): retrying such a block never helps, so
773/// surfacing it as `NETWORK_ERROR` (generic exit 1) misrepresents a flaky
774/// network to humans and agents.
775///
776/// The orchestrator already preserves the true reason on the
777/// [`DenialContext`] side-channel (the `From<&HttpError> for
778/// Option<DenialContext>` impl walks reqwest's `source()` chain, so even
779/// a redirect denial wrapped as `HttpError::Network` still yields
780/// [`DenialReason::RedirectNotInAllowlist`]). When that reason is one of
781/// the closed-set *policy* denials, promote the surface code to
782/// [`ErrorCode::CapabilityDenied`] so the CLI renders
783/// `error[CAPABILITY_DENIED]:` and [`cli_exit_code`] returns exit 3 —
784/// the same code `fetch` / `graph` already use for capability denials.
785/// Non-policy blocks (no `denial`, or a non-policy reason such as
786/// `SizeCapExceeded` / `ContentTypeMismatch`) keep the core's code so a
787/// genuine transport failure still reads as `NETWORK_ERROR`.
788pub(crate) fn effective_blocked_code(code: ErrorCode, denial: Option<&DenialContext>) -> ErrorCode {
789 match denial.map(|d| d.reason) {
790 Some(
791 DenialReason::RedirectNotInAllowlist
792 | DenialReason::InsecureScheme
793 | DenialReason::HostInBlockList,
794 ) => ErrorCode::CapabilityDenied,
795 _ => code,
796 }
797}
798
799/// Snake-case wire token for a [`DenialReason`], matching the
800/// `#[serde(rename_all = "snake_case")]` JSON/MCP surface (ADR-0023 §2)
801/// so the CLI human line uses the SAME vocabulary as the machine
802/// envelope (`docs/ERRORS.md` §3.1). Only the policy-denial reasons the
803/// CLI inlines are enumerated; everything else degrades to a generic
804/// token rather than drifting from the serde form.
805fn denial_reason_wire(reason: DenialReason) -> &'static str {
806 match reason {
807 DenialReason::RedirectNotInAllowlist => "redirect_not_in_allowlist",
808 DenialReason::InsecureScheme => "insecure_scheme",
809 DenialReason::HostInBlockList => "host_in_block_list",
810 _ => "policy_denied",
811 }
812}
813
814/// `docs/ERRORS.md` §4 closed-code → process exit code. Anything not
815/// individually listed falls under "at least one fetch failed" (1).
816///
817/// `pub(crate)` so sibling subcommands (`commands::graph`, …) route
818/// their typed denials through the SAME centralized mapping instead of
819/// open-coding magic exit numbers — keeps the `ErrorCode`→exit contract
820/// single-sourced (issue #149).
821pub(crate) fn cli_exit_code(code: ErrorCode) -> i32 {
822 match code {
823 ErrorCode::CapabilityDenied => 3,
824 ErrorCode::StoreError | ErrorCode::LogError => 4,
825 ErrorCode::FetchTimeout => 124,
826 // A name filter that matched several entities is user-fixable by
827 // narrowing the query → `docs/ERRORS.md` §4 exit 2 ("misuse").
828 ErrorCode::Ambiguous => 2,
829 _ => 1,
830 }
831}
832
833/// Render a terminal [`FetchError`] in the `docs/ERRORS.md` §3
834/// "Researcher (CLI human)" form: `error[CODE]: message` on stderr,
835/// plus an actionable `= note:` line carrying the ADR-0023
836/// `denial_context` (attempted / expected hosts) when the failure was
837/// a denial class. stdout stays clean (ADR-0001).
838///
839/// `pub(crate)` so sibling resolve commands (`commands::link`, …) render
840/// typed failures — including the actionable denial note — through the
841/// SAME path instead of open-coding `error[CODE]: msg` and dropping the
842/// `denial_context` note (review #287).
843pub(crate) fn render_fetch_error(e: &FetchError) {
844 let code: ErrorCode = e.into();
845 print_err(format_args!("error[{}]: {}", code.as_wire(), e));
846 if let Some(dc) = Option::<DenialContext>::from(e) {
847 let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
848 match &dc.expected {
849 Some(exp) if !exp.is_empty() => {
850 print_err(format_args!(
851 " = note: attempted {attempted}; allowed: {}",
852 exp.join(", ")
853 ));
854 }
855 _ => {
856 print_err(format_args!(" = note: attempted {attempted}"));
857 }
858 }
859 }
860}
861
862/// Render a `PdfLegStatus::Blocked` outcome in the `docs/ERRORS.md` §3
863/// "Researcher (CLI human)" form. Issue #145: an OA PDF was discovered
864/// but could not be retrieved — the metadata WAS written, but this is a
865/// denial, not a clean success. We emit the same `error[CODE]:` stderr
866/// shape as [`render_fetch_error`] (so pipelines and humans see an
867/// unambiguous failure), name the metadata path that DID land so the
868/// partial result is still discoverable, and surface the ADR-0023
869/// `denial_context` note when present. stdout stays clean (ADR-0001).
870fn render_blocked_error(
871 ref_: &Ref,
872 outcome: &FetchPaperOutcome,
873 code: ErrorCode,
874 message: &str,
875 denial: Option<&DenialContext>,
876 suggested_arxiv_id: Option<&str>,
877) {
878 let label = match ref_ {
879 Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
880 Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
881 };
882 // Issue #145: when the block is a deliberate policy denial, name the
883 // closed-set reason inline so a human/agent reading the
884 // `error[CAPABILITY_DENIED]:` line immediately sees this is a
885 // supply-chain policy block (retrying is futile), not a flaky network.
886 match denial.map(|d| d.reason) {
887 Some(
888 reason @ (DenialReason::RedirectNotInAllowlist
889 | DenialReason::InsecureScheme
890 | DenialReason::HostInBlockList),
891 ) => {
892 print_err(format_args!(
893 "error[{}]: {label}: an OA PDF was found but its host is blocked by \
894 supply-chain policy ({}): {message}",
895 code.as_wire(),
896 denial_reason_wire(reason)
897 ));
898 }
899 _ => {
900 print_err(format_args!(
901 "error[{}]: {label}: an OA PDF was found but could not be retrieved: {message}",
902 code.as_wire()
903 ));
904 }
905 }
906 if let Some(dc) = denial {
907 let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
908 match &dc.expected {
909 Some(exp) if !exp.is_empty() => {
910 print_err(format_args!(
911 " = note: attempted {attempted}; allowed: {}",
912 exp.join(", ")
913 ));
914 }
915 _ => {
916 print_err(format_args!(" = note: attempted {attempted}"));
917 }
918 }
919 }
920 // The metadata TOML still landed; point the user at it so the
921 // partial result is not lost (it is still useful), without
922 // pretending the fetch succeeded.
923 print_err(format_args!(
924 " = note: metadata-only record written to {}",
925 outcome.path
926 ));
927 if let Some(arxiv_id) = suggested_arxiv_id {
928 print_err(format_args!(
929 " = suggest: Try fetching the arXiv version: doiget fetch arxiv:{}",
930 arxiv_id
931 ));
932 }
933}
934
935// ---------------------------------------------------------------------------
936// Tests
937// ---------------------------------------------------------------------------
938
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use serial_test::serial;

    /// Environment variables the tests point at a tempdir so that config
    /// resolution (`config_dir_utf8()`) lands there on every platform.
    const CONFIG_ROOT_VARS: [&str; 4] = ["XDG_CONFIG_HOME", "APPDATA", "HOME", "USERPROFILE"];

    /// The five `DOIGET_*_BASE` overrides. Any of them being set makes
    /// `build_http_client` take the test-mode builder, so tests that
    /// target the PRODUCTION branch must clear all of them.
    const BASE_OVERRIDE_VARS: [&str; 5] = [
        "DOIGET_ARXIV_BASE",
        "DOIGET_CROSSREF_BASE",
        "DOIGET_UNPAYWALL_BASE",
        "DOIGET_OA_PUBLISHER_BASE",
        "DOIGET_OPENALEX_BASE",
    ];

    /// Snapshot of one environment variable, restored on `Drop`.
    ///
    /// The previous value is captured with `var_os` rather than `var`:
    /// `var(..).ok()` reports a non-Unicode value as `None`, which would
    /// make the guard *remove* the variable on drop instead of putting
    /// the original value back.
    struct EnvGuard {
        key: &'static str,
        prev: Option<std::ffi::OsString>,
    }

    impl EnvGuard {
        /// Record the current value of `key` (or its absence).
        fn save(key: &'static str) -> Self {
            Self {
                key,
                prev: std::env::var_os(key),
            }
        }
    }

    impl Drop for EnvGuard {
        fn drop(&mut self) {
            match &self.prev {
                Some(v) => std::env::set_var(self.key, v),
                None => std::env::remove_var(self.key),
            }
        }
    }

    /// Point every config-root variable at `config_root` and clear every
    /// `DOIGET_*_BASE` override, returning guards that undo all of it
    /// when dropped.
    ///
    /// Bind the result to a named local (`_env`, not `_`) declared AFTER
    /// the tempdir, so the environment is restored before the tempdir is
    /// deleted. Callers must be `#[serial]`: the process environment is
    /// global state.
    fn force_production_env(config_root: &std::path::Path) -> Vec<EnvGuard> {
        let mut guards = Vec::with_capacity(CONFIG_ROOT_VARS.len() + BASE_OVERRIDE_VARS.len());
        for key in CONFIG_ROOT_VARS {
            guards.push(EnvGuard::save(key));
            std::env::set_var(key, config_root);
        }
        for key in BASE_OVERRIDE_VARS {
            guards.push(EnvGuard::save(key));
            std::env::remove_var(key);
        }
        guards
    }

    #[test]
    fn new_session_id_is_26_chars() {
        // ULID textual form is fixed-width 26 chars (Crockford base32).
        // `docs/PROVENANCE_LOG.md` §3 requires this exact length.
        let id = new_session_id();
        assert_eq!(id.len(), 26, "session id must be 26 chars: {:?}", id);
        // Crockford base32 uses uppercase letters and digits; specifically
        // I, L, O, U are excluded. Every char must be ASCII alphanumeric.
        assert!(
            id.chars().all(|c| c.is_ascii_alphanumeric()),
            "ulid must be ASCII alphanumeric: {:?}",
            id
        );
    }

    /// Review pass C2: end-to-end coverage of the user-extension
    /// merge inside `build_http_client`. Without this test the
    /// production path that turns a `config.toml`
    /// `[[network.additional_hosts]]` entry into a passing
    /// allowlist match is unexercised — every existing e2e sets
    /// `DOIGET_*_BASE` and short-circuits into the test-mode
    /// builder.
    #[test]
    #[serial]
    fn build_http_client_merges_user_extension_into_oa_publisher_allowlist() {
        // Construct a tempdir + minimal config.toml under it.
        let td = tempfile::TempDir::new().expect("tempdir");
        let cfg_dir = td.path().join("doiget");
        std::fs::create_dir_all(&cfg_dir).expect("mkdir doiget/");
        std::fs::write(
            cfg_dir.join("config.toml"),
            br#"
[[network.additional_hosts]]
host = "ruj.uj.edu.pl"
note = "Jagiellonian"

[[network.additional_hosts]]
host = "*.uj.edu.pl"
"#,
        )
        .expect("write config.toml");

        // Redirect config resolution to the tempdir and force the
        // production branch of `build_http_client`. Restored on drop.
        let _env = force_production_env(td.path());

        let client = build_http_client().expect("HttpClient builds");
        let oa = client
            .source_allowlist("oa-publisher")
            .expect("oa-publisher source registered");

        // Pre-existing curated allowlist still effective.
        assert!(
            oa.redirect_hosts.iter().any(|p| p == "*.aps.org"),
            "curated *.aps.org MUST still be present after merge; got {:?}",
            oa.redirect_hosts
        );
        // User-added literal host passes match.
        assert!(
            oa.matches("ruj.uj.edu.pl"),
            "literal `ruj.uj.edu.pl` from user config MUST match"
        );
        // User-added wildcard passes match for a subdomain.
        assert!(
            oa.matches("alpha.uj.edu.pl"),
            "wildcard `*.uj.edu.pl` from user config MUST match alpha.uj.edu.pl"
        );
        // Unrelated host MUST still fail.
        assert!(
            !oa.matches("ruj.uj.edu.ru"),
            "host outside the suffix MUST NOT match"
        );
    }

    /// ADR-0031 D2: discovery search (`doiget search`) ships in the default
    /// `oa-only` binary, so `api.openalex.org` MUST be on the production
    /// allowlist under the `"openalex"` source key WITHOUT `--features
    /// citation`. The Tier-2 `tier_2_allowlist()` extend is
    /// `#[cfg(feature = "citation")]`; this test proves
    /// `discovery_allowlist()` covers that gap in the shipped build.
    #[test]
    #[serial]
    fn build_http_client_registers_openalex_for_discovery() {
        // Point config resolution at an empty tempdir and clear every
        // `DOIGET_*_BASE` so `build_http_client` takes the PRODUCTION
        // branch (not the test-base builder, which would register
        // "openalex" itself and mask the gap this test guards).
        let td = tempfile::TempDir::new().expect("tempdir");
        let _env = force_production_env(td.path());

        let client = build_http_client().expect("HttpClient builds");
        let oa = client
            .source_allowlist("openalex")
            .expect("openalex source registered for discovery (ADR-0031 D2)");
        assert!(
            oa.matches("api.openalex.org"),
            "api.openalex.org MUST be on the discovery allowlist; got {:?}",
            oa.redirect_hosts
        );
    }

    // Slice 2: the `extract_crossref_fields_*` unit tests moved to
    // `doiget_core::orchestrator::tests` along with the function they
    // covered. The CLI no longer owns those helpers; the marker test
    // below keeps the CLI's `fetch::tests` non-empty after the helper
    // migration so a future regression that nukes the delegation path
    // surfaces as a build failure (the `FetchPaperOutcome` re-import
    // would stop resolving).
    #[test]
    fn fetch_paper_outcome_is_reachable_from_cli() {
        let _ = std::any::type_name::<doiget_core::orchestrator::FetchPaperOutcome>();
    }

    #[test]
    fn ambiguous_maps_to_exit_code_2() {
        // ADR-0031 D5: a name-filter ambiguity is user-fixable → exit 2,
        // distinct from the generic exit 1.
        assert_eq!(cli_exit_code(ErrorCode::Ambiguous), 2);
    }

    /// Minimal `DenialContext` carrying only `reason`; every other field
    /// is optional (ADR-0023 §3) so `None`/empty is a valid producer
    /// shape for the reclassification decision under test.
    fn denial(reason: DenialReason) -> DenialContext {
        DenialContext {
            reason,
            source: None,
            attempted: None,
            expected: None,
            hop_index: None,
            cap: None,
            actual: None,
        }
    }

    /// Issue #145 / `docs/ERRORS.md` §6.1: a policy-class denial reason
    /// on a `Blocked` OA-PDF leg must be reclassified from the core's
    /// blanket `NetworkError` to `CapabilityDenied` at the CLI layer, so
    /// the user-facing exit becomes 3 (not the generic 1) and a flaky
    /// network is not implied for a deliberate supply-chain block.
    #[test]
    fn policy_denials_reclassify_network_error_to_capability_denied() {
        for r in [
            DenialReason::RedirectNotInAllowlist,
            DenialReason::InsecureScheme,
            DenialReason::HostInBlockList,
        ] {
            let d = denial(r);
            assert_eq!(
                effective_blocked_code(ErrorCode::NetworkError, Some(&d)),
                ErrorCode::CapabilityDenied,
                "policy reason {r:?} must promote NetworkError -> CapabilityDenied"
            );
            assert_eq!(
                cli_exit_code(effective_blocked_code(ErrorCode::NetworkError, Some(&d))),
                3,
                "policy reason {r:?} must map to exit 3 (docs/ERRORS.md §4/§6.1)"
            );
        }
    }

    /// A genuine transport fault carries NO `DenialContext`; it must stay
    /// `NetworkError` / exit 1 — `docs/ERRORS.md` §2 "retry usually fine"
    /// is the correct signal there. (This is exactly the e2e
    /// `..._host_off_allowlist` path: first-leg connect failure, no
    /// redirect hop, so no allowlist denial is produced.)
    #[test]
    fn absent_denial_context_keeps_network_error() {
        assert_eq!(
            effective_blocked_code(ErrorCode::NetworkError, None),
            ErrorCode::NetworkError
        );
        assert_eq!(
            cli_exit_code(effective_blocked_code(ErrorCode::NetworkError, None)),
            1
        );
    }

    /// Non-policy denial reasons (size cap, content-type mismatch) are
    /// NOT supply-chain policy blocks; they keep the core's code so a
    /// genuine cap/transport class is not masked as a capability denial.
    #[test]
    fn non_policy_denials_keep_core_code() {
        for r in [
            DenialReason::SizeCapExceeded,
            DenialReason::ContentTypeMismatch,
        ] {
            let d = denial(r);
            assert_eq!(
                effective_blocked_code(ErrorCode::NetworkError, Some(&d)),
                ErrorCode::NetworkError,
                "non-policy reason {r:?} must NOT be reclassified"
            );
        }
    }

    /// The closed-set wire token used in the human `error[...]:` line
    /// must match the serde `snake_case` form so the CLI vocabulary does
    /// not drift from the JSON/MCP envelope (`docs/ERRORS.md` §3.1).
    #[test]
    fn denial_reason_wire_matches_serde_snake_case() {
        for r in [
            DenialReason::RedirectNotInAllowlist,
            DenialReason::InsecureScheme,
            DenialReason::HostInBlockList,
        ] {
            let serde_form = serde_json::to_string(&r).expect("serialize DenialReason");
            // serde_json wraps the enum unit variant in quotes.
            let serde_token = serde_form.trim_matches('"');
            assert_eq!(
                denial_reason_wire(r),
                serde_token,
                "CLI wire token for {r:?} must equal the serde snake_case form"
            );
        }
    }
}