doiget_cli/commands/fetch.rs
1//! `doiget fetch <ref>` subcommand.
2//!
3//! Phase 1 scope:
4//!
5//! - **arXiv refs** — full end-to-end: PDF bytes are fetched via the
6//! `doiget_core::sources::arxiv::ArxivSource`, the `[doiget]`
7//! extension table is populated with the resolved license, source,
8//! size, and `fetched_at`, and the result is written to the on-disk
9//! store with both the metadata TOML and the PDF.
10//! - **DOI refs** — Crossref metadata + Unpaywall license enrichment + an
11//! OA PDF fetch when Unpaywall's `best_oa_location.url_for_pdf` (or
12//! `best_oa_location.url`) resolves to a host on the synthetic
13//! `"oa-publisher"` allowlist (`docs/REDIRECT_ALLOWLIST.md` §3). The OA
//! URL host check is informed-best-effort; if the host is not on the
//! allowlist or the body fails the magic-byte check, the orchestrator
//! logs a `Fetch err` row under `source = "oa-publisher"` and still
//! writes the metadata record. When the orchestrator reports that PDF
//! leg as blocked, the CLI prints an `error[CODE]:` line and exits
//! non-zero instead of reporting a metadata-only success (issue #145).
18//!
19//! ## Provenance contract
20//!
21//! Per `docs/PROVENANCE_LOG.md` §3, every invocation emits at least one
22//! `SessionStart`, one or more `Fetch` rows (one per source consulted), one
23//! `StoreWrite` row on success, and one `SessionEnd`. Each `Fetch` row is
24//! appended by the underlying `Source` impl; the orchestrator owns the
25//! session-bookend rows and the `StoreWrite` row.
26//!
27//! ## Configuration surface
28//!
29//! Hard-coded paths with env-var overrides; full `config.toml` plumbing
30//! arrives in a follow-up. See `docs/CONFIG.md` for the eventual surface.
31//!
32//! | Env var | Default | Purpose |
33//! |---|---|---|
34//! | `DOIGET_STORE_ROOT` | `$HOME/papers` (or `%USERPROFILE%\papers` on Windows) | Filesystem store root |
35//! | `DOIGET_LOG_PATH` | `<config>/doiget/access.jsonl` | Provenance log file |
36//! | `DOIGET_CONTACT_EMAIL` | `doiget@localhost` | Polite-pool contact email (User-Agent and Crossref) |
37//! | `DOIGET_UNPAYWALL_EMAIL` | (= contact email) | Unpaywall query-string email |
38//! | `DOIGET_ARXIV_BASE` | `https://arxiv.org` | arXiv source base (test override) |
39//! | `DOIGET_CROSSREF_BASE` | `https://api.crossref.org` | Crossref source base (test override) |
40//! | `DOIGET_UNPAYWALL_BASE` | `https://api.unpaywall.org/v2` | Unpaywall source base (test override) |
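//! | `DOIGET_OA_PUBLISHER_BASE` | (production allowlist) | OA publisher host allowlist override (test override) |
//! | `DOIGET_OPENALEX_BASE` | (production allowlist, `citation` feature) | OpenAlex host override for the citation graph (test override) |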
42
43use std::sync::Arc;
44
45use anyhow::{anyhow, Context, Result};
46use camino::Utf8PathBuf;
47
48#[cfg(feature = "citation")]
49use doiget_core::http::tier_2_allowlist;
50use doiget_core::http::{oa_publisher_allowlist, tier_1_allowlist, HttpClient};
51use doiget_core::orchestrator::{fetch_paper as core_fetch_paper, FetchPaperOutcome, PdfLegStatus};
52use doiget_core::provenance::{Capability, LogEvent, LogResult, ProvenanceLog, RowInput};
53use doiget_core::rate_limiter::RateLimiter;
54use doiget_core::source::{FetchContext, FetchError};
55use doiget_core::store::FsStore;
56use doiget_core::{CapabilityProfile, DenialContext, DenialReason, ErrorCode, RateLimits, Ref};
57
58/// Defer to docs/PROVENANCE_LOG.md §3: 26-char ULID per process invocation.
59fn new_session_id() -> String {
60 ulid::Ulid::new().to_string()
61}
62
63// ---------------------------------------------------------------------------
64// Dry-run plan / preview (ADR-0022)
65// ---------------------------------------------------------------------------
66
67// The structured `FetchPlan` shape, the `build_fetch_plan` builder, and
68// the `build_dry_run_envelope` JSON-shape helper live in `doiget-core`
69// so the MCP server can produce a bit-identical envelope without
70// depending on `doiget-cli`. The CLI re-exports them here for callers
71// that already `use doiget_cli::commands::fetch`.
72pub use doiget_core::dry_run::{
73 build_dry_run_envelope, build_fetch_plan, FetchPlan, PdfSourcePlan, RateLimitBudget,
74};
75
76/// Serialize the dry-run envelope and write it to stdout. Used by the
77/// `--dry-run` flag on `doiget fetch` and `doiget batch`. The envelope
78/// shape matches ADR-0022 §1 / `docs/MCP_TOOLS.md` §10.
79///
80/// `pub` so `commands::batch` (multi-ref dry-run) can reuse it. The
81/// function lives in `doiget-cli` (not `doiget-core`) because `println!`
82/// is a CLI concern; the MCP server uses [`build_dry_run_envelope`]
83/// directly and routes the bytes via JSON-RPC.
84///
85/// `print_stdout` is workspace-deny for MCP stdio safety (ADR-0001 /
86/// `docs/SECURITY.md` §3); `--dry-run` is a CLI-only path that never
87/// runs under the MCP server, so the localized `#[allow]` is the
88/// minimal intervention — same pattern used by `commands::config`,
89/// `commands::info`, etc.
90#[allow(clippy::print_stdout)]
91pub fn emit_dry_run_plan_to_stdout(ref_: &Ref, plan: &FetchPlan) -> Result<()> {
92 let envelope = build_dry_run_envelope(ref_, plan);
93 let s = serde_json::to_string(&envelope).context("serializing dry-run envelope to JSON")?;
94 println!("{s}");
95 Ok(())
96}
97
98/// Resolve the provenance log path. `DOIGET_LOG_PATH` wins; otherwise
99/// fall back to `<config>/doiget/access.jsonl` per `docs/PROVENANCE_LOG.md`
100/// §1.
101fn resolve_log_path() -> Result<Utf8PathBuf> {
102 if let Some(s) = read_env_utf8("DOIGET_LOG_PATH")? {
103 return Ok(Utf8PathBuf::from(s));
104 }
105 let cfg = config_dir_utf8()?;
106 Ok(cfg.join("doiget").join("access.jsonl"))
107}
108
109/// Read an env var and assert it is valid UTF-8. Returns `Ok(None)` if
110/// unset; `Ok(Some(s))` if set and UTF-8; `Err(...)` if set but non-UTF-8.
111/// `std::env::var` already requires UTF-8 (returns `VarError::NotUnicode`
112/// otherwise); we wrap it to surface a friendlier error and avoid the
113/// banned `std::path::PathBuf` round-trip.
114fn read_env_utf8(key: &str) -> Result<Option<String>> {
115 match std::env::var(key) {
116 Ok(s) => Ok(Some(s)),
117 Err(std::env::VarError::NotPresent) => Ok(None),
118 Err(std::env::VarError::NotUnicode(_)) => Err(anyhow!("{key} is not valid UTF-8")),
119 }
120}
121
122/// Best-effort home-dir resolution without depending on the `dirs` crate
123/// (every new dep adds cargo-vet exemption churn). Honors `HOME` first
124/// (POSIX + most CI), then `USERPROFILE` (Windows).
125fn home_dir_utf8() -> Result<Utf8PathBuf> {
126 if let Some(s) = read_env_utf8("HOME")? {
127 return Ok(Utf8PathBuf::from(s));
128 }
129 if let Some(s) = read_env_utf8("USERPROFILE")? {
130 return Ok(Utf8PathBuf::from(s));
131 }
132 Err(anyhow!("neither HOME nor USERPROFILE is set"))
133}
134
135/// Best-effort config-dir resolution. Honors `XDG_CONFIG_HOME` first
136/// (POSIX), then `APPDATA` (Windows), then falls back to `$HOME/.config`.
137fn config_dir_utf8() -> Result<Utf8PathBuf> {
138 if let Some(s) = read_env_utf8("XDG_CONFIG_HOME")? {
139 return Ok(Utf8PathBuf::from(s));
140 }
141 if let Some(s) = read_env_utf8("APPDATA")? {
142 return Ok(Utf8PathBuf::from(s));
143 }
144 let home = home_dir_utf8()?;
145 Ok(home.join(".config"))
146}
147
148/// Construct the workspace-wide [`HttpClient`].
149///
150/// Production path: `HttpClient::new(tier_1_allowlist() ∪ oa_publisher_allowlist())` —
151/// strict HTTPS-only with the canonical Tier-1 redirect allowlist (Crossref,
152/// Unpaywall, arXiv) plus the synthetic `"oa-publisher"` allowlist used for
153/// the OA PDF leg of the DOI fetch path (`fetch_doi` issues
154/// `HttpClient::fetch_pdf("oa-publisher", url)` against the URL Unpaywall
155/// returned in `best_oa_location`). The OA-publisher list is
156/// informed-best-effort per `docs/REDIRECT_ALLOWLIST.md` §3.
157///
158/// Test path: when any of the three `DOIGET_*_BASE` env vars is set, build a
159/// multi-source relaxed-`https_only` client whose per-source allowlist is
160/// derived from the corresponding env-var hosts. The `oa-publisher` source
161/// key is registered against the same host (typically the wiremock origin)
162/// when `DOIGET_OA_PUBLISHER_BASE` is set — this lets the integration tests
163/// under `tests/fetch_doi_oa_pdf_e2e.rs` exercise the full PDF leg without
164/// touching the real network.
165fn build_http_client() -> Result<HttpClient> {
166 let arxiv = std::env::var("DOIGET_ARXIV_BASE").ok();
167 let crossref = std::env::var("DOIGET_CROSSREF_BASE").ok();
168 let unpaywall = std::env::var("DOIGET_UNPAYWALL_BASE").ok();
169 let oa_publisher = std::env::var("DOIGET_OA_PUBLISHER_BASE").ok();
170 // Slice 16: `DOIGET_OPENALEX_BASE` selects a wiremock host for the
171 // citation-graph BFS. Only meaningful with `--features citation`,
172 // but reading the env unconditionally keeps the branch logic
173 // simple and is harmless for default builds.
174 let openalex_base = std::env::var("DOIGET_OPENALEX_BASE").ok();
175
176 if arxiv.is_none()
177 && crossref.is_none()
178 && unpaywall.is_none()
179 && oa_publisher.is_none()
180 && openalex_base.is_none()
181 {
182 let mut allowlists = tier_1_allowlist();
183 allowlists.extend(oa_publisher_allowlist());
184 // Slice 16: when the `citation` feature is compiled in, the
185 // graph subcommand walks OpenAlex Work IDs via
186 // `ctx.http.fetch_bytes("openalex", ...)`. The Tier 2
187 // allowlist registers the `api.openalex.org` host under
188 // that source key. CapabilityProfile.metadata.openalex is
189 // the runtime gate; the allowlist is the transport gate.
190 #[cfg(feature = "citation")]
191 allowlists.extend(tier_2_allowlist());
192 return HttpClient::new(allowlists).context("building HTTP client");
193 }
194
195 // Test-base mode: build a relaxed client per overridden source.
196 let mut owned: Vec<(String, String)> = Vec::new();
197 for (source, base) in [
198 ("arxiv", arxiv.as_deref()),
199 ("crossref", crossref.as_deref()),
200 ("unpaywall", unpaywall.as_deref()),
201 ("oa-publisher", oa_publisher.as_deref()),
202 ("openalex", openalex_base.as_deref()),
203 ] {
204 if let Some(b) = base {
205 let url = url::Url::parse(b)
206 .with_context(|| format!("DOIGET_*_BASE for {source} is not a URL: {b}"))?;
207 let host = url
208 .host_str()
209 .ok_or_else(|| anyhow!("base URL has no host: {b}"))?;
210 owned.push((source.to_string(), host.to_string()));
211 }
212 }
213 let entries: Vec<(&str, &str)> = owned
214 .iter()
215 .map(|(s, h)| (s.as_str(), h.as_str()))
216 .collect();
217 Ok(HttpClient::new_for_tests_allow_http_multi(&entries))
218}
219
220// Slice 2: the per-source env-aware constructors that used to live here
221// (`build_arxiv_source`, `build_crossref_source`, `build_unpaywall_source`)
222// moved into `doiget-core::orchestrator` so the core `fetch_paper`
223// orchestrator and the MCP server both honor the same `DOIGET_*_BASE`
224// test-override surface. The CLI no longer constructs sources directly —
225// it builds the `FetchContext` + `FsStore` and hands them to the core
226// orchestrator.
227
228/// Resolved configuration derived from the environment.
229///
230/// Slice 2: `contact_email` / `unpaywall_email` are now read by the
231/// `doiget-core::orchestrator::fetch_paper` orchestrator directly from
232/// the env (`contact_email_from_env` / `unpaywall_email_from_env` in
233/// that module), so the CLI no longer threads them through. The fields
234/// stay here so a future slice that adds CLI-flag overrides has a
235/// natural attachment point — the `#[allow(dead_code)]` is the minimal
236/// intervention until that slice lands.
237#[allow(dead_code)]
238pub(crate) struct OrchestratorConfig {
239 pub(crate) store_root: Utf8PathBuf,
240 pub(crate) log_path: Utf8PathBuf,
241 pub(crate) contact_email: String,
242 pub(crate) unpaywall_email: String,
243}
244
245impl OrchestratorConfig {
246 fn from_env() -> Result<Self> {
247 let store_root = super::resolve_store_root()?;
248 let log_path = resolve_log_path()?;
249 let contact_email =
250 std::env::var("DOIGET_CONTACT_EMAIL").unwrap_or_else(|_| "doiget@localhost".into());
251 let unpaywall_email =
252 std::env::var("DOIGET_UNPAYWALL_EMAIL").unwrap_or_else(|_| contact_email.clone());
253 Ok(Self {
254 store_root,
255 log_path,
256 contact_email,
257 unpaywall_email,
258 })
259 }
260}
261
262/// Reusable fetch harness shared by `doiget fetch <ref>` (single ref) and
263/// `doiget batch <path>` (many refs). Owns the shared foundation modules
264/// (`HttpClient` / `RateLimiter` / `ProvenanceLog`), the on-disk store, and
265/// the resolved capability profile, plus the session bookkeeping required by
266/// `docs/PROVENANCE_LOG.md` §3 (the 26-char ULID `session_id`).
267///
268/// Construction is performed once via [`FetchHarness::from_env`]. Per-ref
269/// orchestration runs through [`FetchHarness::fetch_one`]; bookend rows go
270/// via [`FetchHarness::log_session_start`] / [`FetchHarness::log_session_end`]
271/// so the orchestrator can frame either one fetch or many.
272pub(crate) struct FetchHarness {
273 pub(crate) http: Arc<HttpClient>,
274 pub(crate) rate_limiter: Arc<RateLimiter>,
275 pub(crate) log: Arc<ProvenanceLog>,
276 pub(crate) store: FsStore,
277 pub(crate) profile: CapabilityProfile,
278 pub(crate) session_id: String,
279 /// Resolved config; Slice 2 keeps this on the harness for the
280 /// CLI-only env diagnostics path (`commands::config::doctor`), even
281 /// though `fetch_one` no longer needs it (the core orchestrator
282 /// re-reads contact email from env directly).
283 #[allow(dead_code)]
284 pub(crate) cfg: OrchestratorConfig,
285}
286
287impl FetchHarness {
288 /// Build a harness from the same env-var surface documented at the top
289 /// of this module. Creates the log parent directory if missing, opens
290 /// the provenance log (allocating a fresh `session_id`), and constructs
291 /// the HTTP client honoring `DOIGET_*_BASE` overrides for tests.
292 pub(crate) fn from_env() -> Result<Self> {
293 let cfg = OrchestratorConfig::from_env()?;
294 if let Some(parent) = cfg.log_path.parent() {
295 if !parent.as_str().is_empty() {
296 std::fs::create_dir_all(parent.as_std_path())
297 .with_context(|| format!("creating log dir {parent}"))?;
298 }
299 }
300 let session_id = new_session_id();
301 let log = Arc::new(
302 ProvenanceLog::open(cfg.log_path.clone(), session_id.clone())
303 .context("opening provenance log")?,
304 );
305 let http = Arc::new(build_http_client()?);
306 let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
307 let store = FsStore::new(cfg.store_root.clone()).context("opening store")?;
308 let profile = CapabilityProfile::from_env().context("resolving capability profile")?;
309
310 Ok(Self {
311 http,
312 rate_limiter,
313 log,
314 store,
315 profile,
316 session_id,
317 cfg,
318 })
319 }
320
321 /// Build a [`FetchContext`] view over this harness's foundation modules.
322 /// Creating one is cheap (cloning three `Arc`s + a `String`); per-ref
323 /// orchestration constructs one on demand.
324 pub(crate) fn fetch_context(&self) -> FetchContext {
325 FetchContext {
326 http: self.http.clone(),
327 rate_limiter: self.rate_limiter.clone(),
328 log: self.log.clone(),
329 session_id: self.session_id.clone(),
330 }
331 }
332
333 /// Append a `SessionStart` row. `ref_input` is the raw user-supplied ref
334 /// string (single-fetch path); pass `None` for batch sessions where no
335 /// single ref attributes the session.
336 pub(crate) fn log_session_start(&self, ref_input: Option<&str>) -> Result<()> {
337 self.log
338 .append(RowInput {
339 event: LogEvent::SessionStart,
340 result: LogResult::Ok,
341 capability: Capability::Oa,
342 ref_: ref_input,
343 source: None,
344 error_code: None,
345 size_bytes: None,
346 license: None,
347 store_path: None,
348 // Session bookend — no audit identity (ADR-0021 §1).
349 canonical_digest: None,
350 })
351 .context("appending SessionStart row")?;
352 Ok(())
353 }
354
355 /// Append a `SessionEnd` row. `ref_input` mirrors the `log_session_start`
356 /// argument; pass `None` for batch sessions. The result is best-effort —
357 /// if this append fails, the caller already has the underlying fetch
358 /// error (if any) and we don't override it.
359 pub(crate) fn log_session_end(&self, ok: bool, ref_input: Option<&str>) {
360 let result = if ok { LogResult::Ok } else { LogResult::Err };
361 let _ = self.log.append(RowInput {
362 event: LogEvent::SessionEnd,
363 result,
364 capability: Capability::Oa,
365 ref_: ref_input,
366 source: None,
367 error_code: None,
368 size_bytes: None,
369 license: None,
370 store_path: None,
371 // Session bookend — no audit identity (ADR-0021 §1).
372 canonical_digest: None,
373 });
374 }
375
376 /// Run a single ref through the per-kind orchestration (arxiv → PDF +
377 /// metadata; doi → metadata-only via Crossref + Unpaywall, with an
378 /// informed-best-effort OA PDF leg). Errors here are scoped to this
379 /// one ref — the caller decides whether to abort the surrounding
380 /// session.
381 ///
382 /// Slice 2: delegates to
383 /// [`doiget_core::orchestrator::fetch_paper`] for the actual work
384 /// (which both CLI and MCP now share). This function keeps the
385 /// CLI-only stderr success-line print.
386 pub(crate) async fn fetch_one(&self, ref_: &Ref) -> Result<()> {
387 let ctx = self.fetch_context();
388 match core_fetch_paper(ref_, &self.profile, &ctx, &self.store, self.store.root()).await {
389 Ok(outcome) => {
390 // Issue #145 / `docs/ERRORS.md` §3 + §6: a `Blocked` PDF
391 // leg means an OA PDF *was* discovered but could not be
392 // retrieved. Metadata was written, but this is NOT a
393 // clean success — emitting only `print_success` here made
394 // a denied PDF visually indistinguishable from a genuine
395 // metadata-only result and exited 0 (a silent failure).
396 // Route it through the SAME `error[CODE]:` stderr channel
397 // + `cli_exit_code(...)` mapping `fetch` already uses for
398 // `CapabilityDenied`, using the structured code the
399 // orchestrator mapped from the underlying transport error.
400 if let PdfLegStatus::Blocked {
401 code,
402 message,
403 denial,
404 } = &outcome.pdf_leg
405 {
406 // Issue #145 / `docs/ERRORS.md` §2 + §3: the core
407 // collapses *every* `FetchError::Http(_)` to
408 // `ErrorCode::NetworkError` (see
409 // `doiget_core::source.rs`'s `From<&FetchError> for
410 // ErrorCode`). For a genuine transport/DNS/TLS fault
411 // that is correct ("retry usually fine"). But an
412 // off-allowlist / redirect-denied / insecure-scheme OA
413 // PDF leg is a DELIBERATE policy block — retrying never
414 // helps — yet it would otherwise surface as
415 // `NETWORK_ERROR` → generic exit 1, misrepresenting a
416 // flaky network. The orchestrator already preserves the
417 // real reason on `denial` (the `From<&HttpError> for
418 // Option<DenialContext>` impl walks reqwest's source
419 // chain, so even a redirect denial wrapped as
420 // `HttpError::Network` still yields
421 // `DenialReason::RedirectNotInAllowlist`). Reclassify
422 // the *policy* denials here at the CLI layer to
423 // `CapabilityDenied` → `error[CAPABILITY_DENIED]:` →
424 // exit 3 (the same code `fetch`/`graph` already use for
425 // `ErrorCode::CapabilityDenied`).
426 let effective = effective_blocked_code(*code, denial.as_ref());
427 render_blocked_error(ref_, &outcome, effective, message, denial.as_ref());
428 return Err(anyhow::Error::new(CliExit(cli_exit_code(effective))));
429 }
430 emit_success_line(ref_, &outcome);
431 Ok(())
432 }
433 Err(e) => {
434 // Issue #119: render the cargo-style `error[CODE]:`
435 // line + denial note HERE (while the error is still
436 // typed), then carry only the exit code to `main`.
437 render_fetch_error(&e);
438 let code: ErrorCode = (&e).into();
439 Err(anyhow::Error::new(CliExit(cli_exit_code(code))))
440 }
441 }
442 }
443}
444
445/// CLI-only one-line success message on stderr (ADR-0001 stdio
446/// convention). Renders the [`FetchPaperOutcome`] in the same form the
447/// pre-Slice-2 CLI emitted: a full-PDF success names the PDF path; a
448/// metadata-only DOI fallback (size_bytes == 0) names the metadata TOML
449/// path the orchestrator wrote.
450fn emit_success_line(ref_: &Ref, outcome: &FetchPaperOutcome) {
451 let label = match ref_ {
452 Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
453 Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
454 };
455 match &outcome.pdf_leg {
456 PdfLegStatus::Fetched => {
457 print_success(format_args!(
458 "fetched {} ({} bytes) -> {}",
459 label, outcome.size_bytes, outcome.path
460 ));
461 }
462 PdfLegStatus::NoOaUrl => {
463 print_success(format_args!(
464 "fetched {} (metadata-only: no OA PDF available) -> {}",
465 label, outcome.path
466 ));
467 }
468 // Issue #145: `Blocked` is NO LONGER a success outcome. It is
469 // intercepted in `fetch_one` BEFORE `emit_success_line` is
470 // called and rendered via `render_blocked_error` with a
471 // non-zero exit (`docs/ERRORS.md` §3/§6 — no silent failures).
472 // Reaching this arm would mean the interception regressed, so we
473 // fail closed: surface the `error[CODE]:` line here too rather
474 // than printing a misleading success line.
475 PdfLegStatus::Blocked {
476 code,
477 message,
478 denial,
479 } => {
480 // Same #145 reclassification as the primary interception in
481 // `fetch_one`, so this fail-closed fallback stays consistent.
482 let effective = effective_blocked_code(*code, denial.as_ref());
483 render_blocked_error(ref_, outcome, effective, message, denial.as_ref());
484 }
485 // `PdfLegStatus` is `#[non_exhaustive]`; a future variant
486 // degrades to the size-based wording rather than failing the
487 // downstream-crate build.
488 _ => {
489 if outcome.size_bytes == 0 {
490 print_success(format_args!(
491 "fetched {} (metadata-only) -> {}",
492 label, outcome.path
493 ));
494 } else {
495 print_success(format_args!(
496 "fetched {} ({} bytes) -> {}",
497 label, outcome.size_bytes, outcome.path
498 ));
499 }
500 }
501 }
502}
503
504/// Run the `doiget fetch <ref>` subcommand.
505///
506/// `dry_run` (ADR-0022 §1): when `true`, build a [`FetchPlan`] from the
507/// parsed [`Ref`] and the configured store root, serialize it as JSON to
508/// stdout, and return `Ok(())` immediately, **without** building a
509/// `FetchHarness` (no provenance log open), without contacting the
510/// network, without writing to the store, and without appending a
511/// provenance row.
512///
513/// When `dry_run` is `false`, the function runs the normal end-to-end
514/// orchestration path: open the provenance log, dispatch the per-kind
515/// orchestrator, emit a `SessionStart` / `SessionEnd` bookend pair.
516///
517/// On success returns `Ok(())` and writes a one-line success message to
518/// stderr (per ADR-0001 stdio convention — no stdout writes from `fetch`
519/// on the normal path). On failure, returns an `anyhow::Error` and emits
520/// a `SessionEnd` row with `result=err` to the provenance log before
521/// returning.
522///
523/// # History
524///
525/// Slice 5 (PR #84 advisory item A2/A3 refactor): the previous
526/// `FetchOptions { dry_run: bool }` single-field option bundle plus the
527/// thin `run(input)` backwards-compat wrapper were collapsed into this
528/// single `dry_run: bool` parameter — the option bundle's single-bool
529/// shape was YAGNI, and the wrapper only existed to spare integration
530/// tests a `FetchOptions::default()` literal.
531pub async fn run_with_options(input: String, dry_run: bool) -> Result<()> {
532 // Step 1: parse + safekey. Issue #119: render the cargo-style
533 // `error[INVALID_REF]:` line + carry the exit code, rather than
534 // letting the granular `RefParseError` fall out as an opaque
535 // anyhow `{:?}` dump.
536 let ref_ = match Ref::parse(&input) {
537 Ok(r) => r,
538 Err(e) => {
539 print_err(format_args!(
540 "error[{}]: invalid ref: {e}",
541 ErrorCode::InvalidRef.as_wire()
542 ));
543 return Err(anyhow::Error::new(CliExit(cli_exit_code(
544 ErrorCode::InvalidRef,
545 ))));
546 }
547 };
548
549 // Dry-run branch: build the plan and emit it. NO harness, NO network,
550 // NO store write, NO provenance row. Posture-lint ADR-0022 §5 will
551 // verify this branch never reaches `HttpClient::fetch_*`,
552 // `FsStore::write_*`, or `ProvenanceLog::append`.
553 if dry_run {
554 // Resolve store root for path projections. Failures here surface
555 // as a normal CLI error (not as a denial) — same behaviour the
556 // non-dry-run path would exhibit on a misconfigured environment.
557 let store_root = super::resolve_store_root()?;
558 let plan = build_fetch_plan(&ref_, &store_root);
559 emit_dry_run_plan_to_stdout(&ref_, &plan)?;
560 return Ok(());
561 }
562
563 // Step 2: build harness (foundation modules + provenance log).
564 let harness = FetchHarness::from_env()?;
565
566 // Step 3: emit SessionStart. Fail-closed if the log write fails — the
567 // surrounding fetch MUST NOT proceed (`docs/PROVENANCE_LOG.md` §5).
568 harness.log_session_start(Some(ref_.as_input_str()))?;
569
570 // Step 4: dispatch on ref kind.
571 let result = harness.fetch_one(&ref_).await;
572
573 // Step 5: emit SessionEnd regardless of outcome. Best-effort: if this
574 // append also fails, surface the underlying fetch error (or a fresh one
575 // if the fetch was Ok).
576 harness.log_session_end(result.is_ok(), Some(ref_.as_input_str()));
577
578 result
579}
580
/// Write the single-line, user-visible success message to stderr. Per
/// ADR-0001 (stdio convention) the CLI never writes a success line to
/// stdout. This is the one spot where `eprintln!` is intentional; the
/// workspace `clippy::print_stderr` lint is `warn`, so a localized
/// `#[allow]` is the minimal intervention.
#[allow(clippy::print_stderr)]
fn print_success(args: std::fmt::Arguments<'_>) {
    eprintln!("{}", args);
}
590
/// Write one of the `docs/ERRORS.md` §3 human-error lines to stderr. The
/// counterpart of [`print_success`]; as there, a localized `#[allow]` is
/// the minimal intervention for the workspace `clippy::print_stderr`
/// lint.
#[allow(clippy::print_stderr)]
fn print_err(args: std::fmt::Arguments<'_>) {
    eprintln!("{}", args);
}
598
/// Error value whose only payload is a `docs/ERRORS.md` §4 process exit
/// code, used to hand that code from a CLI command up to `main`.
///
/// `main` owns the actual `std::process::exit`; calling it from inside
/// `run_with_options` would kill in-process integration tests. By the
/// time a `CliExit` is constructed, the human-readable `error[CODE]: …`
/// line has ALREADY been written to stderr by `render_fetch_error`, so
/// `main` must NOT print it again. Issue #119.
#[derive(Debug)]
pub struct CliExit(pub i32);

impl std::fmt::Display for CliExit {
    /// Renders as `exiting with status <code>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(status) = self;
        write!(f, "exiting with status {}", status)
    }
}

impl std::error::Error for CliExit {}
615
616/// Reclassify a `PdfLegStatus::Blocked` code at the CLI layer (issue
617/// #145 / `docs/ERRORS.md` §2 "NETWORK_ERROR" vs §3.1 / §6).
618///
619/// The core maps *every* `FetchError::Http(_)` to
620/// [`ErrorCode::NetworkError`] (`doiget_core::source`'s
621/// `From<&FetchError> for ErrorCode`). `docs/ERRORS.md` §2 defines
622/// `NETWORK_ERROR` as a transport / DNS / TLS fault where "retry usually
623/// fine" — true for a real network blip, but **false** for a deliberate
624/// supply-chain policy block (off-allowlist redirect, insecure-scheme
625/// redirect, host-blocklist hit): retrying such a block never helps, so
626/// surfacing it as `NETWORK_ERROR` (generic exit 1) misrepresents a flaky
627/// network to humans and agents.
628///
629/// The orchestrator already preserves the true reason on the
630/// [`DenialContext`] side-channel (the `From<&HttpError> for
631/// Option<DenialContext>` impl walks reqwest's `source()` chain, so even
632/// a redirect denial wrapped as `HttpError::Network` still yields
633/// [`DenialReason::RedirectNotInAllowlist`]). When that reason is one of
634/// the closed-set *policy* denials, promote the surface code to
635/// [`ErrorCode::CapabilityDenied`] so the CLI renders
636/// `error[CAPABILITY_DENIED]:` and [`cli_exit_code`] returns exit 3 —
637/// the same code `fetch` / `graph` already use for capability denials.
638/// Non-policy blocks (no `denial`, or a non-policy reason such as
639/// `SizeCapExceeded` / `ContentTypeMismatch`) keep the core's code so a
640/// genuine transport failure still reads as `NETWORK_ERROR`.
641fn effective_blocked_code(code: ErrorCode, denial: Option<&DenialContext>) -> ErrorCode {
642 match denial.map(|d| d.reason) {
643 Some(
644 DenialReason::RedirectNotInAllowlist
645 | DenialReason::InsecureScheme
646 | DenialReason::HostInBlockList,
647 ) => ErrorCode::CapabilityDenied,
648 _ => code,
649 }
650}
651
652/// Snake-case wire token for a [`DenialReason`], matching the
653/// `#[serde(rename_all = "snake_case")]` JSON/MCP surface (ADR-0023 §2)
654/// so the CLI human line uses the SAME vocabulary as the machine
655/// envelope (`docs/ERRORS.md` §3.1). Only the policy-denial reasons the
656/// CLI inlines are enumerated; everything else degrades to a generic
657/// token rather than drifting from the serde form.
658fn denial_reason_wire(reason: DenialReason) -> &'static str {
659 match reason {
660 DenialReason::RedirectNotInAllowlist => "redirect_not_in_allowlist",
661 DenialReason::InsecureScheme => "insecure_scheme",
662 DenialReason::HostInBlockList => "host_in_block_list",
663 _ => "policy_denied",
664 }
665}
666
667/// `docs/ERRORS.md` §4 closed-code → process exit code. Anything not
668/// individually listed falls under "at least one fetch failed" (1).
669///
670/// `pub(crate)` so sibling subcommands (`commands::graph`, …) route
671/// their typed denials through the SAME centralized mapping instead of
672/// open-coding magic exit numbers — keeps the `ErrorCode`→exit contract
673/// single-sourced (issue #149).
674pub(crate) fn cli_exit_code(code: ErrorCode) -> i32 {
675 match code {
676 ErrorCode::CapabilityDenied => 3,
677 ErrorCode::StoreError | ErrorCode::LogError => 4,
678 ErrorCode::FetchTimeout => 124,
679 _ => 1,
680 }
681}
682
683/// Render a terminal [`FetchError`] in the `docs/ERRORS.md` §3
684/// "Researcher (CLI human)" form: `error[CODE]: message` on stderr,
685/// plus an actionable `= note:` line carrying the ADR-0023
686/// `denial_context` (attempted / expected hosts) when the failure was
687/// a denial class. stdout stays clean (ADR-0001).
688fn render_fetch_error(e: &FetchError) {
689 let code: ErrorCode = e.into();
690 print_err(format_args!("error[{}]: {}", code.as_wire(), e));
691 if let Some(dc) = Option::<DenialContext>::from(e) {
692 let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
693 match &dc.expected {
694 Some(exp) if !exp.is_empty() => {
695 print_err(format_args!(
696 " = note: attempted {attempted}; allowed: {}",
697 exp.join(", ")
698 ));
699 }
700 _ => {
701 print_err(format_args!(" = note: attempted {attempted}"));
702 }
703 }
704 }
705}
706
707/// Render a `PdfLegStatus::Blocked` outcome in the `docs/ERRORS.md` §3
708/// "Researcher (CLI human)" form. Issue #145: an OA PDF was discovered
709/// but could not be retrieved — the metadata WAS written, but this is a
710/// denial, not a clean success. We emit the same `error[CODE]:` stderr
711/// shape as [`render_fetch_error`] (so pipelines and humans see an
712/// unambiguous failure), name the metadata path that DID land so the
713/// partial result is still discoverable, and surface the ADR-0023
714/// `denial_context` note when present. stdout stays clean (ADR-0001).
715fn render_blocked_error(
716 ref_: &Ref,
717 outcome: &FetchPaperOutcome,
718 code: ErrorCode,
719 message: &str,
720 denial: Option<&DenialContext>,
721) {
722 let label = match ref_ {
723 Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
724 Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
725 };
726 // Issue #145: when the block is a deliberate policy denial, name the
727 // closed-set reason inline so a human/agent reading the
728 // `error[CAPABILITY_DENIED]:` line immediately sees this is a
729 // supply-chain policy block (retrying is futile), not a flaky network.
730 match denial.map(|d| d.reason) {
731 Some(
732 reason @ (DenialReason::RedirectNotInAllowlist
733 | DenialReason::InsecureScheme
734 | DenialReason::HostInBlockList),
735 ) => {
736 print_err(format_args!(
737 "error[{}]: {label}: an OA PDF was found but its host is blocked by \
738 supply-chain policy ({}): {message}",
739 code.as_wire(),
740 denial_reason_wire(reason)
741 ));
742 }
743 _ => {
744 print_err(format_args!(
745 "error[{}]: {label}: an OA PDF was found but could not be retrieved: {message}",
746 code.as_wire()
747 ));
748 }
749 }
750 if let Some(dc) = denial {
751 let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
752 match &dc.expected {
753 Some(exp) if !exp.is_empty() => {
754 print_err(format_args!(
755 " = note: attempted {attempted}; allowed: {}",
756 exp.join(", ")
757 ));
758 }
759 _ => {
760 print_err(format_args!(" = note: attempted {attempted}"));
761 }
762 }
763 }
764 // The metadata TOML still landed; point the user at it so the
765 // partial result is not lost (it is still useful), without
766 // pretending the fetch succeeded.
767 print_err(format_args!(
768 " = note: metadata-only record written to {}",
769 outcome.path
770 ));
771}
772
773// ---------------------------------------------------------------------------
774// Tests
775// ---------------------------------------------------------------------------
776
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    /// Denial reasons the tests below treat as supply-chain policy
    /// blocks (as opposed to size-cap / content-type denials).
    const POLICY_REASONS: [DenialReason; 3] = [
        DenialReason::RedirectNotInAllowlist,
        DenialReason::InsecureScheme,
        DenialReason::HostInBlockList,
    ];

    /// Build a `DenialContext` that carries only `reason`. Every other
    /// field is optional (ADR-0023 §3), so leaving them `None` is a valid
    /// producer shape for the reclassification decision under test.
    fn denial(reason: DenialReason) -> DenialContext {
        DenialContext {
            reason,
            source: None,
            attempted: None,
            expected: None,
            hop_index: None,
            cap: None,
            actual: None,
        }
    }

    #[test]
    fn new_session_id_is_26_chars() {
        // `docs/PROVENANCE_LOG.md` §3 requires the fixed-width ULID text
        // form: exactly 26 Crockford base32 characters.
        let id = new_session_id();
        assert_eq!(id.len(), 26, "session id must be 26 chars: {:?}", id);
        // Crockford base32 is uppercase letters and digits (minus I, L,
        // O, U); here we only require every byte to be ASCII
        // alphanumeric, which also rules out any non-ASCII character.
        let all_alphanumeric = id.bytes().all(|b| b.is_ascii_alphanumeric());
        assert!(
            all_alphanumeric,
            "ulid must be ASCII alphanumeric: {:?}",
            id
        );
    }

    // Slice 2: the `extract_crossref_fields_*` unit tests now live in
    // `doiget_core::orchestrator::tests`, next to the function they
    // cover. This marker test keeps the CLI's `fetch::tests` non-empty
    // after that migration: if a future change removes the delegation
    // path, the `FetchPaperOutcome` path below stops resolving and the
    // regression shows up as a build failure.
    #[test]
    fn fetch_paper_outcome_is_reachable_from_cli() {
        let _ = std::any::type_name::<doiget_core::orchestrator::FetchPaperOutcome>();
    }

    /// Issue #145 / `docs/ERRORS.md` §6.1: on a `Blocked` OA-PDF leg, a
    /// policy-class denial reason must be promoted at the CLI layer from
    /// the core's blanket `NetworkError` to `CapabilityDenied`. The
    /// user-facing exit then becomes 3 (not the generic 1), and a
    /// deliberate supply-chain block no longer reads as a flaky network.
    #[test]
    fn policy_denials_reclassify_network_error_to_capability_denied() {
        for r in POLICY_REASONS {
            let d = denial(r);
            let promoted = effective_blocked_code(ErrorCode::NetworkError, Some(&d));
            assert_eq!(
                promoted,
                ErrorCode::CapabilityDenied,
                "policy reason {r:?} must promote NetworkError -> CapabilityDenied"
            );
            assert_eq!(
                cli_exit_code(promoted),
                3,
                "policy reason {r:?} must map to exit 3 (docs/ERRORS.md §4/§6.1)"
            );
        }
    }

    /// A genuine transport fault carries NO `DenialContext`, so it must
    /// remain `NetworkError` / exit 1: `docs/ERRORS.md` §2 "retry usually
    /// fine" is the right signal there. (This is exactly the e2e
    /// `..._host_off_allowlist` path: the first leg fails to connect, no
    /// redirect hop happens, and therefore no allowlist denial exists.)
    #[test]
    fn absent_denial_context_keeps_network_error() {
        let code = effective_blocked_code(ErrorCode::NetworkError, None);
        assert_eq!(code, ErrorCode::NetworkError);
        assert_eq!(cli_exit_code(code), 1);
    }

    /// Non-policy denial reasons (size cap, content-type mismatch) are
    /// NOT supply-chain policy blocks. They keep the core's code so that
    /// a genuine cap/transport class is not disguised as a capability
    /// denial.
    #[test]
    fn non_policy_denials_keep_core_code() {
        let non_policy = [
            DenialReason::SizeCapExceeded,
            DenialReason::ContentTypeMismatch,
        ];
        for r in non_policy {
            let d = denial(r);
            assert_eq!(
                effective_blocked_code(ErrorCode::NetworkError, Some(&d)),
                ErrorCode::NetworkError,
                "non-policy reason {r:?} must NOT be reclassified"
            );
        }
    }

    /// The closed-set wire token printed in the human `error[...]:` line
    /// must equal the serde `snake_case` form, so the CLI vocabulary
    /// cannot drift from the JSON/MCP envelope (`docs/ERRORS.md` §3.1).
    #[test]
    fn denial_reason_wire_matches_serde_snake_case() {
        for r in POLICY_REASONS {
            // serde_json renders a unit variant as a quoted string;
            // strip the quotes to get the bare token.
            let quoted = serde_json::to_string(&r).expect("serialize DenialReason");
            let serde_token = quoted.trim_matches('"');
            assert_eq!(
                denial_reason_wire(r),
                serde_token,
                "CLI wire token for {r:?} must equal the serde snake_case form"
            );
        }
    }
}