doiget_core/dry_run.rs
1//! Dry-run preview shape for `--dry-run` CLI fetches and the
2//! `doiget_metadata_only` / `doiget_fetch_paper` MCP tools.
3//!
4//! Binding spec: [`docs/DECISIONS/0022-dry-run-mode.md`](../../../docs/DECISIONS/0022-dry-run-mode.md)
5//! §1 (NORMATIVE wire shape) and [`docs/MCP_TOOLS.md`](../../../docs/MCP_TOOLS.md)
6//! §10 (MCP envelope mirror).
7//!
8//! The types here live in `doiget-core` rather than `doiget-cli` so that
9//! both `doiget-cli` (the `--dry-run` flag path) and `doiget-mcp` (the
10//! `dry_run: true` tool variants) can serialize bit-identical envelopes
11//! without `doiget-mcp` having to depend on `doiget-cli` (which would
12//! invert the existing `doiget-cli -> doiget-mcp` wiring).
13//!
14//! ## Honesty about candidate uncertainty
15//!
16//! The `pdf_sources[].candidate_hosts` list is the **static allowlist**
17//! for the named resolver, not the host the actual fetch would have hit.
18//! doiget cannot know the post-Unpaywall OA URL host without making the
19//! Unpaywall network call, and `--dry-run` MUST NOT make it. The preview
20//! is therefore an *upper-bound* on the hosts a real fetch could touch,
21//! not a prediction of the single host it would touch (ADR-0022 §4).
22
23use camino::{Utf8Path, Utf8PathBuf};
24use serde::Serialize;
25
26use crate::http::{oa_publisher_allowlist, tier_1_allowlist, SourceAllowlist};
27use crate::source::FetchError;
28use crate::{RateLimits, Ref};
29
30/// Per-PDF-source row inside [`FetchPlan::pdf_sources`].
31///
32/// `candidate_hosts` is the static allowlist for the named resolver, not
33/// a prediction of the single host the real fetch would touch — see
34/// [module docs](self) and ADR-0022 §4 ("Honesty about candidate
35/// uncertainty").
36#[derive(Debug, Clone, Serialize)]
37pub struct PdfSourcePlan {
38 /// Resolver source key (e.g. `"oa-publisher"`, `"arxiv"`).
39 pub key: String,
40 /// Allowlist hosts the real fetch would be permitted to touch.
41 pub candidate_hosts: Vec<String>,
42}
43
44/// Per-process rate-limit context surfaced alongside [`FetchPlan`] so an
45/// agent can predict the politeness ceiling without a separate
46/// `doiget_capability_profile` round-trip.
47#[derive(Debug, Clone, Copy, Serialize)]
48pub struct RateLimitBudget {
49 /// Process-wide cap (matches [`RateLimits::HARD_CODED`]).
50 pub global_per_sec: f32,
51 /// Per-source minimum gap between consecutive requests, ms.
52 pub per_source_min_gap_ms: u64,
53}
54
55/// Structured dry-run preview returned by `--dry-run` and the
56/// `dry_run: true` MCP variants. Wire shape matches ADR-0022 §1 and
57/// `docs/MCP_TOOLS.md` §10.
58#[derive(Debug, Clone, Serialize)]
59pub struct FetchPlan {
60 /// Metadata sources the real fetch would consult, in dispatch order.
61 pub metadata_sources: Vec<String>,
62 /// PDF sources the real fetch could attempt. `candidate_hosts` is an
63 /// upper-bound on the hosts a real fetch would touch (see
64 /// [`PdfSourcePlan`]).
65 pub pdf_sources: Vec<PdfSourcePlan>,
66 /// Source keys whose redirect allowlists are loaded into the HTTP
67 /// client. Useful for validating `CapabilityProfile` configuration
68 /// drift.
69 pub redirect_allowlists_loaded: Vec<String>,
70 /// Where the PDF would land on disk (always `<root>/<safekey>.pdf`).
71 pub target_pdf_path: Utf8PathBuf,
72 /// Where the metadata TOML would land
73 /// (always `<root>/.metadata/<safekey>.toml`).
74 pub target_metadata_path: Utf8PathBuf,
75 /// `true` in Phase 1+ — every successful fetch appends a provenance
76 /// row. Named explicitly so future fetch modes can declare "this
77 /// fetch would NOT append" without inverting the flag's meaning
78 /// (ADR-0022 §1).
79 pub would_append_provenance: bool,
80 /// Always `true` in Phase 1: [`PdfSourcePlan::candidate_hosts`] is the
81 /// **static allowlist** for the resolver, NOT a prediction of the
82 /// single host the real fetch would touch. See ADR-0022 §4 ("Honesty
83 /// about candidate uncertainty"). The field is machine-parseable so
84 /// an agent can detect the upper-bound semantics without reading the
85 /// spec — encoding the §4 disclaimer into the wire shape itself.
86 pub candidate_hosts_are_upper_bound: bool,
87}
88
89/// Hard-coded rate-limit budget surfaced with every [`FetchPlan`] preview.
90/// Mirrors [`RateLimits::HARD_CODED`] / `docs/LEGAL.md` §6 safeguard 8.
91pub fn rate_limit_budget() -> RateLimitBudget {
92 RateLimitBudget {
93 global_per_sec: RateLimits::HARD_CODED.max_fetches_per_second(),
94 per_source_min_gap_ms: RateLimits::HARD_CODED.per_source_backoff_ms(),
95 }
96}
97
98/// Build the dry-run preview ([`FetchPlan`]) for the given ref and store
99/// root, without contacting the network or filesystem.
100///
101/// Per-ref-kind shape (ADR-0022 §1, NORMATIVE):
102///
103/// - **DOI** → `metadata_sources = ["crossref", "unpaywall"]`,
104/// `pdf_sources = [{ key: "oa-publisher", candidate_hosts: oa_publisher_allowlist hosts }]`.
105/// - **arXiv** → `metadata_sources = []`,
106/// `pdf_sources = [{ key: "arxiv", candidate_hosts: tier_1 arxiv hosts }]`.
107///
108/// `redirect_allowlists_loaded` always contains the four source keys the
109/// production HTTP client is built with (Tier 1 + the synthetic OA
110/// publisher), reflecting `doiget-cli::commands::fetch::build_http_client`'s
111/// composition.
112///
113/// # Errors
114///
115/// This infallible-looking wrapper never returns `Err` — it delegates to
116/// [`try_build_fetch_plan`] and, on the (should-be-impossible) internal
117/// allowlist-contract drift, falls back to an empty `candidate_hosts`
118/// list for the affected PDF source rather than panicking (issue #156 ②:
119/// a stray `.expect()` here crashed `doiget plan` if a source key was
120/// ever renamed). Callers that want to *observe* the invariant violation
121/// as a typed error should call [`try_build_fetch_plan`] directly; this
122/// function's signature is kept infallible because it is `pub` and has
123/// non-`doiget-core` callers (`doiget-mcp`, `doiget-cli`) whose
124/// signatures must not change in this batch.
125///
126/// The empty-`candidate_hosts` fallback is the lesser evil versus a
127/// panic: a preview with an empty allowlist is visibly wrong (and is
128/// what `try_build_fetch_plan` flags as `SourceSchema`), whereas a panic
129/// takes down `doiget plan` entirely. This path is unreachable unless
130/// [`oa_publisher_allowlist`] / [`tier_1_allowlist`] are edited to drop
131/// the `"oa-publisher"` / `"arxiv"` keys, which the in-crate tests pin.
132pub fn build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
133 try_build_fetch_plan(ref_, store_root).unwrap_or_else(|_| {
134 // Internal-contract drift (allowlist key renamed): degrade to an
135 // empty `candidate_hosts` instead of panicking `doiget plan`.
136 // `try_build_fetch_plan` is the API that surfaces this as a
137 // typed `FetchError::SourceSchema`.
138 let safekey = ref_.safekey();
139 let target_pdf_path = store_root.join(format!("{}.pdf", safekey.as_str()));
140 let target_metadata_path = store_root
141 .join(".metadata")
142 .join(format!("{}.toml", safekey.as_str()));
143 let (metadata_sources, pdf_key) = match ref_ {
144 Ref::Doi(_) => (
145 vec!["crossref".to_string(), "unpaywall".to_string()],
146 "oa-publisher",
147 ),
148 Ref::Arxiv(_) => (Vec::<String>::new(), "arxiv"),
149 };
150 FetchPlan {
151 metadata_sources,
152 pdf_sources: vec![PdfSourcePlan {
153 key: pdf_key.to_string(),
154 candidate_hosts: Vec::new(),
155 }],
156 redirect_allowlists_loaded: tier_1_allowlist()
157 .iter()
158 .chain(oa_publisher_allowlist().iter())
159 .map(|a| a.source.clone())
160 .collect(),
161 target_pdf_path,
162 target_metadata_path,
163 would_append_provenance: true,
164 candidate_hosts_are_upper_bound: true,
165 }
166 })
167}
168
169/// Fallible builder for the dry-run preview ([`FetchPlan`]).
170///
171/// Identical to [`build_fetch_plan`] on the happy path, but propagates an
172/// internal allowlist-contract drift as a typed
173/// [`FetchError::SourceSchema`] (which maps to
174/// [`crate::ErrorCode::InternalError`] at the public boundary — the
175/// correct closed-set fit for an internal-invariant violation) instead
176/// of panicking. This is the API issue #156 ② asks for; it is added
177/// alongside the existing infallible [`build_fetch_plan`] rather than
178/// replacing it, because `build_fetch_plan` is `pub` and called from
179/// `doiget-mcp` / `doiget-cli`, whose signatures are out of scope for
180/// this change batch.
181///
182/// # Errors
183///
184/// Returns [`FetchError::SourceSchema`] if the in-crate allowlist
185/// builders ([`oa_publisher_allowlist`] / [`tier_1_allowlist`]) stop
186/// returning the `"oa-publisher"` / `"arxiv"` source keys this function
187/// looks up — an internal-contract drift bug, surfaced rather than
188/// panicked (issue #156 ②). The in-crate tests pin the keys so this is
189/// unreachable in a correct build.
190pub fn try_build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> Result<FetchPlan, FetchError> {
191 let safekey = ref_.safekey();
192 let target_pdf_path = store_root.join(format!("{}.pdf", safekey.as_str()));
193 let target_metadata_path = store_root
194 .join(".metadata")
195 .join(format!("{}.toml", safekey.as_str()));
196
197 let (metadata_sources, pdf_sources) = match ref_ {
198 Ref::Doi(_) => {
199 // Internal contract: `oa_publisher_allowlist()` MUST always
200 // return an entry whose `.source == "oa-publisher"`. A silent
201 // `.unwrap_or_default()` here would mask drift between this
202 // function and the allowlist source-of-truth — the resulting
203 // empty `candidate_hosts` would mislead an agent into
204 // believing the OA leg has no allowed hosts. Issue #156 ②:
205 // surface the drift as a typed error rather than `.expect()`
206 // panicking `doiget plan`.
207 let oa_hosts = oa_publisher_allowlist()
208 .into_iter()
209 .find(|a: &SourceAllowlist| a.source == "oa-publisher")
210 .map(|a| a.redirect_hosts)
211 .ok_or_else(|| FetchError::SourceSchema {
212 hint: "internal-contract drift: oa-publisher allowlist \
213 missing (see crates/doiget-core/src/http.rs::\
214 oa_publisher_allowlist); build_fetch_plan and \
215 oa_publisher_allowlist have drifted"
216 .to_string(),
217 })?;
218 (
219 vec!["crossref".to_string(), "unpaywall".to_string()],
220 vec![PdfSourcePlan {
221 key: "oa-publisher".to_string(),
222 candidate_hosts: oa_hosts,
223 }],
224 )
225 }
226 Ref::Arxiv(_) => {
227 // Same internal-contract rationale as the DOI branch above.
228 let arxiv_hosts = tier_1_allowlist()
229 .into_iter()
230 .find(|a: &SourceAllowlist| a.source == "arxiv")
231 .map(|a| a.redirect_hosts)
232 .ok_or_else(|| FetchError::SourceSchema {
233 hint: "internal-contract drift: tier-1 allowlist missing \
234 'arxiv' (see crates/doiget-core/src/http.rs::\
235 tier_1_allowlist); build_fetch_plan and \
236 tier_1_allowlist have drifted"
237 .to_string(),
238 })?;
239 (
240 Vec::<String>::new(),
241 vec![PdfSourcePlan {
242 key: "arxiv".to_string(),
243 candidate_hosts: arxiv_hosts,
244 }],
245 )
246 }
247 };
248
249 // Slice 5 (PR #84 advisory item A6): derive the loaded-allowlist
250 // list from the same `tier_1_allowlist()` + `oa_publisher_allowlist()`
251 // functions the production `HttpClient` is composed from. A
252 // hardcoded `vec![...]` here would silently drift if a future slice
253 // adds a new allowlist source to the production client — the wire
254 // shape would still claim only the old four.
255 let redirect_allowlists_loaded: Vec<String> = tier_1_allowlist()
256 .iter()
257 .chain(oa_publisher_allowlist().iter())
258 .map(|a| a.source.clone())
259 .collect();
260
261 Ok(FetchPlan {
262 metadata_sources,
263 pdf_sources,
264 redirect_allowlists_loaded,
265 target_pdf_path,
266 target_metadata_path,
267 would_append_provenance: true,
268 // Always `true` in Phase 1 per ADR-0022 §4 ("Honesty about
269 // candidate uncertainty"): `candidate_hosts` is the static
270 // resolver allowlist, NOT a prediction of the single host the
271 // real fetch would touch. Surfaced on the wire so agents can
272 // detect the upper-bound semantics without parsing the spec.
273 candidate_hosts_are_upper_bound: true,
274 })
275}
276
277/// Build the dry-run envelope as a `serde_json::Value`, without writing
278/// anywhere. Used by both the CLI (which prints it to stdout) and the
279/// MCP tool wrapper (which routes the bytes via JSON-RPC). Wire shape:
280///
281/// ```jsonc
282/// {
283/// "ok": true,
284/// "dry_run": true,
285/// "ref": { "doi": "10.1234/foo" } | { "arxiv": "2401.12345" },
286/// "plan": { ... see FetchPlan ... },
287/// "rate_limit_budget": { "global_per_sec": 5.0, "per_source_min_gap_ms": 200 }
288/// }
289/// ```
290pub fn build_dry_run_envelope(ref_: &Ref, plan: &FetchPlan) -> serde_json::Value {
291 serde_json::json!({
292 "ok": true,
293 "dry_run": true,
294 "ref": ref_kind_object(ref_),
295 "plan": plan,
296 "rate_limit_budget": rate_limit_budget(),
297 })
298}
299
300/// Build the `ref` field of the dry-run envelope per ADR-0022 §1:
301/// `{"doi": "10.1234/foo"}` for a DOI ref, `{"arxiv": "2401.12345"}` for
302/// an arXiv ref. We intentionally do NOT serialize the full `Ref` enum
303/// (which would emit `{"kind":"doi","id":"10.1234/foo"}` per the
304/// internally-tagged `#[serde(tag,content)]` form), because the wire
305/// shape in the ADR uses a flat single-key object.
306fn ref_kind_object(ref_: &Ref) -> serde_json::Value {
307 match ref_ {
308 Ref::Doi(d) => serde_json::json!({ "doi": d.as_str() }),
309 Ref::Arxiv(a) => serde_json::json!({ "arxiv": a.as_str() }),
310 }
311}
312
313// ---------------------------------------------------------------------------
314// Tests
315// ---------------------------------------------------------------------------
316
#[cfg(test)]
// Tests may unwrap/expect/panic freely; the crate-wide lints against them
// only make sense for production code.
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::{ArxivId, Doi};

    /// Store root shared by the tests. Never touched on disk: plan
    /// building is pure path arithmetic.
    fn temp_root() -> Utf8PathBuf {
        Utf8PathBuf::from("/tmp/doiget-test-store")
    }

    /// Wraps `id` in a DOI ref.
    fn doi_ref(id: &str) -> Ref {
        Ref::Doi(Doi(id.to_string()))
    }

    /// Wraps `id` in an arXiv ref.
    fn arxiv_ref(id: &str) -> Ref {
        Ref::Arxiv(ArxivId(id.to_string()))
    }

    #[test]
    fn doi_plan_carries_crossref_and_unpaywall_metadata_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());

        assert_eq!(plan.metadata_sources, vec!["crossref", "unpaywall"]);
        assert_eq!(plan.pdf_sources.len(), 1);
        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "oa-publisher");
        assert!(
            !source.candidate_hosts.is_empty(),
            "OA publisher hosts must be populated"
        );
        assert!(plan.would_append_provenance);
    }

    #[test]
    fn arxiv_plan_has_empty_metadata_sources_and_arxiv_pdf_source() {
        let plan = build_fetch_plan(&arxiv_ref("2401.12345"), &temp_root());

        assert!(plan.metadata_sources.is_empty());
        assert_eq!(plan.pdf_sources.len(), 1);
        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "arxiv");
        assert!(source.candidate_hosts.iter().any(|host| host == "arxiv.org"));
    }

    #[test]
    fn plan_target_paths_are_safekey_derived() {
        let root = Utf8PathBuf::from("/tmp/store");
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &root);

        let expected_pdf = root.join("doi_10.1234_example.pdf");
        let expected_metadata = root.join(".metadata").join("doi_10.1234_example.toml");
        assert_eq!(plan.target_pdf_path, expected_pdf);
        assert_eq!(plan.target_metadata_path, expected_metadata);
    }

    #[test]
    fn dry_run_envelope_has_top_level_ok_dry_run_and_rate_budget() {
        let r = doi_ref("10.1234/foo");
        let plan = build_fetch_plan(&r, &temp_root());
        let env = build_dry_run_envelope(&r, &plan);

        assert_eq!(env["ok"], serde_json::json!(true));
        assert_eq!(env["dry_run"], serde_json::json!(true));
        assert_eq!(env["ref"], serde_json::json!({ "doi": "10.1234/foo" }));

        let budget = &env["rate_limit_budget"];
        assert_eq!(budget["global_per_sec"], serde_json::json!(5.0));
        assert_eq!(budget["per_source_min_gap_ms"], serde_json::json!(200));
    }

    #[test]
    fn dry_run_envelope_arxiv_ref_uses_arxiv_key() {
        let r = arxiv_ref("2401.12345");
        let plan = build_fetch_plan(&r, &temp_root());
        let env = build_dry_run_envelope(&r, &plan);

        assert_eq!(env["ref"], serde_json::json!({ "arxiv": "2401.12345" }));
    }

    #[test]
    fn fetch_plan_carries_candidate_hosts_are_upper_bound_true() {
        // ADR-0022 §4 ("Honesty about candidate uncertainty"): the flag is
        // always `true` in Phase 1 and has to show up inside `plan` on the
        // wire, so agents can detect the upper-bound semantics without
        // consulting the spec.
        let r = doi_ref("10.1234/example");
        let plan = build_fetch_plan(&r, &temp_root());
        assert!(plan.candidate_hosts_are_upper_bound);

        let env = build_dry_run_envelope(&r, &plan);
        assert_eq!(
            env["plan"]["candidate_hosts_are_upper_bound"],
            serde_json::json!(true),
            "plan.candidate_hosts_are_upper_bound must be true on the \
             wire (ADR-0022 §4); got: {env}"
        );
    }

    #[test]
    fn try_build_fetch_plan_ok_matches_build_fetch_plan() {
        // Issue #156 ②: on the happy path (the allowlist invariant holds
        // in a correct build) the fallible variant must yield a plan
        // structurally identical to the infallible wrapper's.
        for r in [doi_ref("10.1234/example"), arxiv_ref("2401.12345")] {
            let root = temp_root();
            let fallible = try_build_fetch_plan(&r, &root).expect("invariant holds");
            let infallible = build_fetch_plan(&r, &root);

            assert_eq!(fallible.metadata_sources, infallible.metadata_sources);
            let (lhs, rhs) = (&fallible.pdf_sources[0], &infallible.pdf_sources[0]);
            assert_eq!(lhs.key, rhs.key);
            assert_eq!(lhs.candidate_hosts, rhs.candidate_hosts);
            assert!(
                !lhs.candidate_hosts.is_empty(),
                "happy-path candidate_hosts must be populated, not the \
                 degraded empty fallback"
            );
        }
    }

    #[test]
    fn redirect_allowlists_loaded_lists_all_four_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());

        // All four allowlist entries must be present, in the order the
        // production `build_http_client` composes them.
        assert_eq!(
            plan.redirect_allowlists_loaded,
            vec!["crossref", "unpaywall", "arxiv", "oa-publisher"]
        );
    }
}