doiget_core/dry_run.rs
1//! Dry-run preview shape for `--dry-run` CLI fetches and the
2//! `doiget_metadata_only` / `doiget_fetch_paper` MCP tools.
3//!
4//! Binding spec: [`docs/DECISIONS/0022-dry-run-mode.md`](../../../docs/DECISIONS/0022-dry-run-mode.md)
5//! §1 (NORMATIVE wire shape) and [`docs/MCP_TOOLS.md`](../../../docs/MCP_TOOLS.md)
6//! §10 (MCP envelope mirror).
7//!
8//! The types here live in `doiget-core` rather than `doiget-cli` so that
9//! both `doiget-cli` (the `--dry-run` flag path) and `doiget-mcp` (the
10//! `dry_run: true` tool variants) can serialize bit-identical envelopes
11//! without `doiget-mcp` having to depend on `doiget-cli` (which would
12//! invert the existing `doiget-cli -> doiget-mcp` wiring).
13//!
14//! ## Honesty about candidate uncertainty
15//!
16//! The `pdf_sources[].candidate_hosts` list is the **static allowlist**
17//! for the named resolver, not the host the actual fetch would have hit.
18//! doiget cannot know the post-Unpaywall OA URL host without making the
19//! Unpaywall network call, and `--dry-run` MUST NOT make it. The preview
20//! is therefore an *upper-bound* on the hosts a real fetch could touch,
21//! not a prediction of the single host it would touch (ADR-0022 §4).
22
23use camino::{Utf8Path, Utf8PathBuf};
24use serde::Serialize;
25
26use crate::http::{oa_publisher_allowlist, tier_1_allowlist, SourceAllowlist};
27use crate::{RateLimits, Ref};
28
29/// Per-PDF-source row inside [`FetchPlan::pdf_sources`].
30///
31/// `candidate_hosts` is the static allowlist for the named resolver, not
32/// a prediction of the single host the real fetch would touch — see
33/// [module docs](self) and ADR-0022 §4 ("Honesty about candidate
34/// uncertainty").
35#[derive(Debug, Clone, Serialize)]
36pub struct PdfSourcePlan {
37 /// Resolver source key (e.g. `"oa-publisher"`, `"arxiv"`).
38 pub key: String,
39 /// Allowlist hosts the real fetch would be permitted to touch.
40 pub candidate_hosts: Vec<String>,
41}
42
43/// Per-process rate-limit context surfaced alongside [`FetchPlan`] so an
44/// agent can predict the politeness ceiling without a separate
45/// `doiget_capability_profile` round-trip.
46#[derive(Debug, Clone, Copy, Serialize)]
47pub struct RateLimitBudget {
48 /// Process-wide cap (matches [`RateLimits::HARD_CODED`]).
49 pub global_per_sec: f32,
50 /// Per-source minimum gap between consecutive requests, ms.
51 pub per_source_min_gap_ms: u64,
52}
53
54/// Structured dry-run preview returned by `--dry-run` and the
55/// `dry_run: true` MCP variants. Wire shape matches ADR-0022 §1 and
56/// `docs/MCP_TOOLS.md` §10.
57#[derive(Debug, Clone, Serialize)]
58pub struct FetchPlan {
59 /// Metadata sources the real fetch would consult, in dispatch order.
60 pub metadata_sources: Vec<String>,
61 /// PDF sources the real fetch could attempt. `candidate_hosts` is an
62 /// upper-bound on the hosts a real fetch would touch (see
63 /// [`PdfSourcePlan`]).
64 pub pdf_sources: Vec<PdfSourcePlan>,
65 /// Source keys whose redirect allowlists are loaded into the HTTP
66 /// client. Useful for validating `CapabilityProfile` configuration
67 /// drift.
68 pub redirect_allowlists_loaded: Vec<String>,
69 /// Where the PDF would land on disk (always `<root>/<safekey>.pdf`).
70 pub target_pdf_path: Utf8PathBuf,
71 /// Where the metadata TOML would land
72 /// (always `<root>/.metadata/<safekey>.toml`).
73 pub target_metadata_path: Utf8PathBuf,
74 /// `true` in Phase 1+ — every successful fetch appends a provenance
75 /// row. Named explicitly so future fetch modes can declare "this
76 /// fetch would NOT append" without inverting the flag's meaning
77 /// (ADR-0022 §1).
78 pub would_append_provenance: bool,
79 /// Always `true` in Phase 1: [`PdfSourcePlan::candidate_hosts`] is the
80 /// **static allowlist** for the resolver, NOT a prediction of the
81 /// single host the real fetch would touch. See ADR-0022 §4 ("Honesty
82 /// about candidate uncertainty"). The field is machine-parseable so
83 /// an agent can detect the upper-bound semantics without reading the
84 /// spec — encoding the §4 disclaimer into the wire shape itself.
85 pub candidate_hosts_are_upper_bound: bool,
86}
87
88/// Hard-coded rate-limit budget surfaced with every [`FetchPlan`] preview.
89/// Mirrors [`RateLimits::HARD_CODED`] / `docs/LEGAL.md` §6 safeguard 8.
90pub fn rate_limit_budget() -> RateLimitBudget {
91 RateLimitBudget {
92 global_per_sec: RateLimits::HARD_CODED.max_fetches_per_second(),
93 per_source_min_gap_ms: RateLimits::HARD_CODED.per_source_backoff_ms(),
94 }
95}
96
97/// Build the dry-run preview ([`FetchPlan`]) for the given ref and store
98/// root, without contacting the network or filesystem.
99///
100/// Per-ref-kind shape (ADR-0022 §1, NORMATIVE):
101///
102/// - **DOI** → `metadata_sources = ["crossref", "unpaywall"]`,
103/// `pdf_sources = [{ key: "oa-publisher", candidate_hosts: oa_publisher_allowlist hosts }]`.
104/// - **arXiv** → `metadata_sources = []`,
105/// `pdf_sources = [{ key: "arxiv", candidate_hosts: tier_1 arxiv hosts }]`.
106///
107/// `redirect_allowlists_loaded` always contains the four source keys the
108/// production HTTP client is built with (Tier 1 + the synthetic OA
109/// publisher), reflecting `doiget-cli::commands::fetch::build_http_client`'s
110/// composition.
111///
112/// # Panics
113///
114/// Panics with a self-documenting message if the in-crate allowlist
115/// builders ([`oa_publisher_allowlist`] / [`tier_1_allowlist`]) ever stop
116/// returning the source keys this function looks up. That signals an
117/// internal-contract drift bug, not a user error — fail-fast at preview
118/// time is preferable to silently emitting an empty `candidate_hosts`
119/// list. The workspace `clippy::expect_used` lint is `warn`-level
120/// (promoted to `deny` under `-D warnings`); the localized `#[allow]` is
121/// the minimal intervention here, mirroring the pattern in
122/// `crate::http::HttpClient::new_for_tests_allow_http`.
123#[allow(clippy::expect_used)]
124pub fn build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
125 let safekey = ref_.safekey();
126 let target_pdf_path = store_root.join(format!("{}.pdf", safekey.as_str()));
127 let target_metadata_path = store_root
128 .join(".metadata")
129 .join(format!("{}.toml", safekey.as_str()));
130
131 let (metadata_sources, pdf_sources) = match ref_ {
132 Ref::Doi(_) => {
133 // Internal contract: `oa_publisher_allowlist()` MUST always
134 // return an entry whose `.source == "oa-publisher"`. Silent
135 // `.unwrap_or_default()` here would mask drift between this
136 // function and the allowlist source-of-truth — the resulting
137 // empty `candidate_hosts` would mislead an agent into
138 // believing the OA leg has no allowed hosts.
139 let oa_hosts = oa_publisher_allowlist()
140 .into_iter()
141 .find(|a: &SourceAllowlist| a.source == "oa-publisher")
142 .map(|a| a.redirect_hosts)
143 .expect(
144 "oa-publisher allowlist must exist (see \
145 crates/doiget-core/src/http.rs::oa_publisher_allowlist); \
146 if this fires, build_fetch_plan and oa_publisher_allowlist \
147 have drifted",
148 );
149 (
150 vec!["crossref".to_string(), "unpaywall".to_string()],
151 vec![PdfSourcePlan {
152 key: "oa-publisher".to_string(),
153 candidate_hosts: oa_hosts,
154 }],
155 )
156 }
157 Ref::Arxiv(_) => {
158 // Same internal-contract rationale as the DOI branch above.
159 let arxiv_hosts = tier_1_allowlist()
160 .into_iter()
161 .find(|a: &SourceAllowlist| a.source == "arxiv")
162 .map(|a| a.redirect_hosts)
163 .expect(
164 "tier-1 allowlist must include 'arxiv' (see \
165 crates/doiget-core/src/http.rs::tier_1_allowlist); \
166 if this fires, build_fetch_plan and tier_1_allowlist \
167 have drifted",
168 );
169 (
170 Vec::<String>::new(),
171 vec![PdfSourcePlan {
172 key: "arxiv".to_string(),
173 candidate_hosts: arxiv_hosts,
174 }],
175 )
176 }
177 };
178
179 // Slice 5 (PR #84 advisory item A6): derive the loaded-allowlist
180 // list from the same `tier_1_allowlist()` + `oa_publisher_allowlist()`
181 // functions the production `HttpClient` is composed from. A
182 // hardcoded `vec![...]` here would silently drift if a future slice
183 // adds a new allowlist source to the production client — the wire
184 // shape would still claim only the old four.
185 let redirect_allowlists_loaded: Vec<String> = tier_1_allowlist()
186 .iter()
187 .chain(oa_publisher_allowlist().iter())
188 .map(|a| a.source.clone())
189 .collect();
190
191 FetchPlan {
192 metadata_sources,
193 pdf_sources,
194 redirect_allowlists_loaded,
195 target_pdf_path,
196 target_metadata_path,
197 would_append_provenance: true,
198 // Always `true` in Phase 1 per ADR-0022 §4 ("Honesty about
199 // candidate uncertainty"): `candidate_hosts` is the static
200 // resolver allowlist, NOT a prediction of the single host the
201 // real fetch would touch. Surfaced on the wire so agents can
202 // detect the upper-bound semantics without parsing the spec.
203 candidate_hosts_are_upper_bound: true,
204 }
205}
206
207/// Build the dry-run envelope as a `serde_json::Value`, without writing
208/// anywhere. Used by both the CLI (which prints it to stdout) and the
209/// MCP tool wrapper (which routes the bytes via JSON-RPC). Wire shape:
210///
211/// ```jsonc
212/// {
213/// "ok": true,
214/// "dry_run": true,
215/// "ref": { "doi": "10.1234/foo" } | { "arxiv": "2401.12345" },
216/// "plan": { ... see FetchPlan ... },
217/// "rate_limit_budget": { "global_per_sec": 5.0, "per_source_min_gap_ms": 200 }
218/// }
219/// ```
220pub fn build_dry_run_envelope(ref_: &Ref, plan: &FetchPlan) -> serde_json::Value {
221 serde_json::json!({
222 "ok": true,
223 "dry_run": true,
224 "ref": ref_kind_object(ref_),
225 "plan": plan,
226 "rate_limit_budget": rate_limit_budget(),
227 })
228}
229
230/// Build the `ref` field of the dry-run envelope per ADR-0022 §1:
231/// `{"doi": "10.1234/foo"}` for a DOI ref, `{"arxiv": "2401.12345"}` for
232/// an arXiv ref. We intentionally do NOT serialize the full `Ref` enum
233/// (which would emit `{"kind":"doi","id":"10.1234/foo"}` per the
234/// internally-tagged `#[serde(tag,content)]` form), because the wire
235/// shape in the ADR uses a flat single-key object.
236fn ref_kind_object(ref_: &Ref) -> serde_json::Value {
237 match ref_ {
238 Ref::Doi(d) => serde_json::json!({ "doi": d.as_str() }),
239 Ref::Arxiv(a) => serde_json::json!({ "arxiv": a.as_str() }),
240 }
241}
242
243// ---------------------------------------------------------------------------
244// Tests
245// ---------------------------------------------------------------------------
246
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::{ArxivId, Doi};
    use serde_json::json;

    /// Store root shared by the tests; the path is never touched on disk.
    fn temp_root() -> Utf8PathBuf {
        Utf8PathBuf::from("/tmp/doiget-test-store")
    }

    /// Wrap a raw DOI string in a [`Ref`].
    fn doi_ref(id: &str) -> Ref {
        Ref::Doi(Doi(id.to_string()))
    }

    /// Wrap a raw arXiv identifier in a [`Ref`].
    fn arxiv_ref(id: &str) -> Ref {
        Ref::Arxiv(ArxivId(id.to_string()))
    }

    #[test]
    fn doi_plan_carries_crossref_and_unpaywall_metadata_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());
        assert_eq!(plan.metadata_sources, vec!["crossref", "unpaywall"]);
        assert_eq!(plan.pdf_sources.len(), 1);
        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "oa-publisher");
        assert!(
            !source.candidate_hosts.is_empty(),
            "OA publisher hosts must be populated"
        );
        assert!(plan.would_append_provenance);
    }

    #[test]
    fn arxiv_plan_has_empty_metadata_sources_and_arxiv_pdf_source() {
        let plan = build_fetch_plan(&arxiv_ref("2401.12345"), &temp_root());
        assert!(plan.metadata_sources.is_empty());
        assert_eq!(plan.pdf_sources.len(), 1);
        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "arxiv");
        assert!(source.candidate_hosts.iter().any(|host| host == "arxiv.org"));
    }

    #[test]
    fn plan_target_paths_are_safekey_derived() {
        let store = Utf8PathBuf::from("/tmp/store");
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &store);
        let expected_pdf = store.join("doi_10.1234_example.pdf");
        let expected_metadata = store.join(".metadata").join("doi_10.1234_example.toml");
        assert_eq!(plan.target_pdf_path, expected_pdf);
        assert_eq!(plan.target_metadata_path, expected_metadata);
    }

    #[test]
    fn dry_run_envelope_has_top_level_ok_dry_run_and_rate_budget() {
        let reference = doi_ref("10.1234/foo");
        let plan = build_fetch_plan(&reference, &temp_root());
        let env = build_dry_run_envelope(&reference, &plan);
        assert_eq!(env["ok"], json!(true));
        assert_eq!(env["dry_run"], json!(true));
        assert_eq!(env["ref"], json!({ "doi": "10.1234/foo" }));
        let budget = &env["rate_limit_budget"];
        assert_eq!(budget["global_per_sec"], json!(5.0));
        assert_eq!(budget["per_source_min_gap_ms"], json!(200));
    }

    #[test]
    fn dry_run_envelope_arxiv_ref_uses_arxiv_key() {
        let reference = arxiv_ref("2401.12345");
        let plan = build_fetch_plan(&reference, &temp_root());
        let env = build_dry_run_envelope(&reference, &plan);
        assert_eq!(env["ref"], json!({ "arxiv": "2401.12345" }));
    }

    #[test]
    fn fetch_plan_carries_candidate_hosts_are_upper_bound_true() {
        // ADR-0022 §4 ("Honesty about candidate uncertainty"): the flag is
        // always `true` in Phase 1 and has to appear inside `plan` on the
        // wire, so agents can detect the upper-bound semantics without
        // consulting the spec.
        let reference = doi_ref("10.1234/example");
        let plan = build_fetch_plan(&reference, &temp_root());
        assert!(plan.candidate_hosts_are_upper_bound);
        let env = build_dry_run_envelope(&reference, &plan);
        assert_eq!(
            env["plan"]["candidate_hosts_are_upper_bound"],
            json!(true),
            "plan.candidate_hosts_are_upper_bound must be true on the \
             wire (ADR-0022 §4); got: {env}"
        );
    }

    #[test]
    fn redirect_allowlists_loaded_lists_all_four_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());
        // Every allowlist entry has to be listed, in the order the
        // production `build_http_client` composes them.
        let expected = vec!["crossref", "unpaywall", "arxiv", "oa-publisher"];
        assert_eq!(plan.redirect_allowlists_loaded, expected);
    }
}
351}