Skip to main content

doiget_core/
dry_run.rs

1//! Dry-run preview shape for `--dry-run` CLI fetches and the
2//! `doiget_metadata_only` / `doiget_fetch_paper` MCP tools.
3//!
4//! Binding spec: [`docs/DECISIONS/0022-dry-run-mode.md`](../../../docs/DECISIONS/0022-dry-run-mode.md)
5//! §1 (NORMATIVE wire shape) and [`docs/MCP_TOOLS.md`](../../../docs/MCP_TOOLS.md)
6//! §10 (MCP envelope mirror).
7//!
8//! The types here live in `doiget-core` rather than `doiget-cli` so that
9//! both `doiget-cli` (the `--dry-run` flag path) and `doiget-mcp` (the
10//! `dry_run: true` tool variants) can serialize bit-identical envelopes
11//! without `doiget-mcp` having to depend on `doiget-cli` (which would
12//! invert the existing `doiget-cli -> doiget-mcp` wiring).
13//!
14//! ## Honesty about candidate uncertainty
15//!
16//! The `pdf_sources[].candidate_hosts` list is the **static allowlist**
17//! for the named resolver, not the host the actual fetch would have hit.
18//! doiget cannot know the post-Unpaywall OA URL host without making the
19//! Unpaywall network call, and `--dry-run` MUST NOT make it. The preview
20//! is therefore an *upper-bound* on the hosts a real fetch could touch,
21//! not a prediction of the single host it would touch (ADR-0022 §4).
22
23use camino::{Utf8Path, Utf8PathBuf};
24use serde::Serialize;
25
26use crate::http::{oa_publisher_allowlist, tier_1_allowlist, SourceAllowlist};
27use crate::source::FetchError;
28use crate::{RateLimits, Ref};
29
30/// Per-PDF-source row inside [`FetchPlan::pdf_sources`].
31///
32/// `candidate_hosts` is the static allowlist for the named resolver, not
33/// a prediction of the single host the real fetch would touch — see
34/// [module docs](self) and ADR-0022 §4 ("Honesty about candidate
35/// uncertainty").
36#[derive(Debug, Clone, Serialize)]
37pub struct PdfSourcePlan {
38    /// Resolver source key (e.g. `"oa-publisher"`, `"arxiv"`).
39    pub key: String,
40    /// Allowlist hosts the real fetch would be permitted to touch.
41    pub candidate_hosts: Vec<String>,
42}
43
44/// Per-process rate-limit context surfaced alongside [`FetchPlan`] so an
45/// agent can predict the politeness ceiling without a separate
46/// `doiget_capability_profile` round-trip.
47#[derive(Debug, Clone, Copy, Serialize)]
48pub struct RateLimitBudget {
49    /// Process-wide cap (matches [`RateLimits::HARD_CODED`]).
50    pub global_per_sec: f32,
51    /// Per-source minimum gap between consecutive requests, ms.
52    pub per_source_min_gap_ms: u64,
53}
54
55/// Structured dry-run preview returned by `--dry-run` and the
56/// `dry_run: true` MCP variants. Wire shape matches ADR-0022 §1 and
57/// `docs/MCP_TOOLS.md` §10.
58#[derive(Debug, Clone, Serialize)]
59pub struct FetchPlan {
60    /// Metadata sources the real fetch would consult, in dispatch order.
61    pub metadata_sources: Vec<String>,
62    /// PDF sources the real fetch could attempt. `candidate_hosts` is an
63    /// upper-bound on the hosts a real fetch would touch (see
64    /// [`PdfSourcePlan`]).
65    pub pdf_sources: Vec<PdfSourcePlan>,
66    /// Source keys whose redirect allowlists are loaded into the HTTP
67    /// client. Useful for validating `CapabilityProfile` configuration
68    /// drift.
69    pub redirect_allowlists_loaded: Vec<String>,
70    /// Where the PDF would land on disk (always `<root>/<safekey>.pdf`).
71    pub target_pdf_path: Utf8PathBuf,
72    /// Where the metadata TOML would land
73    /// (always `<root>/.metadata/<safekey>.toml`).
74    pub target_metadata_path: Utf8PathBuf,
75    /// `true` in Phase 1+ — every successful fetch appends a provenance
76    /// row. Named explicitly so future fetch modes can declare "this
77    /// fetch would NOT append" without inverting the flag's meaning
78    /// (ADR-0022 §1).
79    pub would_append_provenance: bool,
80    /// Always `true` in Phase 1: [`PdfSourcePlan::candidate_hosts`] is the
81    /// **static allowlist** for the resolver, NOT a prediction of the
82    /// single host the real fetch would touch. See ADR-0022 §4 ("Honesty
83    /// about candidate uncertainty"). The field is machine-parseable so
84    /// an agent can detect the upper-bound semantics without reading the
85    /// spec — encoding the §4 disclaimer into the wire shape itself.
86    pub candidate_hosts_are_upper_bound: bool,
87}
88
89/// Hard-coded rate-limit budget surfaced with every [`FetchPlan`] preview.
90/// Mirrors [`RateLimits::HARD_CODED`] / `docs/LEGAL.md` §6 safeguard 8.
91pub fn rate_limit_budget() -> RateLimitBudget {
92    RateLimitBudget {
93        global_per_sec: RateLimits::HARD_CODED.max_fetches_per_second(),
94        per_source_min_gap_ms: RateLimits::HARD_CODED.per_source_backoff_ms(),
95    }
96}
97
98/// Build the dry-run preview ([`FetchPlan`]) for the given ref and store
99/// root, without contacting the network or filesystem.
100///
101/// Per-ref-kind shape (ADR-0022 §1, NORMATIVE):
102///
103/// - **DOI** → `metadata_sources = ["crossref", "unpaywall"]`,
104///   `pdf_sources = [{ key: "oa-publisher", candidate_hosts: oa_publisher_allowlist hosts }]`.
105/// - **arXiv** → `metadata_sources = []`,
106///   `pdf_sources = [{ key: "arxiv", candidate_hosts: tier_1 arxiv hosts }]`.
107///
108/// `redirect_allowlists_loaded` always contains the four source keys the
109/// production HTTP client is built with (Tier 1 + the synthetic OA
110/// publisher), reflecting `doiget-cli::commands::fetch::build_http_client`'s
111/// composition.
112///
113/// # Errors
114///
115/// This infallible-looking wrapper never returns `Err` — it delegates to
116/// [`try_build_fetch_plan`] and, on the (should-be-impossible) internal
117/// allowlist-contract drift, falls back to an empty `candidate_hosts`
118/// list for the affected PDF source rather than panicking (issue #156 ②:
119/// a stray `.expect()` here crashed `doiget plan` if a source key was
120/// ever renamed). Callers that want to *observe* the invariant violation
121/// as a typed error should call [`try_build_fetch_plan`] directly; this
122/// function's signature is kept infallible because it is `pub` and has
123/// non-`doiget-core` callers (`doiget-mcp`, `doiget-cli`) whose
124/// signatures must not change in this batch.
125///
126/// The empty-`candidate_hosts` fallback is the lesser evil versus a
127/// panic: a preview with an empty allowlist is visibly wrong (and is
128/// what `try_build_fetch_plan` flags as `SourceSchema`), whereas a panic
129/// takes down `doiget plan` entirely. This path is unreachable unless
130/// [`oa_publisher_allowlist`] / [`tier_1_allowlist`] are edited to drop
131/// the `"oa-publisher"` / `"arxiv"` keys, which the in-crate tests pin.
132pub fn build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
133    try_build_fetch_plan(ref_, store_root).unwrap_or_else(|_| {
134        // Internal-contract drift (allowlist key renamed): degrade to an
135        // empty `candidate_hosts` instead of panicking `doiget plan`.
136        // `try_build_fetch_plan` is the API that surfaces this as a
137        // typed `FetchError::SourceSchema`.
138        let safekey = ref_.safekey();
139        let target_pdf_path = store_root.join(format!("{}.pdf", safekey.as_str()));
140        let target_metadata_path = store_root
141            .join(".metadata")
142            .join(format!("{}.toml", safekey.as_str()));
143        let (metadata_sources, pdf_key) = match ref_ {
144            Ref::Doi(_) => (
145                vec!["crossref".to_string(), "unpaywall".to_string()],
146                "oa-publisher",
147            ),
148            Ref::Arxiv(_) => (Vec::<String>::new(), "arxiv"),
149        };
150        FetchPlan {
151            metadata_sources,
152            pdf_sources: vec![PdfSourcePlan {
153                key: pdf_key.to_string(),
154                candidate_hosts: Vec::new(),
155            }],
156            redirect_allowlists_loaded: tier_1_allowlist()
157                .iter()
158                .chain(oa_publisher_allowlist().iter())
159                .map(|a| a.source.clone())
160                .collect(),
161            target_pdf_path,
162            target_metadata_path,
163            would_append_provenance: true,
164            candidate_hosts_are_upper_bound: true,
165        }
166    })
167}
168
169/// Fallible builder for the dry-run preview ([`FetchPlan`]).
170///
171/// Identical to [`build_fetch_plan`] on the happy path, but propagates an
172/// internal allowlist-contract drift as a typed
173/// [`FetchError::SourceSchema`] (which maps to
174/// [`crate::ErrorCode::InternalError`] at the public boundary — the
175/// correct closed-set fit for an internal-invariant violation) instead
176/// of panicking. This is the API issue #156 ② asks for; it is added
177/// alongside the existing infallible [`build_fetch_plan`] rather than
178/// replacing it, because `build_fetch_plan` is `pub` and called from
179/// `doiget-mcp` / `doiget-cli`, whose signatures are out of scope for
180/// this change batch.
181///
182/// # Errors
183///
184/// Returns [`FetchError::SourceSchema`] if the in-crate allowlist
185/// builders ([`oa_publisher_allowlist`] / [`tier_1_allowlist`]) stop
186/// returning the `"oa-publisher"` / `"arxiv"` source keys this function
187/// looks up — an internal-contract drift bug, surfaced rather than
188/// panicked (issue #156 ②). The in-crate tests pin the keys so this is
189/// unreachable in a correct build.
190pub fn try_build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> Result<FetchPlan, FetchError> {
191    let safekey = ref_.safekey();
192    let target_pdf_path = store_root.join(format!("{}.pdf", safekey.as_str()));
193    let target_metadata_path = store_root
194        .join(".metadata")
195        .join(format!("{}.toml", safekey.as_str()));
196
197    let (metadata_sources, pdf_sources) = match ref_ {
198        Ref::Doi(_) => {
199            // Internal contract: `oa_publisher_allowlist()` MUST always
200            // return an entry whose `.source == "oa-publisher"`. A silent
201            // `.unwrap_or_default()` here would mask drift between this
202            // function and the allowlist source-of-truth — the resulting
203            // empty `candidate_hosts` would mislead an agent into
204            // believing the OA leg has no allowed hosts. Issue #156 ②:
205            // surface the drift as a typed error rather than `.expect()`
206            // panicking `doiget plan`.
207            let oa_hosts = oa_publisher_allowlist()
208                .into_iter()
209                .find(|a: &SourceAllowlist| a.source == "oa-publisher")
210                .map(|a| a.redirect_hosts)
211                .ok_or_else(|| FetchError::SourceSchema {
212                    hint: "internal-contract drift: oa-publisher allowlist \
213                           missing (see crates/doiget-core/src/http.rs::\
214                           oa_publisher_allowlist); build_fetch_plan and \
215                           oa_publisher_allowlist have drifted"
216                        .to_string(),
217                })?;
218            (
219                vec!["crossref".to_string(), "unpaywall".to_string()],
220                vec![PdfSourcePlan {
221                    key: "oa-publisher".to_string(),
222                    candidate_hosts: oa_hosts,
223                }],
224            )
225        }
226        Ref::Arxiv(_) => {
227            // Same internal-contract rationale as the DOI branch above.
228            let arxiv_hosts = tier_1_allowlist()
229                .into_iter()
230                .find(|a: &SourceAllowlist| a.source == "arxiv")
231                .map(|a| a.redirect_hosts)
232                .ok_or_else(|| FetchError::SourceSchema {
233                    hint: "internal-contract drift: tier-1 allowlist missing \
234                           'arxiv' (see crates/doiget-core/src/http.rs::\
235                           tier_1_allowlist); build_fetch_plan and \
236                           tier_1_allowlist have drifted"
237                        .to_string(),
238                })?;
239            (
240                Vec::<String>::new(),
241                vec![PdfSourcePlan {
242                    key: "arxiv".to_string(),
243                    candidate_hosts: arxiv_hosts,
244                }],
245            )
246        }
247    };
248
249    // Slice 5 (PR #84 advisory item A6): derive the loaded-allowlist
250    // list from the same `tier_1_allowlist()` + `oa_publisher_allowlist()`
251    // functions the production `HttpClient` is composed from. A
252    // hardcoded `vec![...]` here would silently drift if a future slice
253    // adds a new allowlist source to the production client — the wire
254    // shape would still claim only the old four.
255    let redirect_allowlists_loaded: Vec<String> = tier_1_allowlist()
256        .iter()
257        .chain(oa_publisher_allowlist().iter())
258        .map(|a| a.source.clone())
259        .collect();
260
261    Ok(FetchPlan {
262        metadata_sources,
263        pdf_sources,
264        redirect_allowlists_loaded,
265        target_pdf_path,
266        target_metadata_path,
267        would_append_provenance: true,
268        // Always `true` in Phase 1 per ADR-0022 §4 ("Honesty about
269        // candidate uncertainty"): `candidate_hosts` is the static
270        // resolver allowlist, NOT a prediction of the single host the
271        // real fetch would touch. Surfaced on the wire so agents can
272        // detect the upper-bound semantics without parsing the spec.
273        candidate_hosts_are_upper_bound: true,
274    })
275}
276
277/// Build the dry-run envelope as a `serde_json::Value`, without writing
278/// anywhere. Used by both the CLI (which prints it to stdout) and the
279/// MCP tool wrapper (which routes the bytes via JSON-RPC). Wire shape:
280///
281/// ```jsonc
282/// {
283///   "ok": true,
284///   "dry_run": true,
285///   "ref": { "doi": "10.1234/foo" } | { "arxiv": "2401.12345" },
286///   "plan": { ... see FetchPlan ... },
287///   "rate_limit_budget": { "global_per_sec": 5.0, "per_source_min_gap_ms": 200 }
288/// }
289/// ```
290pub fn build_dry_run_envelope(ref_: &Ref, plan: &FetchPlan) -> serde_json::Value {
291    serde_json::json!({
292        "ok": true,
293        "dry_run": true,
294        "ref": ref_kind_object(ref_),
295        "plan": plan,
296        "rate_limit_budget": rate_limit_budget(),
297    })
298}
299
300/// Build the `ref` field of the dry-run envelope per ADR-0022 §1:
301/// `{"doi": "10.1234/foo"}` for a DOI ref, `{"arxiv": "2401.12345"}` for
302/// an arXiv ref. We intentionally do NOT serialize the full `Ref` enum
303/// (which would emit `{"kind":"doi","id":"10.1234/foo"}` per the
304/// internally-tagged `#[serde(tag,content)]` form), because the wire
305/// shape in the ADR uses a flat single-key object.
306fn ref_kind_object(ref_: &Ref) -> serde_json::Value {
307    match ref_ {
308        Ref::Doi(d) => serde_json::json!({ "doi": d.as_str() }),
309        Ref::Arxiv(a) => serde_json::json!({ "arxiv": a.as_str() }),
310    }
311}
312
313// ---------------------------------------------------------------------------
314// Tests
315// ---------------------------------------------------------------------------
316
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::{ArxivId, Doi};

    /// Store root used by the tests; never created on disk because the
    /// plan builders are purely computational.
    fn temp_root() -> Utf8PathBuf {
        Utf8PathBuf::from("/tmp/doiget-test-store")
    }

    /// DOI ref wrapping the given identifier.
    fn doi_ref(id: &str) -> Ref {
        Ref::Doi(Doi(id.to_string()))
    }

    /// arXiv ref wrapping the given identifier.
    fn arxiv_ref(id: &str) -> Ref {
        Ref::Arxiv(ArxivId(id.to_string()))
    }

    #[test]
    fn doi_plan_carries_crossref_and_unpaywall_metadata_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());

        assert_eq!(plan.metadata_sources, vec!["crossref", "unpaywall"]);
        assert_eq!(plan.pdf_sources.len(), 1);

        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "oa-publisher");
        assert!(
            !source.candidate_hosts.is_empty(),
            "OA publisher hosts must be populated"
        );
        assert!(plan.would_append_provenance);
    }

    #[test]
    fn arxiv_plan_has_empty_metadata_sources_and_arxiv_pdf_source() {
        let plan = build_fetch_plan(&arxiv_ref("2401.12345"), &temp_root());

        assert!(plan.metadata_sources.is_empty());
        assert_eq!(plan.pdf_sources.len(), 1);

        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "arxiv");
        assert!(source.candidate_hosts.iter().any(|host| host == "arxiv.org"));
    }

    #[test]
    fn plan_target_paths_are_safekey_derived() {
        let root = Utf8PathBuf::from("/tmp/store");
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &root);

        let expected_pdf = root.join("doi_10.1234_example.pdf");
        let expected_metadata = root.join(".metadata").join("doi_10.1234_example.toml");
        assert_eq!(plan.target_pdf_path, expected_pdf);
        assert_eq!(plan.target_metadata_path, expected_metadata);
    }

    #[test]
    fn dry_run_envelope_has_top_level_ok_dry_run_and_rate_budget() {
        let r = doi_ref("10.1234/foo");
        let plan = build_fetch_plan(&r, &temp_root());
        let env = build_dry_run_envelope(&r, &plan);

        assert_eq!(env["ok"], serde_json::json!(true));
        assert_eq!(env["dry_run"], serde_json::json!(true));
        assert_eq!(env["ref"], serde_json::json!({ "doi": "10.1234/foo" }));

        let budget = &env["rate_limit_budget"];
        assert_eq!(budget["global_per_sec"], serde_json::json!(5.0));
        assert_eq!(budget["per_source_min_gap_ms"], serde_json::json!(200));
    }

    #[test]
    fn dry_run_envelope_arxiv_ref_uses_arxiv_key() {
        let r = arxiv_ref("2401.12345");
        let plan = build_fetch_plan(&r, &temp_root());
        let env = build_dry_run_envelope(&r, &plan);

        assert_eq!(env["ref"], serde_json::json!({ "arxiv": "2401.12345" }));
    }

    #[test]
    fn fetch_plan_carries_candidate_hosts_are_upper_bound_true() {
        // ADR-0022 §4 ("Honesty about candidate uncertainty"): the flag
        // is always `true` in Phase 1 and has to show up inside `plan`
        // on the wire, so agents can detect the upper-bound semantics
        // without consulting the spec.
        let r = doi_ref("10.1234/example");
        let plan = build_fetch_plan(&r, &temp_root());
        assert!(plan.candidate_hosts_are_upper_bound);

        let env = build_dry_run_envelope(&r, &plan);
        assert_eq!(
            env["plan"]["candidate_hosts_are_upper_bound"],
            serde_json::json!(true),
            "plan.candidate_hosts_are_upper_bound must be true on the \
             wire (ADR-0022 §4); got: {env}"
        );
    }

    #[test]
    fn try_build_fetch_plan_ok_matches_build_fetch_plan() {
        // Issue #156 ②: while the allowlist invariant holds (i.e. in any
        // correct build), the fallible builder and the infallible
        // wrapper must agree on the plan.
        let refs = [doi_ref("10.1234/example"), arxiv_ref("2401.12345")];
        for r in refs {
            let root = temp_root();
            let fallible = try_build_fetch_plan(&r, &root).expect("invariant holds");
            let infallible = build_fetch_plan(&r, &root);

            assert_eq!(fallible.metadata_sources, infallible.metadata_sources);

            let (lhs, rhs) = (&fallible.pdf_sources[0], &infallible.pdf_sources[0]);
            assert_eq!(lhs.key, rhs.key);
            assert_eq!(lhs.candidate_hosts, rhs.candidate_hosts);
            assert!(
                !lhs.candidate_hosts.is_empty(),
                "happy-path candidate_hosts must be populated, not the \
                 degraded empty fallback"
            );
        }
    }

    #[test]
    fn redirect_allowlists_loaded_lists_all_four_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());

        // Every one of the four allowlist entries has to be listed, in
        // the order the production `build_http_client` composes them.
        assert_eq!(
            plan.redirect_allowlists_loaded,
            vec!["crossref", "unpaywall", "arxiv", "oa-publisher"]
        );
    }
}