Skip to main content

doiget_core/
dry_run.rs

1//! Dry-run preview shape for `--dry-run` CLI fetches and the
2//! `doiget_metadata_only` / `doiget_fetch_paper` MCP tools.
3//!
4//! Binding spec: [`docs/DECISIONS/0022-dry-run-mode.md`](../../../docs/DECISIONS/0022-dry-run-mode.md)
5//! §1 (NORMATIVE wire shape) and [`docs/MCP_TOOLS.md`](../../../docs/MCP_TOOLS.md)
6//! §10 (MCP envelope mirror).
7//!
8//! The types here live in `doiget-core` rather than `doiget-cli` so that
9//! both `doiget-cli` (the `--dry-run` flag path) and `doiget-mcp` (the
10//! `dry_run: true` tool variants) can serialize bit-identical envelopes
11//! without `doiget-mcp` having to depend on `doiget-cli` (which would
12//! invert the existing `doiget-cli -> doiget-mcp` wiring).
13//!
14//! ## Honesty about candidate uncertainty
15//!
16//! The `pdf_sources[].candidate_hosts` list is the **static allowlist**
17//! for the named resolver, not the host the actual fetch would have hit.
18//! doiget cannot know the post-Unpaywall OA URL host without making the
19//! Unpaywall network call, and `--dry-run` MUST NOT make it. The preview
20//! is therefore an *upper-bound* on the hosts a real fetch could touch,
21//! not a prediction of the single host it would touch (ADR-0022 §4).
22
23use camino::{Utf8Path, Utf8PathBuf};
24use serde::Serialize;
25
26use crate::http::{oa_publisher_allowlist, tier_1_allowlist, SourceAllowlist};
27use crate::{RateLimits, Ref};
28
29/// Per-PDF-source row inside [`FetchPlan::pdf_sources`].
30///
31/// `candidate_hosts` is the static allowlist for the named resolver, not
32/// a prediction of the single host the real fetch would touch — see
33/// [module docs](self) and ADR-0022 §4 ("Honesty about candidate
34/// uncertainty").
35#[derive(Debug, Clone, Serialize)]
36pub struct PdfSourcePlan {
37    /// Resolver source key (e.g. `"oa-publisher"`, `"arxiv"`).
38    pub key: String,
39    /// Allowlist hosts the real fetch would be permitted to touch.
40    pub candidate_hosts: Vec<String>,
41}
42
43/// Per-process rate-limit context surfaced alongside [`FetchPlan`] so an
44/// agent can predict the politeness ceiling without a separate
45/// `doiget_capability_profile` round-trip.
46#[derive(Debug, Clone, Copy, Serialize)]
47pub struct RateLimitBudget {
48    /// Process-wide cap (matches [`RateLimits::HARD_CODED`]).
49    pub global_per_sec: f32,
50    /// Per-source minimum gap between consecutive requests, ms.
51    pub per_source_min_gap_ms: u64,
52}
53
54/// Structured dry-run preview returned by `--dry-run` and the
55/// `dry_run: true` MCP variants. Wire shape matches ADR-0022 §1 and
56/// `docs/MCP_TOOLS.md` §10.
57#[derive(Debug, Clone, Serialize)]
58pub struct FetchPlan {
59    /// Metadata sources the real fetch would consult, in dispatch order.
60    pub metadata_sources: Vec<String>,
61    /// PDF sources the real fetch could attempt. `candidate_hosts` is an
62    /// upper-bound on the hosts a real fetch would touch (see
63    /// [`PdfSourcePlan`]).
64    pub pdf_sources: Vec<PdfSourcePlan>,
65    /// Source keys whose redirect allowlists are loaded into the HTTP
66    /// client. Useful for validating `CapabilityProfile` configuration
67    /// drift.
68    pub redirect_allowlists_loaded: Vec<String>,
69    /// Where the PDF would land on disk (always `<root>/<safekey>.pdf`).
70    pub target_pdf_path: Utf8PathBuf,
71    /// Where the metadata TOML would land
72    /// (always `<root>/.metadata/<safekey>.toml`).
73    pub target_metadata_path: Utf8PathBuf,
74    /// `true` in Phase 1+ — every successful fetch appends a provenance
75    /// row. Named explicitly so future fetch modes can declare "this
76    /// fetch would NOT append" without inverting the flag's meaning
77    /// (ADR-0022 §1).
78    pub would_append_provenance: bool,
79    /// Always `true` in Phase 1: [`PdfSourcePlan::candidate_hosts`] is the
80    /// **static allowlist** for the resolver, NOT a prediction of the
81    /// single host the real fetch would touch. See ADR-0022 §4 ("Honesty
82    /// about candidate uncertainty"). The field is machine-parseable so
83    /// an agent can detect the upper-bound semantics without reading the
84    /// spec — encoding the §4 disclaimer into the wire shape itself.
85    pub candidate_hosts_are_upper_bound: bool,
86}
87
88/// Hard-coded rate-limit budget surfaced with every [`FetchPlan`] preview.
89/// Mirrors [`RateLimits::HARD_CODED`] / `docs/LEGAL.md` §6 safeguard 8.
90pub fn rate_limit_budget() -> RateLimitBudget {
91    RateLimitBudget {
92        global_per_sec: RateLimits::HARD_CODED.max_fetches_per_second(),
93        per_source_min_gap_ms: RateLimits::HARD_CODED.per_source_backoff_ms(),
94    }
95}
96
97/// Build the dry-run preview ([`FetchPlan`]) for the given ref and store
98/// root, without contacting the network or filesystem.
99///
100/// Per-ref-kind shape (ADR-0022 §1, NORMATIVE):
101///
102/// - **DOI** → `metadata_sources = ["crossref", "unpaywall"]`,
103///   `pdf_sources = [{ key: "oa-publisher", candidate_hosts: oa_publisher_allowlist hosts }]`.
104/// - **arXiv** → `metadata_sources = []`,
105///   `pdf_sources = [{ key: "arxiv", candidate_hosts: tier_1 arxiv hosts }]`.
106///
107/// `redirect_allowlists_loaded` always contains the four source keys the
108/// production HTTP client is built with (Tier 1 + the synthetic OA
109/// publisher), reflecting `doiget-cli::commands::fetch::build_http_client`'s
110/// composition.
111///
112/// # Panics
113///
114/// Panics with a self-documenting message if the in-crate allowlist
115/// builders ([`oa_publisher_allowlist`] / [`tier_1_allowlist`]) ever stop
116/// returning the source keys this function looks up. That signals an
117/// internal-contract drift bug, not a user error — fail-fast at preview
118/// time is preferable to silently emitting an empty `candidate_hosts`
119/// list. The workspace `clippy::expect_used` lint is `warn`-level
120/// (promoted to `deny` under `-D warnings`); the localized `#[allow]` is
121/// the minimal intervention here, mirroring the pattern in
122/// `crate::http::HttpClient::new_for_tests_allow_http`.
123#[allow(clippy::expect_used)]
124pub fn build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
125    let safekey = ref_.safekey();
126    let target_pdf_path = store_root.join(format!("{}.pdf", safekey.as_str()));
127    let target_metadata_path = store_root
128        .join(".metadata")
129        .join(format!("{}.toml", safekey.as_str()));
130
131    let (metadata_sources, pdf_sources) = match ref_ {
132        Ref::Doi(_) => {
133            // Internal contract: `oa_publisher_allowlist()` MUST always
134            // return an entry whose `.source == "oa-publisher"`. Silent
135            // `.unwrap_or_default()` here would mask drift between this
136            // function and the allowlist source-of-truth — the resulting
137            // empty `candidate_hosts` would mislead an agent into
138            // believing the OA leg has no allowed hosts.
139            let oa_hosts = oa_publisher_allowlist()
140                .into_iter()
141                .find(|a: &SourceAllowlist| a.source == "oa-publisher")
142                .map(|a| a.redirect_hosts)
143                .expect(
144                    "oa-publisher allowlist must exist (see \
145                     crates/doiget-core/src/http.rs::oa_publisher_allowlist); \
146                     if this fires, build_fetch_plan and oa_publisher_allowlist \
147                     have drifted",
148                );
149            (
150                vec!["crossref".to_string(), "unpaywall".to_string()],
151                vec![PdfSourcePlan {
152                    key: "oa-publisher".to_string(),
153                    candidate_hosts: oa_hosts,
154                }],
155            )
156        }
157        Ref::Arxiv(_) => {
158            // Same internal-contract rationale as the DOI branch above.
159            let arxiv_hosts = tier_1_allowlist()
160                .into_iter()
161                .find(|a: &SourceAllowlist| a.source == "arxiv")
162                .map(|a| a.redirect_hosts)
163                .expect(
164                    "tier-1 allowlist must include 'arxiv' (see \
165                     crates/doiget-core/src/http.rs::tier_1_allowlist); \
166                     if this fires, build_fetch_plan and tier_1_allowlist \
167                     have drifted",
168                );
169            (
170                Vec::<String>::new(),
171                vec![PdfSourcePlan {
172                    key: "arxiv".to_string(),
173                    candidate_hosts: arxiv_hosts,
174                }],
175            )
176        }
177    };
178
179    // Slice 5 (PR #84 advisory item A6): derive the loaded-allowlist
180    // list from the same `tier_1_allowlist()` + `oa_publisher_allowlist()`
181    // functions the production `HttpClient` is composed from. A
182    // hardcoded `vec![...]` here would silently drift if a future slice
183    // adds a new allowlist source to the production client — the wire
184    // shape would still claim only the old four.
185    let redirect_allowlists_loaded: Vec<String> = tier_1_allowlist()
186        .iter()
187        .chain(oa_publisher_allowlist().iter())
188        .map(|a| a.source.clone())
189        .collect();
190
191    FetchPlan {
192        metadata_sources,
193        pdf_sources,
194        redirect_allowlists_loaded,
195        target_pdf_path,
196        target_metadata_path,
197        would_append_provenance: true,
198        // Always `true` in Phase 1 per ADR-0022 §4 ("Honesty about
199        // candidate uncertainty"): `candidate_hosts` is the static
200        // resolver allowlist, NOT a prediction of the single host the
201        // real fetch would touch. Surfaced on the wire so agents can
202        // detect the upper-bound semantics without parsing the spec.
203        candidate_hosts_are_upper_bound: true,
204    }
205}
206
207/// Build the dry-run envelope as a `serde_json::Value`, without writing
208/// anywhere. Used by both the CLI (which prints it to stdout) and the
209/// MCP tool wrapper (which routes the bytes via JSON-RPC). Wire shape:
210///
211/// ```jsonc
212/// {
213///   "ok": true,
214///   "dry_run": true,
215///   "ref": { "doi": "10.1234/foo" } | { "arxiv": "2401.12345" },
216///   "plan": { ... see FetchPlan ... },
217///   "rate_limit_budget": { "global_per_sec": 5.0, "per_source_min_gap_ms": 200 }
218/// }
219/// ```
220pub fn build_dry_run_envelope(ref_: &Ref, plan: &FetchPlan) -> serde_json::Value {
221    serde_json::json!({
222        "ok": true,
223        "dry_run": true,
224        "ref": ref_kind_object(ref_),
225        "plan": plan,
226        "rate_limit_budget": rate_limit_budget(),
227    })
228}
229
230/// Build the `ref` field of the dry-run envelope per ADR-0022 §1:
231/// `{"doi": "10.1234/foo"}` for a DOI ref, `{"arxiv": "2401.12345"}` for
232/// an arXiv ref. We intentionally do NOT serialize the full `Ref` enum
233/// (which would emit `{"kind":"doi","id":"10.1234/foo"}` per the
234/// internally-tagged `#[serde(tag,content)]` form), because the wire
235/// shape in the ADR uses a flat single-key object.
236fn ref_kind_object(ref_: &Ref) -> serde_json::Value {
237    match ref_ {
238        Ref::Doi(d) => serde_json::json!({ "doi": d.as_str() }),
239        Ref::Arxiv(a) => serde_json::json!({ "arxiv": a.as_str() }),
240    }
241}
242
243// ---------------------------------------------------------------------------
244// Tests
245// ---------------------------------------------------------------------------
246
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::{ArxivId, Doi};
    use serde_json::json;

    /// Store root for tests that do not care about the exact location.
    /// `build_fetch_plan` only joins onto it, so it need not exist.
    fn temp_root() -> Utf8PathBuf {
        Utf8PathBuf::from("/tmp/doiget-test-store")
    }

    /// Wrap a raw DOI string in a [`Ref`].
    fn doi_ref(raw: &str) -> Ref {
        Ref::Doi(Doi(raw.to_string()))
    }

    /// Wrap a raw arXiv identifier in a [`Ref`].
    fn arxiv_ref(raw: &str) -> Ref {
        Ref::Arxiv(ArxivId(raw.to_string()))
    }

    #[test]
    fn doi_plan_carries_crossref_and_unpaywall_metadata_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());

        assert_eq!(plan.metadata_sources, vec!["crossref", "unpaywall"]);
        assert_eq!(plan.pdf_sources.len(), 1);
        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "oa-publisher");
        assert!(
            !source.candidate_hosts.is_empty(),
            "OA publisher hosts must be populated"
        );
        assert!(plan.would_append_provenance);
    }

    #[test]
    fn arxiv_plan_has_empty_metadata_sources_and_arxiv_pdf_source() {
        let plan = build_fetch_plan(&arxiv_ref("2401.12345"), &temp_root());

        assert!(plan.metadata_sources.is_empty());
        assert_eq!(plan.pdf_sources.len(), 1);
        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "arxiv");
        assert!(source.candidate_hosts.iter().any(|host| host == "arxiv.org"));
    }

    #[test]
    fn plan_target_paths_are_safekey_derived() {
        let root = Utf8PathBuf::from("/tmp/store");
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &root);

        let expected_pdf = root.join("doi_10.1234_example.pdf");
        let expected_metadata = root.join(".metadata").join("doi_10.1234_example.toml");
        assert_eq!(plan.target_pdf_path, expected_pdf);
        assert_eq!(plan.target_metadata_path, expected_metadata);
    }

    #[test]
    fn dry_run_envelope_has_top_level_ok_dry_run_and_rate_budget() {
        let r = doi_ref("10.1234/foo");
        let plan = build_fetch_plan(&r, &temp_root());
        let env = build_dry_run_envelope(&r, &plan);

        assert_eq!(env["ok"], json!(true));
        assert_eq!(env["dry_run"], json!(true));
        assert_eq!(env["ref"], json!({ "doi": "10.1234/foo" }));

        let budget = &env["rate_limit_budget"];
        assert_eq!(budget["global_per_sec"], json!(5.0));
        assert_eq!(budget["per_source_min_gap_ms"], json!(200));
    }

    #[test]
    fn dry_run_envelope_arxiv_ref_uses_arxiv_key() {
        let r = arxiv_ref("2401.12345");
        let plan = build_fetch_plan(&r, &temp_root());
        let env = build_dry_run_envelope(&r, &plan);

        assert_eq!(env["ref"], json!({ "arxiv": "2401.12345" }));
    }

    #[test]
    fn fetch_plan_carries_candidate_hosts_are_upper_bound_true() {
        // ADR-0022 §4 ("Honesty about candidate uncertainty"): the flag
        // is fixed to `true` in Phase 1 and has to show up inside `plan`
        // on the wire, so agents can pick up the upper-bound semantics
        // without reading the spec.
        let r = doi_ref("10.1234/example");
        let plan = build_fetch_plan(&r, &temp_root());
        assert!(plan.candidate_hosts_are_upper_bound);

        let env = build_dry_run_envelope(&r, &plan);
        assert_eq!(
            env["plan"]["candidate_hosts_are_upper_bound"],
            json!(true),
            "plan.candidate_hosts_are_upper_bound must be true on the \
             wire (ADR-0022 §4); got: {env}"
        );
    }

    #[test]
    fn redirect_allowlists_loaded_lists_all_four_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());

        // Every allowlist entry of the production `build_http_client`
        // composition must be listed, in composition order.
        let expected = vec!["crossref", "unpaywall", "arxiv", "oa-publisher"];
        assert_eq!(plan.redirect_allowlists_loaded, expected);
    }
}