use camino::{Utf8Path, Utf8PathBuf};
use serde::Serialize;
use crate::http::{oa_publisher_allowlist, tier_1_allowlist, SourceAllowlist};
use crate::source::FetchError;
use crate::{RateLimits, Ref};
/// One PDF source listed in a [`FetchPlan`], together with the hosts recorded for it.
#[derive(Debug, Clone, Serialize)]
pub struct PdfSourcePlan {
/// Source identifier. The planners in this file emit `"oa-publisher"` for DOI
/// refs and `"arxiv"` for arXiv refs.
pub key: String,
/// Hosts copied from the matching allowlist entry's `redirect_hosts`.
/// Empty when [`build_fetch_plan`] had to fall back to its degraded plan.
pub candidate_hosts: Vec<String>,
}
/// Serializable snapshot of the rate limits reported alongside a dry-run plan.
///
/// Populated by [`rate_limit_budget`] from `RateLimits::HARD_CODED`.
#[derive(Debug, Clone, Copy, Serialize)]
pub struct RateLimitBudget {
/// Value of `RateLimits::HARD_CODED.max_fetches_per_second()`.
pub global_per_sec: f32,
/// Value of `RateLimits::HARD_CODED.per_source_backoff_ms()`; presumably a
/// minimum gap in milliseconds between requests to one source (confirm
/// against `RateLimits`).
pub per_source_min_gap_ms: u64,
}
/// Description of what a fetch for a single reference would involve.
///
/// Built by [`build_fetch_plan`] / [`try_build_fetch_plan`] and embedded under
/// the `"plan"` key of the dry-run envelope.
#[derive(Debug, Clone, Serialize)]
pub struct FetchPlan {
/// Metadata source names: `["crossref", "unpaywall"]` for DOI refs, empty for
/// arXiv refs.
pub metadata_sources: Vec<String>,
/// PDF sources for the ref; the planners in this file always emit exactly one.
pub pdf_sources: Vec<PdfSourcePlan>,
/// `source` names of every entry in `tier_1_allowlist()` followed by every
/// entry in `oa_publisher_allowlist()`.
pub redirect_allowlists_loaded: Vec<String>,
/// `<store_root>/<safekey>.pdf`.
pub target_pdf_path: Utf8PathBuf,
/// `<store_root>/.metadata/<safekey>.toml`.
pub target_metadata_path: Utf8PathBuf,
/// Always `true` in plans produced by this file.
pub would_append_provenance: bool,
/// Always `true` in plans produced by this file. Presumably tells consumers
/// that `candidate_hosts` is a superset of the hosts a real fetch would
/// contact — confirm against ADR-0022 §4, which this file's tests cite.
pub candidate_hosts_are_upper_bound: bool,
}
/// Returns the rate-limit figures reported in dry-run output.
///
/// Both values are read from `RateLimits::HARD_CODED`; nothing is configurable
/// at this call site.
pub fn rate_limit_budget() -> RateLimitBudget {
    let global_per_sec = RateLimits::HARD_CODED.max_fetches_per_second();
    let per_source_min_gap_ms = RateLimits::HARD_CODED.per_source_backoff_ms();

    RateLimitBudget {
        global_per_sec,
        per_source_min_gap_ms,
    }
}
/// Infallible counterpart of [`try_build_fetch_plan`].
///
/// Returns the plan from [`try_build_fetch_plan`] when it succeeds. If that
/// call fails, the error is discarded and a degraded plan is returned instead:
/// same metadata sources, PDF source key, target paths and allowlist names,
/// but with an empty `candidate_hosts` list.
///
/// * `ref_` - the reference (DOI or arXiv id) being planned.
/// * `store_root` - directory under which the target paths are derived.
pub fn build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
    // Happy path: the allowlists contain the expected entries.
    if let Ok(plan) = try_build_fetch_plan(ref_, store_root) {
        return plan;
    }

    // Degraded path: rebuild everything that does not depend on the missing
    // allowlist entry, and leave the host list empty.
    let key = ref_.safekey();
    let pdf_file = format!("{}.pdf", key.as_str());
    let metadata_file = format!("{}.toml", key.as_str());
    let target_pdf_path = store_root.join(pdf_file);
    let target_metadata_path = store_root.join(".metadata").join(metadata_file);

    let (metadata_sources, pdf_key): (Vec<String>, &str) = match ref_ {
        Ref::Doi(_) => (
            vec!["crossref".to_string(), "unpaywall".to_string()],
            "oa-publisher",
        ),
        Ref::Arxiv(_) => (Vec::new(), "arxiv"),
    };

    let fallback_source = PdfSourcePlan {
        key: pdf_key.to_string(),
        candidate_hosts: Vec::new(),
    };

    // Tier-1 names come first, then the OA-publisher names.
    let tier_1 = tier_1_allowlist();
    let oa_publisher = oa_publisher_allowlist();
    let redirect_allowlists_loaded: Vec<String> = tier_1
        .iter()
        .chain(oa_publisher.iter())
        .map(|allowlist| allowlist.source.clone())
        .collect();

    FetchPlan {
        metadata_sources,
        pdf_sources: vec![fallback_source],
        redirect_allowlists_loaded,
        target_pdf_path,
        target_metadata_path,
        would_append_provenance: true,
        candidate_hosts_are_upper_bound: true,
    }
}
/// Builds the fetch plan for `ref_`, failing if the expected allowlist entry is absent.
///
/// DOI refs get `crossref` + `unpaywall` as metadata sources and a single
/// `oa-publisher` PDF source whose hosts come from `oa_publisher_allowlist()`.
/// arXiv refs get no metadata sources and a single `arxiv` PDF source whose
/// hosts come from `tier_1_allowlist()`.
///
/// * `ref_` - the reference (DOI or arXiv id) being planned.
/// * `store_root` - directory under which the target paths are derived.
///
/// # Errors
///
/// Returns [`FetchError::SourceSchema`] when the allowlist consulted for the
/// ref kind has no entry with the expected `source` name.
pub fn try_build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> Result<FetchPlan, FetchError> {
    let key = ref_.safekey();
    let pdf_file = format!("{}.pdf", key.as_str());
    let metadata_file = format!("{}.toml", key.as_str());
    let target_pdf_path = store_root.join(pdf_file);
    let target_metadata_path = store_root.join(".metadata").join(metadata_file);

    // Resolve the per-kind pieces; the single PDF source is assembled below.
    let (metadata_sources, source_key, candidate_hosts) = match ref_ {
        Ref::Doi(_) => {
            let found = oa_publisher_allowlist()
                .into_iter()
                .find(|entry: &SourceAllowlist| entry.source == "oa-publisher");
            match found {
                Some(entry) => (
                    vec!["crossref".to_string(), "unpaywall".to_string()],
                    "oa-publisher",
                    entry.redirect_hosts,
                ),
                None => {
                    return Err(FetchError::SourceSchema {
                        hint: "internal-contract drift: oa-publisher allowlist \
                               missing (see crates/doiget-core/src/http.rs::\
                               oa_publisher_allowlist); build_fetch_plan and \
                               oa_publisher_allowlist have drifted"
                            .to_string(),
                    });
                }
            }
        }
        Ref::Arxiv(_) => {
            let found = tier_1_allowlist()
                .into_iter()
                .find(|entry: &SourceAllowlist| entry.source == "arxiv");
            match found {
                Some(entry) => (Vec::new(), "arxiv", entry.redirect_hosts),
                None => {
                    return Err(FetchError::SourceSchema {
                        hint: "internal-contract drift: tier-1 allowlist missing \
                               'arxiv' (see crates/doiget-core/src/http.rs::\
                               tier_1_allowlist); build_fetch_plan and \
                               tier_1_allowlist have drifted"
                            .to_string(),
                    });
                }
            }
        }
    };

    let pdf_sources = vec![PdfSourcePlan {
        key: source_key.to_string(),
        candidate_hosts,
    }];

    // Tier-1 names come first, then the OA-publisher names.
    let tier_1 = tier_1_allowlist();
    let oa_publisher = oa_publisher_allowlist();
    let redirect_allowlists_loaded: Vec<String> = tier_1
        .iter()
        .chain(oa_publisher.iter())
        .map(|allowlist| allowlist.source.clone())
        .collect();

    Ok(FetchPlan {
        metadata_sources,
        pdf_sources,
        redirect_allowlists_loaded,
        target_pdf_path,
        target_metadata_path,
        would_append_provenance: true,
        candidate_hosts_are_upper_bound: true,
    })
}
pub fn build_dry_run_envelope(ref_: &Ref, plan: &FetchPlan) -> serde_json::Value {
serde_json::json!({
"ok": true,
"dry_run": true,
"ref": ref_kind_object(ref_),
"plan": plan,
"rate_limit_budget": rate_limit_budget(),
})
}
/// Renders a reference as a single-key JSON object: `{"doi": …}` or `{"arxiv": …}`.
fn ref_kind_object(ref_: &Ref) -> serde_json::Value {
    match ref_ {
        Ref::Doi(doi) => {
            let id = doi.as_str();
            serde_json::json!({ "doi": id })
        }
        Ref::Arxiv(arxiv) => {
            let id = arxiv.as_str();
            serde_json::json!({ "arxiv": id })
        }
    }
}
// Unit tests for plan construction and the dry-run envelope.
#[cfg(test)]
// `expect`/`unwrap`/`panic` lints are relaxed here; the tests below use
// `expect` on results they assert must succeed.
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
use super::*;
use crate::{ArxivId, Doi};
// Fixed store root shared by most tests; the planners in this file only join
// path segments onto it.
fn temp_root() -> Utf8PathBuf {
Utf8PathBuf::from("/tmp/doiget-test-store")
}
// A DOI ref plans Crossref + Unpaywall metadata and one `oa-publisher` PDF
// source with a non-empty host list.
#[test]
fn doi_plan_carries_crossref_and_unpaywall_metadata_sources() {
let r = Ref::Doi(Doi("10.1234/example".to_string()));
let plan = build_fetch_plan(&r, &temp_root());
assert_eq!(plan.metadata_sources, vec!["crossref", "unpaywall"]);
assert_eq!(plan.pdf_sources.len(), 1);
assert_eq!(plan.pdf_sources[0].key, "oa-publisher");
assert!(
!plan.pdf_sources[0].candidate_hosts.is_empty(),
"OA publisher hosts must be populated"
);
assert!(plan.would_append_provenance);
}
// An arXiv ref plans no metadata sources and one `arxiv` PDF source whose
// hosts include `arxiv.org`.
#[test]
fn arxiv_plan_has_empty_metadata_sources_and_arxiv_pdf_source() {
let r = Ref::Arxiv(ArxivId("2401.12345".to_string()));
let plan = build_fetch_plan(&r, &temp_root());
assert!(plan.metadata_sources.is_empty());
assert_eq!(plan.pdf_sources.len(), 1);
assert_eq!(plan.pdf_sources[0].key, "arxiv");
assert!(plan.pdf_sources[0]
.candidate_hosts
.iter()
.any(|h| h == "arxiv.org"));
}
// Target paths are `<root>/<safekey>.pdf` and `<root>/.metadata/<safekey>.toml`;
// this also pins the safekey produced for the sample DOI.
#[test]
fn plan_target_paths_are_safekey_derived() {
let r = Ref::Doi(Doi("10.1234/example".to_string()));
let root = Utf8PathBuf::from("/tmp/store");
let plan = build_fetch_plan(&r, &root);
assert_eq!(plan.target_pdf_path, root.join("doi_10.1234_example.pdf"));
assert_eq!(
plan.target_metadata_path,
root.join(".metadata").join("doi_10.1234_example.toml")
);
}
// The envelope exposes `ok`, `dry_run`, the `ref` object, and the hard-coded
// rate-limit budget (5.0 per second, 200 ms gap) at the top level.
#[test]
fn dry_run_envelope_has_top_level_ok_dry_run_and_rate_budget() {
let r = Ref::Doi(Doi("10.1234/foo".to_string()));
let plan = build_fetch_plan(&r, &temp_root());
let env = build_dry_run_envelope(&r, &plan);
assert_eq!(env["ok"], serde_json::json!(true));
assert_eq!(env["dry_run"], serde_json::json!(true));
assert_eq!(env["ref"], serde_json::json!({ "doi": "10.1234/foo" }));
assert_eq!(
env["rate_limit_budget"]["global_per_sec"],
serde_json::json!(5.0)
);
assert_eq!(
env["rate_limit_budget"]["per_source_min_gap_ms"],
serde_json::json!(200)
);
}
// arXiv refs are rendered under the `arxiv` key of the `ref` object.
#[test]
fn dry_run_envelope_arxiv_ref_uses_arxiv_key() {
let r = Ref::Arxiv(ArxivId("2401.12345".to_string()));
let plan = build_fetch_plan(&r, &temp_root());
let env = build_dry_run_envelope(&r, &plan);
assert_eq!(env["ref"], serde_json::json!({ "arxiv": "2401.12345" }));
}
// `candidate_hosts_are_upper_bound` is true both on the struct and in the
// serialized envelope.
#[test]
fn fetch_plan_carries_candidate_hosts_are_upper_bound_true() {
let r = Ref::Doi(Doi("10.1234/example".to_string()));
let plan = build_fetch_plan(&r, &temp_root());
assert!(plan.candidate_hosts_are_upper_bound);
let env = build_dry_run_envelope(&r, &plan);
assert_eq!(
env["plan"]["candidate_hosts_are_upper_bound"],
serde_json::json!(true),
"plan.candidate_hosts_are_upper_bound must be true on the \
wire (ADR-0022 §4); got: {env}"
);
}
// With intact allowlists the fallible and infallible builders agree, and the
// host list is the real one rather than the empty degraded fallback.
#[test]
fn try_build_fetch_plan_ok_matches_build_fetch_plan() {
for r in [
Ref::Doi(Doi("10.1234/example".to_string())),
Ref::Arxiv(ArxivId("2401.12345".to_string())),
] {
let root = temp_root();
let fallible = try_build_fetch_plan(&r, &root).expect("invariant holds");
let infallible = build_fetch_plan(&r, &root);
assert_eq!(fallible.metadata_sources, infallible.metadata_sources);
assert_eq!(fallible.pdf_sources[0].key, infallible.pdf_sources[0].key);
assert_eq!(
fallible.pdf_sources[0].candidate_hosts,
infallible.pdf_sources[0].candidate_hosts
);
assert!(
!fallible.pdf_sources[0].candidate_hosts.is_empty(),
"happy-path candidate_hosts must be populated, not the \
degraded empty fallback"
);
}
}
// Pins the order of allowlist names: tier-1 entries first, then OA-publisher.
#[test]
fn redirect_allowlists_loaded_lists_all_four_sources() {
let r = Ref::Doi(Doi("10.1234/example".to_string()));
let plan = build_fetch_plan(&r, &temp_root());
assert_eq!(
plan.redirect_allowlists_loaded,
vec!["crossref", "unpaywall", "arxiv", "oa-publisher"]
);
}
}