use camino::{Utf8Path, Utf8PathBuf};
use serde::Serialize;
use crate::http::{oa_publisher_allowlist, tier_1_allowlist, SourceAllowlist};
use crate::{RateLimits, Ref};
/// One PDF source that a fetch would consider, together with the hosts it
/// may be served from.
///
/// Serialized as part of [`FetchPlan`] in the dry-run output.
#[derive(Debug, Clone, Serialize)]
pub struct PdfSourcePlan {
/// Source identifier; `build_fetch_plan` sets this to `"oa-publisher"` for
/// DOI refs and `"arxiv"` for arXiv refs.
pub key: String,
/// Hosts copied from the matching allowlist entry's `redirect_hosts`.
pub candidate_hosts: Vec<String>,
}
/// Rate-limit figures reported alongside a dry-run plan.
///
/// Populated by `rate_limit_budget` from `RateLimits::HARD_CODED`.
#[derive(Debug, Clone, Copy, Serialize)]
pub struct RateLimitBudget {
/// Value of `RateLimits::HARD_CODED.max_fetches_per_second()`.
pub global_per_sec: f32,
/// Value of `RateLimits::HARD_CODED.per_source_backoff_ms()`, in
/// milliseconds.
pub per_source_min_gap_ms: u64,
}
/// Description of what a fetch for a single reference would do, as produced
/// by `build_fetch_plan` and embedded under `"plan"` in the dry-run envelope.
#[derive(Debug, Clone, Serialize)]
pub struct FetchPlan {
/// Metadata sources to consult: `["crossref", "unpaywall"]` for DOI refs,
/// empty for arXiv refs.
pub metadata_sources: Vec<String>,
/// PDF sources to try; `build_fetch_plan` currently emits exactly one.
pub pdf_sources: Vec<PdfSourcePlan>,
/// `source` names of every allowlist entry, tier-1 entries first, then the
/// OA-publisher entries.
pub redirect_allowlists_loaded: Vec<String>,
/// `<store_root>/<safekey>.pdf`.
pub target_pdf_path: Utf8PathBuf,
/// `<store_root>/.metadata/<safekey>.toml`.
pub target_metadata_path: Utf8PathBuf,
/// Always `true` in plans built by `build_fetch_plan`.
pub would_append_provenance: bool,
/// Always `true` in plans built by `build_fetch_plan`; the tests reference
/// ADR-0022 §4 for this flag.
// NOTE(review): presumably signals that a real fetch may contact only a
// subset of `candidate_hosts` — confirm against ADR-0022.
pub candidate_hosts_are_upper_bound: bool,
}
/// Reports the hard-coded rate limits as a serializable [`RateLimitBudget`].
///
/// Both figures are read from `RateLimits::HARD_CODED`; nothing is computed
/// or configurable here.
pub fn rate_limit_budget() -> RateLimitBudget {
    let global_per_sec = RateLimits::HARD_CODED.max_fetches_per_second();
    let per_source_min_gap_ms = RateLimits::HARD_CODED.per_source_backoff_ms();

    RateLimitBudget {
        global_per_sec,
        per_source_min_gap_ms,
    }
}
/// Builds the [`FetchPlan`] describing what fetching `ref_` into the store at
/// `store_root` would involve.
///
/// * DOI refs get `crossref` and `unpaywall` as metadata sources and a single
///   `oa-publisher` PDF source.
/// * arXiv refs get no metadata sources and a single `arxiv` PDF source.
///
/// Target paths are derived from the ref's safekey:
/// `<store_root>/<safekey>.pdf` and `<store_root>/.metadata/<safekey>.toml`.
///
/// # Panics
///
/// Panics if the allowlist consulted for the ref kind has no entry for the
/// expected source (`"oa-publisher"` or `"arxiv"`).
// `expect` is tolerated here because a missing entry means this function and
// the allowlist builders in `crate::http` have drifted apart, which is a
// programming error rather than a runtime condition.
#[allow(clippy::expect_used)]
pub fn build_fetch_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
    /// Returns the redirect hosts of the first allowlist entry named `source`.
    fn redirect_hosts_for(
        allowlists: impl IntoIterator<Item = SourceAllowlist>,
        source: &str,
    ) -> Option<Vec<String>> {
        allowlists
            .into_iter()
            .find(|entry| entry.source == source)
            .map(|entry| entry.redirect_hosts)
    }

    let safekey = ref_.safekey();
    let pdf_file = format!("{}.pdf", safekey.as_str());
    let metadata_file = format!("{}.toml", safekey.as_str());
    let target_pdf_path = store_root.join(pdf_file);
    let target_metadata_path = store_root.join(".metadata").join(metadata_file);

    let (metadata_sources, pdf_source) = match ref_ {
        Ref::Doi(_) => {
            let candidate_hosts = redirect_hosts_for(oa_publisher_allowlist(), "oa-publisher")
                .expect(
                    "oa-publisher allowlist must exist (see \
                     crates/doiget-core/src/http.rs::oa_publisher_allowlist); \
                     if this fires, build_fetch_plan and oa_publisher_allowlist \
                     have drifted",
                );
            (
                vec![String::from("crossref"), String::from("unpaywall")],
                PdfSourcePlan {
                    key: String::from("oa-publisher"),
                    candidate_hosts,
                },
            )
        }
        Ref::Arxiv(_) => {
            let candidate_hosts = redirect_hosts_for(tier_1_allowlist(), "arxiv").expect(
                "tier-1 allowlist must include 'arxiv' (see \
                 crates/doiget-core/src/http.rs::tier_1_allowlist); \
                 if this fires, build_fetch_plan and tier_1_allowlist \
                 have drifted",
            );
            (
                Vec::new(),
                PdfSourcePlan {
                    key: String::from("arxiv"),
                    candidate_hosts,
                },
            )
        }
    };

    // Tier-1 names come first, then the OA-publisher names; the order is
    // asserted by the tests.
    let redirect_allowlists_loaded: Vec<String> = tier_1_allowlist()
        .into_iter()
        .chain(oa_publisher_allowlist())
        .map(|entry| entry.source)
        .collect();

    FetchPlan {
        metadata_sources,
        pdf_sources: vec![pdf_source],
        redirect_allowlists_loaded,
        target_pdf_path,
        target_metadata_path,
        would_append_provenance: true,
        candidate_hosts_are_upper_bound: true,
    }
}
pub fn build_dry_run_envelope(ref_: &Ref, plan: &FetchPlan) -> serde_json::Value {
serde_json::json!({
"ok": true,
"dry_run": true,
"ref": ref_kind_object(ref_),
"plan": plan,
"rate_limit_budget": rate_limit_budget(),
})
}
/// Renders a reference as a one-entry JSON object keyed by its kind:
/// `{"doi": "<doi>"}` or `{"arxiv": "<id>"}`.
fn ref_kind_object(ref_: &Ref) -> serde_json::Value {
    let (kind, identifier) = match ref_ {
        Ref::Doi(doi) => ("doi", doi.as_str()),
        Ref::Arxiv(arxiv_id) => ("arxiv", arxiv_id.as_str()),
    };

    serde_json::json!({ kind: identifier })
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::{ArxivId, Doi};

    /// Store root for tests that do not care about the exact location; the
    /// plan builder only joins paths onto it.
    fn temp_root() -> Utf8PathBuf {
        Utf8PathBuf::from("/tmp/doiget-test-store")
    }

    /// Wraps `raw` in a DOI reference.
    fn doi_ref(raw: &str) -> Ref {
        Ref::Doi(Doi(raw.to_string()))
    }

    /// Wraps `raw` in an arXiv reference.
    fn arxiv_ref(raw: &str) -> Ref {
        Ref::Arxiv(ArxivId(raw.to_string()))
    }

    #[test]
    fn doi_plan_carries_crossref_and_unpaywall_metadata_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());

        assert_eq!(plan.metadata_sources, vec!["crossref", "unpaywall"]);
        assert_eq!(plan.pdf_sources.len(), 1);

        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "oa-publisher");
        assert!(
            !source.candidate_hosts.is_empty(),
            "OA publisher hosts must be populated"
        );
        assert!(plan.would_append_provenance);
    }

    #[test]
    fn arxiv_plan_has_empty_metadata_sources_and_arxiv_pdf_source() {
        let plan = build_fetch_plan(&arxiv_ref("2401.12345"), &temp_root());

        assert!(plan.metadata_sources.is_empty());
        assert_eq!(plan.pdf_sources.len(), 1);

        let source = &plan.pdf_sources[0];
        assert_eq!(source.key, "arxiv");
        assert!(source
            .candidate_hosts
            .contains(&String::from("arxiv.org")));
    }

    #[test]
    fn plan_target_paths_are_safekey_derived() {
        let root = Utf8PathBuf::from("/tmp/store");
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &root);

        let expected_pdf = root.join("doi_10.1234_example.pdf");
        let expected_metadata = root.join(".metadata").join("doi_10.1234_example.toml");

        assert_eq!(plan.target_pdf_path, expected_pdf);
        assert_eq!(plan.target_metadata_path, expected_metadata);
    }

    #[test]
    fn dry_run_envelope_has_top_level_ok_dry_run_and_rate_budget() {
        let reference = doi_ref("10.1234/foo");
        let plan = build_fetch_plan(&reference, &temp_root());
        let env = build_dry_run_envelope(&reference, &plan);

        assert_eq!(env["ok"], serde_json::json!(true));
        assert_eq!(env["dry_run"], serde_json::json!(true));
        assert_eq!(env["ref"], serde_json::json!({ "doi": "10.1234/foo" }));

        let budget = &env["rate_limit_budget"];
        assert_eq!(budget["global_per_sec"], serde_json::json!(5.0));
        assert_eq!(budget["per_source_min_gap_ms"], serde_json::json!(200));
    }

    #[test]
    fn dry_run_envelope_arxiv_ref_uses_arxiv_key() {
        let reference = arxiv_ref("2401.12345");
        let plan = build_fetch_plan(&reference, &temp_root());
        let env = build_dry_run_envelope(&reference, &plan);

        assert_eq!(env["ref"], serde_json::json!({ "arxiv": "2401.12345" }));
    }

    #[test]
    fn fetch_plan_carries_candidate_hosts_are_upper_bound_true() {
        let reference = doi_ref("10.1234/example");
        let plan = build_fetch_plan(&reference, &temp_root());
        assert!(plan.candidate_hosts_are_upper_bound);

        // The flag must also survive serialization into the envelope.
        let env = build_dry_run_envelope(&reference, &plan);
        assert_eq!(
            env["plan"]["candidate_hosts_are_upper_bound"],
            serde_json::json!(true),
            "plan.candidate_hosts_are_upper_bound must be true on the \
             wire (ADR-0022 §4); got: {env}"
        );
    }

    #[test]
    fn redirect_allowlists_loaded_lists_all_four_sources() {
        let plan = build_fetch_plan(&doi_ref("10.1234/example"), &temp_root());

        let expected = vec!["crossref", "unpaywall", "arxiv", "oa-publisher"];
        assert_eq!(plan.redirect_allowlists_loaded, expected);
    }
}