#![allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
use camino::{Utf8Path, Utf8PathBuf};
use doiget_cli::commands::fetch;
use doiget_core::provenance::{LogEvent, LogResult, LogRow};
use doiget_core::store::Metadata;
use serial_test::serial;
use tempfile::TempDir;
use wiremock::matchers::{method, path};
use wiremock::{Mock, MockServer, ResponseTemplate};
mod common;
use common::env_guard::EnvGuard;
/// Environment variable names handed to `EnvGuard::new` by each test below.
/// The tests only set a subset of them (store root, log path, and the
/// Crossref / Unpaywall / OA-publisher base URLs).
// NOTE(review): presumably the guard snapshots and restores these keys so
// tests do not leak environment state — confirm in `common::env_guard`.
const ENV_KEYS: &[&str] = &[
"DOIGET_STORE_ROOT",
"DOIGET_LOG_PATH",
"DOIGET_ARXIV_BASE",
"DOIGET_CROSSREF_BASE",
"DOIGET_UNPAYWALL_BASE",
"DOIGET_OA_PUBLISHER_BASE",
"DOIGET_CONTACT_EMAIL",
"DOIGET_UNPAYWALL_EMAIL",
];
/// Synthetic DOI fetched by every test; used verbatim in the mocked
/// `/works/...` route and in the Unpaywall response body.
const TEST_DOI: &str = "10.1234/test";
/// `TEST_DOI` with its `/` percent-encoded as `%2F`; used in the mocked
/// `/v2/...` route.
const TEST_DOI_ENCODED: &str = "10.1234%2Ftest";
fn read_log_rows(path: &Utf8PathBuf) -> Vec<LogRow> {
let raw = std::fs::read_to_string(path.as_std_path()).expect("read log");
raw.lines()
.filter(|l| !l.is_empty())
.map(|l| serde_json::from_str::<LogRow>(l).expect("valid LogRow"))
.collect()
}
/// Canned JSON body served by the mock server on the `/works/...` route
/// (the route the tests point `DOIGET_CROSSREF_BASE` at).
fn crossref_body() -> serde_json::Value {
    // The bibliographic record nested under the top-level "message" key.
    let message = serde_json::json!({
        "title": ["E2E OA test paper"],
        "author": [{ "family": "Doe", "given": "Jane" }],
        "issued": { "date-parts": [[2026, 1, 1]] },
        "container-title": ["Synthetic Journal"],
        "type": "journal-article"
    });
    serde_json::json!({
        "status": "ok",
        "message": message
    })
}
/// Canned JSON body served by the mock server on the `/v2/...` route
/// (the route the tests point `DOIGET_UNPAYWALL_BASE` at).
///
/// `oa_url_for_pdf` is written into both the `url` and `url_for_pdf` fields
/// of `best_oa_location`.
fn unpaywall_body(oa_url_for_pdf: &str) -> serde_json::Value {
    let best_location = serde_json::json!({
        "url": oa_url_for_pdf,
        "url_for_pdf": oa_url_for_pdf,
        "license": "cc-by"
    });
    serde_json::json!({
        "doi": TEST_DOI,
        "is_oa": true,
        "title": "E2E OA test paper",
        "best_oa_location": best_location
    })
}
/// Happy path: the mock server answers the `/works/...`, `/v2/...` and
/// `/oa/file.pdf` routes, `fetch::run_with_options` is run for the test DOI,
/// and the test then verifies the stored PDF bytes, the metadata TOML, the
/// per-source `Fetch`/`Ok` log rows, and the log's hash chain.
#[tokio::test]
#[serial]
async fn fetch_doi_oa_pdf_happy_path() {
    let mock_server = MockServer::start().await;
    let base = mock_server.uri();
    let oa_url = format!("{}/oa/file.pdf", base);
    let pdf_body = b"%PDF-fake-bytes\n".to_vec();

    // Metadata route (unencoded DOI in the path).
    let crossref_mock = Mock::given(method("GET"))
        .and(path(format!("/works/{}", TEST_DOI)))
        .respond_with(ResponseTemplate::new(200).set_body_json(crossref_body()));
    crossref_mock.mount(&mock_server).await;

    // OA-location route (percent-encoded DOI in the path), pointing at the PDF below.
    let unpaywall_mock = Mock::given(method("GET"))
        .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
        .respond_with(ResponseTemplate::new(200).set_body_json(unpaywall_body(&oa_url)));
    unpaywall_mock.mount(&mock_server).await;

    // The PDF itself.
    let pdf_mock = Mock::given(method("GET"))
        .and(path("/oa/file.pdf"))
        .respond_with(ResponseTemplate::new(200).set_body_bytes(pdf_body.clone()));
    pdf_mock.mount(&mock_server).await;

    let scratch = TempDir::new().expect("tempdir");
    let scratch_root: Utf8PathBuf = Utf8Path::from_path(scratch.path())
        .expect("temp dir is utf-8")
        .to_path_buf();
    let store_root = scratch_root.join("papers");
    let log_path = scratch_root.join("log.jsonl");

    // Point every endpoint the test uses at the mock server.
    let guard = EnvGuard::new(ENV_KEYS);
    let unpaywall_base = format!("{}/v2", base);
    for (key, value) in [
        ("DOIGET_STORE_ROOT", store_root.as_str()),
        ("DOIGET_LOG_PATH", log_path.as_str()),
        ("DOIGET_CROSSREF_BASE", base.as_str()),
        ("DOIGET_UNPAYWALL_BASE", unpaywall_base.as_str()),
        ("DOIGET_OA_PUBLISHER_BASE", base.as_str()),
    ] {
        guard.set(key, value);
    }

    let reference = format!("doi:{}", TEST_DOI);
    fetch::run_with_options(reference, false)
        .await
        .expect("fetch::run_with_options succeeds");

    // --- stored PDF ---
    let pdf_path = store_root.join("doi_10.1234_test.pdf");
    // Only evaluated when the assertion fails, to aid debugging.
    let list_tree = || {
        std::fs::read_dir(scratch_root.as_std_path())
            .map(|dir| dir.flatten().map(|entry| entry.path()).collect::<Vec<_>>())
    };
    assert!(
        pdf_path.exists(),
        "expected PDF at {pdf_path}; tree: {:?}",
        list_tree()
    );
    let stored = std::fs::read(pdf_path.as_std_path()).expect("read pdf");
    assert_eq!(stored, pdf_body, "stored PDF must match wiremock body");
    assert!(
        stored.starts_with(b"%PDF-"),
        "PDF must start with magic bytes"
    );

    // --- metadata TOML ---
    let meta_path = store_root.join(".metadata").join("doi_10.1234_test.toml");
    let meta_text = std::fs::read_to_string(meta_path.as_std_path()).expect("read metadata toml");
    let metadata: Metadata = toml::from_str(&meta_text).expect("metadata round-trips");
    assert_eq!(metadata.schema_version, "1.0");
    let doiget = metadata.doiget.expect("[doiget] table present");
    assert_eq!(doiget.source, "oa-publisher");
    assert_eq!(doiget.size_bytes, pdf_body.len() as u64);
    assert_eq!(doiget.license, "cc-by");
    assert_eq!(
        metadata.doi.map(|d| d.as_str().to_string()),
        Some(TEST_DOI.to_string())
    );

    // --- log rows: one successful Fetch per source ---
    let rows = read_log_rows(&log_path);
    let fetch_ok_rows: Vec<&LogRow> = rows
        .iter()
        .filter(|row| row.event == LogEvent::Fetch && row.result == LogResult::Ok)
        .collect();
    assert!(
        fetch_ok_rows.len() >= 3,
        "expected >=3 Fetch ok rows (crossref, unpaywall, oa-publisher); got {}: {:?}",
        fetch_ok_rows.len(),
        fetch_ok_rows
            .iter()
            .map(|row| row.source.as_deref().unwrap_or("?"))
            .collect::<Vec<_>>()
    );
    let sources: Vec<&str> = fetch_ok_rows
        .iter()
        .filter_map(|row| row.source.as_deref())
        .collect();
    assert!(
        sources.contains(&"crossref"),
        "expected a crossref Fetch ok row; got {:?}",
        sources
    );
    assert!(
        sources.contains(&"unpaywall"),
        "expected an unpaywall Fetch ok row; got {:?}",
        sources
    );
    assert!(
        sources.contains(&"oa-publisher"),
        "expected an oa-publisher Fetch ok row; got {:?}",
        sources
    );

    // --- hash chain: first row links to GENESIS, every later row to its predecessor ---
    assert_eq!(rows[0].prev_hash, "GENESIS");
    for (offset, pair) in rows.windows(2).enumerate() {
        // `i` is the index of the later row of the pair, as reported in the message.
        let i = offset + 1;
        assert_eq!(
            pair[1].prev_hash,
            pair[0].this_hash,
            "hash chain break at row {i}"
        );
    }

    drop(guard);
    drop(scratch);
}
/// The `/v2/...` route advertises a PDF on `attacker.test`, a host the mock
/// server does not serve. The fetch must fail with a `CliExit` of 3, write no
/// PDF, still write the metadata TOML (without `oa-publisher` as the source,
/// with `size_bytes == 0` and no `pdf_path`), and log exactly one
/// `Fetch`/`Err` row for `oa-publisher`.
#[tokio::test]
#[serial]
async fn fetch_doi_oa_pdf_falls_back_to_metadata_when_host_off_allowlist() {
    let mock_server = MockServer::start().await;
    let base = mock_server.uri();
    let blocked_oa_url = "https://attacker.test/file.pdf";

    let crossref_mock = Mock::given(method("GET"))
        .and(path(format!("/works/{}", TEST_DOI)))
        .respond_with(ResponseTemplate::new(200).set_body_json(crossref_body()));
    crossref_mock.mount(&mock_server).await;

    let unpaywall_mock = Mock::given(method("GET"))
        .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
        .respond_with(ResponseTemplate::new(200).set_body_json(unpaywall_body(blocked_oa_url)));
    unpaywall_mock.mount(&mock_server).await;

    let scratch = TempDir::new().expect("tempdir");
    let scratch_root: Utf8PathBuf = Utf8Path::from_path(scratch.path())
        .expect("temp dir is utf-8")
        .to_path_buf();
    let store_root = scratch_root.join("papers");
    let log_path = scratch_root.join("log.jsonl");

    let guard = EnvGuard::new(ENV_KEYS);
    let unpaywall_base = format!("{}/v2", base);
    for (key, value) in [
        ("DOIGET_STORE_ROOT", store_root.as_str()),
        ("DOIGET_LOG_PATH", log_path.as_str()),
        ("DOIGET_CROSSREF_BASE", base.as_str()),
        ("DOIGET_UNPAYWALL_BASE", unpaywall_base.as_str()),
        ("DOIGET_OA_PUBLISHER_BASE", base.as_str()),
    ] {
        guard.set(key, value);
    }

    // --- the fetch must fail, and fail with a typed exit code ---
    let reference = format!("doi:{}", TEST_DOI);
    let err = fetch::run_with_options(reference, false)
        .await
        .expect_err("a blocked OA PDF leg must NOT be a silent success (issue #145)");
    let cli_exit = err
        .downcast_ref::<fetch::CliExit>()
        .expect("blocked PDF leg must carry a CliExit so main maps it to a §4 exit code");
    assert_eq!(
        cli_exit.0, 3,
        "off-allowlist OA URL with NO redirect is now caught by the #163 \
         core PRE-FETCH allowlist check, which yields the SAME \
         HttpError::RedirectDenied → DenialContext(RedirectNotInAllowlist) \
         as a redirect-time denial. The #162 CLI classification promotes \
         this deliberate supply-chain policy block to CAPABILITY_DENIED → \
         exit 3 (#145 + #163; docs/ERRORS.md §6.1). It is NO LONGER a \
         generic NETWORK_ERROR / exit 1."
    );

    // --- no PDF on disk ---
    let pdf_path = store_root.join("doi_10.1234_test.pdf");
    assert!(
        !pdf_path.exists(),
        "PDF must NOT be written on off-allowlist host; found: {pdf_path}"
    );

    // --- metadata is still written, in its metadata-only shape ---
    let meta_path = store_root.join(".metadata").join("doi_10.1234_test.toml");
    assert!(
        meta_path.exists(),
        "metadata TOML must be written even when the PDF leg is denied; meta_path: {meta_path}"
    );
    let meta_text = std::fs::read_to_string(meta_path.as_std_path()).expect("read metadata toml");
    let metadata: Metadata = toml::from_str(&meta_text).expect("metadata round-trips");
    let doiget = metadata.doiget.expect("[doiget] table present");
    assert_ne!(
        doiget.source, "oa-publisher",
        "source must NOT be oa-publisher when the OA leg failed; got {:?}",
        doiget.source
    );
    assert_eq!(
        doiget.size_bytes, 0,
        "metadata-only fallback must report size_bytes = 0"
    );
    assert!(metadata.pdf_path.is_none(), "pdf_path must be unset");

    // --- exactly one failed oa-publisher Fetch row in the log ---
    let rows = read_log_rows(&log_path);
    let oa_err_rows: Vec<&LogRow> = rows
        .iter()
        .filter(|row| row.event == LogEvent::Fetch)
        .filter(|row| row.result == LogResult::Err)
        .filter(|row| row.source.as_deref() == Some("oa-publisher"))
        .collect();
    assert_eq!(
        oa_err_rows.len(),
        1,
        "expected exactly one Fetch err row for oa-publisher; got {:?}",
        rows.iter()
            .map(|row| (row.event, row.result, row.source.clone()))
            .collect::<Vec<_>>()
    );
    // NOTE(review): the exit-code assertion above describes this denial as
    // CAPABILITY_DENIED rather than NETWORK_ERROR, yet the log row is expected
    // to carry NETWORK_ERROR — confirm the log-level code is intentionally
    // different from the CLI-level classification.
    assert_eq!(
        oa_err_rows[0].error_code.as_deref(),
        Some("NETWORK_ERROR"),
        "fallback row must set error_code = NETWORK_ERROR"
    );

    drop(guard);
    drop(scratch);
}
/// No `/works/...` route is mounted, so only the `/v2/...` and `/oa/file.pdf`
/// routes answer. The fetch must still succeed, store the PDF, record
/// `oa-publisher` as the source, and use the DOI itself as the metadata title.
#[tokio::test]
#[serial]
async fn fetch_doi_crossref_down_unpaywall_oa_still_yields_pdf() {
    let mock_server = MockServer::start().await;
    let base = mock_server.uri();
    let oa_url = format!("{}/oa/file.pdf", base);
    let pdf_body = b"%PDF-fake-bytes\n".to_vec();

    // Deliberately no mock for the `/works/...` route in this test.
    let unpaywall_mock = Mock::given(method("GET"))
        .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
        .respond_with(ResponseTemplate::new(200).set_body_json(unpaywall_body(&oa_url)));
    unpaywall_mock.mount(&mock_server).await;

    let pdf_mock = Mock::given(method("GET"))
        .and(path("/oa/file.pdf"))
        .respond_with(ResponseTemplate::new(200).set_body_bytes(pdf_body.clone()));
    pdf_mock.mount(&mock_server).await;

    let scratch = TempDir::new().expect("tempdir");
    let scratch_root: Utf8PathBuf = Utf8Path::from_path(scratch.path())
        .expect("temp dir is utf-8")
        .to_path_buf();
    let store_root = scratch_root.join("papers");
    let log_path = scratch_root.join("log.jsonl");

    let guard = EnvGuard::new(ENV_KEYS);
    let unpaywall_base = format!("{}/v2", base);
    for (key, value) in [
        ("DOIGET_STORE_ROOT", store_root.as_str()),
        ("DOIGET_LOG_PATH", log_path.as_str()),
        ("DOIGET_CROSSREF_BASE", base.as_str()),
        ("DOIGET_UNPAYWALL_BASE", unpaywall_base.as_str()),
        ("DOIGET_OA_PUBLISHER_BASE", base.as_str()),
    ] {
        guard.set(key, value);
    }

    let reference = format!("doi:{}", TEST_DOI);
    fetch::run_with_options(reference, false)
        .await
        .expect("fetch must succeed via Unpaywall even though Crossref failed");

    // --- stored PDF ---
    let pdf_path = store_root.join("doi_10.1234_test.pdf");
    // Only evaluated when the assertion fails, to aid debugging.
    let list_tree = || {
        std::fs::read_dir(scratch_root.as_std_path())
            .map(|dir| dir.flatten().map(|entry| entry.path()).collect::<Vec<_>>())
    };
    assert!(
        pdf_path.exists(),
        "PDF must be written even though Crossref failed; tree: {:?}",
        list_tree()
    );
    let stored = std::fs::read(pdf_path.as_std_path()).expect("read pdf");
    assert_eq!(stored, pdf_body);

    // --- metadata TOML ---
    let meta_path = store_root.join(".metadata").join("doi_10.1234_test.toml");
    let meta_text = std::fs::read_to_string(meta_path.as_std_path()).expect("read metadata toml");
    let metadata: Metadata = toml::from_str(&meta_text).expect("metadata round-trips");
    let doiget = metadata.doiget.expect("[doiget] table present");
    assert_eq!(doiget.source, "oa-publisher");
    // With no bibliographic record available, the title is expected to equal the DOI.
    assert_eq!(metadata.title, TEST_DOI);

    drop(guard);
    drop(scratch);
}